avifGuessBufferFileFormat:
  273|    392|{
  274|    392|    if (size == 0) {
  ------------------
  |  Branch (274:9): [True: 0, False: 392]
  ------------------
  275|      0|        return AVIF_APP_FILE_FORMAT_UNKNOWN;
  276|      0|    }
  277|       |
  278|    392|    avifROData header;
  279|    392|    header.data = data;
  280|    392|    header.size = size;
  281|       |
  282|    392|    if (avifPeekCompatibleFileType(&header)) {
  ------------------
  |  Branch (282:9): [True: 256, False: 136]
  ------------------
  283|    256|        return AVIF_APP_FILE_FORMAT_AVIF;
  284|    256|    }
  285|       |
  286|    136|    static const uint8_t signatureJPEG[2] = { 0xFF, 0xD8 };
  287|    136|    static const uint8_t signaturePNG[8] = { 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A };
  288|    136|    static const uint8_t signatureY4M[9] = { 0x59, 0x55, 0x56, 0x34, 0x4D, 0x50, 0x45, 0x47, 0x32 }; // "YUV4MPEG2"
  289|    136|    struct avifHeaderSignature
  290|    136|    {
  291|    136|        avifAppFileFormat format;
  292|    136|        const uint8_t * magic;
  293|    136|        size_t magicSize;
  294|    136|    } signatures[] = { { AVIF_APP_FILE_FORMAT_JPEG, signatureJPEG, sizeof(signatureJPEG) },
  295|    136|                       { AVIF_APP_FILE_FORMAT_PNG, signaturePNG, sizeof(signaturePNG) },
  296|    136|                       { AVIF_APP_FILE_FORMAT_Y4M, signatureY4M, sizeof(signatureY4M) } };
  297|    136|    const size_t signaturesCount = sizeof(signatures) / sizeof(signatures[0]);
  298|       |
  299|    252|    for (size_t signatureIndex = 0; signatureIndex < signaturesCount; ++signatureIndex) {
  ------------------
  |  Branch (299:37): [True: 244, False: 8]
  ------------------
  300|    244|        const struct avifHeaderSignature * const signature = &signatures[signatureIndex];
  301|    244|        if (header.size < signature->magicSize) {
  ------------------
  |  Branch (301:13): [True: 0, False: 244]
  ------------------
  302|      0|            continue;
  303|      0|        }
  304|    244|        if (!memcmp(header.data, signature->magic, signature->magicSize)) {
  ------------------
  |  Branch (304:13): [True: 128, False: 116]
  ------------------
  305|    128|            return signature->format;
  306|    128|        }
  307|    244|    }
  308|       |
  309|      8|    return AVIF_APP_FILE_FORMAT_UNKNOWN;
  310|    136|}

_ZN4absl12lts_2024011613base_internal16SchedulingHelperC2ENS1_14SchedulingModeE:
  116|     32|  explicit SchedulingHelper(base_internal::SchedulingMode mode) : mode_(mode) {
  117|     32|    if (mode_ == base_internal::SCHEDULE_KERNEL_ONLY) {
  ------------------
  |  Branch (117:9): [True: 0, False: 32]
  ------------------
  118|      0|      guard_result_ = base_internal::SchedulingGuard::DisableRescheduling();
  119|      0|    }
  120|     32|  }
_ZN4absl12lts_2024011613base_internal16SchedulingHelperD2Ev:
  122|     32|  ~SchedulingHelper() {
  123|     32|    if (mode_ == base_internal::SCHEDULE_KERNEL_ONLY) {
  ------------------
  |  Branch (123:9): [True: 0, False: 32]
  ------------------
  124|      0|      base_internal::SchedulingGuard::EnableRescheduling(guard_result_);
  125|      0|    }
  126|     32|  }
_ZN4absl12lts_2024011613base_internal11ControlWordEPNS0_9once_flagE:
  193|     60|    absl::Nonnull<once_flag*> flag) {
  194|     60|  return &flag->control_;
  195|     60|}
_ZN4absl12lts_202401169call_onceIMNS0_14flags_internal8FlagImplEFvvEJPS3_EEEvRNS0_9once_flagEOT_DpOT0_:
  212|     58|void call_once(absl::once_flag& flag, Callable&& fn, Args&&... args) {
  213|     58|  std::atomic<uint32_t>* once = base_internal::ControlWord(&flag);
  214|     58|  uint32_t s = once->load(std::memory_order_acquire);
  215|     58|  if (ABSL_PREDICT_FALSE(s != base_internal::kOnceDone)) {
  ------------------
  |  |  178|     58|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 30, False: 28]
  |  |  |  Branch (178:49): [Folded, False: 58]
  |  |  |  Branch (178:58): [True: 30, False: 28]
  |  |  ------------------
  ------------------
  216|     30|    base_internal::CallOnceImpl(
  217|     30|        once, base_internal::SCHEDULE_COOPERATIVE_AND_KERNEL,
  218|     30|        std::forward<Callable>(fn), std::forward<Args>(args)...);
  219|     30|  }
  220|     58|}
_ZN4absl12lts_2024011613base_internal12CallOnceImplIMNS0_14flags_internal8FlagImplEFvvEJPS4_EEEvPNSt3__16atomicIjEENS1_14SchedulingModeEOT_DpOT0_:
  153|     30|    Args&&... args) {
  154|       |#ifndef NDEBUG
  155|       |  {
  156|       |    uint32_t old_control = control->load(std::memory_order_relaxed);
  157|       |    if (old_control != kOnceInit &&
  158|       |        old_control != kOnceRunning &&
  159|       |        old_control != kOnceWaiter &&
  160|       |        old_control != kOnceDone) {
  161|       |      ABSL_RAW_LOG(FATAL, "Unexpected value for control word: 0x%lx",
  162|       |                   static_cast<unsigned long>(old_control));  // NOLINT
  163|       |    }
  164|       |  }
  165|       |#endif  // NDEBUG
  166|     30|  static const base_internal::SpinLockWaitTransition trans[] = {
  167|     30|      {kOnceInit, kOnceRunning, true},
  168|     30|      {kOnceRunning, kOnceWaiter, false},
  169|     30|      {kOnceDone, kOnceDone, true}};
  170|       |
  171|       |  // Must do this before potentially modifying control word's state.
  172|     30|  base_internal::SchedulingHelper maybe_disable_scheduling(scheduling_mode);
  173|       |  // Short circuit the simplest case to avoid procedure call overhead.
  174|       |  // The base_internal::SpinLockWait() call returns either kOnceInit or
  175|       |  // kOnceDone. If it returns kOnceDone, it must have loaded the control word
  176|       |  // with std::memory_order_acquire and seen a value of kOnceDone.
  177|     30|  uint32_t old_control = kOnceInit;
  178|     30|  if (control->compare_exchange_strong(old_control, kOnceRunning,
  ------------------
  |  Branch (178:7): [True: 30, False: 0]
  ------------------
  179|     30|                                       std::memory_order_relaxed) ||
  180|      0|      base_internal::SpinLockWait(control, ABSL_ARRAYSIZE(trans), trans,
  ------------------
  |  |   45|      0|  (sizeof(::absl::macros_internal::ArraySizeHelper(array)))
  ------------------
  |  Branch (180:7): [True: 0, False: 0]
  ------------------
  181|     30|                                  scheduling_mode) == kOnceInit) {
  182|     30|    base_internal::invoke(std::forward<Callable>(fn),
  183|     30|                          std::forward<Args>(args)...);
  184|     30|    old_control =
  185|     30|        control->exchange(base_internal::kOnceDone, std::memory_order_release);
  186|     30|    if (old_control == base_internal::kOnceWaiter) {
  ------------------
  |  Branch (186:9): [True: 0, False: 30]
  ------------------
  187|      0|      base_internal::SpinLockWake(control, true);
  188|      0|    }
  189|     30|  }  // else *control is already kOnceDone
  190|     30|}
_ZN4absl12lts_202401169call_onceIRFvvEJEEEvRNS0_9once_flagEOT_DpOT0_:
  212|      2|void call_once(absl::once_flag& flag, Callable&& fn, Args&&... args) {
  213|      2|  std::atomic<uint32_t>* once = base_internal::ControlWord(&flag);
  214|      2|  uint32_t s = once->load(std::memory_order_acquire);
  215|      2|  if (ABSL_PREDICT_FALSE(s != base_internal::kOnceDone)) {
  ------------------
  |  |  178|      2|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 2, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 2]
  |  |  |  Branch (178:58): [True: 2, False: 0]
  |  |  ------------------
  ------------------
  216|      2|    base_internal::CallOnceImpl(
  217|      2|        once, base_internal::SCHEDULE_COOPERATIVE_AND_KERNEL,
  218|      2|        std::forward<Callable>(fn), std::forward<Args>(args)...);
  219|      2|  }
  220|      2|}
_ZN4absl12lts_2024011613base_internal12CallOnceImplIRFvvEJEEEvPNSt3__16atomicIjEENS1_14SchedulingModeEOT_DpOT0_:
  153|      2|    Args&&... args) {
  154|       |#ifndef NDEBUG
  155|       |  {
  156|       |    uint32_t old_control = control->load(std::memory_order_relaxed);
  157|       |    if (old_control != kOnceInit &&
  158|       |        old_control != kOnceRunning &&
  159|       |        old_control != kOnceWaiter &&
  160|       |        old_control != kOnceDone) {
  161|       |      ABSL_RAW_LOG(FATAL, "Unexpected value for control word: 0x%lx",
  162|       |                   static_cast<unsigned long>(old_control));  // NOLINT
  163|       |    }
  164|       |  }
  165|       |#endif  // NDEBUG
  166|      2|  static const base_internal::SpinLockWaitTransition trans[] = {
  167|      2|      {kOnceInit, kOnceRunning, true},
  168|      2|      {kOnceRunning, kOnceWaiter, false},
  169|      2|      {kOnceDone, kOnceDone, true}};
  170|       |
  171|       |  // Must do this before potentially modifying control word's state.
  172|      2|  base_internal::SchedulingHelper maybe_disable_scheduling(scheduling_mode);
  173|       |  // Short circuit the simplest case to avoid procedure call overhead.
  174|       |  // The base_internal::SpinLockWait() call returns either kOnceInit or
  175|       |  // kOnceDone. If it returns kOnceDone, it must have loaded the control word
  176|       |  // with std::memory_order_acquire and seen a value of kOnceDone.
  177|      2|  uint32_t old_control = kOnceInit;
  178|      2|  if (control->compare_exchange_strong(old_control, kOnceRunning,
  ------------------
  |  Branch (178:7): [True: 2, False: 0]
  ------------------
  179|      2|                                       std::memory_order_relaxed) ||
  180|      0|      base_internal::SpinLockWait(control, ABSL_ARRAYSIZE(trans), trans,
  ------------------
  |  |   45|      0|  (sizeof(::absl::macros_internal::ArraySizeHelper(array)))
  ------------------
  |  Branch (180:7): [True: 0, False: 0]
  ------------------
  181|      2|                                  scheduling_mode) == kOnceInit) {
  182|      2|    base_internal::invoke(std::forward<Callable>(fn),
  183|      2|                          std::forward<Args>(args)...);
  184|      2|    old_control =
  185|      2|        control->exchange(base_internal::kOnceDone, std::memory_order_release);
  186|      2|    if (old_control == base_internal::kOnceWaiter) {
  ------------------
  |  Branch (186:9): [True: 0, False: 2]
  ------------------
  187|      0|      base_internal::SpinLockWake(control, true);
  188|      0|    }
  189|      2|  }  // else *control is already kOnceDone
  190|      2|}

_ZN4absl12lts_202401168bit_castImlTnNSt3__19enable_ifIXaaaaeqstT_stT0_sr3std21is_trivially_copyableIS5_EE5valuesr3std21is_trivially_copyableIS4_EE5valueEiE4typeELi0EEES4_RKS5_:
  163|  41.1k|inline constexpr Dest bit_cast(const Source& source) {
  164|  41.1k|  return __builtin_bit_cast(Dest, source);
  165|  41.1k|}
_ZN4absl12lts_202401168bit_castIlmTnNSt3__19enable_ifIXaaaaeqstT_stT0_sr3std21is_trivially_copyableIS5_EE5valuesr3std21is_trivially_copyableIS4_EE5valueEiE4typeELi0EEES4_RKS5_:
  163|  20.6k|inline constexpr Dest bit_cast(const Source& source) {
  164|  20.6k|  return __builtin_bit_cast(Dest, source);
  165|  20.6k|}
_ZN4absl12lts_202401168bit_castINS0_14flags_internal19FlagValueAndInitBitIN8fuzztest8internal14TimeBudgetTypeEEElTnNSt3__19enable_ifIXaaaaeqstT_stT0_sr3std21is_trivially_copyableISB_EE5valuesr3std21is_trivially_copyableISA_EE5valueEiE4typeELi0EEESA_RKSB_:
  163|      2|inline constexpr Dest bit_cast(const Source& source) {
  164|      2|  return __builtin_bit_cast(Dest, source);
  165|      2|}
_ZN4absl12lts_202401168bit_castINS0_14flags_internal19FlagValueAndInitBitIbEElTnNSt3__19enable_ifIXaaaaeqstT_stT0_sr3std21is_trivially_copyableIS8_EE5valuesr3std21is_trivially_copyableIS7_EE5valueEiE4typeELi0EEES7_RKS8_:
  163|      6|inline constexpr Dest bit_cast(const Source& source) {
  164|      6|  return __builtin_bit_cast(Dest, source);
  165|      6|}
_ZN4absl12lts_202401168bit_castIlNSt3__15arrayIcLm8EEETnNS2_9enable_ifIXaaaaeqstT_stT0_sr3std21is_trivially_copyableIS7_EE5valuesr3std21is_trivially_copyableIS6_EE5valueEiE4typeELi0EEES6_RKS7_:
  163|     10|inline constexpr Dest bit_cast(const Source& source) {
  164|     10|  return __builtin_bit_cast(Dest, source);
  165|     10|}
_ZN4absl12lts_202401168bit_castIdmTnNSt3__19enable_ifIXaaaaeqstT_stT0_sr3std21is_trivially_copyableIS5_EE5valuesr3std21is_trivially_copyableIS4_EE5valueEiE4typeELi0EEES4_RKS5_:
  163|  15.1k|inline constexpr Dest bit_cast(const Source& source) {
  164|  15.1k|  return __builtin_bit_cast(Dest, source);
  165|  15.1k|}

_ZN4absl12lts_202401169gbswap_32Ej:
   48|     20|inline uint32_t gbswap_32(uint32_t host_int) {
   49|     20|#if ABSL_HAVE_BUILTIN(__builtin_bswap32) || defined(__GNUC__)
   50|     20|  return __builtin_bswap32(host_int);
   51|       |#elif defined(_MSC_VER)
   52|       |  return _byteswap_ulong(host_int);
   53|       |#else
   54|       |  return (((host_int & uint32_t{0xFF}) << 24) |
   55|       |          ((host_int & uint32_t{0xFF00}) << 8) |
   56|       |          ((host_int & uint32_t{0xFF0000}) >> 8) |
   57|       |          ((host_int & uint32_t{0xFF000000}) >> 24));
   58|       |#endif
   59|     20|}
_ZN4absl12lts_202401169gbswap_16Et:
   61|      2|inline uint16_t gbswap_16(uint16_t host_int) {
   62|      2|#if ABSL_HAVE_BUILTIN(__builtin_bswap16) || defined(__GNUC__)
   63|      2|  return __builtin_bswap16(host_int);
   64|       |#elif defined(_MSC_VER)
   65|       |  return _byteswap_ushort(host_int);
   66|       |#else
   67|       |  return (((host_int & uint16_t{0xFF}) << 8) |
   68|       |          ((host_int & uint16_t{0xFF00}) >> 8));
   69|       |#endif
   70|      2|}
_ZN4absl12lts_2024011613little_endian10FromHost16Et:
  107|     70|inline uint16_t FromHost16(uint16_t x) { return x; }
_ZN4absl12lts_2024011613little_endian8ToHost64Em:
  114|     16|inline uint64_t ToHost64(uint64_t x) { return x; }
_ZN4absl12lts_2024011613little_endian6ToHostEm:
  140|     16|inline uint64_t ToHost(uint64_t x) { return ToHost64(x); }
_ZN4absl12lts_2024011613little_endian7Store16EPvt:
  168|     70|inline void Store16(absl::Nonnull<void *> p, uint16_t v) {
  169|     70|  ABSL_INTERNAL_UNALIGNED_STORE16(p, FromHost16(v));
  ------------------
  |  |   81|     70|  (absl::base_internal::UnalignedStore16(_p, _val))
  ------------------
  170|     70|}
_ZN4absl12lts_2024011610big_endian8ToHost16Et:
  198|      2|inline uint16_t ToHost16(uint16_t x) { return gbswap_16(x); }
_ZN4absl12lts_2024011610big_endian8ToHost32Ej:
  201|     20|inline uint32_t ToHost32(uint32_t x) { return gbswap_32(x); }
_ZN4absl12lts_2024011610big_endian6Load16EPKv:
  254|      2|inline uint16_t Load16(absl::Nonnull<const void *> p) {
  255|      2|  return ToHost16(ABSL_INTERNAL_UNALIGNED_LOAD16(p));
  ------------------
  |  |   74|      2|  (absl::base_internal::UnalignedLoad16(_p))
  ------------------
  256|      2|}
_ZN4absl12lts_2024011610big_endian6Load32EPKv:
  262|     20|inline uint32_t Load32(absl::Nonnull<const void *> p) {
  263|     20|  return ToHost32(ABSL_INTERNAL_UNALIGNED_LOAD32(p));
  ------------------
  |  |   76|     20|  (absl::base_internal::UnalignedLoad32(_p))
  ------------------
  264|     20|}

_ZN4absl12lts_2024011613base_internal10FastTypeIdIbEEPKvv:
   42|      4|constexpr inline FastTypeIdType FastTypeId() {
   43|      4|  return &FastTypeTag<Type>::dummy_var;
   44|      4|}
_ZN4absl12lts_2024011613base_internal10FastTypeIdINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEEEPKvv:
   42|      2|constexpr inline FastTypeIdType FastTypeId() {
   43|      2|  return &FastTypeTag<Type>::dummy_var;
   44|      2|}
_ZN4absl12lts_2024011613base_internal10FastTypeIdImEEPKvv:
   42|      2|constexpr inline FastTypeIdType FastTypeId() {
   43|      2|  return &FastTypeTag<Type>::dummy_var;
   44|      2|}

_ZN4absl12lts_2024011613base_internal8SpinLockC2Ev:
   58|     16|  SpinLock() : lockword_(kSpinLockCooperative) {
   59|     16|    ABSL_TSAN_MUTEX_CREATE(this, __tsan_mutex_not_static);
   60|     16|  }
_ZN4absl12lts_2024011613base_internal8SpinLock4LockEv:
   80|     18|  inline void Lock() ABSL_EXCLUSIVE_LOCK_FUNCTION() {
   81|     18|    ABSL_TSAN_MUTEX_PRE_LOCK(this, 0);
   82|     18|    if (!TryLockImpl()) {
  ------------------
  |  Branch (82:9): [True: 0, False: 18]
  ------------------
   83|      0|      SlowLock();
   84|      0|    }
   85|     18|    ABSL_TSAN_MUTEX_POST_LOCK(this, 0, 0);
   86|     18|  }
_ZN4absl12lts_2024011613base_internal8SpinLock6UnlockEv:
  102|     18|  inline void Unlock() ABSL_UNLOCK_FUNCTION() {
  103|     18|    ABSL_TSAN_MUTEX_PRE_UNLOCK(this, 0);
  104|     18|    uint32_t lock_value = lockword_.load(std::memory_order_relaxed);
  105|     18|    lock_value = lockword_.exchange(lock_value & kSpinLockCooperative,
  106|     18|                                    std::memory_order_release);
  107|       |
  108|     18|    if ((lock_value & kSpinLockDisabledScheduling) != 0) {
  ------------------
  |  Branch (108:9): [True: 0, False: 18]
  ------------------
  109|      0|      base_internal::SchedulingGuard::EnableRescheduling(true);
  110|      0|    }
  111|     18|    if ((lock_value & kWaitTimeMask) != 0) {
  ------------------
  |  Branch (111:9): [True: 0, False: 18]
  ------------------
  112|       |      // Collect contentionz profile info, and speed the wakeup of any waiter.
  113|       |      // The wait_cycles value indicates how long this thread spent waiting
  114|       |      // for the lock.
  115|      0|      SlowUnlock(lock_value);
  116|      0|    }
  117|     18|    ABSL_TSAN_MUTEX_POST_UNLOCK(this, 0);
  118|     18|  }
_ZN4absl12lts_2024011613base_internal8SpinLock11TryLockImplEv:
  192|     18|  inline bool TryLockImpl() {
  193|     18|    uint32_t lock_value = lockword_.load(std::memory_order_relaxed);
  194|     18|    return (TryLockInternal(lock_value, 0) & kSpinLockHeld) == 0;
  195|     18|  }
_ZN4absl12lts_2024011613base_internal14SpinLockHolderC2EPNS1_8SpinLockE:
  208|     18|      : lock_(l) {
  209|     18|    l->Lock();
  210|     18|  }
_ZN4absl12lts_2024011613base_internal14SpinLockHolderD2Ev:
  211|     18|  inline ~SpinLockHolder() ABSL_UNLOCK_FUNCTION() { lock_->Unlock(); }
_ZN4absl12lts_2024011613base_internal8SpinLock15TryLockInternalEjj:
  237|     18|                                          uint32_t wait_cycles) {
  238|     18|  if ((lock_value & kSpinLockHeld) != 0) {
  ------------------
  |  Branch (238:7): [True: 0, False: 18]
  ------------------
  239|      0|    return lock_value;
  240|      0|  }
  241|       |
  242|     18|  uint32_t sched_disabled_bit = 0;
  243|     18|  if ((lock_value & kSpinLockCooperative) == 0) {
  ------------------
  |  Branch (243:7): [True: 0, False: 18]
  ------------------
  244|       |    // For non-cooperative locks we must make sure we mark ourselves as
  245|       |    // non-reschedulable before we attempt to CompareAndSwap.
  246|      0|    if (base_internal::SchedulingGuard::DisableRescheduling()) {
  ------------------
  |  Branch (246:9): [True: 0, False: 0]
  ------------------
  247|      0|      sched_disabled_bit = kSpinLockDisabledScheduling;
  248|      0|    }
  249|      0|  }
  250|       |
  251|     18|  if (!lockword_.compare_exchange_strong(
  ------------------
  |  Branch (251:7): [True: 0, False: 18]
  ------------------
  252|     18|          lock_value,
  253|     18|          kSpinLockHeld | lock_value | wait_cycles | sched_disabled_bit,
  254|     18|          std::memory_order_acquire, std::memory_order_relaxed)) {
  255|      0|    base_internal::SchedulingGuard::EnableRescheduling(sched_disabled_bit != 0);
  256|      0|  }
  257|       |
  258|     18|  return lock_value;
  259|     18|}

_ZN4absl12lts_2024011613base_internal15UnalignedLoad16EPKv:
   39|      2|inline uint16_t UnalignedLoad16(absl::Nonnull<const void *> p) {
   40|      2|  uint16_t t;
   41|      2|  memcpy(&t, p, sizeof t);
   42|      2|  return t;
   43|      2|}
_ZN4absl12lts_2024011613base_internal15UnalignedLoad32EPKv:
   45|    200|inline uint32_t UnalignedLoad32(absl::Nonnull<const void *> p) {
   46|    200|  uint32_t t;
   47|    200|  memcpy(&t, p, sizeof t);
   48|    200|  return t;
   49|    200|}
_ZN4absl12lts_2024011613base_internal15UnalignedLoad64EPKv:
   51|     84|inline uint64_t UnalignedLoad64(absl::Nonnull<const void *> p) {
   52|     84|  uint64_t t;
   53|     84|  memcpy(&t, p, sizeof t);
   54|     84|  return t;
   55|     84|}
_ZN4absl12lts_2024011613base_internal16UnalignedStore16EPvt:
   57|     70|inline void UnalignedStore16(absl::Nonnull<void *> p, uint16_t v) {
   58|     70|  memcpy(p, &v, sizeof v);
   59|     70|}

_ZN4absl12lts_2024011612NoDestructorINS0_14flags_internal12FlagRegistryEEC2IJETnNSt3__19enable_ifIXntsr3std7is_sameIFvDpRu7__decayIT_EEFvRS4_EEE5valueEiE4typeELi0EEEDpOS8_:
  127|      2|      : impl_(std::forward<Ts>(args)...) {}
_ZN4absl12lts_2024011612NoDestructorINS0_14flags_internal12FlagRegistryEE13PlacementImplC2IJEEEDpOT_:
  165|      2|    explicit PlacementImpl(Args&&... args) {
  166|      2|      new (&space_) T(std::forward<Args>(args)...);
  167|      2|    }
_ZN4absl12lts_2024011612NoDestructorINS0_14flags_internal12FlagRegistryEEdeEv:
  142|     38|  T& operator*() { return *get(); }
_ZN4absl12lts_2024011612NoDestructorINS0_14flags_internal12FlagRegistryEE3getEv:
  144|     38|  T* get() { return impl_.get(); }
_ZN4absl12lts_2024011612NoDestructorINS0_14flags_internal12FlagRegistryEE13PlacementImpl3getEv:
  171|     38|    T* get() { return Launder(reinterpret_cast<T*>(&space_)); }
_ZN4absl12lts_2024011612NoDestructorINS0_14flags_internal12FlagRegistryEE13PlacementImpl7LaunderIS3_EEPT_S8_:
  175|     38|    static P* Launder(P* p) {
  176|     38|#if defined(__cpp_lib_launder) && __cpp_lib_launder >= 201606L
  177|     38|      return std::launder(p);
  178|       |#elif ABSL_HAVE_BUILTIN(__builtin_launder)
  179|       |      return __builtin_launder(p);
  180|       |#else
  181|       |      // When `std::launder` or equivalent are not available, we rely on
  182|       |      // undefined behavior, which works as intended on Abseil's officially
  183|       |      // supported platforms as of Q3 2023.
  184|       |#if defined(__GNUC__) && !defined(__clang__)
  185|       |#pragma GCC diagnostic push
  186|       |#pragma GCC diagnostic ignored "-Wstrict-aliasing"
  187|       |#endif
  188|       |      return p;
  189|       |#if defined(__GNUC__) && !defined(__clang__)
  190|       |#pragma GCC diagnostic pop
  191|       |#endif
  192|       |#endif
  193|     38|    }

_ZN4absl12lts_2024011620PrefetchToLocalCacheEPKv:
  146|     12|    const void* addr) {
  147|     12|  __builtin_prefetch(addr, 0, 3);
  148|     12|}

_ZN4absl12lts_2024011618container_internal17FlatHashMapPolicyINSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPNS0_15CommandLineFlagEE7destroyINS3_9allocatorINS3_4pairIKS7_S9_EEEEEEvPT_PNS1_13map_slot_typeIS7_S9_EE:
  577|     32|  static void destroy(Allocator* alloc, slot_type* slot) {
  578|     32|    slot_policy::destroy(alloc, slot);
  579|     32|  }
_ZN4absl12lts_2024011618container_internal17FlatHashMapPolicyINSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPNS0_15CommandLineFlagEE7elementEPNS1_13map_slot_typeIS7_S9_EE:
  597|     60|  static std::pair<const K, V>& element(slot_type* slot) { return slot->value; }
_ZN4absl12lts_2024011618container_internal17FlatHashMapPolicyINSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPNS0_15CommandLineFlagEE5applyINS1_12raw_hash_setISA_NS1_10StringHashENS1_8StringEqENS3_9allocatorINS3_4pairIKS7_S9_EEEEE19EmplaceDecomposableEJSI_EEEDTclsr4absl18container_internalE13DecomposePairclsr3stdE7declvalIT_EEspclsr3stdE7declvalIT0_EEEEOSM_DpOSN_:
  590|     32|  apply(F&& f, Args&&... args) {
  591|     32|    return absl::container_internal::DecomposePair(std::forward<F>(f),
  592|     32|                                                   std::forward<Args>(args)...);
  593|     32|  }
_ZN4absl12lts_2024011618container_internal17FlatHashMapPolicyINSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPNS0_15CommandLineFlagEE5applyINS1_12raw_hash_setISA_NS1_10StringHashENS1_8StringEqENS3_9allocatorINS3_4pairIKS7_S9_EEEEE11HashElementEJRSI_EEEDTclsr4absl18container_internalE13DecomposePairclsr3stdE7declvalIT_EEspclsr3stdE7declvalIT0_EEEEOSN_DpOSO_:
  590|     28|  apply(F&& f, Args&&... args) {
  591|     28|    return absl::container_internal::DecomposePair(std::forward<F>(f),
  592|     28|                                                   std::forward<Args>(args)...);
  593|     28|  }
_ZN4absl12lts_2024011618container_internal17FlatHashMapPolicyINSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPNS0_15CommandLineFlagEE8transferINS3_9allocatorINS3_4pairIKS7_S9_EEEEEEDaPT_PNS1_13map_slot_typeIS7_S9_EESL_:
  583|     28|                       slot_type* old_slot) {
  584|     28|    return slot_policy::transfer(alloc, new_slot, old_slot);
  585|     28|  }
_ZN4absl12lts_2024011618container_internal17FlatHashMapPolicyINSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPNS0_15CommandLineFlagEE9constructINS3_9allocatorINS3_4pairIKS7_S9_EEEEJRKNS3_21piecewise_construct_tENS3_5tupleIJOSE_EEENSK_IJOS9_EEEEEEvPT_PNS1_13map_slot_typeIS7_S9_EEDpOT0_:
  572|     32|  static void construct(Allocator* alloc, slot_type* slot, Args&&... args) {
  573|     32|    slot_policy::construct(alloc, slot, std::forward<Args>(args)...);
  574|     32|  }
_ZN4absl12lts_2024011613flat_hash_mapINSt3__117basic_string_viewIcNS2_11char_traitsIcEEEEPNS0_15CommandLineFlagENS0_18container_internal10StringHashENS9_8StringEqENS2_9allocatorINS2_4pairIKS6_S8_EEEEEC2Ev:
  159|      2|  flat_hash_map() {}

_ZN4absl12lts_2024011618container_internal20common_policy_traitsINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEEvE7destroyINS4_9allocatorINS4_4pairIKS8_SA_EEEEEEvPT_PNS1_13map_slot_typeIS8_SA_EE:
   49|     32|  static void destroy(Alloc* alloc, slot_type* slot) {
   50|     32|    Policy::destroy(alloc, slot);
   51|     32|  }
_ZN4absl12lts_2024011618container_internal20common_policy_traitsINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEEvE7elementISB_EEDTclsrT_7elementfp_EEPNS1_13map_slot_typeIS8_SA_EE:
   75|     60|      -> decltype(P::element(slot)) {
   76|     60|    return P::element(slot);
   77|     60|  }
_ZN4absl12lts_2024011618container_internal20common_policy_traitsINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEEvE8transferINS4_9allocatorINS4_4pairIKS8_SA_EEEEEEvPT_PNS1_13map_slot_typeIS8_SA_EESN_:
   65|     28|  static void transfer(Alloc* alloc, slot_type* new_slot, slot_type* old_slot) {
   66|     28|    transfer_impl(alloc, new_slot, old_slot, Rank0{});
   67|     28|  }
_ZN4absl12lts_2024011618container_internal20common_policy_traitsINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEEvE13transfer_implINS4_9allocatorINS4_4pairIKS8_SA_EEEESB_EEDTclsrT0_8transferfp_fp0_fp1_EEPT_PNS1_13map_slot_typeIS8_SA_EESP_NSC_5Rank0E:
  101|     28|      -> decltype(P::transfer(alloc, new_slot, old_slot)) {
  102|     28|    return P::transfer(alloc, new_slot, old_slot);
  103|     28|  }
_ZN4absl12lts_2024011618container_internal20common_policy_traitsINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEEvE9constructINS4_9allocatorINS4_4pairIKS8_SA_EEEEJRKNS4_21piecewise_construct_tENS4_5tupleIJOSG_EEENSM_IJOSA_EEEEEEvPT_PNS1_13map_slot_typeIS8_SA_EEDpOT0_:
   42|     32|  static void construct(Alloc* alloc, slot_type* slot, Args&&... args) {
   43|     32|    Policy::construct(alloc, slot, std::forward<Args>(args)...);
   44|     32|  }

_ZNKR4absl12lts_2024011618container_internal25internal_compressed_tuple7StorageINS1_12CommonFieldsELm0ELb0EE3getEv:
   91|    312|  constexpr const T& get() const& { return value; }
_ZNR4absl12lts_2024011618container_internal25internal_compressed_tuple7StorageINS1_12CommonFieldsELm0ELb0EE3getEv:
   92|    396|  T& get() & { return value; }
_ZN4absl12lts_2024011618container_internal25internal_compressed_tuple7StorageINS1_12CommonFieldsELm0ELb0EEC2IS4_EENSt3__110in_place_tEOT_:
   90|      2|      : value(absl::forward<V>(v)) {}
_ZNR4absl12lts_2024011618container_internal25internal_compressed_tuple7StorageINS1_10StringHashELm1ELb1EE3getEv:
  106|     60|  T& get() & { return *this; }
_ZNKR4absl12lts_2024011618container_internal15CompressedTupleIJNS1_12CommonFieldsENS1_10StringHashENS1_8StringEqENSt3__19allocatorINS6_4pairIKNS6_17basic_string_viewIcNS6_11char_traitsIcEEEEPNS0_15CommandLineFlagEEEEEEE3getILi0EEERKNS1_25internal_compressed_tuple4ElemISI_XT_EE4typeEv:
  246|    312|  constexpr const ElemT<I>& get() const& {
  247|    312|    return StorageT<I>::get();
  248|    312|  }
_ZNR4absl12lts_2024011618container_internal15CompressedTupleIJNS1_12CommonFieldsENS1_10StringHashENS1_8StringEqENSt3__19allocatorINS6_4pairIKNS6_17basic_string_viewIcNS6_11char_traitsIcEEEEPNS0_15CommandLineFlagEEEEEEE3getILi3EEERNS1_25internal_compressed_tuple4ElemISI_XT_EE4typeEv:
  241|    104|  ElemT<I>& get() & {
  242|    104|    return StorageT<I>::get();
  243|    104|  }
_ZNR4absl12lts_2024011618container_internal25internal_compressed_tuple7StorageINSt3__19allocatorINS4_4pairIKNS4_17basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEEEELm3ELb1EE3getEv:
  106|    104|  T& get() & { return *this; }
_ZNR4absl12lts_2024011618container_internal15CompressedTupleIJNS1_12CommonFieldsENS1_10StringHashENS1_8StringEqENSt3__19allocatorINS6_4pairIKNS6_17basic_string_viewIcNS6_11char_traitsIcEEEEPNS0_15CommandLineFlagEEEEEEE3getILi0EEERNS1_25internal_compressed_tuple4ElemISI_XT_EE4typeEv:
  241|    396|  ElemT<I>& get() & {
  242|    396|    return StorageT<I>::get();
  243|    396|  }
_ZNR4absl12lts_2024011618container_internal15CompressedTupleIJNS1_12CommonFieldsENS1_10StringHashENS1_8StringEqENSt3__19allocatorINS6_4pairIKNS6_17basic_string_viewIcNS6_11char_traitsIcEEEEPNS0_15CommandLineFlagEEEEEEE3getILi1EEERNS1_25internal_compressed_tuple4ElemISI_XT_EE4typeEv:
  241|     60|  ElemT<I>& get() & {
  242|     60|    return StorageT<I>::get();
  243|     60|  }
_ZN4absl12lts_2024011618container_internal15CompressedTupleIJNS1_12CommonFieldsENS1_10StringHashENS1_8StringEqENSt3__19allocatorINS6_4pairIKNS6_17basic_string_viewIcNS6_11char_traitsIcEEEEPNS0_15CommandLineFlagEEEEEEEC2IS3_JS4_S5_SH_ETnNS6_9enable_ifIXsr4absl11conjunctionINS0_8negationINS6_7is_sameIFvSI_EFvNS6_5decayIT_E4typeEEEEEENS1_25internal_compressed_tuple27TupleItemsMoveConstructibleISI_JSP_DpT0_EEEEE5valueEbE4typeELb1EEEOSP_DpOSX_:
  236|      2|      : CompressedTuple::CompressedTupleImpl(absl::in_place,
  237|      2|                                             absl::forward<First>(first),
  238|      2|                                             absl::forward<Vs>(base)...) {}
_ZN4absl12lts_2024011618container_internal25internal_compressed_tuple19CompressedTupleImplINS1_15CompressedTupleIJNS1_12CommonFieldsENS1_10StringHashENS1_8StringEqENSt3__19allocatorINS8_4pairIKNS8_17basic_string_viewIcNS8_11char_traitsIcEEEEPNS0_15CommandLineFlagEEEEEEEENS8_16integer_sequenceImJLm0ELm1ELm2ELm3EEEELb1EEC2IJS5_S6_S7_SJ_EEENS8_10in_place_tEDpOT_:
  126|      8|      : Storage<Ts, I>(absl::in_place, absl::forward<Vs>(args))... {}
_ZN4absl12lts_2024011618container_internal25internal_compressed_tuple7StorageINS1_10StringHashELm1ELb1EEC2IS4_EENSt3__110in_place_tEOT_:
  103|      2|      : T(absl::forward<V>(v)) {}
_ZN4absl12lts_2024011618container_internal25internal_compressed_tuple7StorageINS1_8StringEqELm2ELb1EEC2IS4_EENSt3__110in_place_tEOT_:
  103|      2|      : T(absl::forward<V>(v)) {}
_ZN4absl12lts_2024011618container_internal25internal_compressed_tuple7StorageINSt3__19allocatorINS4_4pairIKNS4_17basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEEEELm3ELb1EEC2ISF_EENS4_10in_place_tEOT_:
  103|      2|      : T(absl::forward<V>(v)) {}

_ZN4absl12lts_2024011618container_internal27SanitizerPoisonMemoryRegionEPKvm:
  220|     34|inline void SanitizerPoisonMemoryRegion(const void* m, size_t s) {
  221|       |#ifdef ABSL_HAVE_ADDRESS_SANITIZER
  222|       |  ASAN_POISON_MEMORY_REGION(m, s);
  223|       |#endif
  224|       |#ifdef ABSL_HAVE_MEMORY_SANITIZER
  225|       |  __msan_poison(m, s);
  226|       |#endif
  227|     34|  (void)m;
  228|     34|  (void)s;
  229|     34|}
_ZN4absl12lts_2024011618container_internal29SanitizerUnpoisonMemoryRegionEPKvm:
  231|     74|inline void SanitizerUnpoisonMemoryRegion(const void* m, size_t s) {
  232|       |#ifdef ABSL_HAVE_ADDRESS_SANITIZER
  233|       |  ASAN_UNPOISON_MEMORY_REGION(m, s);
  234|       |#endif
  235|       |#ifdef ABSL_HAVE_MEMORY_SANITIZER
  236|       |  __msan_unpoison(m, s);
  237|       |#endif
  238|     74|  (void)m;
  239|     74|  (void)s;
  240|     74|}
_ZN4absl12lts_2024011618container_internal8AllocateILm8ENSt3__19allocatorIcEEEEPvPT0_m:
   55|     10|void* Allocate(Alloc* alloc, size_t n) {
   56|     10|  static_assert(Alignment > 0, "");
   57|     10|  assert(n && "n must be positive");
   58|     10|  using M = AlignedType<Alignment>;
   59|     10|  using A = typename absl::allocator_traits<Alloc>::template rebind_alloc<M>;
   60|     10|  using AT = typename absl::allocator_traits<Alloc>::template rebind_traits<M>;
   61|       |  // On macOS, "mem_alloc" is a #define with one argument defined in
   62|       |  // rpc/types.h, so we can't name the variable "mem_alloc" and initialize it
   63|       |  // with the "foo(bar)" syntax.
   64|     10|  A my_mem_alloc(*alloc);
   65|     10|  void* p = AT::allocate(my_mem_alloc, (n + sizeof(M) - 1) / sizeof(M));
   66|       |  assert(reinterpret_cast<uintptr_t>(p) % Alignment == 0 &&
   67|     10|         "allocator does not respect alignment");
   68|     10|  return p;
   69|     10|}
_ZN4absl12lts_2024011618container_internal10DeallocateILm8ENSt3__19allocatorIcEEEEvPT0_Pvm:
   74|      8|void Deallocate(Alloc* alloc, void* p, size_t n) {
   75|      8|  static_assert(Alignment > 0, "");
   76|      8|  assert(n && "n must be positive");
   77|      8|  using M = AlignedType<Alignment>;
   78|      8|  using A = typename absl::allocator_traits<Alloc>::template rebind_alloc<M>;
   79|      8|  using AT = typename absl::allocator_traits<Alloc>::template rebind_traits<M>;
   80|       |  // On macOS, "mem_alloc" is a #define with one argument defined in
   81|       |  // rpc/types.h, so we can't name the variable "mem_alloc" and initialize it
   82|       |  // with the "foo(bar)" syntax.
   83|      8|  A my_mem_alloc(*alloc);
   84|      8|  AT::deallocate(my_mem_alloc, static_cast<M*>(p),
   85|      8|                 (n + sizeof(M) - 1) / sizeof(M));
   86|      8|}
_ZN4absl12lts_2024011618container_internal15map_slot_policyINSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPNS0_15CommandLineFlagEE7destroyINS3_9allocatorINS3_4pairIKS7_S9_EEEEEEvPT_PNS1_13map_slot_typeIS7_S9_EE:
  417|     32|  static void destroy(Allocator* alloc, slot_type* slot) {
  418|     32|    if (kMutableKeys::value) {
  ------------------
  |  Branch (418:9): [True: 32, Folded]
  ------------------
  419|     32|      absl::allocator_traits<Allocator>::destroy(*alloc, &slot->mutable_value);
  420|     32|    } else {
  421|      0|      absl::allocator_traits<Allocator>::destroy(*alloc, &slot->value);
  422|      0|    }
  423|     32|  }
_ZN4absl12lts_2024011618container_internal8PairArgsIKNSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS3_4pairINS3_5tupleIJRKT_EEENSC_IJRKT0_EEEEERKNSB_ISD_SH_EE:
  187|     28|    const std::pair<F, S>& p) {
  188|     28|  return PairArgs(p.first, p.second);
  189|     28|}
_ZN4absl12lts_2024011618container_internal8PairArgsIRKNSt3__117basic_string_viewIcNS3_11char_traitsIcEEEERKPNS0_15CommandLineFlagEEENS3_4pairINS3_5tupleIJOT_EEENSF_IJOT0_EEEEESH_SK_:
  181|     28|std::pair<std::tuple<F&&>, std::tuple<S&&>> PairArgs(F&& f, S&& s) {
  182|     28|  return {std::piecewise_construct, std::forward_as_tuple(std::forward<F>(f)),
  183|     28|          std::forward_as_tuple(std::forward<S>(s))};
  184|     28|}
_ZN4absl12lts_2024011618container_internal13DecomposePairINS1_12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS5_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS5_9allocatorINS5_4pairIKS9_SB_EEEEE19EmplaceDecomposableEJSI_EEEDTclsr15memory_internalE17DecomposePairImplclsr3stdE7forwardIT_Efp_Ecl8PairArgsspclsr3stdE7forwardIT0_Efp0_EEEEOSM_DpOSN_:
  206|     32|        std::forward<F>(f), PairArgs(std::forward<Args>(args)...))) {
  207|     32|  return memory_internal::DecomposePairImpl(
  208|     32|      std::forward<F>(f), PairArgs(std::forward<Args>(args)...));
  209|     32|}
_ZN4absl12lts_2024011618container_internal15memory_internal17DecomposePairImplINS1_12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS6_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS6_9allocatorINS6_4pairIKSA_SC_EEEEE19EmplaceDecomposableEOSI_NS6_5tupleIJOSC_EEEEEDTclclsr3stdE7declvalIT_EEclsr3stdE7declvalIRKT0_EEL_ZNS6_19piecewise_constructEEclsr3stdE7declvalINSO_IJSS_EEEEEclsr3stdE7declvalIT1_EEEEOSR_NSH_ISV_SW_EE:
  138|     32|DecomposePairImpl(F&& f, std::pair<std::tuple<K>, V> p) {
  139|     32|  const auto& key = std::get<0>(p.first);
  140|     32|  return std::forward<F>(f)(key, std::piecewise_construct, std::move(p.first),
  141|     32|                            std::move(p.second));
  142|     32|}
_ZN4absl12lts_2024011618container_internal13DecomposePairINS1_12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS5_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS5_9allocatorINS5_4pairIKS9_SB_EEEEE11HashElementEJRSI_EEEDTclsr15memory_internalE17DecomposePairImplclsr3stdE7forwardIT_Efp_Ecl8PairArgsspclsr3stdE7forwardIT0_Efp0_EEEEOSN_DpOSO_:
  206|     28|        std::forward<F>(f), PairArgs(std::forward<Args>(args)...))) {
  207|     28|  return memory_internal::DecomposePairImpl(
  208|     28|      std::forward<F>(f), PairArgs(std::forward<Args>(args)...));
  209|     28|}
_ZN4absl12lts_2024011618container_internal15memory_internal17DecomposePairImplINS1_12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS6_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS6_9allocatorINS6_4pairIKSA_SC_EEEEE11HashElementERSI_NS6_5tupleIJRKSC_EEEEEDTclclsr3stdE7declvalIT_EEclsr3stdE7declvalIRKT0_EEL_ZNS6_19piecewise_constructEEclsr3stdE7declvalINSO_IJST_EEEEEclsr3stdE7declvalIT1_EEEEOSS_NSH_ISW_SX_EE:
  138|     28|DecomposePairImpl(F&& f, std::pair<std::tuple<K>, V> p) {
  139|     28|  const auto& key = std::get<0>(p.first);
  140|     28|  return std::forward<F>(f)(key, std::piecewise_construct, std::move(p.first),
  141|     28|                            std::move(p.second));
  142|     28|}
_ZN4absl12lts_2024011618container_internal15map_slot_policyINSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPNS0_15CommandLineFlagEE8transferINS3_9allocatorINS3_4pairIKS7_S9_EEEEEEDaPT_PNS1_13map_slot_typeIS7_S9_EESL_:
  427|     28|                       slot_type* old_slot) {
  428|     28|    auto is_relocatable =
  429|     28|        typename absl::is_trivially_relocatable<value_type>::type();
  430|       |
  431|     28|    emplace(new_slot);
  432|     28|#if defined(__cpp_lib_launder) && __cpp_lib_launder >= 201606
  433|     28|    if (is_relocatable) {
  ------------------
  |  Branch (433:9): [True: 28, Folded]
  ------------------
  434|       |      // TODO(b/247130232,b/251814870): remove casts after fixing warnings.
  435|     28|      std::memcpy(static_cast<void*>(std::launder(&new_slot->value)),
  436|     28|                  static_cast<const void*>(&old_slot->value),
  437|     28|                  sizeof(value_type));
  438|     28|      return is_relocatable;
  439|     28|    }
  440|      0|#endif
  441|       |
  442|      0|    if (kMutableKeys::value) {
  ------------------
  |  Branch (442:9): [True: 0, Folded]
  ------------------
  443|      0|      absl::allocator_traits<Allocator>::construct(
  444|      0|          *alloc, &new_slot->mutable_value, std::move(old_slot->mutable_value));
  445|      0|    } else {
  446|      0|      absl::allocator_traits<Allocator>::construct(*alloc, &new_slot->value,
  447|      0|                                                   std::move(old_slot->value));
  448|      0|    }
  449|      0|    destroy(alloc, old_slot);
  450|      0|    return is_relocatable;
  451|     28|  }
_ZN4absl12lts_2024011618container_internal15map_slot_policyINSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPNS0_15CommandLineFlagEE7emplaceEPNS1_13map_slot_typeIS7_S9_EE:
  348|     60|  static void emplace(slot_type* slot) {
  349|       |    // The construction of union doesn't do anything at runtime but it allows us
  350|       |    // to access its members without violating aliasing rules.
  351|     60|    new (slot) slot_type;
  352|     60|  }
_ZN4absl12lts_2024011618container_internal13map_slot_typeINSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPNS0_15CommandLineFlagEEC2Ev:
  329|     60|  map_slot_type() {}
_ZN4absl12lts_2024011618container_internal15map_slot_policyINSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPNS0_15CommandLineFlagEE9constructINS3_9allocatorINS3_4pairIKS7_S9_EEEEJRKNS3_21piecewise_construct_tENS3_5tupleIJOSE_EEENSK_IJOS9_EEEEEEvPT_PNS1_13map_slot_typeIS7_S9_EEDpOT0_:
  383|     32|  static void construct(Allocator* alloc, slot_type* slot, Args&&... args) {
  384|     32|    emplace(slot);
  385|     32|    if (kMutableKeys::value) {
  ------------------
  |  Branch (385:9): [True: 32, Folded]
  ------------------
  386|     32|      absl::allocator_traits<Allocator>::construct(*alloc, &slot->mutable_value,
  387|     32|                                                   std::forward<Args>(args)...);
  388|     32|    } else {
  389|      0|      absl::allocator_traits<Allocator>::construct(*alloc, &slot->value,
  390|      0|                                                   std::forward<Args>(args)...);
  391|      0|    }
  392|     32|  }
_ZN4absl12lts_2024011618container_internal8PairArgsIKNSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS3_4pairINS3_5tupleIJOT_EEENSC_IJOT0_EEEEEONSB_ISD_SG_EE:
  191|     32|std::pair<std::tuple<F&&>, std::tuple<S&&>> PairArgs(std::pair<F, S>&& p) {
  192|     32|  return PairArgs(std::forward<F>(p.first), std::forward<S>(p.second));
  193|     32|}
_ZN4absl12lts_2024011618container_internal8PairArgsIKNSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS3_4pairINS3_5tupleIJOT_EEENSC_IJOT0_EEEEESE_SH_:
  181|     32|std::pair<std::tuple<F&&>, std::tuple<S&&>> PairArgs(F&& f, S&& s) {
  182|     32|  return {std::piecewise_construct, std::forward_as_tuple(std::forward<F>(f)),
  183|     32|          std::forward_as_tuple(std::forward<S>(s))};
  184|     32|}

_ZNK4absl12lts_2024011618container_internal10StringHashclENSt3__117basic_string_viewIcNS3_11char_traitsIcEEEE:
   77|     60|  size_t operator()(absl::string_view v) const {
   78|     60|    return absl::Hash<absl::string_view>{}(v);
   79|     60|  }

_ZN4absl12lts_2024011618container_internal18hash_policy_traitsINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEEvE5applyINS1_12raw_hash_setISB_NS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE19EmplaceDecomposableEJSK_ESB_EEDTclsrT1_5applyclsr3stdE7forwardIT_Efp_Espclsr3stdE7forwardIT0_Efp0_EEEOSP_DpOSQ_:
  133|     32|      -> decltype(P::apply(std::forward<F>(f), std::forward<Ts>(ts)...)) {
  134|     32|    return P::apply(std::forward<F>(f), std::forward<Ts>(ts)...);
  135|     32|  }
_ZN4absl12lts_2024011618container_internal18hash_policy_traitsINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEEvE5applyINS1_12raw_hash_setISB_NS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE11HashElementEJRSK_ESB_EEDTclsrT1_5applyclsr3stdE7forwardIT_Efp_Espclsr3stdE7forwardIT0_Efp0_EEEOSQ_DpOSR_:
  133|     28|      -> decltype(P::apply(std::forward<F>(f), std::forward<Ts>(ts)...)) {
  134|     28|    return P::apply(std::forward<F>(f), std::forward<Ts>(ts)...);
  135|     28|  }

_ZN4absl12lts_2024011618container_internal20HashtablezInfoHandleC2EDn:
  187|      2|  explicit HashtablezInfoHandle(std::nullptr_t) {}
_ZNK4absl12lts_2024011618container_internal20HashtablezInfoHandle9IsSampledEv:
  190|     10|  inline bool IsSampled() const { return false; }
_ZN4absl12lts_2024011618container_internal20HashtablezInfoHandle20RecordStorageChangedEmm:
  191|      2|  inline void RecordStorageChanged(size_t /*size*/, size_t /*capacity*/) {}
_ZN4absl12lts_2024011618container_internal20HashtablezInfoHandle12RecordRehashEm:
  192|      2|  inline void RecordRehash(size_t /*total_probe_length*/) {}
_ZN4absl12lts_2024011618container_internal20HashtablezInfoHandle12RecordInsertEmm:
  195|     32|  inline void RecordInsert(size_t /*hash*/, size_t /*distance_from_desired*/) {}
_ZN4absl12lts_2024011618container_internal6SampleEm:
  210|      2|    size_t inline_element_size ABSL_ATTRIBUTE_UNUSED) {
  211|       |#if defined(ABSL_INTERNAL_HASHTABLEZ_SAMPLE)
  212|       |  if (ABSL_PREDICT_TRUE(--global_next_sample.next_sample > 0)) {
  213|       |    return HashtablezInfoHandle(nullptr);
  214|       |  }
  215|       |  return HashtablezInfoHandle(
  216|       |      SampleSlow(global_next_sample, inline_element_size));
  217|       |#else
  218|      2|  return HashtablezInfoHandle(nullptr);
  219|      2|#endif  // !ABSL_PER_THREAD_TLS
  220|      2|}

_ZN4absl12lts_2024011618container_internal12raw_hash_mapINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEEC2Ev:
   64|      2|  raw_hash_map() {}

_ZN4absl12lts_2024011618container_internal17ClearBackingArrayERNS1_12CommonFieldsERKNS1_15PolicyFunctionsEb:
  256|      2|                       bool reuse) {
  257|      2|  c.set_size(0);
  258|      2|  if (reuse) {
  ------------------
  |  Branch (258:7): [True: 2, False: 0]
  ------------------
  259|      2|    ResetCtrl(c, policy.slot_size);
  260|      2|    ResetGrowthLeft(c);
  261|      2|    c.infoz().RecordStorageChanged(0, c.capacity());
  262|      2|  } else {
  263|       |    // We need to record infoz before calling dealloc, which will unregister
  264|       |    // infoz.
  265|      0|    c.infoz().RecordClearedReservation();
  266|      0|    c.infoz().RecordStorageChanged(0, 0);
  267|      0|    (*policy.dealloc)(c, policy);
  268|      0|    c.set_control(EmptyGroup());
  269|      0|    c.set_generation_ptr(EmptyGeneration());
  270|      0|    c.set_slots(nullptr);
  271|      0|    c.set_capacity(0);
  272|      0|  }
  273|      2|}
_ZNK4absl12lts_2024011618container_internal19HashSetResizeHelper38GrowIntoSingleGroupShuffleControlBytesEPNS1_6ctrl_tEm:
  276|      6|    ctrl_t* new_ctrl, size_t new_capacity) const {
  277|      6|  assert(is_single_group(new_capacity));
  278|      6|  constexpr size_t kHalfWidth = Group::kWidth / 2;
  279|      6|  assert(old_capacity_ < kHalfWidth);
  280|       |
  281|      6|  const size_t half_old_capacity = old_capacity_ / 2;
  282|       |
  283|       |  // NOTE: operations are done with compile time known size = kHalfWidth.
  284|       |  // Compiler optimizes that into single ASM operation.
  285|       |
  286|       |  // Copy second half of bytes to the beginning.
  287|       |  // We potentially copy more bytes in order to have compile time known size.
  288|       |  // Mirrored bytes from the old_ctrl_ will also be copied.
  289|       |  // In case of old_capacity_ == 3, we will copy 1st element twice.
  290|       |  // Examples:
  291|       |  // old_ctrl = 0S0EEEEEEE...
  292|       |  // new_ctrl = S0EEEEEEEE...
  293|       |  //
  294|       |  // old_ctrl = 01S01EEEEE...
  295|       |  // new_ctrl = 1S01EEEEEE...
  296|       |  //
  297|       |  // old_ctrl = 0123456S0123456EE...
  298|       |  // new_ctrl = 456S0123?????????...
  299|      6|  std::memcpy(new_ctrl, old_ctrl_ + half_old_capacity + 1, kHalfWidth);
  300|       |  // Clean up copied kSentinel from old_ctrl.
  301|      6|  new_ctrl[half_old_capacity] = ctrl_t::kEmpty;
  302|       |
  303|       |  // Clean up damaged or uninitialized bytes.
  304|       |
  305|       |  // Clean bytes after the intended size of the copy.
  306|       |  // Example:
  307|       |  // new_ctrl = 1E01EEEEEEE????
  308|       |  // *new_ctrl= 1E0EEEEEEEE????
  309|       |  // position      /
  310|      6|  std::memset(new_ctrl + old_capacity_ + 1, static_cast<int8_t>(ctrl_t::kEmpty),
  311|      6|              kHalfWidth);
  312|       |  // Clean non-mirrored bytes that are not initialized.
  313|       |  // For small old_capacity that may be inside of mirrored bytes zone.
  314|       |  // Examples:
  315|       |  // new_ctrl = 1E0EEEEEEEE??????????....
  316|       |  // *new_ctrl= 1E0EEEEEEEEEEEEE?????....
  317|       |  // position           /
  318|       |  //
  319|       |  // new_ctrl = 456E0123???????????...
  320|       |  // *new_ctrl= 456E0123EEEEEEEE???...
  321|       |  // position           /
  322|      6|  std::memset(new_ctrl + kHalfWidth, static_cast<int8_t>(ctrl_t::kEmpty),
  323|      6|              kHalfWidth);
  324|       |  // Clean last mirrored bytes that are not initialized
  325|       |  // and will not be overwritten by mirroring.
  326|       |  // Examples:
  327|       |  // new_ctrl = 1E0EEEEEEEEEEEEE????????
  328|       |  // *new_ctrl= 1E0EEEEEEEEEEEEEEEEEEEEE
  329|       |  // position           S       /
  330|       |  //
  331|       |  // new_ctrl = 456E0123EEEEEEEE???????????????
  332|       |  // *new_ctrl= 456E0123EEEEEEEE???????EEEEEEEE
  333|       |  // position                  S       /
  334|      6|  std::memset(new_ctrl + new_capacity + kHalfWidth,
  335|      6|              static_cast<int8_t>(ctrl_t::kEmpty), kHalfWidth);
  336|       |
  337|       |  // Create mirrored bytes. old_capacity_ < kHalfWidth
  338|       |  // Example:
  339|       |  // new_ctrl = 456E0123EEEEEEEE???????EEEEEEEE
  340|       |  // *new_ctrl= 456E0123EEEEEEEE456E0123EEEEEEE
  341|       |  // position                  S/
  342|      6|  ctrl_t g[kHalfWidth];
  343|      6|  std::memcpy(g, new_ctrl, kHalfWidth);
  344|      6|  std::memcpy(new_ctrl + new_capacity + 1, g, kHalfWidth);
  345|       |
  346|       |  // Finally set sentinel to its place.
  347|      6|  new_ctrl[new_capacity] = ctrl_t::kSentinel;
  348|      6|}
_ZNK4absl12lts_2024011618container_internal19HashSetResizeHelper43GrowIntoSingleGroupShuffleTransferableSlotsEPvS3_m:
  351|      6|    void* old_slots, void* new_slots, size_t slot_size) const {
  352|      6|  assert(old_capacity_ > 0);
  353|      6|  const size_t half_old_capacity = old_capacity_ / 2;
  354|       |
  355|      6|  SanitizerUnpoisonMemoryRegion(old_slots, slot_size * old_capacity_);
  356|      6|  std::memcpy(new_slots,
  357|      6|              SlotAddress(old_slots, half_old_capacity + 1, slot_size),
  358|      6|              slot_size * half_old_capacity);
  359|      6|  std::memcpy(SlotAddress(new_slots, half_old_capacity + 1, slot_size),
  360|      6|              old_slots, slot_size * (half_old_capacity + 1));
  361|      6|}
_ZN4absl12lts_2024011618container_internal19HashSetResizeHelper35GrowSizeIntoSingleGroupTransferableERNS1_12CommonFieldsEPvm:
  364|      6|    CommonFields& c, void* old_slots, size_t slot_size) {
  365|      6|  assert(old_capacity_ < Group::kWidth / 2);
  366|      6|  assert(is_single_group(c.capacity()));
  367|      6|  assert(IsGrowingIntoSingleGroupApplicable(old_capacity_, c.capacity()));
  368|       |
  369|      6|  GrowIntoSingleGroupShuffleControlBytes(c.control(), c.capacity());
  370|      6|  GrowIntoSingleGroupShuffleTransferableSlots(old_slots, c.slot_array(),
  371|      6|                                              slot_size);
  372|       |
  373|       |  // We poison since GrowIntoSingleGroupShuffleTransferableSlots
  374|       |  // may leave empty slots unpoisoned.
  375|      6|  PoisonSingleGroupEmptySlots(c, slot_size);
  376|      6|}

_ZN4absl12lts_2024011618container_internal13GroupSse2ImplC2EPKNS1_6ctrl_tE:
  615|    107|  explicit GroupSse2Impl(const ctrl_t* pos) {
  616|    107|    ctrl = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));
  617|    107|  }
_ZNK4absl12lts_2024011618container_internal13GroupSse2Impl5MatchEh:
  620|     32|  BitMask<uint16_t, kWidth> Match(h2_t hash) const {
  621|     32|    auto match = _mm_set1_epi8(static_cast<char>(hash));
  622|     32|    BitMask<uint16_t, kWidth> result = BitMask<uint16_t, kWidth>(0);
  623|     32|    result = BitMask<uint16_t, kWidth>(
  624|     32|        static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(match, ctrl))));
  625|     32|    return result;
  626|     32|  }
_ZNK4absl12lts_2024011618container_internal13GroupSse2Impl9MaskEmptyEv:
  629|     32|  NonIterableBitMask<uint16_t, kWidth> MaskEmpty() const {
  630|       |#ifdef ABSL_INTERNAL_HAVE_SSSE3
  631|       |    // This only works because ctrl_t::kEmpty is -128.
  632|       |    return NonIterableBitMask<uint16_t, kWidth>(
  633|       |        static_cast<uint16_t>(_mm_movemask_epi8(_mm_sign_epi8(ctrl, ctrl))));
  634|       |#else
  635|     32|    auto match = _mm_set1_epi8(static_cast<char>(ctrl_t::kEmpty));
  636|     32|    return NonIterableBitMask<uint16_t, kWidth>(
  637|     32|        static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(match, ctrl))));
  638|     32|#endif
  639|     32|  }
_ZNK4absl12lts_2024011618container_internal13GroupSse2Impl18MaskEmptyOrDeletedEv:
  650|     62|  NonIterableBitMask<uint16_t, kWidth> MaskEmptyOrDeleted() const {
  651|     62|    auto special = _mm_set1_epi8(static_cast<char>(ctrl_t::kSentinel));
  652|     62|    return NonIterableBitMask<uint16_t, kWidth>(static_cast<uint16_t>(
  653|     62|        _mm_movemask_epi8(_mm_cmpgt_epi8_fixed(special, ctrl))));
  654|     62|  }
_ZNK4absl12lts_2024011618container_internal13GroupSse2Impl26CountLeadingEmptyOrDeletedEv:
  657|     13|  uint32_t CountLeadingEmptyOrDeleted() const {
  658|     13|    auto special = _mm_set1_epi8(static_cast<char>(ctrl_t::kSentinel));
  659|     13|    return TrailingZeros(static_cast<uint32_t>(
  660|     13|        _mm_movemask_epi8(_mm_cmpgt_epi8_fixed(special, ctrl)) + 1));
  661|     13|  }
_ZNK4absl12lts_2024011618container_internal34CommonFieldsGenerationInfoDisabled41should_rehash_for_bug_detection_on_insertEPKNS1_6ctrl_tEm:
  926|     32|  bool should_rehash_for_bug_detection_on_insert(const ctrl_t*, size_t) const {
  927|     32|    return false;
  928|     32|  }
_ZN4absl12lts_2024011618container_internal34CommonFieldsGenerationInfoDisabled36maybe_increment_generation_on_insertEv:
  932|     32|  void maybe_increment_generation_on_insert() {}
_ZN4absl12lts_2024011618container_internal34CommonFieldsGenerationInfoDisabled19set_reserved_growthEm:
  936|      2|  void set_reserved_growth(size_t) {}
_ZN4absl12lts_2024011618container_internal34CommonFieldsGenerationInfoDisabled20set_reservation_sizeEm:
  938|      2|  void set_reservation_size(size_t) {}
_ZNK4absl12lts_2024011618container_internal34CommonFieldsGenerationInfoDisabled10generationEv:
  939|     10|  GenerationType generation() const { return 0; }
_ZN4absl12lts_2024011618container_internal34CommonFieldsGenerationInfoDisabled14set_generationEh:
  940|     10|  void set_generation(GenerationType) {}
_ZNK4absl12lts_2024011618container_internal34CommonFieldsGenerationInfoDisabled14generation_ptrEv:
  941|     36|  GenerationType* generation_ptr() const { return nullptr; }
_ZN4absl12lts_2024011618container_internal34CommonFieldsGenerationInfoDisabled18set_generation_ptrEPh:
  942|     10|  void set_generation_ptr(GenerationType*) {}
_ZN4absl12lts_2024011618container_internal37HashSetIteratorGenerationInfoDisabledC2EPKh:
  965|     36|  explicit HashSetIteratorGenerationInfoDisabled(const GenerationType*) {}
_ZNK4absl12lts_2024011618container_internal37HashSetIteratorGenerationInfoDisabled10generationEv:
  967|    132|  GenerationType generation() const { return 0; }
_ZNK4absl12lts_2024011618container_internal37HashSetIteratorGenerationInfoDisabled14generation_ptrEv:
  969|    200|  const GenerationType* generation_ptr() const { return nullptr; }
_ZNK4absl12lts_2024011618container_internal12CommonFields7controlEv:
 1038|    578|  ctrl_t* control() const { return control_; }
_ZN4absl12lts_2024011618container_internal12CommonFields11set_controlEPNS1_6ctrl_tE:
 1039|     10|  void set_control(ctrl_t* c) { control_ = c; }
_ZNK4absl12lts_2024011618container_internal12CommonFields10slot_arrayEv:
 1047|    190|  void* slot_array() const { return slots_; }
_ZN4absl12lts_2024011618container_internal12CommonFields9set_slotsEPv:
 1048|     10|  void set_slots(void* s) { slots_ = s; }
_ZNK4absl12lts_2024011618container_internal12CommonFields4sizeEv:
 1051|     24|  size_t size() const { return size_ >> HasInfozShift(); }
_ZN4absl12lts_2024011618container_internal12CommonFields8set_sizeEm:
 1052|      2|  void set_size(size_t s) {
 1053|      2|    size_ = (s << HasInfozShift()) | (size_ & HasInfozMask());
 1054|      2|  }
_ZN4absl12lts_2024011618container_internal12CommonFields14increment_sizeEv:
 1055|     32|  void increment_size() {
 1056|       |    assert(size() < capacity());
 1057|     32|    size_ += size_t{1} << HasInfozShift();
 1058|     32|  }
_ZNK4absl12lts_2024011618container_internal12CommonFields8capacityEv:
 1065|    340|  size_t capacity() const { return capacity_; }
_ZN4absl12lts_2024011618container_internal12CommonFields12set_capacityEm:
 1066|     10|  void set_capacity(size_t c) {
 1067|       |    assert(c == 0 || IsValidCapacity(c));
 1068|     10|    capacity_ = c;
 1069|     10|  }
_ZNK4absl12lts_2024011618container_internal12CommonFields11growth_leftEv:
 1073|     64|  size_t growth_left() const {
 1074|     64|    const size_t* gl_ptr = reinterpret_cast<size_t*>(control()) - 1;
 1075|       |    assert(reinterpret_cast<uintptr_t>(gl_ptr) % alignof(size_t) == 0);
 1076|     64|    return *gl_ptr;
 1077|     64|  }
_ZN4absl12lts_2024011618container_internal12CommonFields15set_growth_leftEm:
 1078|     44|  void set_growth_left(size_t gl) {
 1079|     44|    size_t* gl_ptr = reinterpret_cast<size_t*>(control()) - 1;
 1080|       |    assert(reinterpret_cast<uintptr_t>(gl_ptr) % alignof(size_t) == 0);
 1081|     44|    *gl_ptr = gl;
 1082|     44|  }
_ZNK4absl12lts_2024011618container_internal12CommonFields9has_infozEv:
 1084|     54|  bool has_infoz() const {
 1085|     54|    return ABSL_PREDICT_FALSE((size_ & HasInfozMask()) != 0);
  ------------------
  |  |  178|     54|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:49): [Folded, False: 54]
  |  |  |  Branch (178:58): [True: 0, False: 54]
  |  |  ------------------
  ------------------
 1086|     54|  }
_ZN4absl12lts_2024011618container_internal12CommonFields13set_has_infozEb:
 1087|     10|  void set_has_infoz(bool has_infoz) {
 1088|     10|    size_ = (size() << HasInfozShift()) | static_cast<size_t>(has_infoz);
 1089|     10|  }
_ZN4absl12lts_2024011618container_internal12CommonFields5infozEv:
 1091|     44|  HashtablezInfoHandle infoz() {
 1092|     44|    return has_infoz()
  ------------------
  |  Branch (1092:12): [True: 0, False: 44]
  ------------------
 1093|     44|               ? *reinterpret_cast<HashtablezInfoHandle*>(backing_array_start())
 1094|     44|               : HashtablezInfoHandle();
 1095|     44|  }
_ZNK4absl12lts_2024011618container_internal12CommonFields41should_rehash_for_bug_detection_on_insertEv:
 1101|     32|  bool should_rehash_for_bug_detection_on_insert() const {
 1102|     32|    return CommonFieldsGenerationInfo::
 1103|     32|        should_rehash_for_bug_detection_on_insert(control(), capacity());
 1104|     32|  }
_ZN4absl12lts_2024011618container_internal12CommonFields13HasInfozShiftEv:
 1130|    124|  static constexpr size_t HasInfozShift() { return 1; }
_ZN4absl12lts_2024011618container_internal12CommonFields12HasInfozMaskEv:
 1131|     56|  static constexpr size_t HasInfozMask() {
 1132|     56|    return (size_t{1} << HasInfozShift()) - 1;
 1133|     56|  }
_ZN4absl12lts_2024011618container_internal19HashSetResizeHelperC2ERNS1_12CommonFieldsE:
 1525|     10|      : old_ctrl_(c.control()),
 1526|     10|        old_capacity_(c.capacity()),
 1527|     10|        had_infoz_(c.has_infoz()) {}
_ZN4absl12lts_2024011618container_internal19HashSetResizeHelper27FindFirstNonFullAfterResizeERKNS1_12CommonFieldsEmm:
 1537|     10|                                              size_t hash) {
 1538|     10|    if (!IsGrowingIntoSingleGroupApplicable(old_capacity, c.capacity())) {
  ------------------
  |  Branch (1538:9): [True: 2, False: 8]
  ------------------
 1539|      2|      return find_first_non_full(c, hash);
 1540|      2|    }
 1541|       |    // Find a location for the new element non-deterministically.
 1542|       |    // Note that any position is correct.
 1543|       |    // It will located at `half_old_capacity` or one of the other
 1544|       |    // empty slots with approximately 50% probability each.
 1545|      8|    size_t offset = probe(c, hash).offset();
 1546|       |
 1547|       |    // Note that we intentionally use unsigned int underflow.
 1548|      8|    if (offset - (old_capacity + 1) >= old_capacity) {
  ------------------
  |  Branch (1548:9): [True: 5, False: 3]
  ------------------
 1549|       |      // Offset fall on kSentinel or into the mostly occupied first half.
 1550|      5|      offset = old_capacity / 2;
 1551|      5|    }
 1552|       |    assert(IsEmpty(c.control()[offset]));
 1553|      8|    return FindInfo{offset, 0};
 1554|     10|  }
_ZNK4absl12lts_2024011618container_internal19HashSetResizeHelper8old_ctrlEv:
 1556|     30|  ctrl_t* old_ctrl() const { return old_ctrl_; }
_ZNK4absl12lts_2024011618container_internal19HashSetResizeHelper12old_capacityEv:
 1557|     42|  size_t old_capacity() const { return old_capacity_; }
_ZN4absl12lts_2024011618container_internal19HashSetResizeHelper34IsGrowingIntoSingleGroupApplicableEmm:
 1686|     20|                                                 size_t new_capacity) {
 1687|       |    // NOTE that `old_capacity < new_capacity` in order to have
 1688|       |    // `old_capacity < Group::kWidth / 2` to make faster copies of 8 bytes.
 1689|     20|    return is_single_group(new_capacity) && old_capacity < new_capacity;
  ------------------
  |  Branch (1689:12): [True: 16, False: 4]
  |  Branch (1689:45): [True: 16, False: 0]
  ------------------
 1690|     20|  }
_ZNK4absl12lts_2024011618container_internal19HashSetResizeHelper27PoisonSingleGroupEmptySlotsERNS1_12CommonFieldsEm:
 1753|      6|  void PoisonSingleGroupEmptySlots(CommonFields& c, size_t slot_size) const {
 1754|       |    // poison non full items
 1755|     56|    for (size_t i = 0; i < c.capacity(); ++i) {
  ------------------
  |  Branch (1755:24): [True: 50, False: 6]
  ------------------
 1756|     50|      if (!IsFull(c.control()[i])) {
  ------------------
  |  Branch (1756:11): [True: 28, False: 22]
  ------------------
 1757|     28|        SanitizerPoisonMemoryRegion(SlotAddress(c.slot_array(), i, slot_size),
 1758|     28|                                    slot_size);
 1759|     28|      }
 1760|     50|    }
 1761|      6|  }
_ZN4absl12lts_2024011618container_internal23SentinelEmptyGenerationEv:
  252|     10|constexpr GenerationType SentinelEmptyGeneration() { return 0; }
_ZN4absl12lts_2024011618container_internal14NextGenerationEh:
  254|     10|constexpr GenerationType NextGeneration(GenerationType generation) {
  255|     10|  return ++generation == SentinelEmptyGeneration() ? ++generation : generation;
  ------------------
  |  Branch (255:10): [True: 0, False: 10]
  ------------------
  256|     10|}
_ZN4absl12lts_2024011618container_internal18NumGenerationBytesEv:
  263|     28|constexpr size_t NumGenerationBytes() { return 0; }
_ZN4absl12lts_2024011618container_internal10EmptyGroupEv:
  521|      2|inline ctrl_t* EmptyGroup() {
  522|       |  // Const must be cast away here; no uses of this function will actually write
  523|       |  // to it, because it is only used for empty tables.
  524|      2|  return const_cast<ctrl_t*>(kEmptyGroup + 16);
  525|      2|}
_ZN4absl12lts_2024011618container_internal12PerTableSaltEPKNS1_6ctrl_tE:
  545|    102|inline size_t PerTableSalt(const ctrl_t* ctrl) {
  546|       |  // The low bits of the pointer have little or no entropy because of
  547|       |  // alignment. We shift the pointer to try to use higher entropy bits. A
  548|       |  // good number seems to be 12 bits, because that aligns with page size.
  549|    102|  return reinterpret_cast<uintptr_t>(ctrl) >> 12;
  550|    102|}
_ZN4absl12lts_2024011618container_internal2H1EmPKNS1_6ctrl_tE:
  552|    102|inline size_t H1(size_t hash, const ctrl_t* ctrl) {
  553|    102|  return (hash >> 7) ^ PerTableSalt(ctrl);
  554|    102|}
_ZN4absl12lts_2024011618container_internal2H2Em:
  559|     92|inline h2_t H2(size_t hash) { return hash & 0x7F; }
_ZN4absl12lts_2024011618container_internal7IsEmptyENS1_6ctrl_tE:
  562|     32|inline bool IsEmpty(ctrl_t c) { return c == ctrl_t::kEmpty; }
_ZN4absl12lts_2024011618container_internal6IsFullENS1_6ctrl_tE:
  563|    202|inline bool IsFull(ctrl_t c) { return c >= static_cast<ctrl_t>(0); }
_ZN4absl12lts_2024011618container_internal9IsDeletedENS1_6ctrl_tE:
  564|     10|inline bool IsDeleted(ctrl_t c) { return c == ctrl_t::kDeleted; }
_ZN4absl12lts_2024011618container_internal16IsEmptyOrDeletedENS1_6ctrl_tE:
  565|     47|inline bool IsEmptyOrDeleted(ctrl_t c) { return c < ctrl_t::kSentinel; }
_ZN4absl12lts_2024011618container_internal20_mm_cmpgt_epi8_fixedEDv2_xS2_:
  601|     75|inline __m128i _mm_cmpgt_epi8_fixed(__m128i a, __m128i b) {
  602|       |#if defined(__GNUC__) && !defined(__clang__)
  603|       |  if (std::is_unsigned<char>::value) {
  604|       |    const __m128i mask = _mm_set1_epi8(0x80);
  605|       |    const __m128i diff = _mm_subs_epi8(b, a);
  606|       |    return _mm_cmpeq_epi8(_mm_and_si128(diff, mask), mask);
  607|       |  }
  608|       |#endif
  609|     75|  return _mm_cmpgt_epi8(a, b);
  610|     75|}
_ZN4absl12lts_2024011618container_internal13ControlOffsetEb:
  988|     56|inline size_t ControlOffset(bool has_infoz) {
  989|     56|  return (has_infoz ? sizeof(HashtablezInfoHandle) : 0) + sizeof(size_t);
  ------------------
  |  Branch (989:11): [True: 0, False: 56]
  ------------------
  990|     56|}
_ZN4absl12lts_2024011618container_internal14NumClonedBytesEv:
  997|    164|constexpr size_t NumClonedBytes() { return Group::kWidth - 1; }
_ZN4absl12lts_2024011618container_internal16GenerationOffsetEmb:
 1001|     38|inline size_t GenerationOffset(size_t capacity, bool has_infoz) {
 1002|       |  assert(IsValidCapacity(capacity));
 1003|     38|  const size_t num_control_bytes = capacity + 1 + NumClonedBytes();
 1004|     38|  return ControlOffset(has_infoz) + num_control_bytes;
 1005|     38|}
_ZN4absl12lts_2024011618container_internal10SlotOffsetEmmb:
 1009|     28|inline size_t SlotOffset(size_t capacity, size_t slot_align, bool has_infoz) {
 1010|       |  assert(IsValidCapacity(capacity));
 1011|     28|  return (GenerationOffset(capacity, has_infoz) + NumGenerationBytes() +
 1012|     28|          slot_align - 1) &
 1013|     28|         (~slot_align + 1);
 1014|     28|}
_ZN4absl12lts_2024011618container_internal9AllocSizeEmmmb:
 1019|     18|                        bool has_infoz) {
 1020|     18|  return SlotOffset(capacity, slot_align, has_infoz) + capacity * slot_size;
 1021|     18|}
_ZN4absl12lts_2024011618container_internal12NextCapacityEm:
 1167|     10|inline size_t NextCapacity(size_t n) {
 1168|       |  assert(IsValidCapacity(n) || n == 0);
 1169|     10|  return n * 2 + 1;
 1170|     10|}
_ZN4absl12lts_2024011618container_internal16CapacityToGrowthEm:
 1197|     12|inline size_t CapacityToGrowth(size_t capacity) {
 1198|     12|  assert(IsValidCapacity(capacity));
 1199|       |  // `capacity*7/8`
 1200|     12|  if (Group::kWidth == 8 && capacity == 7) {
  ------------------
  |  Branch (1200:7): [Folded, False: 12]
  |  Branch (1200:29): [True: 0, False: 0]
  ------------------
 1201|       |    // x-x/8 does not work when x==7.
 1202|      0|    return 6;
 1203|      0|  }
 1204|     12|  return capacity - capacity / 8;
 1205|     12|}
_ZN4absl12lts_2024011618container_internal12AssertIsFullEPKNS1_6ctrl_tEhPKhPKc:
 1248|     64|                         const char* operation) {
 1249|     64|  if (!SwisstableDebugEnabled()) return;
  ------------------
  |  Branch (1249:7): [True: 64, Folded]
  ------------------
 1250|       |  // `SwisstableDebugEnabled()` is also true for release builds with hardening
 1251|       |  // enabled. To minimize their impact in those builds:
 1252|       |  // - use `ABSL_PREDICT_FALSE()` to provide a compiler hint for code layout
 1253|       |  // - use `ABSL_RAW_LOG()` with a format string to reduce code size and improve
 1254|       |  //   the chances that the hot paths will be inlined.
 1255|      0|  if (ABSL_PREDICT_FALSE(ctrl == nullptr)) {
  ------------------
  |  |  178|      0|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 0]
  |  |  |  Branch (178:58): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1256|      0|    ABSL_RAW_LOG(FATAL, "%s called on end() iterator.", operation);
  ------------------
  |  |   45|      0|  do {                                                                         \
  |  |   46|      0|    constexpr const char* absl_raw_log_internal_basename =                     \
  |  |   47|      0|        ::absl::raw_log_internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
  |  |   48|      0|    ::absl::raw_log_internal::RawLog(ABSL_RAW_LOG_INTERNAL_##severity,         \
  |  |  ------------------
  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  ------------------
  |  |   49|      0|                                     absl_raw_log_internal_basename, __LINE__, \
  |  |   50|      0|                                     __VA_ARGS__);                             \
  |  |   51|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                        \
  |  |  ------------------
  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  ------------------
  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   52|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (52:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1257|      0|  }
 1258|      0|  if (ABSL_PREDICT_FALSE(ctrl == EmptyGroup())) {
  ------------------
  |  |  178|      0|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 0]
  |  |  |  Branch (178:58): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1259|      0|    ABSL_RAW_LOG(FATAL, "%s called on default-constructed iterator.",
  ------------------
  |  |   45|      0|  do {                                                                         \
  |  |   46|      0|    constexpr const char* absl_raw_log_internal_basename =                     \
  |  |   47|      0|        ::absl::raw_log_internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
  |  |   48|      0|    ::absl::raw_log_internal::RawLog(ABSL_RAW_LOG_INTERNAL_##severity,         \
  |  |  ------------------
  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  ------------------
  |  |   49|      0|                                     absl_raw_log_internal_basename, __LINE__, \
  |  |   50|      0|                                     __VA_ARGS__);                             \
  |  |   51|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                        \
  |  |  ------------------
  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  ------------------
  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   52|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (52:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1260|      0|                 operation);
 1261|      0|  }
 1262|      0|  if (SwisstableGenerationsEnabled()) {
  ------------------
  |  Branch (1262:7): [Folded, False: 0]
  ------------------
 1263|      0|    if (ABSL_PREDICT_FALSE(generation != *generation_ptr)) {
  ------------------
  |  |  178|      0|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 0]
  |  |  |  Branch (178:58): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1264|      0|      ABSL_RAW_LOG(FATAL,
  ------------------
  |  |   45|      0|  do {                                                                         \
  |  |   46|      0|    constexpr const char* absl_raw_log_internal_basename =                     \
  |  |   47|      0|        ::absl::raw_log_internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
  |  |   48|      0|    ::absl::raw_log_internal::RawLog(ABSL_RAW_LOG_INTERNAL_##severity,         \
  |  |  ------------------
  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  ------------------
  |  |   49|      0|                                     absl_raw_log_internal_basename, __LINE__, \
  |  |   50|      0|                                     __VA_ARGS__);                             \
  |  |   51|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                        \
  |  |  ------------------
  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  ------------------
  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   52|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (52:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1265|      0|                   "%s called on invalid iterator. The table could have "
 1266|      0|                   "rehashed or moved since this iterator was initialized.",
 1267|      0|                   operation);
 1268|      0|    }
 1269|      0|    if (ABSL_PREDICT_FALSE(!IsFull(*ctrl))) {
  ------------------
  |  |  178|      0|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 0]
  |  |  |  Branch (178:58): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1270|      0|      ABSL_RAW_LOG(
  ------------------
  |  |   45|      0|  do {                                                                         \
  |  |   46|      0|    constexpr const char* absl_raw_log_internal_basename =                     \
  |  |   47|      0|        ::absl::raw_log_internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
  |  |   48|      0|    ::absl::raw_log_internal::RawLog(ABSL_RAW_LOG_INTERNAL_##severity,         \
  |  |  ------------------
  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  ------------------
  |  |   49|      0|                                     absl_raw_log_internal_basename, __LINE__, \
  |  |   50|      0|                                     __VA_ARGS__);                             \
  |  |   51|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                        \
  |  |  ------------------
  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  ------------------
  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   52|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (52:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1271|      0|          FATAL,
 1272|      0|          "%s called on invalid iterator. The element was likely erased.",
 1273|      0|          operation);
 1274|      0|    }
 1275|      0|  } else {
 1276|      0|    if (ABSL_PREDICT_FALSE(!IsFull(*ctrl))) {
  ------------------
  |  |  178|      0|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 0]
  |  |  |  Branch (178:58): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1277|       |      ABSL_RAW_LOG(
  ------------------
  |  |   45|      0|  do {                                                                         \
  |  |   46|      0|    constexpr const char* absl_raw_log_internal_basename =                     \
  |  |   47|      0|        ::absl::raw_log_internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
  |  |   48|      0|    ::absl::raw_log_internal::RawLog(ABSL_RAW_LOG_INTERNAL_##severity,         \
  |  |  ------------------
  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  ------------------
  |  |   49|      0|                                     absl_raw_log_internal_basename, __LINE__, \
  |  |   50|      0|                                     __VA_ARGS__);                             \
  |  |   51|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                        \
  |  |  ------------------
  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  ------------------
  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   52|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (52:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1278|      0|          FATAL,
 1279|      0|          "%s called on invalid iterator. The element might have been erased "
 1280|      0|          "or the table might have rehashed. Consider running with "
 1281|      0|          "--config=asan to diagnose rehashing issues.",
 1282|      0|          operation);
 1283|      0|    }
 1284|      0|  }
 1285|      0|}
_ZN4absl12lts_2024011618container_internal26AssertIsValidForComparisonEPKNS1_6ctrl_tEhPKh:
 1290|     68|                                       const GenerationType* generation_ptr) {
 1291|     68|  if (!SwisstableDebugEnabled()) return;
  ------------------
  |  Branch (1291:7): [True: 68, Folded]
  ------------------
 1292|      0|  const bool ctrl_is_valid_for_comparison =
 1293|      0|      ctrl == nullptr || ctrl == EmptyGroup() || IsFull(*ctrl);
  ------------------
  |  Branch (1293:7): [True: 0, False: 0]
  |  Branch (1293:26): [True: 0, False: 0]
  |  Branch (1293:50): [True: 0, False: 0]
  ------------------
 1294|      0|  if (SwisstableGenerationsEnabled()) {
  ------------------
  |  Branch (1294:7): [Folded, False: 0]
  ------------------
 1295|      0|    if (ABSL_PREDICT_FALSE(generation != *generation_ptr)) {
  ------------------
  |  |  178|      0|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 0]
  |  |  |  Branch (178:58): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1296|      0|      ABSL_RAW_LOG(FATAL,
  ------------------
  |  |   45|      0|  do {                                                                         \
  |  |   46|      0|    constexpr const char* absl_raw_log_internal_basename =                     \
  |  |   47|      0|        ::absl::raw_log_internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
  |  |   48|      0|    ::absl::raw_log_internal::RawLog(ABSL_RAW_LOG_INTERNAL_##severity,         \
  |  |  ------------------
  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  ------------------
  |  |   49|      0|                                     absl_raw_log_internal_basename, __LINE__, \
  |  |   50|      0|                                     __VA_ARGS__);                             \
  |  |   51|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                        \
  |  |  ------------------
  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  ------------------
  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   52|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (52:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1297|      0|                   "Invalid iterator comparison. The table could have rehashed "
 1298|      0|                   "or moved since this iterator was initialized.");
 1299|      0|    }
 1300|      0|    if (ABSL_PREDICT_FALSE(!ctrl_is_valid_for_comparison)) {
  ------------------
  |  |  178|      0|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 0]
  |  |  |  Branch (178:58): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1301|      0|      ABSL_RAW_LOG(
  ------------------
  |  |   45|      0|  do {                                                                         \
  |  |   46|      0|    constexpr const char* absl_raw_log_internal_basename =                     \
  |  |   47|      0|        ::absl::raw_log_internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
  |  |   48|      0|    ::absl::raw_log_internal::RawLog(ABSL_RAW_LOG_INTERNAL_##severity,         \
  |  |  ------------------
  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  ------------------
  |  |   49|      0|                                     absl_raw_log_internal_basename, __LINE__, \
  |  |   50|      0|                                     __VA_ARGS__);                             \
  |  |   51|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                        \
  |  |  ------------------
  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  ------------------
  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   52|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (52:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1302|      0|          FATAL, "Invalid iterator comparison. The element was likely erased.");
 1303|      0|    }
 1304|      0|  } else {
 1305|      0|    ABSL_HARDENING_ASSERT(
  ------------------
  |  |  128|      0|#define ABSL_HARDENING_ASSERT(expr) ABSL_ASSERT(expr)
  |  |  ------------------
  |  |  |  |   95|      0|  (false ? static_cast<void>(expr) : static_cast<void>(0))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (95:4): [Folded, False: 0]
  |  |  |  |  |  Branch (95:30): [True: 0, False: 0]
  |  |  |  |  |  Branch (95:30): [True: 0, Folded]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1306|      0|        ctrl_is_valid_for_comparison &&
 1307|      0|        "Invalid iterator comparison. The element might have been erased or "
 1308|      0|        "the table might have rehashed. Consider running with --config=asan to "
 1309|      0|        "diagnose rehashing issues.");
 1310|      0|  }
 1311|      0|}
_ZN4absl12lts_2024011618container_internal19AssertSameContainerEPKNS1_6ctrl_tES4_RKPKvS8_PKhSA_:
 1339|     34|                                const GenerationType* generation_ptr_b) {
 1340|     34|  if (!SwisstableDebugEnabled()) return;
  ------------------
  |  Branch (1340:7): [True: 34, Folded]
  ------------------
 1341|       |  // `SwisstableDebugEnabled()` is also true for release builds with hardening
 1342|       |  // enabled. To minimize their impact in those builds:
 1343|       |  // - use `ABSL_PREDICT_FALSE()` to provide a compiler hint for code layout
 1344|       |  // - use `ABSL_RAW_LOG()` with a format string to reduce code size and improve
 1345|       |  //   the chances that the hot paths will be inlined.
 1346|      0|  const bool a_is_default = ctrl_a == EmptyGroup();
 1347|      0|  const bool b_is_default = ctrl_b == EmptyGroup();
 1348|      0|  if (ABSL_PREDICT_FALSE(a_is_default != b_is_default)) {
  ------------------
  |  |  178|      0|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 0]
  |  |  |  Branch (178:58): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1349|      0|    ABSL_RAW_LOG(
  ------------------
  |  |   45|      0|  do {                                                                         \
  |  |   46|      0|    constexpr const char* absl_raw_log_internal_basename =                     \
  |  |   47|      0|        ::absl::raw_log_internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
  |  |   48|      0|    ::absl::raw_log_internal::RawLog(ABSL_RAW_LOG_INTERNAL_##severity,         \
  |  |  ------------------
  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  ------------------
  |  |   49|      0|                                     absl_raw_log_internal_basename, __LINE__, \
  |  |   50|      0|                                     __VA_ARGS__);                             \
  |  |   51|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                        \
  |  |  ------------------
  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  ------------------
  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   52|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (52:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1350|      0|        FATAL,
 1351|      0|        "Invalid iterator comparison. Comparing default-constructed iterator "
 1352|      0|        "with non-default-constructed iterator.");
 1353|      0|  }
 1354|      0|  if (a_is_default && b_is_default) return;
  ------------------
  |  Branch (1354:7): [True: 0, False: 0]
  |  Branch (1354:23): [True: 0, False: 0]
  ------------------
 1355|       |
 1356|      0|  if (SwisstableGenerationsEnabled()) {
  ------------------
  |  Branch (1356:7): [Folded, False: 0]
  ------------------
 1357|      0|    if (ABSL_PREDICT_TRUE(generation_ptr_a == generation_ptr_b)) return;
  ------------------
  |  |  179|      0|#define ABSL_PREDICT_TRUE(x) (__builtin_expect(false || (x), true))
  |  |  ------------------
  |  |  |  Branch (179:30): [True: 0, False: 0]
  |  |  |  Branch (179:48): [Folded, False: 0]
  |  |  |  Branch (179:57): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1358|      0|    const bool a_is_empty = IsEmptyGeneration(generation_ptr_a);
 1359|      0|    const bool b_is_empty = IsEmptyGeneration(generation_ptr_b);
 1360|      0|    if (a_is_empty != b_is_empty) {
  ------------------
  |  Branch (1360:9): [True: 0, False: 0]
  ------------------
 1361|      0|      ABSL_RAW_LOG(FATAL,
  ------------------
  |  |   45|      0|  do {                                                                         \
  |  |   46|      0|    constexpr const char* absl_raw_log_internal_basename =                     \
  |  |   47|      0|        ::absl::raw_log_internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
  |  |   48|      0|    ::absl::raw_log_internal::RawLog(ABSL_RAW_LOG_INTERNAL_##severity,         \
  |  |  ------------------
  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  ------------------
  |  |   49|      0|                                     absl_raw_log_internal_basename, __LINE__, \
  |  |   50|      0|                                     __VA_ARGS__);                             \
  |  |   51|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                        \
  |  |  ------------------
  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  ------------------
  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   52|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (52:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1362|      0|                   "Invalid iterator comparison. Comparing iterator from a "
 1363|      0|                   "non-empty hashtable with an iterator from an empty "
 1364|      0|                   "hashtable.");
 1365|      0|    }
 1366|      0|    if (a_is_empty && b_is_empty) {
  ------------------
  |  Branch (1366:9): [True: 0, False: 0]
  |  Branch (1366:23): [True: 0, False: 0]
  ------------------
 1367|      0|      ABSL_RAW_LOG(FATAL,
  ------------------
  |  |   45|      0|  do {                                                                         \
  |  |   46|      0|    constexpr const char* absl_raw_log_internal_basename =                     \
  |  |   47|      0|        ::absl::raw_log_internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
  |  |   48|      0|    ::absl::raw_log_internal::RawLog(ABSL_RAW_LOG_INTERNAL_##severity,         \
  |  |  ------------------
  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  ------------------
  |  |   49|      0|                                     absl_raw_log_internal_basename, __LINE__, \
  |  |   50|      0|                                     __VA_ARGS__);                             \
  |  |   51|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                        \
  |  |  ------------------
  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  ------------------
  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   52|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (52:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1368|      0|                   "Invalid iterator comparison. Comparing iterators from "
 1369|      0|                   "different empty hashtables.");
 1370|      0|    }
 1371|      0|    const bool a_is_end = ctrl_a == nullptr;
 1372|      0|    const bool b_is_end = ctrl_b == nullptr;
 1373|      0|    if (a_is_end || b_is_end) {
  ------------------
  |  Branch (1373:9): [True: 0, False: 0]
  |  Branch (1373:21): [True: 0, False: 0]
  ------------------
 1374|      0|      ABSL_RAW_LOG(FATAL,
  ------------------
  |  |   45|      0|  do {                                                                         \
  |  |   46|      0|    constexpr const char* absl_raw_log_internal_basename =                     \
  |  |   47|      0|        ::absl::raw_log_internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
  |  |   48|      0|    ::absl::raw_log_internal::RawLog(ABSL_RAW_LOG_INTERNAL_##severity,         \
  |  |  ------------------
  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  ------------------
  |  |   49|      0|                                     absl_raw_log_internal_basename, __LINE__, \
  |  |   50|      0|                                     __VA_ARGS__);                             \
  |  |   51|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                        \
  |  |  ------------------
  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  ------------------
  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   52|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (52:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1375|      0|                   "Invalid iterator comparison. Comparing iterator with an "
 1376|      0|                   "end() iterator from a different hashtable.");
 1377|      0|    }
 1378|      0|    ABSL_RAW_LOG(FATAL,
  ------------------
  |  |   45|      0|  do {                                                                         \
  |  |   46|      0|    constexpr const char* absl_raw_log_internal_basename =                     \
  |  |   47|      0|        ::absl::raw_log_internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
  |  |   48|      0|    ::absl::raw_log_internal::RawLog(ABSL_RAW_LOG_INTERNAL_##severity,         \
  |  |  ------------------
  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  ------------------
  |  |   49|      0|                                     absl_raw_log_internal_basename, __LINE__, \
  |  |   50|      0|                                     __VA_ARGS__);                             \
  |  |   51|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                        \
  |  |  ------------------
  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  ------------------
  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   52|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (52:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1379|      0|                 "Invalid iterator comparison. Comparing non-end() iterators "
 1380|      0|                 "from different hashtables.");
 1381|      0|  } else {
 1382|      0|    ABSL_HARDENING_ASSERT(
  ------------------
  |  |  128|      0|#define ABSL_HARDENING_ASSERT(expr) ABSL_ASSERT(expr)
  |  |  ------------------
  |  |  |  |   95|      0|  (false ? static_cast<void>(expr) : static_cast<void>(0))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (95:4): [Folded, False: 0]
  |  |  |  |  |  Branch (95:30): [True: 0, False: 0]
  |  |  |  |  |  Branch (95:30): [True: 0, Folded]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1383|      0|        AreItersFromSameContainer(ctrl_a, ctrl_b, slot_a, slot_b) &&
 1384|      0|        "Invalid iterator comparison. The iterators may be from different "
 1385|      0|        "containers or the container might have rehashed or moved. Consider "
 1386|      0|        "running with --config=asan to diagnose issues.");
 1387|      0|  }
 1388|      0|}
_ZN4absl12lts_2024011618container_internal15is_single_groupEm:
 1411|     20|inline bool is_single_group(size_t capacity) {
 1412|     20|  return capacity <= Group::kWidth;
 1413|     20|}
_ZN4absl12lts_2024011618container_internal5probeEPKNS1_6ctrl_tEmm:
 1417|    102|                                      size_t hash) {
 1418|    102|  return probe_seq<Group::kWidth>(H1(hash, ctrl), capacity);
 1419|    102|}
_ZN4absl12lts_2024011618container_internal5probeERKNS1_12CommonFieldsEm:
 1420|    102|inline probe_seq<Group::kWidth> probe(const CommonFields& common, size_t hash) {
 1421|    102|  return probe(common.control(), common.capacity(), hash);
 1422|    102|}
_ZN4absl12lts_2024011618container_internal15ResetGrowthLeftERNS1_12CommonFieldsE:
 1464|     12|inline void ResetGrowthLeft(CommonFields& common) {
 1465|     12|  common.set_growth_left(CapacityToGrowth(common.capacity()) - common.size());
 1466|     12|}
_ZN4absl12lts_2024011618container_internal9ResetCtrlERNS1_12CommonFieldsEm:
 1470|      6|inline void ResetCtrl(CommonFields& common, size_t slot_size) {
 1471|      6|  const size_t capacity = common.capacity();
 1472|      6|  ctrl_t* ctrl = common.control();
 1473|      6|  std::memset(ctrl, static_cast<int8_t>(ctrl_t::kEmpty),
 1474|      6|              capacity + 1 + NumClonedBytes());
 1475|      6|  ctrl[capacity] = ctrl_t::kSentinel;
 1476|      6|  SanitizerPoisonMemoryRegion(common.slot_array(), slot_size * capacity);
 1477|      6|}
_ZN4absl12lts_2024011618container_internal7SetCtrlERKNS1_12CommonFieldsEmNS1_6ctrl_tEm:
 1484|     60|                    size_t slot_size) {
 1485|     60|  const size_t capacity = common.capacity();
 1486|     60|  assert(i < capacity);
 1487|       |
 1488|     60|  auto* slot_i = static_cast<const char*>(common.slot_array()) + i * slot_size;
 1489|     60|  if (IsFull(h)) {
  ------------------
  |  Branch (1489:7): [True: 60, False: 0]
  ------------------
 1490|     60|    SanitizerUnpoisonMemoryRegion(slot_i, slot_size);
 1491|     60|  } else {
 1492|      0|    SanitizerPoisonMemoryRegion(slot_i, slot_size);
 1493|      0|  }
 1494|       |
 1495|     60|  ctrl_t* ctrl = common.control();
 1496|     60|  ctrl[i] = h;
 1497|     60|  ctrl[((i - NumClonedBytes()) & capacity) + (NumClonedBytes() & capacity)] = h;
 1498|     60|}
_ZN4absl12lts_2024011618container_internal7SetCtrlERKNS1_12CommonFieldsEmhm:
 1502|     60|                    size_t slot_size) {
 1503|     60|  SetCtrl(common, i, static_cast<ctrl_t>(h), slot_size);
 1504|     60|}
_ZN4absl12lts_2024011618container_internal11SlotAddressEPvmm:
 1513|     40|inline void* SlotAddress(void* slot_array, size_t slot, size_t slot_size) {
 1514|     40|  return reinterpret_cast<void*>(reinterpret_cast<char*>(slot_array) +
 1515|     40|                                 (slot * slot_size));
 1516|     40|}
_ZN4absl12lts_2024011618container_internal13TrailingZerosIjEEjT_:
  372|     13|uint32_t TrailingZeros(T x) {
  373|     13|  ABSL_ASSUME(x != 0);
  ------------------
  |  |  259|     13|#define ABSL_ASSUME(cond) __builtin_assume(cond)
  ------------------
  374|     13|  return static_cast<uint32_t>(countr_zero(x));
  375|     13|}
_ZN4absl12lts_2024011618container_internal19find_first_non_fullIvEENS1_8FindInfoERKNS1_12CommonFieldsEm:
 1432|     62|inline FindInfo find_first_non_full(const CommonFields& common, size_t hash) {
 1433|     62|  auto seq = probe(common, hash);
 1434|     62|  const ctrl_t* ctrl = common.control();
 1435|     62|  while (true) {
  ------------------
  |  Branch (1435:10): [True: 62, Folded]
  ------------------
 1436|     62|    GroupEmptyOrDeleted g{ctrl + seq.offset()};
 1437|     62|    auto mask = g.MaskEmptyOrDeleted();
 1438|     62|    if (mask) {
  ------------------
  |  Branch (1438:9): [True: 62, False: 0]
  ------------------
 1439|       |#if !defined(NDEBUG)
 1440|       |      // We want to add entropy even when ASLR is not enabled.
 1441|       |      // In debug build we will randomly insert in either the front or back of
 1442|       |      // the group.
 1443|       |      // TODO(kfm,sbenza): revisit after we do unconditional mixing
 1444|       |      if (!is_small(common.capacity()) && ShouldInsertBackwards(hash, ctrl)) {
 1445|       |        return {seq.offset(mask.HighestBitSet()), seq.index()};
 1446|       |      }
 1447|       |#endif
 1448|     62|      return {seq.offset(mask.LowestBitSet()), seq.index()};
 1449|     62|    }
 1450|      0|    seq.next();
 1451|       |    assert(seq.index() <= common.capacity() && "full table!");
 1452|      0|  }
 1453|     62|}
_ZNK4absl12lts_2024011618container_internal18NonIterableBitMaskItLi16ELi0EEcvbEv:
  391|     94|  explicit operator bool() const { return this->mask_ != 0; }
_ZNK4absl12lts_2024011618container_internal9probe_seqILm16EE6offsetEm:
  324|     62|  size_t offset(size_t i) const { return (offset_ + i) & mask_; }
_ZNK4absl12lts_2024011618container_internal18NonIterableBitMaskItLi16ELi0EE12LowestBitSetEv:
  394|     62|  uint32_t LowestBitSet() const {
  395|     62|    return container_internal::TrailingZeros(mask_) >> Shift;
  396|     62|  }
_ZN4absl12lts_2024011618container_internal13TrailingZerosItEEjT_:
  372|     62|uint32_t TrailingZeros(T x) {
  373|     62|  ABSL_ASSUME(x != 0);
  ------------------
  |  |  259|     62|#define ABSL_ASSUME(cond) __builtin_assume(cond)
  ------------------
  374|     62|  return static_cast<uint32_t>(countr_zero(x));
  375|     62|}
_ZNK4absl12lts_2024011618container_internal9probe_seqILm16EE5indexEv:
  332|     62|  size_t index() const { return index_; }
_ZNK4absl12lts_2024011618container_internal9probe_seqILm16EE6offsetEv:
  323|    102|  size_t offset() const { return offset_; }
_ZNK4absl12lts_2024011618container_internal7BitMaskItLi16ELi0EE5beginEv:
  454|     32|  BitMask begin() const { return *this; }
_ZNK4absl12lts_2024011618container_internal7BitMaskItLi16ELi0EE3endEv:
  455|     32|  BitMask end() const { return BitMask(0); }
_ZN4absl12lts_2024011618container_internalneERKNS1_7BitMaskItLi16ELi0EEES5_:
  461|     32|  friend bool operator!=(const BitMask& a, const BitMask& b) {
  462|     32|    return a.mask_ != b.mask_;
  463|     32|  }
_ZN4absl12lts_2024011618container_internal19HashSetResizeHelper13DeallocateOldILm8ENSt3__19allocatorIcEEEEvT0_mPv:
 1676|      8|  void DeallocateOld(CharAlloc alloc_ref, size_t slot_size, void* old_slots) {
 1677|      8|    SanitizerUnpoisonMemoryRegion(old_slots, slot_size * old_capacity_);
 1678|      8|    Deallocate<BackingArrayAlignment(AlignOfSlot)>(
 1679|      8|        &alloc_ref, old_ctrl_ - ControlOffset(had_infoz_),
 1680|      8|        AllocSize(old_capacity_, slot_size, AlignOfSlot, had_infoz_));
 1681|      8|  }
_ZN4absl12lts_2024011618container_internal12CommonFieldsC2Ev:
 1028|      2|  CommonFields() = default;
_ZN4absl12lts_2024011618container_internal9probe_seqILm16EEC2Emm:
  316|    102|  probe_seq(size_t hash, size_t mask) {
  317|       |    assert(((mask + 1) & mask) == 0 && "not a mask");
  318|    102|    mask_ = mask;
  319|    102|    offset_ = hash & mask_;
  320|    102|  }
_ZN4absl12lts_2024011618container_internal7BitMaskItLi16ELi0EEC2Et:
  437|     96|  explicit BitMask(T mask) : Base(mask) {}
_ZN4absl12lts_2024011618container_internal18NonIterableBitMaskItLi16ELi0EEC2Et:
  389|    190|  explicit NonIterableBitMask(T mask) : mask_(mask) {}
_ZNK4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE8capacityEv:
 2288|     24|  size_t capacity() const { return common().capacity(); }
_ZNK4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE6commonEv:
 3194|    312|  const CommonFields& common() const { return settings_.template get<0>(); }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE13destroy_slotsEv:
 2861|      2|  inline void destroy_slots() {
 2862|      2|    const size_t cap = capacity();
 2863|      2|    const ctrl_t* ctrl = control();
 2864|      2|    slot_type* slot = slot_array();
 2865|     64|    for (size_t i = 0; i != cap; ++i) {
  ------------------
  |  Branch (2865:24): [True: 62, False: 2]
  ------------------
 2866|     62|      if (IsFull(ctrl[i])) {
  ------------------
  |  Branch (2866:11): [True: 32, False: 30]
  ------------------
 2867|     32|        destroy(slot + i);
 2868|     32|      }
 2869|     62|    }
 2870|      2|  }
_ZNK4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE7controlEv:
 3196|    142|  ctrl_t* control() const { return common().control(); }
_ZNK4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE10slot_arrayEv:
 3197|     80|  slot_type* slot_array() const {
 3198|     80|    return static_cast<slot_type*>(common().slot_array());
 3199|     80|  }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE7destroyEPNS1_13map_slot_typeIS8_SA_EE:
 2854|     32|  inline void destroy(slot_type* slot) {
 2855|     32|    PolicyTraits::destroy(&alloc_ref(), slot);
 2856|     32|  }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE9alloc_refEv:
 3206|    104|  allocator_type& alloc_ref() { return settings_.template get<3>(); }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE5infozEv:
 3200|     34|  HashtablezInfoHandle infoz() { return common().infoz(); }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE6commonEv:
 3193|    396|  CommonFields& common() { return settings_.template get<0>(); }
_ZNK4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE19prefetch_heap_blockEv:
 3187|     32|  void prefetch_heap_block() const {
 3188|     32|#if ABSL_HAVE_BUILTIN(__builtin_prefetch) || defined(__GNUC__)
 3189|     32|    __builtin_prefetch(control(), 0, 1);
 3190|     32|#endif
 3191|     32|  }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE11iterator_atEm:
 3159|     34|  iterator iterator_at(size_t i) ABSL_ATTRIBUTE_LIFETIME_BOUND {
 3160|     34|    return {control() + i, slot_array() + i, common().generation_ptr()};
 3161|     34|  }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE8iteratorC2EPNS1_6ctrl_tEPNS1_13map_slot_typeIS8_SA_EEPKh:
 1984|     34|        : HashSetIteratorGenerationInfo(generation_ptr),
 1985|     34|          ctrl_(ctrl),
 1986|     34|          slot_(slot) {
 1987|       |      // This assumption helps the compiler know that any non-end iterator is
 1988|       |      // not equal to any end iterator.
 1989|     34|      ABSL_ASSUME(ctrl != nullptr);
  ------------------
  |  |  259|     34|#define ABSL_ASSUME(cond) __builtin_assume(cond)
  ------------------
 1990|     34|    }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE8hash_refEv:
 3202|     60|  hasher& hash_ref() { return settings_.template get<1>(); }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE3endEv:
 2271|      2|  iterator end() ABSL_ATTRIBUTE_LIFETIME_BOUND {
 2272|      2|    return iterator(common().generation_ptr());
 2273|      2|  }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE8iteratorC2EPKh:
 1993|      2|        : HashSetIteratorGenerationInfo(generation_ptr), ctrl_(nullptr) {}
_ZN4absl12lts_2024011618container_internalneERKNS1_12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE8iteratorESM_:
 1977|     34|    friend bool operator!=(const iterator& a, const iterator& b) {
 1978|     34|      return !(a == b);
 1979|     34|    }
_ZN4absl12lts_2024011618container_internaleqERKNS1_12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE8iteratorESM_:
 1970|     34|    friend bool operator==(const iterator& a, const iterator& b) {
 1971|     34|      AssertIsValidForComparison(a.ctrl_, a.generation(), a.generation_ptr());
 1972|     34|      AssertIsValidForComparison(b.ctrl_, b.generation(), b.generation_ptr());
 1973|     34|      AssertSameContainer(a.ctrl_, b.ctrl_, a.slot_, b.slot_,
 1974|     34|                          a.generation_ptr(), b.generation_ptr());
 1975|     34|      return a.ctrl_ == b.ctrl_;
 1976|     34|    }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE6insertISH_TnNS4_9enable_ifIXsr4absl11disjunctionINS4_14is_convertibleIT_NSF_IS8_SA_EEEENSJ_22SameAsElementReferenceISN_EEEE5valueEiE4typeELi0ESH_TnNSL_IXsr14IsDecomposableIT1_EE5valueEiE4typeELi0ETnPSN_LPSH_0EEENSF_INSJ_8iteratorEbEEOSN_:
 2320|     32|  std::pair<iterator, bool> insert(T&& value) ABSL_ATTRIBUTE_LIFETIME_BOUND {
 2321|     32|    return emplace(std::forward<T>(value));
 2322|     32|  }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE7emplaceIJSH_ETnNS4_9enable_ifIXsr14IsDecomposableIDpT_EE5valueEiE4typeELi0EEENSF_INSJ_8iteratorEbEEDpOSM_:
 2422|     32|      ABSL_ATTRIBUTE_LIFETIME_BOUND {
 2423|     32|    return PolicyTraits::apply(EmplaceDecomposable{*this},
 2424|     32|                               std::forward<Args>(args)...);
 2425|     32|  }
_ZNK4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE19EmplaceDecomposableclIS8_JRKNS4_21piecewise_construct_tENS4_5tupleIJOSG_EEENSP_IJOSA_EEEEEENSF_INSJ_8iteratorEbEERKT_DpOT0_:
 2822|     32|    std::pair<iterator, bool> operator()(const K& key, Args&&... args) const {
 2823|     32|      auto res = s.find_or_prepare_insert(key);
 2824|     32|      if (res.second) {
  ------------------
  |  Branch (2824:11): [True: 32, False: 0]
  ------------------
 2825|     32|        s.emplace_at(res.first, std::forward<Args>(args)...);
 2826|     32|      }
 2827|     32|      return {s.iterator_at(res.first), res.second};
 2828|     32|    }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE22find_or_prepare_insertIS8_EENSF_ImbEERKT_:
 3086|     32|  std::pair<size_t, bool> find_or_prepare_insert(const K& key) {
 3087|     32|    prefetch_heap_block();
 3088|     32|    auto hash = hash_ref()(key);
 3089|     32|    auto seq = probe(common(), hash);
 3090|     32|    const ctrl_t* ctrl = control();
 3091|     32|    while (true) {
  ------------------
  |  Branch (3091:12): [True: 32, Folded]
  ------------------
 3092|     32|      Group g{ctrl + seq.offset()};
 3093|     32|      for (uint32_t i : g.Match(H2(hash))) {
  ------------------
  |  Branch (3093:23): [True: 0, False: 32]
  ------------------
 3094|      0|        if (ABSL_PREDICT_TRUE(PolicyTraits::apply(
  ------------------
  |  |  179|      0|#define ABSL_PREDICT_TRUE(x) (__builtin_expect(false || (x), true))
  |  |  ------------------
  |  |  |  Branch (179:30): [True: 0, False: 0]
  |  |  |  Branch (179:48): [Folded, False: 0]
  |  |  |  Branch (179:57): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 3095|      0|                EqualElement<K>{key, eq_ref()},
 3096|      0|                PolicyTraits::element(slot_array() + seq.offset(i)))))
 3097|      0|          return {seq.offset(i), false};
 3098|      0|      }
 3099|     32|      if (ABSL_PREDICT_TRUE(g.MaskEmpty())) break;
  ------------------
  |  |  179|     32|#define ABSL_PREDICT_TRUE(x) (__builtin_expect(false || (x), true))
  |  |  ------------------
  |  |  |  Branch (179:30): [True: 32, False: 0]
  |  |  |  Branch (179:48): [Folded, False: 32]
  |  |  |  Branch (179:57): [True: 32, False: 0]
  |  |  ------------------
  ------------------
 3100|      0|      seq.next();
 3101|      0|      assert(seq.index() <= capacity() && "full table!");
 3102|      0|    }
 3103|     32|    return {prepare_insert(hash), true};
 3104|     32|  }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE14prepare_insertEm:
 3110|     32|  size_t prepare_insert(size_t hash) ABSL_ATTRIBUTE_NOINLINE {
 3111|     32|    const bool rehash_for_bug_detection =
 3112|     32|        common().should_rehash_for_bug_detection_on_insert();
 3113|     32|    if (rehash_for_bug_detection) {
  ------------------
  |  Branch (3113:9): [True: 0, False: 32]
  ------------------
 3114|       |      // Move to a different heap allocation in order to detect bugs.
 3115|      0|      const size_t cap = capacity();
 3116|      0|      resize(growth_left() > 0 ? cap : NextCapacity(cap));
  ------------------
  |  Branch (3116:14): [True: 0, False: 0]
  ------------------
 3117|      0|    }
 3118|     32|    auto target = find_first_non_full(common(), hash);
 3119|     32|    if (!rehash_for_bug_detection &&
  ------------------
  |  Branch (3119:9): [True: 32, False: 0]
  ------------------
 3120|     32|        ABSL_PREDICT_FALSE(growth_left() == 0 &&
  ------------------
  |  |  178|     42|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 10, False: 22]
  |  |  |  Branch (178:49): [Folded, False: 32]
  |  |  |  Branch (178:59): [True: 10, False: 22]
  |  |  |  Branch (178:59): [True: 10, False: 0]
  |  |  ------------------
  ------------------
 3121|     32|                           !IsDeleted(control()[target.offset]))) {
 3122|     10|      size_t old_capacity = capacity();
 3123|     10|      rehash_and_grow_if_necessary();
 3124|       |      // NOTE: It is safe to use `FindFirstNonFullAfterResize`.
 3125|       |      // `FindFirstNonFullAfterResize` must be called right after resize.
 3126|       |      // `rehash_and_grow_if_necessary` may *not* call `resize`
 3127|       |      // and perform `drop_deletes_without_resize` instead. But this
 3128|       |      // could happen only on big tables.
 3129|       |      // For big tables `FindFirstNonFullAfterResize` will always
 3130|       |      // fallback to normal `find_first_non_full`, so it is safe to use it.
 3131|     10|      target = HashSetResizeHelper::FindFirstNonFullAfterResize(
 3132|     10|          common(), old_capacity, hash);
 3133|     10|    }
 3134|     32|    common().increment_size();
 3135|     32|    set_growth_left(growth_left() - IsEmpty(control()[target.offset]));
 3136|     32|    SetCtrl(common(), target.offset, H2(hash), sizeof(slot_type));
 3137|     32|    common().maybe_increment_generation_on_insert();
 3138|     32|    infoz().RecordInsert(hash, target.probe_length);
 3139|     32|    return target.offset;
 3140|     32|  }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE6resizeEm:
 2905|     10|  ABSL_ATTRIBUTE_NOINLINE void resize(size_t new_capacity) {
 2906|     10|    assert(IsValidCapacity(new_capacity));
 2907|     10|    HashSetResizeHelper resize_helper(common());
 2908|     10|    auto* old_slots = slot_array();
 2909|     10|    common().set_capacity(new_capacity);
 2910|       |    // Note that `InitializeSlots` does different number initialization steps
 2911|       |    // depending on the values of `transfer_uses_memcpy` and capacities.
 2912|       |    // Refer to the comment in `InitializeSlots` for more details.
 2913|     10|    const bool grow_single_group =
 2914|     10|        resize_helper.InitializeSlots<CharAlloc, sizeof(slot_type),
 2915|     10|                                      PolicyTraits::transfer_uses_memcpy(),
 2916|     10|                                      alignof(slot_type)>(
 2917|     10|            common(), const_cast<std::remove_const_t<slot_type>*>(old_slots),
 2918|     10|            CharAlloc(alloc_ref()));
 2919|       |
 2920|     10|    if (resize_helper.old_capacity() == 0) {
  ------------------
  |  Branch (2920:9): [True: 2, False: 8]
  ------------------
 2921|       |      // InitializeSlots did all the work including infoz().RecordRehash().
 2922|      2|      return;
 2923|      2|    }
 2924|       |
 2925|      8|    if (grow_single_group) {
  ------------------
  |  Branch (2925:9): [True: 6, False: 2]
  ------------------
 2926|      6|      if (PolicyTraits::transfer_uses_memcpy()) {
  ------------------
  |  Branch (2926:11): [True: 6, Folded]
  ------------------
 2927|       |        // InitializeSlots did all the work.
 2928|      6|        return;
 2929|      6|      }
 2930|       |      // We want GrowSizeIntoSingleGroup to be called here in order to make
 2931|       |      // InitializeSlots not depend on PolicyTraits.
 2932|      0|      resize_helper.GrowSizeIntoSingleGroup<PolicyTraits>(common(), alloc_ref(),
 2933|      0|                                                          old_slots);
 2934|      2|    } else {
 2935|       |      // InitializeSlots prepares control bytes to correspond to empty table.
 2936|      2|      auto* new_slots = slot_array();
 2937|      2|      size_t total_probe_length = 0;
 2938|     32|      for (size_t i = 0; i != resize_helper.old_capacity(); ++i) {
  ------------------
  |  Branch (2938:26): [True: 30, False: 2]
  ------------------
 2939|     30|        if (IsFull(resize_helper.old_ctrl()[i])) {
  ------------------
  |  Branch (2939:13): [True: 28, False: 2]
  ------------------
 2940|     28|          size_t hash = PolicyTraits::apply(
 2941|     28|              HashElement{hash_ref()}, PolicyTraits::element(old_slots + i));
 2942|     28|          auto target = find_first_non_full(common(), hash);
 2943|     28|          size_t new_i = target.offset;
 2944|     28|          total_probe_length += target.probe_length;
 2945|     28|          SetCtrl(common(), new_i, H2(hash), sizeof(slot_type));
 2946|     28|          transfer(new_slots + new_i, old_slots + i);
 2947|     28|        }
 2948|     30|      }
 2949|      2|      infoz().RecordRehash(total_probe_length);
 2950|      2|    }
 2951|      2|    resize_helper.DeallocateOld<alignof(slot_type)>(
 2952|      2|        CharAlloc(alloc_ref()), sizeof(slot_type),
 2953|      2|        const_cast<std::remove_const_t<slot_type>*>(old_slots));
 2954|      2|  }
_ZN4absl12lts_2024011618container_internal19HashSetResizeHelper15InitializeSlotsINSt3__19allocatorIcEELm24ELb1ELm8EEEbRNS1_12CommonFieldsEPvT_:
 1595|     10|                                               Alloc alloc) {
 1596|     10|    assert(c.capacity());
 1597|       |    // Folks with custom allocators often make unwarranted assumptions about the
 1598|       |    // behavior of their classes vis-a-vis trivial destructability and what
 1599|       |    // calls they will or won't make.  Avoid sampling for people with custom
 1600|       |    // allocators to get us out of this mess.  This is not a hard guarantee but
 1601|       |    // a workaround while we plan the exact guarantee we want to provide.
 1602|     10|    const size_t sample_size =
 1603|     10|        (std::is_same<Alloc, std::allocator<char>>::value &&
  ------------------
  |  Branch (1603:10): [True: 10, Folded]
  ------------------
 1604|     10|         c.slot_array() == nullptr)
  ------------------
  |  Branch (1604:10): [True: 2, False: 8]
  ------------------
 1605|     10|            ? SizeOfSlot
 1606|     10|            : 0;
 1607|     10|    HashtablezInfoHandle infoz =
 1608|     10|        sample_size > 0 ? Sample(sample_size) : c.infoz();
  ------------------
  |  Branch (1608:9): [True: 2, False: 8]
  ------------------
 1609|       |
 1610|     10|    const bool has_infoz = infoz.IsSampled();
 1611|     10|    const size_t cap = c.capacity();
 1612|     10|    const size_t alloc_size =
 1613|     10|        AllocSize(cap, SizeOfSlot, AlignOfSlot, has_infoz);
 1614|     10|    char* mem = static_cast<char*>(
 1615|     10|        Allocate<BackingArrayAlignment(AlignOfSlot)>(&alloc, alloc_size));
 1616|     10|    const GenerationType old_generation = c.generation();
 1617|     10|    c.set_generation_ptr(reinterpret_cast<GenerationType*>(
 1618|     10|        mem + GenerationOffset(cap, has_infoz)));
 1619|     10|    c.set_generation(NextGeneration(old_generation));
 1620|     10|    c.set_control(reinterpret_cast<ctrl_t*>(mem + ControlOffset(has_infoz)));
 1621|     10|    c.set_slots(mem + SlotOffset(cap, AlignOfSlot, has_infoz));
 1622|     10|    ResetGrowthLeft(c);
 1623|       |
 1624|     10|    const bool grow_single_group =
 1625|     10|        IsGrowingIntoSingleGroupApplicable(old_capacity_, c.capacity());
 1626|     10|    if (old_capacity_ != 0 && grow_single_group) {
  ------------------
  |  Branch (1626:9): [True: 8, False: 2]
  |  Branch (1626:31): [True: 6, False: 2]
  ------------------
 1627|      6|      if (TransferUsesMemcpy) {
  ------------------
  |  Branch (1627:11): [True: 6, Folded]
  ------------------
 1628|      6|        GrowSizeIntoSingleGroupTransferable(c, old_slots, SizeOfSlot);
 1629|      6|        DeallocateOld<AlignOfSlot>(alloc, SizeOfSlot, old_slots);
 1630|      6|      } else {
 1631|      0|        GrowIntoSingleGroupShuffleControlBytes(c.control(), c.capacity());
 1632|      0|      }
 1633|      6|    } else {
 1634|      4|      ResetCtrl(c, SizeOfSlot);
 1635|      4|    }
 1636|       |
 1637|     10|    c.set_has_infoz(has_infoz);
 1638|     10|    if (has_infoz) {
  ------------------
  |  Branch (1638:9): [True: 0, False: 10]
  ------------------
 1639|      0|      infoz.RecordStorageChanged(c.size(), cap);
 1640|      0|      if (grow_single_group || old_capacity_ == 0) {
  ------------------
  |  Branch (1640:11): [True: 0, False: 0]
  |  Branch (1640:32): [True: 0, False: 0]
  ------------------
 1641|      0|        infoz.RecordRehash(0);
 1642|      0|      }
 1643|      0|      c.set_infoz(infoz);
 1644|      0|    }
 1645|     10|    return grow_single_group;
 1646|     10|  }
_ZNK4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE11HashElementclIS8_JRKNS4_21piecewise_construct_tENS4_5tupleIJRSG_EEENSP_IJRKSA_EEEEEEmRKT_DpOT0_:
 2804|     28|    size_t operator()(const K& key, Args&&...) const {
 2805|     28|      return h(key);
 2806|     28|    }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE8transferEPNS1_13map_slot_typeIS8_SA_EESM_:
 2857|     28|  inline void transfer(slot_type* to, slot_type* from) {
 2858|     28|    PolicyTraits::transfer(&alloc_ref(), to, from);
 2859|     28|  }
_ZNK4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE11growth_leftEv:
 3181|     64|  size_t growth_left() const { return common().growth_left(); }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE28rehash_and_grow_if_necessaryEv:
 2970|     10|  void rehash_and_grow_if_necessary() {
 2971|     10|    const size_t cap = capacity();
 2972|     10|    if (cap > Group::kWidth &&
  ------------------
  |  Branch (2972:9): [True: 0, False: 10]
  ------------------
 2973|       |        // Do these calculations in 64-bit to avoid overflow.
 2974|      0|        size() * uint64_t{32} <= cap * uint64_t{25}) {
  ------------------
  |  Branch (2974:9): [True: 0, False: 0]
  ------------------
 2975|       |      // Squash DELETED without growing if there is enough capacity.
 2976|       |      //
 2977|       |      // Rehash in place if the current size is <= 25/32 of capacity.
 2978|       |      // Rationale for such a high factor: 1) drop_deletes_without_resize() is
 2979|       |      // faster than resize, and 2) it takes quite a bit of work to add
 2980|       |      // tombstones.  In the worst case, seems to take approximately 4
 2981|       |      // insert/erase pairs to create a single tombstone and so if we are
 2982|       |      // rehashing because of tombstones, we can afford to rehash-in-place as
 2983|       |      // long as we are reclaiming at least 1/8 the capacity without doing more
 2984|       |      // than 2X the work.  (Where "work" is defined to be size() for rehashing
 2985|       |      // or rehashing in place, and 1 for an insert or erase.)  But rehashing in
 2986|       |      // place is faster per operation than inserting or even doubling the size
 2987|       |      // of the table, so we actually afford to reclaim even less space from a
 2988|       |      // resize-in-place.  The decision is to rehash in place if we can reclaim
 2989|       |      // at about 1/8th of the usable capacity (specifically 3/28 of the
 2990|       |      // capacity) which means that the total cost of rehashing will be a small
 2991|       |      // fraction of the total work.
 2992|       |      //
 2993|       |      // Here is output of an experiment using the BM_CacheInSteadyState
 2994|       |      // benchmark running the old case (where we rehash-in-place only if we can
 2995|       |      // reclaim at least 7/16*capacity) vs. this code (which rehashes in place
 2996|       |      // if we can recover 3/32*capacity).
 2997|       |      //
 2998|       |      // Note that although in the worst-case number of rehashes jumped up from
 2999|       |      // 15 to 190, but the number of operations per second is almost the same.
 3000|       |      //
 3001|       |      // Abridged output of running BM_CacheInSteadyState benchmark from
 3002|       |      // raw_hash_set_benchmark.   N is the number of insert/erase operations.
 3003|       |      //
 3004|       |      //      | OLD (recover >= 7/16        | NEW (recover >= 3/32)
 3005|       |      // size |    N/s LoadFactor NRehashes |    N/s LoadFactor NRehashes
 3006|       |      //  448 | 145284       0.44        18 | 140118       0.44        19
 3007|       |      //  493 | 152546       0.24        11 | 151417       0.48        28
 3008|       |      //  538 | 151439       0.26        11 | 151152       0.53        38
 3009|       |      //  583 | 151765       0.28        11 | 150572       0.57        50
 3010|       |      //  628 | 150241       0.31        11 | 150853       0.61        66
 3011|       |      //  672 | 149602       0.33        12 | 150110       0.66        90
 3012|       |      //  717 | 149998       0.35        12 | 149531       0.70       129
 3013|       |      //  762 | 149836       0.37        13 | 148559       0.74       190
 3014|       |      //  807 | 149736       0.39        14 | 151107       0.39        14
 3015|       |      //  852 | 150204       0.42        15 | 151019       0.42        15
 3016|      0|      drop_deletes_without_resize();
 3017|     10|    } else {
 3018|       |      // Otherwise grow the container.
 3019|     10|      resize(NextCapacity(cap));
 3020|     10|    }
 3021|     10|  }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE18GetPolicyFunctionsEv:
 3236|      2|  static const PolicyFunctions& GetPolicyFunctions() {
 3237|      2|    static constexpr PolicyFunctions value = {
 3238|      2|        sizeof(slot_type),
 3239|      2|        &raw_hash_set::hash_slot_fn,
 3240|      2|        PolicyTraits::transfer_uses_memcpy()
  ------------------
  |  Branch (3240:9): [True: 0, Folded]
  ------------------
 3241|      2|            ? TransferRelocatable<sizeof(slot_type)>
 3242|      2|            : &raw_hash_set::transfer_slot_fn,
 3243|      2|        (std::is_same<SlotAlloc, std::allocator<slot_type>>::value
  ------------------
  |  Branch (3243:10): [True: 0, Folded]
  ------------------
 3244|      2|             ? &DeallocateStandard<alignof(slot_type)>
 3245|      2|             : &raw_hash_set::dealloc_fn),
 3246|      2|    };
 3247|      2|    return value;
 3248|      2|  }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE15set_growth_leftEm:
 3182|     32|  void set_growth_left(size_t gl) { return common().set_growth_left(gl); }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE10emplace_atIJRKNS4_21piecewise_construct_tENS4_5tupleIJOSG_EEENSO_IJOSA_EEEEEEvmDpOT_:
 3151|     32|  void emplace_at(size_t i, Args&&... args) {
 3152|     32|    construct(slot_array() + i, std::forward<Args>(args)...);
 3153|       |
 3154|       |    assert(PolicyTraits::apply(FindElement{*this}, *iterator_at(i)) ==
 3155|     32|               iterator_at(i) &&
 3156|     32|           "constructed value does not match the lookup key");
 3157|     32|  }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE9constructIJRKNS4_21piecewise_construct_tENS4_5tupleIJOSG_EEENSO_IJOSA_EEEEEEvPNS1_13map_slot_typeIS8_SA_EEDpOT_:
 2851|     32|  inline void construct(slot_type* slot, Args&&... args) {
 2852|     32|    PolicyTraits::construct(&alloc_ref(), slot, std::forward<Args>(args)...);
 2853|     32|  }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE5beginEv:
 2266|      2|  iterator begin() ABSL_ATTRIBUTE_LIFETIME_BOUND {
 2267|      2|    auto it = iterator_at(0);
 2268|      2|    it.skip_empty_or_deleted();
 2269|      2|    return it;
 2270|      2|  }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE8iterator21skip_empty_or_deletedEv:
 1999|     34|    void skip_empty_or_deleted() {
 2000|     47|      while (IsEmptyOrDeleted(*ctrl_)) {
  ------------------
  |  Branch (2000:14): [True: 13, False: 34]
  ------------------
 2001|     13|        uint32_t shift =
 2002|     13|            GroupEmptyOrDeleted{ctrl_}.CountLeadingEmptyOrDeleted();
 2003|     13|        ctrl_ += shift;
 2004|     13|        slot_ += shift;
 2005|     13|      }
 2006|     34|      if (ABSL_PREDICT_FALSE(*ctrl_ == ctrl_t::kSentinel)) ctrl_ = nullptr;
  ------------------
  |  |  178|     34|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 2, False: 32]
  |  |  |  Branch (178:49): [Folded, False: 34]
  |  |  |  Branch (178:58): [True: 2, False: 32]
  |  |  ------------------
  ------------------
 2007|     34|    }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE8iteratorppEv:
 1956|     32|    iterator& operator++() {
 1957|     32|      AssertIsFull(ctrl_, generation(), generation_ptr(), "operator++");
 1958|     32|      ++ctrl_;
 1959|     32|      ++slot_;
 1960|     32|      skip_empty_or_deleted();
 1961|     32|      return *this;
 1962|     32|    }
_ZNK4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE8iteratordeEv:
 1944|     32|    reference operator*() const {
 1945|     32|      AssertIsFull(ctrl_, generation(), generation_ptr(), "operator*()");
 1946|     32|      return unchecked_deref();
 1947|     32|    }
_ZNK4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE8iterator15unchecked_derefEv:
 2029|     32|    reference unchecked_deref() const { return PolicyTraits::element(slot_); }
_ZNK4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE4sizeEv:
 2287|      2|  size_t size() const { return common().size(); }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEE5clearEv:
 2291|      2|  ABSL_ATTRIBUTE_REINITIALIZES void clear() {
 2292|       |    // Iterating over this container is O(bucket_count()). When bucket_count()
 2293|       |    // is much greater than size(), iteration becomes prohibitively expensive.
 2294|       |    // For clear() it is more important to reuse the allocated array when the
 2295|       |    // container is small because allocation takes comparatively long time
 2296|       |    // compared to destruction of the elements of the container. So we pick the
 2297|       |    // largest bucket_count() threshold for which iteration is still fast and
 2298|       |    // past that we simply deallocate the array.
 2299|      2|    const size_t cap = capacity();
 2300|      2|    if (cap == 0) {
  ------------------
  |  Branch (2300:9): [True: 0, False: 2]
  ------------------
 2301|       |      // Already guaranteed to be empty; so nothing to do.
 2302|      2|    } else {
 2303|      2|      destroy_slots();
 2304|      2|      ClearBackingArray(common(), GetPolicyFunctions(), /*reuse=*/cap < 128);
 2305|      2|    }
 2306|      2|    common().set_reserved_growth(0);
 2307|      2|    common().set_reservation_size(0);
 2308|      2|  }
_ZN4absl12lts_2024011618container_internal12raw_hash_setINS1_17FlatHashMapPolicyINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEPNS0_15CommandLineFlagEEENS1_10StringHashENS1_8StringEqENS4_9allocatorINS4_4pairIKS8_SA_EEEEEC2Ev:
 2088|      2|      std::is_nothrow_default_constructible<allocator_type>::value) {}

_ZNK4absl12lts_2024011615CommandLineFlag8IsOfTypeIbEEbv:
   74|      4|  inline bool IsOfType() const {
   75|      4|    return TypeId() == base_internal::FastTypeId<T>();
   76|      4|  }

_ZN25AbslFlagDefaultGenForfuzz3GenEPv:
  252|      4|    static void Gen(void* absl_flag_default_loc) {                            \
  253|      4|      new (absl_flag_default_loc) Type(AbslFlagDefaultGenFor##name{}.value);  \
  254|      4|    }                                                                         \
_ZN29AbslFlagDefaultGenForfuzz_for3GenEPv:
  252|      2|    static void Gen(void* absl_flag_default_loc) {                            \
  253|      2|      new (absl_flag_default_loc) Type(AbslFlagDefaultGenFor##name{}.value);  \
  254|      2|    }                                                                         \
_ZN36AbslFlagDefaultGenForcorpus_database3GenEPv:
  252|      2|    static void Gen(void* absl_flag_default_loc) {                            \
  253|      2|      new (absl_flag_default_loc) Type(AbslFlagDefaultGenFor##name{}.value);  \
  254|      2|    }                                                                         \
_ZN34AbslFlagDefaultGenForreplay_corpus3GenEPv:
  252|      2|    static void Gen(void* absl_flag_default_loc) {                            \
  253|      2|      new (absl_flag_default_loc) Type(AbslFlagDefaultGenFor##name{}.value);  \
  254|      2|    }                                                                         \
_ZN38AbslFlagDefaultGenForreplay_corpus_for3GenEPv:
  252|      2|    static void Gen(void* absl_flag_default_loc) {                            \
  253|      2|      new (absl_flag_default_loc) Type(AbslFlagDefaultGenFor##name{}.value);  \
  254|      2|    }                                                                         \
_ZN37AbslFlagDefaultGenFortime_budget_type3GenEPv:
  252|      2|    static void Gen(void* absl_flag_default_loc) {                            \
  253|      2|      new (absl_flag_default_loc) Type(AbslFlagDefaultGenFor##name{}.value);  \
  254|      2|    }                                                                         \
_ZN41AbslFlagDefaultGenFortime_limit_per_input3GenEPv:
  252|      2|    static void Gen(void* absl_flag_default_loc) {                            \
  253|      2|      new (absl_flag_default_loc) Type(AbslFlagDefaultGenFor##name{}.value);  \
  254|      2|    }                                                                         \
_ZN25AbslFlagDefaultGenForjobs3GenEPv:
  252|      2|    static void Gen(void* absl_flag_default_loc) {                            \
  253|      2|      new (absl_flag_default_loc) Type(AbslFlagDefaultGenFor##name{}.value);  \
  254|      2|    }                                                                         \
_ZN4absl12lts_202401167GetFlagINSt3__18optionalImEEEET_RKNS0_14flags_internal4FlagIS5_EE:
   95|      2|ABSL_MUST_USE_RESULT T GetFlag(const absl::Flag<T>& flag) {
   96|      2|  return flags_internal::FlagImplPeer::InvokeGet<T>(flag);
   97|      2|}
_ZN4absl12lts_202401167GetFlagImEET_RKNS0_14flags_internal4FlagIS2_EE:
   95|      4|ABSL_MUST_USE_RESULT T GetFlag(const absl::Flag<T>& flag) {
   96|      4|  return flags_internal::FlagImplPeer::InvokeGet<T>(flag);
   97|      4|}
_ZN4absl12lts_202401167GetFlagINS0_8DurationEEET_RKNS0_14flags_internal4FlagIS3_EE:
   95|     10|ABSL_MUST_USE_RESULT T GetFlag(const absl::Flag<T>& flag) {
   96|     10|  return flags_internal::FlagImplPeer::InvokeGet<T>(flag);
   97|     10|}
_ZN4absl12lts_202401167GetFlagIN8fuzztest8internal14TimeBudgetTypeEEET_RKNS0_14flags_internal4FlagIS5_EE:
   95|      2|ABSL_MUST_USE_RESULT T GetFlag(const absl::Flag<T>& flag) {
   96|      2|  return flags_internal::FlagImplPeer::InvokeGet<T>(flag);
   97|      2|}
_ZN4absl12lts_202401167GetFlagIbEET_RKNS0_14flags_internal4FlagIS2_EE:
   95|      6|ABSL_MUST_USE_RESULT T GetFlag(const absl::Flag<T>& flag) {
   96|      6|  return flags_internal::FlagImplPeer::InvokeGet<T>(flag);
   97|      6|}
_ZN4absl12lts_202401167GetFlagINSt3__112basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEEET_RKNS0_14flags_internal4FlagIS9_EE:
   95|     14|ABSL_MUST_USE_RESULT T GetFlag(const absl::Flag<T>& flag) {
   96|     14|  return flags_internal::FlagImplPeer::InvokeGet<T>(flag);
   97|     14|}
_ZN29AbslFlagDefaultGenForflagfile3GenEPv:
  252|      2|    static void Gen(void* absl_flag_default_loc) {                            \
  253|      2|      new (absl_flag_default_loc) Type(AbslFlagDefaultGenFor##name{}.value);  \
  254|      2|    }                                                                         \
_ZN28AbslFlagDefaultGenForfromenv3GenEPv:
  252|      2|    static void Gen(void* absl_flag_default_loc) {                            \
  253|      2|      new (absl_flag_default_loc) Type(AbslFlagDefaultGenFor##name{}.value);  \
  254|      2|    }                                                                         \
_ZN31AbslFlagDefaultGenFortryfromenv3GenEPv:
  252|      2|    static void Gen(void* absl_flag_default_loc) {                            \
  253|      2|      new (absl_flag_default_loc) Type(AbslFlagDefaultGenFor##name{}.value);  \
  254|      2|    }                                                                         \
_ZN4absl12lts_202401167GetFlagINSt3__16vectorINS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEENS7_IS9_EEEEEET_RKNS0_14flags_internal4FlagISC_EE:
   95|     10|ABSL_MUST_USE_RESULT T GetFlag(const absl::Flag<T>& flag) {
   96|     10|  return flags_internal::FlagImplPeer::InvokeGet<T>(flag);
   97|     10|}

_ZN4absl12lts_2024011614flags_internal15DynValueDeleterC2EPFPvNS1_6FlagOpEPKvS3_S3_E:
  135|      4|DynValueDeleter::DynValueDeleter(FlagOpFn op_arg) : op(op_arg) {}
_ZNK4absl12lts_2024011614flags_internal15DynValueDeleterclEPv:
  137|      4|void DynValueDeleter::operator()(void* ptr) const {
  138|      4|  if (op == nullptr) return;
  ------------------
  |  Branch (138:7): [True: 0, False: 4]
  ------------------
  139|       |
  140|      4|  Delete(op, ptr);
  141|      4|}
_ZN4absl12lts_2024011614flags_internal8FlagImpl4InitEv:
  143|     30|void FlagImpl::Init() {
  144|     30|  new (&data_guard_) absl::Mutex;
  145|       |
  146|     30|  auto def_kind = static_cast<FlagDefaultKind>(def_kind_);
  147|       |
  148|     30|  switch (ValueStorageKind()) {
  ------------------
  |  Branch (148:11): [True: 30, False: 0]
  ------------------
  149|      6|    case FlagValueStorageKind::kValueAndInitBit:
  ------------------
  |  Branch (149:5): [True: 6, False: 24]
  ------------------
  150|     10|    case FlagValueStorageKind::kOneWordAtomic: {
  ------------------
  |  Branch (150:5): [True: 4, False: 26]
  ------------------
  151|     10|      alignas(int64_t) std::array<char, sizeof(int64_t)> buf{};
  152|     10|      if (def_kind == FlagDefaultKind::kGenFunc) {
  ------------------
  |  Branch (152:11): [True: 2, False: 8]
  ------------------
  153|      2|        (*default_value_.gen_func)(buf.data());
  154|      8|      } else {
  155|      8|        assert(def_kind != FlagDefaultKind::kDynamicValue);
  156|      8|        std::memcpy(buf.data(), &default_value_, Sizeof(op_));
  157|      8|      }
  158|     10|      if (ValueStorageKind() == FlagValueStorageKind::kValueAndInitBit) {
  ------------------
  |  Branch (158:11): [True: 6, False: 4]
  ------------------
  159|       |        // We presume here the memory layout of FlagValueAndInitBit struct.
  160|      6|        uint8_t initialized = 1;
  161|      6|        std::memcpy(buf.data() + Sizeof(op_), &initialized,
  162|      6|                    sizeof(initialized));
  163|      6|      }
  164|       |      // Type can contain valid uninitialized bits, e.g. padding.
  165|     10|      ABSL_ANNOTATE_MEMORY_IS_INITIALIZED(buf.data(), buf.size());
  166|     10|      OneWordValue().store(absl::bit_cast<int64_t>(buf),
  167|     10|                           std::memory_order_release);
  168|     10|      break;
  169|      6|    }
  170|      8|    case FlagValueStorageKind::kSequenceLocked: {
  ------------------
  |  Branch (170:5): [True: 8, False: 22]
  ------------------
  171|       |      // For this storage kind the default_value_ always points to gen_func
  172|       |      // during initialization.
  173|      8|      assert(def_kind == FlagDefaultKind::kGenFunc);
  174|      8|      (*default_value_.gen_func)(AtomicBufferValue());
  175|      8|      break;
  176|      6|    }
  177|     12|    case FlagValueStorageKind::kAlignedBuffer:
  ------------------
  |  Branch (177:5): [True: 12, False: 18]
  ------------------
  178|       |      // For this storage kind the default_value_ always points to gen_func
  179|       |      // during initialization.
  180|     12|      assert(def_kind == FlagDefaultKind::kGenFunc);
  181|     12|      (*default_value_.gen_func)(AlignedBufferValue());
  182|     12|      break;
  183|     30|  }
  184|     30|  seq_lock_.MarkInitialized();
  185|     30|}
_ZNK4absl12lts_2024011614flags_internal8FlagImpl9DataGuardEv:
  187|     58|absl::Mutex* FlagImpl::DataGuard() const {
  188|     58|  absl::call_once(const_cast<FlagImpl*>(this)->init_control_, &FlagImpl::Init,
  189|     58|                  const_cast<FlagImpl*>(this));
  190|       |
  191|       |  // data_guard_ is initialized inside Init.
  192|     58|  return reinterpret_cast<absl::Mutex*>(&data_guard_);
  193|     58|}
_ZNK4absl12lts_2024011614flags_internal8FlagImpl13MakeInitValueEv:
  220|      4|std::unique_ptr<void, DynValueDeleter> FlagImpl::MakeInitValue() const {
  221|      4|  void* res = nullptr;
  222|      4|  switch (DefaultKind()) {
  223|      0|    case FlagDefaultKind::kDynamicValue:
  ------------------
  |  Branch (223:5): [True: 0, False: 4]
  ------------------
  224|      0|      res = flags_internal::Clone(op_, default_value_.dynamic_value);
  225|      0|      break;
  226|      2|    case FlagDefaultKind::kGenFunc:
  ------------------
  |  Branch (226:5): [True: 2, False: 2]
  ------------------
  227|      2|      res = flags_internal::Alloc(op_);
  228|      2|      (*default_value_.gen_func)(res);
  229|      2|      break;
  230|      2|    default:
  ------------------
  |  Branch (230:5): [True: 2, False: 2]
  ------------------
  231|      2|      res = flags_internal::Clone(op_, &default_value_);
  232|      2|      break;
  233|      4|  }
  234|      4|  return {res, DynValueDeleter{op_}};
  235|      4|}
_ZN4absl12lts_2024011614flags_internal8FlagImpl10StoreValueEPKv:
  237|      4|void FlagImpl::StoreValue(const void* src) {
  238|      4|  switch (ValueStorageKind()) {
  ------------------
  |  Branch (238:11): [True: 4, False: 0]
  ------------------
  239|      0|    case FlagValueStorageKind::kValueAndInitBit:
  ------------------
  |  Branch (239:5): [True: 0, False: 4]
  ------------------
  240|      2|    case FlagValueStorageKind::kOneWordAtomic: {
  ------------------
  |  Branch (240:5): [True: 2, False: 2]
  ------------------
  241|       |      // Load the current value to avoid setting 'init' bit manually.
  242|      2|      int64_t one_word_val = OneWordValue().load(std::memory_order_acquire);
  243|      2|      std::memcpy(&one_word_val, src, Sizeof(op_));
  244|      2|      OneWordValue().store(one_word_val, std::memory_order_release);
  245|      2|      seq_lock_.IncrementModificationCount();
  246|      2|      break;
  247|      0|    }
  248|      0|    case FlagValueStorageKind::kSequenceLocked: {
  ------------------
  |  Branch (248:5): [True: 0, False: 4]
  ------------------
  249|      0|      seq_lock_.Write(AtomicBufferValue(), src, Sizeof(op_));
  250|      0|      break;
  251|      0|    }
  252|      2|    case FlagValueStorageKind::kAlignedBuffer:
  ------------------
  |  Branch (252:5): [True: 2, False: 2]
  ------------------
  253|      2|      Copy(op_, src, AlignedBufferValue());
  254|      2|      seq_lock_.IncrementModificationCount();
  255|      2|      break;
  256|      4|  }
  257|      4|  modified_ = true;
  258|      4|  InvokeCallback();
  259|      4|}
_ZNK4absl12lts_2024011614flags_internal8FlagImpl4NameEv:
  261|    308|absl::string_view FlagImpl::Name() const { return name_; }
_ZNK4absl12lts_2024011614flags_internal8FlagImpl8FilenameEv:
  263|     32|std::string FlagImpl::Filename() const {
  264|     32|  return flags_internal::GetUsageConfig().normalize_filename(filename_);
  265|     32|}
_ZNK4absl12lts_2024011614flags_internal8FlagImpl6TypeIdEv:
  272|      4|FlagFastTypeId FlagImpl::TypeId() const {
  273|      4|  return flags_internal::FastTypeId(op_);
  274|      4|}
_ZN4absl12lts_2024011614flags_internal8FlagImpl11SetCallbackEPFvvE:
  317|      8|void FlagImpl::SetCallback(const FlagCallbackFunc mutation_callback) {
  318|      8|  absl::MutexLock l(DataGuard());
  319|       |
  320|      8|  if (callback_ == nullptr) {
  ------------------
  |  Branch (320:7): [True: 8, False: 0]
  ------------------
  321|      8|    callback_ = new FlagCallback;
  322|      8|  }
  323|      8|  callback_->func = mutation_callback;
  324|       |
  325|      8|  InvokeCallback();
  326|      8|}
_ZNK4absl12lts_2024011614flags_internal8FlagImpl14InvokeCallbackEv:
  328|     12|void FlagImpl::InvokeCallback() const {
  329|     12|  if (!callback_) return;
  ------------------
  |  Branch (329:7): [True: 4, False: 8]
  ------------------
  330|       |
  331|       |  // Make a copy of the C-style function pointer that we are about to invoke
  332|       |  // before we release the lock guarding it.
  333|      8|  FlagCallbackFunc cb = callback_->func;
  334|       |
  335|       |  // If the flag has a mutation callback this function invokes it. While the
  336|       |  // callback is being invoked the primary flag's mutex is unlocked and it is
  337|       |  // re-locked back after call to callback is completed. Callback invocation is
  338|       |  // guarded by flag's secondary mutex instead which prevents concurrent
  339|       |  // callback invocation. Note that it is possible for other thread to grab the
  340|       |  // primary lock and update flag's value at any time during the callback
  341|       |  // invocation. This is by design. Callback can get a value of the flag if
  342|       |  // necessary, but it might be different from the value initiated the callback
  343|       |  // and it also can be different by the time the callback invocation is
  344|       |  // completed. Requires that *primary_lock be held in exclusive mode; it may be
  345|       |  // released and reacquired by the implementation.
  346|      8|  MutexRelock relock(*DataGuard());
  347|      8|  absl::MutexLock lock(&callback_->guard);
  348|      8|  cb();
  349|      8|}
_ZNK4absl12lts_2024011614flags_internal8FlagImpl18AlignedBufferValueEv:
  414|     38|void* FlagImpl::AlignedBufferValue() const {
  415|       |  assert(ValueStorageKind() == FlagValueStorageKind::kAlignedBuffer);
  416|     38|  return OffsetValue<void>();
  417|     38|}
_ZNK4absl12lts_2024011614flags_internal8FlagImpl17AtomicBufferValueEv:
  419|     16|std::atomic<uint64_t>* FlagImpl::AtomicBufferValue() const {
  420|       |  assert(ValueStorageKind() == FlagValueStorageKind::kSequenceLocked);
  421|     16|  return OffsetValue<std::atomic<uint64_t>>();
  422|     16|}
_ZNK4absl12lts_2024011614flags_internal8FlagImpl12OneWordValueEv:
  424|     20|std::atomic<int64_t>& FlagImpl::OneWordValue() const {
  425|       |  assert(ValueStorageKind() == FlagValueStorageKind::kOneWordAtomic ||
  426|     20|         ValueStorageKind() == FlagValueStorageKind::kValueAndInitBit);
  427|     20|  return OffsetValue<FlagOneWordValue>()->value;
  428|     20|}
_ZNK4absl12lts_2024011614flags_internal8FlagImpl8TryParseENSt3__117basic_string_viewIcNS3_11char_traitsIcEEEERNS3_12basic_stringIcS6_NS3_9allocatorIcEEEE:
  435|      4|    absl::string_view value, std::string& err) const {
  436|      4|  std::unique_ptr<void, DynValueDeleter> tentative_value = MakeInitValue();
  437|       |
  438|      4|  std::string parse_err;
  439|      4|  if (!flags_internal::Parse(op_, value, tentative_value.get(), &parse_err)) {
  ------------------
  |  Branch (439:7): [True: 0, False: 4]
  ------------------
  440|      0|    absl::string_view err_sep = parse_err.empty() ? "" : "; ";
  ------------------
  |  Branch (440:33): [True: 0, False: 0]
  ------------------
  441|      0|    err = absl::StrCat("Illegal value '", value, "' specified for flag '",
  442|      0|                       Name(), "'", err_sep, parse_err);
  443|      0|    return nullptr;
  444|      0|  }
  445|       |
  446|      4|  return tentative_value;
  447|      4|}
_ZNK4absl12lts_2024011614flags_internal8FlagImpl4ReadEPv:
  449|     32|void FlagImpl::Read(void* dst) const {
  450|     32|  auto* guard = DataGuard();  // Make sure flag initialized
  451|     32|  switch (ValueStorageKind()) {
  ------------------
  |  Branch (451:11): [True: 32, False: 0]
  ------------------
  452|      0|    case FlagValueStorageKind::kValueAndInitBit:
  ------------------
  |  Branch (452:5): [True: 0, False: 32]
  ------------------
  453|      0|    case FlagValueStorageKind::kOneWordAtomic: {
  ------------------
  |  Branch (453:5): [True: 0, False: 32]
  ------------------
  454|      0|      const int64_t one_word_val =
  455|      0|          OneWordValue().load(std::memory_order_acquire);
  456|      0|      std::memcpy(dst, &one_word_val, Sizeof(op_));
  457|      0|      break;
  458|      0|    }
  459|      8|    case FlagValueStorageKind::kSequenceLocked: {
  ------------------
  |  Branch (459:5): [True: 8, False: 24]
  ------------------
  460|      8|      ReadSequenceLockedData(dst);
  461|      8|      break;
  462|      0|    }
  463|     24|    case FlagValueStorageKind::kAlignedBuffer: {
  ------------------
  |  Branch (463:5): [True: 24, False: 8]
  ------------------
  464|     24|      absl::MutexLock l(guard);
  465|     24|      flags_internal::CopyConstruct(op_, AlignedBufferValue(), dst);
  466|     24|      break;
  467|      0|    }
  468|     32|  }
  469|     32|}
_ZNK4absl12lts_2024011614flags_internal8FlagImpl11ReadOneWordEv:
  471|      4|int64_t FlagImpl::ReadOneWord() const {
  472|       |  assert(ValueStorageKind() == FlagValueStorageKind::kOneWordAtomic ||
  473|      4|         ValueStorageKind() == FlagValueStorageKind::kValueAndInitBit);
  474|      4|  auto* guard = DataGuard();  // Make sure flag initialized
  475|      4|  (void)guard;
  476|      4|  return OneWordValue().load(std::memory_order_acquire);
  477|      4|}
_ZNK4absl12lts_2024011614flags_internal8FlagImpl11ReadOneBoolEv:
  479|      2|bool FlagImpl::ReadOneBool() const {
  480|       |  assert(ValueStorageKind() == FlagValueStorageKind::kValueAndInitBit);
  481|      2|  auto* guard = DataGuard();  // Make sure flag initialized
  482|      2|  (void)guard;
  483|      2|  return absl::bit_cast<FlagValueAndInitBit<bool>>(
  484|      2|             OneWordValue().load(std::memory_order_acquire))
  485|      2|      .value;
  486|      2|}
_ZNK4absl12lts_2024011614flags_internal8FlagImpl22ReadSequenceLockedDataEPv:
  488|      8|void FlagImpl::ReadSequenceLockedData(void* dst) const {
  489|      8|  size_t size = Sizeof(op_);
  490|       |  // Attempt to read using the sequence lock.
  491|      8|  if (ABSL_PREDICT_TRUE(seq_lock_.TryRead(dst, AtomicBufferValue(), size))) {
  ------------------
  |  |  179|      8|#define ABSL_PREDICT_TRUE(x) (__builtin_expect(false || (x), true))
  |  |  ------------------
  |  |  |  Branch (179:30): [True: 8, False: 0]
  |  |  |  Branch (179:48): [Folded, False: 8]
  |  |  |  Branch (179:57): [True: 8, False: 0]
  |  |  ------------------
  ------------------
  492|      8|    return;
  493|      8|  }
  494|       |  // We failed due to contention. Acquire the lock to prevent contention
  495|       |  // and try again.
  496|      0|  absl::ReaderMutexLock l(DataGuard());
  497|      0|  bool success = seq_lock_.TryRead(dst, AtomicBufferValue(), size);
  498|       |  assert(success);
  499|      0|  static_cast<void>(success);
  500|      0|}
_ZN4absl12lts_2024011614flags_internal8FlagImpl9ParseFromENSt3__117basic_string_viewIcNS3_11char_traitsIcEEEENS1_15FlagSettingModeENS1_11ValueSourceERNS3_12basic_stringIcS6_NS3_9allocatorIcEEEE:
  528|      4|                         ValueSource source, std::string& err) {
  529|      4|  absl::MutexLock l(DataGuard());
  530|       |
  531|      4|  switch (set_mode) {
  ------------------
  |  Branch (531:11): [True: 4, False: 0]
  ------------------
  532|      4|    case SET_FLAGS_VALUE: {
  ------------------
  |  Branch (532:5): [True: 4, False: 0]
  ------------------
  533|       |      // set or modify the flag's value
  534|      4|      auto tentative_value = TryParse(value, err);
  535|      4|      if (!tentative_value) return false;
  ------------------
  |  Branch (535:11): [True: 0, False: 4]
  ------------------
  536|       |
  537|      4|      StoreValue(tentative_value.get());
  538|       |
  539|      4|      if (source == kCommandLine) {
  ------------------
  |  Branch (539:11): [True: 4, False: 0]
  ------------------
  540|      4|        on_command_line_ = true;
  541|      4|      }
  542|      4|      break;
  543|      4|    }
  544|      0|    case SET_FLAG_IF_DEFAULT: {
  ------------------
  |  Branch (544:5): [True: 0, False: 4]
  ------------------
  545|       |      // set the flag's value, but only if it hasn't been set by someone else
  546|      0|      if (modified_) {
  ------------------
  |  Branch (546:11): [True: 0, False: 0]
  ------------------
  547|       |        // TODO(rogeeff): review and fix this semantic. Currently we do not fail
  548|       |        // in this case if flag is modified. This is misleading since the flag's
  549|       |        // value is not updated even though we return true.
  550|       |        // *err = absl::StrCat(Name(), " is already set to ",
  551|       |        //                     CurrentValue(), "\n");
  552|       |        // return false;
  553|      0|        return true;
  554|      0|      }
  555|      0|      auto tentative_value = TryParse(value, err);
  556|      0|      if (!tentative_value) return false;
  ------------------
  |  Branch (556:11): [True: 0, False: 0]
  ------------------
  557|       |
  558|      0|      StoreValue(tentative_value.get());
  559|      0|      break;
  560|      0|    }
  561|      0|    case SET_FLAGS_DEFAULT: {
  ------------------
  |  Branch (561:5): [True: 0, False: 4]
  ------------------
  562|      0|      auto tentative_value = TryParse(value, err);
  563|      0|      if (!tentative_value) return false;
  ------------------
  |  Branch (563:11): [True: 0, False: 0]
  ------------------
  564|       |
  565|      0|      if (DefaultKind() == FlagDefaultKind::kDynamicValue) {
  ------------------
  |  Branch (565:11): [True: 0, False: 0]
  ------------------
  566|      0|        void* old_value = default_value_.dynamic_value;
  567|      0|        default_value_.dynamic_value = tentative_value.release();
  568|      0|        tentative_value.reset(old_value);
  569|      0|      } else {
  570|      0|        default_value_.dynamic_value = tentative_value.release();
  571|      0|        def_kind_ = static_cast<uint8_t>(FlagDefaultKind::kDynamicValue);
  572|      0|      }
  573|       |
  574|      0|      if (!modified_) {
  ------------------
  |  Branch (574:11): [True: 0, False: 0]
  ------------------
  575|       |        // Need to set both default value *and* current, in this case.
  576|      0|        StoreValue(default_value_.dynamic_value);
  577|      0|        modified_ = false;
  578|      0|      }
  579|      0|      break;
  580|      0|    }
  581|      4|  }
  582|       |
  583|      4|  return true;
  584|      4|}
flag.cc:_ZN4absl12lts_2024011614flags_internal12_GLOBAL__N_111MutexRelockC2ERNS0_5MutexE:
   71|      8|  explicit MutexRelock(absl::Mutex& mu) : mu_(mu) { mu_.Unlock(); }
flag.cc:_ZN4absl12lts_2024011614flags_internal12_GLOBAL__N_111MutexRelockD2Ev:
   72|      8|  ~MutexRelock() { mu_.Lock(); }
_ZNK4absl12lts_2024011614flags_internal8FlagImpl11OffsetValueIvEEPT_v:
  406|     38|StorageT* FlagImpl::OffsetValue() const {
  407|     38|  char* p = reinterpret_cast<char*>(const_cast<FlagImpl*>(this));
  408|       |  // The offset is deduced via Flag value type specific op_.
  409|     38|  ptrdiff_t offset = flags_internal::ValueOffset(op_);
  410|       |
  411|     38|  return reinterpret_cast<StorageT*>(p + offset);
  412|     38|}
_ZNK4absl12lts_2024011614flags_internal8FlagImpl11OffsetValueINSt3__16atomicImEEEEPT_v:
  406|     16|StorageT* FlagImpl::OffsetValue() const {
  407|     16|  char* p = reinterpret_cast<char*>(const_cast<FlagImpl*>(this));
  408|       |  // The offset is deduced via Flag value type specific op_.
  409|     16|  ptrdiff_t offset = flags_internal::ValueOffset(op_);
  410|       |
  411|     16|  return reinterpret_cast<StorageT*>(p + offset);
  412|     16|}
_ZNK4absl12lts_2024011614flags_internal8FlagImpl11OffsetValueINS1_16FlagOneWordValueEEEPT_v:
  406|     20|StorageT* FlagImpl::OffsetValue() const {
  407|     20|  char* p = reinterpret_cast<char*>(const_cast<FlagImpl*>(this));
  408|       |  // The offset is deduced via Flag value type specific op_.
  409|     20|  ptrdiff_t offset = flags_internal::ValueOffset(op_);
  410|       |
  411|     20|  return reinterpret_cast<StorageT*>(p + offset);
  412|     20|}

_ZN4absl12lts_2024011614flags_internal16InitDefaultValueINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEEET_SA_:
  277|      8|constexpr T InitDefaultValue(T t) {
  278|      8|  return t;
  279|      8|}
_ZN4absl12lts_2024011614flags_internal16InitDefaultValueINS0_8DurationEEET_S4_:
  277|      6|constexpr T InitDefaultValue(T t) {
  278|      6|  return t;
  279|      6|}
_ZN4absl12lts_2024011614flags_internal16InitDefaultValueIN8fuzztest8internal14TimeBudgetTypeEEET_S6_:
  277|      2|constexpr T InitDefaultValue(T t) {
  278|      2|  return t;
  279|      2|}
_ZN4absl12lts_2024011614flags_internal16InitDefaultValueINSt3__18optionalImEEEET_S6_:
  277|      2|constexpr T InitDefaultValue(T t) {
  278|      2|  return t;
  279|      2|}
_ZN4absl12lts_2024011614flags_internal12FlagImplPeer9InvokeGetINSt3__18optionalImEENS1_4FlagIS6_EEEET_RKT0_:
  697|      2|  static T InvokeGet(const FlagType& flag) {
  698|      2|    return flag.Get();
  699|      2|  }
_ZNK4absl12lts_2024011614flags_internal4FlagINSt3__18optionalImEEE3GetEv:
  657|      2|  T Get() const {
  658|       |    // See implementation notes in CommandLineFlag::Get().
  659|      2|    union U {
  660|      2|      T value;
  661|      2|      U() {}
  662|      2|      ~U() { value.~T(); }
  663|      2|    };
  664|      2|    U u;
  665|       |
  666|       |#if !defined(NDEBUG)
  667|       |    impl_.AssertValidType(base_internal::FastTypeId<T>(), &GenRuntimeTypeId<T>);
  668|       |#endif
  669|       |
  670|      2|    if (ABSL_PREDICT_FALSE(!value_.Get(impl_.seq_lock_, u.value))) {
  ------------------
  |  |  178|      2|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 2, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 2]
  |  |  |  Branch (178:58): [True: 2, False: 0]
  |  |  ------------------
  ------------------
  671|      2|      impl_.Read(&u.value);
  672|      2|    }
  673|      2|    return std::move(u.value);
  674|      2|  }
_ZZNK4absl12lts_2024011614flags_internal4FlagINSt3__18optionalImEEE3GetEvEN1UC2Ev:
  661|      2|      U() {}
_ZNK4absl12lts_2024011614flags_internal9FlagValueINSt3__18optionalImEELNS1_20FlagValueStorageKindE2EE3GetERKNS1_12SequenceLockERS5_:
  384|      2|  bool Get(const SequenceLock& lock, T& dst) const {
  385|      2|    return lock.TryRead(&dst, value_words, sizeof(T));
  386|      2|  }
_ZZNK4absl12lts_2024011614flags_internal4FlagINSt3__18optionalImEEE3GetEvEN1UD2Ev:
  662|      2|      ~U() { value.~T(); }
_ZN4absl12lts_2024011614flags_internal12FlagImplPeer9InvokeGetImNS1_4FlagImEEEET_RKT0_:
  697|      4|  static T InvokeGet(const FlagType& flag) {
  698|      4|    return flag.Get();
  699|      4|  }
_ZNK4absl12lts_2024011614flags_internal4FlagImE3GetEv:
  657|      4|  T Get() const {
  658|       |    // See implementation notes in CommandLineFlag::Get().
  659|      4|    union U {
  660|      4|      T value;
  661|      4|      U() {}
  662|      4|      ~U() { value.~T(); }
  663|      4|    };
  664|      4|    U u;
  665|       |
  666|       |#if !defined(NDEBUG)
  667|       |    impl_.AssertValidType(base_internal::FastTypeId<T>(), &GenRuntimeTypeId<T>);
  668|       |#endif
  669|       |
  670|      4|    if (ABSL_PREDICT_FALSE(!value_.Get(impl_.seq_lock_, u.value))) {
  ------------------
  |  |  178|      4|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 2, False: 2]
  |  |  |  Branch (178:49): [Folded, False: 4]
  |  |  |  Branch (178:58): [True: 2, False: 2]
  |  |  ------------------
  ------------------
  671|      2|      impl_.Read(&u.value);
  672|      2|    }
  673|      4|    return std::move(u.value);
  674|      4|  }
_ZZNK4absl12lts_2024011614flags_internal4FlagImE3GetEvEN1UC2Ev:
  661|      4|      U() {}
_ZNK4absl12lts_2024011614flags_internal9FlagValueImLNS1_20FlagValueStorageKindE1EE3GetERKNS1_12SequenceLockERm:
  372|      4|  bool Get(const SequenceLock&, T& dst) const {
  373|      4|    int64_t one_word_val = value.load(std::memory_order_acquire);
  374|      4|    if (ABSL_PREDICT_FALSE(one_word_val == UninitializedFlagValue())) {
  ------------------
  |  |  178|      4|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 2, False: 2]
  |  |  |  Branch (178:49): [Folded, False: 4]
  |  |  |  Branch (178:58): [True: 2, False: 2]
  |  |  ------------------
  ------------------
  375|      2|      return false;
  376|      2|    }
  377|      2|    std::memcpy(&dst, static_cast<const void*>(&one_word_val), sizeof(T));
  378|      2|    return true;
  379|      4|  }
_ZN4absl12lts_2024011614flags_internal22UninitializedFlagValueEv:
  301|      4|constexpr int64_t UninitializedFlagValue() {
  302|      4|  return static_cast<int64_t>(0xababababababababll);
  303|      4|}
_ZNK4absl12lts_2024011614flags_internal8FlagImpl4ReadImTnNSt3__19enable_ifIXeqclsr14flags_internalE11StorageKindIT_EELNS1_20FlagValueStorageKindE1EEiE4typeELi0EEEvPS6_:
  457|      2|  void Read(T* value) const ABSL_LOCKS_EXCLUDED(*DataGuard()) {
  458|      2|    int64_t v = ReadOneWord();
  459|      2|    std::memcpy(value, static_cast<const void*>(&v), sizeof(T));
  460|      2|  }
_ZZNK4absl12lts_2024011614flags_internal4FlagImE3GetEvEN1UD2Ev:
  662|      4|      ~U() { value.~T(); }
_ZN4absl12lts_2024011614flags_internal12FlagImplPeer9InvokeGetINS0_8DurationENS1_4FlagIS4_EEEET_RKT0_:
  697|     10|  static T InvokeGet(const FlagType& flag) {
  698|     10|    return flag.Get();
  699|     10|  }
_ZNK4absl12lts_2024011614flags_internal4FlagINS0_8DurationEE3GetEv:
  657|     10|  T Get() const {
  658|       |    // See implementation notes in CommandLineFlag::Get().
  659|     10|    union U {
  660|     10|      T value;
  661|     10|      U() {}
  662|     10|      ~U() { value.~T(); }
  663|     10|    };
  664|     10|    U u;
  665|       |
  666|       |#if !defined(NDEBUG)
  667|       |    impl_.AssertValidType(base_internal::FastTypeId<T>(), &GenRuntimeTypeId<T>);
  668|       |#endif
  669|       |
  670|     10|    if (ABSL_PREDICT_FALSE(!value_.Get(impl_.seq_lock_, u.value))) {
  ------------------
  |  |  178|     10|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 6, False: 4]
  |  |  |  Branch (178:49): [Folded, False: 10]
  |  |  |  Branch (178:58): [True: 6, False: 4]
  |  |  ------------------
  ------------------
  671|      6|      impl_.Read(&u.value);
  672|      6|    }
  673|     10|    return std::move(u.value);
  674|     10|  }
_ZZNK4absl12lts_2024011614flags_internal4FlagINS0_8DurationEE3GetEvEN1UC2Ev:
  661|     10|      U() {}
_ZNK4absl12lts_2024011614flags_internal9FlagValueINS0_8DurationELNS1_20FlagValueStorageKindE2EE3GetERKNS1_12SequenceLockERS3_:
  384|     10|  bool Get(const SequenceLock& lock, T& dst) const {
  385|     10|    return lock.TryRead(&dst, value_words, sizeof(T));
  386|     10|  }
_ZZNK4absl12lts_2024011614flags_internal4FlagINS0_8DurationEE3GetEvEN1UD2Ev:
  662|     10|      ~U() { value.~T(); }
_ZN4absl12lts_2024011614flags_internal12FlagImplPeer9InvokeGetIN8fuzztest8internal14TimeBudgetTypeENS1_4FlagIS6_EEEET_RKT0_:
  697|      2|  static T InvokeGet(const FlagType& flag) {
  698|      2|    return flag.Get();
  699|      2|  }
_ZNK4absl12lts_2024011614flags_internal4FlagIN8fuzztest8internal14TimeBudgetTypeEE3GetEv:
  657|      2|  T Get() const {
  658|       |    // See implementation notes in CommandLineFlag::Get().
  659|      2|    union U {
  660|      2|      T value;
  661|      2|      U() {}
  662|      2|      ~U() { value.~T(); }
  663|      2|    };
  664|      2|    U u;
  665|       |
  666|       |#if !defined(NDEBUG)
  667|       |    impl_.AssertValidType(base_internal::FastTypeId<T>(), &GenRuntimeTypeId<T>);
  668|       |#endif
  669|       |
  670|      2|    if (ABSL_PREDICT_FALSE(!value_.Get(impl_.seq_lock_, u.value))) {
  ------------------
  |  |  178|      2|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 2, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 2]
  |  |  |  Branch (178:58): [True: 2, False: 0]
  |  |  ------------------
  ------------------
  671|      2|      impl_.Read(&u.value);
  672|      2|    }
  673|      2|    return std::move(u.value);
  674|      2|  }
_ZZNK4absl12lts_2024011614flags_internal4FlagIN8fuzztest8internal14TimeBudgetTypeEE3GetEvEN1UC2Ev:
  661|      2|      U() {}
_ZNK4absl12lts_2024011614flags_internal9FlagValueIN8fuzztest8internal14TimeBudgetTypeELNS1_20FlagValueStorageKindE0EE3GetERKNS1_12SequenceLockERS5_:
  359|      2|  bool Get(const SequenceLock&, T& dst) const {
  360|      2|    int64_t storage = value.load(std::memory_order_acquire);
  361|      2|    if (ABSL_PREDICT_FALSE(storage == 0)) {
  ------------------
  |  |  178|      2|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 2, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 2]
  |  |  |  Branch (178:58): [True: 2, False: 0]
  |  |  ------------------
  ------------------
  362|      2|      return false;
  363|      2|    }
  364|      0|    dst = absl::bit_cast<FlagValueAndInitBit<T>>(storage).value;
  365|      0|    return true;
  366|      2|  }
_ZNK4absl12lts_2024011614flags_internal8FlagImpl4ReadIN8fuzztest8internal14TimeBudgetTypeETnNSt3__19enable_ifIXeqclsr14flags_internalE11StorageKindIT_EELNS1_20FlagValueStorageKindE0EEiE4typeELi0EEEvPS9_:
  465|      2|  void Read(T* value) const ABSL_LOCKS_EXCLUDED(*DataGuard()) {
  466|      2|    *value = absl::bit_cast<FlagValueAndInitBit<T>>(ReadOneWord()).value;
  467|      2|  }
_ZZNK4absl12lts_2024011614flags_internal4FlagIN8fuzztest8internal14TimeBudgetTypeEE3GetEvEN1UD2Ev:
  662|      2|      ~U() { value.~T(); }
_ZN4absl12lts_2024011614flags_internal7FlagOpsIbEEPvNS1_6FlagOpEPKvS3_S3_:
  713|     14|void* FlagOps(FlagOp op, const void* v1, void* v2, void* v3) {
  714|     14|  switch (op) {
  ------------------
  |  Branch (714:11): [True: 14, False: 0]
  ------------------
  715|      0|    case FlagOp::kAlloc: {
  ------------------
  |  Branch (715:5): [True: 0, False: 14]
  ------------------
  716|      0|      std::allocator<T> alloc;
  717|      0|      return std::allocator_traits<std::allocator<T>>::allocate(alloc, 1);
  718|      0|    }
  719|      0|    case FlagOp::kDelete: {
  ------------------
  |  Branch (719:5): [True: 0, False: 14]
  ------------------
  720|      0|      T* p = static_cast<T*>(v2);
  721|      0|      p->~T();
  722|      0|      std::allocator<T> alloc;
  723|      0|      std::allocator_traits<std::allocator<T>>::deallocate(alloc, p, 1);
  724|      0|      return nullptr;
  725|      0|    }
  726|      0|    case FlagOp::kCopy:
  ------------------
  |  Branch (726:5): [True: 0, False: 14]
  ------------------
  727|      0|      *static_cast<T*>(v2) = *static_cast<const T*>(v1);
  728|      0|      return nullptr;
  729|      0|    case FlagOp::kCopyConstruct:
  ------------------
  |  Branch (729:5): [True: 0, False: 14]
  ------------------
  730|      0|      new (v2) T(*static_cast<const T*>(v1));
  731|      0|      return nullptr;
  732|      8|    case FlagOp::kSizeof:
  ------------------
  |  Branch (732:5): [True: 8, False: 6]
  ------------------
  733|      8|      return reinterpret_cast<void*>(static_cast<uintptr_t>(sizeof(T)));
  734|      0|    case FlagOp::kFastTypeId:
  ------------------
  |  Branch (734:5): [True: 0, False: 14]
  ------------------
  735|      0|      return const_cast<void*>(base_internal::FastTypeId<T>());
  736|      0|    case FlagOp::kRuntimeTypeId:
  ------------------
  |  Branch (736:5): [True: 0, False: 14]
  ------------------
  737|      0|      return const_cast<std::type_info*>(GenRuntimeTypeId<T>());
  738|      0|    case FlagOp::kParse: {
  ------------------
  |  Branch (738:5): [True: 0, False: 14]
  ------------------
  739|       |      // Initialize the temporary instance of type T based on current value in
  740|       |      // destination (which is going to be flag's default value).
  741|      0|      T temp(*static_cast<T*>(v2));
  742|      0|      if (!absl::ParseFlag<T>(*static_cast<const absl::string_view*>(v1), &temp,
  ------------------
  |  Branch (742:11): [True: 0, False: 0]
  ------------------
  743|      0|                              static_cast<std::string*>(v3))) {
  744|      0|        return nullptr;
  745|      0|      }
  746|      0|      *static_cast<T*>(v2) = std::move(temp);
  747|      0|      return v2;
  748|      0|    }
  749|      0|    case FlagOp::kUnparse:
  ------------------
  |  Branch (749:5): [True: 0, False: 14]
  ------------------
  750|      0|      *static_cast<std::string*>(v2) =
  751|      0|          absl::UnparseFlag<T>(*static_cast<const T*>(v1));
  752|      0|      return nullptr;
  753|      6|    case FlagOp::kValueOffset: {
  ------------------
  |  Branch (753:5): [True: 6, False: 8]
  ------------------
  754|       |      // Round sizeof(FlagImp) to a multiple of alignof(FlagValue<T>) to get the
  755|       |      // offset of the data.
  756|      6|      size_t round_to = alignof(FlagValue<T>);
  757|      6|      size_t offset =
  758|      6|          (sizeof(FlagImpl) + round_to - 1) / round_to * round_to;
  759|      6|      return reinterpret_cast<void*>(offset);
  760|      0|    }
  761|     14|  }
  762|      0|  return nullptr;
  763|     14|}
_ZN4absl12lts_2024011614flags_internal13FlagRegistrarIbLb1EEC2ERNS1_4FlagIbEEPKc:
  773|      4|  explicit FlagRegistrar(Flag<T>& flag, const char* filename) : flag_(flag) {
  774|      4|    if (do_register)
  ------------------
  |  Branch (774:9): [True: 4, Folded]
  ------------------
  775|      4|      flags_internal::RegisterCommandLineFlag(flag_.impl_, filename);
  776|      4|  }
_ZNO4absl12lts_2024011614flags_internal13FlagRegistrarIbLb1EE8OnUpdateEPFvvE:
  778|      2|  FlagRegistrar OnUpdate(FlagCallbackFunc cb) && {
  779|      2|    flag_.impl_.SetCallback(cb);
  780|      2|    return *this;
  781|      2|  }
_ZN4absl12lts_2024011614flags_internal12FlagImplPeer9InvokeGetIbNS1_4FlagIbEEEET_RKT0_:
  697|      6|  static T InvokeGet(const FlagType& flag) {
  698|      6|    return flag.Get();
  699|      6|  }
_ZNK4absl12lts_2024011614flags_internal4FlagIbE3GetEv:
  657|      6|  T Get() const {
  658|       |    // See implementation notes in CommandLineFlag::Get().
  659|      6|    union U {
  660|      6|      T value;
  661|      6|      U() {}
  662|      6|      ~U() { value.~T(); }
  663|      6|    };
  664|      6|    U u;
  665|       |
  666|       |#if !defined(NDEBUG)
  667|       |    impl_.AssertValidType(base_internal::FastTypeId<T>(), &GenRuntimeTypeId<T>);
  668|       |#endif
  669|       |
  670|      6|    if (ABSL_PREDICT_FALSE(!value_.Get(impl_.seq_lock_, u.value))) {
  ------------------
  |  |  178|      6|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 2, False: 4]
  |  |  |  Branch (178:49): [Folded, False: 6]
  |  |  |  Branch (178:58): [True: 2, False: 4]
  |  |  ------------------
  ------------------
  671|      2|      impl_.Read(&u.value);
  672|      2|    }
  673|      6|    return std::move(u.value);
  674|      6|  }
_ZZNK4absl12lts_2024011614flags_internal4FlagIbE3GetEvEN1UC2Ev:
  661|      6|      U() {}
_ZNK4absl12lts_2024011614flags_internal9FlagValueIbLNS1_20FlagValueStorageKindE0EE3GetERKNS1_12SequenceLockERb:
  359|      6|  bool Get(const SequenceLock&, T& dst) const {
  360|      6|    int64_t storage = value.load(std::memory_order_acquire);
  361|      6|    if (ABSL_PREDICT_FALSE(storage == 0)) {
  ------------------
  |  |  178|      6|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 2, False: 4]
  |  |  |  Branch (178:49): [Folded, False: 6]
  |  |  |  Branch (178:58): [True: 2, False: 4]
  |  |  ------------------
  ------------------
  362|      2|      return false;
  363|      2|    }
  364|      4|    dst = absl::bit_cast<FlagValueAndInitBit<T>>(storage).value;
  365|      4|    return true;
  366|      6|  }
_ZNK4absl12lts_2024011614flags_internal8FlagImpl4ReadEPb:
  450|      2|  void Read(bool* value) const ABSL_LOCKS_EXCLUDED(*DataGuard()) {
  451|      2|    *value = ReadOneBool();
  452|      2|  }
_ZZNK4absl12lts_2024011614flags_internal4FlagIbE3GetEvEN1UD2Ev:
  662|      6|      ~U() { value.~T(); }
_ZNK4absl12lts_2024011614flags_internal13FlagRegistrarIbLb1EEcvNS1_18FlagRegistrarEmptyEEv:
  786|      4|  operator FlagRegistrarEmpty() const { return {}; }  // NOLINT
_ZN4absl12lts_2024011614flags_internal7FlagOpsINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEEEPvNS1_6FlagOpEPKvSA_SA_:
  713|     46|void* FlagOps(FlagOp op, const void* v1, void* v2, void* v3) {
  714|     46|  switch (op) {
  ------------------
  |  Branch (714:11): [True: 46, False: 0]
  ------------------
  715|      2|    case FlagOp::kAlloc: {
  ------------------
  |  Branch (715:5): [True: 2, False: 44]
  ------------------
  716|      2|      std::allocator<T> alloc;
  717|      2|      return std::allocator_traits<std::allocator<T>>::allocate(alloc, 1);
  718|      0|    }
  719|      2|    case FlagOp::kDelete: {
  ------------------
  |  Branch (719:5): [True: 2, False: 44]
  ------------------
  720|      2|      T* p = static_cast<T*>(v2);
  721|      2|      p->~T();
  722|      2|      std::allocator<T> alloc;
  723|      2|      std::allocator_traits<std::allocator<T>>::deallocate(alloc, p, 1);
  724|      2|      return nullptr;
  725|      0|    }
  726|      2|    case FlagOp::kCopy:
  ------------------
  |  Branch (726:5): [True: 2, False: 44]
  ------------------
  727|      2|      *static_cast<T*>(v2) = *static_cast<const T*>(v1);
  728|      2|      return nullptr;
  729|     14|    case FlagOp::kCopyConstruct:
  ------------------
  |  Branch (729:5): [True: 14, False: 32]
  ------------------
  730|     14|      new (v2) T(*static_cast<const T*>(v1));
  731|     14|      return nullptr;
  732|      0|    case FlagOp::kSizeof:
  ------------------
  |  Branch (732:5): [True: 0, False: 46]
  ------------------
  733|      0|      return reinterpret_cast<void*>(static_cast<uintptr_t>(sizeof(T)));
  734|      2|    case FlagOp::kFastTypeId:
  ------------------
  |  Branch (734:5): [True: 2, False: 44]
  ------------------
  735|      2|      return const_cast<void*>(base_internal::FastTypeId<T>());
  736|      0|    case FlagOp::kRuntimeTypeId:
  ------------------
  |  Branch (736:5): [True: 0, False: 46]
  ------------------
  737|      0|      return const_cast<std::type_info*>(GenRuntimeTypeId<T>());
  738|      2|    case FlagOp::kParse: {
  ------------------
  |  Branch (738:5): [True: 2, False: 44]
  ------------------
  739|       |      // Initialize the temporary instance of type T based on current value in
  740|       |      // destination (which is going to be flag's default value).
  741|      2|      T temp(*static_cast<T*>(v2));
  742|      2|      if (!absl::ParseFlag<T>(*static_cast<const absl::string_view*>(v1), &temp,
  ------------------
  |  Branch (742:11): [True: 0, False: 2]
  ------------------
  743|      2|                              static_cast<std::string*>(v3))) {
  744|      0|        return nullptr;
  745|      0|      }
  746|      2|      *static_cast<T*>(v2) = std::move(temp);
  747|      2|      return v2;
  748|      2|    }
  749|      0|    case FlagOp::kUnparse:
  ------------------
  |  Branch (749:5): [True: 0, False: 46]
  ------------------
  750|      0|      *static_cast<std::string*>(v2) =
  751|      0|          absl::UnparseFlag<T>(*static_cast<const T*>(v1));
  752|      0|      return nullptr;
  753|     22|    case FlagOp::kValueOffset: {
  ------------------
  |  Branch (753:5): [True: 22, False: 24]
  ------------------
  754|       |      // Round sizeof(FlagImp) to a multiple of alignof(FlagValue<T>) to get the
  755|       |      // offset of the data.
  756|     22|      size_t round_to = alignof(FlagValue<T>);
  757|     22|      size_t offset =
  758|     22|          (sizeof(FlagImpl) + round_to - 1) / round_to * round_to;
  759|     22|      return reinterpret_cast<void*>(offset);
  760|      2|    }
  761|     46|  }
  762|      0|  return nullptr;
  763|     46|}
_ZN4absl12lts_2024011614flags_internal13FlagRegistrarINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEELb1EEC2ERNS1_4FlagIS9_EEPKc:
  773|      6|  explicit FlagRegistrar(Flag<T>& flag, const char* filename) : flag_(flag) {
  774|      6|    if (do_register)
  ------------------
  |  Branch (774:9): [True: 6, Folded]
  ------------------
  775|      6|      flags_internal::RegisterCommandLineFlag(flag_.impl_, filename);
  776|      6|  }
_ZNK4absl12lts_2024011614flags_internal13FlagRegistrarINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEELb1EEcvNS1_18FlagRegistrarEmptyEEv:
  786|      6|  operator FlagRegistrarEmpty() const { return {}; }  // NOLINT
_ZN4absl12lts_2024011614flags_internal7FlagOpsINS0_8DurationEEEPvNS1_6FlagOpEPKvS4_S4_:
  713|     18|void* FlagOps(FlagOp op, const void* v1, void* v2, void* v3) {
  714|     18|  switch (op) {
  ------------------
  |  Branch (714:11): [True: 18, False: 0]
  ------------------
  715|      0|    case FlagOp::kAlloc: {
  ------------------
  |  Branch (715:5): [True: 0, False: 18]
  ------------------
  716|      0|      std::allocator<T> alloc;
  717|      0|      return std::allocator_traits<std::allocator<T>>::allocate(alloc, 1);
  718|      0|    }
  719|      0|    case FlagOp::kDelete: {
  ------------------
  |  Branch (719:5): [True: 0, False: 18]
  ------------------
  720|      0|      T* p = static_cast<T*>(v2);
  721|      0|      p->~T();
  722|      0|      std::allocator<T> alloc;
  723|      0|      std::allocator_traits<std::allocator<T>>::deallocate(alloc, p, 1);
  724|      0|      return nullptr;
  725|      0|    }
  726|      0|    case FlagOp::kCopy:
  ------------------
  |  Branch (726:5): [True: 0, False: 18]
  ------------------
  727|      0|      *static_cast<T*>(v2) = *static_cast<const T*>(v1);
  728|      0|      return nullptr;
  729|      0|    case FlagOp::kCopyConstruct:
  ------------------
  |  Branch (729:5): [True: 0, False: 18]
  ------------------
  730|      0|      new (v2) T(*static_cast<const T*>(v1));
  731|      0|      return nullptr;
  732|      6|    case FlagOp::kSizeof:
  ------------------
  |  Branch (732:5): [True: 6, False: 12]
  ------------------
  733|      6|      return reinterpret_cast<void*>(static_cast<uintptr_t>(sizeof(T)));
  734|      0|    case FlagOp::kFastTypeId:
  ------------------
  |  Branch (734:5): [True: 0, False: 18]
  ------------------
  735|      0|      return const_cast<void*>(base_internal::FastTypeId<T>());
  736|      0|    case FlagOp::kRuntimeTypeId:
  ------------------
  |  Branch (736:5): [True: 0, False: 18]
  ------------------
  737|      0|      return const_cast<std::type_info*>(GenRuntimeTypeId<T>());
  738|      0|    case FlagOp::kParse: {
  ------------------
  |  Branch (738:5): [True: 0, False: 18]
  ------------------
  739|       |      // Initialize the temporary instance of type T based on current value in
  740|       |      // destination (which is going to be flag's default value).
  741|      0|      T temp(*static_cast<T*>(v2));
  742|      0|      if (!absl::ParseFlag<T>(*static_cast<const absl::string_view*>(v1), &temp,
  ------------------
  |  Branch (742:11): [True: 0, False: 0]
  ------------------
  743|      0|                              static_cast<std::string*>(v3))) {
  744|      0|        return nullptr;
  745|      0|      }
  746|      0|      *static_cast<T*>(v2) = std::move(temp);
  747|      0|      return v2;
  748|      0|    }
  749|      0|    case FlagOp::kUnparse:
  ------------------
  |  Branch (749:5): [True: 0, False: 18]
  ------------------
  750|      0|      *static_cast<std::string*>(v2) =
  751|      0|          absl::UnparseFlag<T>(*static_cast<const T*>(v1));
  752|      0|      return nullptr;
  753|     12|    case FlagOp::kValueOffset: {
  ------------------
  |  Branch (753:5): [True: 12, False: 6]
  ------------------
  754|       |      // Round sizeof(FlagImp) to a multiple of alignof(FlagValue<T>) to get the
  755|       |      // offset of the data.
  756|     12|      size_t round_to = alignof(FlagValue<T>);
  757|     12|      size_t offset =
  758|     12|          (sizeof(FlagImpl) + round_to - 1) / round_to * round_to;
  759|     12|      return reinterpret_cast<void*>(offset);
  760|      0|    }
  761|     18|  }
  762|      0|  return nullptr;
  763|     18|}
_ZN4absl12lts_2024011614flags_internal13FlagRegistrarINS0_8DurationELb1EEC2ERNS1_4FlagIS3_EEPKc:
  773|      6|  explicit FlagRegistrar(Flag<T>& flag, const char* filename) : flag_(flag) {
  774|      6|    if (do_register)
  ------------------
  |  Branch (774:9): [True: 6, Folded]
  ------------------
  775|      6|      flags_internal::RegisterCommandLineFlag(flag_.impl_, filename);
  776|      6|  }
_ZNK4absl12lts_2024011614flags_internal13FlagRegistrarINS0_8DurationELb1EEcvNS1_18FlagRegistrarEmptyEEv:
  786|      6|  operator FlagRegistrarEmpty() const { return {}; }  // NOLINT
_ZN4absl12lts_2024011614flags_internal7FlagOpsIN8fuzztest8internal14TimeBudgetTypeEEEPvNS1_6FlagOpEPKvS6_S6_:
  713|      6|void* FlagOps(FlagOp op, const void* v1, void* v2, void* v3) {
  714|      6|  switch (op) {
  ------------------
  |  Branch (714:11): [True: 6, False: 0]
  ------------------
  715|      0|    case FlagOp::kAlloc: {
  ------------------
  |  Branch (715:5): [True: 0, False: 6]
  ------------------
  716|      0|      std::allocator<T> alloc;
  717|      0|      return std::allocator_traits<std::allocator<T>>::allocate(alloc, 1);
  718|      0|    }
  719|      0|    case FlagOp::kDelete: {
  ------------------
  |  Branch (719:5): [True: 0, False: 6]
  ------------------
  720|      0|      T* p = static_cast<T*>(v2);
  721|      0|      p->~T();
  722|      0|      std::allocator<T> alloc;
  723|      0|      std::allocator_traits<std::allocator<T>>::deallocate(alloc, p, 1);
  724|      0|      return nullptr;
  725|      0|    }
  726|      0|    case FlagOp::kCopy:
  ------------------
  |  Branch (726:5): [True: 0, False: 6]
  ------------------
  727|      0|      *static_cast<T*>(v2) = *static_cast<const T*>(v1);
  728|      0|      return nullptr;
  729|      0|    case FlagOp::kCopyConstruct:
  ------------------
  |  Branch (729:5): [True: 0, False: 6]
  ------------------
  730|      0|      new (v2) T(*static_cast<const T*>(v1));
  731|      0|      return nullptr;
  732|      2|    case FlagOp::kSizeof:
  ------------------
  |  Branch (732:5): [True: 2, False: 4]
  ------------------
  733|      2|      return reinterpret_cast<void*>(static_cast<uintptr_t>(sizeof(T)));
  734|      0|    case FlagOp::kFastTypeId:
  ------------------
  |  Branch (734:5): [True: 0, False: 6]
  ------------------
  735|      0|      return const_cast<void*>(base_internal::FastTypeId<T>());
  736|      0|    case FlagOp::kRuntimeTypeId:
  ------------------
  |  Branch (736:5): [True: 0, False: 6]
  ------------------
  737|      0|      return const_cast<std::type_info*>(GenRuntimeTypeId<T>());
  738|      0|    case FlagOp::kParse: {
  ------------------
  |  Branch (738:5): [True: 0, False: 6]
  ------------------
  739|       |      // Initialize the temporary instance of type T based on current value in
  740|       |      // destination (which is going to be flag's default value).
  741|      0|      T temp(*static_cast<T*>(v2));
  742|      0|      if (!absl::ParseFlag<T>(*static_cast<const absl::string_view*>(v1), &temp,
  ------------------
  |  Branch (742:11): [True: 0, False: 0]
  ------------------
  743|      0|                              static_cast<std::string*>(v3))) {
  744|      0|        return nullptr;
  745|      0|      }
  746|      0|      *static_cast<T*>(v2) = std::move(temp);
  747|      0|      return v2;
  748|      0|    }
  749|      0|    case FlagOp::kUnparse:
  ------------------
  |  Branch (749:5): [True: 0, False: 6]
  ------------------
  750|      0|      *static_cast<std::string*>(v2) =
  751|      0|          absl::UnparseFlag<T>(*static_cast<const T*>(v1));
  752|      0|      return nullptr;
  753|      4|    case FlagOp::kValueOffset: {
  ------------------
  |  Branch (753:5): [True: 4, False: 2]
  ------------------
  754|       |      // Round sizeof(FlagImp) to a multiple of alignof(FlagValue<T>) to get the
  755|       |      // offset of the data.
  756|      4|      size_t round_to = alignof(FlagValue<T>);
  757|      4|      size_t offset =
  758|      4|          (sizeof(FlagImpl) + round_to - 1) / round_to * round_to;
  759|      4|      return reinterpret_cast<void*>(offset);
  760|      0|    }
  761|      6|  }
  762|      0|  return nullptr;
  763|      6|}
_ZN4absl12lts_2024011614flags_internal13FlagRegistrarIN8fuzztest8internal14TimeBudgetTypeELb1EEC2ERNS1_4FlagIS5_EEPKc:
  773|      2|  explicit FlagRegistrar(Flag<T>& flag, const char* filename) : flag_(flag) {
  774|      2|    if (do_register)
  ------------------
  |  Branch (774:9): [True: 2, Folded]
  ------------------
  775|      2|      flags_internal::RegisterCommandLineFlag(flag_.impl_, filename);
  776|      2|  }
_ZNK4absl12lts_2024011614flags_internal13FlagRegistrarIN8fuzztest8internal14TimeBudgetTypeELb1EEcvNS1_18FlagRegistrarEmptyEEv:
  786|      2|  operator FlagRegistrarEmpty() const { return {}; }  // NOLINT
_ZN4absl12lts_2024011614flags_internal7FlagOpsImEEPvNS1_6FlagOpEPKvS3_S3_:
  713|     26|void* FlagOps(FlagOp op, const void* v1, void* v2, void* v3) {
  714|     26|  switch (op) {
  ------------------
  |  Branch (714:11): [True: 26, False: 0]
  ------------------
  715|      2|    case FlagOp::kAlloc: {
  ------------------
  |  Branch (715:5): [True: 2, False: 24]
  ------------------
  716|      2|      std::allocator<T> alloc;
  717|      2|      return std::allocator_traits<std::allocator<T>>::allocate(alloc, 1);
  718|      0|    }
  719|      2|    case FlagOp::kDelete: {
  ------------------
  |  Branch (719:5): [True: 2, False: 24]
  ------------------
  720|      2|      T* p = static_cast<T*>(v2);
  721|      2|      p->~T();
  722|      2|      std::allocator<T> alloc;
  723|      2|      std::allocator_traits<std::allocator<T>>::deallocate(alloc, p, 1);
  724|      2|      return nullptr;
  725|      0|    }
  726|      0|    case FlagOp::kCopy:
  ------------------
  |  Branch (726:5): [True: 0, False: 26]
  ------------------
  727|      0|      *static_cast<T*>(v2) = *static_cast<const T*>(v1);
  728|      0|      return nullptr;
  729|      2|    case FlagOp::kCopyConstruct:
  ------------------
  |  Branch (729:5): [True: 2, False: 24]
  ------------------
  730|      2|      new (v2) T(*static_cast<const T*>(v1));
  731|      2|      return nullptr;
  732|      6|    case FlagOp::kSizeof:
  ------------------
  |  Branch (732:5): [True: 6, False: 20]
  ------------------
  733|      6|      return reinterpret_cast<void*>(static_cast<uintptr_t>(sizeof(T)));
  734|      2|    case FlagOp::kFastTypeId:
  ------------------
  |  Branch (734:5): [True: 2, False: 24]
  ------------------
  735|      2|      return const_cast<void*>(base_internal::FastTypeId<T>());
  736|      0|    case FlagOp::kRuntimeTypeId:
  ------------------
  |  Branch (736:5): [True: 0, False: 26]
  ------------------
  737|      0|      return const_cast<std::type_info*>(GenRuntimeTypeId<T>());
  738|      2|    case FlagOp::kParse: {
  ------------------
  |  Branch (738:5): [True: 2, False: 24]
  ------------------
  739|       |      // Initialize the temporary instance of type T based on current value in
  740|       |      // destination (which is going to be flag's default value).
  741|      2|      T temp(*static_cast<T*>(v2));
  742|      2|      if (!absl::ParseFlag<T>(*static_cast<const absl::string_view*>(v1), &temp,
  ------------------
  |  Branch (742:11): [True: 0, False: 2]
  ------------------
  743|      2|                              static_cast<std::string*>(v3))) {
  744|      0|        return nullptr;
  745|      0|      }
  746|      2|      *static_cast<T*>(v2) = std::move(temp);
  747|      2|      return v2;
  748|      2|    }
  749|      0|    case FlagOp::kUnparse:
  ------------------
  |  Branch (749:5): [True: 0, False: 26]
  ------------------
  750|      0|      *static_cast<std::string*>(v2) =
  751|      0|          absl::UnparseFlag<T>(*static_cast<const T*>(v1));
  752|      0|      return nullptr;
  753|     10|    case FlagOp::kValueOffset: {
  ------------------
  |  Branch (753:5): [True: 10, False: 16]
  ------------------
  754|       |      // Round sizeof(FlagImp) to a multiple of alignof(FlagValue<T>) to get the
  755|       |      // offset of the data.
  756|     10|      size_t round_to = alignof(FlagValue<T>);
  757|     10|      size_t offset =
  758|     10|          (sizeof(FlagImpl) + round_to - 1) / round_to * round_to;
  759|     10|      return reinterpret_cast<void*>(offset);
  760|      2|    }
  761|     26|  }
  762|      0|  return nullptr;
  763|     26|}
_ZN4absl12lts_2024011614flags_internal13FlagRegistrarImLb1EEC2ERNS1_4FlagImEEPKc:
  773|      4|  explicit FlagRegistrar(Flag<T>& flag, const char* filename) : flag_(flag) {
  774|      4|    if (do_register)
  ------------------
  |  Branch (774:9): [True: 4, Folded]
  ------------------
  775|      4|      flags_internal::RegisterCommandLineFlag(flag_.impl_, filename);
  776|      4|  }
_ZNK4absl12lts_2024011614flags_internal13FlagRegistrarImLb1EEcvNS1_18FlagRegistrarEmptyEEv:
  786|      4|  operator FlagRegistrarEmpty() const { return {}; }  // NOLINT
_ZN4absl12lts_2024011614flags_internal7FlagOpsINSt3__18optionalImEEEEPvNS1_6FlagOpEPKvS6_S6_:
  713|      6|void* FlagOps(FlagOp op, const void* v1, void* v2, void* v3) {
  714|      6|  switch (op) {
  ------------------
  |  Branch (714:11): [True: 6, False: 0]
  ------------------
  715|      0|    case FlagOp::kAlloc: {
  ------------------
  |  Branch (715:5): [True: 0, False: 6]
  ------------------
  716|      0|      std::allocator<T> alloc;
  717|      0|      return std::allocator_traits<std::allocator<T>>::allocate(alloc, 1);
  718|      0|    }
  719|      0|    case FlagOp::kDelete: {
  ------------------
  |  Branch (719:5): [True: 0, False: 6]
  ------------------
  720|      0|      T* p = static_cast<T*>(v2);
  721|      0|      p->~T();
  722|      0|      std::allocator<T> alloc;
  723|      0|      std::allocator_traits<std::allocator<T>>::deallocate(alloc, p, 1);
  724|      0|      return nullptr;
  725|      0|    }
  726|      0|    case FlagOp::kCopy:
  ------------------
  |  Branch (726:5): [True: 0, False: 6]
  ------------------
  727|      0|      *static_cast<T*>(v2) = *static_cast<const T*>(v1);
  728|      0|      return nullptr;
  729|      0|    case FlagOp::kCopyConstruct:
  ------------------
  |  Branch (729:5): [True: 0, False: 6]
  ------------------
  730|      0|      new (v2) T(*static_cast<const T*>(v1));
  731|      0|      return nullptr;
  732|      2|    case FlagOp::kSizeof:
  ------------------
  |  Branch (732:5): [True: 2, False: 4]
  ------------------
  733|      2|      return reinterpret_cast<void*>(static_cast<uintptr_t>(sizeof(T)));
  734|      0|    case FlagOp::kFastTypeId:
  ------------------
  |  Branch (734:5): [True: 0, False: 6]
  ------------------
  735|      0|      return const_cast<void*>(base_internal::FastTypeId<T>());
  736|      0|    case FlagOp::kRuntimeTypeId:
  ------------------
  |  Branch (736:5): [True: 0, False: 6]
  ------------------
  737|      0|      return const_cast<std::type_info*>(GenRuntimeTypeId<T>());
  738|      0|    case FlagOp::kParse: {
  ------------------
  |  Branch (738:5): [True: 0, False: 6]
  ------------------
  739|       |      // Initialize the temporary instance of type T based on current value in
  740|       |      // destination (which is going to be flag's default value).
  741|      0|      T temp(*static_cast<T*>(v2));
  742|      0|      if (!absl::ParseFlag<T>(*static_cast<const absl::string_view*>(v1), &temp,
  ------------------
  |  Branch (742:11): [True: 0, False: 0]
  ------------------
  743|      0|                              static_cast<std::string*>(v3))) {
  744|      0|        return nullptr;
  745|      0|      }
  746|      0|      *static_cast<T*>(v2) = std::move(temp);
  747|      0|      return v2;
  748|      0|    }
  749|      0|    case FlagOp::kUnparse:
  ------------------
  |  Branch (749:5): [True: 0, False: 6]
  ------------------
  750|      0|      *static_cast<std::string*>(v2) =
  751|      0|          absl::UnparseFlag<T>(*static_cast<const T*>(v1));
  752|      0|      return nullptr;
  753|      4|    case FlagOp::kValueOffset: {
  ------------------
  |  Branch (753:5): [True: 4, False: 2]
  ------------------
  754|       |      // Round sizeof(FlagImp) to a multiple of alignof(FlagValue<T>) to get the
  755|       |      // offset of the data.
  756|      4|      size_t round_to = alignof(FlagValue<T>);
  757|      4|      size_t offset =
  758|      4|          (sizeof(FlagImpl) + round_to - 1) / round_to * round_to;
  759|      4|      return reinterpret_cast<void*>(offset);
  760|      0|    }
  761|      6|  }
  762|      0|  return nullptr;
  763|      6|}
_ZN4absl12lts_2024011614flags_internal13FlagRegistrarINSt3__18optionalImEELb1EEC2ERNS1_4FlagIS5_EEPKc:
  773|      2|  explicit FlagRegistrar(Flag<T>& flag, const char* filename) : flag_(flag) {
  774|      2|    if (do_register)
  ------------------
  |  Branch (774:9): [True: 2, Folded]
  ------------------
  775|      2|      flags_internal::RegisterCommandLineFlag(flag_.impl_, filename);
  776|      2|  }
_ZNK4absl12lts_2024011614flags_internal13FlagRegistrarINSt3__18optionalImEELb1EEcvNS1_18FlagRegistrarEmptyEEv:
  786|      2|  operator FlagRegistrarEmpty() const { return {}; }  // NOLINT
_ZN4absl12lts_2024011614flags_internal12FlagImplPeer9InvokeGetINSt3__112basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEENS1_4FlagISA_EEEET_RKT0_:
  697|     14|  static T InvokeGet(const FlagType& flag) {
  698|     14|    return flag.Get();
  699|     14|  }
_ZNK4absl12lts_2024011614flags_internal4FlagINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEE3GetEv:
  657|     14|  T Get() const {
  658|       |    // See implementation notes in CommandLineFlag::Get().
  659|     14|    union U {
  660|     14|      T value;
  661|     14|      U() {}
  662|     14|      ~U() { value.~T(); }
  663|     14|    };
  664|     14|    U u;
  665|       |
  666|       |#if !defined(NDEBUG)
  667|       |    impl_.AssertValidType(base_internal::FastTypeId<T>(), &GenRuntimeTypeId<T>);
  668|       |#endif
  669|       |
  670|     14|    if (ABSL_PREDICT_FALSE(!value_.Get(impl_.seq_lock_, u.value))) {
  ------------------
  |  |  178|     14|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 14, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 14]
  |  |  |  Branch (178:58): [True: 14, False: 0]
  |  |  ------------------
  ------------------
  671|     14|      impl_.Read(&u.value);
  672|     14|    }
  673|     14|    return std::move(u.value);
  674|     14|  }
_ZZNK4absl12lts_2024011614flags_internal4FlagINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEE3GetEvEN1UC2Ev:
  661|     14|      U() {}
_ZNK4absl12lts_2024011614flags_internal9FlagValueINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEELNS1_20FlagValueStorageKindE3EE3GetERKNS1_12SequenceLockERS9_:
  397|     14|  bool Get(const SequenceLock&, T&) const { return false; }
_ZZNK4absl12lts_2024011614flags_internal4FlagINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEE3GetEvEN1UD2Ev:
  662|     14|      ~U() { value.~T(); }
_ZNK4absl12lts_2024011614flags_internal8FlagImpl16ValueStorageKindEv:
  537|     76|  FlagValueStorageKind ValueStorageKind() const {
  538|     76|    return static_cast<FlagValueStorageKind>(value_storage_kind_);
  539|     76|  }
_ZNK4absl12lts_2024011614flags_internal8FlagImpl11DefaultKindEv:
  541|      4|      ABSL_EXCLUSIVE_LOCKS_REQUIRED(*DataGuard()) {
  542|      4|    return static_cast<FlagDefaultKind>(def_kind_);
  543|      4|  }
_ZN4absl12lts_2024011614flags_internal5AllocEPFPvNS1_6FlagOpEPKvS2_S2_E:
   97|      4|inline void* Alloc(FlagOpFn op) {
   98|      4|  return op(FlagOp::kAlloc, nullptr, nullptr, nullptr);
   99|      4|}
_ZN4absl12lts_2024011614flags_internal6DeleteEPFPvNS1_6FlagOpEPKvS2_S2_ES2_:
  101|      4|inline void Delete(FlagOpFn op, void* obj) {
  102|      4|  op(FlagOp::kDelete, nullptr, obj, nullptr);
  103|      4|}
_ZN4absl12lts_2024011614flags_internal4CopyEPFPvNS1_6FlagOpEPKvS2_S2_ES5_S2_:
  105|      2|inline void Copy(FlagOpFn op, const void* src, void* dst) {
  106|      2|  op(FlagOp::kCopy, src, dst, nullptr);
  107|      2|}
_ZN4absl12lts_2024011614flags_internal13CopyConstructEPFPvNS1_6FlagOpEPKvS2_S2_ES5_S2_:
  110|     26|inline void CopyConstruct(FlagOpFn op, const void* src, void* dst) {
  111|     26|  op(FlagOp::kCopyConstruct, src, dst, nullptr);
  112|     26|}
_ZN4absl12lts_2024011614flags_internal5CloneEPFPvNS1_6FlagOpEPKvS2_S2_ES5_:
  114|      2|inline void* Clone(FlagOpFn op, const void* obj) {
  115|      2|  void* res = flags_internal::Alloc(op);
  116|      2|  flags_internal::CopyConstruct(op, obj, res);
  117|      2|  return res;
  118|      2|}
_ZN4absl12lts_2024011614flags_internal5ParseEPFPvNS1_6FlagOpEPKvS2_S2_ENSt3__117basic_string_viewIcNS8_11char_traitsIcEEEES2_PNS8_12basic_stringIcSB_NS8_9allocatorIcEEEE:
  121|      4|                  std::string* error) {
  122|      4|  return op(FlagOp::kParse, &text, dst, error) != nullptr;
  123|      4|}
_ZN4absl12lts_2024011614flags_internal6SizeofEPFPvNS1_6FlagOpEPKvS2_S2_E:
  131|     24|inline size_t Sizeof(FlagOpFn op) {
  132|       |  // This sequence of casts reverses the sequence from
  133|       |  // `flags_internal::FlagOps()`
  134|     24|  return static_cast<size_t>(reinterpret_cast<intptr_t>(
  135|     24|      op(FlagOp::kSizeof, nullptr, nullptr, nullptr)));
  136|     24|}
_ZN4absl12lts_2024011614flags_internal10FastTypeIdEPFPvNS1_6FlagOpEPKvS2_S2_E:
  138|      4|inline FlagFastTypeId FastTypeId(FlagOpFn op) {
  139|      4|  return reinterpret_cast<FlagFastTypeId>(
  140|      4|      op(FlagOp::kFastTypeId, nullptr, nullptr, nullptr));
  141|      4|}
_ZN4absl12lts_2024011614flags_internal11ValueOffsetEPFPvNS1_6FlagOpEPKvS2_S2_E:
  151|     74|inline ptrdiff_t ValueOffset(FlagOpFn op) {
  152|       |  // This sequence of casts reverses the sequence from
  153|       |  // `flags_internal::FlagOps()`
  154|     74|  return static_cast<ptrdiff_t>(reinterpret_cast<intptr_t>(
  155|     74|      op(FlagOp::kValueOffset, nullptr, nullptr, nullptr)));
  156|     74|}
_ZN4absl12lts_2024011614flags_internal16InitDefaultValueINSt3__16vectorINS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS8_ISA_EEEEEET_NS1_11EmptyBracesE:
  282|      6|constexpr T InitDefaultValue(EmptyBraces) {
  283|      6|  return T{};
  284|      6|}
_ZN4absl12lts_2024011614flags_internal7FlagOpsINSt3__16vectorINS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS8_ISA_EEEEEEPvNS1_6FlagOpEPKvSD_SD_:
  713|     26|void* FlagOps(FlagOp op, const void* v1, void* v2, void* v3) {
  714|     26|  switch (op) {
  ------------------
  |  Branch (714:11): [True: 26, False: 0]
  ------------------
  715|      0|    case FlagOp::kAlloc: {
  ------------------
  |  Branch (715:5): [True: 0, False: 26]
  ------------------
  716|      0|      std::allocator<T> alloc;
  717|      0|      return std::allocator_traits<std::allocator<T>>::allocate(alloc, 1);
  718|      0|    }
  719|      0|    case FlagOp::kDelete: {
  ------------------
  |  Branch (719:5): [True: 0, False: 26]
  ------------------
  720|      0|      T* p = static_cast<T*>(v2);
  721|      0|      p->~T();
  722|      0|      std::allocator<T> alloc;
  723|      0|      std::allocator_traits<std::allocator<T>>::deallocate(alloc, p, 1);
  724|      0|      return nullptr;
  725|      0|    }
  726|      0|    case FlagOp::kCopy:
  ------------------
  |  Branch (726:5): [True: 0, False: 26]
  ------------------
  727|      0|      *static_cast<T*>(v2) = *static_cast<const T*>(v1);
  728|      0|      return nullptr;
  729|     10|    case FlagOp::kCopyConstruct:
  ------------------
  |  Branch (729:5): [True: 10, False: 16]
  ------------------
  730|     10|      new (v2) T(*static_cast<const T*>(v1));
  731|     10|      return nullptr;
  732|      0|    case FlagOp::kSizeof:
  ------------------
  |  Branch (732:5): [True: 0, False: 26]
  ------------------
  733|      0|      return reinterpret_cast<void*>(static_cast<uintptr_t>(sizeof(T)));
  734|      0|    case FlagOp::kFastTypeId:
  ------------------
  |  Branch (734:5): [True: 0, False: 26]
  ------------------
  735|      0|      return const_cast<void*>(base_internal::FastTypeId<T>());
  736|      0|    case FlagOp::kRuntimeTypeId:
  ------------------
  |  Branch (736:5): [True: 0, False: 26]
  ------------------
  737|      0|      return const_cast<std::type_info*>(GenRuntimeTypeId<T>());
  738|      0|    case FlagOp::kParse: {
  ------------------
  |  Branch (738:5): [True: 0, False: 26]
  ------------------
  739|       |      // Initialize the temporary instance of type T based on current value in
  740|       |      // destination (which is going to be flag's default value).
  741|      0|      T temp(*static_cast<T*>(v2));
  742|      0|      if (!absl::ParseFlag<T>(*static_cast<const absl::string_view*>(v1), &temp,
  ------------------
  |  Branch (742:11): [True: 0, False: 0]
  ------------------
  743|      0|                              static_cast<std::string*>(v3))) {
  744|      0|        return nullptr;
  745|      0|      }
  746|      0|      *static_cast<T*>(v2) = std::move(temp);
  747|      0|      return v2;
  748|      0|    }
  749|      0|    case FlagOp::kUnparse:
  ------------------
  |  Branch (749:5): [True: 0, False: 26]
  ------------------
  750|      0|      *static_cast<std::string*>(v2) =
  751|      0|          absl::UnparseFlag<T>(*static_cast<const T*>(v1));
  752|      0|      return nullptr;
  753|     16|    case FlagOp::kValueOffset: {
  ------------------
  |  Branch (753:5): [True: 16, False: 10]
  ------------------
  754|       |      // Round sizeof(FlagImp) to a multiple of alignof(FlagValue<T>) to get the
  755|       |      // offset of the data.
  756|     16|      size_t round_to = alignof(FlagValue<T>);
  757|     16|      size_t offset =
  758|     16|          (sizeof(FlagImpl) + round_to - 1) / round_to * round_to;
  759|     16|      return reinterpret_cast<void*>(offset);
  760|      0|    }
  761|     26|  }
  762|      0|  return nullptr;
  763|     26|}
_ZN4absl12lts_2024011614flags_internal13FlagRegistrarINSt3__16vectorINS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS8_ISA_EEEELb1EEC2ERNS1_4FlagISC_EEPKc:
  773|      8|  explicit FlagRegistrar(Flag<T>& flag, const char* filename) : flag_(flag) {
  774|      8|    if (do_register)
  ------------------
  |  Branch (774:9): [True: 8, Folded]
  ------------------
  775|      8|      flags_internal::RegisterCommandLineFlag(flag_.impl_, filename);
  776|      8|  }
_ZNO4absl12lts_2024011614flags_internal13FlagRegistrarINSt3__16vectorINS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS8_ISA_EEEELb1EE8OnUpdateEPFvvE:
  778|      6|  FlagRegistrar OnUpdate(FlagCallbackFunc cb) && {
  779|      6|    flag_.impl_.SetCallback(cb);
  780|      6|    return *this;
  781|      6|  }
_ZN4absl12lts_2024011614flags_internal12FlagImplPeer9InvokeGetINSt3__16vectorINS4_12basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEENS9_ISB_EEEENS1_4FlagISD_EEEET_RKT0_:
  697|     10|  static T InvokeGet(const FlagType& flag) {
  698|     10|    return flag.Get();
  699|     10|  }
_ZNK4absl12lts_2024011614flags_internal4FlagINSt3__16vectorINS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS8_ISA_EEEEE3GetEv:
  657|     10|  T Get() const {
  658|       |    // See implementation notes in CommandLineFlag::Get().
  659|     10|    union U {
  660|     10|      T value;
  661|     10|      U() {}
  662|     10|      ~U() { value.~T(); }
  663|     10|    };
  664|     10|    U u;
  665|       |
  666|       |#if !defined(NDEBUG)
  667|       |    impl_.AssertValidType(base_internal::FastTypeId<T>(), &GenRuntimeTypeId<T>);
  668|       |#endif
  669|       |
  670|     10|    if (ABSL_PREDICT_FALSE(!value_.Get(impl_.seq_lock_, u.value))) {
  ------------------
  |  |  178|     10|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 10, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 10]
  |  |  |  Branch (178:58): [True: 10, False: 0]
  |  |  ------------------
  ------------------
  671|     10|      impl_.Read(&u.value);
  672|     10|    }
  673|     10|    return std::move(u.value);
  674|     10|  }
_ZZNK4absl12lts_2024011614flags_internal4FlagINSt3__16vectorINS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS8_ISA_EEEEE3GetEvEN1UC2Ev:
  661|     10|      U() {}
_ZNK4absl12lts_2024011614flags_internal9FlagValueINSt3__16vectorINS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS8_ISA_EEEELNS1_20FlagValueStorageKindE3EE3GetERKNS1_12SequenceLockERSC_:
  397|     10|  bool Get(const SequenceLock&, T&) const { return false; }
_ZZNK4absl12lts_2024011614flags_internal4FlagINSt3__16vectorINS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS8_ISA_EEEEE3GetEvEN1UD2Ev:
  662|     10|      ~U() { value.~T(); }
_ZNK4absl12lts_2024011614flags_internal13FlagRegistrarINSt3__16vectorINS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS8_ISA_EEEELb1EEcvNS1_18FlagRegistrarEmptyEEv:
  786|      8|  operator FlagRegistrarEmpty() const { return {}; }  // NOLINT

_ZN4absl12lts_2024011614flags_internal21PrivateHandleAccessor9ParseFromERNS0_15CommandLineFlagENSt3__117basic_string_viewIcNS5_11char_traitsIcEEEENS1_15FlagSettingModeENS1_11ValueSourceERNS5_12basic_stringIcS8_NS5_9allocatorIcEEEE:
   58|      4|                                      std::string& error) {
   59|      4|  return flag.ParseFrom(value, set_mode, source, error);
   60|      4|}

_ZN4absl12lts_2024011614flags_internal21ProgramInvocationNameEv:
   36|      2|std::string ProgramInvocationName() {
   37|      2|  absl::MutexLock l(&program_name_guard);
   38|       |
   39|      2|  return program_name ? *program_name : "UNKNOWN";
  ------------------
  |  Branch (39:10): [True: 0, False: 2]
  ------------------
   40|      2|}
_ZN4absl12lts_2024011614flags_internal24SetProgramInvocationNameENSt3__117basic_string_viewIcNS2_11char_traitsIcEEEE:
   49|      2|void SetProgramInvocationName(absl::string_view prog_name_str) {
   50|      2|  absl::MutexLock l(&program_name_guard);
   51|       |
   52|      2|  if (!program_name)
  ------------------
  |  Branch (52:7): [True: 2, False: 0]
  ------------------
   53|      2|    program_name = new std::string(prog_name_str);
   54|      0|  else
   55|      0|    program_name->assign(prog_name_str.data(), prog_name_str.size());
   56|      2|}

_ZNK4absl12lts_2024011614flags_internal12SequenceLock7TryReadEPvPKNSt3__16atomicImEEm:
   80|     20|  bool TryRead(void* dst, const std::atomic<uint64_t>* src, size_t size) const {
   81|       |    // Acquire barrier ensures that no loads done by f() are reordered
   82|       |    // above the first load of the sequence counter.
   83|     20|    int64_t seq_before = lock_.load(std::memory_order_acquire);
   84|     20|    if (ABSL_PREDICT_FALSE(seq_before & 1) == 1) return false;
  ------------------
  |  |  178|     20|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:49): [Folded, False: 20]
  |  |  |  Branch (178:58): [True: 8, False: 12]
  |  |  ------------------
  ------------------
  |  Branch (84:9): [True: 8, False: 12]
  ------------------
   85|     12|    RelaxedCopyFromAtomic(dst, src, size);
   86|       |    // Another acquire fence ensures that the load of 'lock_' below is
   87|       |    // strictly ordered after the RelaxedCopyToAtomic call above.
   88|     12|    std::atomic_thread_fence(std::memory_order_acquire);
   89|     12|    int64_t seq_after = lock_.load(std::memory_order_relaxed);
   90|     12|    return ABSL_PREDICT_TRUE(seq_before == seq_after);
  ------------------
  |  |  179|     12|#define ABSL_PREDICT_TRUE(x) (__builtin_expect(false || (x), true))
  |  |  ------------------
  |  |  |  Branch (179:48): [Folded, False: 12]
  |  |  |  Branch (179:57): [True: 12, False: 0]
  |  |  ------------------
  ------------------
   91|     20|  }
_ZN4absl12lts_2024011614flags_internal12SequenceLock21RelaxedCopyFromAtomicEPvPKNSt3__16atomicImEEm:
  144|     12|                                    size_t size) {
  145|     12|    char* dst_byte = static_cast<char*>(dst);
  146|     26|    while (size >= sizeof(uint64_t)) {
  ------------------
  |  Branch (146:12): [True: 14, False: 12]
  ------------------
  147|     14|      uint64_t word = src->load(std::memory_order_relaxed);
  148|     14|      std::memcpy(dst_byte, &word, sizeof(word));
  149|     14|      dst_byte += sizeof(word);
  150|     14|      src++;
  151|     14|      size -= sizeof(word);
  152|     14|    }
  153|     12|    if (size > 0) {
  ------------------
  |  Branch (153:9): [True: 10, False: 2]
  ------------------
  154|     10|      uint64_t word = src->load(std::memory_order_relaxed);
  155|     10|      std::memcpy(dst_byte, &word, size);
  156|     10|    }
  157|     12|  }
_ZN4absl12lts_2024011614flags_internal12SequenceLock15MarkInitializedEv:
   63|     30|  void MarkInitialized() {
   64|       |    assert(lock_.load(std::memory_order_relaxed) == kUninitialized);
   65|     30|    lock_.store(0, std::memory_order_release);
   66|     30|  }
_ZN4absl12lts_2024011614flags_internal12SequenceLock26IncrementModificationCountEv:
  134|      4|  void IncrementModificationCount() {
  135|      4|    int64_t val = lock_.load(std::memory_order_relaxed);
  136|       |    assert(val != kUninitialized);
  137|      4|    lock_.store(val + 2, std::memory_order_relaxed);
  138|      4|  }

_ZN4absl12lts_2024011614flags_internal16HandleUsageFlagsERNSt3__113basic_ostreamIcNS2_11char_traitsIcEEEENS2_17basic_string_viewIcS5_EE:
  373|      2|                          absl::string_view program_usage_message) {
  374|      2|  switch (GetFlagsHelpMode()) {
  ------------------
  |  Branch (374:11): [True: 2, False: 0]
  ------------------
  375|      2|    case HelpMode::kNone:
  ------------------
  |  Branch (375:5): [True: 2, False: 0]
  ------------------
  376|      2|      break;
  377|      0|    case HelpMode::kImportant:
  ------------------
  |  Branch (377:5): [True: 0, False: 2]
  ------------------
  378|      0|      flags_internal::FlagsHelpImpl(
  379|      0|          out, flags_internal::GetUsageConfig().contains_help_flags,
  380|      0|          GetFlagsHelpFormat(), program_usage_message);
  381|      0|      break;
  382|       |
  383|      0|    case HelpMode::kShort:
  ------------------
  |  Branch (383:5): [True: 0, False: 2]
  ------------------
  384|      0|      flags_internal::FlagsHelpImpl(
  385|      0|          out, flags_internal::GetUsageConfig().contains_helpshort_flags,
  386|      0|          GetFlagsHelpFormat(), program_usage_message);
  387|      0|      break;
  388|       |
  389|      0|    case HelpMode::kFull:
  ------------------
  |  Branch (389:5): [True: 0, False: 2]
  ------------------
  390|      0|      flags_internal::FlagsHelp(out, "", GetFlagsHelpFormat(),
  391|      0|                                program_usage_message);
  392|      0|      break;
  393|       |
  394|      0|    case HelpMode::kPackage:
  ------------------
  |  Branch (394:5): [True: 0, False: 2]
  ------------------
  395|      0|      flags_internal::FlagsHelpImpl(
  396|      0|          out, flags_internal::GetUsageConfig().contains_helppackage_flags,
  397|      0|          GetFlagsHelpFormat(), program_usage_message);
  398|      0|      break;
  399|       |
  400|      0|    case HelpMode::kMatch: {
  ------------------
  |  Branch (400:5): [True: 0, False: 2]
  ------------------
  401|      0|      std::string substr = GetFlagsHelpMatchSubstr();
  402|      0|      if (substr.empty()) {
  ------------------
  |  Branch (402:11): [True: 0, False: 0]
  ------------------
  403|       |        // show all options
  404|      0|        flags_internal::FlagsHelp(out, substr, GetFlagsHelpFormat(),
  405|      0|                                  program_usage_message);
  406|      0|      } else {
  407|      0|        auto filter_cb = [&substr](const absl::CommandLineFlag& flag) {
  408|      0|          if (absl::StrContains(flag.Name(), substr)) return true;
  409|      0|          if (absl::StrContains(flag.Filename(), substr)) return true;
  410|      0|          if (absl::StrContains(flag.Help(), substr)) return true;
  411|       |
  412|      0|          return false;
  413|      0|        };
  414|      0|        flags_internal::FlagsHelpImpl(
  415|      0|            out, filter_cb, HelpFormat::kHumanReadable, program_usage_message);
  416|      0|      }
  417|      0|      break;
  418|      0|    }
  419|      0|    case HelpMode::kVersion:
  ------------------
  |  Branch (419:5): [True: 0, False: 2]
  ------------------
  420|      0|      if (flags_internal::GetUsageConfig().version_string)
  ------------------
  |  Branch (420:11): [True: 0, False: 0]
  ------------------
  421|      0|        out << flags_internal::GetUsageConfig().version_string();
  422|       |      // Unlike help, we may be asking for version in a script, so return 0
  423|      0|      break;
  424|       |
  425|      0|    case HelpMode::kOnlyCheckArgs:
  ------------------
  |  Branch (425:5): [True: 0, False: 2]
  ------------------
  426|      0|      break;
  427|      2|  }
  428|       |
  429|      2|  return GetFlagsHelpMode();
  430|      2|}
_ZN4absl12lts_2024011614flags_internal16GetFlagsHelpModeEv:
  459|      4|HelpMode GetFlagsHelpMode() {
  460|      4|  absl::MutexLock l(&help_attributes_guard);
  461|      4|  return help_mode;
  462|      4|}
_ZN4absl12lts_2024011614flags_internal9MaybeExitENS1_8HelpModeE:
  539|      2|void MaybeExit(HelpMode mode) {
  540|      2|  switch (mode) {
  541|      2|    case flags_internal::HelpMode::kNone:
  ------------------
  |  Branch (541:5): [True: 2, False: 0]
  ------------------
  542|      2|      return;
  543|      0|    case flags_internal::HelpMode::kOnlyCheckArgs:
  ------------------
  |  Branch (543:5): [True: 0, False: 2]
  ------------------
  544|      0|    case flags_internal::HelpMode::kVersion:
  ------------------
  |  Branch (544:5): [True: 0, False: 2]
  ------------------
  545|      0|      std::exit(0);
  546|      0|    default:  // For all the other modes we exit with 1
  ------------------
  |  Branch (546:5): [True: 0, False: 2]
  ------------------
  547|      0|      std::exit(1);
  548|      2|  }
  549|      2|}

_ZN4absl12lts_2024011614flags_internal13AbslParseFlagENSt3__117basic_string_viewIcNS2_11char_traitsIcEEEEPmPNS2_12basic_stringIcS5_NS2_9allocatorIcEEEE:
  119|      2|bool AbslParseFlag(absl::string_view text, unsigned long* dst, std::string*) {
  120|      2|  return ParseFlagImpl(text, *dst);
  121|      2|}
_ZN4absl12lts_2024011614flags_internal13AbslParseFlagENSt3__117basic_string_viewIcNS2_11char_traitsIcEEEEPNS2_12basic_stringIcS5_NS2_9allocatorIcEEEESB_:
  172|      2|bool AbslParseFlag(absl::string_view text, std::string* dst, std::string*) {
  173|      2|  dst->assign(text.data(), text.size());
  174|      2|  return true;
  175|      2|}
marshalling.cc:_ZN4absl12lts_2024011614flags_internalL11NumericBaseENSt3__117basic_string_viewIcNS2_11char_traitsIcEEEE:
   72|      2|static int NumericBase(absl::string_view text) {
   73|      2|  if (text.empty()) return 0;
  ------------------
  |  Branch (73:7): [True: 0, False: 2]
  ------------------
   74|      2|  size_t num_start = (text[0] == '-' || text[0] == '+') ? 1 : 0;
  ------------------
  |  Branch (74:23): [True: 0, False: 2]
  |  Branch (74:41): [True: 0, False: 2]
  ------------------
   75|      2|  const bool hex = (text.size() >= num_start + 2 && text[num_start] == '0' &&
  ------------------
  |  Branch (75:21): [True: 2, False: 0]
  |  Branch (75:53): [True: 0, False: 2]
  ------------------
   76|      0|                    (text[num_start + 1] == 'x' || text[num_start + 1] == 'X'));
  ------------------
  |  Branch (76:22): [True: 0, False: 0]
  |  Branch (76:52): [True: 0, False: 0]
  ------------------
   77|      2|  return hex ? 16 : 10;
  ------------------
  |  Branch (77:10): [True: 0, False: 2]
  ------------------
   78|      2|}
_ZN4absl12lts_2024011614flags_internal13ParseFlagImplImEEbNSt3__117basic_string_viewIcNS3_11char_traitsIcEEEERT_:
   81|      2|inline bool ParseFlagImpl(absl::string_view text, IntType& dst) {
   82|      2|  text = absl::StripAsciiWhitespace(text);
   83|       |
   84|      2|  return absl::numbers_internal::safe_strtoi_base(text, &dst,
   85|      2|                                                  NumericBase(text));
   86|      2|}

_ZN4absl12lts_202401169ParseFlagINSt3__112basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEEEbNS2_17basic_string_viewIcS5_EEPT_PS8_:
  333|      2|inline bool ParseFlag(absl::string_view input, T* dst, std::string* error) {
  334|      2|  return flags_internal::InvokeParseFlag(input, dst, error);
  335|      2|}
_ZN4absl12lts_2024011614flags_internal15InvokeParseFlagINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEEEbNS3_17basic_string_viewIcS6_EEPT_PS9_:
  275|      2|bool InvokeParseFlag(absl::string_view input, T* dst, std::string* err) {
  276|       |  // Comment on next line provides a good compiler error message if T
  277|       |  // does not have AbslParseFlag(absl::string_view, T*, std::string*).
  278|      2|  return AbslParseFlag(input, dst, err);  // Is T missing AbslParseFlag?
  279|      2|}
_ZN4absl12lts_202401169ParseFlagImEEbNSt3__117basic_string_viewIcNS2_11char_traitsIcEEEEPT_PNS2_12basic_stringIcS5_NS2_9allocatorIcEEEE:
  333|      2|inline bool ParseFlag(absl::string_view input, T* dst, std::string* error) {
  334|      2|  return flags_internal::InvokeParseFlag(input, dst, error);
  335|      2|}
_ZN4absl12lts_2024011614flags_internal15InvokeParseFlagImEEbNSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPT_PNS3_12basic_stringIcS6_NS3_9allocatorIcEEEE:
  275|      2|bool InvokeParseFlag(absl::string_view input, T* dst, std::string* err) {
  276|       |  // Comment on next line provides a good compiler error message if T
  277|       |  // does not have AbslParseFlag(absl::string_view, T*, std::string*).
  278|      2|  return AbslParseFlag(input, dst, err);  // Is T missing AbslParseFlag?
  279|      2|}

_ZN4absl12lts_2024011614flags_internal24ParseAbseilFlagsOnlyImplEiPPcRNSt3__16vectorIS2_NS4_9allocatorIS2_EEEERNS5_INS0_16UnrecognizedFlagENS6_ISA_EEEENS1_16UsageFlagsActionE:
  744|      2|    UsageFlagsAction usage_flag_action) {
  745|      2|  ABSL_INTERNAL_CHECK(argc > 0, "Missing argv[0]");
  ------------------
  |  |   85|      2|  do {                                                             \
  |  |   86|      2|    if (ABSL_PREDICT_FALSE(!(condition))) {                        \
  |  |  ------------------
  |  |  |  |  178|      2|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (178:31): [True: 0, False: 2]
  |  |  |  |  |  Branch (178:49): [Folded, False: 2]
  |  |  |  |  |  Branch (178:58): [True: 0, False: 2]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   87|      0|      std::string death_message = "Check " #condition " failed: "; \
  |  |   88|      0|      death_message += std::string(message);                       \
  |  |   89|      0|      ABSL_INTERNAL_LOG(FATAL, death_message);                     \
  |  |  ------------------
  |  |  |  |   76|      0|  do {                                                                    \
  |  |  |  |   77|      0|    constexpr const char* absl_raw_log_internal_filename = __FILE__;      \
  |  |  |  |   78|      0|    ::absl::raw_log_internal::internal_log_function(                      \
  |  |  |  |   79|      0|        ABSL_RAW_LOG_INTERNAL_##severity, absl_raw_log_internal_filename, \
  |  |  |  |  ------------------
  |  |  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  |  |  ------------------
  |  |  |  |   80|      0|        __LINE__, message);                                               \
  |  |  |  |   81|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                   \
  |  |  |  |  ------------------
  |  |  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   82|      0|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (82:12): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   90|      0|    }                                                              \
  |  |   91|      2|  } while (0)
  |  |  ------------------
  |  |  |  Branch (91:12): [Folded, False: 2]
  |  |  ------------------
  ------------------
  746|       |
  747|      2|  using flags_internal::ArgsList;
  748|      2|  using flags_internal::specified_flags;
  749|       |
  750|      2|  std::vector<std::string> flagfile_value;
  751|      2|  std::vector<ArgsList> input_args;
  752|       |
  753|       |  // Once parsing has started we will not allow more flag registrations.
  754|      2|  flags_internal::FinalizeRegistry();
  755|       |
  756|       |  // This routine does not return anything since we abort on failure.
  757|      2|  flags_internal::CheckDefaultValuesParsingRoundtrip();
  758|       |
  759|      2|  input_args.push_back(ArgsList(argc, argv));
  760|       |
  761|       |  // Set program invocation name if it is not set before.
  762|      2|  if (flags_internal::ProgramInvocationName() == "UNKNOWN") {
  ------------------
  |  Branch (762:7): [True: 2, False: 0]
  ------------------
  763|      2|    flags_internal::SetProgramInvocationName(argv[0]);
  764|      2|  }
  765|      2|  positional_args.push_back(argv[0]);
  766|       |
  767|      2|  absl::MutexLock l(&flags_internal::specified_flags_guard);
  768|      2|  if (specified_flags == nullptr) {
  ------------------
  |  Branch (768:7): [True: 2, False: 0]
  ------------------
  769|      2|    specified_flags = new std::vector<const CommandLineFlag*>;
  770|      2|  } else {
  771|      0|    specified_flags->clear();
  772|      0|  }
  773|       |
  774|       |  // Iterate through the list of the input arguments. First level are
  775|       |  // arguments originated from argc/argv. Following levels are arguments
  776|       |  // originated from recursive parsing of flagfile(s).
  777|      2|  bool success = true;
  778|      6|  while (!input_args.empty()) {
  ------------------
  |  Branch (778:10): [True: 6, False: 0]
  ------------------
  779|       |    // First we process the built-in generator flags.
  780|      6|    success &= flags_internal::HandleGeneratorFlags(input_args, flagfile_value);
  781|       |
  782|       |    // Select top-most (most recent) arguments list. If it is empty drop it
  783|       |    // and re-try.
  784|      6|    ArgsList& curr_list = input_args.back();
  785|       |
  786|       |    // Every ArgsList starts with real or fake program name, so we can always
  787|       |    // start by skipping it.
  788|      6|    curr_list.PopFront();
  789|       |
  790|      6|    if (curr_list.Size() == 0) {
  ------------------
  |  Branch (790:9): [True: 0, False: 6]
  ------------------
  791|      0|      input_args.pop_back();
  792|      0|      continue;
  793|      0|    }
  794|       |
  795|       |    // Handle the next argument in the current list. If the stack of argument
  796|       |    // lists contains only one element - we are processing an argument from
  797|       |    // the original argv.
  798|      6|    absl::string_view arg(curr_list.Front());
  799|      6|    bool arg_from_argv = input_args.size() == 1;
  800|       |
  801|       |    // If argument does not start with '-' or is just "-" - this is
  802|       |    // positional argument.
  803|      6|    if (!absl::ConsumePrefix(&arg, "-") || arg.empty()) {
  ------------------
  |  Branch (803:9): [True: 0, False: 6]
  |  Branch (803:44): [True: 0, False: 6]
  ------------------
  804|      0|      ABSL_INTERNAL_CHECK(arg_from_argv,
  ------------------
  |  |   85|      0|  do {                                                             \
  |  |   86|      0|    if (ABSL_PREDICT_FALSE(!(condition))) {                        \
  |  |  ------------------
  |  |  |  |  178|      0|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (178:31): [True: 0, False: 0]
  |  |  |  |  |  Branch (178:49): [Folded, False: 0]
  |  |  |  |  |  Branch (178:58): [True: 0, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   87|      0|      std::string death_message = "Check " #condition " failed: "; \
  |  |   88|      0|      death_message += std::string(message);                       \
  |  |   89|      0|      ABSL_INTERNAL_LOG(FATAL, death_message);                     \
  |  |  ------------------
  |  |  |  |   76|      0|  do {                                                                    \
  |  |  |  |   77|      0|    constexpr const char* absl_raw_log_internal_filename = __FILE__;      \
  |  |  |  |   78|      0|    ::absl::raw_log_internal::internal_log_function(                      \
  |  |  |  |   79|      0|        ABSL_RAW_LOG_INTERNAL_##severity, absl_raw_log_internal_filename, \
  |  |  |  |  ------------------
  |  |  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  |  |  ------------------
  |  |  |  |   80|      0|        __LINE__, message);                                               \
  |  |  |  |   81|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                   \
  |  |  |  |  ------------------
  |  |  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   82|      0|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (82:12): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   90|      0|    }                                                              \
  |  |   91|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (91:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
  805|      0|                          "Flagfile cannot contain positional argument");
  806|       |
  807|      0|      positional_args.push_back(argv[curr_list.FrontIndex()]);
  808|      0|      continue;
  809|      0|    }
  810|       |
  811|       |    // Split the current argument on '=' to deduce the argument flag name and
  812|       |    // value. If flag name is empty it means we've got an "--" argument. Value
  813|       |    // can be empty either if there were no '=' in argument string at all or
  814|       |    // an argument looked like "--foo=". In a latter case is_empty_value is
  815|       |    // true.
  816|      6|    absl::string_view flag_name;
  817|      6|    absl::string_view value;
  818|      6|    bool is_empty_value = false;
  819|       |
  820|      6|    std::tie(flag_name, value, is_empty_value) =
  821|      6|        flags_internal::SplitNameAndValue(arg);
  822|       |
  823|       |    // Standalone "--" argument indicates that the rest of the arguments are
  824|       |    // positional. We do not support positional arguments in flagfiles.
  825|      6|    if (flag_name.empty()) {
  ------------------
  |  Branch (825:9): [True: 2, False: 4]
  ------------------
  826|      2|      ABSL_INTERNAL_CHECK(arg_from_argv,
  ------------------
  |  |   85|      2|  do {                                                             \
  |  |   86|      2|    if (ABSL_PREDICT_FALSE(!(condition))) {                        \
  |  |  ------------------
  |  |  |  |  178|      2|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (178:31): [True: 0, False: 2]
  |  |  |  |  |  Branch (178:49): [Folded, False: 2]
  |  |  |  |  |  Branch (178:58): [True: 0, False: 2]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   87|      0|      std::string death_message = "Check " #condition " failed: "; \
  |  |   88|      0|      death_message += std::string(message);                       \
  |  |   89|      0|      ABSL_INTERNAL_LOG(FATAL, death_message);                     \
  |  |  ------------------
  |  |  |  |   76|      0|  do {                                                                    \
  |  |  |  |   77|      0|    constexpr const char* absl_raw_log_internal_filename = __FILE__;      \
  |  |  |  |   78|      0|    ::absl::raw_log_internal::internal_log_function(                      \
  |  |  |  |   79|      0|        ABSL_RAW_LOG_INTERNAL_##severity, absl_raw_log_internal_filename, \
  |  |  |  |  ------------------
  |  |  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  |  |  ------------------
  |  |  |  |   80|      0|        __LINE__, message);                                               \
  |  |  |  |   81|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                   \
  |  |  |  |  ------------------
  |  |  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   82|      0|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (82:12): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   90|      0|    }                                                              \
  |  |   91|      2|  } while (0)
  |  |  ------------------
  |  |  |  Branch (91:12): [Folded, False: 2]
  |  |  ------------------
  ------------------
  827|      2|                          "Flagfile cannot contain positional argument");
  828|       |
  829|      2|      curr_list.PopFront();
  830|      2|      break;
  831|      2|    }
  832|       |
  833|       |    // Locate the flag based on flag name. Handle both --foo and --nofoo.
  834|      4|    CommandLineFlag* flag = nullptr;
  835|      4|    bool is_negative = false;
  836|      4|    std::tie(flag, is_negative) = flags_internal::LocateFlag(flag_name);
  837|       |
  838|      4|    if (flag == nullptr) {
  ------------------
  |  Branch (838:9): [True: 0, False: 4]
  ------------------
  839|       |      // Usage flags are not modeled as Abseil flags. Locate them separately.
  840|      0|      if (flags_internal::DeduceUsageFlags(flag_name, value)) {
  ------------------
  |  Branch (840:11): [True: 0, False: 0]
  ------------------
  841|      0|        continue;
  842|      0|      }
  843|      0|      unrecognized_flags.emplace_back(arg_from_argv
  ------------------
  |  Branch (843:39): [True: 0, False: 0]
  ------------------
  844|      0|                                          ? UnrecognizedFlag::kFromArgv
  845|      0|                                          : UnrecognizedFlag::kFromFlagfile,
  846|      0|                                      flag_name);
  847|      0|      continue;
  848|      0|    }
  849|       |
  850|       |    // Deduce flag's value (from this or next argument).
  851|      4|    bool value_success = true;
  852|      4|    std::tie(value_success, value) = flags_internal::DeduceFlagValue(
  853|      4|        *flag, value, is_negative, is_empty_value, &curr_list);
  854|      4|    success &= value_success;
  855|       |
  856|       |    // Set the located flag to a new value, unless it is retired. Setting
  857|       |    // retired flag fails, but we ignoring it here while also reporting access
  858|       |    // to retired flag.
  859|      4|    std::string error;
  860|      4|    if (!flags_internal::PrivateHandleAccessor::ParseFrom(
  ------------------
  |  Branch (860:9): [True: 0, False: 4]
  ------------------
  861|      4|            *flag, value, flags_internal::SET_FLAGS_VALUE,
  862|      4|            flags_internal::kCommandLine, error)) {
  863|      0|      if (flag->IsRetired()) continue;
  ------------------
  |  Branch (863:11): [True: 0, False: 0]
  ------------------
  864|       |
  865|      0|      flags_internal::ReportUsageError(error, true);
  866|      0|      success = false;
  867|      4|    } else {
  868|      4|      specified_flags->push_back(flag);
  869|      4|    }
  870|      4|  }
  871|       |
  872|      2|  flags_internal::ResetGeneratorFlags(flagfile_value);
  873|       |
  874|       |  // All the remaining arguments are positional.
  875|      2|  if (!input_args.empty()) {
  ------------------
  |  Branch (875:7): [True: 2, False: 0]
  ------------------
  876|      2|    for (size_t arg_index = input_args.back().FrontIndex();
  877|     11|         arg_index < static_cast<size_t>(argc); ++arg_index) {
  ------------------
  |  Branch (877:10): [True: 9, False: 2]
  ------------------
  878|      9|      positional_args.push_back(argv[arg_index]);
  879|      9|    }
  880|      2|  }
  881|       |
  882|       |  // Trim and sort the vector.
  883|      2|  specified_flags->shrink_to_fit();
  884|      2|  std::sort(specified_flags->begin(), specified_flags->end(),
  885|      2|            flags_internal::SpecifiedFlagsCompare{});
  886|       |
  887|       |  // Filter out unrecognized flags, which are ok to ignore.
  888|      2|  std::vector<UnrecognizedFlag> filtered;
  889|      2|  filtered.reserve(unrecognized_flags.size());
  890|      2|  for (const auto& unrecognized : unrecognized_flags) {
  ------------------
  |  Branch (890:33): [True: 0, False: 2]
  ------------------
  891|      0|    if (flags_internal::CanIgnoreUndefinedFlag(unrecognized.flag_name))
  ------------------
  |  Branch (891:9): [True: 0, False: 0]
  ------------------
  892|      0|      continue;
  893|      0|    filtered.push_back(unrecognized);
  894|      0|  }
  895|       |
  896|      2|  std::swap(unrecognized_flags, filtered);
  897|       |
  898|      2|  if (!success) {
  ------------------
  |  Branch (898:7): [True: 0, False: 2]
  ------------------
  899|       |#if ABSL_FLAGS_STRIP_NAMES
  900|       |    flags_internal::ReportUsageError(
  901|       |        "NOTE: command line flags are disabled in this build", true);
  902|       |#else
  903|      0|    flags_internal::HandleUsageFlags(std::cerr, ProgramUsageMessage());
  904|      0|#endif
  905|      0|    return HelpMode::kFull;  // We just need to make sure the exit with
  906|       |                             // code 1.
  907|      0|  }
  908|       |
  909|      2|  return usage_flag_action == UsageFlagsAction::kHandleUsage
  ------------------
  |  Branch (909:10): [True: 2, False: 0]
  ------------------
  910|      2|             ? flags_internal::HandleUsageFlags(std::cout,
  911|      2|                                                ProgramUsageMessage())
  912|      2|             : HelpMode::kNone;
  913|      2|}
_ZN4absl12lts_2024011620ParseAbseilFlagsOnlyEiPPcRNSt3__16vectorIS1_NS3_9allocatorIS1_EEEERNS4_INS0_16UnrecognizedFlagENS5_IS9_EEEE:
  919|      2|                          std::vector<UnrecognizedFlag>& unrecognized_flags) {
  920|      2|  auto help_mode = flags_internal::ParseAbseilFlagsOnlyImpl(
  921|      2|      argc, argv, positional_args, unrecognized_flags,
  922|      2|      flags_internal::UsageFlagsAction::kHandleUsage);
  923|       |
  924|      2|  flags_internal::MaybeExit(help_mode);
  925|      2|}
parse.cc:_ZNK3$_0clEv:
  106|      2|    .OnUpdate([]() {
  107|      2|      if (absl::GetFlag(FLAGS_flagfile).empty()) return;
  ------------------
  |  Branch (107:11): [True: 2, False: 0]
  ------------------
  108|       |
  109|      0|      absl::MutexLock l(&absl::flags_internal::processing_checks_guard);
  110|       |
  111|       |      // Setting this flag twice before it is handled most likely an internal
  112|       |      // error and should be reviewed by developers.
  113|      0|      if (absl::flags_internal::flagfile_needs_processing) {
  ------------------
  |  Branch (113:11): [True: 0, False: 0]
  ------------------
  114|      0|        ABSL_INTERNAL_LOG(WARNING, "flagfile set twice before it is handled");
  ------------------
  |  |   76|      0|  do {                                                                    \
  |  |   77|      0|    constexpr const char* absl_raw_log_internal_filename = __FILE__;      \
  |  |   78|      0|    ::absl::raw_log_internal::internal_log_function(                      \
  |  |   79|      0|        ABSL_RAW_LOG_INTERNAL_##severity, absl_raw_log_internal_filename, \
  |  |  ------------------
  |  |  |  |  108|      0|#define ABSL_RAW_LOG_INTERNAL_WARNING ::absl::LogSeverity::kWarning
  |  |  ------------------
  |  |   80|      0|        __LINE__, message);                                               \
  |  |   81|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                   \
  |  |   82|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (82:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
  115|      0|      }
  116|       |
  117|      0|      absl::flags_internal::flagfile_needs_processing = true;
  118|      0|    });
parse.cc:_ZNK3$_1clEv:
  122|      2|    .OnUpdate([]() {
  123|      2|      if (absl::GetFlag(FLAGS_fromenv).empty()) return;
  ------------------
  |  Branch (123:11): [True: 2, False: 0]
  ------------------
  124|       |
  125|      0|      absl::MutexLock l(&absl::flags_internal::processing_checks_guard);
  126|       |
  127|       |      // Setting this flag twice before it is handled most likely an internal
  128|       |      // error and should be reviewed by developers.
  129|      0|      if (absl::flags_internal::fromenv_needs_processing) {
  ------------------
  |  Branch (129:11): [True: 0, False: 0]
  ------------------
  130|      0|        ABSL_INTERNAL_LOG(WARNING, "fromenv set twice before it is handled.");
  ------------------
  |  |   76|      0|  do {                                                                    \
  |  |   77|      0|    constexpr const char* absl_raw_log_internal_filename = __FILE__;      \
  |  |   78|      0|    ::absl::raw_log_internal::internal_log_function(                      \
  |  |   79|      0|        ABSL_RAW_LOG_INTERNAL_##severity, absl_raw_log_internal_filename, \
  |  |  ------------------
  |  |  |  |  108|      0|#define ABSL_RAW_LOG_INTERNAL_WARNING ::absl::LogSeverity::kWarning
  |  |  ------------------
  |  |   80|      0|        __LINE__, message);                                               \
  |  |   81|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                   \
  |  |   82|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (82:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
  131|      0|      }
  132|       |
  133|      0|      absl::flags_internal::fromenv_needs_processing = true;
  134|      0|    });
parse.cc:_ZNK3$_2clEv:
  138|      2|    .OnUpdate([]() {
  139|      2|      if (absl::GetFlag(FLAGS_tryfromenv).empty()) return;
  ------------------
  |  Branch (139:11): [True: 2, False: 0]
  ------------------
  140|       |
  141|      0|      absl::MutexLock l(&absl::flags_internal::processing_checks_guard);
  142|       |
  143|       |      // Setting this flag twice before it is handled most likely an internal
  144|       |      // error and should be reviewed by developers.
  145|      0|      if (absl::flags_internal::tryfromenv_needs_processing) {
  ------------------
  |  Branch (145:11): [True: 0, False: 0]
  ------------------
  146|      0|        ABSL_INTERNAL_LOG(WARNING,
  ------------------
  |  |   76|      0|  do {                                                                    \
  |  |   77|      0|    constexpr const char* absl_raw_log_internal_filename = __FILE__;      \
  |  |   78|      0|    ::absl::raw_log_internal::internal_log_function(                      \
  |  |   79|      0|        ABSL_RAW_LOG_INTERNAL_##severity, absl_raw_log_internal_filename, \
  |  |  ------------------
  |  |  |  |  108|      0|#define ABSL_RAW_LOG_INTERNAL_WARNING ::absl::LogSeverity::kWarning
  |  |  ------------------
  |  |   80|      0|        __LINE__, message);                                               \
  |  |   81|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                   \
  |  |   82|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (82:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
  147|      0|                          "tryfromenv set twice before it is handled.");
  148|      0|      }
  149|       |
  150|      0|      absl::flags_internal::tryfromenv_needs_processing = true;
  151|      0|    });
parse.cc:_ZN4absl12lts_2024011614flags_internal12_GLOBAL__N_134CheckDefaultValuesParsingRoundtripEv:
  322|      2|void CheckDefaultValuesParsingRoundtrip() {
  323|       |#ifndef NDEBUG
  324|       |  flags_internal::ForEachFlag([&](CommandLineFlag& flag) {
  325|       |    if (flag.IsRetired()) return;
  326|       |
  327|       |#define ABSL_FLAGS_INTERNAL_IGNORE_TYPE(T, _) \
  328|       |  if (flag.IsOfType<T>()) return;
  329|       |
  330|       |    ABSL_FLAGS_INTERNAL_SUPPORTED_TYPES(ABSL_FLAGS_INTERNAL_IGNORE_TYPE)
  331|       |#undef ABSL_FLAGS_INTERNAL_IGNORE_TYPE
  332|       |
  333|       |    flags_internal::PrivateHandleAccessor::CheckDefaultValueParsingRoundtrip(
  334|       |        flag);
  335|       |  });
  336|       |#endif
  337|      2|}
parse.cc:_ZN4absl12lts_2024011614flags_internal12_GLOBAL__N_18ArgsListC2EiPPc:
  169|      2|  ArgsList(int argc, char* argv[]) : args_(argv, argv + argc), next_arg_(0) {}
parse.cc:_ZN4absl12lts_2024011614flags_internal12_GLOBAL__N_120HandleGeneratorFlagsERNSt3__16vectorINS2_8ArgsListENS3_9allocatorIS5_EEEERNS4_INS3_12basic_stringIcNS3_11char_traitsIcEENS6_IcEEEENS6_ISE_EEEE:
  415|      6|                          std::vector<std::string>& flagfile_value) {
  416|      6|  bool success = true;
  417|       |
  418|      6|  absl::MutexLock l(&flags_internal::processing_checks_guard);
  419|       |
  420|       |  // flagfile could have been set either on a command line or
  421|       |  // programmatically before invoking ParseCommandLine. Note that we do not
  422|       |  // actually process arguments specified in the flagfile, but instead
  423|       |  // create a secondary arguments list to be processed along with the rest
  424|       |  // of the command line arguments. Since we always the process most recently
  425|       |  // created list of arguments first, this will result in flagfile argument
  426|       |  // being processed before any other argument in the command line. If
  427|       |  // FLAGS_flagfile contains more than one file name we create multiple new
  428|       |  // levels of arguments in a reverse order of file names. Thus we always
  429|       |  // process arguments from first file before arguments containing in a
  430|       |  // second file, etc. If flagfile contains another
  431|       |  // --flagfile inside of it, it will produce new level of arguments and
  432|       |  // processed before the rest of the flagfile. We are also collecting all
  433|       |  // flagfiles set on original command line. Unlike the rest of the flags,
  434|       |  // this flag can be set multiple times and is expected to be handled
  435|       |  // multiple times. We are collecting them all into a single list and set
  436|       |  // the value of FLAGS_flagfile to that value at the end of the parsing.
  437|      6|  if (flags_internal::flagfile_needs_processing) {
  ------------------
  |  Branch (437:7): [True: 0, False: 6]
  ------------------
  438|      0|    auto flagfiles = absl::GetFlag(FLAGS_flagfile);
  439|       |
  440|      0|    if (input_args.size() == 1) {
  ------------------
  |  Branch (440:9): [True: 0, False: 0]
  ------------------
  441|      0|      flagfile_value.insert(flagfile_value.end(), flagfiles.begin(),
  442|      0|                            flagfiles.end());
  443|      0|    }
  444|       |
  445|      0|    success &= ReadFlagfiles(flagfiles, input_args);
  446|       |
  447|      0|    flags_internal::flagfile_needs_processing = false;
  448|      0|  }
  449|       |
  450|       |  // Similar to flagfile fromenv/tryfromemv can be set both
  451|       |  // programmatically and at runtime on a command line. Unlike flagfile these
  452|       |  // can't be recursive.
  453|      6|  if (flags_internal::fromenv_needs_processing) {
  ------------------
  |  Branch (453:7): [True: 0, False: 6]
  ------------------
  454|      0|    auto flags_list = absl::GetFlag(FLAGS_fromenv);
  455|       |
  456|      0|    success &= ReadFlagsFromEnv(flags_list, input_args, true);
  457|       |
  458|      0|    flags_internal::fromenv_needs_processing = false;
  459|      0|  }
  460|       |
  461|      6|  if (flags_internal::tryfromenv_needs_processing) {
  ------------------
  |  Branch (461:7): [True: 0, False: 6]
  ------------------
  462|      0|    auto flags_list = absl::GetFlag(FLAGS_tryfromenv);
  463|       |
  464|      0|    success &= ReadFlagsFromEnv(flags_list, input_args, false);
  465|       |
  466|      0|    flags_internal::tryfromenv_needs_processing = false;
  467|      0|  }
  468|       |
  469|      6|  return success;
  470|      6|}
parse.cc:_ZN4absl12lts_2024011614flags_internal12_GLOBAL__N_18ArgsList8PopFrontEv:
  179|      8|  void PopFront() { next_arg_++; }
parse.cc:_ZNK4absl12lts_2024011614flags_internal12_GLOBAL__N_18ArgsList4SizeEv:
  176|      6|  size_t Size() const { return args_.size() - next_arg_; }
parse.cc:_ZNK4absl12lts_2024011614flags_internal12_GLOBAL__N_18ArgsList5FrontEv:
  178|      6|  absl::string_view Front() const { return args_[next_arg_]; }
parse.cc:_ZNK4absl12lts_2024011614flags_internal12_GLOBAL__N_18ArgsList10FrontIndexEv:
  177|      2|  size_t FrontIndex() const { return next_arg_; }
parse.cc:_ZN4absl12lts_2024011614flags_internal12_GLOBAL__N_117SplitNameAndValueENSt3__117basic_string_viewIcNS3_11char_traitsIcEEEE:
  278|      6|    absl::string_view arg) {
  279|       |  // Allow -foo and --foo
  280|      6|  absl::ConsumePrefix(&arg, "-");
  281|       |
  282|      6|  if (arg.empty()) {
  ------------------
  |  Branch (282:7): [True: 2, False: 4]
  ------------------
  283|      2|    return std::make_tuple("", "", false);
  284|      2|  }
  285|       |
  286|      4|  auto equal_sign_pos = arg.find('=');
  287|       |
  288|      4|  absl::string_view flag_name = arg.substr(0, equal_sign_pos);
  289|       |
  290|      4|  absl::string_view value;
  291|      4|  bool is_empty_value = false;
  292|       |
  293|      4|  if (equal_sign_pos != absl::string_view::npos) {
  ------------------
  |  Branch (293:7): [True: 4, False: 0]
  ------------------
  294|      4|    value = arg.substr(equal_sign_pos + 1);
  295|      4|    is_empty_value = value.empty();
  296|      4|  }
  297|       |
  298|      4|  return std::make_tuple(flag_name, value, is_empty_value);
  299|      6|}
parse.cc:_ZN4absl12lts_2024011614flags_internal12_GLOBAL__N_110LocateFlagENSt3__117basic_string_viewIcNS3_11char_traitsIcEEEE:
  306|      4|std::tuple<CommandLineFlag*, bool> LocateFlag(absl::string_view flag_name) {
  307|      4|  CommandLineFlag* flag = absl::FindCommandLineFlag(flag_name);
  308|      4|  bool is_negative = false;
  309|       |
  310|      4|  if (!flag && absl::ConsumePrefix(&flag_name, "no")) {
  ------------------
  |  Branch (310:7): [True: 0, False: 4]
  |  Branch (310:16): [True: 0, False: 0]
  ------------------
  311|      0|    flag = absl::FindCommandLineFlag(flag_name);
  312|      0|    is_negative = true;
  313|      0|  }
  314|       |
  315|      4|  return std::make_tuple(flag, is_negative);
  316|      4|}
parse.cc:_ZN4absl12lts_2024011614flags_internal12_GLOBAL__N_115DeduceFlagValueERKNS0_15CommandLineFlagENSt3__117basic_string_viewIcNS6_11char_traitsIcEEEEbbPNS2_8ArgsListE:
  509|      4|                                                    ArgsList* curr_list) {
  510|       |  // Value is either an argument suffix after `=` in "--foo=<value>"
  511|       |  // or separate argument in case of "--foo" "<value>".
  512|       |
  513|       |  // boolean flags have these forms:
  514|       |  //   --foo
  515|       |  //   --nofoo
  516|       |  //   --foo=true
  517|       |  //   --foo=false
  518|       |  //   --nofoo=<value> is not supported
  519|       |  //   --foo <value> is not supported
  520|       |
  521|       |  // non boolean flags have these forms:
  522|       |  // --foo=<value>
  523|       |  // --foo <value>
  524|       |  // --nofoo is not supported
  525|       |
  526|      4|  if (flag.IsOfType<bool>()) {
  ------------------
  |  Branch (526:7): [True: 0, False: 4]
  ------------------
  527|      0|    if (value.empty()) {
  ------------------
  |  Branch (527:9): [True: 0, False: 0]
  ------------------
  528|      0|      if (is_empty_value) {
  ------------------
  |  Branch (528:11): [True: 0, False: 0]
  ------------------
  529|       |        // "--bool_flag=" case
  530|      0|        flags_internal::ReportUsageError(
  531|      0|            absl::StrCat(
  532|      0|                "Missing the value after assignment for the boolean flag '",
  533|      0|                flag.Name(), "'"),
  534|      0|            true);
  535|      0|        return std::make_tuple(false, "");
  536|      0|      }
  537|       |
  538|       |      // "--bool_flag" case
  539|      0|      value = is_negative ? "0" : "1";
  ------------------
  |  Branch (539:15): [True: 0, False: 0]
  ------------------
  540|      0|    } else if (is_negative) {
  ------------------
  |  Branch (540:16): [True: 0, False: 0]
  ------------------
  541|       |      // "--nobool_flag=Y" case
  542|      0|      flags_internal::ReportUsageError(
  543|      0|          absl::StrCat("Negative form with assignment is not valid for the "
  544|      0|                       "boolean flag '",
  545|      0|                       flag.Name(), "'"),
  546|      0|          true);
  547|      0|      return std::make_tuple(false, "");
  548|      0|    }
  549|      4|  } else if (is_negative) {
  ------------------
  |  Branch (549:14): [True: 0, False: 4]
  ------------------
  550|       |    // "--noint_flag=1" case
  551|      0|    flags_internal::ReportUsageError(
  552|      0|        absl::StrCat("Negative form is not valid for the flag '", flag.Name(),
  553|      0|                     "'"),
  554|      0|        true);
  555|      0|    return std::make_tuple(false, "");
  556|      4|  } else if (value.empty() && (!is_empty_value)) {
  ------------------
  |  Branch (556:14): [True: 0, False: 4]
  |  Branch (556:31): [True: 0, False: 0]
  ------------------
  557|      0|    if (curr_list->Size() == 1) {
  ------------------
  |  Branch (557:9): [True: 0, False: 0]
  ------------------
  558|       |      // "--int_flag" case
  559|      0|      flags_internal::ReportUsageError(
  560|      0|          absl::StrCat("Missing the value for the flag '", flag.Name(), "'"),
  561|      0|          true);
  562|      0|      return std::make_tuple(false, "");
  563|      0|    }
  564|       |
  565|       |    // "--int_flag" "10" case
  566|      0|    curr_list->PopFront();
  567|      0|    value = curr_list->Front();
  568|       |
  569|       |    // Heuristic to detect the case where someone treats a string arg
  570|       |    // like a bool or just forgets to pass a value:
  571|       |    // --my_string_var --foo=bar
  572|       |    // We look for a flag of string type, whose value begins with a
  573|       |    // dash and corresponds to known flag or standalone --.
  574|      0|    if (!value.empty() && value[0] == '-' && flag.IsOfType<std::string>()) {
  ------------------
  |  Branch (574:9): [True: 0, False: 0]
  |  Branch (574:27): [True: 0, False: 0]
  |  Branch (574:46): [True: 0, False: 0]
  ------------------
  575|      0|      auto maybe_flag_name = std::get<0>(SplitNameAndValue(value.substr(1)));
  576|       |
  577|      0|      if (maybe_flag_name.empty() ||
  ------------------
  |  Branch (577:11): [True: 0, False: 0]
  |  Branch (577:11): [True: 0, False: 0]
  ------------------
  578|      0|          std::get<0>(LocateFlag(maybe_flag_name)) != nullptr) {
  ------------------
  |  Branch (578:11): [True: 0, False: 0]
  ------------------
  579|       |        // "--string_flag" "--known_flag" case
  580|      0|        ABSL_INTERNAL_LOG(
  ------------------
  |  |   76|      0|  do {                                                                    \
  |  |   77|      0|    constexpr const char* absl_raw_log_internal_filename = __FILE__;      \
  |  |   78|      0|    ::absl::raw_log_internal::internal_log_function(                      \
  |  |   79|      0|        ABSL_RAW_LOG_INTERNAL_##severity, absl_raw_log_internal_filename, \
  |  |  ------------------
  |  |  |  |  108|      0|#define ABSL_RAW_LOG_INTERNAL_WARNING ::absl::LogSeverity::kWarning
  |  |  ------------------
  |  |   80|      0|        __LINE__, message);                                               \
  |  |   81|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                   \
  |  |   82|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (82:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
  581|      0|            WARNING,
  582|      0|            absl::StrCat("Did you really mean to set flag '", flag.Name(),
  583|      0|                         "' to the value '", value, "'?"));
  584|      0|      }
  585|      0|    }
  586|      0|  }
  587|       |
  588|      4|  return std::make_tuple(true, value);
  589|      4|}
parse.cc:_ZN4absl12lts_2024011614flags_internal12_GLOBAL__N_119ResetGeneratorFlagsERKNSt3__16vectorINS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS8_ISA_EEEE:
  474|      2|void ResetGeneratorFlags(const std::vector<std::string>& flagfile_value) {
  475|       |  // Setting flagfile to the value which collates all the values set on a
  476|       |  // command line and programmatically. So if command line looked like
  477|       |  // --flagfile=f1 --flagfile=f2 the final value of the FLAGS_flagfile flag is
  478|       |  // going to be {"f1", "f2"}
  479|      2|  if (!flagfile_value.empty()) {
  ------------------
  |  Branch (479:7): [True: 0, False: 2]
  ------------------
  480|      0|    absl::SetFlag(&FLAGS_flagfile, flagfile_value);
  481|      0|    absl::MutexLock l(&flags_internal::processing_checks_guard);
  482|      0|    flags_internal::flagfile_needs_processing = false;
  483|      0|  }
  484|       |
  485|       |  // fromenv/tryfromenv are set to <undefined> value.
  486|      2|  if (!absl::GetFlag(FLAGS_fromenv).empty()) {
  ------------------
  |  Branch (486:7): [True: 0, False: 2]
  ------------------
  487|      0|    absl::SetFlag(&FLAGS_fromenv, {});
  488|      0|  }
  489|      2|  if (!absl::GetFlag(FLAGS_tryfromenv).empty()) {
  ------------------
  |  Branch (489:7): [True: 0, False: 2]
  ------------------
  490|      0|    absl::SetFlag(&FLAGS_tryfromenv, {});
  491|      0|  }
  492|       |
  493|      2|  absl::MutexLock l(&flags_internal::processing_checks_guard);
  494|      2|  flags_internal::fromenv_needs_processing = false;
  495|      2|  flags_internal::tryfromenv_needs_processing = false;
  496|      2|}
parse.cc:_ZNK4absl12lts_2024011614flags_internal12_GLOBAL__N_121SpecifiedFlagsCompareclEPKNS0_15CommandLineFlagES6_:
   86|      2|  bool operator()(const CommandLineFlag* a, const CommandLineFlag* b) const {
   87|      2|    return a->Name() < b->Name();
   88|      2|  }

_ZN4absl12lts_2024011614flags_internal12FlagRegistry8FindFlagENSt3__117basic_string_viewIcNS3_11char_traitsIcEEEE:
   99|      4|CommandLineFlag* FlagRegistry::FindFlag(absl::string_view name) {
  100|      4|  if (finalized_flags_.load(std::memory_order_acquire)) {
  ------------------
  |  Branch (100:7): [True: 4, False: 0]
  ------------------
  101|       |    // We could save some gcus here if we make `Name()` be non-virtual.
  102|       |    // We could move the `const char*` name to the base class.
  103|      4|    auto it = std::partition_point(
  104|      4|        flat_flags_.begin(), flat_flags_.end(),
  105|      4|        [=](CommandLineFlag* f) { return f->Name() < name; });
  106|      4|    if (it != flat_flags_.end() && (*it)->Name() == name) return *it;
  ------------------
  |  Branch (106:9): [True: 4, False: 0]
  |  Branch (106:9): [True: 4, False: 0]
  |  Branch (106:36): [True: 4, False: 0]
  ------------------
  107|      4|  }
  108|       |
  109|      0|  FlagRegistryLock frl(*this);
  110|      0|  auto it = flags_.find(name);
  111|      0|  return it != flags_.end() ? it->second : nullptr;
  ------------------
  |  Branch (111:10): [True: 0, False: 0]
  ------------------
  112|      4|}
_ZN4absl12lts_2024011614flags_internal12FlagRegistry12RegisterFlagERNS0_15CommandLineFlagEPKc:
  114|     32|void FlagRegistry::RegisterFlag(CommandLineFlag& flag, const char* filename) {
  115|     32|  if (filename != nullptr &&
  ------------------
  |  Branch (115:7): [True: 32, False: 0]
  |  Branch (115:7): [True: 0, False: 32]
  ------------------
  116|     32|      flag.Filename() != GetUsageConfig().normalize_filename(filename)) {
  ------------------
  |  Branch (116:7): [True: 0, False: 32]
  ------------------
  117|      0|    flags_internal::ReportUsageError(
  118|      0|        absl::StrCat(
  119|      0|            "Inconsistency between flag object and registration for flag '",
  120|      0|            flag.Name(),
  121|      0|            "', likely due to duplicate flags or an ODR violation. Relevant "
  122|      0|            "files: ",
  123|      0|            flag.Filename(), " and ", filename),
  124|      0|        true);
  125|      0|    std::exit(1);
  126|      0|  }
  127|       |
  128|     32|  FlagRegistryLock registry_lock(*this);
  129|       |
  130|     32|  std::pair<FlagIterator, bool> ins =
  131|     32|      flags_.insert(FlagMap::value_type(flag.Name(), &flag));
  132|     32|  if (ins.second == false) {  // means the name was already in the map
  ------------------
  |  Branch (132:7): [True: 0, False: 32]
  ------------------
  133|      0|    CommandLineFlag& old_flag = *ins.first->second;
  134|      0|    if (flag.IsRetired() != old_flag.IsRetired()) {
  ------------------
  |  Branch (134:9): [True: 0, False: 0]
  ------------------
  135|       |      // All registrations must agree on the 'retired' flag.
  136|      0|      flags_internal::ReportUsageError(
  137|      0|          absl::StrCat(
  138|      0|              "Retired flag '", flag.Name(), "' was defined normally in file '",
  139|      0|              (flag.IsRetired() ? old_flag.Filename() : flag.Filename()), "'."),
  ------------------
  |  Branch (139:16): [True: 0, False: 0]
  ------------------
  140|      0|          true);
  141|      0|    } else if (flags_internal::PrivateHandleAccessor::TypeId(flag) !=
  ------------------
  |  Branch (141:16): [True: 0, False: 0]
  ------------------
  142|      0|               flags_internal::PrivateHandleAccessor::TypeId(old_flag)) {
  143|      0|      flags_internal::ReportUsageError(
  144|      0|          absl::StrCat("Flag '", flag.Name(),
  145|      0|                       "' was defined more than once but with "
  146|      0|                       "differing types. Defined in files '",
  147|      0|                       old_flag.Filename(), "' and '", flag.Filename(), "'."),
  148|      0|          true);
  149|      0|    } else if (old_flag.IsRetired()) {
  ------------------
  |  Branch (149:16): [True: 0, False: 0]
  ------------------
  150|      0|      return;
  151|      0|    } else if (old_flag.Filename() != flag.Filename()) {
  ------------------
  |  Branch (151:16): [True: 0, False: 0]
  ------------------
  152|      0|      flags_internal::ReportUsageError(
  153|      0|          absl::StrCat("Flag '", flag.Name(),
  154|      0|                       "' was defined more than once (in files '",
  155|      0|                       old_flag.Filename(), "' and '", flag.Filename(), "')."),
  156|      0|          true);
  157|      0|    } else {
  158|      0|      flags_internal::ReportUsageError(
  159|      0|          absl::StrCat(
  160|      0|              "Something is wrong with flag '", flag.Name(), "' in file '",
  161|      0|              flag.Filename(), "'. One possibility: file '", flag.Filename(),
  162|      0|              "' is being linked both statically and dynamically into this "
  163|      0|              "executable. e.g. some files listed as srcs to a test and also "
  164|      0|              "listed as srcs of some shared lib deps of the same test."),
  165|      0|          true);
  166|      0|    }
  167|       |    // All cases above are fatal, except for the retired flags.
  168|      0|    std::exit(1);
  169|      0|  }
  170|     32|}
_ZN4absl12lts_2024011614flags_internal12FlagRegistry14GlobalRegistryEv:
  172|     38|FlagRegistry& FlagRegistry::GlobalRegistry() {
  173|     38|  static absl::NoDestructor<FlagRegistry> global_registry;
  174|     38|  return *global_registry;
  175|     38|}
_ZN4absl12lts_2024011614flags_internal23RegisterCommandLineFlagERNS0_15CommandLineFlagEPKc:
  192|     32|bool RegisterCommandLineFlag(CommandLineFlag& flag, const char* filename) {
  193|     32|  FlagRegistry::GlobalRegistry().RegisterFlag(flag, filename);
  194|     32|  return true;
  195|     32|}
_ZN4absl12lts_2024011614flags_internal16FinalizeRegistryEv:
  197|      2|void FinalizeRegistry() {
  198|      2|  auto& registry = FlagRegistry::GlobalRegistry();
  199|      2|  FlagRegistryLock frl(registry);
  200|      2|  if (registry.finalized_flags_.load(std::memory_order_relaxed)) {
  ------------------
  |  Branch (200:7): [True: 0, False: 2]
  ------------------
  201|       |    // Was already finalized. Ignore the second time.
  202|      0|    return;
  203|      0|  }
  204|      2|  registry.flat_flags_.reserve(registry.flags_.size());
  205|     32|  for (const auto& f : registry.flags_) {
  ------------------
  |  Branch (205:22): [True: 32, False: 2]
  ------------------
  206|     32|    registry.flat_flags_.push_back(f.second);
  207|     32|  }
  208|      2|  std::sort(std::begin(registry.flat_flags_), std::end(registry.flat_flags_),
  209|      2|            [](const CommandLineFlag* lhs, const CommandLineFlag* rhs) {
  210|      2|              return lhs->Name() < rhs->Name();
  211|      2|            });
  212|      2|  registry.flags_.clear();
  213|      2|  registry.finalized_flags_.store(true, std::memory_order_release);
  214|      2|}
_ZN4absl12lts_2024011619FindCommandLineFlagENSt3__117basic_string_viewIcNS1_11char_traitsIcEEEE:
  337|      4|CommandLineFlag* FindCommandLineFlag(absl::string_view name) {
  338|      4|  if (name.empty()) return nullptr;
  ------------------
  |  Branch (338:7): [True: 0, False: 4]
  ------------------
  339|      4|  flags_internal::FlagRegistry& registry =
  340|      4|      flags_internal::FlagRegistry::GlobalRegistry();
  341|      4|  return registry.FindFlag(name);
  342|      4|}
reflection.cc:_ZN4absl12lts_2024011614flags_internal12_GLOBAL__N_116FlagRegistryLockC2ERNS1_12FlagRegistryE:
   90|     34|  explicit FlagRegistryLock(FlagRegistry& fr) : fr_(fr) { fr_.Lock(); }
_ZN4absl12lts_2024011614flags_internal12FlagRegistry4LockEv:
   56|     34|  void Lock() ABSL_EXCLUSIVE_LOCK_FUNCTION(lock_) { lock_.Lock(); }
reflection.cc:_ZN4absl12lts_2024011614flags_internal12_GLOBAL__N_116FlagRegistryLockD2Ev:
   91|     34|  ~FlagRegistryLock() { fr_.Unlock(); }
_ZN4absl12lts_2024011614flags_internal12FlagRegistry6UnlockEv:
   57|     34|  void Unlock() ABSL_UNLOCK_FUNCTION(lock_) { lock_.Unlock(); }
_ZN4absl12lts_2024011614flags_internal12FlagRegistryC2Ev:
   50|      2|  FlagRegistry() = default;
reflection.cc:_ZZN4absl12lts_2024011614flags_internal12FlagRegistry8FindFlagENSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEENK3$_0clEPNS0_15CommandLineFlagE:
  105|     16|        [=](CommandLineFlag* f) { return f->Name() < name; });
reflection.cc:_ZZN4absl12lts_2024011614flags_internal16FinalizeRegistryEvENK3$_0clEPKNS0_15CommandLineFlagES5_:
  209|    126|            [](const CommandLineFlag* lhs, const CommandLineFlag* rhs) {
  210|    126|              return lhs->Name() < rhs->Name();
  211|    126|            });

_ZN4absl12lts_2024011619ProgramUsageMessageEv:
   57|      2|absl::string_view ProgramUsageMessage() {
   58|      2|  absl::MutexLock l(&flags_internal::usage_message_guard);
   59|       |
   60|      2|  return flags_internal::program_usage_message != nullptr
  ------------------
  |  Branch (60:10): [True: 0, False: 2]
  ------------------
   61|      2|             ? absl::string_view(*flags_internal::program_usage_message)
   62|      2|             : "Warning: SetProgramUsageMessage() never called";
   63|      2|}

_ZN4absl12lts_2024011614flags_internal14GetUsageConfigEv:
  113|     64|FlagsUsageConfig GetUsageConfig() {
  114|     64|  absl::MutexLock l(&custom_usage_config_guard);
  115|       |
  116|     64|  if (custom_usage_config) return *custom_usage_config;
  ------------------
  |  Branch (116:7): [True: 0, False: 64]
  ------------------
  117|       |
  118|     64|  FlagsUsageConfig default_config;
  119|     64|  default_config.contains_helpshort_flags = &ContainsHelpshortFlags;
  120|     64|  default_config.contains_help_flags = &ContainsHelppackageFlags;
  121|     64|  default_config.contains_helppackage_flags = &ContainsHelppackageFlags;
  122|     64|  default_config.version_string = &VersionString;
  123|     64|  default_config.normalize_filename = &NormalizeFilename;
  124|       |
  125|     64|  return default_config;
  126|     64|}
usage_config.cc:_ZN4absl12lts_2024011614flags_internal12_GLOBAL__N_117NormalizeFilenameENSt3__117basic_string_viewIcNS3_11char_traitsIcEEEE:
   96|     64|std::string NormalizeFilename(absl::string_view filename) {
   97|       |  // Skip any leading slashes
   98|     64|  auto pos = filename.find_first_not_of("\\/");
   99|     64|  if (pos == absl::string_view::npos) return "";
  ------------------
  |  Branch (99:7): [True: 0, False: 64]
  ------------------
  100|       |
  101|     64|  filename.remove_prefix(pos);
  102|     64|  return std::string(filename);
  103|     64|}

_ZN4absl12lts_2024011612AnyInvocableIKFNSt3__110unique_ptrIN8fuzztest8internal14FuzzTestFuzzerENS2_14default_deleteIS6_EEEERKNS5_8FuzzTestEEEC2IZNS5_17RegistrationToken24GetFuzzTestFuzzerFactoryINS5_27RegistrationWithDomainsBaseIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS3_I11avifDecoderN4avif16UniquePtrDeleterEEEEEENS5_9NoFixtureEPFvRKSO_bSS_EPvEESE_ONS5_12RegistrationIT0_T1_T_T2_EEEUlSC_E_vEEOS13_:
  186|      4|      : Impl(internal_any_invocable::ConversionConstruct(),
  187|      4|             std::forward<F>(f)) {}
_ZN4absl12lts_2024011612AnyInvocableIKFNSt3__110unique_ptrIN8fuzztest8internal14FuzzTestFuzzerENS2_14default_deleteIS6_EEEERKNS5_8FuzzTestEEEC2EOSE_:
  176|      8|  AnyInvocable(AnyInvocable&& /*f*/) noexcept = default;

init_fuzztest.cc:_ZN4absl12lts_2024011611FunctionRefIFvRN8fuzztest8internal8FuzzTestEEEC2IZNS2_19ListRegisteredTestsEvE3$_0vEERKT_:
  105|      6|      : invoker_(&absl::functional_internal::InvokeObject<F, R, Args...>) {
  106|      6|    absl::functional_internal::AssertNonNull(f);
  107|      6|    ptr_.obj = &f;
  108|      6|  }
googletest_adaptor.cc:_ZN4absl12lts_2024011611FunctionRefIFvRN8fuzztest8internal8FuzzTestEEEC2IZNS3_30RegisterFuzzTestsAsGoogleTestsEPiPPPcRKNS3_13ConfigurationEE3$_0vEERKT_:
  105|      2|      : invoker_(&absl::functional_internal::InvokeObject<F, R, Args...>) {
  106|      2|    absl::functional_internal::AssertNonNull(f);
  107|      2|    ptr_.obj = &f;
  108|      2|  }
_ZNK4absl12lts_2024011611FunctionRefIFvRN8fuzztest8internal8FuzzTestEEEclES5_:
  131|     16|  R operator()(Args... args) const {
  132|     16|    return invoker_(ptr_, std::forward<Args>(args)...);
  133|     16|  }
status.cc:_ZN4absl12lts_2024011611FunctionRefIFvNSt3__117basic_string_viewIcNS2_11char_traitsIcEEEERKNS0_4CordEEEC2IZ10SetMessageRKNS0_6StatusES6_E3$_0vEERKT_:
  105|    393|      : invoker_(&absl::functional_internal::InvokeObject<F, R, Args...>) {
  106|    393|    absl::functional_internal::AssertNonNull(f);
  107|    393|    ptr_.obj = &f;
  108|    393|  }

_ZN4absl12lts_2024011622internal_any_invocable12EmptyManagerENS1_14FunctionToCallEPNS1_15TypeErasedStateES4_:
  246|      8|                         TypeErasedState* /*to*/) noexcept {}
_ZNK4absl12lts_2024011622internal_any_invocable4ImplIKFNSt3__110unique_ptrIN8fuzztest8internal14FuzzTestFuzzerENS3_14default_deleteIS7_EEEERKNS6_8FuzzTestEEE14ExtractInvokerEv:
  831|      2|    InvokerType<noex, ReturnType, P...>* ExtractInvoker() cv {                 \
  832|      2|      using QualifiedTestType = int cv ref;                                    \
  833|      2|      auto* invoker = this->invoker_;                                          \
  834|      2|      if (!std::is_const<QualifiedTestType>::value &&                          \
  ------------------
  |  Branch (834:11): [Folded, False: 2]
  ------------------
  835|      2|          std::is_rvalue_reference<QualifiedTestType>::value) {                \
  ------------------
  |  Branch (835:11): [Folded, False: 0]
  ------------------
  836|      0|        ABSL_ASSERT([this]() {                                                 \
  ------------------
  |  |   95|      0|  (false ? static_cast<void>(expr) : static_cast<void>(0))
  |  |  ------------------
  |  |  |  Branch (95:4): [Folded, False: 0]
  |  |  ------------------
  ------------------
  837|      0|          /* We checked that this isn't const above, so const_cast is safe */  \
  838|      0|          const_cast<Impl*>(this)->invoker_ = InvokedAfterMove;                \
  839|      0|          return this->HasValue();                                             \
  840|      0|        }());                                                                  \
  841|      0|      }                                                                        \
  842|      2|      return invoker;                                                          \
  843|      2|    }                                                                          \
_ZN4absl12lts_2024011622internal_any_invocable4ImplIKFNSt3__110unique_ptrIN8fuzztest8internal14FuzzTestFuzzerENS3_14default_deleteIS7_EEEERKNS6_8FuzzTestEEEC2IZNS6_17RegistrationToken24GetFuzzTestFuzzerFactoryINS6_27RegistrationWithDomainsBaseIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS4_I11avifDecoderN4avif16UniquePtrDeleterEEEEEENS6_9NoFixtureEPFvRKSP_bST_EPvEENS0_12AnyInvocableISE_EEONS6_12RegistrationIT0_T1_T_T2_EEEUlSD_E_EENS1_19ConversionConstructEOS16_:
  813|      4|        : Core(TypedConversionConstruct<                                       \
  814|      4|                   typename std::decay<F>::type inv_quals>(),                  \
  815|      4|               std::forward<F>(f)) {}                                          \
_ZN4absl12lts_2024011622internal_any_invocable8CoreImplILb0ENSt3__110unique_ptrIN8fuzztest8internal14FuzzTestFuzzerENS3_14default_deleteIS7_EEEEJRKNS6_8FuzzTestEEEC2IRKZNS6_17RegistrationToken24GetFuzzTestFuzzerFactoryINS6_27RegistrationWithDomainsBaseIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS4_I11avifDecoderN4avif16UniquePtrDeleterEEEEEENS6_9NoFixtureEPFvRKSO_bSS_EPvEENS0_12AnyInvocableIKFSA_SD_EEEONS6_12RegistrationIT0_T1_T_T2_EEEUlSD_E_S1A_EENS1_24TypedConversionConstructIS16_EEOS14_:
  454|      4|  explicit CoreImpl(TypedConversionConstruct<QualDecayedTRef>, F&& f) {
  455|      4|    using DecayedT = RemoveCVRef<QualDecayedTRef>;
  456|       |
  457|      4|    constexpr TargetType kTargetType =
  458|      4|        (std::is_pointer<DecayedT>::value ||
  ------------------
  |  Branch (458:10): [Folded, False: 0]
  ------------------
  459|      0|         std::is_member_pointer<DecayedT>::value)
  ------------------
  |  Branch (459:10): [Folded, False: 0]
  ------------------
  460|      4|            ? TargetType::kPointer
  461|      4|        : IsCompatibleAnyInvocable<DecayedT>::value
  ------------------
  |  Branch (461:11): [Folded, False: 4]
  ------------------
  462|      4|            ? TargetType::kCompatibleAnyInvocable
  463|      4|        : IsAnyInvocable<DecayedT>::value
  ------------------
  |  Branch (463:11): [Folded, False: 4]
  ------------------
  464|      4|            ? TargetType::kIncompatibleAnyInvocable
  465|      4|            : TargetType::kOther;
  466|       |    // NOTE: We only use integers instead of enums as template parameters in
  467|       |    // order to work around a bug on C++14 under MSVC 2017.
  468|       |    // See b/236131881.
  469|      4|    Initialize<kTargetType, QualDecayedTRef>(std::forward<F>(f));
  470|      4|  }
_ZN4absl12lts_2024011622internal_any_invocable8CoreImplILb0ENSt3__110unique_ptrIN8fuzztest8internal14FuzzTestFuzzerENS3_14default_deleteIS7_EEEEJRKNS6_8FuzzTestEEE10InitializeILNSE_10TargetTypeE3ERKZNS6_17RegistrationToken24GetFuzzTestFuzzerFactoryINS6_27RegistrationWithDomainsBaseIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS4_I11avifDecoderN4avif16UniquePtrDeleterEEEEEENS6_9NoFixtureEPFvRKSP_bST_EPvEENS0_12AnyInvocableIKFSA_SD_EEEONS6_12RegistrationIT0_T1_T_T2_EEEUlSD_E_S1B_vEEvOS16_:
  572|      4|  void Initialize(F&& f) {
  573|      4|    InitializeStorage<QualDecayedTRef>(std::forward<F>(f));
  574|      4|  }
_ZN4absl12lts_2024011622internal_any_invocable8CoreImplILb0ENSt3__110unique_ptrIN8fuzztest8internal14FuzzTestFuzzerENS3_14default_deleteIS7_EEEEJRKNS6_8FuzzTestEEE17InitializeStorageIRKZNS6_17RegistrationToken24GetFuzzTestFuzzerFactoryINS6_27RegistrationWithDomainsBaseIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS4_I11avifDecoderN4avif16UniquePtrDeleterEEEEEENS6_9NoFixtureEPFvRKSO_bSS_EPvEENS0_12AnyInvocableIKFSA_SD_EEEONS6_12RegistrationIT0_T1_T_T2_EEEUlSD_E_JS1A_ETnNS3_9enable_ifIXntsr15IsStoredLocallyINS3_9remove_cvINS3_16remove_referenceIS16_E4typeEE4typeEEE5valueEiE4typeELi0EEEvDpOT0_:
  594|      4|  void InitializeStorage(Args&&... args) {
  595|      4|    InitializeRemoteManager<RemoveCVRef<QualTRef>>(std::forward<Args>(args)...);
  596|       |    // This is set after everything else in case an exception is thrown in an
  597|       |    // earlier step of the initialization.
  598|      4|    invoker_ = RemoteInvoker<SigIsNoexcept, ReturnType, QualTRef, P...>;
  599|      4|  }
_ZN4absl12lts_2024011622internal_any_invocable8CoreImplILb0ENSt3__110unique_ptrIN8fuzztest8internal14FuzzTestFuzzerENS3_14default_deleteIS7_EEEEJRKNS6_8FuzzTestEEE23InitializeRemoteManagerIZNS6_17RegistrationToken24GetFuzzTestFuzzerFactoryINS6_27RegistrationWithDomainsBaseIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS4_I11avifDecoderN4avif16UniquePtrDeleterEEEEEENS6_9NoFixtureEPFvRKSO_bSS_EPvEENS0_12AnyInvocableIKFSA_SD_EEEONS6_12RegistrationIT0_T1_T_T2_EEEUlSD_E_JS1A_ETnNS3_9enable_ifIXntsr23HasTrivialRemoteStorageIS16_EE5valueEiE4typeELi0EEEvDpOT0_:
  633|      4|  void InitializeRemoteManager(Args&&... args) {
  634|      4|    state_.remote.target = ::new T(std::forward<Args>(args)...);
  635|      4|    manager_ = RemoteManagerNontrivial<T>;
  636|      4|  }
_ZN4absl12lts_2024011622internal_any_invocable23RemoteManagerNontrivialIZN8fuzztest8internal17RegistrationToken24GetFuzzTestFuzzerFactoryINS4_27RegistrationWithDomainsBaseIJNSt3__112basic_stringIcNS8_11char_traitsIcEENS8_9allocatorIcEEEEbNS8_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEENS4_9NoFixtureEPFvRKSE_bSJ_EPvEENS0_12AnyInvocableIKFNSF_INS4_14FuzzTestFuzzerENS8_14default_deleteISS_EEEERKNS4_8FuzzTestEEEEONS4_12RegistrationIT0_T1_T_T2_EEEUlSY_E_EEvNS1_14FunctionToCallEPNS1_15TypeErasedStateES1B_:
  340|      8|                             TypeErasedState* const to) noexcept {
  341|      8|  static_assert(!IsStoredLocally<T>::value,
  342|      8|                "Remote storage must only be used for types that do not "
  343|      8|                "qualify for local storage.");
  344|       |
  345|      8|  switch (operation) {
  ------------------
  |  Branch (345:11): [True: 8, False: 0]
  ------------------
  346|      8|    case FunctionToCall::relocate_from_to:
  ------------------
  |  Branch (346:5): [True: 8, False: 0]
  ------------------
  347|       |      // NOTE: Requires that the left-hand operand is already empty.
  348|      8|      to->remote.target = from->remote.target;
  349|      8|      return;
  350|      0|    case FunctionToCall::dispose:
  ------------------
  |  Branch (350:5): [True: 0, False: 8]
  ------------------
  351|      0|      ::delete static_cast<T*>(from->remote.target);  // Must not throw.
  352|      0|      return;
  353|      8|  }
  354|      0|  ABSL_UNREACHABLE();
  ------------------
  |  |  225|      0|  do {                                           \
  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  ------------------
  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  ------------------
  |  |  229|      0|  } while (false)
  |  |  ------------------
  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
  355|      0|}
_ZN4absl12lts_2024011622internal_any_invocable13RemoteInvokerILb0ENSt3__110unique_ptrIN8fuzztest8internal14FuzzTestFuzzerENS3_14default_deleteIS7_EEEERKZNS6_17RegistrationToken24GetFuzzTestFuzzerFactoryINS6_27RegistrationWithDomainsBaseIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS4_I11avifDecoderN4avif16UniquePtrDeleterEEEEEENS6_9NoFixtureEPFvRKSJ_bSN_EPvEENS0_12AnyInvocableIKFSA_RKNS6_8FuzzTestEEEEONS6_12RegistrationIT0_T1_T_T2_EEEUlSY_E_JSY_EEES12_PNS1_15TypeErasedStateEDpNS1_18ForwardedParameterIT2_E4typeE:
  361|      2|    ForwardedParameterType<P>... args) noexcept(SigIsNoexcept) {
  362|      2|  using RawT = RemoveCVRef<QualTRef>;
  363|      2|  static_assert(!IsStoredLocally<RawT>::value,
  364|      2|                "Target object must be in remote storage in order to be "
  365|      2|                "invoked from it.");
  366|       |
  367|      2|  auto& f = *static_cast<RawT*>(state->remote.target);
  368|      2|  return (InvokeR<ReturnType>)(static_cast<QualTRef>(f),
  369|      2|                               static_cast<ForwardedParameterType<P>>(args)...);
  370|      2|}
_ZN4absl12lts_2024011622internal_any_invocable7InvokeRINSt3__110unique_ptrIN8fuzztest8internal14FuzzTestFuzzerENS3_14default_deleteIS7_EEEERKZNS6_17RegistrationToken24GetFuzzTestFuzzerFactoryINS6_27RegistrationWithDomainsBaseIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS4_I11avifDecoderN4avif16UniquePtrDeleterEEEEEENS6_9NoFixtureEPFvRKSJ_bSN_EPvEENS0_12AnyInvocableIKFSA_RKNS6_8FuzzTestEEEEONS6_12RegistrationIT0_T1_T_T2_EEEUlSY_E_JSY_ETnNS3_9enable_ifIXntsr3std7is_voidIS14_EE5valueEiE4typeELi0EEES14_OS12_DpOT1_:
  137|      2|ReturnType InvokeR(F&& f, P&&... args) {
  138|       |  // GCC 12 has a false-positive -Wmaybe-uninitialized warning here.
  139|       |#if ABSL_INTERNAL_HAVE_MIN_GNUC_VERSION(12, 0)
  140|       |#pragma GCC diagnostic push
  141|       |#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
  142|       |#endif
  143|      2|  return absl::base_internal::invoke(std::forward<F>(f),
  144|      2|                                     std::forward<P>(args)...);
  145|       |#if ABSL_INTERNAL_HAVE_MIN_GNUC_VERSION(12, 0)
  146|       |#pragma GCC diagnostic pop
  147|       |#endif
  148|      2|}
_ZN4absl12lts_2024011622internal_any_invocable8CoreImplILb0ENSt3__110unique_ptrIN8fuzztest8internal14FuzzTestFuzzerENS3_14default_deleteIS7_EEEEJRKNS6_8FuzzTestEEED2Ev:
  508|      8|  ~CoreImpl() { manager_(FunctionToCall::dispose, &state_, &state_); }
_ZN4absl12lts_2024011622internal_any_invocable8CoreImplILb0ENSt3__110unique_ptrIN8fuzztest8internal14FuzzTestFuzzerENS3_14default_deleteIS7_EEEEJRKNS6_8FuzzTestEEEC2EOSE_:
  480|      8|  CoreImpl(CoreImpl&& other) noexcept {
  481|      8|    other.manager_(FunctionToCall::relocate_from_to, &other.state_, &state_);
  482|      8|    manager_ = other.manager_;
  483|      8|    invoker_ = other.invoker_;
  484|      8|    other.manager_ = EmptyManager;
  485|      8|    other.invoker_ = nullptr;
  486|      8|  }

init_fuzztest.cc:_ZN4absl12lts_2024011619functional_internal12InvokeObjectIZN8fuzztest19ListRegisteredTestsEvE3$_0vJRNS3_8internal8FuzzTestEEEET0_NS1_7VoidPtrEDpNS1_8ForwardTIT1_E4typeE:
   75|     12|R InvokeObject(VoidPtr ptr, typename ForwardT<Args>::type... args) {
   76|     12|  auto o = static_cast<const Obj*>(ptr.obj);
   77|     12|  return static_cast<R>(
   78|     12|      absl::base_internal::invoke(*o, std::forward<Args>(args)...));
   79|     12|}
init_fuzztest.cc:_ZN4absl12lts_2024011619functional_internal13AssertNonNullIZN8fuzztest19ListRegisteredTestsEvE3$_0EEvRKT_:
  101|      6|void AssertNonNull(const F&) {}
googletest_adaptor.cc:_ZN4absl12lts_2024011619functional_internal12InvokeObjectIZN8fuzztest8internal30RegisterFuzzTestsAsGoogleTestsEPiPPPcRKNS4_13ConfigurationEE3$_0vJRNS4_8FuzzTestEEEET0_NS1_7VoidPtrEDpNS1_8ForwardTIT1_E4typeE:
   75|      4|R InvokeObject(VoidPtr ptr, typename ForwardT<Args>::type... args) {
   76|      4|  auto o = static_cast<const Obj*>(ptr.obj);
   77|      4|  return static_cast<R>(
   78|      4|      absl::base_internal::invoke(*o, std::forward<Args>(args)...));
   79|      4|}
googletest_adaptor.cc:_ZN4absl12lts_2024011619functional_internal13AssertNonNullIZN8fuzztest8internal30RegisterFuzzTestsAsGoogleTestsEPiPPPcRKNS4_13ConfigurationEE3$_0EEvRKT_:
  101|      2|void AssertNonNull(const F&) {}
status.cc:_ZN4absl12lts_2024011619functional_internal13AssertNonNullIZ10SetMessageRKNS0_6StatusENSt3__117basic_string_viewIcNS6_11char_traitsIcEEEEE3$_0EEvRKT_:
  101|    393|void AssertNonNull(const F&) {}

_ZN4absl12lts_2024011613hash_internal15MixingHashState16LowLevelHashImplEPKhm:
   63|     12|                                           size_t len) {
   64|     12|  return LowLevelHash(data, len, Seed(), kHashSalt);
   65|     12|}

_ZN4absl12lts_2024011613hash_internal15MixingHashState18combine_contiguousES2_PKhm:
 1023|    120|                                            size_t size) {
 1024|    120|    return MixingHashState(
 1025|    120|        CombineContiguousImpl(hash_state.state_, first, size,
 1026|    120|                              std::integral_constant<int, sizeof(size_t)>{}));
 1027|    120|  }
_ZN4absl12lts_2024011613hash_internal15MixingHashStateC2Ev:
 1052|     60|  MixingHashState() : state_(Seed()) {}
_ZN4absl12lts_2024011613hash_internal15MixingHashStateC2Em:
 1085|    120|  explicit MixingHashState(uint64_t state) : state_(state) {}
_ZN4absl12lts_2024011613hash_internal15MixingHashState9Read9To16EPKhm:
 1114|     26|                                                 size_t len) {
 1115|     26|    uint64_t low_mem = absl::base_internal::UnalignedLoad64(p);
 1116|     26|    uint64_t high_mem = absl::base_internal::UnalignedLoad64(p + len - 8);
 1117|     26|#ifdef ABSL_IS_LITTLE_ENDIAN
 1118|     26|    uint64_t most_significant = high_mem;
 1119|     26|    uint64_t least_significant = low_mem;
 1120|       |#else
 1121|       |    uint64_t most_significant = low_mem;
 1122|       |    uint64_t least_significant = high_mem;
 1123|       |#endif
 1124|     26|    return {least_significant, most_significant};
 1125|     26|  }
_ZN4absl12lts_2024011613hash_internal15MixingHashState8Read4To8EPKhm:
 1128|     82|  static uint64_t Read4To8(const unsigned char* p, size_t len) {
 1129|     82|    uint32_t low_mem = absl::base_internal::UnalignedLoad32(p);
 1130|     82|    uint32_t high_mem = absl::base_internal::UnalignedLoad32(p + len - 4);
 1131|     82|#ifdef ABSL_IS_LITTLE_ENDIAN
 1132|     82|    uint32_t most_significant = high_mem;
 1133|     82|    uint32_t least_significant = low_mem;
 1134|       |#else
 1135|       |    uint32_t most_significant = low_mem;
 1136|       |    uint32_t least_significant = high_mem;
 1137|       |#endif
 1138|     82|    return (static_cast<uint64_t>(most_significant) << (len - 4) * 8) |
 1139|     82|           least_significant;
 1140|     82|  }
_ZN4absl12lts_2024011613hash_internal15MixingHashState3MixEmm:
 1162|     94|  ABSL_ATTRIBUTE_ALWAYS_INLINE static uint64_t Mix(uint64_t state, uint64_t v) {
 1163|       |    // Though the 128-bit product on AArch64 needs two instructions, it is
 1164|       |    // still a good balance between speed and hash quality.
 1165|     94|    using MultType =
 1166|     94|        absl::conditional_t<sizeof(size_t) == 4, uint64_t, uint128>;
 1167|       |    // We do the addition in 64-bit space to make sure the 128-bit
 1168|       |    // multiplication is fast. If we were to do it as MultType the compiler has
 1169|       |    // to assume that the high word is non-zero and needs to perform 2
 1170|       |    // multiplications instead of one.
 1171|     94|    MultType m = state + v;
 1172|     94|    m *= kMul;
 1173|     94|    return static_cast<uint64_t>(m ^ (m >> (sizeof(m) * 8 / 2)));
 1174|     94|  }
_ZN4absl12lts_2024011613hash_internal15MixingHashState6Hash64EPKhm:
 1181|     12|                                                      size_t len) {
 1182|     12|#ifdef ABSL_HAVE_INTRINSIC_INT128
 1183|     12|    return LowLevelHashImpl(data, len);
 1184|       |#else
 1185|       |    return hash_internal::CityHash64(reinterpret_cast<const char*>(data), len);
 1186|       |#endif
 1187|     12|  }
_ZN4absl12lts_2024011613hash_internal15MixingHashState4SeedEv:
 1205|     72|  ABSL_ATTRIBUTE_ALWAYS_INLINE static uint64_t Seed() {
 1206|     72|#if (!defined(__clang__) || __clang_major__ > 11) && \
 1207|     72|    (!defined(__apple_build_version__) ||            \
 1208|     72|     __apple_build_version__ >= 19558921)  // Xcode 12
 1209|     72|    return static_cast<uint64_t>(reinterpret_cast<uintptr_t>(&kSeed));
 1210|       |#else
 1211|       |    // Workaround the absence of
 1212|       |    // https://github.com/llvm/llvm-project/commit/bc15bf66dcca76cc06fe71fca35b74dc4d521021.
 1213|       |    return static_cast<uint64_t>(reinterpret_cast<uintptr_t>(kSeed));
 1214|       |#endif
 1215|     72|  }
_ZN4absl12lts_2024011613hash_internal18PiecewiseChunkSizeEv:
   82|     12|constexpr size_t PiecewiseChunkSize() { return 1024; }
_ZN4absl12lts_2024011613hash_internal15MixingHashState21CombineContiguousImplEmPKhmNSt3__117integral_constantIiLi8EEE:
 1247|    120|    std::integral_constant<int, 8> /* sizeof_size_t */) {
 1248|       |  // For large values we use LowLevelHash or CityHash depending on the platform,
 1249|       |  // for small ones we just use a multiplicative hash.
 1250|    120|  uint64_t v;
 1251|    120|  if (len > 16) {
  ------------------
  |  Branch (1251:7): [True: 12, False: 108]
  ------------------
 1252|     12|    if (ABSL_PREDICT_FALSE(len > PiecewiseChunkSize())) {
  ------------------
  |  |  178|     12|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 12]
  |  |  |  Branch (178:49): [Folded, False: 12]
  |  |  |  Branch (178:58): [True: 0, False: 12]
  |  |  ------------------
  ------------------
 1253|      0|      return CombineLargeContiguousImpl64(state, first, len);
 1254|      0|    }
 1255|     12|    v = Hash64(first, len);
 1256|    108|  } else if (len > 8) {
  ------------------
  |  Branch (1256:14): [True: 26, False: 82]
  ------------------
 1257|       |    // This hash function was constructed by the ML-driven algorithm discovery
 1258|       |    // using reinforcement learning. We fed the agent lots of inputs from
 1259|       |    // microbenchmarks, SMHasher, low hamming distance from generated inputs and
 1260|       |    // picked up the one that was good on micro and macrobenchmarks.
 1261|     26|    auto p = Read9To16(first, len);
 1262|     26|    uint64_t lo = p.first;
 1263|     26|    uint64_t hi = p.second;
 1264|       |    // Rotation by 53 was found to be most often useful when discovering these
 1265|       |    // hashing algorithms with ML techniques.
 1266|     26|    lo = absl::rotr(lo, 53);
 1267|     26|    state += kMul;
 1268|     26|    lo += state;
 1269|     26|    state ^= hi;
 1270|     26|    uint128 m = state;
 1271|     26|    m *= lo;
 1272|     26|    return static_cast<uint64_t>(m ^ (m >> 64));
 1273|     82|  } else if (len >= 4) {
  ------------------
  |  Branch (1273:14): [True: 82, False: 0]
  ------------------
 1274|     82|    v = Read4To8(first, len);
 1275|     82|  } else if (len > 0) {
  ------------------
  |  Branch (1275:14): [True: 0, False: 0]
  ------------------
 1276|      0|    v = Read1To3(first, len);
 1277|      0|  } else {
 1278|       |    // Empty ranges have no effect.
 1279|      0|    return state;
 1280|      0|  }
 1281|     94|  return Mix(state, v);
 1282|    120|}
_ZNK4absl12lts_2024011613hash_internal8HashImplINSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEEclERKS7_:
 1299|     60|  size_t operator()(const T& value) const {
 1300|     60|    return MixingHashState::hash(value);
 1301|     60|  }
_ZN4absl12lts_2024011613hash_internal15MixingHashState4hashINSt3__117basic_string_viewIcNS4_11char_traitsIcEEEETnNS4_9enable_ifIXntsr16IntegralFastPathIT_EE5valueEiE4typeELi0EEEmRKSA_:
 1045|     60|  static size_t hash(const T& value) {
 1046|     60|    return static_cast<size_t>(combine(MixingHashState{}, value).state_);
 1047|     60|  }
_ZN4absl12lts_2024011613hash_internal13HashStateBaseINS1_15MixingHashStateEE7combineINSt3__117basic_string_viewIcNS6_11char_traitsIcEEEEJEEES3_S3_RKT_DpRKT0_:
 1310|     60|H HashStateBase<H>::combine(H state, const T& value, const Ts&... values) {
 1311|     60|  return H::combine(hash_internal::HashSelect::template Apply<T>::Invoke(
 1312|     60|                        std::move(state), value),
 1313|     60|                    values...);
 1314|     60|}
_ZN4absl12lts_2024011613hash_internal10HashSelect14HashValueProbe6InvokeINS1_15MixingHashStateENSt3__117basic_string_viewIcNS6_11char_traitsIcEEEEEENS6_9enable_ifIXsr3std7is_sameIT_DTcl13AbslHashValueclsr3stdE4movefp_Efp0_EEEE5valueESC_E4typeESC_RKT0_:
  937|     60|        H> {
  938|     60|      return AbslHashValue(std::move(state), value);
  939|     60|    }
_ZN4absl12lts_2024011613hash_internal13AbslHashValueINS1_15MixingHashStateEEET_S4_NSt3__117basic_string_viewIcNS5_11char_traitsIcEEEE:
  557|     60|H AbslHashValue(H hash_state, absl::string_view str) {
  558|     60|  return H::combine(
  559|     60|      H::combine_contiguous(std::move(hash_state), str.data(), str.size()),
  560|     60|      str.size());
  561|     60|}
_ZN4absl12lts_2024011613hash_internal13HashStateBaseINS1_15MixingHashStateEE18combine_contiguousIcEES3_S3_PKT_m:
 1319|     60|H HashStateBase<H>::combine_contiguous(H state, const T* data, size_t size) {
 1320|     60|  return hash_internal::hash_range_or_bytes(std::move(state), data, size);
 1321|     60|}
_ZN4absl12lts_2024011613hash_internal19hash_range_or_bytesINS1_15MixingHashStateEcEENSt3__19enable_ifIXsr23is_uniquely_representedIT0_EE5valueET_E4typeES7_PKS6_m:
  885|     60|hash_range_or_bytes(H hash_state, const T* data, size_t size) {
  886|     60|  const auto* bytes = reinterpret_cast<const unsigned char*>(data);
  887|     60|  return H::combine_contiguous(std::move(hash_state), bytes, sizeof(T) * size);
  888|     60|}
_ZN4absl12lts_2024011613hash_internal13HashStateBaseINS1_15MixingHashStateEE7combineImJEEES3_S3_RKT_DpRKT0_:
 1310|     60|H HashStateBase<H>::combine(H state, const T& value, const Ts&... values) {
 1311|     60|  return H::combine(hash_internal::HashSelect::template Apply<T>::Invoke(
 1312|     60|                        std::move(state), value),
 1313|     60|                    values...);
 1314|     60|}
_ZN4absl12lts_2024011613hash_internal10HashSelect24UniquelyRepresentedProbe6InvokeINS1_15MixingHashStateEmEENSt3__19enable_ifIXsr23is_uniquely_representedIT0_EE5valueET_E4typeES9_RKS8_:
  927|     60|        -> absl::enable_if_t<is_uniquely_represented<T>::value, H> {
  928|     60|      return hash_internal::hash_bytes(std::move(state), value);
  929|     60|    }
_ZN4absl12lts_2024011613hash_internal10hash_bytesINS1_15MixingHashStateEmEET_S4_RKT0_:
  340|     60|H hash_bytes(H hash_state, const T& value) {
  341|     60|  const unsigned char* start = reinterpret_cast<const unsigned char*>(&value);
  342|     60|  return H::combine_contiguous(std::move(hash_state), start, sizeof(value));
  343|     60|}
_ZN4absl12lts_2024011613hash_internal13HashStateBaseINS1_15MixingHashStateEE7combineES3_:
  234|    120|  static H combine(H state) { return state; }

_ZN4absl12lts_2024011613hash_internal12LowLevelHashEPKvmmPKm:
   32|     12|                      const uint64_t salt[5]) {
   33|       |  // Prefetch the cacheline that data resides in.
   34|     12|  PrefetchToLocalCache(data);
   35|     12|  const uint8_t* ptr = static_cast<const uint8_t*>(data);
   36|     12|  uint64_t starting_length = static_cast<uint64_t>(len);
   37|     12|  uint64_t current_state = seed ^ salt[0];
   38|       |
   39|     12|  if (len > 64) {
  ------------------
  |  Branch (39:7): [True: 0, False: 12]
  ------------------
   40|       |    // If we have more than 64 bytes, we're going to handle chunks of 64
   41|       |    // bytes at a time. We're going to build up two separate hash states
   42|       |    // which we will then hash together.
   43|      0|    uint64_t duplicated_state = current_state;
   44|       |
   45|      0|    do {
   46|       |      // Always prefetch the next cacheline.
   47|      0|      PrefetchToLocalCache(ptr + ABSL_CACHELINE_SIZE);
  ------------------
  |  |   77|      0|#define ABSL_CACHELINE_SIZE 64
  ------------------
   48|       |
   49|      0|      uint64_t a = absl::base_internal::UnalignedLoad64(ptr);
   50|      0|      uint64_t b = absl::base_internal::UnalignedLoad64(ptr + 8);
   51|      0|      uint64_t c = absl::base_internal::UnalignedLoad64(ptr + 16);
   52|      0|      uint64_t d = absl::base_internal::UnalignedLoad64(ptr + 24);
   53|      0|      uint64_t e = absl::base_internal::UnalignedLoad64(ptr + 32);
   54|      0|      uint64_t f = absl::base_internal::UnalignedLoad64(ptr + 40);
   55|      0|      uint64_t g = absl::base_internal::UnalignedLoad64(ptr + 48);
   56|      0|      uint64_t h = absl::base_internal::UnalignedLoad64(ptr + 56);
   57|       |
   58|      0|      uint64_t cs0 = Mix(a ^ salt[1], b ^ current_state);
   59|      0|      uint64_t cs1 = Mix(c ^ salt[2], d ^ current_state);
   60|      0|      current_state = (cs0 ^ cs1);
   61|       |
   62|      0|      uint64_t ds0 = Mix(e ^ salt[3], f ^ duplicated_state);
   63|      0|      uint64_t ds1 = Mix(g ^ salt[4], h ^ duplicated_state);
   64|      0|      duplicated_state = (ds0 ^ ds1);
   65|       |
   66|      0|      ptr += 64;
   67|      0|      len -= 64;
   68|      0|    } while (len > 64);
  ------------------
  |  Branch (68:14): [True: 0, False: 0]
  ------------------
   69|       |
   70|      0|    current_state = current_state ^ duplicated_state;
   71|      0|  }
   72|       |
   73|       |  // We now have a data `ptr` with at most 64 bytes and the current state
   74|       |  // of the hashing state machine stored in current_state.
   75|     28|  while (len > 16) {
  ------------------
  |  Branch (75:10): [True: 16, False: 12]
  ------------------
   76|     16|    uint64_t a = absl::base_internal::UnalignedLoad64(ptr);
   77|     16|    uint64_t b = absl::base_internal::UnalignedLoad64(ptr + 8);
   78|       |
   79|     16|    current_state = Mix(a ^ salt[1], b ^ current_state);
   80|       |
   81|     16|    ptr += 16;
   82|     16|    len -= 16;
   83|     16|  }
   84|       |
   85|       |  // We now have a data `ptr` with at most 16 bytes.
   86|     12|  uint64_t a = 0;
   87|     12|  uint64_t b = 0;
   88|     12|  if (len > 8) {
  ------------------
  |  Branch (88:7): [True: 0, False: 12]
  ------------------
   89|       |    // When we have at least 9 and at most 16 bytes, set A to the first 64
   90|       |    // bits of the input and B to the last 64 bits of the input. Yes, they will
   91|       |    // overlap in the middle if we are working with less than the full 16
   92|       |    // bytes.
   93|      0|    a = absl::base_internal::UnalignedLoad64(ptr);
   94|      0|    b = absl::base_internal::UnalignedLoad64(ptr + len - 8);
   95|     12|  } else if (len > 3) {
  ------------------
  |  Branch (95:14): [True: 8, False: 4]
  ------------------
   96|       |    // If we have at least 4 and at most 8 bytes, set A to the first 32
   97|       |    // bits and B to the last 32 bits.
   98|      8|    a = absl::base_internal::UnalignedLoad32(ptr);
   99|      8|    b = absl::base_internal::UnalignedLoad32(ptr + len - 4);
  100|      8|  } else if (len > 0) {
  ------------------
  |  Branch (100:14): [True: 4, False: 0]
  ------------------
  101|       |    // If we have at least 1 and at most 3 bytes, read all of the provided
  102|       |    // bits into A, with some adjustments.
  103|      4|    a = static_cast<uint64_t>((ptr[0] << 16) | (ptr[len >> 1] << 8) |
  104|      4|                              ptr[len - 1]);
  105|      4|    b = 0;
  106|      4|  } else {
  107|      0|    a = 0;
  108|      0|    b = 0;
  109|      0|  }
  110|       |
  111|     12|  uint64_t w = Mix(a ^ salt[1], b ^ current_state);
  112|     12|  uint64_t z = salt[1] ^ starting_length;
  113|     12|  return Mix(w, z);
  114|     12|}
low_level_hash.cc:_ZN4absl12lts_2024011613hash_internalL3MixEmm:
   25|     40|static uint64_t Mix(uint64_t v0, uint64_t v1) {
   26|     40|  absl::uint128 p = v0;
   27|     40|  p *= v1;
   28|     40|  return absl::Uint128Low64(p) ^ absl::Uint128High64(p);
   29|     40|}

_ZN4absl12lts_202401164rotrImEENSt3__19enable_ifIXsr3std11is_unsignedIT_EE5valueES4_E4typeES4_i:
   75|     26|    rotr(T x, int s) noexcept {
   76|     26|  return numeric_internal::RotateRight(x, s);
   77|     26|}
_ZN4absl12lts_2024011611countl_zeroImEENSt3__19enable_ifIXsr3std11is_unsignedIT_EE5valueEiE4typeES4_:
  103|  21.2k|    countl_zero(T x) noexcept {
  104|  21.2k|  return numeric_internal::CountLeadingZeroes(x);
  105|  21.2k|}
_ZN4absl12lts_202401169bit_widthImEENSt3__19enable_ifIXsr3std11is_unsignedIT_EE5valueEiE4typeES4_:
  160|  4.89k|    bit_width(T x) noexcept {
  161|  4.89k|  return std::numeric_limits<T>::digits - countl_zero(x);
  162|  4.89k|}
_ZN4absl12lts_2024011611countr_zeroIjEENSt3__19enable_ifIXsr3std11is_unsignedIT_EE5valueEiE4typeES4_:
  118|     13|    countr_zero(T x) noexcept {
  119|     13|  return numeric_internal::CountTrailingZeroes(x);
  120|     13|}
_ZN4absl12lts_2024011611countr_zeroItEENSt3__19enable_ifIXsr3std11is_unsignedIT_EE5valueEiE4typeES4_:
  118|     62|    countr_zero(T x) noexcept {
  119|     62|  return numeric_internal::CountTrailingZeroes(x);
  120|     62|}
_ZN4absl12lts_2024011611countl_zeroIjEENSt3__19enable_ifIXsr3std11is_unsignedIT_EE5valueEiE4typeES4_:
  103|      4|    countl_zero(T x) noexcept {
  104|      4|  return numeric_internal::CountLeadingZeroes(x);
  105|      4|}
_ZN4absl12lts_202401169bit_widthIjEENSt3__19enable_ifIXsr3std11is_unsignedIT_EE5valueEiE4typeES4_:
  160|      4|    bit_width(T x) noexcept {
  161|      4|  return std::numeric_limits<T>::digits - countl_zero(x);
  162|      4|}

_ZN4absl12lts_202401167uint128rSEi:
  613|  24.0k|inline uint128& uint128::operator>>=(int amount) {
  614|  24.0k|  *this = *this >> amount;
  615|  24.0k|  return *this;
  616|  24.0k|}
_ZN4absl12lts_202401167uint128pLES1_:
  618|  2.47k|inline uint128& uint128::operator+=(uint128 other) {
  619|  2.47k|  *this = *this + other;
  620|  2.47k|  return *this;
  621|  2.47k|}
_ZN4absl12lts_202401167uint128mLES1_:
  628|  11.4k|inline uint128& uint128::operator*=(uint128 other) {
  629|  11.4k|  *this = *this * other;
  630|  11.4k|  return *this;
  631|  11.4k|}
_ZN4absl12lts_202401167uint128dVES1_:
  633|     70|inline uint128& uint128::operator/=(uint128 other) {
  634|     70|  *this = *this / other;
  635|     70|  return *this;
  636|     70|}
_ZN4absl12lts_2024011612Uint128Low64ENS0_7uint128E:
  643|  10.2k|constexpr uint64_t Uint128Low64(uint128 v) { return v.lo_; }
_ZN4absl12lts_2024011613Uint128High64ENS0_7uint128E:
  645|  44.6k|constexpr uint64_t Uint128High64(uint128 v) { return v.hi_; }
_ZNK4absl12lts_202401167uint128cvbEv:
  719|     70|constexpr uint128::operator bool() const { return lo_ || hi_; }
  ------------------
  |  Branch (719:51): [True: 34, False: 36]
  |  Branch (719:58): [True: 0, False: 36]
  ------------------
_ZNK4absl12lts_202401167uint128cvjEv:
  752|     70|constexpr uint128::operator unsigned int() const {
  753|     70|  return static_cast<unsigned int>(lo_);
  754|     70|}
_ZNK4absl12lts_202401167uint128cvmEv:
  759|  14.9k|constexpr uint128::operator unsigned long() const {  // NOLINT(runtime/int)
  760|  14.9k|  return static_cast<unsigned long>(lo_);            // NOLINT(runtime/int)
  761|  14.9k|}
_ZNK4absl12lts_202401167uint128cvoEv:
  776|   245k|constexpr uint128::operator unsigned __int128() const {
  777|   245k|  return (static_cast<unsigned __int128>(hi_) << 64) + lo_;
  778|   245k|}
_ZN4absl12lts_20240116eqENS0_7uint128ES1_:
  798|  22.4k|constexpr bool operator==(uint128 lhs, uint128 rhs) {
  799|  22.4k|#if defined(ABSL_HAVE_INTRINSIC_INT128)
  800|  22.4k|  return static_cast<unsigned __int128>(lhs) ==
  801|  22.4k|         static_cast<unsigned __int128>(rhs);
  802|       |#else
  803|       |  return (Uint128Low64(lhs) == Uint128Low64(rhs) &&
  804|       |          Uint128High64(lhs) == Uint128High64(rhs));
  805|       |#endif
  806|  22.4k|}
_ZN4absl12lts_20240116ltENS0_7uint128ES1_:
  810|  13.3k|constexpr bool operator<(uint128 lhs, uint128 rhs) {
  811|  13.3k|#ifdef ABSL_HAVE_INTRINSIC_INT128
  812|  13.3k|  return static_cast<unsigned __int128>(lhs) <
  813|  13.3k|         static_cast<unsigned __int128>(rhs);
  814|       |#else
  815|       |  return (Uint128High64(lhs) == Uint128High64(rhs))
  816|       |             ? (Uint128Low64(lhs) < Uint128Low64(rhs))
  817|       |             : (Uint128High64(lhs) < Uint128High64(rhs));
  818|       |#endif
  819|  13.3k|}
_ZN4absl12lts_20240116gtENS0_7uint128ES1_:
  821|  13.3k|constexpr bool operator>(uint128 lhs, uint128 rhs) { return rhs < lhs; }
_ZN4absl12lts_20240116anENS0_7uint128ES1_:
  871|  15.0k|constexpr inline uint128 operator&(uint128 lhs, uint128 rhs) {
  872|  15.0k|#if defined(ABSL_HAVE_INTRINSIC_INT128)
  873|  15.0k|  return static_cast<unsigned __int128>(lhs) &
  874|  15.0k|         static_cast<unsigned __int128>(rhs);
  875|       |#else
  876|       |  return MakeUint128(Uint128High64(lhs) & Uint128High64(rhs),
  877|       |                     Uint128Low64(lhs) & Uint128Low64(rhs));
  878|       |#endif
  879|  15.0k|}
_ZN4absl12lts_20240116lsENS0_7uint128Ei:
  908|  28.2k|constexpr uint128 operator<<(uint128 lhs, int amount) {
  909|  28.2k|#ifdef ABSL_HAVE_INTRINSIC_INT128
  910|  28.2k|  return static_cast<unsigned __int128>(lhs) << amount;
  911|       |#else
  912|       |  // uint64_t shifts of >= 64 are undefined, so we will need some
  913|       |  // special-casing.
  914|       |  return amount >= 64  ? MakeUint128(Uint128Low64(lhs) << (amount - 64), 0)
  915|       |         : amount == 0 ? lhs
  916|       |                       : MakeUint128((Uint128High64(lhs) << amount) |
  917|       |                                         (Uint128Low64(lhs) >> (64 - amount)),
  918|       |                                     Uint128Low64(lhs) << amount);
  919|       |#endif
  920|  28.2k|}
_ZN4absl12lts_20240116rsENS0_7uint128Ei:
  922|  24.0k|constexpr uint128 operator>>(uint128 lhs, int amount) {
  923|  24.0k|#ifdef ABSL_HAVE_INTRINSIC_INT128
  924|  24.0k|  return static_cast<unsigned __int128>(lhs) >> amount;
  925|       |#else
  926|       |  // uint64_t shifts of >= 64 are undefined, so we will need some
  927|       |  // special-casing.
  928|       |  return amount >= 64  ? MakeUint128(0, Uint128High64(lhs) >> (amount - 64))
  929|       |         : amount == 0 ? lhs
  930|       |                       : MakeUint128(Uint128High64(lhs) >> amount,
  931|       |                                     (Uint128Low64(lhs) >> amount) |
  932|       |                                         (Uint128High64(lhs) << (64 - amount)));
  933|       |#endif
  934|  24.0k|}
_ZN4absl12lts_20240116plENS0_7uint128ES1_:
  947|  4.21k|constexpr uint128 operator+(uint128 lhs, uint128 rhs) {
  948|  4.21k|#if defined(ABSL_HAVE_INTRINSIC_INT128)
  949|  4.21k|  return static_cast<unsigned __int128>(lhs) +
  950|  4.21k|         static_cast<unsigned __int128>(rhs);
  951|       |#else
  952|       |  return int128_internal::AddResult(
  953|       |      MakeUint128(Uint128High64(lhs) + Uint128High64(rhs),
  954|       |                  Uint128Low64(lhs) + Uint128Low64(rhs)),
  955|       |      lhs);
  956|       |#endif
  957|  4.21k|}
_ZN4absl12lts_20240116miENS0_7uint128ES1_:
  970|  22.4k|constexpr uint128 operator-(uint128 lhs, uint128 rhs) {
  971|  22.4k|#if defined(ABSL_HAVE_INTRINSIC_INT128)
  972|  22.4k|  return static_cast<unsigned __int128>(lhs) -
  973|  22.4k|         static_cast<unsigned __int128>(rhs);
  974|       |#else
  975|       |  return int128_internal::SubstructResult(
  976|       |      MakeUint128(Uint128High64(lhs) - Uint128High64(rhs),
  977|       |                  Uint128Low64(lhs) - Uint128Low64(rhs)),
  978|       |      lhs, rhs);
  979|       |#endif
  980|  22.4k|}
_ZN4absl12lts_20240116mlENS0_7uint128ES1_:
  982|  19.0k|inline uint128 operator*(uint128 lhs, uint128 rhs) {
  983|  19.0k|#if defined(ABSL_HAVE_INTRINSIC_INT128)
  984|       |  // TODO(strel) Remove once alignment issues are resolved and unsigned __int128
  985|       |  // can be used for uint128 storage.
  986|  19.0k|  return static_cast<unsigned __int128>(lhs) *
  987|  19.0k|         static_cast<unsigned __int128>(rhs);
  988|       |#elif defined(_MSC_VER) && defined(_M_X64) && !defined(_M_ARM64EC)
  989|       |  uint64_t carry;
  990|       |  uint64_t low = _umul128(Uint128Low64(lhs), Uint128Low64(rhs), &carry);
  991|       |  return MakeUint128(Uint128Low64(lhs) * Uint128High64(rhs) +
  992|       |                         Uint128High64(lhs) * Uint128Low64(rhs) + carry,
  993|       |                     low);
  994|       |#else   // ABSL_HAVE_INTRINSIC128
  995|       |  uint64_t a32 = Uint128Low64(lhs) >> 32;
  996|       |  uint64_t a00 = Uint128Low64(lhs) & 0xffffffff;
  997|       |  uint64_t b32 = Uint128Low64(rhs) >> 32;
  998|       |  uint64_t b00 = Uint128Low64(rhs) & 0xffffffff;
  999|       |  uint128 result =
 1000|       |      MakeUint128(Uint128High64(lhs) * Uint128Low64(rhs) +
 1001|       |                      Uint128Low64(lhs) * Uint128High64(rhs) + a32 * b32,
 1002|       |                  a00 * b00);
 1003|       |  result += uint128(a32 * b00) << 32;
 1004|       |  result += uint128(a00 * b32) << 32;
 1005|       |  return result;
 1006|       |#endif  // ABSL_HAVE_INTRINSIC128
 1007|  19.0k|}
_ZN4absl12lts_20240116dvENS0_7uint128ES1_:
 1010|     70|inline uint128 operator/(uint128 lhs, uint128 rhs) {
 1011|     70|  return static_cast<unsigned __int128>(lhs) /
 1012|     70|         static_cast<unsigned __int128>(rhs);
 1013|     70|}
_ZN4absl12lts_20240116rmENS0_7uint128ES1_:
 1015|     70|inline uint128 operator%(uint128 lhs, uint128 rhs) {
 1016|     70|  return static_cast<unsigned __int128>(lhs) %
 1017|     70|         static_cast<unsigned __int128>(rhs);
 1018|     70|}
_ZN4absl12lts_202401167uint128ppEv:
 1035|    869|inline uint128& uint128::operator++() {
 1036|    869|  *this += 1;
 1037|    869|  return *this;
 1038|    869|}
_ZN4absl12lts_2024011615int128_internal15BitCastToSignedEm:
 1139|     36|constexpr int64_t BitCastToSigned(uint64_t v) {
 1140|       |  // Casting an unsigned integer to a signed integer of the same
 1141|       |  // width is implementation defined behavior if the source value would not fit
 1142|       |  // in the destination type. We step around it with a roundtrip bitwise not
 1143|       |  // operation to make sure this function remains constexpr. Clang, GCC, and
 1144|       |  // MSVC optimize this to a no-op on x86-64.
 1145|     36|  return v & (uint64_t{1} << 63) ? ~static_cast<int64_t>(~v)
  ------------------
  |  Branch (1145:10): [True: 3, False: 33]
  ------------------
 1146|     36|                                 : static_cast<int64_t>(v);
 1147|     36|}
_ZN4absl12lts_202401167uint128C2Eo:
  674|   113k|    : lo_{static_cast<uint64_t>(v & ~uint64_t{0})},
  675|   113k|      hi_{static_cast<uint64_t>(v >> 64)} {}
_ZN4absl12lts_202401167uint128C2Em:
  665|  43.5k|constexpr uint128::uint128(unsigned long v) : lo_{v}, hi_{0} {}
_ZN4absl12lts_202401167uint128C2ENS0_6int128E:
  679|     36|    : lo_{Int128Low64(v)}, hi_{static_cast<uint64_t>(Int128High64(v))} {}
_ZN4absl12lts_202401167uint128C2Ei:
  654|  55.2k|    : lo_{static_cast<uint64_t>(v)},
  655|  55.2k|      hi_{v < 0 ? (std::numeric_limits<uint64_t>::max)() : 0} {}
  ------------------
  |  Branch (655:11): [True: 0, False: 55.2k]
  ------------------

_ZN4absl12lts_2024011611Int128Low64ENS0_6int128E:
   42|     36|constexpr uint64_t Int128Low64(int128 v) {
   43|     36|  return static_cast<uint64_t>(v.v_ & ~uint64_t{0});
   44|     36|}
_ZN4absl12lts_2024011612Int128High64ENS0_6int128E:
   46|     36|constexpr int64_t Int128High64(int128 v) {
   47|       |  // Initially cast to unsigned to prevent a right shift on a negative value.
   48|     36|  return int128_internal::BitCastToSigned(
   49|     36|      static_cast<uint64_t>(static_cast<unsigned __int128>(v.v_) >> 64));
   50|     36|}
_ZNK4absl12lts_202401166int128cvnEv:
  146|     72|constexpr int128::operator __int128() const { return v_; }
_ZN4absl12lts_20240116ltENS0_6int128ES1_:
  207|     36|constexpr bool operator<(int128 lhs, int128 rhs) {
  208|     36|  return static_cast<__int128>(lhs) < static_cast<__int128>(rhs);
  209|     36|}
_ZN4absl12lts_202401166int128C2Ei:
   59|     72|constexpr int128::int128(int v) : v_{v} {}

_ZN4absl12lts_2024011616numeric_internal20CountLeadingZeroes32Ej:
  133|      4|CountLeadingZeroes32(uint32_t x) {
  134|      4|#if ABSL_NUMERIC_INTERNAL_HAVE_BUILTIN_OR_GCC(__builtin_clz)
  135|       |  // Use __builtin_clz, which uses the following instructions:
  136|       |  //  x86: bsr, lzcnt
  137|       |  //  ARM64: clz
  138|       |  //  PPC: cntlzd
  139|       |
  140|      4|  static_assert(sizeof(unsigned int) == sizeof(x),
  141|      4|                "__builtin_clz does not take 32-bit arg");
  142|       |  // Handle 0 as a special case because __builtin_clz(0) is undefined.
  143|      4|  return x == 0 ? 32 : __builtin_clz(x);
  ------------------
  |  Branch (143:10): [True: 0, False: 4]
  ------------------
  144|       |#elif defined(_MSC_VER) && !defined(__clang__)
  145|       |  unsigned long result = 0;  // NOLINT(runtime/int)
  146|       |  if (_BitScanReverse(&result, x)) {
  147|       |    return 31 - result;
  148|       |  }
  149|       |  return 32;
  150|       |#else
  151|       |  int zeroes = 28;
  152|       |  if (x >> 16) {
  153|       |    zeroes -= 16;
  154|       |    x >>= 16;
  155|       |  }
  156|       |  if (x >> 8) {
  157|       |    zeroes -= 8;
  158|       |    x >>= 8;
  159|       |  }
  160|       |  if (x >> 4) {
  161|       |    zeroes -= 4;
  162|       |    x >>= 4;
  163|       |  }
  164|       |  return "\4\3\2\2\1\1\1\1\0\0\0\0\0\0\0"[x] + zeroes;
  165|       |#endif
  166|      4|}
_ZN4absl12lts_2024011616numeric_internal20CountLeadingZeroes64Em:
  180|  21.2k|CountLeadingZeroes64(uint64_t x) {
  181|  21.2k|#if ABSL_NUMERIC_INTERNAL_HAVE_BUILTIN_OR_GCC(__builtin_clzll)
  182|       |  // Use __builtin_clzll, which uses the following instructions:
  183|       |  //  x86: bsr, lzcnt
  184|       |  //  ARM64: clz
  185|       |  //  PPC: cntlzd
  186|  21.2k|  static_assert(sizeof(unsigned long long) == sizeof(x),  // NOLINT(runtime/int)
  187|  21.2k|                "__builtin_clzll does not take 64-bit arg");
  188|       |
  189|       |  // Handle 0 as a special case because __builtin_clzll(0) is undefined.
  190|  21.2k|  return x == 0 ? 64 : __builtin_clzll(x);
  ------------------
  |  Branch (190:10): [True: 0, False: 21.2k]
  ------------------
  191|       |#elif defined(_MSC_VER) && !defined(__clang__) && \
  192|       |    (defined(_M_X64) || defined(_M_ARM64))
  193|       |  // MSVC does not have __buitin_clzll. Use _BitScanReverse64.
  194|       |  unsigned long result = 0;  // NOLINT(runtime/int)
  195|       |  if (_BitScanReverse64(&result, x)) {
  196|       |    return 63 - result;
  197|       |  }
  198|       |  return 64;
  199|       |#elif defined(_MSC_VER) && !defined(__clang__)
  200|       |  // MSVC does not have __buitin_clzll. Compose two calls to _BitScanReverse
  201|       |  unsigned long result = 0;  // NOLINT(runtime/int)
  202|       |  if ((x >> 32) &&
  203|       |      _BitScanReverse(&result, static_cast<unsigned long>(x >> 32))) {
  204|       |    return 31 - result;
  205|       |  }
  206|       |  if (_BitScanReverse(&result, static_cast<unsigned long>(x))) {
  207|       |    return 63 - result;
  208|       |  }
  209|       |  return 64;
  210|       |#else
  211|       |  int zeroes = 60;
  212|       |  if (x >> 32) {
  213|       |    zeroes -= 32;
  214|       |    x >>= 32;
  215|       |  }
  216|       |  if (x >> 16) {
  217|       |    zeroes -= 16;
  218|       |    x >>= 16;
  219|       |  }
  220|       |  if (x >> 8) {
  221|       |    zeroes -= 8;
  222|       |    x >>= 8;
  223|       |  }
  224|       |  if (x >> 4) {
  225|       |    zeroes -= 4;
  226|       |    x >>= 4;
  227|       |  }
  228|       |  return "\4\3\2\2\1\1\1\1\0\0\0\0\0\0\0"[x] + zeroes;
  229|       |#endif
  230|  21.2k|}
_ZN4absl12lts_2024011616numeric_internal28CountTrailingZeroesNonzero32Ej:
  251|     13|CountTrailingZeroesNonzero32(uint32_t x) {
  252|     13|#if ABSL_NUMERIC_INTERNAL_HAVE_BUILTIN_OR_GCC(__builtin_ctz)
  253|     13|  static_assert(sizeof(unsigned int) == sizeof(x),
  254|     13|                "__builtin_ctz does not take 32-bit arg");
  255|     13|  return __builtin_ctz(x);
  256|       |#elif defined(_MSC_VER) && !defined(__clang__)
  257|       |  unsigned long result = 0;  // NOLINT(runtime/int)
  258|       |  _BitScanForward(&result, x);
  259|       |  return result;
  260|       |#else
  261|       |  int c = 31;
  262|       |  x &= ~x + 1;
  263|       |  if (x & 0x0000FFFF) c -= 16;
  264|       |  if (x & 0x00FF00FF) c -= 8;
  265|       |  if (x & 0x0F0F0F0F) c -= 4;
  266|       |  if (x & 0x33333333) c -= 2;
  267|       |  if (x & 0x55555555) c -= 1;
  268|       |  return c;
  269|       |#endif
  270|     13|}
_ZN4absl12lts_2024011616numeric_internal28CountTrailingZeroesNonzero16Et:
  305|     62|CountTrailingZeroesNonzero16(uint16_t x) {
  306|     62|#if ABSL_HAVE_BUILTIN(__builtin_ctzs)
  307|     62|  static_assert(sizeof(unsigned short) == sizeof(x),  // NOLINT(runtime/int)
  308|     62|                "__builtin_ctzs does not take 16-bit arg");
  309|     62|  return __builtin_ctzs(x);
  310|       |#else
  311|       |  return CountTrailingZeroesNonzero32(x);
  312|       |#endif
  313|     62|}
_ZN4absl12lts_2024011616numeric_internal11RotateRightImEET_S3_i:
   75|     26|    T x, int s) noexcept {
   76|     26|  static_assert(std::is_unsigned<T>::value, "T must be unsigned");
   77|     26|  static_assert(IsPowerOf2(std::numeric_limits<T>::digits),
   78|     26|                "T must have a power-of-2 size");
   79|       |
   80|     26|  return static_cast<T>(x >> (s & (std::numeric_limits<T>::digits - 1))) |
   81|     26|         static_cast<T>(x << ((-s) & (std::numeric_limits<T>::digits - 1)));
   82|     26|}
_ZN4absl12lts_2024011616numeric_internal18CountLeadingZeroesImEEiT_:
  234|  21.2k|CountLeadingZeroes(T x) {
  235|  21.2k|  static_assert(std::is_unsigned<T>::value, "T must be unsigned");
  236|  21.2k|  static_assert(IsPowerOf2(std::numeric_limits<T>::digits),
  237|  21.2k|                "T must have a power-of-2 size");
  238|  21.2k|  static_assert(sizeof(T) <= sizeof(uint64_t), "T too large");
  239|  21.2k|  return sizeof(T) <= sizeof(uint16_t)
  ------------------
  |  Branch (239:10): [Folded, False: 21.2k]
  ------------------
  240|  21.2k|             ? CountLeadingZeroes16(static_cast<uint16_t>(x)) -
  241|      0|                   (std::numeric_limits<uint16_t>::digits -
  242|      0|                    std::numeric_limits<T>::digits)
  243|  21.2k|             : (sizeof(T) <= sizeof(uint32_t)
  ------------------
  |  Branch (243:17): [Folded, False: 21.2k]
  ------------------
  244|  21.2k|                    ? CountLeadingZeroes32(static_cast<uint32_t>(x)) -
  245|      0|                          (std::numeric_limits<uint32_t>::digits -
  246|      0|                           std::numeric_limits<T>::digits)
  247|  21.2k|                    : CountLeadingZeroes64(x));
  248|  21.2k|}
_ZN4absl12lts_2024011616numeric_internal19CountTrailingZeroesIjEEiT_:
  317|     13|CountTrailingZeroes(T x) noexcept {
  318|     13|  static_assert(std::is_unsigned<T>::value, "T must be unsigned");
  319|     13|  static_assert(IsPowerOf2(std::numeric_limits<T>::digits),
  320|     13|                "T must have a power-of-2 size");
  321|     13|  static_assert(sizeof(T) <= sizeof(uint64_t), "T too large");
  322|     13|  return x == 0 ? std::numeric_limits<T>::digits
  ------------------
  |  Branch (322:10): [True: 0, False: 13]
  ------------------
  323|     13|                : (sizeof(T) <= sizeof(uint16_t)
  ------------------
  |  Branch (323:20): [Folded, False: 13]
  ------------------
  324|     13|                       ? CountTrailingZeroesNonzero16(static_cast<uint16_t>(x))
  325|     13|                       : (sizeof(T) <= sizeof(uint32_t)
  ------------------
  |  Branch (325:27): [True: 13, Folded]
  ------------------
  326|     13|                              ? CountTrailingZeroesNonzero32(
  327|     13|                                    static_cast<uint32_t>(x))
  328|     13|                              : CountTrailingZeroesNonzero64(x)));
  329|     13|}
_ZN4absl12lts_2024011616numeric_internal19CountTrailingZeroesItEEiT_:
  317|     62|CountTrailingZeroes(T x) noexcept {
  318|     62|  static_assert(std::is_unsigned<T>::value, "T must be unsigned");
  319|     62|  static_assert(IsPowerOf2(std::numeric_limits<T>::digits),
  320|     62|                "T must have a power-of-2 size");
  321|     62|  static_assert(sizeof(T) <= sizeof(uint64_t), "T too large");
  322|     62|  return x == 0 ? std::numeric_limits<T>::digits
  ------------------
  |  Branch (322:10): [True: 0, False: 62]
  ------------------
  323|     62|                : (sizeof(T) <= sizeof(uint16_t)
  ------------------
  |  Branch (323:20): [True: 62, Folded]
  ------------------
  324|     62|                       ? CountTrailingZeroesNonzero16(static_cast<uint16_t>(x))
  325|     62|                       : (sizeof(T) <= sizeof(uint32_t)
  ------------------
  |  Branch (325:27): [True: 0, Folded]
  ------------------
  326|      0|                              ? CountTrailingZeroesNonzero32(
  327|      0|                                    static_cast<uint32_t>(x))
  328|      0|                              : CountTrailingZeroesNonzero64(x)));
  329|     62|}
_ZN4absl12lts_2024011616numeric_internal18CountLeadingZeroesIjEEiT_:
  234|      4|CountLeadingZeroes(T x) {
  235|      4|  static_assert(std::is_unsigned<T>::value, "T must be unsigned");
  236|      4|  static_assert(IsPowerOf2(std::numeric_limits<T>::digits),
  237|      4|                "T must have a power-of-2 size");
  238|      4|  static_assert(sizeof(T) <= sizeof(uint64_t), "T too large");
  239|      4|  return sizeof(T) <= sizeof(uint16_t)
  ------------------
  |  Branch (239:10): [Folded, False: 4]
  ------------------
  240|      4|             ? CountLeadingZeroes16(static_cast<uint16_t>(x)) -
  241|      0|                   (std::numeric_limits<uint16_t>::digits -
  242|      0|                    std::numeric_limits<T>::digits)
  243|      4|             : (sizeof(T) <= sizeof(uint32_t)
  ------------------
  |  Branch (243:17): [True: 4, Folded]
  ------------------
  244|      4|                    ? CountLeadingZeroes32(static_cast<uint32_t>(x)) -
  245|      4|                          (std::numeric_limits<uint32_t>::digits -
  246|      4|                           std::numeric_limits<T>::digits)
  247|      4|                    : CountLeadingZeroes64(x));
  248|      4|}

_ZN4absl12lts_2024011615random_internal24InitDiscreteDistributionEPNSt3__16vectorIdNS2_9allocatorIdEEEE:
   24|      2|    std::vector<double>* probabilities) {
   25|       |  // The empty-case should already be handled by the constructor.
   26|      2|  assert(probabilities);
   27|      2|  assert(!probabilities->empty());
   28|       |
   29|       |  // Step 1. Normalize the input probabilities to 1.0.
   30|      2|  double sum = std::accumulate(std::begin(*probabilities),
   31|      2|                               std::end(*probabilities), 0.0);
   32|      2|  if (std::fabs(sum - 1.0) > 1e-6) {
  ------------------
  |  Branch (32:7): [True: 2, False: 0]
  ------------------
   33|       |    // Scale `probabilities` only when the sum is too far from 1.0.  Scaling
   34|       |    // unconditionally will alter the probabilities slightly.
   35|      2|    for (double& item : *probabilities) {
  ------------------
  |  Branch (35:23): [True: 2, False: 2]
  ------------------
   36|      2|      item = item / sum;
   37|      2|    }
   38|      2|  }
   39|       |
   40|       |  // Step 2. At this point `probabilities` is set to the conditional
   41|       |  // probabilities of each element which sum to 1.0, to within reasonable error.
   42|       |  // These values are used to construct the proportional probability tables for
   43|       |  // the selection phases of Walker's Aliasing algorithm.
   44|       |  //
   45|       |  // To construct the table, pick an element which is under-full (i.e., an
   46|       |  // element for which `(*probabilities)[i] < 1.0/n`), and pair it with an
   47|       |  // element which is over-full (i.e., an element for which
   48|       |  // `(*probabilities)[i] > 1.0/n`). The smaller value can always be retired.
   49|       |  // The larger may still be greater than 1.0/n, or may now be less than 1.0/n,
   50|       |  // and put back onto the appropriate collection.
   51|      2|  const size_t n = probabilities->size();
   52|      2|  std::vector<std::pair<double, size_t>> q;
   53|      2|  q.reserve(n);
   54|       |
   55|      2|  std::vector<size_t> over;
   56|      2|  std::vector<size_t> under;
   57|      2|  size_t idx = 0;
   58|      2|  for (const double item : *probabilities) {
  ------------------
  |  Branch (58:26): [True: 2, False: 2]
  ------------------
   59|      2|    assert(item >= 0);
   60|      2|    const double v = item * n;
   61|      2|    q.emplace_back(v, 0);
   62|      2|    if (v < 1.0) {
  ------------------
  |  Branch (62:9): [True: 0, False: 2]
  ------------------
   63|      0|      under.push_back(idx++);
   64|      2|    } else {
   65|      2|      over.push_back(idx++);
   66|      2|    }
   67|      2|  }
   68|      2|  while (!over.empty() && !under.empty()) {
  ------------------
  |  Branch (68:10): [True: 2, False: 0]
  |  Branch (68:27): [True: 0, False: 2]
  ------------------
   69|      0|    auto lo = under.back();
   70|      0|    under.pop_back();
   71|      0|    auto hi = over.back();
   72|      0|    over.pop_back();
   73|       |
   74|      0|    q[lo].second = hi;
   75|      0|    const double r = q[hi].first - (1.0 - q[lo].first);
   76|      0|    q[hi].first = r;
   77|      0|    if (r < 1.0) {
  ------------------
  |  Branch (77:9): [True: 0, False: 0]
  ------------------
   78|      0|      under.push_back(hi);
   79|      0|    } else {
   80|      0|      over.push_back(hi);
   81|      0|    }
   82|      0|  }
   83|       |
   84|       |  // Due to rounding errors, there may be un-paired elements in either
   85|       |  // collection; these should all be values near 1.0.  For these values, set `q`
   86|       |  // to 1.0 and set the alternate to the identity.
   87|      2|  for (auto i : over) {
  ------------------
  |  Branch (87:15): [True: 2, False: 2]
  ------------------
   88|      2|    q[i] = {1.0, i};
   89|      2|  }
   90|      2|  for (auto i : under) {
  ------------------
  |  Branch (90:15): [True: 0, False: 2]
  ------------------
   91|      0|    q[i] = {1.0, i};
   92|      0|  }
   93|      2|  return q;
   94|      2|}

_ZN4absl12lts_2024011621discrete_distributionIiEC2Ev:
  111|      2|  discrete_distribution() : param_() {}
_ZN4absl12lts_2024011621discrete_distributionIiE10param_typeC2Ev:
   60|      2|    param_type() { init(); }
_ZN4absl12lts_2024011621discrete_distributionIiE10param_type4initEv:
  183|      4|void discrete_distribution<IntType>::param_type::init() {
  184|      4|  if (p_.empty()) {
  ------------------
  |  Branch (184:7): [True: 2, False: 2]
  ------------------
  185|      2|    p_.push_back(1.0);
  186|      2|    q_.emplace_back(1.0, 0);
  187|      2|  } else {
  188|       |    assert(n() <= (std::numeric_limits<IntType>::max)());
  189|      2|    q_ = random_internal::InitDiscreteDistribution(&p_);
  190|      2|  }
  191|      4|}
_ZN4absl12lts_2024011621discrete_distributionIiEC2INSt3__111__wrap_iterIPdEEEET_S8_:
  117|      2|      : param_(begin, end) {}
_ZN4absl12lts_2024011621discrete_distributionIiE10param_typeC2INSt3__111__wrap_iterIPdEEEET_S9_:
   64|      2|        : p_(begin, end) {
   65|      2|      init();
   66|      2|    }

_ZN4absl12lts_202401167UniformIjRNS0_15random_internal17NonsecureURBGBaseINS2_13randen_engineImEENS2_17RandenPoolSeedSeqEEEEENSt3__19enable_ifIXntsr3std9is_signedIT_EE5valueESB_E4typeEOT0_:
  212|     16|Uniform(URBG&& urbg) {  // NOLINT(runtime/references)
  213|     16|  using gen_t = absl::decay_t<URBG>;
  214|     16|  using distribution_t = random_internal::UniformDistributionWrapper<R>;
  215|       |
  216|     16|  return random_internal::DistributionCaller<gen_t>::template Call<
  217|     16|      distribution_t>(&urbg);
  218|     16|}

_ZN4absl12lts_2024011615random_internal18DistributionCallerINS1_17NonsecureURBGBaseINS1_13randen_engineImEENS1_17RandenPoolSeedSeqEEEE4CallINS1_26UniformDistributionWrapperIjEEJEEENT_11result_typeEPS7_DpOT0_:
   85|     16|  static typename DistrT::result_type Call(URBG* urbg, Args&&... args) {
   86|     16|    return Impl<DistrT, Args...>(HasInvokeMock{}, urbg,
   87|     16|                                 std::forward<Args>(args)...);
   88|     16|  }
_ZN4absl12lts_2024011615random_internal18DistributionCallerINS1_17NonsecureURBGBaseINS1_13randen_engineImEENS1_17RandenPoolSeedSeqEEEE4ImplINS1_26UniformDistributionWrapperIjEEJEEENT_11result_typeENSt3__117integral_constantIbLb0EEEPS7_DpOT0_:
   59|     16|                                           Args&&... args) {
   60|     16|    DistrT dist(std::forward<Args>(args)...);
   61|     16|    return dist(*urbg);
   62|     16|  }

_ZN4absl12lts_2024011615random_internal15FastUniformBitsIjEclINS1_17NonsecureURBGBaseINS1_13randen_engineImEENS1_17RandenPoolSeedSeqEEEEEjRT_:
  122|     16|FastUniformBits<UIntType>::operator()(URBG& g) {  // NOLINT(runtime/references)
  123|       |  // kRangeMask is the mask used when sampling variates from the URBG when the
  124|       |  // width of the URBG range is not a power of 2.
  125|       |  // Y = (2 ^ kRange) - 1
  126|     16|  static_assert((URBG::max)() > (URBG::min)(),
  127|     16|                "URBG::max and URBG::min may not be equal.");
  128|       |
  129|     16|  using tag = absl::conditional_t<IsPowerOfTwoOrZero(RangeSize<URBG>()),
  130|     16|                                  SimplifiedLoopTag, RejectionLoopTag>;
  131|     16|  return Generate(g, tag{});
  132|     16|}
_ZN4absl12lts_2024011615random_internal15FastUniformBitsIjE8GenerateINS1_17NonsecureURBGBaseINS1_13randen_engineImEENS1_17RandenPoolSeedSeqEEEEEjRT_NS1_17SimplifiedLoopTagE:
  138|     16|                                    SimplifiedLoopTag) {
  139|       |  // The simplified version of FastUniformBits works only on URBGs that have
  140|       |  // a range that is a power of 2. In this case we simply loop and shift without
  141|       |  // attempting to balance the bits across calls.
  142|     16|  static_assert(IsPowerOfTwoOrZero(RangeSize<URBG>()),
  143|     16|                "incorrect Generate tag for URBG instance");
  144|       |
  145|     16|  static constexpr size_t kResultBits =
  146|     16|      std::numeric_limits<result_type>::digits;
  147|     16|  static constexpr size_t kUrbgBits = NumBits<URBG>();
  148|     16|  static constexpr size_t kIters =
  149|     16|      (kResultBits / kUrbgBits) + (kResultBits % kUrbgBits != 0);
  150|     16|  static constexpr size_t kShift = (kIters == 1) ? 0 : kUrbgBits;
  ------------------
  |  Branch (150:36): [True: 0, Folded]
  ------------------
  151|     16|  static constexpr auto kMin = (URBG::min)();
  152|       |
  153|     16|  result_type r = static_cast<result_type>(g() - kMin);
  154|     16|  for (size_t n = 1; n < kIters; ++n) {
  ------------------
  |  Branch (154:22): [True: 0, False: 16]
  ------------------
  155|      0|    r = static_cast<result_type>(r << kShift) +
  156|      0|        static_cast<result_type>(g() - kMin);
  157|      0|  }
  158|     16|  return r;
  159|     16|}

_ZN4absl12lts_2024011615random_internal17RandenPoolSeedSeq4sizeEv:
   67|      2|  size_t size() { return 0; }
_ZN4absl12lts_2024011615random_internal17NonsecureURBGBaseINS1_13randen_engineImEENS1_17RandenPoolSeedSeqEEC2Ev:
  101|      2|  NonsecureURBGBase() : urbg_(ConstructURBG()) {}
_ZN4absl12lts_2024011615random_internal17NonsecureURBGBaseINS1_13randen_engineImEENS1_17RandenPoolSeedSeqEE13ConstructURBGEv:
  142|      2|  static URBG ConstructURBG() {
  143|      2|    Seeder seeder;
  144|      2|    return URBG(seeder);
  145|      2|  }
_ZN4absl12lts_2024011615random_internal17RandenPoolSeedSeq8generateIPjEEvT_S5_:
   73|      2|  void generate(RandomAccessIterator begin, RandomAccessIterator end) {
   74|       |    // RandomAccessIterator must be assignable from uint32_t
   75|      2|    if (begin != end) {
  ------------------
  |  Branch (75:9): [True: 2, False: 0]
  ------------------
   76|      2|      using U = typename std::iterator_traits<RandomAccessIterator>::value_type;
   77|       |      // ContiguousTag indicates the common case of a known contiguous buffer,
   78|       |      // which allows directly filling the buffer. In C++20,
   79|       |      // std::contiguous_iterator_tag provides a mechanism for testing this
   80|       |      // capability, however until Abseil's support requirements allow us to
   81|       |      // assume C++20, limit checks to a few common cases.
   82|      2|      using TagType = absl::conditional_t<
   83|      2|          (std::is_pointer<RandomAccessIterator>::value ||
   84|      2|           std::is_same<RandomAccessIterator,
   85|      2|                        typename std::vector<U>::iterator>::value),
   86|      2|          ContiguousTag, BufferTag>;
   87|       |
   88|      2|      generate_impl(TagType{}, begin, end);
   89|      2|    }
   90|      2|  }
_ZN4absl12lts_2024011615random_internal17RandenPoolSeedSeq13generate_implIPjEEvNS2_13ContiguousTagET_S6_:
   46|      2|  void generate_impl(ContiguousTag, Contiguous begin, Contiguous end) {
   47|      2|    const size_t n = static_cast<size_t>(std::distance(begin, end));
   48|      2|    auto* a = &(*begin);
   49|      2|    RandenPool<uint8_t>::Fill(
   50|      2|        absl::MakeSpan(reinterpret_cast<uint8_t*>(a), sizeof(*a) * n));
   51|      2|  }
_ZN4absl12lts_2024011615random_internal17NonsecureURBGBaseINS1_13randen_engineImEENS1_17RandenPoolSeedSeqEEclEv:
  126|     16|  result_type operator()() { return urbg_(); }

_ZN4absl12lts_2024011615random_internal10RandenPoolIhE4FillENS0_4SpanIhEE:
  240|      2|void RandenPool<T>::Fill(absl::Span<result_type> data) {
  241|      2|  auto* pool = GetPoolForCurrentThread();
  242|      2|  pool->Fill(reinterpret_cast<uint8_t*>(data.data()),
  243|      2|             data.size() * sizeof(result_type));
  244|      2|}
pool_urbg.cc:_ZN4absl12lts_2024011615random_internal12_GLOBAL__N_115RandenPoolEntry11MaybeRefillEv:
   69|      2|  inline void MaybeRefill() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_) {
   70|      2|    if (next_ >= kState) {
  ------------------
  |  Branch (70:9): [True: 2, False: 0]
  ------------------
   71|      2|      next_ = kCapacity;
   72|      2|      impl_.Generate(state_);
   73|      2|    }
   74|      2|  }
pool_urbg.cc:_ZN4absl12lts_2024011615random_internal12_GLOBAL__N_115RandenPoolEntry4FillEPhm:
  120|      2|void RandenPoolEntry::Fill(uint8_t* out, size_t bytes) {
  121|      2|  SpinLockHolder l(&mu_);
  122|      4|  while (bytes > 0) {
  ------------------
  |  Branch (122:10): [True: 2, False: 2]
  ------------------
  123|      2|    MaybeRefill();
  124|      2|    size_t remaining = (kState - next_) * sizeof(state_[0]);
  125|      2|    size_t to_copy = std::min(bytes, remaining);
  126|      2|    std::memcpy(out, &state_[next_], to_copy);
  127|      2|    out += to_copy;
  128|      2|    bytes -= to_copy;
  129|      2|    next_ += (to_copy + sizeof(state_[0]) - 1) / sizeof(state_[0]);
  130|      2|  }
  131|      2|}
pool_urbg.cc:_ZN4absl12lts_2024011615random_internal12_GLOBAL__N_123GetPoolForCurrentThreadEv:
  226|      2|RandenPoolEntry* GetPoolForCurrentThread() {
  227|      2|  absl::call_once(pool_once, InitPoolURBG);
  228|      2|  return shared_pools[GetPoolID()];
  229|      2|}
pool_urbg.cc:_ZN4absl12lts_2024011615random_internal12_GLOBAL__N_112InitPoolURBGEv:
  209|      2|void InitPoolURBG() {
  210|      2|  static constexpr size_t kSeedSize =
  211|      2|      RandenTraits::kStateBytes / sizeof(uint32_t);
  212|       |  // Read the seed data from OS entropy once.
  213|      2|  uint32_t seed_material[kPoolSize * kSeedSize];
  214|      2|  if (!random_internal::ReadSeedMaterialFromOSEntropy(
  ------------------
  |  Branch (214:7): [True: 0, False: 2]
  ------------------
  215|      2|          absl::MakeSpan(seed_material))) {
  216|      0|    random_internal::ThrowSeedGenException();
  217|      0|  }
  218|     18|  for (size_t i = 0; i < kPoolSize; i++) {
  ------------------
  |  Branch (218:22): [True: 16, False: 2]
  ------------------
  219|     16|    shared_pools[i] = PoolAlignedAlloc();
  220|     16|    shared_pools[i]->Init(
  221|     16|        absl::MakeSpan(&seed_material[i * kSeedSize], kSeedSize));
  222|     16|  }
  223|      2|}
pool_urbg.cc:_ZN4absl12lts_2024011615random_internal12_GLOBAL__N_116PoolAlignedAllocEv:
  190|     16|RandenPoolEntry* PoolAlignedAlloc() {
  191|     16|  constexpr size_t kAlignment =
  192|     16|      ABSL_CACHELINE_SIZE > 32 ? ABSL_CACHELINE_SIZE : 32;
  ------------------
  |  |   77|     16|#define ABSL_CACHELINE_SIZE 64
  ------------------
                    ABSL_CACHELINE_SIZE > 32 ? ABSL_CACHELINE_SIZE : 32;
  ------------------
  |  |   77|      0|#define ABSL_CACHELINE_SIZE 64
  ------------------
  |  Branch (192:7): [True: 0, Folded]
  ------------------
  193|       |
  194|       |  // Not all the platforms that we build for have std::aligned_alloc, however
  195|       |  // since we never free these objects, we can over allocate and munge the
  196|       |  // pointers to the correct alignment.
  197|     16|  uintptr_t x = reinterpret_cast<uintptr_t>(
  198|     16|      new char[sizeof(RandenPoolEntry) + kAlignment]);
  199|     16|  auto y = x % kAlignment;
  200|     16|  void* aligned = reinterpret_cast<void*>(y == 0 ? x : (x + kAlignment - y));
  ------------------
  |  Branch (200:43): [True: 7, False: 9]
  ------------------
  201|     16|  return new (aligned) RandenPoolEntry();
  202|     16|}
pool_urbg.cc:_ZN4absl12lts_2024011615random_internal12_GLOBAL__N_115RandenPoolEntry4InitENS0_4SpanIKjEE:
   56|     16|  void Init(absl::Span<const uint32_t> data) {
   57|     16|    SpinLockHolder l(&mu_);  // Always uncontested.
   58|     16|    std::copy(data.begin(), data.end(), std::begin(state_));
   59|     16|    next_ = kState;
   60|     16|  }
pool_urbg.cc:_ZN4absl12lts_2024011615random_internal12_GLOBAL__N_19GetPoolIDEv:
  150|      2|size_t GetPoolID() {
  151|      2|  static_assert(kPoolSize >= 1,
  152|      2|                "At least one urbg instance is required for PoolURBG");
  153|       |
  154|      2|  ABSL_CONST_INIT static std::atomic<uint64_t> sequence{0};
  ------------------
  |  |  745|      2|#define ABSL_CONST_INIT [[clang::require_constant_initialization]]
  ------------------
  155|       |
  156|      2|#ifdef ABSL_HAVE_THREAD_LOCAL
  157|      2|  static thread_local size_t my_pool_id = kPoolSize;
  158|      2|  if (ABSL_PREDICT_FALSE(my_pool_id == kPoolSize)) {
  ------------------
  |  |  178|      2|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 2, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 2]
  |  |  |  Branch (178:58): [True: 2, False: 0]
  |  |  ------------------
  ------------------
  159|      2|    my_pool_id = (sequence++ % kPoolSize);
  160|      2|  }
  161|      2|  return my_pool_id;
  162|       |#else
  163|       |  static pthread_key_t tid_key = [] {
  164|       |    pthread_key_t tmp_key;
  165|       |    int err = pthread_key_create(&tmp_key, nullptr);
  166|       |    if (err) {
  167|       |      ABSL_RAW_LOG(FATAL, "pthread_key_create failed with %d", err);
  168|       |    }
  169|       |    return tmp_key;
  170|       |  }();
  171|       |
  172|       |  // Store the value in the pthread_{get/set}specific. However an uninitialized
  173|       |  // value is 0, so add +1 to distinguish from the null value.
  174|       |  uintptr_t my_pool_id =
  175|       |      reinterpret_cast<uintptr_t>(pthread_getspecific(tid_key));
  176|       |  if (ABSL_PREDICT_FALSE(my_pool_id == 0)) {
  177|       |    // No allocated ID, allocate the next value, cache it, and return.
  178|       |    my_pool_id = (sequence++ % kPoolSize) + 1;
  179|       |    int err = pthread_setspecific(tid_key, reinterpret_cast<void*>(my_pool_id));
  180|       |    if (err) {
  181|       |      ABSL_RAW_LOG(FATAL, "pthread_setspecific failed with %d", err);
  182|       |    }
  183|       |  }
  184|       |  return my_pool_id - 1;
  185|       |#endif
  186|      2|}

_ZN4absl12lts_2024011615random_internal6RandenC2Ev:
   81|     18|Randen::Randen() {
   82|     18|  auto tmp = GetRandenState();
   83|     18|  keys_ = tmp.keys;
   84|     18|#if ABSL_RANDOM_INTERNAL_AES_DISPATCH
   85|     18|  has_crypto_ = tmp.has_crypto;
   86|     18|#endif
   87|     18|}
randen.cc:_ZN4absl12lts_2024011615random_internal12_GLOBAL__N_114GetRandenStateEv:
   53|     18|RandenState GetRandenState() {
   54|     18|  static const RandenState state = []() {
   55|     18|    RandenState tmp;
   56|     18|#if ABSL_RANDOM_INTERNAL_AES_DISPATCH
   57|       |    // HW AES Dispatch.
   58|     18|    if (HasRandenHwAesImplementation() && CPUSupportsRandenHwAes()) {
   59|     18|      tmp.has_crypto = true;
   60|     18|      tmp.keys = RandenHwAes::GetKeys();
   61|     18|    } else {
   62|     18|      tmp.has_crypto = false;
   63|     18|      tmp.keys = RandenSlow::GetKeys();
   64|     18|    }
   65|       |#elif ABSL_HAVE_ACCELERATED_AES
   66|       |    // HW AES is enabled.
   67|       |    tmp.has_crypto = true;
   68|       |    tmp.keys = RandenHwAes::GetKeys();
   69|       |#else
   70|       |    // HW AES is disabled.
   71|       |    tmp.has_crypto = false;
   72|       |    tmp.keys = RandenSlow::GetKeys();
   73|       |#endif
   74|     18|    return tmp;
   75|     18|  }();
   76|     18|  return state;
   77|     18|}
randen.cc:_ZZN4absl12lts_2024011615random_internal12_GLOBAL__N_114GetRandenStateEvENK3$_0clEv:
   54|      2|  static const RandenState state = []() {
   55|      2|    RandenState tmp;
   56|      2|#if ABSL_RANDOM_INTERNAL_AES_DISPATCH
   57|       |    // HW AES Dispatch.
   58|      2|    if (HasRandenHwAesImplementation() && CPUSupportsRandenHwAes()) {
  ------------------
  |  Branch (58:9): [True: 2, False: 0]
  |  Branch (58:43): [True: 2, False: 0]
  ------------------
   59|      2|      tmp.has_crypto = true;
   60|      2|      tmp.keys = RandenHwAes::GetKeys();
   61|      2|    } else {
   62|      0|      tmp.has_crypto = false;
   63|      0|      tmp.keys = RandenSlow::GetKeys();
   64|      0|    }
   65|       |#elif ABSL_HAVE_ACCELERATED_AES
   66|       |    // HW AES is enabled.
   67|       |    tmp.has_crypto = true;
   68|       |    tmp.keys = RandenHwAes::GetKeys();
   69|       |#else
   70|       |    // HW AES is disabled.
   71|       |    tmp.has_crypto = false;
   72|       |    tmp.keys = RandenSlow::GetKeys();
   73|       |#endif
   74|      2|    return tmp;
   75|      2|  }();

_ZNK4absl12lts_2024011615random_internal6Randen8GenerateEPv:
   47|      4|  inline void Generate(void* state) const {
   48|      4|#if ABSL_RANDOM_INTERNAL_AES_DISPATCH
   49|       |    // HW AES Dispatch.
   50|      4|    if (has_crypto_) {
  ------------------
  |  Branch (50:9): [True: 4, False: 0]
  ------------------
   51|      4|      RandenHwAes::Generate(keys_, state);
   52|      4|    } else {
   53|      0|      RandenSlow::Generate(keys_, state);
   54|      0|    }
   55|       |#elif ABSL_HAVE_ACCELERATED_AES
   56|       |    // HW AES is enabled.
   57|       |    RandenHwAes::Generate(keys_, state);
   58|       |#else
   59|       |    // HW AES is disabled.
   60|       |    RandenSlow::Generate(keys_, state);
   61|       |#endif
   62|      4|  }
_ZNK4absl12lts_2024011615random_internal6Randen6AbsorbEPKvPv:
   68|      2|  inline void Absorb(const void* seed, void* state) const {
   69|      2|#if ABSL_RANDOM_INTERNAL_AES_DISPATCH
   70|       |    // HW AES Dispatch.
   71|      2|    if (has_crypto_) {
  ------------------
  |  Branch (71:9): [True: 2, False: 0]
  ------------------
   72|      2|      RandenHwAes::Absorb(seed, state);
   73|      2|    } else {
   74|      0|      RandenSlow::Absorb(seed, state);
   75|      0|    }
   76|       |#elif ABSL_HAVE_ACCELERATED_AES
   77|       |    // HW AES is enabled.
   78|       |    RandenHwAes::Absorb(seed, state);
   79|       |#else
   80|       |    // HW AES is disabled.
   81|       |    RandenSlow::Absorb(seed, state);
   82|       |#endif
   83|      2|  }

_ZN4absl12lts_2024011615random_internal22CPUSupportsRandenHwAesEv:
  135|      2|bool CPUSupportsRandenHwAes() {
  136|      2|#if defined(ABSL_INTERNAL_USE_X86_CPUID)
  137|       |  // 1. For x86: Use CPUID to detect the required AES instruction set.
  138|      2|  int regs[4];
  139|      2|  __cpuid(reinterpret_cast<int*>(regs), 1);
  140|      2|  return regs[2] & (1 << 25);  // AES
  141|       |
  142|       |#elif defined(ABSL_INTERNAL_USE_GETAUXVAL)
  143|       |  // 2. Use getauxval() to read the hardware bits and determine
  144|       |  // cpu capabilities.
  145|       |
  146|       |#define AT_HWCAP 16
  147|       |#define AT_HWCAP2 26
  148|       |#if defined(ABSL_ARCH_PPC)
  149|       |  // For Power / PPC: Expect that the cpu supports VCRYPTO
  150|       |  // See https://members.openpowerfoundation.org/document/dl/576
  151|       |  // VCRYPTO should be present in POWER8 >= 2.07.
  152|       |  // Uses Linux kernel constants from arch/powerpc/include/uapi/asm/cputable.h
  153|       |  static const uint32_t kVCRYPTO = 0x02000000;
  154|       |  const uint32_t hwcap = GetAuxval(AT_HWCAP2);
  155|       |  return (hwcap & kVCRYPTO) != 0;
  156|       |
  157|       |#elif defined(ABSL_ARCH_ARM)
  158|       |  // For ARM: Require crypto+neon
  159|       |  // http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0500f/CIHBIBBA.html
  160|       |  // Uses Linux kernel constants from arch/arm64/include/asm/hwcap.h
  161|       |  static const uint32_t kNEON = 1 << 12;
  162|       |  uint32_t hwcap = GetAuxval(AT_HWCAP);
  163|       |  if ((hwcap & kNEON) == 0) {
  164|       |    return false;
  165|       |  }
  166|       |
  167|       |  // And use it again to detect AES.
  168|       |  static const uint32_t kAES = 1 << 0;
  169|       |  const uint32_t hwcap2 = GetAuxval(AT_HWCAP2);
  170|       |  return (hwcap2 & kAES) != 0;
  171|       |
  172|       |#elif defined(ABSL_ARCH_AARCH64)
  173|       |  // For AARCH64: Require crypto+neon
  174|       |  // http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0500f/CIHBIBBA.html
  175|       |  static const uint32_t kNEON = 1 << 1;
  176|       |  static const uint32_t kAES = 1 << 3;
  177|       |  const uint32_t hwcap = GetAuxval(AT_HWCAP);
  178|       |  return ((hwcap & kNEON) != 0) && ((hwcap & kAES) != 0);
  179|       |#endif
  180|       |
  181|       |#else  // ABSL_INTERNAL_USE_GETAUXVAL
  182|       |  // 3. By default, assume that the compiler default.
  183|       |  return ABSL_HAVE_ACCELERATED_AES ? true : false;
  184|       |
  185|       |#endif
  186|       |  // NOTE: There are some other techniques that may be worth trying:
  187|       |  //
  188|       |  // * Use an environment variable: ABSL_RANDOM_USE_HWAES
  189|       |  //
  190|       |  // * Rely on compiler-generated target-based dispatch.
  191|       |  // Using x86/gcc it might look something like this:
  192|       |  //
  193|       |  // int __attribute__((target("aes"))) HasAes() { return 1; }
  194|       |  // int __attribute__((target("default"))) HasAes() { return 0; }
  195|       |  //
  196|       |  // This does not work on all architecture/compiler combinations.
  197|       |  //
  198|       |  // * On Linux consider reading /proc/cpuinfo and/or /proc/self/auxv.
  199|       |  // These files have lines which are easy to parse; for ARM/AARCH64 it is quite
  200|       |  // easy to find the Features: line and extract aes / neon. Likewise for
  201|       |  // PPC.
  202|       |  //
  203|       |  // * Fork a process and test for SIGILL:
  204|       |  //
  205|       |  // * Many architectures have instructions to read the ISA. Unfortunately
  206|       |  //   most of those require that the code is running in ring 0 /
  207|       |  //   protected-mode.
  208|       |  //
  209|       |  //   There are several examples. e.g. Valgrind detects PPC ISA 2.07:
  210|       |  //   https://github.com/lu-zero/valgrind/blob/master/none/tests/ppc64/test_isa_2_07_part1.c
  211|       |  //
  212|       |  //   MRS <Xt>, ID_AA64ISAR0_EL1 ; Read ID_AA64ISAR0_EL1 into Xt
  213|       |  //
  214|       |  //   uint64_t val;
  215|       |  //   __asm __volatile("mrs %0, id_aa64isar0_el1" :"=&r" (val));
  216|       |  //
  217|       |  // * Use a CPUID-style heuristic database.
  218|       |  //
  219|       |  // * On Apple (__APPLE__), AES is available on Arm v8.
  220|       |  //   https://stackoverflow.com/questions/45637888/how-to-determine-armv8-features-at-runtime-on-ios
  221|      2|}
randen_detect.cc:_ZL7__cpuidPii:
   54|      2|static void __cpuid(int cpu_info[4], int info_type) {
   55|      2|  __asm__ volatile("cpuid \n\t"
   56|      2|                   : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
   57|      2|                     "=d"(cpu_info[3])
   58|      2|                   : "a"(info_type), "c"(0));
   59|      2|}

_ZN4absl12lts_2024011615random_internal13randen_engineImEC2IRNS1_17RandenPoolSeedSeqEvEEOT_:
   67|      2|  explicit randen_engine(SeedSequence&& seq) {
   68|      2|    seed(seq);
   69|      2|  }
_ZN4absl12lts_2024011615random_internal13randen_engineImE4seedIRNS1_17RandenPoolSeedSeqEEENSt3__19enable_ifIXntsr3std14is_convertibleIT_mEE5valueEvE4typeEOS9_:
   97|      2|  seed(SeedSequence&& seq) {
   98|       |    // Zeroes the state.
   99|      2|    seed();
  100|      2|    reseed(seq);
  101|      2|  }
_ZN4absl12lts_2024011615random_internal13randen_engineImE4seedEm:
  103|      2|  void seed(result_type seed_value = 0) {
  104|      2|    next_ = kStateSizeT;
  105|       |    // Zeroes the inner state and fills the outer state with seed_value to
  106|       |    // mimic the behaviour of reseed
  107|      2|    auto* begin = state();
  108|      2|    std::fill(begin, begin + kCapacityT, 0);
  109|      2|    std::fill(begin + kCapacityT, begin + kStateSizeT, seed_value);
  110|      2|  }
_ZN4absl12lts_2024011615random_internal13randen_engineImE5stateEv:
  244|     20|  result_type* state() {
  245|     20|    return reinterpret_cast<result_type*>(
  246|     20|        (reinterpret_cast<uintptr_t>(&raw_state_) & 0xf) ? (raw_state_ + 8)
  ------------------
  |  Branch (246:9): [True: 0, False: 20]
  ------------------
  247|     20|                                                         : raw_state_);
  248|     20|  }
_ZN4absl12lts_2024011615random_internal13randen_engineImE6reseedINS1_17RandenPoolSeedSeqEEEvRT_:
  116|      2|  void reseed(SeedSequence& seq) {
  117|      2|    using sequence_result_type = typename SeedSequence::result_type;
  118|      2|    static_assert(sizeof(sequence_result_type) == 4,
  119|      2|                  "SeedSequence::result_type must be 32-bit");
  120|      2|    constexpr size_t kBufferSize =
  121|      2|        Randen::kSeedBytes / sizeof(sequence_result_type);
  122|      2|    alignas(16) sequence_result_type buffer[kBufferSize];
  123|       |
  124|       |    // Randen::Absorb XORs the seed into state, which is then mixed by a call
  125|       |    // to Randen::Generate. Seeding with only the provided entropy is preferred
  126|       |    // to using an arbitrary generate() call, so use [rand.req.seed_seq]
  127|       |    // size as a proxy for the number of entropy units that can be generated
  128|       |    // without relying on seed sequence mixing...
  129|      2|    const size_t entropy_size = seq.size();
  130|      2|    if (entropy_size < kBufferSize) {
  ------------------
  |  Branch (130:9): [True: 2, False: 0]
  ------------------
  131|       |      // ... and only request that many values, or 256-bits, when unspecified.
  132|      2|      const size_t requested_entropy = (entropy_size == 0) ? 8u : entropy_size;
  ------------------
  |  Branch (132:40): [True: 2, False: 0]
  ------------------
  133|      2|      std::fill(buffer + requested_entropy, buffer + kBufferSize, 0);
  134|      2|      seq.generate(buffer, buffer + requested_entropy);
  135|       |#ifdef ABSL_IS_BIG_ENDIAN
  136|       |      // Randen expects the seed buffer to be in Little Endian; reverse it on
  137|       |      // Big Endian platforms.
  138|       |      for (sequence_result_type& e : buffer) {
  139|       |        e = absl::little_endian::FromHost(e);
  140|       |      }
  141|       |#endif
  142|       |      // The Randen paper suggests preferentially initializing even-numbered
  143|       |      // 128-bit vectors of the randen state (there are 16 such vectors).
  144|       |      // The seed data is merged into the state offset by 128-bits, which
  145|       |      // implies preferring seed bytes [16..31, ..., 208..223]. Since the
  146|       |      // buffer is 32-bit values, we swap the corresponding buffer positions in
  147|       |      // 128-bit chunks.
  148|      2|      size_t dst = kBufferSize;
  149|     16|      while (dst > 7) {
  ------------------
  |  Branch (149:14): [True: 14, False: 2]
  ------------------
  150|       |        // leave the odd bucket as-is.
  151|     14|        dst -= 4;
  152|     14|        size_t src = dst >> 1;
  153|       |        // swap 128-bits into the even bucket
  154|     14|        std::swap(buffer[--dst], buffer[--src]);
  155|     14|        std::swap(buffer[--dst], buffer[--src]);
  156|     14|        std::swap(buffer[--dst], buffer[--src]);
  157|     14|        std::swap(buffer[--dst], buffer[--src]);
  158|     14|      }
  159|      2|    } else {
  160|      0|      seq.generate(buffer, buffer + kBufferSize);
  161|      0|    }
  162|      2|    impl_.Absorb(buffer, state());
  163|       |
  164|       |    // Generate will be called when operator() is called
  165|      2|    next_ = kStateSizeT;
  166|      2|  }
_ZN4absl12lts_2024011615random_internal13randen_engineImEclEv:
   84|     16|  result_type operator()() {
   85|       |    // Refill the buffer if needed (unlikely).
   86|     16|    auto* begin = state();
   87|     16|    if (next_ >= kStateSizeT) {
  ------------------
  |  Branch (87:9): [True: 2, False: 14]
  ------------------
   88|      2|      next_ = kCapacityT;
   89|      2|      impl_.Generate(begin);
   90|      2|    }
   91|     16|    return little_endian::ToHost(begin[next_++]);
   92|     16|  }

_ZN4absl12lts_2024011615random_internal28HasRandenHwAesImplementationEv:
  408|      2|bool HasRandenHwAesImplementation() { return true; }
_ZN4absl12lts_2024011615random_internal11RandenHwAes7GetKeysEv:
  410|      2|const void* ABSL_TARGET_CRYPTO RandenHwAes::GetKeys() {
  411|       |  // Round keys for one AES per Feistel round and branch.
  412|       |  // The canonical implementation uses first digits of Pi.
  413|       |#if defined(ABSL_ARCH_PPC)
  414|       |  return kRandenRoundKeysBE;
  415|       |#else
  416|      2|  return kRandenRoundKeys;
  417|      2|#endif
  418|      2|}
_ZN4absl12lts_2024011615random_internal11RandenHwAes6AbsorbEPKvPv:
  422|      2|                                            void* state_void) {
  423|      2|  static_assert(RandenTraits::kCapacityBytes / sizeof(Vector128) == 1,
  424|      2|                "Unexpected Randen kCapacityBlocks");
  425|      2|  static_assert(RandenTraits::kStateBytes / sizeof(Vector128) == 16,
  426|      2|                "Unexpected Randen kStateBlocks");
  427|       |
  428|      2|  auto* state = reinterpret_cast<absl::uint128 * ABSL_RANDOM_INTERNAL_RESTRICT>(
  429|      2|      state_void);
  430|      2|  const auto* seed =
  431|      2|      reinterpret_cast<const absl::uint128 * ABSL_RANDOM_INTERNAL_RESTRICT>(
  432|      2|          seed_void);
  433|       |
  434|      2|  Vector128 b1 = Vector128Load(state + 1);
  435|      2|  b1 ^= Vector128Load(seed + 0);
  436|      2|  Vector128Store(b1, state + 1);
  437|       |
  438|      2|  Vector128 b2 = Vector128Load(state + 2);
  439|      2|  b2 ^= Vector128Load(seed + 1);
  440|      2|  Vector128Store(b2, state + 2);
  441|       |
  442|      2|  Vector128 b3 = Vector128Load(state + 3);
  443|      2|  b3 ^= Vector128Load(seed + 2);
  444|      2|  Vector128Store(b3, state + 3);
  445|       |
  446|      2|  Vector128 b4 = Vector128Load(state + 4);
  447|      2|  b4 ^= Vector128Load(seed + 3);
  448|      2|  Vector128Store(b4, state + 4);
  449|       |
  450|      2|  Vector128 b5 = Vector128Load(state + 5);
  451|      2|  b5 ^= Vector128Load(seed + 4);
  452|      2|  Vector128Store(b5, state + 5);
  453|       |
  454|      2|  Vector128 b6 = Vector128Load(state + 6);
  455|      2|  b6 ^= Vector128Load(seed + 5);
  456|      2|  Vector128Store(b6, state + 6);
  457|       |
  458|      2|  Vector128 b7 = Vector128Load(state + 7);
  459|      2|  b7 ^= Vector128Load(seed + 6);
  460|      2|  Vector128Store(b7, state + 7);
  461|       |
  462|      2|  Vector128 b8 = Vector128Load(state + 8);
  463|      2|  b8 ^= Vector128Load(seed + 7);
  464|      2|  Vector128Store(b8, state + 8);
  465|       |
  466|      2|  Vector128 b9 = Vector128Load(state + 9);
  467|      2|  b9 ^= Vector128Load(seed + 8);
  468|      2|  Vector128Store(b9, state + 9);
  469|       |
  470|      2|  Vector128 b10 = Vector128Load(state + 10);
  471|      2|  b10 ^= Vector128Load(seed + 9);
  472|      2|  Vector128Store(b10, state + 10);
  473|       |
  474|      2|  Vector128 b11 = Vector128Load(state + 11);
  475|      2|  b11 ^= Vector128Load(seed + 10);
  476|      2|  Vector128Store(b11, state + 11);
  477|       |
  478|      2|  Vector128 b12 = Vector128Load(state + 12);
  479|      2|  b12 ^= Vector128Load(seed + 11);
  480|      2|  Vector128Store(b12, state + 12);
  481|       |
  482|      2|  Vector128 b13 = Vector128Load(state + 13);
  483|      2|  b13 ^= Vector128Load(seed + 12);
  484|      2|  Vector128Store(b13, state + 13);
  485|       |
  486|      2|  Vector128 b14 = Vector128Load(state + 14);
  487|      2|  b14 ^= Vector128Load(seed + 13);
  488|      2|  Vector128Store(b14, state + 14);
  489|       |
  490|      2|  Vector128 b15 = Vector128Load(state + 15);
  491|      2|  b15 ^= Vector128Load(seed + 14);
  492|      2|  Vector128Store(b15, state + 15);
  493|      2|}
_ZN4absl12lts_2024011615random_internal11RandenHwAes8GenerateEPKvPv:
  497|      4|                                              void* state_void) {
  498|      4|  static_assert(RandenTraits::kCapacityBytes == sizeof(Vector128),
  499|      4|                "Capacity mismatch");
  500|       |
  501|      4|  auto* state = reinterpret_cast<absl::uint128*>(state_void);
  502|      4|  const auto* keys = reinterpret_cast<const absl::uint128*>(keys_void);
  503|       |
  504|      4|  const Vector128 prev_inner = Vector128Load(state);
  505|       |
  506|      4|  SwapEndian(state);
  507|       |
  508|      4|  Permute(state, keys);
  509|       |
  510|      4|  SwapEndian(state);
  511|       |
  512|       |  // Ensure backtracking resistance.
  513|      4|  Vector128 inner = Vector128Load(state);
  514|      4|  inner ^= prev_inner;
  515|      4|  Vector128Store(inner, state);
  516|      4|}
randen_hwaes.cc:_ZN12_GLOBAL__N_113Vector128LoadEPKv:
  236|  2.78k|inline ABSL_TARGET_CRYPTO Vector128 Vector128Load(const void* from) {
  237|  2.78k|  return Vector128(_mm_load_si128(reinterpret_cast<const __m128i*>(from)));
  238|  2.78k|}
randen_hwaes.cc:_ZN12_GLOBAL__N_19Vector128C2ERKDv2_x:
  223|  3.87k|  inline explicit Vector128(const __m128i& v) : data_(v) {}
randen_hwaes.cc:_ZN12_GLOBAL__N_19Vector128eOERKS0_:
  227|     34|  inline Vector128& operator^=(const Vector128& other) {
  228|     34|    data_ = _mm_xor_si128(data_, other.data());
  229|     34|    return *this;
  230|     34|  }
randen_hwaes.cc:_ZNK12_GLOBAL__N_19Vector1284dataEv:
  225|  3.87k|  inline __m128i data() const { return data_; }
randen_hwaes.cc:_ZN12_GLOBAL__N_114Vector128StoreERKNS_9Vector128EPv:
  240|  1.66k|inline ABSL_TARGET_CRYPTO void Vector128Store(const Vector128& v, void* to) {
  241|  1.66k|  _mm_store_si128(reinterpret_cast<__m128i*>(to), v.data());
  242|  1.66k|}
randen_hwaes.cc:_ZN12_GLOBAL__N_110SwapEndianEPv:
  254|      8|inline ABSL_TARGET_CRYPTO void SwapEndian(void*) {}
randen_hwaes.cc:_ZN12_GLOBAL__N_17PermuteEPN4absl12lts_202401167uint128EPKS2_:
  391|      4|    const absl::uint128* ABSL_RANDOM_INTERNAL_RESTRICT keys) {
  392|       |  // (Successfully unrolled; the first iteration jumps into the second half)
  393|      4|#ifdef __clang__
  394|      4|#pragma clang loop unroll_count(2)
  395|      4|#endif
  396|     72|  for (size_t round = 0; round < RandenTraits::kFeistelRounds; ++round) {
  ------------------
  |  Branch (396:26): [True: 68, False: 4]
  ------------------
  397|     68|    keys = FeistelRound(state, keys);
  398|     68|    BlockShuffle(state);
  399|     68|  }
  400|      4|}
randen_hwaes.cc:_ZN12_GLOBAL__N_112FeistelRoundEPN4absl12lts_202401167uint128EPKS2_:
  329|     68|    const absl::uint128* ABSL_RANDOM_INTERNAL_RESTRICT keys) {
  330|     68|  static_assert(RandenTraits::kFeistelBlocks == 16,
  331|     68|                "Expecting 16 FeistelBlocks.");
  332|       |
  333|       |  // MSVC does a horrible job at unrolling loops.
  334|       |  // So we unroll the loop by hand to improve the performance.
  335|     68|  const Vector128 s0 = Vector128Load(state + 0);
  336|     68|  const Vector128 s1 = Vector128Load(state + 1);
  337|     68|  const Vector128 s2 = Vector128Load(state + 2);
  338|     68|  const Vector128 s3 = Vector128Load(state + 3);
  339|     68|  const Vector128 s4 = Vector128Load(state + 4);
  340|     68|  const Vector128 s5 = Vector128Load(state + 5);
  341|     68|  const Vector128 s6 = Vector128Load(state + 6);
  342|     68|  const Vector128 s7 = Vector128Load(state + 7);
  343|     68|  const Vector128 s8 = Vector128Load(state + 8);
  344|     68|  const Vector128 s9 = Vector128Load(state + 9);
  345|     68|  const Vector128 s10 = Vector128Load(state + 10);
  346|     68|  const Vector128 s11 = Vector128Load(state + 11);
  347|     68|  const Vector128 s12 = Vector128Load(state + 12);
  348|     68|  const Vector128 s13 = Vector128Load(state + 13);
  349|     68|  const Vector128 s14 = Vector128Load(state + 14);
  350|     68|  const Vector128 s15 = Vector128Load(state + 15);
  351|       |
  352|       |  // Encode even blocks with keys.
  353|     68|  const Vector128 e0 = AesRound(s0, Vector128Load(keys + 0));
  354|     68|  const Vector128 e2 = AesRound(s2, Vector128Load(keys + 1));
  355|     68|  const Vector128 e4 = AesRound(s4, Vector128Load(keys + 2));
  356|     68|  const Vector128 e6 = AesRound(s6, Vector128Load(keys + 3));
  357|     68|  const Vector128 e8 = AesRound(s8, Vector128Load(keys + 4));
  358|     68|  const Vector128 e10 = AesRound(s10, Vector128Load(keys + 5));
  359|     68|  const Vector128 e12 = AesRound(s12, Vector128Load(keys + 6));
  360|     68|  const Vector128 e14 = AesRound(s14, Vector128Load(keys + 7));
  361|       |
  362|       |  // Encode odd blocks with even output from above.
  363|     68|  const Vector128 o1 = AesRound(e0, s1);
  364|     68|  const Vector128 o3 = AesRound(e2, s3);
  365|     68|  const Vector128 o5 = AesRound(e4, s5);
  366|     68|  const Vector128 o7 = AesRound(e6, s7);
  367|     68|  const Vector128 o9 = AesRound(e8, s9);
  368|     68|  const Vector128 o11 = AesRound(e10, s11);
  369|     68|  const Vector128 o13 = AesRound(e12, s13);
  370|     68|  const Vector128 o15 = AesRound(e14, s15);
  371|       |
  372|       |  // Store odd blocks. (These will be shuffled later).
  373|     68|  Vector128Store(o1, state + 1);
  374|     68|  Vector128Store(o3, state + 3);
  375|     68|  Vector128Store(o5, state + 5);
  376|     68|  Vector128Store(o7, state + 7);
  377|     68|  Vector128Store(o9, state + 9);
  378|     68|  Vector128Store(o11, state + 11);
  379|     68|  Vector128Store(o13, state + 13);
  380|     68|  Vector128Store(o15, state + 15);
  381|       |
  382|     68|  return keys + 8;
  383|     68|}
randen_hwaes.cc:_ZN12_GLOBAL__N_18AesRoundERKNS_9Vector128ES2_:
  247|  1.08k|                                             const Vector128& round_key) {
  248|       |  // It is important to always use the full round function - omitting the
  249|       |  // final MixColumns reduces security [https://eprint.iacr.org/2010/041.pdf]
  250|       |  // and does not help because we never decrypt.
  251|  1.08k|  return Vector128(_mm_aesenc_si128(state.data(), round_key.data()));
  252|  1.08k|}
randen_hwaes.cc:_ZN12_GLOBAL__N_112BlockShuffleEPN4absl12lts_202401167uint128E:
  280|     68|inline ABSL_TARGET_CRYPTO void BlockShuffle(absl::uint128* state) {
  281|     68|  static_assert(RandenTraits::kFeistelBlocks == 16,
  282|     68|                "Expecting 16 FeistelBlocks.");
  283|       |
  284|     68|  constexpr size_t shuffle[RandenTraits::kFeistelBlocks] = {
  285|     68|      7, 2, 13, 4, 11, 8, 3, 6, 15, 0, 9, 10, 1, 14, 5, 12};
  286|       |
  287|     68|  const Vector128 v0 = Vector128Load(state + shuffle[0]);
  288|     68|  const Vector128 v1 = Vector128Load(state + shuffle[1]);
  289|     68|  const Vector128 v2 = Vector128Load(state + shuffle[2]);
  290|     68|  const Vector128 v3 = Vector128Load(state + shuffle[3]);
  291|     68|  const Vector128 v4 = Vector128Load(state + shuffle[4]);
  292|     68|  const Vector128 v5 = Vector128Load(state + shuffle[5]);
  293|     68|  const Vector128 v6 = Vector128Load(state + shuffle[6]);
  294|     68|  const Vector128 v7 = Vector128Load(state + shuffle[7]);
  295|     68|  const Vector128 w0 = Vector128Load(state + shuffle[8]);
  296|     68|  const Vector128 w1 = Vector128Load(state + shuffle[9]);
  297|     68|  const Vector128 w2 = Vector128Load(state + shuffle[10]);
  298|     68|  const Vector128 w3 = Vector128Load(state + shuffle[11]);
  299|     68|  const Vector128 w4 = Vector128Load(state + shuffle[12]);
  300|     68|  const Vector128 w5 = Vector128Load(state + shuffle[13]);
  301|     68|  const Vector128 w6 = Vector128Load(state + shuffle[14]);
  302|     68|  const Vector128 w7 = Vector128Load(state + shuffle[15]);
  303|       |
  304|     68|  Vector128Store(v0, state + 0);
  305|     68|  Vector128Store(v1, state + 1);
  306|     68|  Vector128Store(v2, state + 2);
  307|     68|  Vector128Store(v3, state + 3);
  308|     68|  Vector128Store(v4, state + 4);
  309|     68|  Vector128Store(v5, state + 5);
  310|     68|  Vector128Store(v6, state + 6);
  311|     68|  Vector128Store(v7, state + 7);
  312|     68|  Vector128Store(w0, state + 8);
  313|     68|  Vector128Store(w1, state + 9);
  314|     68|  Vector128Store(w2, state + 10);
  315|     68|  Vector128Store(w3, state + 11);
  316|     68|  Vector128Store(w4, state + 12);
  317|     68|  Vector128Store(w5, state + 13);
  318|     68|  Vector128Store(w6, state + 14);
  319|     68|  Vector128Store(w7, state + 15);
  320|     68|}

_ZN4absl12lts_2024011615random_internal29ReadSeedMaterialFromOSEntropyENS0_4SpanIjEE:
  205|      2|bool ReadSeedMaterialFromOSEntropy(absl::Span<uint32_t> values) {
  206|      2|  assert(values.data() != nullptr);
  207|      2|  if (values.data() == nullptr) {
  ------------------
  |  Branch (207:7): [True: 0, False: 2]
  ------------------
  208|      0|    return false;
  209|      0|  }
  210|      2|  if (values.empty()) {
  ------------------
  |  Branch (210:7): [True: 0, False: 2]
  ------------------
  211|      0|    return true;
  212|      0|  }
  213|      2|  return ReadSeedMaterialFromOSEntropyImpl(values);
  214|      2|}
seed_material.cc:_ZN4absl12lts_2024011615random_internal12_GLOBAL__N_133ReadSeedMaterialFromOSEntropyImplENS0_4SpanIjEE:
  190|      2|bool ReadSeedMaterialFromOSEntropyImpl(absl::Span<uint32_t> values) {
  191|      2|#if defined(ABSL_RANDOM_USE_GET_ENTROPY)
  192|      2|  if (ReadSeedMaterialFromGetEntropy(values)) {
  ------------------
  |  Branch (192:7): [True: 2, False: 0]
  ------------------
  193|      2|    return true;
  194|      2|  }
  195|      0|#endif
  196|       |  // Libc may support getentropy, but the kernel may not, so we still have
  197|       |  // to fallback to ReadSeedMaterialFromDevURandom().
  198|      0|  return ReadSeedMaterialFromDevURandom(values);
  199|      2|}
seed_material.cc:_ZN4absl12lts_2024011615random_internal12_GLOBAL__N_130ReadSeedMaterialFromGetEntropyENS0_4SpanIjEE:
  142|      2|bool ReadSeedMaterialFromGetEntropy(absl::Span<uint32_t> values) {
  143|      2|  auto buffer = reinterpret_cast<uint8_t*>(values.data());
  144|      2|  size_t buffer_size = sizeof(uint32_t) * values.size();
  145|     18|  while (buffer_size > 0) {
  ------------------
  |  Branch (145:10): [True: 16, False: 2]
  ------------------
  146|       |    // getentropy() has a maximum permitted length of 256.
  147|     16|    size_t to_read = std::min<size_t>(buffer_size, 256);
  148|     16|    int result = getentropy(buffer, to_read);
  149|     16|    if (result < 0) {
  ------------------
  |  Branch (149:9): [True: 0, False: 16]
  ------------------
  150|      0|      return false;
  151|      0|    }
  152|       |    // https://github.com/google/sanitizers/issues/1173
  153|       |    // MemorySanitizer can't see through getentropy().
  154|     16|    ABSL_ANNOTATE_MEMORY_IS_INITIALIZED(buffer, to_read);
  155|     16|    buffer += to_read;
  156|     16|    buffer_size -= to_read;
  157|     16|  }
  158|      2|  return true;
  159|      2|}

_ZN4absl12lts_2024011615random_internal26UniformDistributionWrapperIjEC2Ev:
  236|     16|      : UniformDistribution<NumType>(std::numeric_limits<NumType>::lowest(),
  237|     16|                                     (std::numeric_limits<NumType>::max)()) {}

_ZNK4absl12lts_2024011624uniform_int_distributionIjE5paramEv:
  134|     16|  param_type param() const { return param_; }
_ZNK4absl12lts_2024011624uniform_int_distributionIjE10param_type1aEv:
   80|     16|    result_type a() const { return lo_; }
_ZNK4absl12lts_2024011624uniform_int_distributionIjE10param_type5rangeEv:
   95|     16|    unsigned_type range() const { return range_; }
_ZN4absl12lts_2024011624uniform_int_distributionIjEC2Ejj:
  110|     16|      : param_(lo, hi) {}
_ZN4absl12lts_2024011624uniform_int_distributionIjE10param_typeC2Ejj:
   73|     16|        : lo_(lo),
   74|     16|          range_(static_cast<unsigned_type>(hi) -
   75|     16|                 static_cast<unsigned_type>(lo)) {
   76|       |      // [rand.dist.uni.int] precondition 2
   77|       |      assert(lo <= hi);
   78|     16|    }
_ZN4absl12lts_2024011624uniform_int_distributionIjEclINS0_15random_internal17NonsecureURBGBaseINS4_13randen_engineImEENS4_17RandenPoolSeedSeqEEEEEjRT_:
  121|     16|  result_type operator()(URBG& gen) {  // NOLINT(runtime/references)
  122|     16|    return (*this)(gen, param());
  123|     16|  }
_ZN4absl12lts_2024011624uniform_int_distributionIjEclINS0_15random_internal17NonsecureURBGBaseINS4_13randen_engineImEENS4_17RandenPoolSeedSeqEEEEEjRT_RKNS2_10param_typeE:
  127|     16|      URBG& gen, const param_type& param) {  // NOLINT(runtime/references)
  128|     16|    return static_cast<result_type>(param.a() + Generate(gen, param.range()));
  129|     16|  }
_ZN4absl12lts_2024011624uniform_int_distributionIjE8GenerateINS0_15random_internal17NonsecureURBGBaseINS4_13randen_engineImEENS4_17RandenPoolSeedSeqEEEEEjRT_j:
  198|     16|    typename random_internal::make_unsigned_bits<IntType>::type R) {
  199|     16|  random_internal::FastUniformBits<unsigned_type> fast_bits;
  200|     16|  unsigned_type bits = fast_bits(g);
  201|     16|  const unsigned_type Lim = R + 1;
  202|     16|  if ((R & Lim) == 0) {
  ------------------
  |  Branch (202:7): [True: 16, False: 0]
  ------------------
  203|       |    // If the interval's length is a power of two range, just take the low bits.
  204|     16|    return bits & R;
  205|     16|  }
  206|       |
  207|       |  // Generates a uniform variate on [0, Lim) using fixed-point multiplication.
  208|       |  // The above fast-path guarantees that Lim is representable in unsigned_type.
  209|       |  //
  210|       |  // Algorithm adapted from
  211|       |  // http://lemire.me/blog/2016/06/30/fast-random-shuffling/, with added
  212|       |  // explanation.
  213|       |  //
  214|       |  // The algorithm creates a uniform variate `bits` in the interval [0, 2^N),
  215|       |  // and treats it as the fractional part of a fixed-point real value in [0, 1),
  216|       |  // multiplied by 2^N.  For example, 0.25 would be represented as 2^(N - 2),
  217|       |  // because 2^N * 0.25 == 2^(N - 2).
  218|       |  //
  219|       |  // Next, `bits` and `Lim` are multiplied with a wide-multiply to bring the
  220|       |  // value into the range [0, Lim).  The integral part (the high word of the
  221|       |  // multiplication result) is then very nearly the desired result.  However,
  222|       |  // this is not quite accurate; viewing the multiplication result as one
  223|       |  // double-width integer, the resulting values for the sample are mapped as
  224|       |  // follows:
  225|       |  //
  226|       |  // If the result lies in this interval:       Return this value:
  227|       |  //        [0, 2^N)                                    0
  228|       |  //        [2^N, 2 * 2^N)                              1
  229|       |  //        ...                                         ...
  230|       |  //        [K * 2^N, (K + 1) * 2^N)                    K
  231|       |  //        ...                                         ...
  232|       |  //        [(Lim - 1) * 2^N, Lim * 2^N)                Lim - 1
  233|       |  //
  234|       |  // While all of these intervals have the same size, the result of `bits * Lim`
  235|       |  // must be a multiple of `Lim`, and not all of these intervals contain the
  236|       |  // same number of multiples of `Lim`.  In particular, some contain
  237|       |  // `F = floor(2^N / Lim)` and some contain `F + 1 = ceil(2^N / Lim)`.  This
  238|       |  // difference produces a small nonuniformity, which is corrected by applying
  239|       |  // rejection sampling to one of the values in the "larger intervals" (i.e.,
  240|       |  // the intervals containing `F + 1` multiples of `Lim`.
  241|       |  //
  242|       |  // An interval contains `F + 1` multiples of `Lim` if and only if its smallest
  243|       |  // value modulo 2^N is less than `2^N % Lim`.  The unique value satisfying
  244|       |  // this property is used as the one for rejection.  That is, a value of
  245|       |  // `bits * Lim` is rejected if `(bit * Lim) % 2^N < (2^N % Lim)`.
  246|       |
  247|      0|  using helper = random_internal::wide_multiply<unsigned_type>;
  248|      0|  auto product = helper::multiply(bits, Lim);
  249|       |
  250|       |  // Two optimizations here:
  251|       |  // * Rejection occurs with some probability less than 1/2, and for reasonable
  252|       |  //   ranges considerably less (in particular, less than 1/(F+1)), so
  253|       |  //   ABSL_PREDICT_FALSE is apt.
  254|       |  // * `Lim` is an overestimate of `threshold`, and doesn't require a divide.
  255|      0|  if (ABSL_PREDICT_FALSE(helper::lo(product) < Lim)) {
  ------------------
  |  |  178|      0|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 0]
  |  |  |  Branch (178:58): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  256|       |    // This quantity is exactly equal to `2^N % Lim`, but does not require high
  257|       |    // precision calculations: `2^N % Lim` is congruent to `(2^N - Lim) % Lim`.
  258|       |    // Ideally this could be expressed simply as `-X` rather than `2^N - X`, but
  259|       |    // for types smaller than int, this calculation is incorrect due to integer
  260|       |    // promotion rules.
  261|      0|    const unsigned_type threshold =
  262|      0|        ((std::numeric_limits<unsigned_type>::max)() - Lim + 1) % Lim;
  263|      0|    while (helper::lo(product) < threshold) {
  ------------------
  |  Branch (263:12): [True: 0, False: 0]
  ------------------
  264|      0|      bits = fast_bits(g);
  265|      0|      product = helper::multiply(bits, Lim);
  266|      0|    }
  267|      0|  }
  268|       |
  269|      0|  return helper::hi(product);
  270|     16|}

_ZNK4absl12lts_2024011615status_internal9StatusRep5UnrefEv:
   48|  3.23k|void StatusRep::Unref() const {
   49|       |  // Fast path: if ref==1, there is no need for a RefCountDec (since
   50|       |  // this is the only reference and therefore no other thread is
   51|       |  // allowed to be mucking with r).
   52|  3.23k|  if (ref_.load(std::memory_order_acquire) == 1 ||
  ------------------
  |  Branch (52:7): [True: 3.23k, False: 0]
  ------------------
   53|  3.23k|      ref_.fetch_sub(1, std::memory_order_acq_rel) - 1 == 0) {
  ------------------
  |  Branch (53:7): [True: 0, False: 0]
  ------------------
   54|  3.23k|    delete this;
   55|  3.23k|  }
   56|  3.23k|}
_ZNK4absl12lts_2024011615status_internal9StatusRep14ForEachPayloadENS0_11FunctionRefIFvNSt3__117basic_string_viewIcNS4_11char_traitsIcEEEERKNS0_4CordEEEE:
  110|    393|    const {
  111|    393|  if (auto* payloads = payloads_.get()) {
  ------------------
  |  Branch (111:13): [True: 0, False: 393]
  ------------------
  112|      0|    bool in_reverse =
  113|      0|        payloads->size() > 1 && reinterpret_cast<uintptr_t>(payloads) % 13 > 6;
  ------------------
  |  Branch (113:9): [True: 0, False: 0]
  |  Branch (113:33): [True: 0, False: 0]
  ------------------
  114|       |
  115|      0|    for (size_t index = 0; index < payloads->size(); ++index) {
  ------------------
  |  Branch (115:28): [True: 0, False: 0]
  ------------------
  116|      0|      const auto& elem =
  117|      0|          (*payloads)[in_reverse ? payloads->size() - 1 - index : index];
  ------------------
  |  Branch (117:23): [True: 0, False: 0]
  ------------------
  118|       |
  119|      0|#ifdef NDEBUG
  120|      0|      visitor(elem.type_url, elem.payload);
  121|       |#else
  122|       |      // In debug mode invalidate the type url to prevent users from relying on
  123|       |      // this string lifetime.
  124|       |
  125|       |      // NOLINTNEXTLINE intentional extra conversion to force temporary.
  126|       |      visitor(std::string(elem.type_url), elem.payload);
  127|       |#endif  // NDEBUG
  128|      0|    }
  129|      0|  }
  130|    393|}
_ZN4absl12lts_2024011615status_internal14MapToLocalCodeEi:
  211|    393|absl::StatusCode MapToLocalCode(int value) {
  212|    393|  absl::StatusCode code = static_cast<absl::StatusCode>(value);
  213|    393|  switch (code) {
  214|      0|    case absl::StatusCode::kOk:
  ------------------
  |  Branch (214:5): [True: 0, False: 393]
  ------------------
  215|      0|    case absl::StatusCode::kCancelled:
  ------------------
  |  Branch (215:5): [True: 0, False: 393]
  ------------------
  216|      0|    case absl::StatusCode::kUnknown:
  ------------------
  |  Branch (216:5): [True: 0, False: 393]
  ------------------
  217|    393|    case absl::StatusCode::kInvalidArgument:
  ------------------
  |  Branch (217:5): [True: 393, False: 0]
  ------------------
  218|    393|    case absl::StatusCode::kDeadlineExceeded:
  ------------------
  |  Branch (218:5): [True: 0, False: 393]
  ------------------
  219|    393|    case absl::StatusCode::kNotFound:
  ------------------
  |  Branch (219:5): [True: 0, False: 393]
  ------------------
  220|    393|    case absl::StatusCode::kAlreadyExists:
  ------------------
  |  Branch (220:5): [True: 0, False: 393]
  ------------------
  221|    393|    case absl::StatusCode::kPermissionDenied:
  ------------------
  |  Branch (221:5): [True: 0, False: 393]
  ------------------
  222|    393|    case absl::StatusCode::kResourceExhausted:
  ------------------
  |  Branch (222:5): [True: 0, False: 393]
  ------------------
  223|    393|    case absl::StatusCode::kFailedPrecondition:
  ------------------
  |  Branch (223:5): [True: 0, False: 393]
  ------------------
  224|    393|    case absl::StatusCode::kAborted:
  ------------------
  |  Branch (224:5): [True: 0, False: 393]
  ------------------
  225|    393|    case absl::StatusCode::kOutOfRange:
  ------------------
  |  Branch (225:5): [True: 0, False: 393]
  ------------------
  226|    393|    case absl::StatusCode::kUnimplemented:
  ------------------
  |  Branch (226:5): [True: 0, False: 393]
  ------------------
  227|    393|    case absl::StatusCode::kInternal:
  ------------------
  |  Branch (227:5): [True: 0, False: 393]
  ------------------
  228|    393|    case absl::StatusCode::kUnavailable:
  ------------------
  |  Branch (228:5): [True: 0, False: 393]
  ------------------
  229|    393|    case absl::StatusCode::kDataLoss:
  ------------------
  |  Branch (229:5): [True: 0, False: 393]
  ------------------
  230|    393|    case absl::StatusCode::kUnauthenticated:
  ------------------
  |  Branch (230:5): [True: 0, False: 393]
  ------------------
  231|    393|      return code;
  232|      0|    default:
  ------------------
  |  Branch (232:5): [True: 0, False: 393]
  ------------------
  233|      0|      return absl::StatusCode::kUnknown;
  234|    393|  }
  235|    393|}

_ZN4absl12lts_2024011615status_internal9StatusRepC2ENS0_10StatusCodeENSt3__117basic_string_viewIcNS4_11char_traitsIcEEEENS4_10unique_ptrINS0_13InlinedVectorINS1_7PayloadELm1ENS4_9allocatorISB_EEEENS4_14default_deleteISE_EEEE:
   70|  3.23k|      : ref_(int32_t{1}),
   71|  3.23k|        code_(code_arg),
   72|  3.23k|        message_(message_arg),
   73|  3.23k|        payloads_(std::move(payloads_arg)) {}
_ZNK4absl12lts_2024011615status_internal9StatusRep4codeEv:
   75|    393|  absl::StatusCode code() const { return code_; }
_ZNK4absl12lts_2024011615status_internal9StatusRep7messageEv:
   76|    393|  const std::string& message() const { return message_; }

_ZN4absl12lts_2024011617internal_statusor12PlacementNewINS0_6StatusEJEEEvPvDpOT0_:
  138|  20.5k|void PlacementNew(absl::Nonnull<void*> p, Args&&... args) {
  139|  20.5k|  new (p) T(std::forward<Args>(args)...);
  140|  20.5k|}
_ZN4absl12lts_2024011617internal_statusor12StatusOrDataIN8fuzztest8internal11CopyableAnyEED2Ev:
  229|  23.3k|  ~StatusOrData() {
  230|  23.3k|    if (ok()) {
  ------------------
  |  Branch (230:9): [True: 20.5k, False: 2.83k]
  ------------------
  231|  20.5k|      status_.~Status();
  232|  20.5k|      data_.~T();
  233|  20.5k|    } else {
  234|  2.83k|      status_.~Status();
  235|  2.83k|    }
  236|  23.3k|  }
_ZNK4absl12lts_2024011617internal_statusor12StatusOrDataIN8fuzztest8internal11CopyableAnyEE2okEv:
  255|  46.6k|  bool ok() const { return status_.ok(); }
_ZNK4absl12lts_2024011617internal_statusor12StatusOrDataIN8fuzztest8internal11CopyableAnyEE8EnsureOkEv:
  280|  20.5k|  void EnsureOk() const {
  281|  20.5k|    if (ABSL_PREDICT_FALSE(!ok())) Helper::Crash(status_);
  ------------------
  |  |  178|  20.5k|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 20.5k]
  |  |  |  Branch (178:49): [Folded, False: 20.5k]
  |  |  |  Branch (178:58): [True: 0, False: 20.5k]
  |  |  ------------------
  ------------------
  282|  20.5k|  }
_ZN4absl12lts_2024011617internal_statusor12StatusOrDataIN8fuzztest8internal11CopyableAnyEEC2INS0_6StatusETnNSt3__19enable_ifIXsr3std16is_constructibleIS8_OT_EE5valueEiE4typeELi0EEESC_:
  207|  2.83k|  explicit StatusOrData(U&& v) : status_(std::forward<U>(v)) {
  208|  2.83k|    EnsureNotOk();
  209|  2.83k|  }
_ZN4absl12lts_2024011617internal_statusor12StatusOrDataIN8fuzztest8internal11CopyableAnyEE11EnsureNotOkEv:
  284|  2.83k|  void EnsureNotOk() {
  285|  2.83k|    if (ABSL_PREDICT_FALSE(ok())) Helper::HandleInvalidStatusCtorArg(&status_);
  ------------------
  |  |  178|  2.83k|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 2.83k]
  |  |  |  Branch (178:49): [Folded, False: 2.83k]
  |  |  |  Branch (178:58): [True: 0, False: 2.83k]
  |  |  ------------------
  ------------------
  286|  2.83k|  }
_ZN4absl12lts_2024011617internal_statusor12StatusOrDataIN8fuzztest8internal11CopyableAnyEEC2IJRS5_EEENSt3__110in_place_tEDpOT_:
  193|  20.5k|      : data_(std::forward<Args>(args)...) {
  194|  20.5k|    MakeStatus();
  195|  20.5k|  }
_ZN4absl12lts_2024011617internal_statusor12StatusOrDataIN8fuzztest8internal11CopyableAnyEE10MakeStatusIJEEEvDpOT_:
  298|  20.5k|  void MakeStatus(Args&&... args) {
  299|  20.5k|    internal_statusor::PlacementNew<Status>(&status_,
  300|  20.5k|                                            std::forward<Args>(args)...);
  301|  20.5k|  }

_ZN4absl12lts_202401166StatusC2ENS0_10StatusCodeENSt3__117basic_string_viewIcNS3_11char_traitsIcEEEE:
  109|  3.23k|    : rep_(CodeToInlinedRep(code)) {
  110|  3.23k|  if (code != absl::StatusCode::kOk && !msg.empty()) {
  ------------------
  |  Branch (110:7): [True: 3.23k, False: 0]
  |  Branch (110:40): [True: 3.23k, False: 0]
  ------------------
  111|  3.23k|    rep_ = PointerToRep(new status_internal::StatusRep(code, msg, nullptr));
  112|  3.23k|  }
  113|  3.23k|}
_ZN4absl12lts_2024011620InvalidArgumentErrorENSt3__117basic_string_viewIcNS1_11char_traitsIcEEEE:
  164|  2.83k|Status InvalidArgumentError(absl::string_view message) {
  165|  2.83k|  return Status(absl::StatusCode::kInvalidArgument, message);
  166|  2.83k|}

_ZN4absl12lts_202401166StatusC2Em:
  619|  55.8M|  explicit Status(uintptr_t rep) : rep_(rep) {}
_ZN4absl12lts_202401166StatusaSEOS1_:
  792|   329k|inline Status& Status::operator=(Status&& x) {
  793|   329k|  uintptr_t old_rep = rep_;
  794|   329k|  if (x.rep_ != old_rep) {
  ------------------
  |  Branch (794:7): [True: 293, False: 328k]
  ------------------
  795|    293|    rep_ = x.rep_;
  796|    293|    x.rep_ = MovedFromRep();
  797|    293|    Unref(old_rep);
  798|    293|  }
  799|   329k|  return *this;
  800|   329k|}
_ZNK4absl12lts_202401166Status2okEv:
  816|  55.8M|inline bool Status::ok() const {
  817|  55.8M|  return rep_ == CodeToInlinedRep(absl::StatusCode::kOk);
  818|  55.8M|}
_ZNK4absl12lts_202401166Status4codeEv:
  820|    393|inline absl::StatusCode Status::code() const {
  821|    393|  return status_internal::MapToLocalCode(raw_code());
  822|    393|}
_ZNK4absl12lts_202401166Status8raw_codeEv:
  824|    393|inline int Status::raw_code() const {
  825|    393|  if (IsInlined(rep_)) return static_cast<int>(InlinedRepToCode(rep_));
  ------------------
  |  Branch (825:7): [True: 0, False: 393]
  ------------------
  826|    393|  return static_cast<int>(RepToPointer(rep_)->code());
  827|    393|}
_ZNK4absl12lts_202401166Status7messageEv:
  829|    393|inline absl::string_view Status::message() const {
  830|    393|  return !IsInlined(rep_)
  ------------------
  |  Branch (830:10): [True: 393, False: 0]
  ------------------
  831|    393|             ? RepToPointer(rep_)->message()
  832|    393|             : (IsMovedFrom(rep_) ? absl::string_view(kMovedFromString)
  ------------------
  |  Branch (832:17): [True: 0, False: 0]
  ------------------
  833|      0|                                  : absl::string_view());
  834|    393|}
_ZNK4absl12lts_202401166Status14ForEachPayloadENS0_11FunctionRefIFvNSt3__117basic_string_viewIcNS3_11char_traitsIcEEEERKNS0_4CordEEEE:
  883|    393|    const {
  884|    393|  if (IsInlined(rep_)) return;
  ------------------
  |  Branch (884:7): [True: 0, False: 393]
  ------------------
  885|    393|  RepToPointer(rep_)->ForEachPayload(visitor);
  886|    393|}
_ZN4absl12lts_202401166Status9IsInlinedEm:
  888|  56.1M|constexpr bool Status::IsInlined(uintptr_t rep) { return (rep & 1) != 0; }
_ZN4absl12lts_202401166Status16CodeToInlinedRepENS0_10StatusCodeE:
  892|   111M|constexpr uintptr_t Status::CodeToInlinedRep(absl::StatusCode code) {
  893|   111M|  return (static_cast<uintptr_t>(code) << 2) + 1;
  894|   111M|}
_ZN4absl12lts_202401166Status12MovedFromRepEv:
  901|  3.13k|constexpr uintptr_t Status::MovedFromRep() {
  902|  3.13k|  return CodeToInlinedRep(absl::StatusCode::kInternal) | 2;
  903|  3.13k|}
_ZN4absl12lts_202401166Status12RepToPointerEm:
  906|  4.41k|    uintptr_t rep) {
  907|       |  assert(!IsInlined(rep));
  908|  4.41k|  return reinterpret_cast<const status_internal::StatusRep*>(rep);
  909|  4.41k|}
_ZN4absl12lts_202401166Status12PointerToRepEPNS0_15status_internal9StatusRepE:
  912|  3.23k|    absl::Nonnull<status_internal::StatusRep*> rep) {
  913|  3.23k|  return reinterpret_cast<uintptr_t>(rep);
  914|  3.23k|}
_ZN4absl12lts_202401166Status3RefEm:
  916|   328k|inline void Status::Ref(uintptr_t rep) {
  917|   328k|  if (!IsInlined(rep)) RepToPointer(rep)->Ref();
  ------------------
  |  Branch (917:7): [True: 0, False: 328k]
  ------------------
  918|   328k|}
_ZN4absl12lts_202401166Status5UnrefEm:
  920|  55.8M|inline void Status::Unref(uintptr_t rep) {
  921|  55.8M|  if (!IsInlined(rep)) RepToPointer(rep)->Unref();
  ------------------
  |  Branch (921:7): [True: 3.23k, False: 55.8M]
  ------------------
  922|  55.8M|}
_ZN4absl12lts_202401168OkStatusEv:
  924|  55.4M|inline Status OkStatus() { return Status(); }
_ZN4absl12lts_202401166StatusD2Ev:
  814|  55.8M|inline Status::~Status() { Unref(rep_); }
_ZN4absl12lts_202401166StatusC2Ev:
  772|  55.5M|inline Status::Status() : Status(absl::StatusCode::kOk) {}
_ZN4absl12lts_202401166StatusC2ENS0_10StatusCodeE:
  774|  55.5M|inline Status::Status(absl::StatusCode code) : Status(CodeToInlinedRep(code)) {}
_ZN4absl12lts_202401166StatusC2EOS1_:
  788|  2.83k|inline Status::Status(Status&& x) noexcept : Status(x.rep_) {
  789|  2.83k|  x.rep_ = MovedFromRep();
  790|  2.83k|}
_ZN4absl12lts_202401166StatusC2ERKS1_:
  776|   328k|inline Status::Status(const Status& x) : Status(x.rep_) { Ref(rep_); }

_ZNK4absl12lts_202401168StatusOrIN8fuzztest8internal11CopyableAnyEE2okEv:
  494|  23.3k|  ABSL_MUST_USE_RESULT bool ok() const { return this->status_.ok(); }
_ZNO4absl12lts_202401168StatusOrIN8fuzztest8internal11CopyableAnyEEdeEv:
  787|  20.5k|T&& StatusOr<T>::operator*() && {
  788|  20.5k|  this->EnsureOk();
  789|  20.5k|  return std::move(this->data_);
  790|  20.5k|}
_ZN4absl12lts_202401168StatusOrIN8fuzztest8internal11CopyableAnyEEC2INS0_6StatusETnNSt3__19enable_ifIXsr4absl11conjunctionINS8_14is_convertibleIOT_S7_EENS8_16is_constructibleIS7_JSC_EEENS0_8negationINS8_7is_sameINS8_5decayISB_E4typeES5_EEEENSG_INSH_ISK_S4_EEEENSG_INSH_ISK_NS8_10in_place_tEEEEENSG_INS0_17internal_statusor31HasConversionOperatorToStatusOrIS4_SC_vEEEEEE5valueEiE4typeELi0EEESC_:
  365|  2.83k|  StatusOr(U&& v) : Base(std::forward<U>(v)) {}
_ZN4absl12lts_202401168StatusOrIN8fuzztest8internal11CopyableAnyEEC2IRS4_TnNSt3__19enable_ifIXsr4absl11conjunctionINS0_11disjunctionIJNS8_7is_sameIS4_NS0_12remove_cvrefIOT_E4typeEEENS0_8negationINSA_IJNSB_IS5_SG_EENSB_INS0_6StatusESG_EENSB_INS8_10in_place_tESG_EENS0_17internal_statusor31IsDirectInitializationAmbiguousIS4_SE_EEEEEEEEEENS8_16is_constructibleIS4_JSE_EEENS8_14is_convertibleISE_S4_EENSA_IJNSB_INSC_ISD_E4typeES4_EENS0_11conjunctionIJNSI_INSW_ISE_SK_EEEENSI_INSO_31HasConversionOperatorToStatusOrIS4_SE_vEEEEEEEEEEEE5valueEiE4typeELi0EEESE_:
  460|  20.5k|      : StatusOr(absl::in_place, std::forward<U>(u)) {}
_ZN4absl12lts_202401168StatusOrIN8fuzztest8internal11CopyableAnyEEC2IJRS4_EEENSt3__110in_place_tEDpOT_:
  723|  20.5k|    : Base(absl::in_place, std::forward<Args>(args)...) {}

_ZN4absl12lts_2024011613ascii_isspaceEh:
   97|  46.3k|inline bool ascii_isspace(unsigned char c) {
   98|  46.3k|  return (ascii_internal::kPropertyBits[c] & 0x08) != 0;
   99|  46.3k|}
_ZN4absl12lts_2024011627StripLeadingAsciiWhitespaceENSt3__117basic_string_viewIcNS1_11char_traitsIcEEEE:
  200|  21.5k|    absl::string_view str) {
  201|  21.5k|  auto it = std::find_if_not(str.begin(), str.end(), absl::ascii_isspace);
  202|  21.5k|  return str.substr(static_cast<size_t>(it - str.begin()));
  203|  21.5k|}
_ZN4absl12lts_2024011628StripTrailingAsciiWhitespaceENSt3__117basic_string_viewIcNS1_11char_traitsIcEEEE:
  214|  21.5k|    absl::string_view str) {
  215|  21.5k|  auto it = std::find_if_not(str.rbegin(), str.rend(), absl::ascii_isspace);
  216|  21.5k|  return str.substr(0, static_cast<size_t>(str.rend() - it));
  217|  21.5k|}
_ZN4absl12lts_2024011620StripAsciiWhitespaceENSt3__117basic_string_viewIcNS1_11char_traitsIcEEEE:
  228|  21.5k|    absl::string_view str) {
  229|  21.5k|  return StripTrailingAsciiWhitespace(StripLeadingAsciiWhitespace(str));
  230|  21.5k|}

_ZN4absl12lts_2024011610from_charsEPKcS2_RdNS0_12chars_formatE:
  953|  21.5k|                             chars_format fmt) {
  954|  21.5k|  return FromCharsImpl(first, last, value, fmt);
  955|  21.5k|}
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_113FromCharsImplIdEENS0_17from_chars_resultEPKcS5_RT_NS0_12chars_formatE:
  866|  21.5k|                                FloatType& value, chars_format fmt_flags) {
  867|  21.5k|  from_chars_result result;
  868|  21.5k|  result.ptr = first;  // overwritten on successful parse
  869|  21.5k|  result.ec = std::errc();
  870|       |
  871|  21.5k|  bool negative = false;
  872|  21.5k|  if (first != last && *first == '-') {
  ------------------
  |  Branch (872:7): [True: 21.5k, False: 10]
  |  Branch (872:24): [True: 4.92k, False: 16.6k]
  ------------------
  873|  4.92k|    ++first;
  874|  4.92k|    negative = true;
  875|  4.92k|  }
  876|       |  // If the `hex` flag is *not* set, then we will accept a 0x prefix and try
  877|       |  // to parse a hexadecimal float.
  878|  21.5k|  if ((fmt_flags & chars_format::hex) == chars_format{} && last - first >= 2 &&
  ------------------
  |  Branch (878:7): [True: 21.5k, False: 0]
  |  Branch (878:60): [True: 20.4k, False: 1.13k]
  ------------------
  879|  20.4k|      *first == '0' && (first[1] == 'x' || first[1] == 'X')) {
  ------------------
  |  Branch (879:7): [True: 5.25k, False: 15.1k]
  |  Branch (879:25): [True: 992, False: 4.26k]
  |  Branch (879:44): [True: 3.65k, False: 609]
  ------------------
  880|  4.64k|    const char* hex_first = first + 2;
  881|  4.64k|    strings_internal::ParsedFloat hex_parse =
  882|  4.64k|        strings_internal::ParseFloat<16>(hex_first, last, fmt_flags);
  883|  4.64k|    if (hex_parse.end == nullptr ||
  ------------------
  |  Branch (883:9): [True: 11, False: 4.63k]
  ------------------
  884|  4.63k|        hex_parse.type != strings_internal::FloatType::kNumber) {
  ------------------
  |  Branch (884:9): [True: 1, False: 4.63k]
  ------------------
  885|       |      // Either we failed to parse a hex float after the "0x", or we read
  886|       |      // "0xinf" or "0xnan" which we don't want to match.
  887|       |      //
  888|       |      // However, a string that begins with "0x" also begins with "0", which
  889|       |      // is normally a valid match for the number zero.  So we want these
  890|       |      // strings to match zero unless fmt_flags is `scientific`.  (This flag
  891|       |      // means an exponent is required, which the string "0" does not have.)
  892|     12|      if (fmt_flags == chars_format::scientific) {
  ------------------
  |  Branch (892:11): [True: 0, False: 12]
  ------------------
  893|      0|        result.ec = std::errc::invalid_argument;
  894|     12|      } else {
  895|     12|        result.ptr = first + 1;
  896|     12|        value = negative ? -0.0 : 0.0;
  ------------------
  |  Branch (896:17): [True: 1, False: 11]
  ------------------
  897|     12|      }
  898|     12|      return result;
  899|     12|    }
  900|       |    // We matched a value.
  901|  4.63k|    result.ptr = hex_parse.end;
  902|  4.63k|    if (HandleEdgeCase(hex_parse, negative, &value)) {
  ------------------
  |  Branch (902:9): [True: 741, False: 3.89k]
  ------------------
  903|    741|      return result;
  904|    741|    }
  905|  3.89k|    CalculatedFloat calculated =
  906|  3.89k|        CalculateFromParsedHexadecimal<FloatType>(hex_parse);
  907|  3.89k|    EncodeResult(calculated, negative, &result, &value);
  908|  3.89k|    return result;
  909|  4.63k|  }
  910|       |  // Otherwise, we choose the number base based on the flags.
  911|  16.9k|  if ((fmt_flags & chars_format::hex) == chars_format::hex) {
  ------------------
  |  Branch (911:7): [True: 0, False: 16.9k]
  ------------------
  912|      0|    strings_internal::ParsedFloat hex_parse =
  913|      0|        strings_internal::ParseFloat<16>(first, last, fmt_flags);
  914|      0|    if (hex_parse.end == nullptr) {
  ------------------
  |  Branch (914:9): [True: 0, False: 0]
  ------------------
  915|      0|      result.ec = std::errc::invalid_argument;
  916|      0|      return result;
  917|      0|    }
  918|      0|    result.ptr = hex_parse.end;
  919|      0|    if (HandleEdgeCase(hex_parse, negative, &value)) {
  ------------------
  |  Branch (919:9): [True: 0, False: 0]
  ------------------
  920|      0|      return result;
  921|      0|    }
  922|      0|    CalculatedFloat calculated =
  923|      0|        CalculateFromParsedHexadecimal<FloatType>(hex_parse);
  924|      0|    EncodeResult(calculated, negative, &result, &value);
  925|      0|    return result;
  926|  16.9k|  } else {
  927|  16.9k|    strings_internal::ParsedFloat decimal_parse =
  928|  16.9k|        strings_internal::ParseFloat<10>(first, last, fmt_flags);
  929|  16.9k|    if (decimal_parse.end == nullptr) {
  ------------------
  |  Branch (929:9): [True: 91, False: 16.8k]
  ------------------
  930|     91|      result.ec = std::errc::invalid_argument;
  931|     91|      return result;
  932|     91|    }
  933|  16.8k|    result.ptr = decimal_parse.end;
  934|  16.8k|    if (HandleEdgeCase(decimal_parse, negative, &value)) {
  ------------------
  |  Branch (934:9): [True: 2.00k, False: 14.8k]
  ------------------
  935|  2.00k|      return result;
  936|  2.00k|    }
  937|       |    // A nullptr subrange_begin means that the decimal_parse.mantissa is exact
  938|       |    // (not truncated), a precondition of the Eisel-Lemire algorithm.
  939|  14.8k|    if ((decimal_parse.subrange_begin == nullptr) &&
  ------------------
  |  Branch (939:9): [True: 6.52k, False: 8.29k]
  ------------------
  940|  6.52k|        EiselLemire<FloatType>(decimal_parse, negative, &value, &result.ec)) {
  ------------------
  |  Branch (940:9): [True: 3.11k, False: 3.41k]
  ------------------
  941|  3.11k|      return result;
  942|  3.11k|    }
  943|  11.7k|    CalculatedFloat calculated =
  944|  11.7k|        CalculateFromParsedDecimal<FloatType>(decimal_parse);
  945|  11.7k|    EncodeResult(calculated, negative, &result, &value);
  946|  11.7k|    return result;
  947|  14.8k|  }
  948|  16.9k|}
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_114HandleEdgeCaseIdEEbRKNS0_16strings_internal11ParsedFloatEbPT_:
  360|  21.4k|                    absl::Nonnull<FloatType*> value) {
  361|  21.4k|  if (input.type == strings_internal::FloatType::kNan) {
  ------------------
  |  Branch (361:7): [True: 417, False: 21.0k]
  ------------------
  362|       |    // A bug in both clang < 7 and gcc would cause the compiler to optimize
  363|       |    // away the buffer we are building below.  Declaring the buffer volatile
  364|       |    // avoids the issue, and has no measurable performance impact in
  365|       |    // microbenchmarks.
  366|       |    //
  367|       |    // https://bugs.llvm.org/show_bug.cgi?id=37778
  368|       |    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86113
  369|    417|    constexpr ptrdiff_t kNanBufferSize = 128;
  370|       |#if (defined(__GNUC__) && !defined(__clang__)) || \
  371|       |    (defined(__clang__) && __clang_major__ < 7)
  372|       |    volatile char n_char_sequence[kNanBufferSize];
  373|       |#else
  374|    417|    char n_char_sequence[kNanBufferSize];
  375|    417|#endif
  376|    417|    if (input.subrange_begin == nullptr) {
  ------------------
  |  Branch (376:9): [True: 417, False: 0]
  ------------------
  377|    417|      n_char_sequence[0] = '\0';
  378|    417|    } else {
  379|      0|      ptrdiff_t nan_size = input.subrange_end - input.subrange_begin;
  380|      0|      nan_size = std::min(nan_size, kNanBufferSize - 1);
  381|      0|      std::copy_n(input.subrange_begin, nan_size, n_char_sequence);
  382|      0|      n_char_sequence[nan_size] = '\0';
  383|      0|    }
  384|    417|    char* nan_argument = const_cast<char*>(n_char_sequence);
  385|    417|    *value = negative ? -FloatTraits<FloatType>::MakeNan(nan_argument)
  ------------------
  |  Branch (385:14): [True: 198, False: 219]
  ------------------
  386|    417|                      : FloatTraits<FloatType>::MakeNan(nan_argument);
  387|    417|    return true;
  388|    417|  }
  389|  21.0k|  if (input.type == strings_internal::FloatType::kInfinity) {
  ------------------
  |  Branch (389:7): [True: 488, False: 20.5k]
  ------------------
  390|    488|    *value = negative ? -std::numeric_limits<FloatType>::infinity()
  ------------------
  |  Branch (390:14): [True: 0, False: 488]
  ------------------
  391|    488|                      : std::numeric_limits<FloatType>::infinity();
  392|    488|    return true;
  393|    488|  }
  394|  20.5k|  if (input.mantissa == 0) {
  ------------------
  |  Branch (394:7): [True: 1.84k, False: 18.7k]
  ------------------
  395|  1.84k|    *value = negative ? -0.0 : 0.0;
  ------------------
  |  Branch (395:14): [True: 476, False: 1.36k]
  ------------------
  396|  1.84k|    return true;
  397|  1.84k|  }
  398|  18.7k|  return false;
  399|  20.5k|}
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_111FloatTraitsIdE7MakeNanEPKc:
  123|    417|  static double MakeNan(absl::Nonnull<const char*> tagp) {
  124|    417|#if ABSL_HAVE_BUILTIN(__builtin_nan)
  125|       |    // Use __builtin_nan() if available since it has a fix for
  126|       |    // https://bugs.llvm.org/show_bug.cgi?id=37778
  127|       |    // std::nan may use the glibc implementation.
  128|    417|    return __builtin_nan(tagp);
  129|       |#else
  130|       |    // Support nan no matter which namespace it's in.  Some platforms
  131|       |    // incorrectly don't put it in namespace std.
  132|       |    using namespace std;  // NOLINT
  133|       |    return nan(tagp);
  134|       |#endif
  135|    417|  }
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_130CalculateFromParsedHexadecimalIdEENS1_15CalculatedFloatERKNS0_16strings_internal11ParsedFloatE:
  602|  3.89k|    const strings_internal::ParsedFloat& parsed_hex) {
  603|  3.89k|  uint64_t mantissa = parsed_hex.mantissa;
  604|  3.89k|  int exponent = parsed_hex.exponent;
  605|       |  // This static_cast is only needed when using a std::bit_width()
  606|       |  // implementation that does not have the fix for LWG 3656 applied.
  607|  3.89k|  int mantissa_width = static_cast<int>(bit_width(mantissa));
  608|  3.89k|  const int shift = NormalizedShiftSize<FloatType>(mantissa_width, exponent);
  609|  3.89k|  bool result_exact;
  610|  3.89k|  exponent += shift;
  611|  3.89k|  mantissa = ShiftRightAndRound(mantissa, shift,
  612|  3.89k|                                /* input exact= */ true, &result_exact);
  613|       |  // ParseFloat handles rounding in the hexadecimal case, so we don't have to
  614|       |  // check `result_exact` here.
  615|  3.89k|  return CalculatedFloatFromRawValues<FloatType>(mantissa, exponent);
  616|  3.89k|}
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_119NormalizedShiftSizeIdEEiii:
  335|  15.2k|int NormalizedShiftSize(int mantissa_width, int binary_exponent) {
  336|  15.2k|  const int normal_shift =
  337|  15.2k|      mantissa_width - FloatTraits<FloatType>::kTargetMantissaBits;
  338|  15.2k|  const int minimum_shift =
  339|  15.2k|      FloatTraits<FloatType>::kMinNormalExponent - binary_exponent;
  340|  15.2k|  return std::max(normal_shift, minimum_shift);
  341|  15.2k|}
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_118ShiftRightAndRoundENS0_7uint128EibPb:
  456|  15.2k|                            absl::Nonnull<bool*> output_exact) {
  457|  15.2k|  if (shift <= 0) {
  ------------------
  |  Branch (457:7): [True: 1.59k, False: 13.7k]
  ------------------
  458|  1.59k|    *output_exact = input_exact;
  459|  1.59k|    return static_cast<uint64_t>(value << -shift);
  460|  1.59k|  }
  461|  13.7k|  if (shift >= 128) {
  ------------------
  |  Branch (461:7): [True: 363, False: 13.3k]
  ------------------
  462|       |    // Exponent is so small that we are shifting away all significant bits.
  463|       |    // Answer will not be representable, even as a subnormal, so return a zero
  464|       |    // mantissa (which represents underflow).
  465|    363|    *output_exact = true;
  466|    363|    return 0;
  467|    363|  }
  468|       |
  469|  13.3k|  *output_exact = true;
  470|  13.3k|  const uint128 shift_mask = (uint128(1) << shift) - 1;
  471|  13.3k|  const uint128 halfway_point = uint128(1) << (shift - 1);
  472|       |
  473|  13.3k|  const uint128 shifted_bits = value & shift_mask;
  474|  13.3k|  value >>= shift;
  475|  13.3k|  if (shifted_bits > halfway_point) {
  ------------------
  |  Branch (475:7): [True: 1.73k, False: 11.5k]
  ------------------
  476|       |    // Shifted bits greater than 10000... require rounding up.
  477|  1.73k|    return static_cast<uint64_t>(value + 1);
  478|  1.73k|  }
  479|  11.5k|  if (shifted_bits == halfway_point) {
  ------------------
  |  Branch (479:7): [True: 1.70k, False: 9.89k]
  ------------------
  480|       |    // In exact mode, shifted bits of 10000... mean we're exactly halfway
  481|       |    // between two numbers, and we must round to even.  So only round up if
  482|       |    // the low bit of `value` is set.
  483|       |    //
  484|       |    // In inexact mode, the nonzero error means the actual value is greater
  485|       |    // than the halfway point and we must always round up.
  486|  1.70k|    if ((value & 1) == 1 || !input_exact) {
  ------------------
  |  Branch (486:9): [True: 672, False: 1.03k]
  |  Branch (486:29): [True: 197, False: 839]
  ------------------
  487|    869|      ++value;
  488|    869|    }
  489|  1.70k|    return static_cast<uint64_t>(value);
  490|  1.70k|  }
  491|  9.89k|  if (!input_exact && shifted_bits == halfway_point - 1) {
  ------------------
  |  Branch (491:7): [True: 9.09k, False: 795]
  |  Branch (491:23): [True: 8.51k, False: 576]
  ------------------
  492|       |    // Rounding direction is unclear, due to error.
  493|  8.51k|    *output_exact = false;
  494|  8.51k|  }
  495|       |  // Otherwise, round down.
  496|  9.89k|  return static_cast<uint64_t>(value);
  497|  11.5k|}
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_128CalculatedFloatFromRawValuesIdEENS1_15CalculatedFloatEmi:
  583|  15.2k|CalculatedFloat CalculatedFloatFromRawValues(uint64_t mantissa, int exponent) {
  584|  15.2k|  CalculatedFloat result;
  585|  15.2k|  if (mantissa == uint64_t{1} << FloatTraits<FloatType>::kTargetMantissaBits) {
  ------------------
  |  Branch (585:7): [True: 570, False: 14.7k]
  ------------------
  586|    570|    mantissa >>= 1;
  587|    570|    exponent += 1;
  588|    570|  }
  589|  15.2k|  if (exponent > FloatTraits<FloatType>::kMaxExponent) {
  ------------------
  |  Branch (589:7): [True: 1.70k, False: 13.5k]
  ------------------
  590|  1.70k|    result.exponent = kOverflow;
  591|  13.5k|  } else if (mantissa == 0) {
  ------------------
  |  Branch (591:14): [True: 964, False: 12.6k]
  ------------------
  592|    964|    result.exponent = kUnderflow;
  593|  12.6k|  } else {
  594|  12.6k|    result.exponent = exponent;
  595|  12.6k|    result.mantissa = mantissa;
  596|  12.6k|  }
  597|  15.2k|  return result;
  598|  15.2k|}
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_112EncodeResultIdEEvRKNS1_15CalculatedFloatEbPNS0_17from_chars_resultEPT_:
  410|  15.6k|                  absl::Nonnull<FloatType*> value) {
  411|  15.6k|  if (calculated.exponent == kOverflow) {
  ------------------
  |  Branch (411:7): [True: 1.76k, False: 13.8k]
  ------------------
  412|  1.76k|    result->ec = std::errc::result_out_of_range;
  413|  1.76k|    *value = negative ? -std::numeric_limits<FloatType>::max()
  ------------------
  |  Branch (413:14): [True: 361, False: 1.40k]
  ------------------
  414|  1.76k|                      : std::numeric_limits<FloatType>::max();
  415|  1.76k|    return;
  416|  13.8k|  } else if (calculated.mantissa == 0 || calculated.exponent == kUnderflow) {
  ------------------
  |  Branch (416:14): [True: 1.21k, False: 12.6k]
  |  Branch (416:42): [True: 0, False: 12.6k]
  ------------------
  417|  1.21k|    result->ec = std::errc::result_out_of_range;
  418|  1.21k|    *value = negative ? -0.0 : 0.0;
  ------------------
  |  Branch (418:14): [True: 461, False: 750]
  ------------------
  419|  1.21k|    return;
  420|  1.21k|  }
  421|  12.6k|  *value = FloatTraits<FloatType>::Make(
  422|  12.6k|      static_cast<typename FloatTraits<FloatType>::mantissa_t>(
  423|  12.6k|          calculated.mantissa),
  424|  12.6k|      calculated.exponent, negative);
  425|  12.6k|}
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_111FloatTraitsIdE4MakeEmib:
  150|  12.6k|  static double Make(mantissa_t mantissa, int exponent, bool sign) {
  151|       |#ifndef ABSL_BIT_PACK_FLOATS
  152|       |    // Support ldexp no matter which namespace it's in.  Some platforms
  153|       |    // incorrectly don't put it in namespace std.
  154|       |    using namespace std;  // NOLINT
  155|       |    return sign ? -ldexp(mantissa, exponent) : ldexp(mantissa, exponent);
  156|       |#else
  157|  12.6k|    constexpr uint64_t kMantissaMask =
  158|  12.6k|        (uint64_t{1} << (kTargetMantissaBits - 1)) - 1;
  159|  12.6k|    uint64_t dbl = static_cast<uint64_t>(sign) << 63;
  160|  12.6k|    if (mantissa > kMantissaMask) {
  ------------------
  |  Branch (160:9): [True: 11.7k, False: 886]
  ------------------
  161|       |      // Normal value.
  162|       |      // Adjust by 1023 for the exponent representation bias, and an additional
  163|       |      // 52 due to the implied decimal point in the IEEE mantissa
  164|       |      // representation.
  165|  11.7k|      dbl += static_cast<uint64_t>(exponent + 1023 + kTargetMantissaBits - 1)
  166|  11.7k|             << 52;
  167|  11.7k|      mantissa &= kMantissaMask;
  168|  11.7k|    } else {
  169|       |      // subnormal value
  170|       |      assert(exponent == kMinNormalExponent);
  171|    886|    }
  172|  12.6k|    dbl += mantissa;
  173|  12.6k|    return absl::bit_cast<double>(dbl);
  174|  12.6k|#endif  // ABSL_BIT_PACK_FLOATS
  175|  12.6k|  }
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_111EiselLemireIdEEbRKNS0_16strings_internal11ParsedFloatEbPT_PNSt3__14errcE:
  691|  6.52k|                 absl::Nonnull<std::errc*> ec) {
  692|  6.52k|  uint64_t man = input.mantissa;
  693|  6.52k|  int exp10 = input.exponent;
  694|  6.52k|  if (exp10 < FloatTraits<FloatType>::kEiselLemireMinInclusiveExp10) {
  ------------------
  |  Branch (694:7): [True: 277, False: 6.25k]
  ------------------
  695|    277|    *value = negative ? -0.0 : 0.0;
  ------------------
  |  Branch (695:14): [True: 3, False: 274]
  ------------------
  696|    277|    *ec = std::errc::result_out_of_range;
  697|    277|    return true;
  698|  6.25k|  } else if (exp10 >= FloatTraits<FloatType>::kEiselLemireMaxExclusiveExp10) {
  ------------------
  |  Branch (698:14): [True: 280, False: 5.97k]
  ------------------
  699|       |    // Return max (a finite value) consistent with from_chars and DR 3081. For
  700|       |    // SimpleAtod and SimpleAtof, post-processing will return infinity.
  701|    280|    *value = negative ? -std::numeric_limits<FloatType>::max()
  ------------------
  |  Branch (701:14): [True: 204, False: 76]
  ------------------
  702|    280|                      : std::numeric_limits<FloatType>::max();
  703|    280|    *ec = std::errc::result_out_of_range;
  704|    280|    return true;
  705|    280|  }
  706|       |
  707|       |  // Assert kPower10TableMinInclusive <= exp10 < kPower10TableMaxExclusive.
  708|       |  // Equivalently, !Power10Underflow(exp10) and !Power10Overflow(exp10).
  709|  5.97k|  static_assert(
  710|  5.97k|      FloatTraits<FloatType>::kEiselLemireMinInclusiveExp10 >=
  711|  5.97k|          kPower10TableMinInclusive,
  712|  5.97k|      "(exp10-kPower10TableMinInclusive) in kPower10MantissaHighTable bounds");
  713|  5.97k|  static_assert(
  714|  5.97k|      FloatTraits<FloatType>::kEiselLemireMaxExclusiveExp10 <=
  715|  5.97k|          kPower10TableMaxExclusive,
  716|  5.97k|      "(exp10-kPower10TableMinInclusive) in kPower10MantissaHighTable bounds");
  717|       |
  718|       |  // The terse (+) comments in this function body refer to sections of the
  719|       |  // https://nigeltao.github.io/blog/2020/eisel-lemire.html blog post.
  720|       |  //
  721|       |  // That blog post discusses double precision (11 exponent bits with a -1023
  722|       |  // bias, 52 mantissa bits), but the same approach applies to single precision
  723|       |  // (8 exponent bits with a -127 bias, 23 mantissa bits). Either way, the
  724|       |  // computation here happens with 64-bit values (e.g. man) or 128-bit values
  725|       |  // (e.g. x) before finally converting to 64- or 32-bit floating point.
  726|       |  //
  727|       |  // See also "Number Parsing at a Gigabyte per Second, Software: Practice and
  728|       |  // Experience 51 (8), 2021" (https://arxiv.org/abs/2101.11408) for detail.
  729|       |
  730|       |  // (+) Normalization.
  731|  5.97k|  int clz = countl_zero(man);
  732|  5.97k|  man <<= static_cast<unsigned int>(clz);
  733|       |  // The 217706 etc magic numbers are from the Power10Exponent function.
  734|  5.97k|  uint64_t ret_exp2 =
  735|  5.97k|      static_cast<uint64_t>((217706 * exp10 >> 16) + 64 +
  736|  5.97k|                            FloatTraits<FloatType>::kExponentBias - clz);
  737|       |
  738|       |  // (+) Multiplication.
  739|  5.97k|  uint128 x = static_cast<uint128>(man) *
  740|  5.97k|              static_cast<uint128>(
  741|  5.97k|                  kPower10MantissaHighTable[exp10 - kPower10TableMinInclusive]);
  742|       |
  743|       |  // (+) Wider Approximation.
  744|  5.97k|  static constexpr uint64_t high64_mask =
  745|  5.97k|      FloatTraits<FloatType>::kEiselLemireMask;
  746|  5.97k|  if (((Uint128High64(x) & high64_mask) == high64_mask) &&
  ------------------
  |  Branch (746:7): [True: 2.15k, False: 3.82k]
  ------------------
  747|  2.15k|      (man > (std::numeric_limits<uint64_t>::max() - Uint128Low64(x)))) {
  ------------------
  |  Branch (747:7): [True: 1.60k, False: 547]
  ------------------
  748|  1.60k|    uint128 y =
  749|  1.60k|        static_cast<uint128>(man) *
  750|  1.60k|        static_cast<uint128>(
  751|  1.60k|            kPower10MantissaLowTable[exp10 - kPower10TableMinInclusive]);
  752|  1.60k|    x += Uint128High64(y);
  753|       |    // For example, parsing "4503599627370497.5" will take the if-true
  754|       |    // branch here (for double precision), since:
  755|       |    //  - x   = 0x8000000000000BFF_FFFFFFFFFFFFFFFF
  756|       |    //  - y   = 0x8000000000000BFF_7FFFFFFFFFFFF400
  757|       |    //  - man = 0xA000000000000F00
  758|       |    // Likewise, when parsing "0.0625" for single precision:
  759|       |    //  - x   = 0x7FFFFFFFFFFFFFFF_FFFFFFFFFFFFFFFF
  760|       |    //  - y   = 0x813FFFFFFFFFFFFF_8A00000000000000
  761|       |    //  - man = 0x9C40000000000000
  762|  1.60k|    if (((Uint128High64(x) & high64_mask) == high64_mask) &&
  ------------------
  |  Branch (762:9): [True: 1.08k, False: 523]
  ------------------
  763|  1.08k|        ((Uint128Low64(x) + 1) == 0) &&
  ------------------
  |  Branch (763:9): [True: 688, False: 394]
  ------------------
  764|    688|        (man > (std::numeric_limits<uint64_t>::max() - Uint128Low64(y)))) {
  ------------------
  |  Branch (764:9): [True: 688, False: 0]
  ------------------
  765|    688|      return false;
  766|    688|    }
  767|  1.60k|  }
  768|       |
  769|       |  // (+) Shifting to 54 Bits (or for single precision, to 25 bits).
  770|  5.28k|  uint64_t msb = Uint128High64(x) >> 63;
  771|  5.28k|  uint64_t ret_man =
  772|  5.28k|      Uint128High64(x) >> (msb + FloatTraits<FloatType>::kEiselLemireShift);
  773|  5.28k|  ret_exp2 -= 1 ^ msb;
  774|       |
  775|       |  // (+) Half-way Ambiguity.
  776|       |  //
  777|       |  // For example, parsing "1e+23" will take the if-true branch here (for double
  778|       |  // precision), since:
  779|       |  //  - x       = 0x54B40B1F852BDA00_0000000000000000
  780|       |  //  - ret_man = 0x002A5A058FC295ED
  781|       |  // Likewise, when parsing "20040229.0" for single precision:
  782|       |  //  - x       = 0x4C72894000000000_0000000000000000
  783|       |  //  - ret_man = 0x000000000131CA25
  784|  5.28k|  if ((Uint128Low64(x) == 0) && ((Uint128High64(x) & high64_mask) == 0) &&
  ------------------
  |  Branch (784:7): [True: 3.02k, False: 2.26k]
  |  Branch (784:33): [True: 1.83k, False: 1.18k]
  ------------------
  785|  1.83k|      ((ret_man & 3) == 1)) {
  ------------------
  |  Branch (785:7): [True: 638, False: 1.19k]
  ------------------
  786|    638|    return false;
  787|    638|  }
  788|       |
  789|       |  // (+) From 54 to 53 Bits (or for single precision, from 25 to 24 bits).
  790|  4.64k|  ret_man += ret_man & 1;  // Line From54a.
  791|  4.64k|  ret_man >>= 1;           // Line From54b.
  792|       |  // Incrementing ret_man (at line From54a) may have overflowed 54 bits (53
  793|       |  // bits after the right shift by 1 at line From54b), so adjust for that.
  794|       |  //
  795|       |  // For example, parsing "9223372036854775807" will take the if-true branch
  796|       |  // here (for double precision), since:
  797|       |  //  - ret_man = 0x0020000000000000 = (1 << 53)
  798|       |  // Likewise, when parsing "2147483647.0" for single precision:
  799|       |  //  - ret_man = 0x0000000001000000 = (1 << 24)
  800|  4.64k|  if ((ret_man >> FloatTraits<FloatType>::kTargetMantissaBits) > 0) {
  ------------------
  |  Branch (800:7): [True: 71, False: 4.57k]
  ------------------
  801|     71|    ret_exp2 += 1;
  802|       |    // Conceptually, we need a "ret_man >>= 1" in this if-block to balance
  803|       |    // incrementing ret_exp2 in the line immediately above. However, we only
  804|       |    // get here when line From54a overflowed (after adding a 1), so ret_man
  805|       |    // here is (1 << 53). Its low 53 bits are therefore all zeroes. The only
  806|       |    // remaining use of ret_man is to mask it with ((1 << 52) - 1), so only its
  807|       |    // low 52 bits matter. A "ret_man >>= 1" would have no effect in practice.
  808|       |    //
  809|       |    // We omit the "ret_man >>= 1", even if it is cheap (and this if-branch is
  810|       |    // rarely taken) and technically 'more correct', so that mutation tests
  811|       |    // that would otherwise modify or omit that "ret_man >>= 1" don't complain
  812|       |    // that such code mutations have no observable effect.
  813|     71|  }
  814|       |
  815|       |  // ret_exp2 is a uint64_t. Zero or underflow means that we're in subnormal
  816|       |  // space. max_exp2 (0x7FF for double precision, 0xFF for single precision) or
  817|       |  // above means that we're in Inf/NaN space.
  818|       |  //
  819|       |  // The if block is equivalent to (but has fewer branches than):
  820|       |  //   if ((ret_exp2 <= 0) || (ret_exp2 >= max_exp2)) { etc }
  821|       |  //
  822|       |  // For example, parsing "4.9406564584124654e-324" will take the if-true
  823|       |  // branch here, since ret_exp2 = -51.
  824|  4.64k|  static constexpr uint64_t max_exp2 =
  825|  4.64k|      (1 << FloatTraits<FloatType>::kTargetExponentBits) - 1;
  826|  4.64k|  if ((ret_exp2 - 1) >= (max_exp2 - 1)) {
  ------------------
  |  Branch (826:7): [True: 2.09k, False: 2.55k]
  ------------------
  827|  2.09k|    return false;
  828|  2.09k|  }
  829|       |
  830|       |#ifndef ABSL_BIT_PACK_FLOATS
  831|       |  if (FloatTraits<FloatType>::kTargetBits == 64) {
  832|       |    *value = FloatTraits<FloatType>::Make(
  833|       |        (ret_man & 0x000FFFFFFFFFFFFFu) | 0x0010000000000000u,
  834|       |        static_cast<int>(ret_exp2) - 1023 - 52, negative);
  835|       |    return true;
  836|       |  } else if (FloatTraits<FloatType>::kTargetBits == 32) {
  837|       |    *value = FloatTraits<FloatType>::Make(
  838|       |        (static_cast<uint32_t>(ret_man) & 0x007FFFFFu) | 0x00800000u,
  839|       |        static_cast<int>(ret_exp2) - 127 - 23, negative);
  840|       |    return true;
  841|       |  }
  842|       |#else
  843|  2.55k|  if (FloatTraits<FloatType>::kTargetBits == 64) {
  ------------------
  |  Branch (843:7): [True: 2.55k, Folded]
  ------------------
  844|  2.55k|    uint64_t ret_bits = (ret_exp2 << 52) | (ret_man & 0x000FFFFFFFFFFFFFu);
  845|  2.55k|    if (negative) {
  ------------------
  |  Branch (845:9): [True: 311, False: 2.24k]
  ------------------
  846|    311|      ret_bits |= 0x8000000000000000u;
  847|    311|    }
  848|  2.55k|    *value = absl::bit_cast<double>(ret_bits);
  849|  2.55k|    return true;
  850|  2.55k|  } else if (FloatTraits<FloatType>::kTargetBits == 32) {
  ------------------
  |  Branch (850:14): [Folded, False: 0]
  ------------------
  851|      0|    uint32_t ret_bits = (static_cast<uint32_t>(ret_exp2) << 23) |
  852|      0|                        (static_cast<uint32_t>(ret_man) & 0x007FFFFFu);
  853|      0|    if (negative) {
  ------------------
  |  Branch (853:9): [True: 0, False: 0]
  ------------------
  854|      0|      ret_bits |= 0x80000000u;
  855|      0|    }
  856|      0|    *value = absl::bit_cast<float>(ret_bits);
  857|      0|    return true;
  858|      0|  }
  859|      0|#endif  // ABSL_BIT_PACK_FLOATS
  860|      0|  return false;
  861|  2.55k|}
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_126CalculateFromParsedDecimalIdEENS1_15CalculatedFloatERKNS0_16strings_internal11ParsedFloatE:
  620|  11.7k|    const strings_internal::ParsedFloat& parsed_decimal) {
  621|  11.7k|  CalculatedFloat result;
  622|       |
  623|       |  // Large or small enough decimal exponents will always result in overflow
  624|       |  // or underflow.
  625|  11.7k|  if (Power10Underflow(parsed_decimal.exponent)) {
  ------------------
  |  Branch (625:7): [True: 247, False: 11.4k]
  ------------------
  626|    247|    result.exponent = kUnderflow;
  627|    247|    return result;
  628|  11.4k|  } else if (Power10Overflow(parsed_decimal.exponent)) {
  ------------------
  |  Branch (628:14): [True: 62, False: 11.4k]
  ------------------
  629|     62|    result.exponent = kOverflow;
  630|     62|    return result;
  631|     62|  }
  632|       |
  633|       |  // Otherwise convert our power of 10 into a power of 2 times an integer
  634|       |  // mantissa, and multiply this by our parsed decimal mantissa.
  635|  11.4k|  uint128 wide_binary_mantissa = parsed_decimal.mantissa;
  636|  11.4k|  wide_binary_mantissa *= Power10Mantissa(parsed_decimal.exponent);
  637|  11.4k|  int binary_exponent = Power10Exponent(parsed_decimal.exponent);
  638|       |
  639|       |  // Discard bits that are inaccurate due to truncation error.  The magic
  640|       |  // `mantissa_width` constants below are justified in
  641|       |  // https://abseil.io/about/design/charconv. They represent the number of bits
  642|       |  // in `wide_binary_mantissa` that are guaranteed to be unaffected by error
  643|       |  // propagation.
  644|  11.4k|  bool mantissa_exact;
  645|  11.4k|  int mantissa_width;
  646|  11.4k|  if (parsed_decimal.subrange_begin) {
  ------------------
  |  Branch (646:7): [True: 7.98k, False: 3.41k]
  ------------------
  647|       |    // Truncated mantissa
  648|  7.98k|    mantissa_width = 58;
  649|  7.98k|    mantissa_exact = false;
  650|  7.98k|    binary_exponent +=
  651|  7.98k|        TruncateToBitWidth(mantissa_width, &wide_binary_mantissa);
  652|  7.98k|  } else if (!Power10Exact(parsed_decimal.exponent)) {
  ------------------
  |  Branch (652:14): [True: 2.78k, False: 638]
  ------------------
  653|       |    // Exact mantissa, truncated power of ten
  654|  2.78k|    mantissa_width = 63;
  655|  2.78k|    mantissa_exact = false;
  656|  2.78k|    binary_exponent +=
  657|  2.78k|        TruncateToBitWidth(mantissa_width, &wide_binary_mantissa);
  658|  2.78k|  } else {
  659|       |    // Product is exact
  660|    638|    mantissa_width = BitWidth(wide_binary_mantissa);
  661|    638|    mantissa_exact = true;
  662|    638|  }
  663|       |
  664|       |  // Shift into an FloatType-sized mantissa, and round to nearest.
  665|  11.4k|  const int shift =
  666|  11.4k|      NormalizedShiftSize<FloatType>(mantissa_width, binary_exponent);
  667|  11.4k|  bool result_exact;
  668|  11.4k|  binary_exponent += shift;
  669|  11.4k|  uint64_t binary_mantissa = ShiftRightAndRound(wide_binary_mantissa, shift,
  670|  11.4k|                                                mantissa_exact, &result_exact);
  671|  11.4k|  if (!result_exact) {
  ------------------
  |  Branch (671:7): [True: 8.51k, False: 2.88k]
  ------------------
  672|       |    // We could not determine the rounding direction using int128 math.  Use
  673|       |    // full resolution math instead.
  674|  8.51k|    if (MustRoundUp(binary_mantissa, binary_exponent, parsed_decimal)) {
  ------------------
  |  Branch (674:9): [True: 2.49k, False: 6.02k]
  ------------------
  675|  2.49k|      binary_mantissa += 1;
  676|  2.49k|    }
  677|  8.51k|  }
  678|       |
  679|  11.4k|  return CalculatedFloatFromRawValues<FloatType>(binary_mantissa,
  680|  11.4k|                                                 binary_exponent);
  681|  11.7k|}
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_116Power10UnderflowEi:
  288|  11.7k|bool Power10Underflow(int n) { return n < kPower10TableMinInclusive; }
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_115Power10OverflowEi:
  284|  11.4k|bool Power10Overflow(int n) { return n >= kPower10TableMaxExclusive; }
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_115Power10MantissaEi:
  270|  11.4k|uint64_t Power10Mantissa(int n) {
  271|  11.4k|  return kPower10MantissaHighTable[n - kPower10TableMinInclusive];
  272|  11.4k|}
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_115Power10ExponentEi:
  274|  11.4k|int Power10Exponent(int n) {
  275|       |  // The 217706 etc magic numbers encode the results as a formula instead of a
  276|       |  // table. Their equivalence (over the kPower10TableMinInclusive ..
  277|       |  // kPower10TableMaxExclusive range) is confirmed by
  278|       |  // https://github.com/google/wuffs/blob/315b2e52625ebd7b02d8fac13e3cd85ea374fb80/script/print-mpb-powers-of-10.go
  279|  11.4k|  return (217706 * n >> 16) - 63;
  280|  11.4k|}
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_118TruncateToBitWidthEiPNS0_7uint128E:
  348|  10.7k|int TruncateToBitWidth(int bit_width, absl::Nonnull<uint128*> value) {
  349|  10.7k|  const int current_bit_width = BitWidth(*value);
  350|  10.7k|  const int shift = current_bit_width - bit_width;
  351|  10.7k|  *value >>= shift;
  352|  10.7k|  return shift;
  353|  10.7k|}
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_112Power10ExactEi:
  293|  3.41k|bool Power10Exact(int n) { return n >= 0 && n <= 27; }
  ------------------
  |  Branch (293:35): [True: 2.12k, False: 1.29k]
  |  Branch (293:45): [True: 638, False: 1.48k]
  ------------------
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_18BitWidthENS0_7uint128E:
  317|  11.4k|int BitWidth(uint128 value) {
  318|  11.4k|  if (Uint128High64(value) == 0) {
  ------------------
  |  Branch (318:7): [True: 1.00k, False: 10.4k]
  ------------------
  319|       |    // This static_cast is only needed when using a std::bit_width()
  320|       |    // implementation that does not have the fix for LWG 3656 applied.
  321|  1.00k|    return static_cast<int>(bit_width(Uint128Low64(value)));
  322|  1.00k|  }
  323|  10.4k|  return 128 - countl_zero(Uint128High64(value));
  324|  11.4k|}
charconv.cc:_ZN4absl12lts_2024011612_GLOBAL__N_111MustRoundUpEmiRKNS0_16strings_internal11ParsedFloatE:
  513|  8.51k|                 const strings_internal::ParsedFloat& parsed_decimal) {
  514|       |  // 768 is the number of digits needed in the worst case.  We could determine a
  515|       |  // better limit dynamically based on the value of parsed_decimal.exponent.
  516|       |  // This would optimize pathological input cases only.  (Sane inputs won't have
  517|       |  // hundreds of digits of mantissa.)
  518|  8.51k|  absl::strings_internal::BigUnsigned<84> exact_mantissa;
  519|  8.51k|  int exact_exponent = exact_mantissa.ReadFloatMantissa(parsed_decimal, 768);
  520|       |
  521|       |  // Adjust the `guess` arguments to be halfway between A and B.
  522|  8.51k|  guess_mantissa = guess_mantissa * 2 + 1;
  523|  8.51k|  guess_exponent -= 1;
  524|       |
  525|       |  // In our comparison:
  526|       |  // lhs = exact = exact_mantissa * 10**exact_exponent
  527|       |  //             = exact_mantissa * 5**exact_exponent * 2**exact_exponent
  528|       |  // rhs = guess = guess_mantissa * 2**guess_exponent
  529|       |  //
  530|       |  // Because we are doing integer math, we can't directly deal with negative
  531|       |  // exponents.  We instead move these to the other side of the inequality.
  532|  8.51k|  absl::strings_internal::BigUnsigned<84>& lhs = exact_mantissa;
  533|  8.51k|  int comparison;
  534|  8.51k|  if (exact_exponent >= 0) {
  ------------------
  |  Branch (534:7): [True: 3.18k, False: 5.33k]
  ------------------
  535|  3.18k|    lhs.MultiplyByFiveToTheNth(exact_exponent);
  536|  3.18k|    absl::strings_internal::BigUnsigned<84> rhs(guess_mantissa);
  537|       |    // There are powers of 2 on both sides of the inequality; reduce this to
  538|       |    // a single bit-shift.
  539|  3.18k|    if (exact_exponent > guess_exponent) {
  ------------------
  |  Branch (539:9): [True: 0, False: 3.18k]
  ------------------
  540|      0|      lhs.ShiftLeft(exact_exponent - guess_exponent);
  541|  3.18k|    } else {
  542|  3.18k|      rhs.ShiftLeft(guess_exponent - exact_exponent);
  543|  3.18k|    }
  544|  3.18k|    comparison = Compare(lhs, rhs);
  545|  5.33k|  } else {
  546|       |    // Move the power of 5 to the other side of the equation, giving us:
  547|       |    // lhs = exact_mantissa * 2**exact_exponent
  548|       |    // rhs = guess_mantissa * 5**(-exact_exponent) * 2**guess_exponent
  549|  5.33k|    absl::strings_internal::BigUnsigned<84> rhs =
  550|  5.33k|        absl::strings_internal::BigUnsigned<84>::FiveToTheNth(-exact_exponent);
  551|  5.33k|    rhs.MultiplyBy(guess_mantissa);
  552|  5.33k|    if (exact_exponent > guess_exponent) {
  ------------------
  |  Branch (552:9): [True: 2.59k, False: 2.73k]
  ------------------
  553|  2.59k|      lhs.ShiftLeft(exact_exponent - guess_exponent);
  554|  2.73k|    } else {
  555|  2.73k|      rhs.ShiftLeft(guess_exponent - exact_exponent);
  556|  2.73k|    }
  557|  5.33k|    comparison = Compare(lhs, rhs);
  558|  5.33k|  }
  559|  8.51k|  if (comparison < 0) {
  ------------------
  |  Branch (559:7): [True: 5.66k, False: 2.85k]
  ------------------
  560|  5.66k|    return false;
  561|  5.66k|  } else if (comparison > 0) {
  ------------------
  |  Branch (561:14): [True: 2.42k, False: 438]
  ------------------
  562|  2.42k|    return true;
  563|  2.42k|  } else {
  564|       |    // When lhs == rhs, the decimal input is exactly between A and B.
  565|       |    // Round towards even -- round up only if the low bit of the initial
  566|       |    // `guess_mantissa` was a 1.  We shifted guess_mantissa left 1 bit at
  567|       |    // the beginning of this function, so test the 2nd bit here.
  568|    438|    return (guess_mantissa & 2) == 2;
  569|    438|  }
  570|  8.51k|}

_ZN4absl12lts_20240116anENS0_12chars_formatES1_:
   92|   102k|inline constexpr chars_format operator&(chars_format lhs, chars_format rhs) {
   93|   102k|  return static_cast<chars_format>(static_cast<int>(lhs) &
   94|   102k|                                   static_cast<int>(rhs));
   95|   102k|}

_ZN4absl12lts_2024011619WebSafeBase64EscapeENSt3__117basic_string_viewIcNS1_11char_traitsIcEEEE:
  914|      2|std::string WebSafeBase64Escape(absl::string_view src) {
  915|      2|  std::string dest;
  916|      2|  strings_internal::Base64EscapeInternal(
  917|      2|      reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
  918|      2|      false, strings_internal::kWebSafeBase64Chars);
  919|      2|  return dest;
  920|      2|}

_ZN4absl12lts_2024011616strings_internal11BigUnsignedILi84EE17ReadFloatMantissaERKNS1_11ParsedFloatEi:
  172|  8.51k|                                              int significant_digits) {
  173|  8.51k|  SetToZero();
  174|  8.51k|  assert(fp.type == FloatType::kNumber);
  175|       |
  176|  8.51k|  if (fp.subrange_begin == nullptr) {
  ------------------
  |  Branch (176:7): [True: 1.36k, False: 7.15k]
  ------------------
  177|       |    // We already exactly parsed the mantissa, so no more work is necessary.
  178|  1.36k|    words_[0] = fp.mantissa & 0xffffffffu;
  179|  1.36k|    words_[1] = fp.mantissa >> 32;
  180|  1.36k|    if (words_[1]) {
  ------------------
  |  Branch (180:9): [True: 292, False: 1.07k]
  ------------------
  181|    292|      size_ = 2;
  182|  1.07k|    } else if (words_[0]) {
  ------------------
  |  Branch (182:16): [True: 1.07k, False: 0]
  ------------------
  183|  1.07k|      size_ = 1;
  184|  1.07k|    }
  185|  1.36k|    return fp.exponent;
  186|  1.36k|  }
  187|  7.15k|  int exponent_adjust =
  188|  7.15k|      ReadDigits(fp.subrange_begin, fp.subrange_end, significant_digits);
  189|  7.15k|  return fp.literal_exponent + exponent_adjust;
  190|  8.51k|}
_ZN4absl12lts_2024011616strings_internal11BigUnsignedILi84EE12FiveToTheNthEi:
  289|  5.33k|    int n) {
  290|  5.33k|  BigUnsigned answer(1u);
  291|       |
  292|       |  // Seed from the table of large powers, if possible.
  293|  5.33k|  bool first_pass = true;
  294|  11.4k|  while (n >= kLargePowerOfFiveStep) {
  ------------------
  |  Branch (294:10): [True: 6.11k, False: 5.33k]
  ------------------
  295|  6.11k|    int big_power =
  296|  6.11k|        std::min(n / kLargePowerOfFiveStep, kLargestPowerOfFiveIndex);
  297|  6.11k|    if (first_pass) {
  ------------------
  |  Branch (297:9): [True: 3.61k, False: 2.50k]
  ------------------
  298|       |      // just copy, rather than multiplying by 1
  299|  3.61k|      std::copy_n(LargePowerOfFiveData(big_power),
  300|  3.61k|                  LargePowerOfFiveSize(big_power), answer.words_);
  301|  3.61k|      answer.size_ = LargePowerOfFiveSize(big_power);
  302|  3.61k|      first_pass = false;
  303|  3.61k|    } else {
  304|  2.50k|      answer.MultiplyBy(LargePowerOfFiveSize(big_power),
  305|  2.50k|                        LargePowerOfFiveData(big_power));
  306|  2.50k|    }
  307|  6.11k|    n -= kLargePowerOfFiveStep * big_power;
  308|  6.11k|  }
  309|  5.33k|  answer.MultiplyByFiveToTheNth(n);
  310|  5.33k|  return answer;
  311|  5.33k|}
_ZN4absl12lts_2024011616strings_internal11BigUnsignedILi84EE10ReadDigitsEPKcS5_i:
  194|  7.15k|                                       int significant_digits) {
  195|  7.15k|  assert(significant_digits <= Digits10() + 1);
  196|  7.15k|  SetToZero();
  197|       |
  198|  7.15k|  bool after_decimal_point = false;
  199|       |  // Discard any leading zeroes before the decimal point
  200|  7.58k|  while (begin < end && *begin == '0') {
  ------------------
  |  Branch (200:10): [True: 7.58k, False: 0]
  |  Branch (200:25): [True: 435, False: 7.15k]
  ------------------
  201|    435|    ++begin;
  202|    435|  }
  203|  7.15k|  int dropped_digits = 0;
  204|       |  // Discard any trailing zeroes.  These may or may not be after the decimal
  205|       |  // point.
  206|  8.23k|  while (begin < end && *std::prev(end) == '0') {
  ------------------
  |  Branch (206:10): [True: 8.23k, False: 0]
  |  Branch (206:25): [True: 1.07k, False: 7.15k]
  ------------------
  207|  1.07k|    --end;
  208|  1.07k|    ++dropped_digits;
  209|  1.07k|  }
  210|  7.15k|  if (begin < end && *std::prev(end) == '.') {
  ------------------
  |  Branch (210:7): [True: 7.15k, False: 0]
  |  Branch (210:22): [True: 340, False: 6.81k]
  ------------------
  211|       |    // If the string ends in '.', either before or after dropping zeroes, then
  212|       |    // drop the decimal point and look for more digits to drop.
  213|    340|    dropped_digits = 0;
  214|    340|    --end;
  215|    871|    while (begin < end && *std::prev(end) == '0') {
  ------------------
  |  Branch (215:12): [True: 871, False: 0]
  |  Branch (215:27): [True: 531, False: 340]
  ------------------
  216|    531|      --end;
  217|    531|      ++dropped_digits;
  218|    531|    }
  219|  6.81k|  } else if (dropped_digits) {
  ------------------
  |  Branch (219:14): [True: 815, False: 5.99k]
  ------------------
  220|       |    // We dropped digits, and aren't sure if they're before or after the decimal
  221|       |    // point.  Figure that out now.
  222|    815|    const char* dp = std::find(begin, end, '.');
  223|    815|    if (dp != end) {
  ------------------
  |  Branch (223:9): [True: 541, False: 274]
  ------------------
  224|       |      // The dropped trailing digits were after the decimal point, so don't
  225|       |      // count them.
  226|    541|      dropped_digits = 0;
  227|    541|    }
  228|    815|  }
  229|       |  // Any non-fraction digits we dropped need to be accounted for in our exponent
  230|       |  // adjustment.
  231|  7.15k|  int exponent_adjust = dropped_digits;
  232|       |
  233|  7.15k|  uint32_t queued = 0;
  234|  7.15k|  int digits_queued = 0;
  235|  1.63M|  for (; begin != end && significant_digits > 0; ++begin) {
  ------------------
  |  Branch (235:10): [True: 1.62M, False: 6.42k]
  |  Branch (235:26): [True: 1.62M, False: 726]
  ------------------
  236|  1.62M|    if (*begin == '.') {
  ------------------
  |  Branch (236:9): [True: 4.86k, False: 1.62M]
  ------------------
  237|  4.86k|      after_decimal_point = true;
  238|  4.86k|      continue;
  239|  4.86k|    }
  240|  1.62M|    if (after_decimal_point) {
  ------------------
  |  Branch (240:9): [True: 1.51M, False: 102k]
  ------------------
  241|       |      // For each fractional digit we emit in our parsed integer, adjust our
  242|       |      // decimal exponent to compensate.
  243|  1.51M|      --exponent_adjust;
  244|  1.51M|    }
  245|  1.62M|    char digit = (*begin - '0');
  246|  1.62M|    --significant_digits;
  247|  1.62M|    if (significant_digits == 0 && std::next(begin) != end &&
  ------------------
  |  Branch (247:9): [True: 929, False: 1.62M]
  |  Branch (247:36): [True: 726, False: 203]
  ------------------
  248|    726|        (digit == 0 || digit == 5)) {
  ------------------
  |  Branch (248:10): [True: 91, False: 635]
  |  Branch (248:24): [True: 216, False: 419]
  ------------------
  249|       |      // If this is the very last significant digit, but insignificant digits
  250|       |      // remain, we know that the last of those remaining significant digits is
  251|       |      // nonzero.  (If it wasn't, we would have stripped it before we got here.)
  252|       |      // So if this final digit is a 0 or 5, adjust it upward by 1.
  253|       |      //
  254|       |      // This adjustment is what allows incredibly large mantissas ending in
  255|       |      // 500000...000000000001 to correctly round up, rather than to nearest.
  256|    307|      ++digit;
  257|    307|    }
  258|  1.62M|    queued = 10 * queued + static_cast<uint32_t>(digit);
  259|  1.62M|    ++digits_queued;
  260|  1.62M|    if (digits_queued == kMaxSmallPowerOfTen) {
  ------------------
  |  Branch (260:9): [True: 177k, False: 1.44M]
  ------------------
  261|   177k|      MultiplyBy(kTenToNth[kMaxSmallPowerOfTen]);
  262|   177k|      AddWithCarry(0, queued);
  263|   177k|      queued = digits_queued = 0;
  264|   177k|    }
  265|  1.62M|  }
  266|       |  // Encode any remaining digits.
  267|  7.15k|  if (digits_queued) {
  ------------------
  |  Branch (267:7): [True: 6.63k, False: 516]
  ------------------
  268|  6.63k|    MultiplyBy(kTenToNth[digits_queued]);
  269|  6.63k|    AddWithCarry(0, queued);
  270|  6.63k|  }
  271|       |
  272|       |  // If any insignificant digits remain, we will drop them.  But if we have not
  273|       |  // yet read the decimal point, then we have to adjust the exponent to account
  274|       |  // for the dropped digits.
  275|  7.15k|  if (begin < end && !after_decimal_point) {
  ------------------
  |  Branch (275:7): [True: 726, False: 6.42k]
  |  Branch (275:22): [True: 2, False: 724]
  ------------------
  276|       |    // This call to std::find will result in a pointer either to the decimal
  277|       |    // point, or to the end of our buffer if there was none.
  278|       |    //
  279|       |    // Either way, [begin, decimal_point) will contain the set of dropped digits
  280|       |    // that require an exponent adjustment.
  281|      2|    const char* decimal_point = std::find(begin, end, '.');
  282|      2|    exponent_adjust += (decimal_point - begin);
  283|      2|  }
  284|  7.15k|  return exponent_adjust;
  285|  7.15k|}
_ZN4absl12lts_2024011616strings_internal11BigUnsignedILi84EE12MultiplyStepEiPKjii:
  316|   275k|                                          int other_size, int step) {
  317|   275k|  int this_i = std::min(original_size - 1, step);
  318|   275k|  int other_i = step - this_i;
  319|       |
  320|   275k|  uint64_t this_word = 0;
  321|   275k|  uint64_t carry = 0;
  322|  1.59M|  for (; this_i >= 0 && other_i < other_size; --this_i, ++other_i) {
  ------------------
  |  Branch (322:10): [True: 1.55M, False: 36.2k]
  |  Branch (322:25): [True: 1.31M, False: 239k]
  ------------------
  323|  1.31M|    uint64_t product = words_[this_i];
  324|  1.31M|    product *= other_words[other_i];
  325|  1.31M|    this_word += product;
  326|  1.31M|    carry += (this_word >> 32);
  327|  1.31M|    this_word &= 0xffffffff;
  328|  1.31M|  }
  329|   275k|  AddWithCarry(step + 1, carry);
  330|   275k|  words_[step] = this_word & 0xffffffff;
  331|   275k|  if (this_word > 0 && size_ <= step) {
  ------------------
  |  Branch (331:7): [True: 275k, False: 0]
  |  Branch (331:24): [True: 3.77k, False: 272k]
  ------------------
  332|  3.77k|    size_ = step + 1;
  333|  3.77k|  }
  334|   275k|}
charconv_bigint.cc:_ZN4absl12lts_2024011616strings_internal12_GLOBAL__N_120LargePowerOfFiveDataEi:
  152|  6.11k|const uint32_t* LargePowerOfFiveData(int i) {
  153|  6.11k|  return kLargePowersOfFive + i * (i - 1);
  154|  6.11k|}
charconv_bigint.cc:_ZN4absl12lts_2024011616strings_internal12_GLOBAL__N_120LargePowerOfFiveSizeEi:
  158|  9.73k|int LargePowerOfFiveSize(int i) { return 2 * i; }

_ZN4absl12lts_2024011616strings_internal11BigUnsignedILi84EEC2Ev:
   62|  8.51k|  BigUnsigned() : size_(0), words_{} {}
_ZN4absl12lts_2024011616strings_internal11BigUnsignedILi84EE22MultiplyByFiveToTheNthEi:
  164|  8.51k|  void MultiplyByFiveToTheNth(int n) {
  165|  39.0k|    while (n >= kMaxSmallPowerOfFive) {
  ------------------
  |  Branch (165:12): [True: 30.5k, False: 8.51k]
  ------------------
  166|  30.5k|      MultiplyBy(kFiveToNth[kMaxSmallPowerOfFive]);
  167|  30.5k|      n -= kMaxSmallPowerOfFive;
  168|  30.5k|    }
  169|  8.51k|    if (n > 0) {
  ------------------
  |  Branch (169:9): [True: 6.83k, False: 1.68k]
  ------------------
  170|  6.83k|      MultiplyBy(kFiveToNth[n]);
  171|  6.83k|    }
  172|  8.51k|  }
_ZN4absl12lts_2024011616strings_internal11BigUnsignedILi84EE10MultiplyByEj:
  130|   221k|  void MultiplyBy(uint32_t v) {
  131|   221k|    if (size_ == 0 || v == 1) {
  ------------------
  |  Branch (131:9): [True: 7.15k, False: 214k]
  |  Branch (131:23): [True: 0, False: 214k]
  ------------------
  132|  7.15k|      return;
  133|  7.15k|    }
  134|   214k|    if (v == 0) {
  ------------------
  |  Branch (134:9): [True: 0, False: 214k]
  ------------------
  135|      0|      SetToZero();
  136|      0|      return;
  137|      0|    }
  138|   214k|    const uint64_t factor = v;
  139|   214k|    uint64_t window = 0;
  140|  6.00M|    for (int i = 0; i < size_; ++i) {
  ------------------
  |  Branch (140:21): [True: 5.79M, False: 214k]
  ------------------
  141|  5.79M|      window += factor * words_[i];
  142|  5.79M|      words_[i] = window & 0xffffffff;
  143|  5.79M|      window >>= 32;
  144|  5.79M|    }
  145|       |    // If carry bits remain and there's space for them, grow size_.
  146|   214k|    if (window && size_ < max_words) {
  ------------------
  |  Branch (146:9): [True: 196k, False: 18.2k]
  |  Branch (146:19): [True: 196k, False: 0]
  ------------------
  147|   196k|      words_[size_] = window & 0xffffffff;
  148|   196k|      ++size_;
  149|   196k|    }
  150|   214k|  }
_ZN4absl12lts_2024011616strings_internal11BigUnsignedILi84EE9SetToZeroEv:
  199|  15.6k|  void SetToZero() {
  200|  15.6k|    std::fill_n(words_, size_, 0u);
  201|  15.6k|    size_ = 0;
  202|  15.6k|  }
_ZN4absl12lts_2024011616strings_internal11BigUnsignedILi84EEC2Em:
   64|  8.51k|      : size_((v >> 32) ? 2 : v ? 1 : 0),
  ------------------
  |  Branch (64:15): [True: 3.18k, False: 5.33k]
  |  Branch (64:31): [True: 5.33k, False: 0]
  ------------------
   65|  8.51k|        words_{static_cast<uint32_t>(v & 0xffffffffu),
   66|  8.51k|               static_cast<uint32_t>(v >> 32)} {}
_ZN4absl12lts_2024011616strings_internal11BigUnsignedILi84EE9ShiftLeftEi:
  102|  8.51k|  void ShiftLeft(int count) {
  103|  8.51k|    if (count > 0) {
  ------------------
  |  Branch (103:9): [True: 8.27k, False: 242]
  ------------------
  104|  8.27k|      const int word_shift = count / 32;
  105|  8.27k|      if (word_shift >= max_words) {
  ------------------
  |  Branch (105:11): [True: 0, False: 8.27k]
  ------------------
  106|      0|        SetToZero();
  107|      0|        return;
  108|      0|      }
  109|  8.27k|      size_ = (std::min)(size_ + word_shift, max_words);
  110|  8.27k|      count %= 32;
  111|  8.27k|      if (count == 0) {
  ------------------
  |  Branch (111:11): [True: 355, False: 7.92k]
  ------------------
  112|    355|        std::copy_backward(words_, words_ + size_ - word_shift, words_ + size_);
  113|  7.92k|      } else {
  114|   145k|        for (int i = (std::min)(size_, max_words - 1); i > word_shift; --i) {
  ------------------
  |  Branch (114:56): [True: 137k, False: 7.92k]
  ------------------
  115|   137k|          words_[i] = (words_[i - word_shift] << count) |
  116|   137k|                      (words_[i - word_shift - 1] >> (32 - count));
  117|   137k|        }
  118|  7.92k|        words_[word_shift] = words_[0] << count;
  119|       |        // Grow size_ if necessary.
  120|  7.92k|        if (size_ < max_words && words_[size_]) {
  ------------------
  |  Branch (120:13): [True: 7.92k, False: 0]
  |  Branch (120:34): [True: 2.62k, False: 5.29k]
  ------------------
  121|  2.62k|          ++size_;
  122|  2.62k|        }
  123|  7.92k|      }
  124|  8.27k|      std::fill_n(words_, word_shift, 0u);
  125|  8.27k|    }
  126|  8.51k|  }
_ZN4absl12lts_2024011616strings_internal7CompareILi84ELi84EEEiRKNS1_11BigUnsignedIXT_EEERKNS3_IXT0_EEE:
  353|  8.51k|int Compare(const BigUnsigned<N>& lhs, const BigUnsigned<M>& rhs) {
  354|  8.51k|  int limit = (std::max)(lhs.size(), rhs.size());
  355|  23.4k|  for (int i = limit - 1; i >= 0; --i) {
  ------------------
  |  Branch (355:27): [True: 22.9k, False: 438]
  ------------------
  356|  22.9k|    const uint32_t lhs_word = lhs.GetWord(i);
  357|  22.9k|    const uint32_t rhs_word = rhs.GetWord(i);
  358|  22.9k|    if (lhs_word < rhs_word) {
  ------------------
  |  Branch (358:9): [True: 5.66k, False: 17.3k]
  ------------------
  359|  5.66k|      return -1;
  360|  17.3k|    } else if (lhs_word > rhs_word) {
  ------------------
  |  Branch (360:16): [True: 2.42k, False: 14.8k]
  ------------------
  361|  2.42k|      return 1;
  362|  2.42k|    }
  363|  22.9k|  }
  364|    438|  return 0;
  365|  8.51k|}
_ZNK4absl12lts_2024011616strings_internal11BigUnsignedILi84EE4sizeEv:
  217|  17.0k|  int size() const { return size_; }
_ZNK4absl12lts_2024011616strings_internal11BigUnsignedILi84EE7GetWordEi:
  206|  45.9k|  uint32_t GetWord(int index) const {
  207|  45.9k|    if (index < 0 || index >= size_) {
  ------------------
  |  Branch (207:9): [True: 0, False: 45.9k]
  |  Branch (207:22): [True: 194, False: 45.7k]
  ------------------
  208|    194|      return 0;
  209|    194|    }
  210|  45.7k|    return words_[index];
  211|  45.9k|  }
_ZN4absl12lts_2024011616strings_internal11BigUnsignedILi84EE10MultiplyByEm:
  152|  5.33k|  void MultiplyBy(uint64_t v) {
  153|  5.33k|    uint32_t words[2];
  154|  5.33k|    words[0] = static_cast<uint32_t>(v);
  155|  5.33k|    words[1] = static_cast<uint32_t>(v >> 32);
  156|  5.33k|    if (words[1] == 0) {
  ------------------
  |  Branch (156:9): [True: 0, False: 5.33k]
  ------------------
  157|      0|      MultiplyBy(words[0]);
  158|  5.33k|    } else {
  159|  5.33k|      MultiplyBy(2, words);
  160|  5.33k|    }
  161|  5.33k|  }
_ZN4absl12lts_2024011616strings_internal11BigUnsignedILi84EE10MultiplyByEiPKj:
  270|  7.83k|  void MultiplyBy(int other_size, const uint32_t* other_words) {
  271|  7.83k|    const int original_size = size_;
  272|  7.83k|    const int first_step =
  273|  7.83k|        (std::min)(original_size + other_size - 2, max_words - 1);
  274|   283k|    for (int step = first_step; step >= 0; --step) {
  ------------------
  |  Branch (274:33): [True: 275k, False: 7.83k]
  ------------------
  275|   275k|      MultiplyStep(original_size, other_words, other_size, step);
  276|   275k|    }
  277|  7.83k|  }
_ZN4absl12lts_2024011616strings_internal11BigUnsignedILi84EE12AddWithCarryEij:
  280|   331k|  void AddWithCarry(int index, uint32_t value) {
  281|   331k|    if (value) {
  ------------------
  |  Branch (281:9): [True: 330k, False: 481]
  ------------------
  282|   682k|      while (index < max_words && value > 0) {
  ------------------
  |  Branch (282:14): [True: 682k, False: 0]
  |  Branch (282:35): [True: 351k, False: 330k]
  ------------------
  283|   351k|        words_[index] += value;
  284|       |        // carry if we overflowed in this word:
  285|   351k|        if (value > words_[index]) {
  ------------------
  |  Branch (285:13): [True: 20.9k, False: 330k]
  ------------------
  286|  20.9k|          value = 1;
  287|  20.9k|          ++index;
  288|   330k|        } else {
  289|   330k|          value = 0;
  290|   330k|        }
  291|   351k|      }
  292|   330k|      size_ = (std::min)(max_words, (std::max)(index + 1, size_));
  293|   330k|    }
  294|   331k|  }
_ZN4absl12lts_2024011616strings_internal11BigUnsignedILi84EE12AddWithCarryEim:
  296|   275k|  void AddWithCarry(int index, uint64_t value) {
  297|   275k|    if (value && index < max_words) {
  ------------------
  |  Branch (297:9): [True: 271k, False: 4.04k]
  |  Branch (297:18): [True: 271k, False: 0]
  ------------------
  298|   271k|      uint32_t high = value >> 32;
  299|   271k|      uint32_t low = value & 0xffffffff;
  300|   271k|      words_[index] += low;
  301|   271k|      if (words_[index] < low) {
  ------------------
  |  Branch (301:11): [True: 103k, False: 168k]
  ------------------
  302|   103k|        ++high;
  303|   103k|        if (high == 0) {
  ------------------
  |  Branch (303:13): [True: 0, False: 103k]
  ------------------
  304|       |          // Carry from the low word caused our high word to overflow.
  305|       |          // Short circuit here to do the right thing.
  306|      0|          AddWithCarry(index + 2, static_cast<uint32_t>(1));
  307|      0|          return;
  308|      0|        }
  309|   103k|      }
  310|   271k|      if (high > 0) {
  ------------------
  |  Branch (310:11): [True: 147k, False: 124k]
  ------------------
  311|   147k|        AddWithCarry(index + 1, high);
  312|   147k|      } else {
  313|       |        // Normally 32-bit AddWithCarry() sets size_, but since we don't call
  314|       |        // it when `high` is 0, do it ourselves here.
  315|   124k|        size_ = (std::min)(max_words, (std::max)(index + 1, size_));
  316|   124k|      }
  317|   271k|    }
  318|   275k|  }

_ZN4absl12lts_2024011616strings_internal10ParseFloatILi10EEENS1_11ParsedFloatEPKcS5_NS0_12chars_formatE:
  356|  16.9k|                                         chars_format format_flags) {
  357|  16.9k|  strings_internal::ParsedFloat result;
  358|       |
  359|       |  // Exit early if we're given an empty range.
  360|  16.9k|  if (begin == end) return result;
  ------------------
  |  Branch (360:7): [True: 12, False: 16.9k]
  ------------------
  361|       |
  362|       |  // Handle the infinity and NaN cases.
  363|  16.9k|  if (ParseInfinityOrNan(begin, end, &result)) {
  ------------------
  |  Branch (363:7): [True: 905, False: 15.9k]
  ------------------
  364|    905|    return result;
  365|    905|  }
  366|       |
  367|  15.9k|  const char* const mantissa_begin = begin;
  368|  17.2k|  while (begin < end && *begin == '0') {
  ------------------
  |  Branch (368:10): [True: 16.8k, False: 360]
  |  Branch (368:25): [True: 1.22k, False: 15.6k]
  ------------------
  369|  1.22k|    ++begin;  // skip leading zeros
  370|  1.22k|  }
  371|  15.9k|  uint64_t mantissa = 0;
  372|       |
  373|  15.9k|  int exponent_adjustment = 0;
  374|  15.9k|  bool mantissa_is_inexact = false;
  375|  15.9k|  int pre_decimal_digits = ConsumeDigits<base>(
  376|  15.9k|      begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact);
  377|  15.9k|  begin += pre_decimal_digits;
  378|  15.9k|  int digits_left;
  379|  15.9k|  if (pre_decimal_digits >= DigitLimit<base>()) {
  ------------------
  |  Branch (379:7): [True: 0, False: 15.9k]
  ------------------
  380|       |    // refuse to parse pathological inputs
  381|      0|    return result;
  382|  15.9k|  } else if (pre_decimal_digits > MantissaDigitsMax<base>()) {
  ------------------
  |  Branch (382:14): [True: 4.19k, False: 11.8k]
  ------------------
  383|       |    // We dropped some non-fraction digits on the floor.  Adjust our exponent
  384|       |    // to compensate.
  385|  4.19k|    exponent_adjustment =
  386|  4.19k|        static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>());
  387|  4.19k|    digits_left = 0;
  388|  11.8k|  } else {
  389|  11.8k|    digits_left =
  390|  11.8k|        static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits);
  391|  11.8k|  }
  392|  15.9k|  if (begin < end && *begin == '.') {
  ------------------
  |  Branch (392:7): [True: 13.0k, False: 2.97k]
  |  Branch (392:22): [True: 7.85k, False: 5.16k]
  ------------------
  393|  7.85k|    ++begin;
  394|  7.85k|    if (mantissa == 0) {
  ------------------
  |  Branch (394:9): [True: 4.16k, False: 3.68k]
  ------------------
  395|       |      // If we haven't seen any nonzero digits yet, keep skipping zeros.  We
  396|       |      // have to adjust the exponent to reflect the changed place value.
  397|  4.16k|      const char* begin_zeros = begin;
  398|  5.93k|      while (begin < end && *begin == '0') {
  ------------------
  |  Branch (398:14): [True: 5.36k, False: 569]
  |  Branch (398:29): [True: 1.77k, False: 3.59k]
  ------------------
  399|  1.77k|        ++begin;
  400|  1.77k|      }
  401|  4.16k|      int zeros_skipped = static_cast<int>(begin - begin_zeros);
  402|  4.16k|      if (zeros_skipped >= DigitLimit<base>()) {
  ------------------
  |  Branch (402:11): [True: 0, False: 4.16k]
  ------------------
  403|       |        // refuse to parse pathological inputs
  404|      0|        return result;
  405|      0|      }
  406|  4.16k|      exponent_adjustment -= static_cast<int>(zeros_skipped);
  407|  4.16k|    }
  408|  7.85k|    int post_decimal_digits = ConsumeDigits<base>(
  409|  7.85k|        begin, end, digits_left, &mantissa, &mantissa_is_inexact);
  410|  7.85k|    begin += post_decimal_digits;
  411|       |
  412|       |    // Since `mantissa` is an integer, each significant digit we read after
  413|       |    // the decimal point requires an adjustment to the exponent. "1.23e0" will
  414|       |    // be stored as `mantissa` == 123 and `exponent` == -2 (that is,
  415|       |    // "123e-2").
  416|  7.85k|    if (post_decimal_digits >= DigitLimit<base>()) {
  ------------------
  |  Branch (416:9): [True: 0, False: 7.85k]
  ------------------
  417|       |      // refuse to parse pathological inputs
  418|      0|      return result;
  419|  7.85k|    } else if (post_decimal_digits > digits_left) {
  ------------------
  |  Branch (419:16): [True: 5.17k, False: 2.67k]
  ------------------
  420|  5.17k|      exponent_adjustment -= digits_left;
  421|  5.17k|    } else {
  422|  2.67k|      exponent_adjustment -= post_decimal_digits;
  423|  2.67k|    }
  424|  7.85k|  }
  425|       |  // If we've found no mantissa whatsoever, this isn't a number.
  426|  15.9k|  if (mantissa_begin == begin) {
  ------------------
  |  Branch (426:7): [True: 68, False: 15.9k]
  ------------------
  427|     68|    return result;
  428|     68|  }
  429|       |  // A bare "." doesn't count as a mantissa either.
  430|  15.9k|  if (begin - mantissa_begin == 1 && *mantissa_begin == '.') {
  ------------------
  |  Branch (430:7): [True: 3.43k, False: 12.4k]
  |  Branch (430:38): [True: 11, False: 3.41k]
  ------------------
  431|     11|    return result;
  432|     11|  }
  433|       |
  434|  15.9k|  if (mantissa_is_inexact) {
  ------------------
  |  Branch (434:7): [True: 8.29k, False: 7.62k]
  ------------------
  435|       |    // We dropped significant digits on the floor.  Handle this appropriately.
  436|  8.29k|    if (base == 10) {
  ------------------
  |  Branch (436:9): [True: 8.29k, Folded]
  ------------------
  437|       |      // If we truncated significant decimal digits, store the full range of the
  438|       |      // mantissa for future big integer math for exact rounding.
  439|  8.29k|      result.subrange_begin = mantissa_begin;
  440|  8.29k|      result.subrange_end = begin;
  441|  8.29k|    } else if (base == 16) {
  ------------------
  |  Branch (441:16): [Folded, False: 0]
  ------------------
  442|       |      // If we truncated hex digits, reflect this fact by setting the low
  443|       |      // ("sticky") bit.  This allows for correct rounding in all cases.
  444|      0|      mantissa |= 1;
  445|      0|    }
  446|  8.29k|  }
  447|  15.9k|  result.mantissa = mantissa;
  448|       |
  449|  15.9k|  const char* const exponent_begin = begin;
  450|  15.9k|  result.literal_exponent = 0;
  451|  15.9k|  bool found_exponent = false;
  452|  15.9k|  if (AllowExponent(format_flags) && begin < end &&
  ------------------
  |  Branch (452:7): [True: 15.9k, False: 0]
  |  Branch (452:38): [True: 7.27k, False: 8.64k]
  ------------------
  453|  7.27k|      IsExponentCharacter<base>(*begin)) {
  ------------------
  |  Branch (453:7): [True: 7.20k, False: 77]
  ------------------
  454|  7.20k|    bool negative_exponent = false;
  455|  7.20k|    ++begin;
  456|  7.20k|    if (begin < end && *begin == '-') {
  ------------------
  |  Branch (456:9): [True: 7.19k, False: 5]
  |  Branch (456:24): [True: 3.16k, False: 4.03k]
  ------------------
  457|  3.16k|      negative_exponent = true;
  458|  3.16k|      ++begin;
  459|  4.04k|    } else if (begin < end && *begin == '+') {
  ------------------
  |  Branch (459:16): [True: 4.03k, False: 5]
  |  Branch (459:31): [True: 722, False: 3.31k]
  ------------------
  460|    722|      ++begin;
  461|    722|    }
  462|  7.20k|    const char* const exponent_digits_begin = begin;
  463|       |    // Exponent is always expressed in decimal, even for hexadecimal floats.
  464|  7.20k|    begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax,
  465|  7.20k|                               &result.literal_exponent, nullptr);
  466|  7.20k|    if (begin == exponent_digits_begin) {
  ------------------
  |  Branch (466:9): [True: 19, False: 7.18k]
  ------------------
  467|       |      // there were no digits where we expected an exponent.  We failed to read
  468|       |      // an exponent and should not consume the 'e' after all.  Rewind 'begin'.
  469|     19|      found_exponent = false;
  470|     19|      begin = exponent_begin;
  471|  7.18k|    } else {
  472|  7.18k|      found_exponent = true;
  473|  7.18k|      if (negative_exponent) {
  ------------------
  |  Branch (473:11): [True: 3.14k, False: 4.03k]
  ------------------
  474|  3.14k|        result.literal_exponent = -result.literal_exponent;
  475|  3.14k|      }
  476|  7.18k|    }
  477|  7.20k|  }
  478|       |
  479|  15.9k|  if (!found_exponent && RequireExponent(format_flags)) {
  ------------------
  |  Branch (479:7): [True: 8.73k, False: 7.18k]
  |  Branch (479:26): [True: 0, False: 8.73k]
  ------------------
  480|       |    // Provided flags required an exponent, but none was found.  This results
  481|       |    // in a failure to scan.
  482|      0|    return result;
  483|      0|  }
  484|       |
  485|       |  // Success!
  486|  15.9k|  result.type = strings_internal::FloatType::kNumber;
  487|  15.9k|  if (result.mantissa > 0) {
  ------------------
  |  Branch (487:7): [True: 14.8k, False: 1.09k]
  ------------------
  488|  14.8k|    result.exponent = result.literal_exponent +
  489|  14.8k|                      (DigitMagnitude<base>() * exponent_adjustment);
  490|  14.8k|  } else {
  491|  1.09k|    result.exponent = 0;
  492|  1.09k|  }
  493|  15.9k|  result.end = begin;
  494|  15.9k|  return result;
  495|  15.9k|}
_ZN4absl12lts_2024011616strings_internal10ParseFloatILi16EEENS1_11ParsedFloatEPKcS5_NS0_12chars_formatE:
  356|  4.64k|                                         chars_format format_flags) {
  357|  4.64k|  strings_internal::ParsedFloat result;
  358|       |
  359|       |  // Exit early if we're given an empty range.
  360|  4.64k|  if (begin == end) return result;
  ------------------
  |  Branch (360:7): [True: 4, False: 4.64k]
  ------------------
  361|       |
  362|       |  // Handle the infinity and NaN cases.
  363|  4.64k|  if (ParseInfinityOrNan(begin, end, &result)) {
  ------------------
  |  Branch (363:7): [True: 1, False: 4.64k]
  ------------------
  364|      1|    return result;
  365|      1|  }
  366|       |
  367|  4.64k|  const char* const mantissa_begin = begin;
  368|  5.19k|  while (begin < end && *begin == '0') {
  ------------------
  |  Branch (368:10): [True: 4.99k, False: 201]
  |  Branch (368:25): [True: 551, False: 4.43k]
  ------------------
  369|    551|    ++begin;  // skip leading zeros
  370|    551|  }
  371|  4.64k|  uint64_t mantissa = 0;
  372|       |
  373|  4.64k|  int exponent_adjustment = 0;
  374|  4.64k|  bool mantissa_is_inexact = false;
  375|  4.64k|  int pre_decimal_digits = ConsumeDigits<base>(
  376|  4.64k|      begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact);
  377|  4.64k|  begin += pre_decimal_digits;
  378|  4.64k|  int digits_left;
  379|  4.64k|  if (pre_decimal_digits >= DigitLimit<base>()) {
  ------------------
  |  Branch (379:7): [True: 0, False: 4.64k]
  ------------------
  380|       |    // refuse to parse pathological inputs
  381|      0|    return result;
  382|  4.64k|  } else if (pre_decimal_digits > MantissaDigitsMax<base>()) {
  ------------------
  |  Branch (382:14): [True: 330, False: 4.31k]
  ------------------
  383|       |    // We dropped some non-fraction digits on the floor.  Adjust our exponent
  384|       |    // to compensate.
  385|    330|    exponent_adjustment =
  386|    330|        static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>());
  387|    330|    digits_left = 0;
  388|  4.31k|  } else {
  389|  4.31k|    digits_left =
  390|  4.31k|        static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits);
  391|  4.31k|  }
  392|  4.64k|  if (begin < end && *begin == '.') {
  ------------------
  |  Branch (392:7): [True: 3.84k, False: 794]
  |  Branch (392:22): [True: 2.35k, False: 1.48k]
  ------------------
  393|  2.35k|    ++begin;
  394|  2.35k|    if (mantissa == 0) {
  ------------------
  |  Branch (394:9): [True: 1.20k, False: 1.15k]
  ------------------
  395|       |      // If we haven't seen any nonzero digits yet, keep skipping zeros.  We
  396|       |      // have to adjust the exponent to reflect the changed place value.
  397|  1.20k|      const char* begin_zeros = begin;
  398|  1.61k|      while (begin < end && *begin == '0') {
  ------------------
  |  Branch (398:14): [True: 1.17k, False: 448]
  |  Branch (398:29): [True: 418, False: 752]
  ------------------
  399|    418|        ++begin;
  400|    418|      }
  401|  1.20k|      int zeros_skipped = static_cast<int>(begin - begin_zeros);
  402|  1.20k|      if (zeros_skipped >= DigitLimit<base>()) {
  ------------------
  |  Branch (402:11): [True: 0, False: 1.20k]
  ------------------
  403|       |        // refuse to parse pathological inputs
  404|      0|        return result;
  405|      0|      }
  406|  1.20k|      exponent_adjustment -= static_cast<int>(zeros_skipped);
  407|  1.20k|    }
  408|  2.35k|    int post_decimal_digits = ConsumeDigits<base>(
  409|  2.35k|        begin, end, digits_left, &mantissa, &mantissa_is_inexact);
  410|  2.35k|    begin += post_decimal_digits;
  411|       |
  412|       |    // Since `mantissa` is an integer, each significant digit we read after
  413|       |    // the decimal point requires an adjustment to the exponent. "1.23e0" will
  414|       |    // be stored as `mantissa` == 123 and `exponent` == -2 (that is,
  415|       |    // "123e-2").
  416|  2.35k|    if (post_decimal_digits >= DigitLimit<base>()) {
  ------------------
  |  Branch (416:9): [True: 0, False: 2.35k]
  ------------------
  417|       |      // refuse to parse pathological inputs
  418|      0|      return result;
  419|  2.35k|    } else if (post_decimal_digits > digits_left) {
  ------------------
  |  Branch (419:16): [True: 406, False: 1.95k]
  ------------------
  420|    406|      exponent_adjustment -= digits_left;
  421|  1.95k|    } else {
  422|  1.95k|      exponent_adjustment -= post_decimal_digits;
  423|  1.95k|    }
  424|  2.35k|  }
  425|       |  // If we've found no mantissa whatsoever, this isn't a number.
  426|  4.64k|  if (mantissa_begin == begin) {
  ------------------
  |  Branch (426:7): [True: 5, False: 4.63k]
  ------------------
  427|      5|    return result;
  428|      5|  }
  429|       |  // A bare "." doesn't count as a mantissa either.
  430|  4.63k|  if (begin - mantissa_begin == 1 && *mantissa_begin == '.') {
  ------------------
  |  Branch (430:7): [True: 1.63k, False: 2.99k]
  |  Branch (430:38): [True: 2, False: 1.63k]
  ------------------
  431|      2|    return result;
  432|      2|  }
  433|       |
  434|  4.63k|  if (mantissa_is_inexact) {
  ------------------
  |  Branch (434:7): [True: 713, False: 3.92k]
  ------------------
  435|       |    // We dropped significant digits on the floor.  Handle this appropriately.
  436|    713|    if (base == 10) {
  ------------------
  |  Branch (436:9): [Folded, False: 713]
  ------------------
  437|       |      // If we truncated significant decimal digits, store the full range of the
  438|       |      // mantissa for future big integer math for exact rounding.
  439|      0|      result.subrange_begin = mantissa_begin;
  440|      0|      result.subrange_end = begin;
  441|    713|    } else if (base == 16) {
  ------------------
  |  Branch (441:16): [True: 713, Folded]
  ------------------
  442|       |      // If we truncated hex digits, reflect this fact by setting the low
  443|       |      // ("sticky") bit.  This allows for correct rounding in all cases.
  444|    713|      mantissa |= 1;
  445|    713|    }
  446|    713|  }
  447|  4.63k|  result.mantissa = mantissa;
  448|       |
  449|  4.63k|  const char* const exponent_begin = begin;
  450|  4.63k|  result.literal_exponent = 0;
  451|  4.63k|  bool found_exponent = false;
  452|  4.63k|  if (AllowExponent(format_flags) && begin < end &&
  ------------------
  |  Branch (452:7): [True: 4.63k, False: 0]
  |  Branch (452:38): [True: 1.95k, False: 2.68k]
  ------------------
  453|  1.95k|      IsExponentCharacter<base>(*begin)) {
  ------------------
  |  Branch (453:7): [True: 1.92k, False: 25]
  ------------------
  454|  1.92k|    bool negative_exponent = false;
  455|  1.92k|    ++begin;
  456|  1.92k|    if (begin < end && *begin == '-') {
  ------------------
  |  Branch (456:9): [True: 1.91k, False: 8]
  |  Branch (456:24): [True: 1.33k, False: 583]
  ------------------
  457|  1.33k|      negative_exponent = true;
  458|  1.33k|      ++begin;
  459|  1.33k|    } else if (begin < end && *begin == '+') {
  ------------------
  |  Branch (459:16): [True: 583, False: 8]
  |  Branch (459:31): [True: 301, False: 282]
  ------------------
  460|    301|      ++begin;
  461|    301|    }
  462|  1.92k|    const char* const exponent_digits_begin = begin;
  463|       |    // Exponent is always expressed in decimal, even for hexadecimal floats.
  464|  1.92k|    begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax,
  465|  1.92k|                               &result.literal_exponent, nullptr);
  466|  1.92k|    if (begin == exponent_digits_begin) {
  ------------------
  |  Branch (466:9): [True: 18, False: 1.90k]
  ------------------
  467|       |      // there were no digits where we expected an exponent.  We failed to read
  468|       |      // an exponent and should not consume the 'e' after all.  Rewind 'begin'.
  469|     18|      found_exponent = false;
  470|     18|      begin = exponent_begin;
  471|  1.90k|    } else {
  472|  1.90k|      found_exponent = true;
  473|  1.90k|      if (negative_exponent) {
  ------------------
  |  Branch (473:11): [True: 1.33k, False: 574]
  ------------------
  474|  1.33k|        result.literal_exponent = -result.literal_exponent;
  475|  1.33k|      }
  476|  1.90k|    }
  477|  1.92k|  }
  478|       |
  479|  4.63k|  if (!found_exponent && RequireExponent(format_flags)) {
  ------------------
  |  Branch (479:7): [True: 2.72k, False: 1.90k]
  |  Branch (479:26): [True: 0, False: 2.72k]
  ------------------
  480|       |    // Provided flags required an exponent, but none was found.  This results
  481|       |    // in a failure to scan.
  482|      0|    return result;
  483|      0|  }
  484|       |
  485|       |  // Success!
  486|  4.63k|  result.type = strings_internal::FloatType::kNumber;
  487|  4.63k|  if (result.mantissa > 0) {
  ------------------
  |  Branch (487:7): [True: 3.89k, False: 741]
  ------------------
  488|  3.89k|    result.exponent = result.literal_exponent +
  489|  3.89k|                      (DigitMagnitude<base>() * exponent_adjustment);
  490|  3.89k|  } else {
  491|    741|    result.exponent = 0;
  492|    741|  }
  493|  4.63k|  result.end = begin;
  494|  4.63k|  return result;
  495|  4.63k|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_113AllowExponentENS0_12chars_formatE:
  122|  20.5k|bool AllowExponent(chars_format flags) {
  123|  20.5k|  bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
  124|  20.5k|  bool scientific =
  125|  20.5k|      (flags & chars_format::scientific) == chars_format::scientific;
  126|  20.5k|  return scientific || !fixed;
  ------------------
  |  Branch (126:10): [True: 20.5k, False: 0]
  |  Branch (126:24): [True: 0, False: 0]
  ------------------
  127|  20.5k|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_115RequireExponentENS0_12chars_formatE:
  130|  11.4k|bool RequireExponent(chars_format flags) {
  131|  11.4k|  bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
  132|  11.4k|  bool scientific =
  133|  11.4k|      (flags & chars_format::scientific) == chars_format::scientific;
  134|  11.4k|  return scientific && !fixed;
  ------------------
  |  Branch (134:10): [True: 11.4k, False: 0]
  |  Branch (134:24): [True: 0, False: 11.4k]
  ------------------
  135|  11.4k|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_119IsExponentCharacterILi10EEEbc:
  201|  7.27k|bool IsExponentCharacter<10>(char ch) {
  202|  7.27k|  return ch == 'e' || ch == 'E';
  ------------------
  |  Branch (202:10): [True: 4.40k, False: 2.87k]
  |  Branch (202:23): [True: 2.79k, False: 77]
  ------------------
  203|  7.27k|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_119IsExponentCharacterILi16EEEbc:
  206|  1.95k|bool IsExponentCharacter<16>(char ch) {
  207|  1.95k|  return ch == 'p' || ch == 'P';
  ------------------
  |  Branch (207:10): [True: 519, False: 1.43k]
  |  Branch (207:23): [True: 1.40k, False: 25]
  ------------------
  208|  1.95k|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_117MantissaDigitsMaxILi10EEEiv:
  211|  47.9k|constexpr int MantissaDigitsMax<10>() {
  212|  47.9k|  return kDecimalMantissaDigitsMax;
  213|  47.9k|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_117MantissaDigitsMaxILi16EEEiv:
  215|  13.9k|constexpr int MantissaDigitsMax<16>() {
  216|  13.9k|  return kHexadecimalMantissaDigitsMax;
  217|  13.9k|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_110DigitLimitILi10EEEiv:
  220|  28.0k|constexpr int DigitLimit<10>() {
  221|  28.0k|  return kDecimalDigitLimit;
  222|  28.0k|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_110DigitLimitILi16EEEiv:
  224|  8.19k|constexpr int DigitLimit<16>() {
  225|  8.19k|  return kHexadecimalDigitLimit;
  226|  8.19k|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_114DigitMagnitudeILi10EEEiv:
  229|  14.8k|constexpr int DigitMagnitude<10>() {
  230|  14.8k|  return 1;
  231|  14.8k|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_114DigitMagnitudeILi16EEEiv:
  233|  3.89k|constexpr int DigitMagnitude<16>() {
  234|  3.89k|  return 4;
  235|  3.89k|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_118ParseInfinityOrNanEPKcS3_PNS0_16strings_internal11ParsedFloatE:
  298|  21.5k|                        strings_internal::ParsedFloat* out) {
  299|  21.5k|  if (end - begin < 3) {
  ------------------
  |  Branch (299:7): [True: 3.58k, False: 17.9k]
  ------------------
  300|  3.58k|    return false;
  301|  3.58k|  }
  302|  17.9k|  switch (*begin) {
  303|    242|    case 'i':
  ------------------
  |  Branch (303:5): [True: 242, False: 17.7k]
  ------------------
  304|    497|    case 'I': {
  ------------------
  |  Branch (304:5): [True: 255, False: 17.7k]
  ------------------
  305|       |      // An infinity string consists of the characters "inf" or "infinity",
  306|       |      // case insensitive.
  307|    497|      if (strings_internal::memcasecmp(begin + 1, "nf", 2) != 0) {
  ------------------
  |  Branch (307:11): [True: 9, False: 488]
  ------------------
  308|      9|        return false;
  309|      9|      }
  310|    488|      out->type = strings_internal::FloatType::kInfinity;
  311|    488|      if (end - begin >= 8 &&
  ------------------
  |  Branch (311:11): [True: 13, False: 475]
  ------------------
  312|     13|          strings_internal::memcasecmp(begin + 3, "inity", 5) == 0) {
  ------------------
  |  Branch (312:11): [True: 0, False: 13]
  ------------------
  313|      0|        out->end = begin + 8;
  314|    488|      } else {
  315|    488|        out->end = begin + 3;
  316|    488|      }
  317|    488|      return true;
  318|    497|    }
  319|    145|    case 'n':
  ------------------
  |  Branch (319:5): [True: 145, False: 17.8k]
  ------------------
  320|    428|    case 'N': {
  ------------------
  |  Branch (320:5): [True: 283, False: 17.6k]
  ------------------
  321|       |      // A NaN consists of the characters "nan", case insensitive, optionally
  322|       |      // followed by a parenthesized sequence of zero or more alphanumeric
  323|       |      // characters and/or underscores.
  324|    428|      if (strings_internal::memcasecmp(begin + 1, "an", 2) != 0) {
  ------------------
  |  Branch (324:11): [True: 10, False: 418]
  ------------------
  325|     10|        return false;
  326|     10|      }
  327|    418|      out->type = strings_internal::FloatType::kNan;
  328|    418|      out->end = begin + 3;
  329|       |      // NaN is allowed to be followed by a parenthesized string, consisting of
  330|       |      // only the characters [a-zA-Z0-9_].  Match that if it's present.
  331|    418|      begin += 3;
  332|    418|      if (begin < end && *begin == '(') {
  ------------------
  |  Branch (332:11): [True: 2, False: 416]
  |  Branch (332:26): [True: 0, False: 2]
  ------------------
  333|      0|        const char* nan_begin = begin + 1;
  334|      0|        while (nan_begin < end && IsNanChar(*nan_begin)) {
  ------------------
  |  Branch (334:16): [True: 0, False: 0]
  |  Branch (334:35): [True: 0, False: 0]
  ------------------
  335|      0|          ++nan_begin;
  336|      0|        }
  337|      0|        if (nan_begin < end && *nan_begin == ')') {
  ------------------
  |  Branch (337:13): [True: 0, False: 0]
  |  Branch (337:32): [True: 0, False: 0]
  ------------------
  338|       |          // We found an extra NaN specifier range
  339|      0|          out->subrange_begin = begin + 1;
  340|      0|          out->subrange_end = nan_begin;
  341|      0|          out->end = nan_begin + 1;
  342|      0|        }
  343|      0|      }
  344|    418|      return true;
  345|    428|    }
  346|  17.0k|    default:
  ------------------
  |  Branch (346:5): [True: 17.0k, False: 925]
  ------------------
  347|  17.0k|      return false;
  348|  17.9k|  }
  349|  17.9k|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_113ConsumeDigitsILi10EmEEiPKcS4_iPT0_Pb:
  250|  23.8k|                  bool* dropped_nonzero_digit) {
  251|  23.8k|  if (base == 10) {
  ------------------
  |  Branch (251:7): [True: 23.8k, Folded]
  ------------------
  252|  23.8k|    assert(max_digits <= std::numeric_limits<T>::digits10);
  253|  23.8k|  } else if (base == 16) {
  ------------------
  |  Branch (253:14): [Folded, False: 0]
  ------------------
  254|      0|    assert(max_digits * 4 <= std::numeric_limits<T>::digits);
  255|      0|  }
  256|  23.8k|  const char* const original_begin = begin;
  257|       |
  258|       |  // Skip leading zeros, but only if *out is zero.
  259|       |  // They don't cause an overflow so we don't have to count them for
  260|       |  // `max_digits`.
  261|  23.8k|  while (!*out && end != begin && *begin == '0') ++begin;
  ------------------
  |  Branch (261:10): [True: 20.1k, False: 3.68k]
  |  Branch (261:19): [True: 19.2k, False: 929]
  |  Branch (261:35): [True: 0, False: 19.2k]
  ------------------
  262|       |
  263|  23.8k|  T accumulator = *out;
  264|  23.8k|  const char* significant_digits_end =
  265|  23.8k|      (end - begin > max_digits) ? begin + max_digits : end;
  ------------------
  |  Branch (265:7): [True: 13.6k, False: 10.2k]
  ------------------
  266|   211k|  while (begin < significant_digits_end && IsDigit<base>(*begin)) {
  ------------------
  |  Branch (266:10): [True: 198k, False: 13.3k]
  |  Branch (266:44): [True: 187k, False: 10.5k]
  ------------------
  267|       |    // Do not guard against *out overflow; max_digits was chosen to avoid this.
  268|       |    // Do assert against it, to detect problems in debug builds.
  269|   187k|    auto digit = static_cast<T>(ToDigit<base>(*begin));
  270|   187k|    assert(accumulator * base >= accumulator);
  271|   187k|    accumulator *= base;
  272|   187k|    assert(accumulator + digit >= accumulator);
  273|   187k|    accumulator += digit;
  274|   187k|    ++begin;
  275|   187k|  }
  276|  23.8k|  bool dropped_nonzero = false;
  277|  5.76M|  while (begin < end && IsDigit<base>(*begin)) {
  ------------------
  |  Branch (277:10): [True: 5.75M, False: 8.64k]
  |  Branch (277:25): [True: 5.73M, False: 15.2k]
  ------------------
  278|  5.73M|    dropped_nonzero = dropped_nonzero || (*begin != '0');
  ------------------
  |  Branch (278:23): [True: 5.72M, False: 11.2k]
  |  Branch (278:42): [True: 9.30k, False: 1.99k]
  ------------------
  279|  5.73M|    ++begin;
  280|  5.73M|  }
  281|  23.8k|  if (dropped_nonzero && dropped_nonzero_digit != nullptr) {
  ------------------
  |  Branch (281:7): [True: 9.30k, False: 14.5k]
  |  Branch (281:26): [True: 9.30k, False: 0]
  ------------------
  282|  9.30k|    *dropped_nonzero_digit = true;
  283|  9.30k|  }
  284|  23.8k|  *out = accumulator;
  285|  23.8k|  return static_cast<int>(begin - original_begin);
  286|  23.8k|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_17IsDigitILi10EEEbc:
  183|  5.97M|bool IsDigit<10>(char ch) {
  184|  5.97M|  return ch >= '0' && ch <= '9';
  ------------------
  |  Branch (184:10): [True: 5.96M, False: 14.3k]
  |  Branch (184:23): [True: 5.95M, False: 11.4k]
  ------------------
  185|  5.97M|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_17ToDigitILi10EEEjc:
  192|   214k|unsigned ToDigit<10>(char ch) {
  193|   214k|  return static_cast<unsigned>(ch - '0');
  194|   214k|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_113ConsumeDigitsILi10EiEEiPKcS4_iPT0_Pb:
  250|  9.12k|                  bool* dropped_nonzero_digit) {
  251|  9.12k|  if (base == 10) {
  ------------------
  |  Branch (251:7): [True: 9.12k, Folded]
  ------------------
  252|  9.12k|    assert(max_digits <= std::numeric_limits<T>::digits10);
  253|  9.12k|  } else if (base == 16) {
  ------------------
  |  Branch (253:14): [Folded, False: 0]
  ------------------
  254|      0|    assert(max_digits * 4 <= std::numeric_limits<T>::digits);
  255|      0|  }
  256|  9.12k|  const char* const original_begin = begin;
  257|       |
  258|       |  // Skip leading zeros, but only if *out is zero.
  259|       |  // They don't cause an overflow so we don't have to count them for
  260|       |  // `max_digits`.
  261|  10.1k|  while (!*out && end != begin && *begin == '0') ++begin;
  ------------------
  |  Branch (261:10): [True: 10.1k, False: 0]
  |  Branch (261:19): [True: 9.62k, False: 533]
  |  Branch (261:35): [True: 1.03k, False: 8.59k]
  ------------------
  262|       |
  263|  9.12k|  T accumulator = *out;
  264|  9.12k|  const char* significant_digits_end =
  265|  9.12k|      (end - begin > max_digits) ? begin + max_digits : end;
  ------------------
  |  Branch (265:7): [True: 265, False: 8.86k]
  ------------------
  266|  36.1k|  while (begin < significant_digits_end && IsDigit<base>(*begin)) {
  ------------------
  |  Branch (266:10): [True: 27.0k, False: 9.08k]
  |  Branch (266:44): [True: 26.9k, False: 43]
  ------------------
  267|       |    // Do not guard against *out overflow; max_digits was chosen to avoid this.
  268|       |    // Do assert against it, to detect problems in debug builds.
  269|  26.9k|    auto digit = static_cast<T>(ToDigit<base>(*begin));
  270|  26.9k|    assert(accumulator * base >= accumulator);
  271|  26.9k|    accumulator *= base;
  272|  26.9k|    assert(accumulator + digit >= accumulator);
  273|  26.9k|    accumulator += digit;
  274|  26.9k|    ++begin;
  275|  26.9k|  }
  276|  9.12k|  bool dropped_nonzero = false;
  277|  10.0k|  while (begin < end && IsDigit<base>(*begin)) {
  ------------------
  |  Branch (277:10): [True: 941, False: 9.07k]
  |  Branch (277:25): [True: 895, False: 46]
  ------------------
  278|    895|    dropped_nonzero = dropped_nonzero || (*begin != '0');
  ------------------
  |  Branch (278:23): [True: 634, False: 261]
  |  Branch (278:42): [True: 261, False: 0]
  ------------------
  279|    895|    ++begin;
  280|    895|  }
  281|  9.12k|  if (dropped_nonzero && dropped_nonzero_digit != nullptr) {
  ------------------
  |  Branch (281:7): [True: 261, False: 8.86k]
  |  Branch (281:26): [True: 0, False: 261]
  ------------------
  282|      0|    *dropped_nonzero_digit = true;
  283|      0|  }
  284|  9.12k|  *out = accumulator;
  285|  9.12k|  return static_cast<int>(begin - original_begin);
  286|  9.12k|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_113ConsumeDigitsILi16EmEEiPKcS4_iPT0_Pb:
  250|  6.99k|                  bool* dropped_nonzero_digit) {
  251|  6.99k|  if (base == 10) {
  ------------------
  |  Branch (251:7): [Folded, False: 6.99k]
  ------------------
  252|      0|    assert(max_digits <= std::numeric_limits<T>::digits10);
  253|  6.99k|  } else if (base == 16) {
  ------------------
  |  Branch (253:14): [True: 6.99k, Folded]
  ------------------
  254|  6.99k|    assert(max_digits * 4 <= std::numeric_limits<T>::digits);
  255|  6.99k|  }
  256|  6.99k|  const char* const original_begin = begin;
  257|       |
  258|       |  // Skip leading zeros, but only if *out is zero.
  259|       |  // They don't cause an overflow so we don't have to count them for
  260|       |  // `max_digits`.
  261|  6.99k|  while (!*out && end != begin && *begin == '0') ++begin;
  ------------------
  |  Branch (261:10): [True: 5.84k, False: 1.15k]
  |  Branch (261:19): [True: 5.19k, False: 649]
  |  Branch (261:35): [True: 0, False: 5.19k]
  ------------------
  262|       |
  263|  6.99k|  T accumulator = *out;
  264|  6.99k|  const char* significant_digits_end =
  265|  6.99k|      (end - begin > max_digits) ? begin + max_digits : end;
  ------------------
  |  Branch (265:7): [True: 1.16k, False: 5.83k]
  ------------------
  266|  27.3k|  while (begin < significant_digits_end && IsDigit<base>(*begin)) {
  ------------------
  |  Branch (266:10): [True: 24.6k, False: 2.72k]
  |  Branch (266:44): [True: 20.3k, False: 4.27k]
  ------------------
  267|       |    // Do not guard against *out overflow; max_digits was chosen to avoid this.
  268|       |    // Do assert against it, to detect problems in debug builds.
  269|  20.3k|    auto digit = static_cast<T>(ToDigit<base>(*begin));
  270|  20.3k|    assert(accumulator * base >= accumulator);
  271|  20.3k|    accumulator *= base;
  272|  20.3k|    assert(accumulator + digit >= accumulator);
  273|  20.3k|    accumulator += digit;
  274|  20.3k|    ++begin;
  275|  20.3k|  }
  276|  6.99k|  bool dropped_nonzero = false;
  277|  2.11M|  while (begin < end && IsDigit<base>(*begin)) {
  ------------------
  |  Branch (277:10): [True: 2.10M, False: 2.68k]
  |  Branch (277:25): [True: 2.10M, False: 4.31k]
  ------------------
  278|  2.10M|    dropped_nonzero = dropped_nonzero || (*begin != '0');
  ------------------
  |  Branch (278:23): [True: 2.10M, False: 1.24k]
  |  Branch (278:42): [True: 717, False: 524]
  ------------------
  279|  2.10M|    ++begin;
  280|  2.10M|  }
  281|  6.99k|  if (dropped_nonzero && dropped_nonzero_digit != nullptr) {
  ------------------
  |  Branch (281:7): [True: 717, False: 6.28k]
  |  Branch (281:26): [True: 717, False: 0]
  ------------------
  282|    717|    *dropped_nonzero_digit = true;
  283|    717|  }
  284|  6.99k|  *out = accumulator;
  285|  6.99k|  return static_cast<int>(begin - original_begin);
  286|  6.99k|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_17IsDigitILi16EEEbc:
  187|  2.13M|bool IsDigit<16>(char ch) {
  188|  2.13M|  return kAsciiToInt[static_cast<unsigned char>(ch)] >= 0;
  189|  2.13M|}
charconv_parse.cc:_ZN4absl12lts_2024011612_GLOBAL__N_17ToDigitILi16EEEjc:
  196|  20.3k|unsigned ToDigit<16>(char ch) {
  197|  20.3k|  return static_cast<unsigned>(kAsciiToInt[static_cast<unsigned char>(ch)]);
  198|  20.3k|}

_ZN4absl12lts_2024011616strings_internal33CalculateBase64EscapedLenInternalEmb:
   35|      2|size_t CalculateBase64EscapedLenInternal(size_t input_len, bool do_padding) {
   36|       |  // Base64 encodes three bytes of input at a time. If the input is not
   37|       |  // divisible by three, we pad as appropriate.
   38|       |  //
   39|       |  // Base64 encodes each three bytes of input into four bytes of output.
   40|      2|  size_t len = (input_len / 3) * 4;
   41|       |
   42|       |  // Since all base 64 input is an integral number of octets, only the following
   43|       |  // cases can arise:
   44|      2|  if (input_len % 3 == 0) {
  ------------------
  |  Branch (44:7): [True: 0, False: 2]
  ------------------
   45|       |    // (from https://tools.ietf.org/html/rfc3548)
   46|       |    // (1) the final quantum of encoding input is an integral multiple of 24
   47|       |    // bits; here, the final unit of encoded output will be an integral
   48|       |    // multiple of 4 characters with no "=" padding,
   49|      2|  } else if (input_len % 3 == 1) {
  ------------------
  |  Branch (49:14): [True: 0, False: 2]
  ------------------
   50|       |    // (from https://tools.ietf.org/html/rfc3548)
   51|       |    // (2) the final quantum of encoding input is exactly 8 bits; here, the
   52|       |    // final unit of encoded output will be two characters followed by two
   53|       |    // "=" padding characters, or
   54|      0|    len += 2;
   55|      0|    if (do_padding) {
  ------------------
  |  Branch (55:9): [True: 0, False: 0]
  ------------------
   56|      0|      len += 2;
   57|      0|    }
   58|      2|  } else {  // (input_len % 3 == 2)
   59|       |    // (from https://tools.ietf.org/html/rfc3548)
   60|       |    // (3) the final quantum of encoding input is exactly 16 bits; here, the
   61|       |    // final unit of encoded output will be three characters followed by one
   62|       |    // "=" padding character.
   63|      2|    len += 3;
   64|      2|    if (do_padding) {
  ------------------
  |  Branch (64:9): [True: 0, False: 2]
  ------------------
   65|      0|      len += 1;
   66|      0|    }
   67|      2|  }
   68|       |
   69|       |  assert(len >= input_len);  // make sure we didn't overflow
   70|      2|  return len;
   71|      2|}
_ZN4absl12lts_2024011616strings_internal20Base64EscapeInternalEPKhmPcmPKcb:
   90|      2|                            bool do_padding) {
   91|      2|  static const char kPad64 = '=';
   92|       |
   93|      2|  if (szsrc * 4 > szdest * 3) return 0;
  ------------------
  |  Branch (93:7): [True: 0, False: 2]
  ------------------
   94|       |
   95|      2|  char* cur_dest = dest;
   96|      2|  const unsigned char* cur_src = src;
   97|       |
   98|      2|  char* const limit_dest = dest + szdest;
   99|      2|  const unsigned char* const limit_src = src + szsrc;
  100|       |
  101|       |  // (from https://tools.ietf.org/html/rfc3548)
  102|       |  // Special processing is performed if fewer than 24 bits are available
  103|       |  // at the end of the data being encoded.  A full encoding quantum is
  104|       |  // always completed at the end of a quantity.  When fewer than 24 input
  105|       |  // bits are available in an input group, zero bits are added (on the
  106|       |  // right) to form an integral number of 6-bit groups.
  107|       |  //
  108|       |  // If do_padding is true, padding at the end of the data is performed. This
  109|       |  // output padding uses the '=' character.
  110|       |
  111|       |  // Three bytes of data encodes to four characters of cyphertext.
  112|       |  // So we can pump through three-byte chunks atomically.
  113|      2|  if (szsrc >= 3) {                    // "limit_src - 3" is UB if szsrc < 3.
  ------------------
  |  Branch (113:7): [True: 2, False: 0]
  ------------------
  114|     22|    while (cur_src < limit_src - 3) {  // While we have >= 32 bits.
  ------------------
  |  Branch (114:12): [True: 20, False: 2]
  ------------------
  115|     20|      uint32_t in = absl::big_endian::Load32(cur_src) >> 8;
  116|       |
  117|     20|      cur_dest[0] = base64[in >> 18];
  118|     20|      in &= 0x3FFFF;
  119|     20|      cur_dest[1] = base64[in >> 12];
  120|     20|      in &= 0xFFF;
  121|     20|      cur_dest[2] = base64[in >> 6];
  122|     20|      in &= 0x3F;
  123|     20|      cur_dest[3] = base64[in];
  124|       |
  125|     20|      cur_dest += 4;
  126|     20|      cur_src += 3;
  127|     20|    }
  128|      2|  }
  129|       |  // To save time, we didn't update szdest or szsrc in the loop.  So do it now.
  130|      2|  szdest = static_cast<size_t>(limit_dest - cur_dest);
  131|      2|  szsrc = static_cast<size_t>(limit_src - cur_src);
  132|       |
  133|       |  /* now deal with the tail (<=3 bytes) */
  134|      2|  switch (szsrc) {
  135|      0|    case 0:
  ------------------
  |  Branch (135:5): [True: 0, False: 2]
  ------------------
  136|       |      // Nothing left; nothing more to do.
  137|      0|      break;
  138|      0|    case 1: {
  ------------------
  |  Branch (138:5): [True: 0, False: 2]
  ------------------
  139|       |      // One byte left: this encodes to two characters, and (optionally)
  140|       |      // two pad characters to round out the four-character cypherblock.
  141|      0|      if (szdest < 2) return 0;
  ------------------
  |  Branch (141:11): [True: 0, False: 0]
  ------------------
  142|      0|      uint32_t in = cur_src[0];
  143|      0|      cur_dest[0] = base64[in >> 2];
  144|      0|      in &= 0x3;
  145|      0|      cur_dest[1] = base64[in << 4];
  146|      0|      cur_dest += 2;
  147|      0|      szdest -= 2;
  148|      0|      if (do_padding) {
  ------------------
  |  Branch (148:11): [True: 0, False: 0]
  ------------------
  149|      0|        if (szdest < 2) return 0;
  ------------------
  |  Branch (149:13): [True: 0, False: 0]
  ------------------
  150|      0|        cur_dest[0] = kPad64;
  151|      0|        cur_dest[1] = kPad64;
  152|      0|        cur_dest += 2;
  153|      0|        szdest -= 2;
  154|      0|      }
  155|      0|      break;
  156|      0|    }
  157|      2|    case 2: {
  ------------------
  |  Branch (157:5): [True: 2, False: 0]
  ------------------
  158|       |      // Two bytes left: this encodes to three characters, and (optionally)
  159|       |      // one pad character to round out the four-character cypherblock.
  160|      2|      if (szdest < 3) return 0;
  ------------------
  |  Branch (160:11): [True: 0, False: 2]
  ------------------
  161|      2|      uint32_t in = absl::big_endian::Load16(cur_src);
  162|      2|      cur_dest[0] = base64[in >> 10];
  163|      2|      in &= 0x3FF;
  164|      2|      cur_dest[1] = base64[in >> 4];
  165|      2|      in &= 0x00F;
  166|      2|      cur_dest[2] = base64[in << 2];
  167|      2|      cur_dest += 3;
  168|      2|      szdest -= 3;
  169|      2|      if (do_padding) {
  ------------------
  |  Branch (169:11): [True: 0, False: 2]
  ------------------
  170|      0|        if (szdest < 1) return 0;
  ------------------
  |  Branch (170:13): [True: 0, False: 0]
  ------------------
  171|      0|        cur_dest[0] = kPad64;
  172|      0|        cur_dest += 1;
  173|      0|        szdest -= 1;
  174|      0|      }
  175|      2|      break;
  176|      2|    }
  177|      2|    case 3: {
  ------------------
  |  Branch (177:5): [True: 0, False: 2]
  ------------------
  178|       |      // Three bytes left: same as in the big loop above.  We can't do this in
  179|       |      // the loop because the loop above always reads 4 bytes, and the fourth
  180|       |      // byte is past the end of the input.
  181|      0|      if (szdest < 4) return 0;
  ------------------
  |  Branch (181:11): [True: 0, False: 0]
  ------------------
  182|      0|      uint32_t in =
  183|      0|          (uint32_t{cur_src[0]} << 16) + absl::big_endian::Load16(cur_src + 1);
  184|      0|      cur_dest[0] = base64[in >> 18];
  185|      0|      in &= 0x3FFFF;
  186|      0|      cur_dest[1] = base64[in >> 12];
  187|      0|      in &= 0xFFF;
  188|      0|      cur_dest[2] = base64[in >> 6];
  189|      0|      in &= 0x3F;
  190|      0|      cur_dest[3] = base64[in];
  191|      0|      cur_dest += 4;
  192|      0|      szdest -= 4;
  193|      0|      break;
  194|      0|    }
  195|      0|    default:
  ------------------
  |  Branch (195:5): [True: 0, False: 2]
  ------------------
  196|       |      // Should not be reached: blocks of 4 bytes are handled
  197|       |      // in the while loop before this switch statement.
  198|      0|      ABSL_RAW_LOG(FATAL, "Logic problem? szsrc = %zu", szsrc);
  ------------------
  |  |   45|      0|  do {                                                                         \
  |  |   46|      0|    constexpr const char* absl_raw_log_internal_basename =                     \
  |  |   47|      0|        ::absl::raw_log_internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
  |  |   48|      0|    ::absl::raw_log_internal::RawLog(ABSL_RAW_LOG_INTERNAL_##severity,         \
  |  |  ------------------
  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  ------------------
  |  |   49|      0|                                     absl_raw_log_internal_basename, __LINE__, \
  |  |   50|      0|                                     __VA_ARGS__);                             \
  |  |   51|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                        \
  |  |  ------------------
  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  ------------------
  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   52|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (52:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
  199|      0|      break;
  200|      2|  }
  201|      2|  return static_cast<size_t>(cur_dest - dest);
  202|      2|}

_ZN4absl12lts_2024011616strings_internal20Base64EscapeInternalINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEEEvPKhmPT_bPKc:
   42|      2|                          bool do_padding, const char* base64_chars) {
   43|      2|  const size_t calc_escaped_size =
   44|      2|      CalculateBase64EscapedLenInternal(szsrc, do_padding);
   45|      2|  STLStringResizeUninitialized(dest, calc_escaped_size);
   46|       |
   47|      2|  const size_t escaped_len = Base64EscapeInternal(
   48|      2|      src, szsrc, &(*dest)[0], dest->size(), base64_chars, do_padding);
   49|       |  assert(calc_escaped_size == escaped_len);
   50|      2|  dest->erase(escaped_len);
   51|      2|}

_ZN4absl12lts_2024011616strings_internal10memcasecmpEPKcS3_m:
   25|    938|int memcasecmp(const char* s1, const char* s2, size_t len) {
   26|    938|  const unsigned char* us1 = reinterpret_cast<const unsigned char*>(s1);
   27|    938|  const unsigned char* us2 = reinterpret_cast<const unsigned char*>(s2);
   28|       |
   29|  2.77k|  for (size_t i = 0; i < len; i++) {
  ------------------
  |  Branch (29:22): [True: 1.86k, False: 906]
  ------------------
   30|  1.86k|    unsigned char c1 = us1[i];
   31|  1.86k|    unsigned char c2 = us2[i];
   32|       |    // If bytes are the same, they will be the same when converted to lower.
   33|       |    // So we only need to convert if bytes are not equal.
   34|       |    // NOTE(b/308193381): We do not use `absl::ascii_tolower` here in order
   35|       |    // to avoid its lookup table and improve performance.
   36|  1.86k|    if (c1 != c2) {
  ------------------
  |  Branch (36:9): [True: 326, False: 1.53k]
  ------------------
   37|    326|      c1 = c1 >= 'A' && c1 <= 'Z' ? c1 - 'A' + 'a' : c1;
  ------------------
  |  Branch (37:12): [True: 313, False: 13]
  |  Branch (37:25): [True: 302, False: 11]
  ------------------
   38|    326|      c2 = c2 >= 'A' && c2 <= 'Z' ? c2 - 'A' + 'a' : c2;
  ------------------
  |  Branch (38:12): [True: 326, False: 0]
  |  Branch (38:25): [True: 0, False: 326]
  ------------------
   39|    326|      const int diff = int{c1} - int{c2};
   40|    326|      if (diff != 0) return diff;
  ------------------
  |  Branch (40:11): [True: 32, False: 294]
  ------------------
   41|    326|    }
   42|  1.86k|  }
   43|    906|  return 0;
   44|    938|}

_ZN4absl12lts_2024011616strings_internal28STLStringResizeUninitializedINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEvEEvPT_m:
   67|    423|inline void STLStringResizeUninitialized(string_type* s, size_t new_size) {
   68|    423|  ResizeUninitializedTraits<string_type>::Resize(s, new_size);
   69|    423|}
_ZN4absl12lts_2024011616strings_internal25ResizeUninitializedTraitsINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEvE6ResizeEPS9_m:
   47|    423|  static void Resize(string_type* s, size_t new_size) {
   48|    423|    s->__resize_default_init(new_size);
   49|    423|  }

_ZN4absl12lts_2024011619str_format_internal17FormatConvertImplENS0_6int128ENS1_24FormatConversionSpecImplEPNS1_14FormatSinkImplE:
  655|     36|                                        FormatSinkImpl *sink) {
  656|     36|  return {ConvertIntArg(v, conv, sink)};
  657|     36|}
arg.cc:_ZNK4absl12lts_2024011619str_format_internal12_GLOBAL__N_19IntDigits17with_neg_and_zeroEv:
  176|     36|  string_view with_neg_and_zero() const { return {start_, size_}; }
_ZN4absl12lts_2024011619str_format_internal13ConvertIntArgINS0_6int128EEEbT_NS1_24FormatConversionSpecImplEPNS1_14FormatSinkImplE:
  403|     36|bool ConvertIntArg(T v, FormatConversionSpecImpl conv, FormatSinkImpl *sink) {
  404|     36|  using U = typename MakeUnsigned<T>::type;
  405|     36|  IntDigits as_digits;
  406|       |
  407|       |  // This odd casting is due to a bug in -Wswitch behavior in gcc49 which causes
  408|       |  // it to complain about a switch/case type mismatch, even though both are
  409|       |  // FormatConversionChar.  Likely this is because at this point
  410|       |  // FormatConversionChar is declared, but not defined.
  411|     36|  switch (static_cast<uint8_t>(conv.conversion_char())) {
  412|      0|    case static_cast<uint8_t>(FormatConversionCharInternal::c):
  ------------------
  |  Branch (412:5): [True: 0, False: 36]
  ------------------
  413|      0|      return (std::is_same<T, wchar_t>::value ||
  ------------------
  |  Branch (413:15): [Folded, False: 0]
  ------------------
  414|      0|              (conv.length_mod() == LengthMod::l))
  ------------------
  |  Branch (414:15): [True: 0, False: 0]
  ------------------
  415|      0|                 ? ConvertWCharTImpl(static_cast<wchar_t>(v), conv, sink)
  416|      0|                 : ConvertCharImpl(static_cast<char>(v), conv, sink);
  417|       |
  418|      0|    case static_cast<uint8_t>(FormatConversionCharInternal::o):
  ------------------
  |  Branch (418:5): [True: 0, False: 36]
  ------------------
  419|      0|      as_digits.PrintAsOct(static_cast<U>(v));
  420|      0|      break;
  421|       |
  422|      0|    case static_cast<uint8_t>(FormatConversionCharInternal::x):
  ------------------
  |  Branch (422:5): [True: 0, False: 36]
  ------------------
  423|      0|      as_digits.PrintAsHexLower(static_cast<U>(v));
  424|      0|      break;
  425|      0|    case static_cast<uint8_t>(FormatConversionCharInternal::X):
  ------------------
  |  Branch (425:5): [True: 0, False: 36]
  ------------------
  426|      0|      as_digits.PrintAsHexUpper(static_cast<U>(v));
  427|      0|      break;
  428|       |
  429|      0|    case static_cast<uint8_t>(FormatConversionCharInternal::u):
  ------------------
  |  Branch (429:5): [True: 0, False: 36]
  ------------------
  430|      0|      as_digits.PrintAsDec(static_cast<U>(v));
  431|      0|      break;
  432|       |
  433|     36|    case static_cast<uint8_t>(FormatConversionCharInternal::d):
  ------------------
  |  Branch (433:5): [True: 36, False: 0]
  ------------------
  434|     36|    case static_cast<uint8_t>(FormatConversionCharInternal::i):
  ------------------
  |  Branch (434:5): [True: 0, False: 36]
  ------------------
  435|     36|    case static_cast<uint8_t>(FormatConversionCharInternal::v):
  ------------------
  |  Branch (435:5): [True: 0, False: 36]
  ------------------
  436|     36|      as_digits.PrintAsDec(v);
  437|     36|      break;
  438|       |
  439|      0|    case static_cast<uint8_t>(FormatConversionCharInternal::a):
  ------------------
  |  Branch (439:5): [True: 0, False: 36]
  ------------------
  440|      0|    case static_cast<uint8_t>(FormatConversionCharInternal::e):
  ------------------
  |  Branch (440:5): [True: 0, False: 36]
  ------------------
  441|      0|    case static_cast<uint8_t>(FormatConversionCharInternal::f):
  ------------------
  |  Branch (441:5): [True: 0, False: 36]
  ------------------
  442|      0|    case static_cast<uint8_t>(FormatConversionCharInternal::g):
  ------------------
  |  Branch (442:5): [True: 0, False: 36]
  ------------------
  443|      0|    case static_cast<uint8_t>(FormatConversionCharInternal::A):
  ------------------
  |  Branch (443:5): [True: 0, False: 36]
  ------------------
  444|      0|    case static_cast<uint8_t>(FormatConversionCharInternal::E):
  ------------------
  |  Branch (444:5): [True: 0, False: 36]
  ------------------
  445|      0|    case static_cast<uint8_t>(FormatConversionCharInternal::F):
  ------------------
  |  Branch (445:5): [True: 0, False: 36]
  ------------------
  446|      0|    case static_cast<uint8_t>(FormatConversionCharInternal::G):
  ------------------
  |  Branch (446:5): [True: 0, False: 36]
  ------------------
  447|      0|      return ConvertFloatImpl(static_cast<double>(v), conv, sink);
  448|       |
  449|      0|    default:
  ------------------
  |  Branch (449:5): [True: 0, False: 36]
  ------------------
  450|      0|      ABSL_ASSUME(false);
  ------------------
  |  |  259|      0|#define ABSL_ASSUME(cond) __builtin_assume(cond)
  ------------------
  451|     36|  }
  452|       |
  453|     36|  if (conv.is_basic()) {
  ------------------
  |  Branch (453:7): [True: 36, False: 0]
  ------------------
  454|     36|    sink->Append(as_digits.with_neg_and_zero());
  455|     36|    return true;
  456|     36|  }
  457|      0|  return ConvertIntImplInnerSlow(as_digits, conv, sink);
  458|     36|}
arg.cc:_ZN4absl12lts_2024011619str_format_internal12_GLOBAL__N_19IntDigits10PrintAsDecENS0_7uint128Eb:
  116|     36|  void PrintAsDec(uint128 v, bool add_neg = false) {
  117|       |    // This function can be sped up if needed. We can call FastIntToBuffer
  118|       |    // twice, or fix FastIntToBuffer to support uint128.
  119|     36|    char *p = storage_ + sizeof(storage_);
  120|     70|    do {
  121|     70|      p -= 2;
  122|     70|      numbers_internal::PutTwoDigits(static_cast<uint32_t>(v % 100), p);
  123|     70|      v /= 100;
  124|     70|    } while (v);
  ------------------
  |  Branch (124:14): [True: 34, False: 36]
  ------------------
  125|     36|    if (p[0] == '0') {
  ------------------
  |  Branch (125:9): [True: 19, False: 17]
  ------------------
  126|       |      // We printed one too many hexits.
  127|     19|      ++p;
  128|     19|    }
  129|     36|    if (add_neg) {
  ------------------
  |  Branch (129:9): [True: 3, False: 33]
  ------------------
  130|      3|      *--p = '-';
  131|      3|    }
  132|     36|    size_ = static_cast<size_t>(storage_ + sizeof(storage_) - p);
  133|     36|    start_ = p;
  134|     36|  }
arg.cc:_ZN4absl12lts_2024011619str_format_internal12_GLOBAL__N_19IntDigits10PrintAsDecENS0_6int128E:
  106|     36|  void PrintAsDec(int128 v) {
  107|     36|    auto u = static_cast<uint128>(v);
  108|     36|    bool add_neg = false;
  109|     36|    if (v < 0) {
  ------------------
  |  Branch (109:9): [True: 3, False: 33]
  ------------------
  110|      3|      add_neg = true;
  111|      3|      u = uint128{} - u;
  112|      3|    }
  113|     36|    PrintAsDec(u, add_neg);
  114|     36|  }

_ZN4absl12lts_2024011619str_format_internal14ExtractCharSetILNS0_23FormatConversionCharSetE655355EEES3_NS1_16ArgConvertResultIXT_EEE:
  211|     36|constexpr FormatConversionCharSet ExtractCharSet(ArgConvertResult<C>) {
  212|     36|  return C;
  213|     36|}
_ZN4absl12lts_2024011619str_format_internal13FormatArgImplC2INS0_6int128EEERKT_:
  505|     36|  explicit FormatArgImpl(const T& value) {
  506|     36|    using D = typename DecayType<T>::type;
  507|     36|    static_assert(
  508|     36|        std::is_same<D, const T&>::value || storage_policy<D>::value == ByValue,
  509|     36|        "Decayed types must be stored by value");
  510|     36|    Init(static_cast<D>(value));
  511|     36|  }
_ZN4absl12lts_2024011619str_format_internal13FormatArgImpl4InitINS0_6int128EEEvRKT_:
  558|     36|  void Init(const T& value) {
  559|     36|    data_ = Manager<T>::SetValue(value);
  560|     36|    dispatcher_ = &Dispatch<T>;
  561|     36|  }
_ZN4absl12lts_2024011619str_format_internal13FormatArgImpl7ManagerINS0_6int128ELNS2_13StoragePolicyE0EE8SetValueERKS4_:
  520|     36|    static Data SetValue(const T& value) {
  521|     36|      Data data;
  522|     36|      data.ptr = std::addressof(value);
  523|     36|      return data;
  524|     36|    }
_ZN4absl12lts_2024011619str_format_internal13FormatArgImpl8DispatchINS0_6int128EEEbNS2_4DataENS1_24FormatConversionSpecImplEPv:
  599|     36|  static bool Dispatch(Data arg, FormatConversionSpecImpl spec, void* out) {
  600|       |    // A `none` conv indicates that we want the `int` conversion.
  601|     36|    if (ABSL_PREDICT_FALSE(spec.conversion_char() ==
  ------------------
  |  |  178|     36|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 36]
  |  |  |  Branch (178:49): [Folded, False: 36]
  |  |  |  Branch (178:58): [True: 0, False: 36]
  |  |  ------------------
  ------------------
  602|     36|                           FormatConversionCharInternal::kNone)) {
  603|      0|      return ToInt<T>(arg, static_cast<int*>(out), std::is_integral<T>(),
  604|      0|                      std::is_enum<T>());
  605|      0|    }
  606|     36|    if (ABSL_PREDICT_FALSE(!Contains(ArgumentToConv<T>(),
  ------------------
  |  |  178|     36|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 36]
  |  |  |  Branch (178:49): [Folded, False: 36]
  |  |  |  Branch (178:58): [True: 0, False: 36]
  |  |  ------------------
  ------------------
  607|     36|                                     spec.conversion_char()))) {
  608|      0|      return false;
  609|      0|    }
  610|     36|    return str_format_internal::FormatConvertImpl(
  611|     36|               Manager<T>::Value(arg), spec,
  612|     36|               static_cast<FormatSinkImpl*>(out))
  613|     36|        .value;
  614|     36|  }
_ZN4absl12lts_2024011619str_format_internal14ArgumentToConvINS0_6int128EEENS0_23FormatConversionCharSetEv:
  430|     36|constexpr FormatConversionCharSet ArgumentToConv() {
  431|     36|  using ConvResult = decltype(str_format_internal::FormatConvertImpl(
  432|     36|      std::declval<const Arg&>(),
  433|     36|      std::declval<const FormatConversionSpecImpl&>(),
  434|     36|      std::declval<FormatSinkImpl*>()));
  435|     36|  return absl::str_format_internal::ExtractCharSet(ConvResult{});
  436|     36|}
_ZN4absl12lts_2024011619str_format_internal13FormatArgImpl7ManagerINS0_6int128ELNS2_13StoragePolicyE0EE5ValueENS2_4DataE:
  526|     36|    static const T& Value(Data arg) { return *static_cast<const T*>(arg.ptr); }
_ZN4absl12lts_2024011619str_format_internal19FormatArgImplFriend7ConvertINS1_13FormatArgImplEEEbT_NS1_24FormatConversionSpecImplEPNS1_14FormatSinkImplE:
  419|     36|                      FormatSinkImpl* out) {
  420|     36|    return arg.dispatcher_(arg.data_, conv, out);
  421|     36|  }

_ZN4absl12lts_2024011619str_format_internal13FormatUntypedENS1_17FormatRawSinkImplENS1_21UntypedFormatSpecImplENS0_4SpanIKNS1_13FormatArgImplEEE:
  214|     84|                   absl::Span<const FormatArgImpl> args) {
  215|     84|  FormatSinkImpl sink(raw_sink);
  216|     84|  using Converter = DefaultConverter;
  217|     84|  return ConvertAll(format, args, Converter(&sink));
  218|     84|}
bind.cc:_ZN4absl12lts_2024011619str_format_internal12_GLOBAL__N_110ArgContextC2ENS0_4SpanIKNS1_13FormatArgImplEEE:
   55|     84|  explicit ArgContext(absl::Span<const FormatArgImpl> pack) : pack_(pack) {}
bind.cc:_ZN4absl12lts_2024011619str_format_internal12_GLOBAL__N_110ArgContext4BindEPKNS1_17UnboundConversionEPNS1_15BoundConversionE:
   70|     36|                             BoundConversion* bound) {
   71|     36|  const FormatArgImpl* arg = nullptr;
   72|     36|  int arg_position = unbound->arg_position;
   73|     36|  if (static_cast<size_t>(arg_position - 1) >= pack_.size()) return false;
  ------------------
  |  Branch (73:7): [True: 0, False: 36]
  ------------------
   74|     36|  arg = &pack_[static_cast<size_t>(arg_position - 1)];  // 1-based
   75|       |
   76|     36|  if (unbound->flags != Flags::kBasic) {
  ------------------
  |  Branch (76:7): [True: 0, False: 36]
  ------------------
   77|      0|    int width = unbound->width.value();
   78|      0|    bool force_left = false;
   79|      0|    if (unbound->width.is_from_arg()) {
  ------------------
  |  Branch (79:9): [True: 0, False: 0]
  ------------------
   80|      0|      if (!BindFromPosition(unbound->width.get_from_arg(), &width, pack_))
  ------------------
  |  Branch (80:11): [True: 0, False: 0]
  ------------------
   81|      0|        return false;
   82|      0|      if (width < 0) {
  ------------------
  |  Branch (82:11): [True: 0, False: 0]
  ------------------
   83|       |        // "A negative field width is taken as a '-' flag followed by a
   84|       |        // positive field width."
   85|      0|        force_left = true;
   86|       |        // Make sure we don't overflow the width when negating it.
   87|      0|        width = -std::max(width, -std::numeric_limits<int>::max());
   88|      0|      }
   89|      0|    }
   90|       |
   91|      0|    int precision = unbound->precision.value();
   92|      0|    if (unbound->precision.is_from_arg()) {
  ------------------
  |  Branch (92:9): [True: 0, False: 0]
  ------------------
   93|      0|      if (!BindFromPosition(unbound->precision.get_from_arg(), &precision,
  ------------------
  |  Branch (93:11): [True: 0, False: 0]
  ------------------
   94|      0|                            pack_))
   95|      0|        return false;
   96|      0|    }
   97|       |
   98|      0|    FormatConversionSpecImplFriend::SetWidth(width, bound);
   99|      0|    FormatConversionSpecImplFriend::SetPrecision(precision, bound);
  100|       |
  101|      0|    if (force_left) {
  ------------------
  |  Branch (101:9): [True: 0, False: 0]
  ------------------
  102|      0|      FormatConversionSpecImplFriend::SetFlags(unbound->flags | Flags::kLeft,
  103|      0|                                               bound);
  104|      0|    } else {
  105|      0|      FormatConversionSpecImplFriend::SetFlags(unbound->flags, bound);
  106|      0|    }
  107|       |
  108|      0|    FormatConversionSpecImplFriend::SetLengthMod(unbound->length_mod, bound);
  109|     36|  } else {
  110|     36|    FormatConversionSpecImplFriend::SetFlags(unbound->flags, bound);
  111|     36|    FormatConversionSpecImplFriend::SetWidth(-1, bound);
  112|     36|    FormatConversionSpecImplFriend::SetPrecision(-1, bound);
  113|     36|  }
  114|     36|  FormatConversionSpecImplFriend::SetConversionChar(unbound->conv, bound);
  115|     36|  bound->set_arg(arg);
  116|     36|  return true;
  117|     36|}
bind.cc:_ZN4absl12lts_2024011619str_format_internal12_GLOBAL__N_116DefaultConverterC2EPNS1_14FormatSinkImplE:
  154|     84|  explicit DefaultConverter(FormatSinkImpl* sink) : sink_(sink) {}
bind.cc:_ZN4absl12lts_2024011619str_format_internal12_GLOBAL__N_110ConvertAllINS2_16DefaultConverterEEEbNS1_21UntypedFormatSpecImplENS0_4SpanIKNS1_13FormatArgImplEEET_:
  142|     84|                absl::Span<const FormatArgImpl> args, Converter converter) {
  143|     84|  if (format.has_parsed_conversion()) {
  ------------------
  |  Branch (143:7): [True: 0, False: 84]
  ------------------
  144|      0|    return format.parsed_conversion()->ProcessFormat(
  145|      0|        ConverterConsumer<Converter>(converter, args));
  146|     84|  } else {
  147|     84|    return ParseFormatString(format.str(),
  148|     84|                             ConverterConsumer<Converter>(converter, args));
  149|     84|  }
  150|     84|}
bind.cc:_ZN4absl12lts_2024011619str_format_internal12_GLOBAL__N_117ConverterConsumerINS2_16DefaultConverterEE10ConvertOneERKNS1_17UnboundConversionENSt3__117basic_string_viewIcNS9_11char_traitsIcEEEE:
  129|     36|  bool ConvertOne(const UnboundConversion& conv, string_view conv_string) {
  130|     36|    BoundConversion bound;
  131|     36|    if (!arg_context_.Bind(&conv, &bound)) return false;
  ------------------
  |  Branch (131:9): [True: 0, False: 36]
  ------------------
  132|     36|    return converter_.ConvertOne(bound, conv_string);
  133|     36|  }
bind.cc:_ZNK4absl12lts_2024011619str_format_internal12_GLOBAL__N_116DefaultConverter10ConvertOneERKNS1_15BoundConversionENSt3__117basic_string_viewIcNS7_11char_traitsIcEEEE:
  158|     36|  bool ConvertOne(const BoundConversion& bound, string_view /*conv*/) const {
  159|     36|    return FormatArgImplFriend::Convert(*bound.arg(), bound, sink_);
  160|     36|  }
bind.cc:_ZN4absl12lts_2024011619str_format_internal12_GLOBAL__N_117ConverterConsumerINS2_16DefaultConverterEE6AppendENSt3__117basic_string_viewIcNS6_11char_traitsIcEEEE:
  125|     84|  bool Append(string_view s) {
  126|     84|    converter_.Append(s);
  127|     84|    return true;
  128|     84|  }
bind.cc:_ZNK4absl12lts_2024011619str_format_internal12_GLOBAL__N_116DefaultConverter6AppendENSt3__117basic_string_viewIcNS4_11char_traitsIcEEEE:
  156|     84|  void Append(string_view s) const { sink_->Append(s); }
bind.cc:_ZN4absl12lts_2024011619str_format_internal12_GLOBAL__N_117ConverterConsumerINS2_16DefaultConverterEEC2ES4_NS0_4SpanIKNS1_13FormatArgImplEEE:
  123|     84|      : converter_(converter), arg_context_(pack) {}

_ZNK4absl12lts_2024011619str_format_internal15BoundConversion3argEv:
   43|     36|  const FormatArgImpl* arg() const { return arg_; }
_ZN4absl12lts_2024011619str_format_internal15BoundConversion7set_argEPKNS1_13FormatArgImplE:
   44|     36|  void set_arg(const FormatArgImpl* a) { arg_ = a; }
_ZN4absl12lts_2024011619str_format_internal21UntypedFormatSpecImplC2ENSt3__117basic_string_viewIcNS3_11char_traitsIcEEEE:
   56|     84|      : data_(s.data()), size_(s.size()) {}
_ZNK4absl12lts_2024011619str_format_internal21UntypedFormatSpecImpl21has_parsed_conversionEv:
   61|     84|  bool has_parsed_conversion() const { return size_ == ~size_t{}; }
_ZNK4absl12lts_2024011619str_format_internal21UntypedFormatSpecImpl3strEv:
   63|     84|  string_view str() const {
   64|       |    assert(!has_parsed_conversion());
   65|     84|    return string_view(static_cast<const char*>(data_), size_);
   66|     84|  }
_ZN4absl12lts_2024011619str_format_internal21UntypedFormatSpecImpl7ExtractINS1_18FormatSpecTemplateIJLNS0_23FormatConversionCharSetE655355EEEEEERKS2_RKT_:
   73|     36|  static const UntypedFormatSpecImpl& Extract(const T& s) {
   74|     36|    return s.spec_;
   75|     36|  }
_ZN4absl12lts_2024011619str_format_internal21UntypedFormatSpecImpl7ExtractINS1_18FormatSpecTemplateIJEEEEERKS2_RKT_:
   73|     48|  static const UntypedFormatSpecImpl& Extract(const T& s) {
   74|     48|    return s.spec_;
   75|     48|  }
_ZN4absl12lts_2024011619str_format_internal18FormatSpecTemplateIJEEC2EUa9enable_ifIXclL_ZNS1_15ValidFormatImplITpTnNS0_23FormatConversionCharSetEJEEEbNSt3__117basic_string_viewIcNS6_11char_traitsIcEEEEEfL0p_EEEPKc:
  159|     48|      : Base(s) {}
_ZN4absl12lts_2024011619str_format_internal18FormatSpecTemplateIJLNS0_23FormatConversionCharSetE655355EEEC2EUa9enable_ifIXclL_ZNS1_15ValidFormatImplIJLS3_655355EEEEbNSt3__117basic_string_viewIcNS6_11char_traitsIcEEEEEfL0p_EEEPKc:
  159|     36|      : Base(s) {}

_ZN4absl12lts_2024011619str_format_internal17UnboundConversionC2Ev:
   36|     36|  UnboundConversion() {}  // NOLINT
_ZNK4absl12lts_2024011619str_format_internal7ConvTag7is_convEv:
   94|     36|  constexpr bool is_conv() const { return (tag_ & 0x80) == 0; }
_ZNK4absl12lts_2024011619str_format_internal7ConvTag7as_convEv:
   98|     36|  constexpr FormatConversionChar as_conv() const {
   99|     36|    assert(is_conv());
  100|     36|    assert(!is_length());
  101|       |    assert(!is_flags());
  102|     36|    return static_cast<FormatConversionChar>(tag_);
  103|     36|  }
_ZN4absl12lts_2024011619str_format_internal13GetTagForCharEc:
  169|     36|constexpr ConvTag GetTagForChar(char c) {
  170|     36|  return ConvTagHolder::value[static_cast<unsigned char>(c)];
  171|     36|}

_ZN4absl12lts_2024011619str_format_internal17FormatRawSinkImpl5WriteENSt3__117basic_string_viewIcNS3_11char_traitsIcEEEE:
   48|     84|  void Write(string_view s) { write_(sink_, s); }
_ZN4absl12lts_2024011619str_format_internal14FormatSinkImplC2ENS1_17FormatRawSinkImplE:
   68|     84|  explicit FormatSinkImpl(FormatRawSinkImpl raw) : raw_(raw) {}
_ZN4absl12lts_2024011619str_format_internal14FormatSinkImplD2Ev:
   70|     84|  ~FormatSinkImpl() { Flush(); }
_ZN4absl12lts_2024011619str_format_internal14FormatSinkImpl5FlushEv:
   72|     84|  void Flush() {
   73|     84|    raw_.Write(string_view(buf_, static_cast<size_t>(pos_ - buf_)));
   74|     84|    pos_ = buf_;
   75|     84|  }
_ZN4absl12lts_2024011619str_format_internal14FormatSinkImpl6AppendENSt3__117basic_string_viewIcNS3_11char_traitsIcEEEE:
   94|    120|  void Append(string_view v) {
   95|    120|    size_t n = v.size();
   96|    120|    if (n == 0) return;
  ------------------
  |  Branch (96:9): [True: 36, False: 84]
  ------------------
   97|     84|    size_ += n;
   98|     84|    if (n >= Avail()) {
  ------------------
  |  Branch (98:9): [True: 0, False: 84]
  ------------------
   99|      0|      Flush();
  100|      0|      raw_.Write(v);
  101|      0|      return;
  102|      0|    }
  103|     84|    memcpy(pos_, v.data(), n);
  104|     84|    pos_ += n;
  105|     84|  }
_ZNK4absl12lts_2024011619str_format_internal14FormatSinkImpl5AvailEv:
  123|     84|  size_t Avail() const {
  124|     84|    return static_cast<size_t>(buf_ + sizeof(buf_) - pos_);
  125|     84|  }
_ZNK4absl12lts_2024011619str_format_internal24FormatConversionSpecImpl8is_basicEv:
  277|     36|  bool is_basic() const { return flags_ == Flags::kBasic; }
_ZNK4absl12lts_2024011619str_format_internal24FormatConversionSpecImpl15conversion_charEv:
  290|    108|  FormatConversionChar conversion_char() const {
  291|       |    // Keep this field first in the struct . It generates better code when
  292|       |    // accessing it when ConversionSpec is passed by value in registers.
  293|    108|    static_assert(offsetof(FormatConversionSpecImpl, conv_) == 0, "");
  294|    108|    return conv_;
  295|    108|  }
_ZN4absl12lts_2024011619str_format_internal30FormatConversionSpecImplFriend8SetFlagsENS1_5FlagsEPNS1_24FormatConversionSpecImplE:
  321|     36|  static void SetFlags(Flags f, FormatConversionSpecImpl* conv) {
  322|     36|    conv->flags_ = f;
  323|     36|  }
_ZN4absl12lts_2024011619str_format_internal30FormatConversionSpecImplFriend17SetConversionCharENS0_20FormatConversionCharEPNS1_24FormatConversionSpecImplE:
  328|     36|                                FormatConversionSpecImpl* conv) {
  329|     36|    conv->conv_ = c;
  330|     36|  }
_ZN4absl12lts_2024011619str_format_internal30FormatConversionSpecImplFriend8SetWidthEiPNS1_24FormatConversionSpecImplE:
  331|     36|  static void SetWidth(int w, FormatConversionSpecImpl* conv) {
  332|     36|    conv->width_ = w;
  333|     36|  }
_ZN4absl12lts_2024011619str_format_internal30FormatConversionSpecImplFriend12SetPrecisionEiPNS1_24FormatConversionSpecImplE:
  334|     36|  static void SetPrecision(int p, FormatConversionSpecImpl* conv) {
  335|     36|    conv->precision_ = p;
  336|     36|  }
_ZN4absl12lts_2024011619str_format_internal29FormatConversionCharToConvIntENS0_20FormatConversionCharE:
  360|     36|constexpr uint64_t FormatConversionCharToConvInt(FormatConversionChar c) {
  361|     36|  return uint64_t{1} << (1 + static_cast<uint8_t>(c));
  362|     36|}
_ZN4absl12lts_2024011619str_format_internal8ContainsENS0_23FormatConversionCharSetENS0_20FormatConversionCharE:
  442|     36|constexpr bool Contains(FormatConversionCharSet set, FormatConversionChar c) {
  443|     36|  return (static_cast<uint64_t>(set) & FormatConversionCharToConvInt(c)) != 0;
  444|     36|}
_ZN4absl12lts_2024011619str_format_internal17FormatRawSinkImpl7ExtractINS0_13FormatRawSinkEEES2_T_:
   51|     84|  static FormatRawSinkImpl Extract(T s) {
   52|     84|    return s.sink_;
   53|     84|  }
_ZN4absl12lts_2024011619str_format_internal17FormatRawSinkImpl5FlushINSt3__112basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEEEEvPvNS4_17basic_string_viewIcS7_EE:
   57|     84|  static void Flush(void* r, string_view s) {
   58|     84|    str_format_internal::InvokeFlush(static_cast<T*>(r), s);
   59|     84|  }
_ZN4absl12lts_2024011619str_format_internal17FormatRawSinkImplC2INSt3__112basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEETnPDTclsr19str_format_internalE11InvokeFlushclsr3stdE7declvalIPT_EEcvNS4_17basic_string_viewIcS7_EE_EEELPv0EEESC_:
   46|     84|      : sink_(raw), write_(&FormatRawSinkImpl::Flush<T>) {}

_ZN4absl12lts_2024011619str_format_internal15AbslFormatFlushEPNSt3__112basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEENS2_17basic_string_viewIcS5_EE:
   71|     84|inline void AbslFormatFlush(std::string* out, string_view s) {
   72|     84|  out->append(s.data(), s.size());
   73|     84|}
_ZN4absl12lts_2024011619str_format_internal11InvokeFlushINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEEEDTcl15AbslFormatFlushfp_fp0_EEPT_NS3_17basic_string_viewIcS6_EE:
   89|     84|auto InvokeFlush(T* out, string_view s) -> decltype(AbslFormatFlush(out, s)) {
   90|     84|  AbslFormatFlush(out, s);
   91|     84|}

bind.cc:_ZN4absl12lts_2024011619str_format_internal17ParseFormatStringINS1_12_GLOBAL__N_117ConverterConsumerINS3_16DefaultConverterEEEEEbNSt3__117basic_string_viewIcNS7_11char_traitsIcEEEET_:
   57|     84|bool ParseFormatString(string_view src, Consumer consumer) {
   58|     84|  int next_arg = 0;
   59|     84|  const char* p = src.data();
   60|     84|  const char* const end = p + src.size();
   61|    120|  while (p != end) {
  ------------------
  |  Branch (61:10): [True: 84, False: 36]
  ------------------
   62|     84|    const char* percent =
   63|     84|        static_cast<const char*>(memchr(p, '%', static_cast<size_t>(end - p)));
   64|     84|    if (!percent) {
  ------------------
  |  Branch (64:9): [True: 48, False: 36]
  ------------------
   65|       |      // We found the last substring.
   66|     48|      return consumer.Append(string_view(p, static_cast<size_t>(end - p)));
   67|     48|    }
   68|       |    // We found a percent, so push the text run then process the percent.
   69|     36|    if (ABSL_PREDICT_FALSE(!consumer.Append(
  ------------------
  |  |  178|     36|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 36]
  |  |  |  Branch (178:49): [Folded, False: 36]
  |  |  |  Branch (178:58): [True: 0, False: 36]
  |  |  ------------------
  ------------------
   70|     36|            string_view(p, static_cast<size_t>(percent - p))))) {
   71|      0|      return false;
   72|      0|    }
   73|     36|    if (ABSL_PREDICT_FALSE(percent + 1 >= end)) return false;
  ------------------
  |  |  178|     36|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 36]
  |  |  |  Branch (178:49): [Folded, False: 36]
  |  |  |  Branch (178:58): [True: 0, False: 36]
  |  |  ------------------
  ------------------
   74|       |
   75|     36|    auto tag = GetTagForChar(percent[1]);
   76|     36|    if (tag.is_conv()) {
  ------------------
  |  Branch (76:9): [True: 36, False: 0]
  ------------------
   77|     36|      if (ABSL_PREDICT_FALSE(next_arg < 0)) {
  ------------------
  |  |  178|     36|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 36]
  |  |  |  Branch (178:49): [Folded, False: 36]
  |  |  |  Branch (178:58): [True: 0, False: 36]
  |  |  ------------------
  ------------------
   78|       |        // This indicates an error in the format string.
   79|       |        // The only way to get `next_arg < 0` here is to have a positional
   80|       |        // argument first which sets next_arg to -1 and then a non-positional
   81|       |        // argument.
   82|      0|        return false;
   83|      0|      }
   84|     36|      p = percent + 2;
   85|       |
   86|       |      // Keep this case separate from the one below.
   87|       |      // ConvertOne is more efficient when the compiler can see that the `basic`
   88|       |      // flag is set.
   89|     36|      UnboundConversion conv;
   90|     36|      conv.conv = tag.as_conv();
   91|     36|      conv.arg_position = ++next_arg;
   92|     36|      if (ABSL_PREDICT_FALSE(
  ------------------
  |  |  178|     36|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 36]
  |  |  |  Branch (178:49): [Folded, False: 36]
  |  |  |  Branch (178:58): [True: 0, False: 36]
  |  |  ------------------
  ------------------
   93|     36|              !consumer.ConvertOne(conv, string_view(percent + 1, 1)))) {
   94|      0|        return false;
   95|      0|      }
   96|     36|    } else if (percent[1] != '%') {
  ------------------
  |  Branch (96:16): [True: 0, False: 0]
  ------------------
   97|      0|      UnboundConversion conv;
   98|      0|      p = ConsumeUnboundConversionNoInline(percent + 1, end, &conv, &next_arg);
   99|      0|      if (ABSL_PREDICT_FALSE(p == nullptr)) return false;
  ------------------
  |  |  178|      0|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 0]
  |  |  |  Branch (178:58): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  100|      0|      if (ABSL_PREDICT_FALSE(!consumer.ConvertOne(
  ------------------
  |  |  178|      0|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 0]
  |  |  |  Branch (178:58): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  101|      0|              conv, string_view(percent + 1,
  102|      0|                                static_cast<size_t>(p - (percent + 1)))))) {
  103|      0|        return false;
  104|      0|      }
  105|      0|    } else {
  106|      0|      if (ABSL_PREDICT_FALSE(!consumer.Append("%"))) return false;
  ------------------
  |  |  178|      0|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 0]
  |  |  |  Branch (178:58): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  107|      0|      p = percent + 2;
  108|      0|      continue;
  109|      0|    }
  110|     36|  }
  111|     36|  return true;
  112|     84|}

_ZN4absl12lts_2024011611StrContainsENSt3__117basic_string_viewIcNS1_11char_traitsIcEEEES5_:
   47|      4|                        absl::string_view needle) noexcept {
   48|      4|  return haystack.find(needle, 0) != haystack.npos;
   49|      4|}
_ZN4absl12lts_2024011610StartsWithENSt3__117basic_string_viewIcNS1_11char_traitsIcEEEES5_:
   59|     16|                       absl::string_view prefix) noexcept {
   60|     16|  return prefix.empty() ||
  ------------------
  |  Branch (60:10): [True: 0, False: 16]
  ------------------
   61|     16|         (text.size() >= prefix.size() &&
  ------------------
  |  Branch (61:11): [True: 16, False: 0]
  ------------------
   62|     16|          memcmp(text.data(), prefix.data(), prefix.size()) == 0);
  ------------------
  |  Branch (62:11): [True: 12, False: 4]
  ------------------
   63|     16|}

_ZN4absl12lts_2024011610SimpleAtodENSt3__117basic_string_viewIcNS1_11char_traitsIcEEEEPd:
   83|  21.5k|bool SimpleAtod(absl::string_view str, absl::Nonnull<double*> out) {
   84|  21.5k|  *out = 0.0;
   85|  21.5k|  str = StripAsciiWhitespace(str);
   86|       |  // std::from_chars doesn't accept an initial +, but SimpleAtod does, so if one
   87|       |  // is present, skip it, while avoiding accepting "+-0" as valid.
   88|  21.5k|  if (!str.empty() && str[0] == '+') {
  ------------------
  |  Branch (88:7): [True: 21.5k, False: 8]
  |  Branch (88:23): [True: 700, False: 20.8k]
  ------------------
   89|    700|    str.remove_prefix(1);
   90|    700|    if (!str.empty() && str[0] == '-') {
  ------------------
  |  Branch (90:9): [True: 698, False: 2]
  |  Branch (90:25): [True: 2, False: 696]
  ------------------
   91|      2|      return false;
   92|      2|    }
   93|    700|  }
   94|  21.5k|  auto result = absl::from_chars(str.data(), str.data() + str.size(), *out);
   95|  21.5k|  if (result.ec == std::errc::invalid_argument) {
  ------------------
  |  Branch (95:7): [True: 91, False: 21.4k]
  ------------------
   96|     91|    return false;
   97|     91|  }
   98|  21.4k|  if (result.ptr != str.data() + str.size()) {
  ------------------
  |  Branch (98:7): [True: 198, False: 21.2k]
  ------------------
   99|       |    // not all non-whitespace characters consumed
  100|    198|    return false;
  101|    198|  }
  102|       |  // from_chars() with DR 3081's current wording will return max() on
  103|       |  // overflow.  SimpleAtod returns infinity instead.
  104|  21.2k|  if (result.ec == std::errc::result_out_of_range) {
  ------------------
  |  Branch (104:7): [True: 3.52k, False: 17.7k]
  ------------------
  105|  3.52k|    if (*out > 1.0) {
  ------------------
  |  Branch (105:9): [True: 1.47k, False: 2.05k]
  ------------------
  106|  1.47k|      *out = std::numeric_limits<double>::infinity();
  107|  2.05k|    } else if (*out < -1.0) {
  ------------------
  |  Branch (107:16): [True: 564, False: 1.48k]
  ------------------
  108|    564|      *out = -std::numeric_limits<double>::infinity();
  109|    564|    }
  110|  3.52k|  }
  111|  21.2k|  return true;
  112|  21.4k|}
_ZN4absl12lts_2024011616numbers_internal12PutTwoDigitsEjPc:
  466|     70|void numbers_internal::PutTwoDigits(uint32_t i, absl::Nonnull<char*> buf) {
  467|     70|  little_endian::Store16(
  468|     70|      buf, static_cast<uint16_t>(PrepareTwoDigits(i) + kTwoZeroBytes));
  469|     70|}
_ZN4absl12lts_2024011616numbers_internal18safe_strtou64_baseENSt3__117basic_string_viewIcNS2_11char_traitsIcEEEEPmi:
 1360|  1.49k|                        int base) {
 1361|  1.49k|  return safe_uint_internal<uint64_t>(text, value, base);
 1362|  1.49k|}
numbers.cc:_ZN4absl12lts_2024011612_GLOBAL__N_116PrepareTwoDigitsEj:
  221|     70|inline uint32_t PrepareTwoDigits(uint32_t i) {
  222|     70|  return PrepareTwoDigitsImpl(i, false);
  223|     70|}
numbers.cc:_ZN4absl12lts_2024011612_GLOBAL__N_120PrepareTwoDigitsImplEjb:
  215|     70|inline uint32_t PrepareTwoDigitsImpl(uint32_t i, bool reversed) {
  216|     70|  assert(i < 100);
  217|     70|  uint32_t div10 = (i * kDivisionBy10Mul) / kDivisionBy10Div;
  218|     70|  uint32_t mod10 = i - 10u * div10;
  219|     70|  return (div10 << (reversed ? 8 : 0)) + (mod10 << (reversed ? 0 : 8));
  ------------------
  |  Branch (219:21): [True: 0, False: 70]
  |  Branch (219:53): [True: 0, False: 70]
  ------------------
  220|     70|}
numbers.cc:_ZN4absl12lts_2024011612_GLOBAL__N_124safe_parse_sign_and_baseEPNSt3__117basic_string_viewIcNS2_11char_traitsIcEEEEPiPb:
  923|  1.49k|    absl::Nonnull<bool*> negative_ptr /*output*/) {
  924|  1.49k|  if (text->data() == nullptr) {
  ------------------
  |  Branch (924:7): [True: 0, False: 1.49k]
  ------------------
  925|      0|    return false;
  926|      0|  }
  927|       |
  928|  1.49k|  const char* start = text->data();
  929|  1.49k|  const char* end = start + text->size();
  930|  1.49k|  int base = *base_ptr;
  931|       |
  932|       |  // Consume whitespace.
  933|  1.49k|  while (start < end &&
  ------------------
  |  Branch (933:10): [True: 1.49k, False: 1]
  ------------------
  934|  1.49k|         absl::ascii_isspace(static_cast<unsigned char>(start[0]))) {
  ------------------
  |  Branch (934:10): [True: 0, False: 1.49k]
  ------------------
  935|      0|    ++start;
  936|      0|  }
  937|  1.55k|  while (start < end &&
  ------------------
  |  Branch (937:10): [True: 1.55k, False: 1]
  ------------------
  938|  1.55k|         absl::ascii_isspace(static_cast<unsigned char>(end[-1]))) {
  ------------------
  |  Branch (938:10): [True: 64, False: 1.49k]
  ------------------
  939|     64|    --end;
  940|     64|  }
  941|  1.49k|  if (start >= end) {
  ------------------
  |  Branch (941:7): [True: 1, False: 1.49k]
  ------------------
  942|      1|    return false;
  943|      1|  }
  944|       |
  945|       |  // Consume sign.
  946|  1.49k|  *negative_ptr = (start[0] == '-');
  947|  1.49k|  if (*negative_ptr || start[0] == '+') {
  ------------------
  |  Branch (947:7): [True: 1, False: 1.49k]
  |  Branch (947:24): [True: 350, False: 1.14k]
  ------------------
  948|    351|    ++start;
  949|    351|    if (start >= end) {
  ------------------
  |  Branch (949:9): [True: 11, False: 340]
  ------------------
  950|     11|      return false;
  951|     11|    }
  952|    351|  }
  953|       |
  954|       |  // Consume base-dependent prefix.
  955|       |  //  base 0: "0x" -> base 16, "0" -> base 8, default -> base 10
  956|       |  //  base 16: "0x" -> base 16
  957|       |  // Also validate the base.
  958|  1.48k|  if (base == 0) {
  ------------------
  |  Branch (958:7): [True: 0, False: 1.48k]
  ------------------
  959|      0|    if (end - start >= 2 && start[0] == '0' &&
  ------------------
  |  Branch (959:9): [True: 0, False: 0]
  |  Branch (959:29): [True: 0, False: 0]
  ------------------
  960|      0|        (start[1] == 'x' || start[1] == 'X')) {
  ------------------
  |  Branch (960:10): [True: 0, False: 0]
  |  Branch (960:29): [True: 0, False: 0]
  ------------------
  961|      0|      base = 16;
  962|      0|      start += 2;
  963|      0|      if (start >= end) {
  ------------------
  |  Branch (963:11): [True: 0, False: 0]
  ------------------
  964|       |        // "0x" with no digits after is invalid.
  965|      0|        return false;
  966|      0|      }
  967|      0|    } else if (end - start >= 1 && start[0] == '0') {
  ------------------
  |  Branch (967:16): [True: 0, False: 0]
  |  Branch (967:36): [True: 0, False: 0]
  ------------------
  968|      0|      base = 8;
  969|      0|      start += 1;
  970|      0|    } else {
  971|      0|      base = 10;
  972|      0|    }
  973|  1.48k|  } else if (base == 16) {
  ------------------
  |  Branch (973:14): [True: 0, False: 1.48k]
  ------------------
  974|      0|    if (end - start >= 2 && start[0] == '0' &&
  ------------------
  |  Branch (974:9): [True: 0, False: 0]
  |  Branch (974:29): [True: 0, False: 0]
  ------------------
  975|      0|        (start[1] == 'x' || start[1] == 'X')) {
  ------------------
  |  Branch (975:10): [True: 0, False: 0]
  |  Branch (975:29): [True: 0, False: 0]
  ------------------
  976|      0|      start += 2;
  977|      0|      if (start >= end) {
  ------------------
  |  Branch (977:11): [True: 0, False: 0]
  ------------------
  978|       |        // "0x" with no digits after is invalid.
  979|      0|        return false;
  980|      0|      }
  981|      0|    }
  982|  1.48k|  } else if (base >= 2 && base <= 36) {
  ------------------
  |  Branch (982:14): [True: 1.48k, False: 0]
  |  Branch (982:27): [True: 1.48k, False: 0]
  ------------------
  983|       |    // okay
  984|  1.48k|  } else {
  985|      0|    return false;
  986|      0|  }
  987|  1.48k|  *text = absl::string_view(start, static_cast<size_t>(end - start));
  988|  1.48k|  *base_ptr = base;
  989|  1.48k|  return true;
  990|  1.48k|}
numbers.cc:_ZN4absl12lts_2024011612_GLOBAL__N_118safe_uint_internalImEEbNSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPT_i:
 1305|  1.49k|                               absl::Nonnull<IntType*> value_p, int base) {
 1306|  1.49k|  *value_p = 0;
 1307|  1.49k|  bool negative;
 1308|  1.49k|  if (!safe_parse_sign_and_base(&text, &base, &negative) || negative) {
  ------------------
  |  Branch (1308:7): [True: 12, False: 1.48k]
  |  Branch (1308:61): [True: 1, False: 1.47k]
  ------------------
 1309|     13|    return false;
 1310|     13|  }
 1311|  1.47k|  return safe_parse_positive_int(text, base, value_p);
 1312|  1.49k|}
numbers.cc:_ZN4absl12lts_2024011612_GLOBAL__N_123safe_parse_positive_intImEEbNSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEiPT_:
 1208|  1.47k|                                    absl::Nonnull<IntType*> value_p) {
 1209|  1.47k|  IntType value = 0;
 1210|  1.47k|  const IntType vmax = std::numeric_limits<IntType>::max();
 1211|  1.47k|  assert(vmax > 0);
 1212|  1.47k|  assert(base >= 0);
 1213|  1.47k|  const IntType base_inttype = static_cast<IntType>(base);
 1214|  1.47k|  assert(vmax >= base_inttype);
 1215|  1.47k|  const IntType vmax_over_base = LookupTables<IntType>::kVmaxOverBase[base];
 1216|  1.47k|  assert(base < 2 ||
 1217|  1.47k|         std::numeric_limits<IntType>::max() / base_inttype == vmax_over_base);
 1218|  1.47k|  const char* start = text.data();
 1219|  1.47k|  const char* end = start + text.size();
 1220|       |  // loop over digits
 1221|  6.01k|  for (; start < end; ++start) {
  ------------------
  |  Branch (1221:10): [True: 4.61k, False: 1.39k]
  ------------------
 1222|  4.61k|    unsigned char c = static_cast<unsigned char>(start[0]);
 1223|  4.61k|    IntType digit = static_cast<IntType>(kAsciiToInt[c]);
 1224|  4.61k|    if (digit >= base_inttype) {
  ------------------
  |  Branch (1224:9): [True: 18, False: 4.59k]
  ------------------
 1225|     18|      *value_p = value;
 1226|     18|      return false;
 1227|     18|    }
 1228|  4.59k|    if (value > vmax_over_base) {
  ------------------
  |  Branch (1228:9): [True: 67, False: 4.53k]
  ------------------
 1229|     67|      *value_p = vmax;
 1230|     67|      return false;
 1231|     67|    }
 1232|  4.53k|    value *= base_inttype;
 1233|  4.53k|    if (value > vmax - digit) {
  ------------------
  |  Branch (1233:9): [True: 1, False: 4.53k]
  ------------------
 1234|      1|      *value_p = vmax;
 1235|      1|      return false;
 1236|      1|    }
 1237|  4.53k|    value += digit;
 1238|  4.53k|  }
 1239|  1.39k|  *value_p = value;
 1240|  1.39k|  return true;
 1241|  1.47k|}

_ZN4absl12lts_2024011616numbers_internal16safe_strtoi_baseImEEbNSt3__117basic_string_viewIcNS3_11char_traitsIcEEEEPT_i:
  354|  1.49k|                                           int base) {
  355|  1.49k|  static_assert(sizeof(*out) == 4 || sizeof(*out) == 8,
  356|  1.49k|                "SimpleAtoi works only with 32-bit or 64-bit integers.");
  357|  1.49k|  static_assert(!std::is_floating_point<int_type>::value,
  358|  1.49k|                "Use SimpleAtof or SimpleAtod instead.");
  359|  1.49k|  bool parsed;
  360|       |  // TODO(jorg): This signed-ness check is used because it works correctly
  361|       |  // with enums, and it also serves to check that int_type is not a pointer.
  362|       |  // If one day something like std::is_signed<enum E> works, switch to it.
  363|       |  // These conditions are constexpr bools to suppress MSVC warning C4127.
  364|  1.49k|  constexpr bool kIsSigned = static_cast<int_type>(1) - 2 < 0;
  365|  1.49k|  constexpr bool kUse64Bit = sizeof(*out) == 64 / 8;
  366|  1.49k|  if (kIsSigned) {
  ------------------
  |  Branch (366:7): [Folded, False: 1.49k]
  ------------------
  367|      0|    if (kUse64Bit) {
  ------------------
  |  Branch (367:9): [True: 0, Folded]
  ------------------
  368|      0|      int64_t val;
  369|      0|      parsed = numbers_internal::safe_strto64_base(s, &val, base);
  370|      0|      *out = static_cast<int_type>(val);
  371|      0|    } else {
  372|      0|      int32_t val;
  373|      0|      parsed = numbers_internal::safe_strto32_base(s, &val, base);
  374|      0|      *out = static_cast<int_type>(val);
  375|      0|    }
  376|  1.49k|  } else {
  377|  1.49k|    if (kUse64Bit) {
  ------------------
  |  Branch (377:9): [True: 1.49k, Folded]
  ------------------
  378|  1.49k|      uint64_t val;
  379|  1.49k|      parsed = numbers_internal::safe_strtou64_base(s, &val, base);
  380|  1.49k|      *out = static_cast<int_type>(val);
  381|  1.49k|    } else {
  382|      0|      uint32_t val;
  383|      0|      parsed = numbers_internal::safe_strtou32_base(s, &val, base);
  384|      0|      *out = static_cast<int_type>(val);
  385|      0|    }
  386|  1.49k|  }
  387|  1.49k|  return parsed;
  388|  1.49k|}
_ZN4absl12lts_2024011610SimpleAtoiImEEbNSt3__117basic_string_viewIcNS2_11char_traitsIcEEEEPT_:
  423|  1.49k|                                     absl::Nonnull<int_type*> out) {
  424|  1.49k|  return numbers_internal::safe_strtoi_base(str, out, 10);
  425|  1.49k|}

_ZN4absl12lts_202401166StrCatERKNS0_8AlphaNumES3_:
   58|      4|std::string StrCat(const AlphaNum& a, const AlphaNum& b) {
   59|      4|  std::string result;
   60|      4|  absl::strings_internal::STLStringResizeUninitialized(&result,
   61|      4|                                                       a.size() + b.size());
   62|      4|  char* const begin = &result[0];
   63|      4|  char* out = begin;
   64|      4|  out = Append(out, a);
   65|      4|  out = Append(out, b);
   66|       |  assert(out == begin + result.size());
   67|      4|  return result;
   68|      4|}
_ZN4absl12lts_202401166StrCatERKNS0_8AlphaNumES3_S3_:
   70|    417|std::string StrCat(const AlphaNum& a, const AlphaNum& b, const AlphaNum& c) {
   71|    417|  std::string result;
   72|    417|  strings_internal::STLStringResizeUninitialized(
   73|    417|      &result, a.size() + b.size() + c.size());
   74|    417|  char* const begin = &result[0];
   75|    417|  char* out = begin;
   76|    417|  out = Append(out, a);
   77|    417|  out = Append(out, b);
   78|    417|  out = Append(out, c);
   79|       |  assert(out == begin + result.size());
   80|    417|  return result;
   81|    417|}
str_cat.cc:_ZN4absl12lts_2024011612_GLOBAL__N_16AppendEPcRKNS0_8AlphaNumE:
   46|  1.25k|absl::Nonnull<char*> Append(absl::Nonnull<char*> out, const AlphaNum& x) {
   47|       |  // memcpy is allowed to overwrite arbitrary memory, so doing this after the
   48|       |  // call would force an extra fetch of x.size().
   49|  1.25k|  char* after = out + x.size();
   50|  1.25k|  if (x.size() != 0) {
  ------------------
  |  Branch (50:7): [True: 1.25k, False: 4]
  ------------------
   51|  1.25k|    memcpy(out, x.data(), x.size());
   52|  1.25k|  }
   53|  1.25k|  return after;
   54|  1.25k|}

_ZN4absl12lts_202401168AlphaNumC2EPKc:
  353|    425|      : piece_(NullSafeStringView(c_str)) {}
_ZN4absl12lts_202401168AlphaNumC2ENSt3__117basic_string_viewIcNS2_11char_traitsIcEEEE:
  356|    798|      : piece_(pc) {}
_ZNK4absl12lts_202401168AlphaNum4sizeEv:
  377|  5.03k|  absl::string_view::size_type size() const { return piece_.size(); }
_ZNK4absl12lts_202401168AlphaNum4dataEv:
  378|  1.25k|  absl::Nullable<const char*> data() const { return piece_.data(); }
_ZN4absl12lts_202401168AlphaNumC2INSt3__19allocatorIcEEEERKNS3_12basic_stringIcNS3_11char_traitsIcEET_EE:
  369|     36|      : piece_(str) {}

_ZN4absl12lts_2024011617UntypedFormatSpecC2ENSt3__117basic_string_viewIcNS2_11char_traitsIcEEEE:
  111|     84|  explicit UntypedFormatSpec(string_view s) : spec_(s) {}
_ZN4absl12lts_202401166FormatIJEEEbNS0_13FormatRawSinkERKNS0_19str_format_internal18FormatSpecTemplateIJXspclsr19str_format_internalE14ArgumentToConvIT_EEEEEEDpRKS5_:
  529|     48|            const Args&... args) {
  530|     48|  return str_format_internal::FormatUntyped(
  531|     48|      str_format_internal::FormatRawSinkImpl::Extract(raw_sink),
  532|     48|      str_format_internal::UntypedFormatSpecImpl::Extract(format),
  533|     48|      {str_format_internal::FormatArgImpl(args)...});
  534|     48|}
_ZN4absl12lts_2024011613FormatRawSinkC2INSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEvEEPT_:
  504|     84|      : sink_(raw) {}
_ZN4absl12lts_202401166FormatIJNS0_6int128EEEEbNS0_13FormatRawSinkERKNS0_19str_format_internal18FormatSpecTemplateIJXspclsr19str_format_internalE14ArgumentToConvIT_EEEEEEDpRKS6_:
  529|     36|            const Args&... args) {
  530|     36|  return str_format_internal::FormatUntyped(
  531|     36|      str_format_internal::FormatRawSinkImpl::Extract(raw_sink),
  532|     36|      str_format_internal::UntypedFormatSpecImpl::Extract(format),
  533|     36|      {str_format_internal::FormatArgImpl(args)...});
  534|     36|}

_ZN4absl12lts_2024011618NullSafeStringViewEPKc:
  762|    425|constexpr string_view NullSafeStringView(absl::Nullable<const char*> p) {
  763|    425|  return p ? string_view(p) : string_view();
  ------------------
  |  Branch (763:10): [True: 425, False: 0]
  ------------------
  764|    425|}

_ZN4absl12lts_2024011613ConsumePrefixEPNSt3__117basic_string_viewIcNS1_11char_traitsIcEEEES5_:
   48|     12|                          absl::string_view expected) {
   49|     12|  if (!absl::StartsWith(*str, expected)) return false;
  ------------------
  |  Branch (49:7): [True: 0, False: 12]
  ------------------
   50|     12|  str->remove_prefix(expected.size());
   51|     12|  return true;
   52|     12|}

_ZN4absl12lts_2024011624synchronization_internal14InvalidGraphIdEv:
   58|    172|inline GraphId InvalidGraphId() {
   59|    172|  return GraphId{0};
   60|    172|}

_ZN4absl12lts_202401165Mutex4LockEv:
 1524|    172|void Mutex::Lock() {
 1525|    172|  ABSL_TSAN_MUTEX_PRE_LOCK(this, 0);
 1526|    172|  GraphId id = DebugOnlyDeadlockCheck(this);
 1527|    172|  intptr_t v = mu_.load(std::memory_order_relaxed);
 1528|       |  // try fast acquire, then spin loop
 1529|    172|  if (ABSL_PREDICT_FALSE((v & (kMuWriter | kMuReader | kMuEvent)) != 0) ||
  ------------------
  |  |  178|    344|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 172]
  |  |  |  Branch (178:49): [Folded, False: 172]
  |  |  |  Branch (178:58): [True: 0, False: 172]
  |  |  ------------------
  ------------------
 1530|    172|      ABSL_PREDICT_FALSE(!mu_.compare_exchange_strong(
  ------------------
  |  |  178|    172|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 172]
  |  |  |  Branch (178:49): [Folded, False: 172]
  |  |  |  Branch (178:58): [True: 0, False: 172]
  |  |  ------------------
  ------------------
 1531|    172|          v, kMuWriter | v, std::memory_order_acquire,
 1532|    172|          std::memory_order_relaxed))) {
 1533|       |    // try spin acquire, then slow loop
 1534|      0|    if (ABSL_PREDICT_FALSE(!TryAcquireWithSpinning(&this->mu_))) {
  ------------------
  |  |  178|      0|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  ------------------
  |  |  |  Branch (178:31): [True: 0, False: 0]
  |  |  |  Branch (178:49): [Folded, False: 0]
  |  |  |  Branch (178:58): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1535|      0|      this->LockSlow(kExclusive, nullptr, 0);
 1536|      0|    }
 1537|      0|  }
 1538|    172|  DebugOnlyLockEnter(this, id);
 1539|    172|  ABSL_TSAN_MUTEX_POST_LOCK(this, 0, 0);
 1540|    172|}
_ZN4absl12lts_202401165Mutex6UnlockEv:
 1702|    172|void Mutex::Unlock() {
 1703|    172|  ABSL_TSAN_MUTEX_PRE_UNLOCK(this, 0);
 1704|    172|  DebugOnlyLockLeave(this);
 1705|    172|  intptr_t v = mu_.load(std::memory_order_relaxed);
 1706|       |
 1707|    172|  if (kDebugMode && ((v & (kMuWriter | kMuReader)) != kMuWriter)) {
  ------------------
  |  Branch (1707:7): [Folded, False: 172]
  |  Branch (1707:21): [True: 0, False: 0]
  ------------------
 1708|      0|    ABSL_RAW_LOG(FATAL, "Mutex unlocked when destroyed or not locked: v=0x%x",
  ------------------
  |  |   45|      0|  do {                                                                         \
  |  |   46|      0|    constexpr const char* absl_raw_log_internal_basename =                     \
  |  |   47|      0|        ::absl::raw_log_internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
  |  |   48|      0|    ::absl::raw_log_internal::RawLog(ABSL_RAW_LOG_INTERNAL_##severity,         \
  |  |  ------------------
  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  ------------------
  |  |   49|      0|                                     absl_raw_log_internal_basename, __LINE__, \
  |  |   50|      0|                                     __VA_ARGS__);                             \
  |  |   51|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                        \
  |  |  ------------------
  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  ------------------
  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   52|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (52:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1709|      0|                 static_cast<unsigned>(v));
 1710|      0|  }
 1711|       |
 1712|       |  // should_try_cas is whether we'll try a compare-and-swap immediately.
 1713|       |  // NOTE: optimized out when kDebugMode is false.
 1714|    172|  bool should_try_cas = ((v & (kMuEvent | kMuWriter)) == kMuWriter &&
  ------------------
  |  Branch (1714:26): [True: 172, False: 0]
  ------------------
 1715|    172|                         (v & (kMuWait | kMuDesig)) != kMuWait);
  ------------------
  |  Branch (1715:26): [True: 172, False: 0]
  ------------------
 1716|       |  // But, we can use an alternate computation of it, that compilers
 1717|       |  // currently don't find on their own.  When that changes, this function
 1718|       |  // can be simplified.
 1719|    172|  intptr_t x = (v ^ (kMuWriter | kMuWait)) & (kMuWriter | kMuEvent);
 1720|    172|  intptr_t y = (v ^ (kMuWriter | kMuWait)) & (kMuWait | kMuDesig);
 1721|       |  // Claim: "x == 0 && y > 0" is equal to should_try_cas.
 1722|       |  // Also, because kMuWriter and kMuEvent exceed kMuDesig and kMuWait,
 1723|       |  // all possible non-zero values for x exceed all possible values for y.
 1724|       |  // Therefore, (x == 0 && y > 0) == (x < y).
 1725|    172|  if (kDebugMode && should_try_cas != (x < y)) {
  ------------------
  |  Branch (1725:7): [Folded, False: 172]
  |  Branch (1725:21): [True: 0, False: 0]
  ------------------
 1726|       |    // We would usually use PRIdPTR here, but is not correctly implemented
 1727|       |    // within the android toolchain.
 1728|      0|    ABSL_RAW_LOG(FATAL, "internal logic error %llx %llx %llx\n",
  ------------------
  |  |   45|      0|  do {                                                                         \
  |  |   46|      0|    constexpr const char* absl_raw_log_internal_basename =                     \
  |  |   47|      0|        ::absl::raw_log_internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
  |  |   48|      0|    ::absl::raw_log_internal::RawLog(ABSL_RAW_LOG_INTERNAL_##severity,         \
  |  |  ------------------
  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  ------------------
  |  |   49|      0|                                     absl_raw_log_internal_basename, __LINE__, \
  |  |   50|      0|                                     __VA_ARGS__);                             \
  |  |   51|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                        \
  |  |  ------------------
  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  ------------------
  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   52|      0|  } while (0)
  |  |  ------------------
  |  |  |  Branch (52:12): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1729|      0|                 static_cast<long long>(v), static_cast<long long>(x),
 1730|      0|                 static_cast<long long>(y));
 1731|      0|  }
 1732|    172|  if (x < y && mu_.compare_exchange_strong(v, v & ~(kMuWrWait | kMuWriter),
  ------------------
  |  Branch (1732:7): [True: 172, False: 0]
  |  Branch (1732:16): [True: 172, False: 0]
  ------------------
 1733|    172|                                           std::memory_order_release,
 1734|    172|                                           std::memory_order_relaxed)) {
 1735|       |    // fast writer release (writer with no waiters or with designated waker)
 1736|    172|  } else {
 1737|      0|    this->UnlockSlow(nullptr /*no waitp*/);  // take slow path
 1738|      0|  }
 1739|    172|  ABSL_TSAN_MUTEX_POST_UNLOCK(this, 0);
 1740|    172|}
mutex.cc:_ZN4absl12lts_20240116L22DebugOnlyDeadlockCheckEPNS0_5MutexE:
 1466|    172|static inline GraphId DebugOnlyDeadlockCheck(Mutex* mu) {
 1467|    172|  if (kDebugMode && synch_deadlock_detection.load(std::memory_order_acquire) !=
  ------------------
  |  Branch (1467:7): [Folded, False: 172]
  |  Branch (1467:21): [True: 0, False: 0]
  ------------------
 1468|      0|                        OnDeadlockCycle::kIgnore) {
 1469|      0|    return DeadlockCheck(mu);
 1470|    172|  } else {
 1471|    172|    return InvalidGraphId();
 1472|    172|  }
 1473|    172|}
mutex.cc:_ZN4absl12lts_20240116L18DebugOnlyLockEnterEPNS0_5MutexENS0_24synchronization_internal7GraphIdE:
 1298|    172|static inline void DebugOnlyLockEnter(Mutex* mu, GraphId id) {
 1299|    172|  if (kDebugMode) {
  ------------------
  |  Branch (1299:7): [Folded, False: 172]
  ------------------
 1300|      0|    if (synch_deadlock_detection.load(std::memory_order_acquire) !=
  ------------------
  |  Branch (1300:9): [True: 0, False: 0]
  ------------------
 1301|      0|        OnDeadlockCycle::kIgnore) {
 1302|      0|      LockEnter(mu, id, Synch_GetAllLocks());
 1303|      0|    }
 1304|      0|  }
 1305|    172|}
mutex.cc:_ZN4absl12lts_20240116L18DebugOnlyLockLeaveEPNS0_5MutexE:
 1308|    172|static inline void DebugOnlyLockLeave(Mutex* mu) {
 1309|    172|  if (kDebugMode) {
  ------------------
  |  Branch (1309:7): [Folded, False: 172]
  ------------------
 1310|      0|    if (synch_deadlock_detection.load(std::memory_order_acquire) !=
  ------------------
  |  Branch (1310:9): [True: 0, False: 0]
  ------------------
 1311|      0|        OnDeadlockCycle::kIgnore) {
 1312|      0|      LockLeave(mu, GetGraphId(mu), Synch_GetAllLocks());
 1313|      0|    }
 1314|      0|  }
 1315|    172|}

_ZN4absl12lts_202401169MutexLockC2EPNS0_5MutexE:
  583|    130|  explicit MutexLock(Mutex* mu) ABSL_EXCLUSIVE_LOCK_FUNCTION(mu) : mu_(mu) {
  584|    130|    this->mu_->Lock();
  585|    130|  }
_ZN4absl12lts_202401169MutexLockD2Ev:
  601|    130|  ~MutexLock() ABSL_UNLOCK_FUNCTION() { this->mu_->Unlock(); }
_ZN4absl12lts_202401165MutexC2Ev:
 1061|     40|inline Mutex::Mutex() : mu_(0) {
 1062|     40|  ABSL_TSAN_MUTEX_CREATE(this, __tsan_mutex_not_static);
 1063|     40|}

_ZN4absl12lts_202401163NowEv:
   39|  41.0k|Time Now() {
   40|       |  // TODO(bww): Get a timespec instead so we don't have to divide.
   41|  41.0k|  int64_t n = absl::GetCurrentTimeNanos();
   42|  41.0k|  if (n >= 0) {
  ------------------
  |  Branch (42:7): [True: 41.0k, False: 0]
  ------------------
   43|  41.0k|    return time_internal::FromUnixDuration(
   44|  41.0k|        time_internal::MakeDuration(n / 1000000000, n % 1000000000 * 4));
   45|  41.0k|  }
   46|      0|  return time_internal::FromUnixDuration(absl::Nanoseconds(n));
   47|  41.0k|}
_ZN4absl12lts_2024011619GetCurrentTimeNanosEv:
   77|  41.0k|int64_t GetCurrentTimeNanos() { return GET_CURRENT_TIME_NANOS_FROM_SYSTEM(); }
  ------------------
  |  |   71|  41.0k|  ::absl::time_internal::GetCurrentTimeNanosFromSystem()
  ------------------

_ZN4absl12lts_202401168DurationmIES1_:
  419|  20.5k|Duration& Duration::operator-=(Duration rhs) {
  420|  20.5k|  if (time_internal::IsInfiniteDuration(*this)) return *this;
  ------------------
  |  Branch (420:7): [True: 0, False: 20.5k]
  ------------------
  421|  20.5k|  if (time_internal::IsInfiniteDuration(rhs)) {
  ------------------
  |  Branch (421:7): [True: 0, False: 20.5k]
  ------------------
  422|      0|    return *this = rhs.rep_hi_.Get() >= 0 ? -InfiniteDuration()
  ------------------
  |  Branch (422:20): [True: 0, False: 0]
  ------------------
  423|      0|                                          : InfiniteDuration();
  424|      0|  }
  425|  20.5k|  const int64_t orig_rep_hi = rep_hi_.Get();
  426|  20.5k|  rep_hi_ = DecodeTwosComp(EncodeTwosComp(rep_hi_.Get()) -
  427|  20.5k|                           EncodeTwosComp(rhs.rep_hi_.Get()));
  428|  20.5k|  if (rep_lo_ < rhs.rep_lo_) {
  ------------------
  |  Branch (428:7): [True: 126, False: 20.3k]
  ------------------
  429|    126|    rep_hi_ = DecodeTwosComp(EncodeTwosComp(rep_hi_.Get()) - 1);
  430|    126|    rep_lo_ += kTicksPerSecond;
  431|    126|  }
  432|  20.5k|  rep_lo_ -= rhs.rep_lo_;
  433|  20.5k|  if (rhs.rep_hi_.Get() < 0 ? rep_hi_.Get() < orig_rep_hi
  ------------------
  |  Branch (433:7): [True: 0, False: 20.5k]
  |  Branch (433:7): [True: 0, False: 20.5k]
  ------------------
  434|  20.5k|                            : rep_hi_.Get() > orig_rep_hi) {
  435|      0|    return *this = rhs.rep_hi_.Get() >= 0 ? -InfiniteDuration()
  ------------------
  |  Branch (435:20): [True: 0, False: 0]
  ------------------
  436|      0|                                          : InfiniteDuration();
  437|      0|  }
  438|  20.5k|  return *this;
  439|  20.5k|}
duration.cc:_ZN4absl12lts_2024011612_GLOBAL__N_114DecodeTwosCompEm:
  187|  20.6k|inline int64_t DecodeTwosComp(uint64_t v) { return absl::bit_cast<int64_t>(v); }
duration.cc:_ZN4absl12lts_2024011612_GLOBAL__N_114EncodeTwosCompEl:
  184|  41.1k|inline uint64_t EncodeTwosComp(int64_t v) {
  185|  41.1k|  return absl::bit_cast<uint64_t>(v);
  186|  41.1k|}

clock.cc:_ZN4absl12lts_2024011613time_internalL29GetCurrentTimeNanosFromSystemEv:
   13|  41.0k|static int64_t GetCurrentTimeNanosFromSystem() {
   14|  41.0k|  const int64_t kNanosPerSecond = 1000 * 1000 * 1000;
   15|  41.0k|  struct timespec ts;
   16|  41.0k|  ABSL_RAW_CHECK(clock_gettime(CLOCK_REALTIME, &ts) == 0,
  ------------------
  |  |   60|  41.0k|  do {                                                                 \
  |  |   61|  41.0k|    if (ABSL_PREDICT_FALSE(!(condition))) {                            \
  |  |  ------------------
  |  |  |  |  178|  41.0k|#define ABSL_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (178:31): [True: 0, False: 41.0k]
  |  |  |  |  |  Branch (178:49): [Folded, False: 41.0k]
  |  |  |  |  |  Branch (178:58): [True: 0, False: 41.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   62|      0|      ABSL_RAW_LOG(FATAL, "Check %s failed: %s", #condition, message); \
  |  |  ------------------
  |  |  |  |   45|      0|  do {                                                                         \
  |  |  |  |   46|      0|    constexpr const char* absl_raw_log_internal_basename =                     \
  |  |  |  |   47|      0|        ::absl::raw_log_internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
  |  |  |  |   48|      0|    ::absl::raw_log_internal::RawLog(ABSL_RAW_LOG_INTERNAL_##severity,         \
  |  |  |  |  ------------------
  |  |  |  |  |  |  110|      0|#define ABSL_RAW_LOG_INTERNAL_FATAL ::absl::LogSeverity::kFatal
  |  |  |  |  ------------------
  |  |  |  |   49|      0|                                     absl_raw_log_internal_basename, __LINE__, \
  |  |  |  |   50|      0|                                     __VA_ARGS__);                             \
  |  |  |  |   51|      0|    ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_##severity;                        \
  |  |  |  |  ------------------
  |  |  |  |  |  |  118|      0|#define ABSL_RAW_LOG_INTERNAL_MAYBE_UNREACHABLE_FATAL ABSL_UNREACHABLE()
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  225|      0|  do {                                           \
  |  |  |  |  |  |  |  |  226|      0|    /* NOLINTNEXTLINE: misc-static-assert */     \
  |  |  |  |  |  |  |  |  227|      0|    assert(false && "ABSL_UNREACHABLE reached"); \
  |  |  |  |  |  |  |  |  228|      0|    ABSL_INTERNAL_UNREACHABLE_IMPL();            \
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  203|      0|#define ABSL_INTERNAL_UNREACHABLE_IMPL() __builtin_unreachable()
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  229|      0|  } while (false)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (229:12): [Folded, False: 0]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   52|      0|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (52:12): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   63|      0|    }                                                                  \
  |  |   64|  41.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (64:12): [Folded, False: 41.0k]
  |  |  ------------------
  ------------------
   17|  41.0k|                 "Failed to read real-time clock.");
   18|  41.0k|  return (int64_t{ts.tv_sec} * kNanosPerSecond +
   19|  41.0k|          int64_t{ts.tv_nsec});
   20|  41.0k|}

_ZN4absl12lts_202401168DurationC2Ev:
  169|  20.5k|  constexpr Duration() : rep_hi_(0), rep_lo_(0) {}  // zero-length duration
_ZN4absl12lts_202401168DurationC2Elj:
  231|  64.3k|  constexpr Duration(int64_t hi, uint32_t lo) : rep_hi_(hi), rep_lo_(lo) {}
_ZN4absl12lts_202401168Duration5HiRepC2El:
  251|  84.8k|          lo_(0),
  252|  84.8k|          hi_(0)
  253|       |#endif
  254|  84.8k|    {
  255|  84.8k|      *this = value;
  256|  84.8k|    }
_ZNK4absl12lts_202401168Duration5HiRep3GetEv:
  258|   149k|    constexpr int64_t Get() const {
  259|   149k|      const uint64_t unsigned_value =
  260|   149k|          (static_cast<uint64_t>(hi_) << 32) | static_cast<uint64_t>(lo_);
  261|       |      // `static_cast<int64_t>(unsigned_value)` is implementation-defined
  262|       |      // before c++20. On all supported platforms the behaviour is that mandated
  263|       |      // by c++20, i.e. "If the destination type is signed, [...] the result is
  264|       |      // the unique value of the destination type equal to the source value
  265|       |      // modulo 2^n, where n is the number of bits used to represent the
  266|       |      // destination type."
  267|   149k|      static_assert(
  268|   149k|          (static_cast<int64_t>((std::numeric_limits<uint64_t>::max)()) ==
  269|   149k|           int64_t{-1}) &&
  270|   149k|              (static_cast<int64_t>(static_cast<uint64_t>(
  271|   149k|                                        (std::numeric_limits<int64_t>::max)()) +
  272|   149k|                                    1) ==
  273|   149k|               (std::numeric_limits<int64_t>::min)()),
  274|   149k|          "static_cast<int64_t>(uint64_t) does not have c++20 semantics");
  275|   149k|      return static_cast<int64_t>(unsigned_value);
  276|   149k|    }
_ZN4absl12lts_202401168Duration5HiRepaSEl:
  278|   105k|    constexpr HiRep& operator=(const int64_t value) {
  279|       |      // "If the destination type is unsigned, the resulting value is the
  280|       |      // smallest unsigned value equal to the source value modulo 2^n
  281|       |      // where `n` is the number of bits used to represent the destination
  282|       |      // type".
  283|   105k|      const auto unsigned_value = static_cast<uint64_t>(value);
  284|   105k|      hi_ = static_cast<uint32_t>(unsigned_value >> 32);
  285|   105k|      lo_ = static_cast<uint32_t>(unsigned_value);
  286|   105k|      return *this;
  287|   105k|    }
_ZN4absl12lts_202401164TimeC2ENS0_8DurationE:
  850|  64.3k|  constexpr explicit Time(Duration rep) : rep_(rep) {}
_ZN4absl12lts_20240116leENS0_8DurationES1_:
  320|      8|                                                        Duration rhs) {
  321|      8|  return !(rhs < lhs);
  322|      8|}
_ZN4absl12lts_20240116miENS0_8DurationES1_:
  337|  20.5k|                                                        Duration rhs) {
  338|  20.5k|  return lhs -= rhs;
  339|  20.5k|}
_ZN4absl12lts_2024011612ZeroDurationEv:
  416|  20.5k|ABSL_ATTRIBUTE_CONST_FUNCTION constexpr Duration ZeroDuration() {
  417|  20.5k|  return Duration();
  418|  20.5k|}
_ZN4absl12lts_20240116eqENS0_4TimeES1_:
  867|  23.3k|ABSL_ATTRIBUTE_CONST_FUNCTION constexpr bool operator==(Time lhs, Time rhs) {
  868|  23.3k|  return lhs.rep_ == rhs.rep_;
  869|  23.3k|}
_ZN4absl12lts_20240116neENS0_4TimeES1_:
  870|  23.3k|ABSL_ATTRIBUTE_CONST_FUNCTION constexpr bool operator!=(Time lhs, Time rhs) {
  871|  23.3k|  return !(lhs == rhs);
  872|  23.3k|}
_ZN4absl12lts_20240116miENS0_4TimeES1_:
  884|  20.5k|ABSL_ATTRIBUTE_CONST_FUNCTION inline Duration operator-(Time lhs, Time rhs) {
  885|  20.5k|  return lhs.rep_ - rhs.rep_;
  886|  20.5k|}
_ZN4absl12lts_2024011614InfiniteFutureEv:
  907|  23.3k|ABSL_ATTRIBUTE_CONST_FUNCTION constexpr Time InfiniteFuture() {
  908|  23.3k|  return Time(time_internal::MakeDuration((std::numeric_limits<int64_t>::max)(),
  909|  23.3k|                                          ~uint32_t{0}));
  910|  23.3k|}
_ZN4absl12lts_2024011613time_internal12MakeDurationElj:
 1561|  64.3k|                                                              uint32_t lo = 0) {
 1562|  64.3k|  return Duration(hi, lo);
 1563|  64.3k|}
_ZN4absl12lts_2024011613time_internal12MakeDurationEll:
 1566|  41.0k|                                                              int64_t lo) {
 1567|  41.0k|  return MakeDuration(hi, static_cast<uint32_t>(lo));
 1568|  41.0k|}
_ZN4absl12lts_2024011613time_internal8GetRepHiENS0_8DurationE:
 1593|  46.6k|ABSL_ATTRIBUTE_CONST_FUNCTION constexpr int64_t GetRepHi(Duration d) {
 1594|  46.6k|  return d.rep_hi_.Get();
 1595|  46.6k|}
_ZN4absl12lts_2024011613time_internal8GetRepLoENS0_8DurationE:
 1596|  87.6k|ABSL_ATTRIBUTE_CONST_FUNCTION constexpr uint32_t GetRepLo(Duration d) {
 1597|  87.6k|  return d.rep_lo_;
 1598|  87.6k|}
_ZN4absl12lts_2024011613time_internal18IsInfiniteDurationENS0_8DurationE:
 1601|  41.0k|ABSL_ATTRIBUTE_CONST_FUNCTION constexpr bool IsInfiniteDuration(Duration d) {
 1602|  41.0k|  return GetRepLo(d) == ~uint32_t{0};
 1603|  41.0k|}
_ZN4absl12lts_2024011613time_internal16FromUnixDurationENS0_8DurationE:
 1626|  41.0k|ABSL_ATTRIBUTE_CONST_FUNCTION constexpr Time FromUnixDuration(Duration d) {
 1627|  41.0k|  return Time(d);
 1628|  41.0k|}
_ZN4absl12lts_20240116ltENS0_8DurationES1_:
 1721|      8|                                                       Duration rhs) {
 1722|      8|  return time_internal::GetRepHi(lhs) != time_internal::GetRepHi(rhs)
  ------------------
  |  Branch (1722:10): [True: 4, False: 4]
  ------------------
 1723|      8|             ? time_internal::GetRepHi(lhs) < time_internal::GetRepHi(rhs)
 1724|      8|         : time_internal::GetRepHi(lhs) == (std::numeric_limits<int64_t>::min)()
  ------------------
  |  Branch (1724:12): [True: 0, False: 4]
  ------------------
 1725|      4|             ? time_internal::GetRepLo(lhs) + 1 <
 1726|      0|                   time_internal::GetRepLo(rhs) + 1
 1727|      4|             : time_internal::GetRepLo(lhs) < time_internal::GetRepLo(rhs);
 1728|      8|}
_ZN4absl12lts_20240116eqENS0_8DurationES1_:
 1731|  23.3k|                                                        Duration rhs) {
 1732|  23.3k|  return time_internal::GetRepHi(lhs) == time_internal::GetRepHi(rhs) &&
  ------------------
  |  Branch (1732:10): [True: 23.3k, False: 0]
  ------------------
 1733|  23.3k|         time_internal::GetRepLo(lhs) == time_internal::GetRepLo(rhs);
  ------------------
  |  Branch (1733:10): [True: 23.3k, False: 0]
  ------------------
 1734|  23.3k|}
_ZN4absl12lts_2024011616InfiniteDurationEv:
 1762|      4|ABSL_ATTRIBUTE_CONST_FUNCTION constexpr Duration InfiniteDuration() {
 1763|      4|  return time_internal::MakeDuration((std::numeric_limits<int64_t>::max)(),
 1764|      4|                                     ~uint32_t{0});
 1765|      4|}
_ZN4absl12lts_202401164TimeC2Ev:
  780|      4|  constexpr Time() = default;

_ZN4absl12lts_2024011613span_internal11GetDataImplIKNSt3__16vectorIN8fuzztest8internal8IRObjectENS3_9allocatorIS7_EEEEEEDTcldtfp_4dataEERT_c:
   38|  62.1k|    -> decltype(c.data()) {
   39|  62.1k|  return c.data();
   40|  62.1k|}
_ZN4absl12lts_2024011613span_internal7GetDataIKNSt3__16vectorIN8fuzztest8internal8IRObjectENS3_9allocatorIS7_EEEEEEDTcl11GetDataImplfp_Li0EEERT_:
   50|  62.1k|    -> decltype(GetDataImpl(c, 0)) {
   51|  62.1k|  return GetDataImpl(c, 0);
   52|  62.1k|}
_ZN4absl12lts_2024011613span_internal7GetDataIKNSt3__16vectorIjNS3_9allocatorIjEEEEEEDTcl11GetDataImplfp_Li0EEERT_:
   50|      2|    -> decltype(GetDataImpl(c, 0)) {
   51|      2|  return GetDataImpl(c, 0);
   52|      2|}
_ZN4absl12lts_2024011613span_internal11GetDataImplIKNSt3__16vectorIjNS3_9allocatorIjEEEEEEDTcldtfp_4dataEERT_c:
   38|      2|    -> decltype(c.data()) {
   39|      2|  return c.data();
   40|      2|}
_ZN4absl12lts_2024011613span_internal7GetDataIKNS0_4SpanIjEEEEDTcl11GetDataImplfp_Li0EEERT_:
   50|     16|    -> decltype(GetDataImpl(c, 0)) {
   51|     16|  return GetDataImpl(c, 0);
   52|     16|}
_ZN4absl12lts_2024011613span_internal11GetDataImplIKNS0_4SpanIjEEEEDTcldtfp_4dataEERT_c:
   38|     16|    -> decltype(c.data()) {
   39|     16|  return c.data();
   40|     16|}

_ZNK4absl12lts_202401164SpanIKNS0_19str_format_internal13FormatArgImplEE4sizeEv:
  286|     36|  constexpr size_type size() const noexcept { return len_; }
_ZNK4absl12lts_202401164SpanIKN8fuzztest8internal8IRObjectEE4sizeEv:
  286|  62.1k|  constexpr size_type size() const noexcept { return len_; }
_ZNK4absl12lts_202401164SpanIKN8fuzztest8internal8IRObjectEEixEm:
  301|   330k|  constexpr reference operator[](size_type i) const noexcept {
  302|   330k|    return ABSL_HARDENING_ASSERT(i < size()), ptr_[i];
  ------------------
  |  |  128|   330k|#define ABSL_HARDENING_ASSERT(expr) ABSL_ASSERT(expr)
  |  |  ------------------
  |  |  |  |   95|   330k|  (false ? static_cast<void>(expr) : static_cast<void>(0))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (95:4): [Folded, False: 330k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  303|   330k|  }
_ZNK4absl12lts_202401164SpanIjE4dataEv:
  281|     20|  constexpr pointer data() const noexcept { return ptr_; }
_ZNK4absl12lts_202401164SpanIjE4sizeEv:
  286|     20|  constexpr size_type size() const noexcept { return len_; }
_ZN4absl12lts_202401168MakeSpanITpTnRiJEhEENS0_4SpanIT0_EEPS4_m:
  685|      2|constexpr Span<T> MakeSpan(absl::Nullable<T*> ptr, size_t size) noexcept {
  686|      2|  return Span<T>(ptr, size);
  687|      2|}
_ZNK4absl12lts_202401164SpanIhE4dataEv:
  281|      2|  constexpr pointer data() const noexcept { return ptr_; }
_ZNK4absl12lts_202401164SpanIhE4sizeEv:
  286|      2|  constexpr size_type size() const noexcept { return len_; }
_ZN4absl12lts_202401168MakeSpanITpTnRiJEKjEENS0_4SpanIT0_EEPS5_S7_:
  690|      8|Span<T> MakeSpan(absl::Nullable<T*> begin, absl::Nullable<T*> end) noexcept {
  691|      8|  return ABSL_HARDENING_ASSERT(begin <= end),
  ------------------
  |  |  128|      8|#define ABSL_HARDENING_ASSERT(expr) ABSL_ASSERT(expr)
  |  |  ------------------
  |  |  |  |   95|      8|  (false ? static_cast<void>(expr) : static_cast<void>(0))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (95:4): [Folded, False: 8]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  692|      8|         Span<T>(begin, static_cast<size_t>(end - begin));
  693|      8|}
_ZNK4absl12lts_202401164SpanIKjE4dataEv:
  281|     50|  constexpr pointer data() const noexcept { return ptr_; }
_ZNK4absl12lts_202401164SpanIKjE5beginEv:
  336|     24|  constexpr iterator begin() const noexcept { return data(); }
_ZNK4absl12lts_202401164SpanIKjE4sizeEv:
  286|     98|  constexpr size_type size() const noexcept { return len_; }
_ZNK4absl12lts_202401164SpanIKjE3endEv:
  349|     24|  constexpr iterator end() const noexcept { return data() + size(); }
_ZNK4absl12lts_202401164SpanIKjE5emptyEv:
  296|      8|  constexpr bool empty() const noexcept { return size() == 0; }
_ZNK4absl12lts_202401164SpanIKjEixEm:
  301|     36|  constexpr reference operator[](size_type i) const noexcept {
  302|     36|    return ABSL_HARDENING_ASSERT(i < size()), ptr_[i];
  ------------------
  |  |  128|     36|#define ABSL_HARDENING_ASSERT(expr) ABSL_ASSERT(expr)
  |  |  ------------------
  |  |  |  |   95|     36|  (false ? static_cast<void>(expr) : static_cast<void>(0))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (95:4): [Folded, False: 36]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  303|     36|  }
_ZN4absl12lts_202401164SpanIKNS0_19str_format_internal13FormatArgImplEEC2IS4_S4_EESt16initializer_listIS3_E:
  273|     36|      : Span(v.begin(), v.size()) {}
_ZN4absl12lts_202401164SpanIKNS0_19str_format_internal13FormatArgImplEEC2EPS4_m:
  193|     84|      : ptr_(array), len_(length) {}
_ZN4absl12lts_202401164SpanIKNS0_19str_format_internal13FormatArgImplEEC2Ev:
  191|     48|  constexpr Span() noexcept : Span(nullptr, 0) {}
_ZN4absl12lts_202401164SpanIKN8fuzztest8internal8IRObjectEEC2INSt3__16vectorIS4_NS8_9allocatorIS4_EEEEvSC_iEERKT_:
  217|  62.1k|      : Span(span_internal::GetData(v), v.size()) {}
_ZN4absl12lts_202401164SpanIKN8fuzztest8internal8IRObjectEEC2EPS5_m:
  193|  62.1k|      : ptr_(array), len_(length) {}
_ZN4absl12lts_202401164SpanIKN8fuzztest8internal8IRObjectEEC2Ev:
  191|      5|  constexpr Span() noexcept : Span(nullptr, 0) {}
_ZN4absl12lts_202401164SpanIKjEC2EPS2_m:
  193|     26|      : ptr_(array), len_(length) {}
_ZN4absl12lts_202401164SpanIhEC2EPhm:
  193|      2|      : ptr_(array), len_(length) {}
_ZN4absl12lts_202401164SpanIKjEC2INSt3__16vectorIjNS5_9allocatorIjEEEEvS9_iEERKT_:
  217|      2|      : Span(span_internal::GetData(v), v.size()) {}
_ZN4absl12lts_202401168MakeSpanITpTnRiJEjLm512EEENS0_4SpanIT0_EERAT1__S4_:
  702|      2|constexpr Span<T> MakeSpan(T (&array)[N]) noexcept {
  703|      2|  return Span<T>(array, N);
  704|      2|}
_ZN4absl12lts_202401164SpanIjEC2EPjm:
  193|     18|      : ptr_(array), len_(length) {}
_ZN4absl12lts_202401168MakeSpanITpTnRiJEjEENS0_4SpanIT0_EEPS4_m:
  685|     16|constexpr Span<T> MakeSpan(absl::Nullable<T*> ptr, size_t size) noexcept {
  686|     16|  return Span<T>(ptr, size);
  687|     16|}
_ZN4absl12lts_202401164SpanIKjEC2INS1_IjEEvS5_TnNSt3__19enable_ifIXsr6IsViewIT_EE5valueEiE4typeELi0EEERKS8_:
  232|     16|      : Span(span_internal::GetData(v), v.size()) {}
_ZNK4absl12lts_202401164SpanIjE5emptyEv:
  296|      2|  constexpr bool empty() const noexcept { return size() == 0; }
_ZNK4absl12lts_202401164SpanIKNS0_19str_format_internal13FormatArgImplEEixEm:
  301|     36|  constexpr reference operator[](size_type i) const noexcept {
  302|     36|    return ABSL_HARDENING_ASSERT(i < size()), ptr_[i];
  ------------------
  |  |  128|     36|#define ABSL_HARDENING_ASSERT(expr) ABSL_ASSERT(expr)
  |  |  ------------------
  |  |  |  |   95|     36|  (false ? static_cast<void>(expr) : static_cast<void>(0))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (95:4): [Folded, False: 36]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  303|     36|  }

_ZN4absl12lts_202401167forwardINS0_18container_internal12CommonFieldsEEEOT_RNSt3__116remove_referenceIS4_E4typeE:
  149|      6|    absl::remove_reference_t<T>& t) noexcept {  // NOLINT(runtime/references)
  150|      6|  return static_cast<T&&>(t);
  151|      6|}
_ZN4absl12lts_202401167forwardINS0_18container_internal10StringHashEEEOT_RNSt3__116remove_referenceIS4_E4typeE:
  149|      6|    absl::remove_reference_t<T>& t) noexcept {  // NOLINT(runtime/references)
  150|      6|  return static_cast<T&&>(t);
  151|      6|}
_ZN4absl12lts_202401167forwardINS0_18container_internal8StringEqEEEOT_RNSt3__116remove_referenceIS4_E4typeE:
  149|      6|    absl::remove_reference_t<T>& t) noexcept {  // NOLINT(runtime/references)
  150|      6|  return static_cast<T&&>(t);
  151|      6|}
_ZN4absl12lts_202401167forwardINSt3__19allocatorINS2_4pairIKNS2_17basic_string_viewIcNS2_11char_traitsIcEEEEPNS0_15CommandLineFlagEEEEEEEOT_RNS2_16remove_referenceISE_E4typeE:
  149|      6|    absl::remove_reference_t<T>& t) noexcept {  // NOLINT(runtime/references)
  150|      6|  return static_cast<T&&>(t);
  151|      6|}

_ZN8fuzztest15internal_no_adl9ArbitraryINSt3__112basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEEEDav:
  283|      4|auto Arbitrary() {
  284|      4|  return internal::ArbitraryImpl<T>{};
  285|      4|}
_ZN8fuzztest15internal_no_adl4JustImEEDaT_:
  305|      8|auto Just(T val) {
  306|      8|  return internal::ElementOfImpl<T>({std::move(val)});
  307|      8|}
_ZN8fuzztest15internal_no_adl9ArbitraryIbEEDav:
  283|     20|auto Arbitrary() {
  284|     20|  return internal::ArbitraryImpl<T>{};
  285|     20|}
_ZN8fuzztest15internal_no_adl4JustIiEEDaT_:
  305|      4|auto Just(T val) {
  306|      4|  return internal::ElementOfImpl<T>({std::move(val)});
  307|      4|}
_ZN8fuzztest15internal_no_adl7InRangeIiEEDaT_S2_:
  349|      4|auto InRange(T min, T max) {
  350|      4|  return internal::InRangeImpl<T>(min, max);
  351|      4|}
_ZN8fuzztest15internal_no_adl9ElementOfI15avifCodecChoiceEEDaSt16initializer_listIT_E:
  295|      4|auto ElementOf(std::initializer_list<T> values) {
  296|      4|  return internal::ElementOfImpl<T>(values);
  297|      4|}
_ZN8fuzztest15internal_no_adl9ElementOfI17avifDecoderSourceEEDaSt16initializer_listIT_E:
  295|      4|auto ElementOf(std::initializer_list<T> values) {
  296|      4|  return internal::ElementOfImpl<T>(values);
  297|      4|}
_ZN8fuzztest15internal_no_adl20BitFlagCombinationOfIjEEDaSt16initializer_listIT_E:
  461|      8|auto BitFlagCombinationOf(std::initializer_list<T> flags) {
  462|      8|  return internal::BitFlagCombinationOfImpl<T>(
  463|      8|      absl::MakeSpan(flags.begin(), flags.end()));
  464|      8|}
_ZN8fuzztest15internal_no_adl3MapITpTnRiJEPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS_8internal13ElementOfImplIS9_EENSD_11InRangeImplIiEENSE_ISA_EENSD_13ArbitraryImplIbvEESK_SK_SK_NSE_ImEESL_NSE_IiEENSD_24BitFlagCombinationOfImplIjEEEEEDaT0_DpT1_:
  717|      4|auto Map(Mapper mapper, Inner... inner) {
  718|      4|  return internal::MapImpl<Mapper, Inner...>(std::move(mapper),
  719|      4|                                             std::move(inner)...);
  720|      4|}
_ZN8fuzztest15internal_no_adl3MapITpTnRiJEPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEES8_jEJNS_8internal7MapImplIPFS8_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNSB_13ElementOfImplISD_EENSB_11InRangeImplIiEENSH_ISE_EENSB_13ArbitraryImplIbvEESN_SN_SN_NSH_ImEESO_NSH_IiEENSB_24BitFlagCombinationOfImplIjEEEEESR_EEEDaT0_DpT1_:
  717|      4|auto Map(Mapper mapper, Inner... inner) {
  718|      4|  return internal::MapImpl<Mapper, Inner...>(std::move(mapper),
  719|      4|                                             std::move(inner)...);
  720|      4|}
_ZN8fuzztest15internal_no_adl7TupleOfITpTnRiJEJNS_8internal23SequenceContainerOfImplINSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEENS3_13ArbitraryImplIcvEEEENSC_IbvEENS3_7MapImplIPFNS5_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEESL_jEJNSG_IPFSL_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS3_13ElementOfImplISO_EENS3_11InRangeImplIiEENSS_ISP_EESF_SF_SF_SF_NSS_ImEESX_NSS_IiEENS3_24BitFlagCombinationOfImplIjEEEEES10_EEEEEEDaDpT0_:
  584|      4|auto TupleOf(Inner... inner) {
  585|      4|  return internal::AggregateOfImpl<std::tuple<internal::value_type_t<Inner>...>,
  586|      4|                                   internal::RequireCustomCorpusType::kNo,
  587|      4|                                   Inner...>(std::in_place,
  588|      4|                                             std::move(inner)...);
  589|      4|}

main:
   21|      2|int main(int argc, char** argv) {
   22|      2|  testing::InitGoogleTest(&argc, argv);
   23|       |  // We call fuzztest::ParseAbslFlagsrather than absl::ParseCommandLine
   24|       |  // since the latter would complain about any unknown flags that need
   25|       |  // to be passed to legacy fuzzing engines (e.g. libfuzzer).
   26|      2|  fuzztest::ParseAbslFlags(argc, argv);
   27|      2|  fuzztest::InitFuzzTest(&argc, &argv);
   28|      2|  return RUN_ALL_TESTS();
   29|      2|}

_ZN8fuzztest22ReadFilesFromDirectoryENSt3__117basic_string_viewIcNS0_11char_traitsIcEEEE:
   67|      4|    std::string_view dir) {
   68|      4|  std::vector<std::tuple<std::string>> out;
   69|      4|  const std::filesystem::path fs_dir(dir);
   70|      4|  if (!std::filesystem::is_directory(fs_dir)) return out;
  ------------------
  |  Branch (70:7): [True: 0, False: 4]
  ------------------
   71|      4|  for (const auto& entry :
  ------------------
  |  Branch (71:26): [True: 404, False: 4]
  ------------------
   72|    404|       std::filesystem::recursive_directory_iterator(fs_dir)) {
   73|    404|    if (std::filesystem::is_directory(entry)) continue;
  ------------------
  |  Branch (73:9): [True: 0, False: 404]
  ------------------
   74|    404|    std::ifstream stream(entry.path().string());
   75|    404|    if (!stream.good()) {
  ------------------
  |  Branch (75:9): [True: 0, False: 404]
  ------------------
   76|       |      // Using stderr instead of GetStderr() to avoid
   77|       |      // initialization-order-fiasco when reading files at static init time with
   78|       |      // `.WithSeeds(fuzztest::ReadFilesFromDirectory(...))`.
   79|      0|      absl::FPrintF(stderr, "[!] %s:%d: Error reading %s: (%d) %s\n", __FILE__,
   80|      0|                    __LINE__, entry.path().string(), errno, strerror(errno));
   81|      0|      continue;
   82|      0|    }
   83|    404|    std::stringstream buffer;
   84|    404|    buffer << stream.rdbuf();
   85|    404|    out.push_back({buffer.str()});
   86|    404|  }
   87|      4|  return out;
   88|      4|}

_ZN8fuzztest19ListRegisteredTestsEv:
  168|      6|std::vector<std::string> ListRegisteredTests() {
  169|      6|  std::vector<std::string> result;
  170|      6|  internal::ForEachTest(
  171|      6|      [&](const auto& test) { result.push_back(test.full_name()); });
  172|      6|  return result;
  173|      6|}
_ZN8fuzztest25GetMatchingFuzzTestOrExitENSt3__117basic_string_viewIcNS0_11char_traitsIcEEEE:
  175|      2|std::string GetMatchingFuzzTestOrExit(std::string_view name) {
  176|      2|  const std::string partial_name(name);
  177|      2|  const std::vector<std::string> full_names = ListRegisteredTests();
  178|      2|  std::vector<const std::string*> matches;
  179|      4|  for (const std::string& full_name : full_names) {
  ------------------
  |  Branch (179:37): [True: 4, False: 0]
  ------------------
  180|      4|    if (absl::StrContains(full_name, partial_name)) {
  ------------------
  |  Branch (180:9): [True: 2, False: 2]
  ------------------
  181|      2|      if (full_name == partial_name) {
  ------------------
  |  Branch (181:11): [True: 2, False: 0]
  ------------------
  182|       |        // In case of an exact match, we end the search and use it. This is to
  183|       |        // handle the case when we want to select `MySuite.MyTest`, but the
  184|       |        // binary has both `MySuite.MyTest` and `MySuite.MyTestX`.
  185|      2|        return full_name;
  186|      2|      } else {
  187|      0|        matches.push_back(&full_name);
  188|      0|      }
  189|      2|    }
  190|      4|  }
  191|       |
  192|      0|  if (matches.empty()) {
  ------------------
  |  Branch (192:7): [True: 0, False: 0]
  ------------------
  193|      0|    absl::FPrintF(stderr, "\n\nNo FUZZ_TEST matches the name: %s\n\n",
  194|      0|                  partial_name);
  195|      0|    absl::FPrintF(stderr, "Valid tests:\n");
  196|      0|    for (const std::string& full_name : full_names) {
  ------------------
  |  Branch (196:39): [True: 0, False: 0]
  ------------------
  197|      0|      absl::FPrintF(stderr, " %s\n", full_name);
  198|      0|    }
  199|      0|    exit(1);
  200|      0|  } else if (matches.size() > 1) {
  ------------------
  |  Branch (200:14): [True: 0, False: 0]
  ------------------
  201|      0|    absl::FPrintF(stderr, "\n\nMultiple FUZZ_TESTs match the name: %s\n\n",
  202|      0|                  partial_name);
  203|      0|    absl::FPrintF(stderr, "Please select one. Matching tests:\n");
  204|      0|    for (const std::string* full_name : matches) {
  ------------------
  |  Branch (204:39): [True: 0, False: 0]
  ------------------
  205|      0|      absl::FPrintF(stderr, " %s\n", *full_name);
  206|      0|    }
  207|      0|    exit(1);
  208|      0|  }
  209|      0|  return *matches[0];
  210|      0|}
_ZN8fuzztest12InitFuzzTestEPiPPPcNSt3__117basic_string_viewIcNS4_11char_traitsIcEEEE:
  283|      2|void InitFuzzTest(int* argc, char*** argv, std::string_view binary_id) {
  284|      2|  const bool is_listing = absl::GetFlag(FUZZTEST_FLAG(list_fuzz_tests));
  ------------------
  |  |   20|      2|#define FUZZTEST_FLAG(name) FLAGS_##name
  ------------------
  285|      2|  if (is_listing) {
  ------------------
  |  Branch (285:7): [True: 0, False: 2]
  ------------------
  286|      0|    for (const auto& name : ListRegisteredTests()) {
  ------------------
  |  Branch (286:27): [True: 0, False: 0]
  ------------------
  287|      0|      std::cout << "[*] Fuzz test: " << name << '\n';
  288|      0|    }
  289|      0|    std::exit(0);
  290|      0|  }
  291|      2|  std::optional<absl::Duration> fuzzing_time_limit = GetFuzzingTime();
  292|      2|  std::optional<absl::Duration> replay_corpus_time_limit =
  293|      2|      GetReplayCorpusTime();
  294|      2|  FUZZTEST_INTERNAL_CHECK(
  ------------------
  |  |   48|      4|  ((cond) ? (void)0                                            \
  |  |  ------------------
  |  |  |  Branch (48:5): [True: 0, False: 2]
  |  |  |  Branch (48:5): [True: 2, False: 0]
  |  |  ------------------
  |  |   49|      2|          : ::fuzztest::internal::Abort(                       \
  |  |   50|      0|                __FILE__, __LINE__,                            \
  |  |   51|      0|                absl::StrCat("Internal error! Check (", #cond, \
  |  |   52|      0|                             ") failed: ", __VA_ARGS__)))
  ------------------
  295|      2|      !fuzzing_time_limit || !replay_corpus_time_limit,
  296|      2|      "Cannot run in fuzzing and corpus replay mode at the same time.");
  297|      2|  const auto test_to_fuzz = absl::GetFlag(FUZZTEST_FLAG(fuzz));
  ------------------
  |  |   20|      2|#define FUZZTEST_FLAG(name) FLAGS_##name
  ------------------
  298|      2|  const auto test_to_replay_corpus =
  299|      2|      absl::GetFlag(FUZZTEST_FLAG(replay_corpus));
  ------------------
  |  |   20|      2|#define FUZZTEST_FLAG(name) FLAGS_##name
  ------------------
  300|      2|  const auto specified_test =
  301|      2|      test_to_fuzz != kUnspecified ? test_to_fuzz : test_to_replay_corpus;
  ------------------
  |  Branch (301:7): [True: 2, False: 0]
  ------------------
  302|      2|  const bool is_test_specified = specified_test != kUnspecified;
  303|      2|  if (is_test_specified) {
  ------------------
  |  Branch (303:7): [True: 2, False: 0]
  ------------------
  304|      2|    const std::string matching_fuzz_test =
  305|      2|        GetMatchingFuzzTestOrExit(specified_test);
  306|       |    // Delegate the test to GoogleTest.
  307|      2|    GTEST_FLAG_SET(filter, matching_fuzz_test);
  308|      2|  }
  309|       |
  310|      2|  std::string derived_binary_id =
  311|      2|      binary_id.empty() ? std::string(internal::Basename(*argv[0]))
  ------------------
  |  Branch (311:7): [True: 2, False: 0]
  ------------------
  312|      2|                        : std::string(binary_id);
  313|      2|  std::optional<std::string> reproduction_command_template;
  314|      2|  internal::Configuration configuration =
  315|      2|      CreateConfigurationsFromFlags(derived_binary_id);
  316|      2|  configuration.reproduction_command_template = reproduction_command_template;
  317|      2|  internal::RegisterFuzzTestsAsGoogleTests(argc, argv, configuration);
  318|       |
  319|      2|  const bool is_fuzzing_or_replaying =
  320|      2|      (fuzzing_time_limit || replay_corpus_time_limit);
  ------------------
  |  Branch (320:8): [True: 2, False: 0]
  |  Branch (320:30): [True: 0, False: 0]
  ------------------
  321|      2|  if (is_fuzzing_or_replaying && !is_test_specified) {
  ------------------
  |  Branch (321:7): [True: 2, False: 0]
  |  Branch (321:34): [True: 0, False: 2]
  ------------------
  322|      0|    absl::flat_hash_set<std::string> fuzz_tests = {
  323|      0|        configuration.fuzz_tests.begin(), configuration.fuzz_tests.end()};
  324|      0|    std::vector<std::string> non_fuzz_tests;
  325|      0|    for (const auto* test : internal::GetRegisteredTests()) {
  ------------------
  |  Branch (325:27): [True: 0, False: 0]
  ------------------
  326|      0|      const std::string test_name =
  327|      0|          absl::StrCat(test->test_suite_name(), ".", test->name());
  328|      0|      if (!fuzz_tests.contains(test_name)) {
  ------------------
  |  Branch (328:11): [True: 0, False: 0]
  ------------------
  329|      0|        non_fuzz_tests.push_back(test_name);
  330|      0|      }
  331|      0|    }
  332|      0|    if (!non_fuzz_tests.empty()) {
  ------------------
  |  Branch (332:9): [True: 0, False: 0]
  ------------------
  333|       |      // Run only the fuzz tests, and not the unit tests.
  334|       |      // TODO: b/340232436 -- This is needed because we currently rely on a fuzz
  335|       |      // test being the first test to run so that Centipede can get the
  336|       |      // serialized configuration from the binary.
  337|      0|      std::string filter = absl::StrCat(
  338|      0|          GTEST_FLAG_GET(filter),
  339|       |          // When the filter already includes the negative patterns, append to
  340|       |          // the negative patterns.
  341|      0|          absl::StrContains(GTEST_FLAG_GET(filter), '-') ? ":" : "-",
  ------------------
  |  Branch (341:11): [True: 0, False: 0]
  ------------------
  342|      0|          absl::StrJoin(non_fuzz_tests, ":"));
  343|      0|      GTEST_FLAG_SET(filter, filter);
  344|      0|    }
  345|      0|  }
  346|      2|  const bool is_runner_mode = std::getenv("CENTIPEDE_RUNNER_FLAGS") != nullptr;
  347|      2|  const bool is_fuzzing_mode = (is_runner_mode && is_fuzzing_or_replaying) ||
  ------------------
  |  Branch (347:33): [True: 0, False: 2]
  |  Branch (347:51): [True: 0, False: 0]
  ------------------
  348|      2|                               fuzzing_time_limit.has_value();
  ------------------
  |  Branch (348:32): [True: 2, False: 0]
  ------------------
  349|      2|  const RunMode run_mode =
  350|      2|      is_fuzzing_mode ? RunMode::kFuzz : RunMode::kUnitTest;
  ------------------
  |  Branch (350:7): [True: 2, False: 0]
  ------------------
  351|       |  // TODO(b/307513669): Use the Configuration class instead of Runtime.
  352|      2|  internal::Runtime::instance().SetRunMode(run_mode);
  353|      2|}
_ZN8fuzztest14ParseAbslFlagsEiPPc:
  355|      2|void ParseAbslFlags(int argc, char** argv) {
  356|      2|  std::vector<char*> positional_args;
  357|      2|  std::vector<absl::UnrecognizedFlag> unrecognized_flags;
  358|      2|  absl::ParseAbseilFlagsOnly(argc, argv, positional_args, unrecognized_flags);
  359|      2|}
init_fuzztest.cc:_ZNK3$_0clEv:
   49|      2|    .OnUpdate([]() {
   50|      2|      fuzztest::internal::SetFuzzTestListingModeValidatorForGoogleTest(
   51|      2|          absl::GetFlag(FUZZTEST_FLAG(list_fuzz_tests)));
  ------------------
  |  |   20|      2|#define FUZZTEST_FLAG(name) FLAGS_##name
  ------------------
   52|      2|    });
init_fuzztest.cc:_ZN8fuzztest12_GLOBAL__N_129CreateConfigurationsFromFlagsENSt3__117basic_string_viewIcNS1_11char_traitsIcEEEE:
  241|      2|    absl::string_view binary_identifier) {
  242|      2|  bool reproduce_findings_as_separate_tests =
  243|      2|      absl::GetFlag(FUZZTEST_FLAG(reproduce_findings_as_separate_tests));
  ------------------
  |  |   20|      2|#define FUZZTEST_FLAG(name) FLAGS_##name
  ------------------
  244|      2|  std::optional<absl::Duration> fuzzing_time_limit = GetFuzzingTime();
  245|      2|  std::optional<absl::Duration> replay_corpus_time_limit =
  246|      2|      GetReplayCorpusTime();
  247|      2|  absl::Duration time_limit = fuzzing_time_limit ? *fuzzing_time_limit
  ------------------
  |  Branch (247:31): [True: 2, False: 0]
  ------------------
  248|      2|                              : replay_corpus_time_limit
  ------------------
  |  Branch (248:33): [True: 0, False: 0]
  ------------------
  249|      0|                                  ? *replay_corpus_time_limit
  250|      0|                                  : absl::ZeroDuration();
  251|      2|  std::optional<size_t> jobs = absl::GetFlag(FUZZTEST_FLAG(jobs));
  ------------------
  |  |   20|      2|#define FUZZTEST_FLAG(name) FLAGS_##name
  ------------------
  252|      2|  FUZZTEST_INTERNAL_CHECK(!jobs.has_value() || *jobs > 0, "If specified, --",
  ------------------
  |  |   48|      2|  ((cond) ? (void)0                                            \
  |  |  ------------------
  |  |  |  Branch (48:5): [True: 2, False: 0]
  |  |  |  Branch (48:5): [True: 0, False: 0]
  |  |  ------------------
  |  |   49|      2|          : ::fuzztest::internal::Abort(                       \
  |  |   50|      0|                __FILE__, __LINE__,                            \
  |  |   51|      0|                absl::StrCat("Internal error! Check (", #cond, \
  |  |   52|      0|                             ") failed: ", __VA_ARGS__)))
  ------------------
  253|      2|                          FUZZTEST_FLAG(jobs).Name(), " must be positive.");
  254|      2|  return internal::Configuration{
  255|      2|      absl::GetFlag(FUZZTEST_FLAG(corpus_database)),
  ------------------
  |  |   20|      2|#define FUZZTEST_FLAG(name) FLAGS_##name
  ------------------
  256|      2|      /*stats_root=*/"",
  257|      2|      std::string(binary_identifier),
  258|      2|      /*fuzz_tests=*/ListRegisteredTests(),
  259|      2|      /*fuzz_tests_in_current_shard=*/ListRegisteredTests(),
  260|      2|      reproduce_findings_as_separate_tests,
  261|       |      /*only_replay_corpus=*/
  262|      2|      replay_corpus_time_limit.has_value(),
  263|      2|      /*stack_limit=*/absl::GetFlag(FUZZTEST_FLAG(stack_limit_kb)) * 1024,
  ------------------
  |  |   20|      2|#define FUZZTEST_FLAG(name) FLAGS_##name
  ------------------
  264|      2|      /*rss_limit=*/absl::GetFlag(FUZZTEST_FLAG(rss_limit_mb)) * 1024 * 1024,
  ------------------
  |  |   20|      2|#define FUZZTEST_FLAG(name) FLAGS_##name
  ------------------
  265|      2|      absl::GetFlag(FUZZTEST_FLAG(time_limit_per_input)), time_limit,
  ------------------
  |  |   20|      2|#define FUZZTEST_FLAG(name) FLAGS_##name
  ------------------
  266|      2|      absl::GetFlag(FUZZTEST_FLAG(time_budget_type)), jobs.value_or(0)};
  ------------------
  |  |   20|      2|#define FUZZTEST_FLAG(name) FLAGS_##name
  ------------------
  267|      2|}
init_fuzztest.cc:_ZN8fuzztest12_GLOBAL__N_114GetFuzzingTimeEv:
  214|      4|std::optional<absl::Duration> GetFuzzingTime() {
  215|      4|  absl::Duration fuzz_time_limit = absl::GetFlag(FUZZTEST_FLAG(fuzz_for));
  ------------------
  |  |   20|      4|#define FUZZTEST_FLAG(name) FLAGS_##name
  ------------------
  216|      4|  if (fuzz_time_limit <= absl::ZeroDuration()) {
  ------------------
  |  Branch (216:7): [True: 0, False: 4]
  ------------------
  217|      0|    fuzz_time_limit = absl::InfiniteDuration();
  218|      0|  }
  219|      4|  if (absl::GetFlag(FUZZTEST_FLAG(fuzz)) == kUnspecified &&
  ------------------
  |  |   20|      4|#define FUZZTEST_FLAG(name) FLAGS_##name
  ------------------
  |  Branch (219:7): [True: 0, False: 4]
  |  Branch (219:7): [True: 0, False: 4]
  ------------------
  220|      0|      fuzz_time_limit == absl::InfiniteDuration()) {
  ------------------
  |  Branch (220:7): [True: 0, False: 0]
  ------------------
  221|      0|    return std::nullopt;
  222|      0|  }
  223|      4|  return fuzz_time_limit;
  224|      4|}
init_fuzztest.cc:_ZN8fuzztest12_GLOBAL__N_119GetReplayCorpusTimeEv:
  226|      4|std::optional<absl::Duration> GetReplayCorpusTime() {
  227|      4|  absl::Duration replay_corpus_time_limit =
  228|      4|      absl::GetFlag(FUZZTEST_FLAG(replay_corpus_for));
  ------------------
  |  |   20|      4|#define FUZZTEST_FLAG(name) FLAGS_##name
  ------------------
  229|      4|  if (absl::GetFlag(FUZZTEST_FLAG(replay_corpus)) == kUnspecified &&
  ------------------
  |  |   20|      4|#define FUZZTEST_FLAG(name) FLAGS_##name
  ------------------
  |  Branch (229:7): [True: 4, False: 0]
  |  Branch (229:7): [True: 4, False: 0]
  ------------------
  230|      4|      replay_corpus_time_limit <= absl::ZeroDuration()) {
  ------------------
  |  Branch (230:7): [True: 4, False: 0]
  ------------------
  231|      4|    return std::nullopt;
  232|      4|  }
  233|      0|  if (absl::GetFlag(FUZZTEST_FLAG(replay_corpus)) != kUnspecified &&
  ------------------
  |  |   20|      0|#define FUZZTEST_FLAG(name) FLAGS_##name
  ------------------
  |  Branch (233:7): [True: 0, False: 0]
  |  Branch (233:7): [True: 0, False: 0]
  ------------------
  234|      0|      replay_corpus_time_limit <= absl::ZeroDuration()) {
  ------------------
  |  Branch (234:7): [True: 0, False: 0]
  ------------------
  235|      0|    replay_corpus_time_limit = absl::InfiniteDuration();
  236|      0|  }
  237|      0|  return replay_corpus_time_limit;
  238|      4|}
init_fuzztest.cc:_ZZN8fuzztest19ListRegisteredTestsEvENK3$_0clINS_8internal8FuzzTestEEEDaRKT_:
  171|     12|      [&](const auto& test) { result.push_back(test.full_name()); });

_ZNK8fuzztest8internal7AnyBase9has_valueEv:
   46|   266k|  bool has_value() const {
   47|   266k|    FUZZTEST_INTERNAL_CHECK((vtable_ == nullptr) == (value_ == nullptr),
  ------------------
  |  |   48|   266k|  ((cond) ? (void)0                                            \
  |  |  ------------------
  |  |  |  Branch (48:4): [True: 266k, False: 0]
  |  |  ------------------
  |  |   49|   266k|          : ::fuzztest::internal::Abort(                       \
  |  |   50|      0|                __FILE__, __LINE__,                            \
  |  |   51|      0|                absl::StrCat("Internal error! Check (", #cond, \
  |  |   52|      0|                             ") failed: ", __VA_ARGS__)))
  ------------------
   48|   266k|                            "Inconsistent state between value and vtable.");
   49|   266k|    return value_ != nullptr;
   50|   266k|  }
_ZN8fuzztest8internal7AnyBaseC2Ev:
   86|  20.5k|  AnyBase() : vtable_(nullptr), value_(nullptr) {}
_ZN8fuzztest8internal7AnyBaseC2EOS1_:
   89|  41.1k|      : vtable_(std::exchange(other.vtable_, nullptr)),
   90|  41.1k|        value_(std::exchange(other.value_, nullptr)) {}
_ZN8fuzztest8internal7AnyBaseD2Ev:
  100|   102k|  ~AnyBase() { Destroy(); }
_ZN8fuzztest8internal7AnyBase7DestroyEv:
  102|   102k|  void Destroy() {
  103|   102k|    if (has_value()) vtable_->destroy(value_);
  ------------------
  |  Branch (103:9): [True: 61.6k, False: 41.1k]
  ------------------
  104|   102k|  }
_ZN8fuzztest8internal7AnyBase8CopyFromERKS1_:
  106|  20.5k|  void CopyFrom(const AnyBase& other) {
  107|  20.5k|    FUZZTEST_INTERNAL_CHECK(!has_value(), "CopyFrom called on a full object");
  ------------------
  |  |   48|  20.5k|  ((cond) ? (void)0                                            \
  |  |  ------------------
  |  |  |  Branch (48:4): [True: 20.5k, False: 0]
  |  |  ------------------
  |  |   49|  20.5k|          : ::fuzztest::internal::Abort(                       \
  |  |   50|      0|                __FILE__, __LINE__,                            \
  |  |   51|      0|                absl::StrCat("Internal error! Check (", #cond, \
  |  |   52|      0|                             ") failed: ", __VA_ARGS__)))
  ------------------
  108|  20.5k|    if (other.has_value()) {
  ------------------
  |  Branch (108:9): [True: 20.5k, False: 0]
  ------------------
  109|  20.5k|      vtable_ = other.vtable_;
  110|  20.5k|      value_ = vtable_->copy(other.value_);
  111|  20.5k|    }
  112|  20.5k|  }
_ZN8fuzztest8internal11CopyableAnyC2ERKS1_:
  173|  20.5k|  CopyableAny(const CopyableAny& other) { CopyFrom(other); }
_ZN8fuzztest8internal11CopyableAnyC2INSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS4_IJNS4_IJNS0_23ElementOfImplCorpusTypeEiSB_bbbbSB_SB_SB_jEEEjEEEEEEJSE_EEENS3_15in_place_type_tIT_EEDpOT0_:
  169|  20.6k|      : AnyBase(std::in_place, std::true_type{},
  170|  20.6k|                new T(std::forward<U>(args)...)) {}
_ZN8fuzztest8internal7AnyBaseC2INSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS4_IJNS4_IJNS0_23ElementOfImplCorpusTypeEiSB_bbbbSB_SB_SB_jEEEjEEEEEEEENS3_10in_place_tENS3_17integral_constantIbLb1EEEPT_:
   34|  20.6k|  explicit AnyBase(std::in_place_t, std::true_type, T* value) {
   35|  20.6k|    static constexpr VTable kVTable = {type_id<T>, DestroyImpl<T>, CopyImpl<T>};
   36|  20.6k|    vtable_ = &kVTable;
   37|  20.6k|    value_ = const_cast<void*>(static_cast<const void*>(value));
   38|  20.6k|  }
_ZN8fuzztest8internal7AnyBase11DestroyImplINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS4_IJNS4_IJNS0_23ElementOfImplCorpusTypeEiSB_bbbbSB_SB_SB_jEEEjEEEEEEEEvPv:
  122|  41.1k|  static void DestroyImpl(void* p) {
  123|  41.1k|    delete static_cast<T*>(p);
  124|  41.1k|  }
_ZN8fuzztest8internal7AnyBase8CopyImplINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS4_IJNS4_IJNS0_23ElementOfImplCorpusTypeEiSB_bbbbSB_SB_SB_jEEEjEEEEEEEEPvSF_:
  127|  20.5k|  static void* CopyImpl(void* p) {
  128|  20.5k|    return new T(*static_cast<T*>(p));
  129|  20.5k|  }
_ZNK8fuzztest8internal7AnyBase3HasINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS4_IJNS4_IJNS0_23ElementOfImplCorpusTypeEiSB_bbbbSB_SB_SB_jEEEjEEEEEEEEbv:
   53|  41.1k|  bool Has() const {
   54|  41.1k|    return has_value() && vtable_->type_id == type_id<T>;
  ------------------
  |  Branch (54:12): [True: 41.1k, False: 0]
  |  Branch (54:27): [True: 41.1k, False: 0]
  ------------------
   55|  41.1k|  }
_ZNKR8fuzztest8internal7AnyBase5GetAsINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS4_IJNS4_IJNS0_23ElementOfImplCorpusTypeEiSB_bbbbSB_SB_SB_jEEEjEEEEEEEERKT_v:
   65|  41.1k|  const T& GetAs() const& {
   66|  41.1k|    FUZZTEST_INTERNAL_CHECK_PRECONDITION(has_value(), "Object is empty!");
  ------------------
  |  |   42|  41.1k|  ((P) ? (void)0                                     \
  |  |  ------------------
  |  |  |  Branch (42:4): [True: 41.1k, False: 0]
  |  |  ------------------
  |  |   43|  41.1k|       : ::fuzztest::internal::Abort(                \
  |  |   44|      0|             __FILE__, __LINE__,                     \
  |  |   45|      0|             absl::StrCat("Failed precondition (", #P, "): ", __VA_ARGS__)))
  ------------------
   67|  41.1k|    FUZZTEST_INTERNAL_CHECK_PRECONDITION(Has<T>(), "Wrong type!");
  ------------------
  |  |   42|  41.1k|  ((P) ? (void)0                                     \
  |  |  ------------------
  |  |  |  Branch (42:4): [True: 41.1k, False: 0]
  |  |  ------------------
  |  |   43|  41.1k|       : ::fuzztest::internal::Abort(                \
  |  |   44|      0|             __FILE__, __LINE__,                     \
  |  |   45|      0|             absl::StrCat("Failed precondition (", #P, "): ", __VA_ARGS__)))
  ------------------
   68|  41.1k|    return *static_cast<T*>(value_);
   69|  41.1k|  }
_ZN8fuzztest8internal11CopyableAnyC2EOS1_:
  174|  41.1k|  CopyableAny(CopyableAny&& other) = default;
_ZN8fuzztest8internal11MoveOnlyAnyC2INSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEJSG_EEENS3_15in_place_type_tIT_EEDpOT0_:
  150|  20.5k|      : AnyBase(std::in_place, std::false_type{},
  151|  20.5k|                new T(std::forward<U>(args)...)) {}
_ZN8fuzztest8internal7AnyBaseC2INSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEEENS3_10in_place_tENS3_17integral_constantIbLb0EEEPT_:
   40|  20.5k|  explicit AnyBase(std::in_place_t, std::false_type, T* value) {
   41|  20.5k|    static constexpr VTable kVTable = {type_id<T>, DestroyImpl<T>, nullptr};
   42|  20.5k|    vtable_ = &kVTable;
   43|  20.5k|    value_ = const_cast<void*>(static_cast<const void*>(value));
   44|  20.5k|  }
_ZN8fuzztest8internal7AnyBase11DestroyImplINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEEEvPv:
  122|  20.5k|  static void DestroyImpl(void* p) {
  123|  20.5k|    delete static_cast<T*>(p);
  124|  20.5k|  }
_ZNR8fuzztest8internal7AnyBase5GetAsINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEEERT_v:
   58|  20.5k|  T& GetAs() & {
   59|  20.5k|    FUZZTEST_INTERNAL_CHECK_PRECONDITION(has_value(), "Object is empty!");
  ------------------
  |  |   42|  20.5k|  ((P) ? (void)0                                     \
  |  |  ------------------
  |  |  |  Branch (42:4): [True: 20.5k, False: 0]
  |  |  ------------------
  |  |   43|  20.5k|       : ::fuzztest::internal::Abort(                \
  |  |   44|      0|             __FILE__, __LINE__,                     \
  |  |   45|      0|             absl::StrCat("Failed precondition (", #P, "): ", __VA_ARGS__)))
  ------------------
   60|  20.5k|    FUZZTEST_INTERNAL_CHECK_PRECONDITION(Has<T>(), "Wrong type!");
  ------------------
  |  |   42|  20.5k|  ((P) ? (void)0                                     \
  |  |  ------------------
  |  |  |  Branch (42:4): [True: 20.5k, False: 0]
  |  |  ------------------
  |  |   43|  20.5k|       : ::fuzztest::internal::Abort(                \
  |  |   44|      0|             __FILE__, __LINE__,                     \
  |  |   45|      0|             absl::StrCat("Failed precondition (", #P, "): ", __VA_ARGS__)))
  ------------------
   61|  20.5k|    return *static_cast<T*>(value_);
   62|  20.5k|  }
_ZNK8fuzztest8internal7AnyBase3HasINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEEEbv:
   53|  20.5k|  bool Has() const {
   54|  20.5k|    return has_value() && vtable_->type_id == type_id<T>;
  ------------------
  |  Branch (54:12): [True: 20.5k, False: 0]
  |  Branch (54:27): [True: 20.5k, False: 0]
  ------------------
   55|  20.5k|  }

_ZN8fuzztest8internal25SetExternalEngineCallbackEPNS0_22ExternalEngineCallbackE:
   33|      2|void SetExternalEngineCallback(ExternalEngineCallback* callback) {
   34|      2|  external_engine_callback = callback;
   35|      2|}
_ZN8fuzztest8internal25GetExternalEngineCallbackEv:
   37|  23.3k|ExternalEngineCallback* GetExternalEngineCallback() {
   38|  23.3k|  return external_engine_callback;
   39|  23.3k|}
_ZN8fuzztest8internal29FuzzTestExternalEngineAdaptorC2ERKNS0_8FuzzTestENSt3__110unique_ptrINS0_20UntypedFixtureDriverENS5_14default_deleteIS7_EEEE:
   66|      2|    : test_(test), fixture_driver_staging_(std::move(fixture_driver)) {}
_ZN8fuzztest8internal29FuzzTestExternalEngineAdaptor16RunInFuzzingModeEPiPPPcRKNS0_13ConfigurationE:
   74|      2|    int* argc, char*** argv, const Configuration& configuration) {
   75|      2|  FUZZTEST_INTERNAL_CHECK(&LLVMFuzzerRunDriver,
  ------------------
  |  |   48|      2|  ((cond) ? (void)0                                            \
  |  |  ------------------
  |  |  |  Branch (48:4): [True: 2, Folded]
  |  |  ------------------
  |  |   49|      2|          : ::fuzztest::internal::Abort(                       \
  |  |   50|      0|                __FILE__, __LINE__,                            \
  |  |   51|      0|                absl::StrCat("Internal error! Check (", #cond, \
  |  |   52|      0|                             ") failed: ", __VA_ARGS__)))
  ------------------
   76|      2|                          "LibFuzzer Driver API not defined.");
   77|      2|  FUZZTEST_INTERNAL_CHECK(
  ------------------
  |  |   48|      2|  ((cond) ? (void)0                                            \
  |  |  ------------------
  |  |  |  Branch (48:4): [True: 2, False: 0]
  |  |  ------------------
  |  |   49|      2|          : ::fuzztest::internal::Abort(                       \
  |  |   50|      0|                __FILE__, __LINE__,                            \
  |  |   51|      0|                absl::StrCat("Internal error! Check (", #cond, \
  |  |   52|      0|                             ") failed: ", __VA_ARGS__)))
  ------------------
   78|      2|      GetExternalEngineCallback() == nullptr,
   79|      2|      "External engine callback is already set while running a fuzz test.");
   80|      2|  SetExternalEngineCallback(this);
   81|      2|  runtime_.SetRunMode(RunMode::kFuzz);
   82|      2|  auto& impl = GetFuzzerImpl();
   83|      2|  runtime_.EnableReporter(&impl.stats_, [] { return absl::Now(); });
   84|       |
   85|      2|  FUZZTEST_INTERNAL_CHECK(impl.fixture_driver_ != nullptr,
  ------------------
  |  |   48|      2|  ((cond) ? (void)0                                            \
  |  |  ------------------
  |  |  |  Branch (48:4): [True: 2, False: 0]
  |  |  ------------------
  |  |   49|      2|          : ::fuzztest::internal::Abort(                       \
  |  |   50|      0|                __FILE__, __LINE__,                            \
  |  |   51|      0|                absl::StrCat("Internal error! Check (", #cond, \
  |  |   52|      0|                             ") failed: ", __VA_ARGS__)))
  ------------------
   86|      2|                          "Invalid fixture driver!");
   87|      2|  impl.fixture_driver_->SetUpFuzzTest();
   88|       |
   89|      2|  static bool driver_started = false;
   90|      2|  FUZZTEST_INTERNAL_CHECK(!driver_started, "Driver started more than once!");
  ------------------
  |  |   48|      2|  ((cond) ? (void)0                                            \
  |  |  ------------------
  |  |  |  Branch (48:4): [True: 2, False: 0]
  |  |  ------------------
  |  |   49|      2|          : ::fuzztest::internal::Abort(                       \
  |  |   50|      0|                __FILE__, __LINE__,                            \
  |  |   51|      0|                absl::StrCat("Internal error! Check (", #cond, \
  |  |   52|      0|                             ") failed: ", __VA_ARGS__)))
  ------------------
   91|      2|  driver_started = true;
   92|      2|  LLVMFuzzerRunDriver(argc, argv, [](const uint8_t* data, size_t size) -> int {
   93|      2|    GetExternalEngineCallback()->RunOneInputData(
   94|      2|        absl::string_view(reinterpret_cast<const char*>(data), size));
   95|      2|    return 0;
   96|      2|  });
   97|       |
   98|       |  // If we're here, we didn't exit from RunOneInputData(), and hence we didn't
   99|       |  // tear down the fixture.
  100|      2|  FUZZTEST_INTERNAL_CHECK(impl.fixture_driver_ != nullptr,
  ------------------
  |  |   48|      2|  ((cond) ? (void)0                                            \
  |  |  ------------------
  |  |  |  Branch (48:4): [True: 0, False: 2]
  |  |  ------------------
  |  |   49|      2|          : ::fuzztest::internal::Abort(                       \
  |  |   50|      2|                __FILE__, __LINE__,                            \
  |  |   51|      2|                absl::StrCat("Internal error! Check (", #cond, \
  |  |   52|      2|                             ") failed: ", __VA_ARGS__)))
  ------------------
  101|      2|                          "Invalid fixture driver!");
  102|      2|  impl.fixture_driver_->TearDownFuzzTest();
  103|       |
  104|      2|  return 0;
  105|      2|}
_ZN8fuzztest8internal29FuzzTestExternalEngineAdaptor15RunOneInputDataENSt3__117basic_string_viewIcNS2_11char_traitsIcEEEE:
  117|  23.3k|void FuzzTestExternalEngineAdaptor::RunOneInputData(absl::string_view data) {
  118|  23.3k|  auto& impl = GetFuzzerImpl();
  119|  23.3k|  if (impl.ShouldStop()) {
  ------------------
  |  Branch (119:7): [True: 0, False: 23.3k]
  ------------------
  120|      0|    FUZZTEST_INTERNAL_CHECK(impl.fixture_driver_ != nullptr,
  ------------------
  |  |   48|      0|  ((cond) ? (void)0                                            \
  |  |  ------------------
  |  |  |  Branch (48:4): [True: 0, False: 0]
  |  |  ------------------
  |  |   49|      0|          : ::fuzztest::internal::Abort(                       \
  |  |   50|      0|                __FILE__, __LINE__,                            \
  |  |   51|      0|                absl::StrCat("Internal error! Check (", #cond, \
  |  |   52|      0|                             ") failed: ", __VA_ARGS__)))
  ------------------
  121|      0|                            "Invalid fixture driver!");
  122|      0|    impl.fixture_driver_->TearDownFuzzTest();
  123|      0|    runtime_.PrintFinalStatsOnDefaultSink();
  124|       |    // Use _Exit instead of exit so libFuzzer does not treat it as a crash.
  125|      0|    std::_Exit(0);
  126|      0|  }
  127|  23.3k|  runtime_.SetCurrentTest(&impl.test_, nullptr);
  128|  23.3k|  if (data.size() == 0) return;
  ------------------
  |  Branch (128:7): [True: 0, False: 23.3k]
  ------------------
  129|  23.3k|  auto input = impl.TryParse(data);
  130|  23.3k|  if (!input.ok()) return;
  ------------------
  |  Branch (130:7): [True: 2.83k, False: 20.5k]
  ------------------
  131|  20.5k|  impl.RunOneInput({*std::move(input)});
  132|  20.5k|}
_ZN8fuzztest8internal29FuzzTestExternalEngineAdaptor13GetFuzzerImplEv:
  164|  23.3k|FuzzTestExternalEngineAdaptor::GetFuzzerImpl() {
  165|       |  // Postpone the creation to override libFuzzer signal setup.
  166|  23.3k|  if (!fuzzer_impl_) {
  ------------------
  |  Branch (166:7): [True: 2, False: 23.3k]
  ------------------
  167|      2|    fuzzer_impl_ =
  168|      2|        std::make_unique<FuzzerImpl>(test_, std::move(fixture_driver_staging_));
  169|      2|    fixture_driver_staging_ = nullptr;
  170|      2|  }
  171|  23.3k|  return *fuzzer_impl_;
  172|  23.3k|}
compatibility_mode.cc:_ZZN8fuzztest8internal29FuzzTestExternalEngineAdaptor16RunInFuzzingModeEPiPPPcRKNS0_13ConfigurationEENK3$_1clEPKhm:
   92|  23.3k|  LLVMFuzzerRunDriver(argc, argv, [](const uint8_t* data, size_t size) -> int {
   93|  23.3k|    GetExternalEngineCallback()->RunOneInputData(
   94|  23.3k|        absl::string_view(reinterpret_cast<const char*>(data), size));
   95|  23.3k|    return 0;
   96|  23.3k|  });

_ZN8fuzztest8internal14CorpusDatabaseC2ENSt3__117basic_string_viewIcNS2_11char_traitsIcEEEES6_b:
   44|      4|    : corpus_path_for_test_binary_([=] () -> std::string {
   45|      4|        if (database_path.empty()) return "";
   46|      4|        std::string corpus_path_for_test_binary =
   47|      4|            absl::StrCat(database_path, "/", binary_identifier);
   48|      4|        if (!absl::StartsWith(corpus_path_for_test_binary, "/") &&
   49|      4|            std::getenv("TEST_SRCDIR")) {
   50|      4|          corpus_path_for_test_binary = absl::StrCat(
   51|      4|              std::getenv("TEST_SRCDIR"), "/", corpus_path_for_test_binary);
   52|      4|        }
   53|      4|        return corpus_path_for_test_binary;
   54|      4|      }()),
   55|      4|      use_crashing_inputs_(use_crashing_inputs) {}
_ZN8fuzztest8internal14CorpusDatabaseC2ERKNS0_13ConfigurationE:
   58|      4|    : CorpusDatabase(configuration.corpus_database,
   59|      4|                     configuration.binary_identifier,
   60|       |                     /*use_crashing_inputs=*/
   61|      4|                     configuration.reproduce_findings_as_separate_tests) {}
_ZNK8fuzztest8internal14CorpusDatabase22GetCrashingInputsIfAnyENSt3__117basic_string_viewIcNS2_11char_traitsIcEEEE:
   69|      4|    absl::string_view test_name) const {
   70|      4|  if (!use_crashing_inputs_) return {};
  ------------------
  |  Branch (70:7): [True: 4, False: 0]
  ------------------
   71|      0|  return GetInputs(corpus_path_for_test_binary_, test_name, "crashing");
   72|      4|}
corpus_database.cc:_ZZN8fuzztest8internal14CorpusDatabaseC1ENSt3__117basic_string_viewIcNS2_11char_traitsIcEEEES6_bENK3$_0clEv:
   44|      4|    : corpus_path_for_test_binary_([=] () -> std::string {
   45|      4|        if (database_path.empty()) return "";
  ------------------
  |  Branch (45:13): [True: 0, False: 4]
  ------------------
   46|      4|        std::string corpus_path_for_test_binary =
   47|      4|            absl::StrCat(database_path, "/", binary_identifier);
   48|      4|        if (!absl::StartsWith(corpus_path_for_test_binary, "/") &&
  ------------------
  |  Branch (48:13): [True: 4, False: 0]
  ------------------
   49|      4|            std::getenv("TEST_SRCDIR")) {
  ------------------
  |  Branch (49:13): [True: 0, False: 4]
  ------------------
   50|      0|          corpus_path_for_test_binary = absl::StrCat(
   51|      0|              std::getenv("TEST_SRCDIR"), "/", corpus_path_for_test_binary);
   52|      0|        }
   53|      4|        return corpus_path_for_test_binary;
   54|      4|      }()),

_ZN8fuzztest8internal20GetExecutionCoverageEv:
   86|      2|ExecutionCoverage* GetExecutionCoverage() {
   87|      2|  return execution_coverage_instance;
   88|      2|}
_ZN8fuzztest8internal14CorpusCoverageC2Em:
  264|      2|CorpusCoverage::CorpusCoverage(size_t map_size) {
  265|      2|  size_t alignment = alignof(Vector);
  266|       |  // Round up to a multiple of alignment.
  267|      2|  map_size += alignment - 1;
  268|      2|  map_size -= map_size % alignment;
  269|       |  // And allocate an extra step to make sure the alignment logic has the
  270|       |  // necessary space.
  271|      2|  map_size += alignment;
  272|      2|  corpus_map_size_ = map_size;
  273|      2|  corpus_map_ = static_cast<uint8_t*>(std::aligned_alloc(alignment, map_size));
  274|      2|  std::fill(corpus_map_, corpus_map_ + corpus_map_size_, 0);
  275|      2|}

_ZNK8fuzztest8internal15AggregateOfImplINSt3__15tupleIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplIS9_NS0_13ArbitraryImplIcvEEEENSI_IbvEENS0_7MapImplIPFSE_SE_jEJNSM_IPFSE_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISP_EENS0_11InRangeImplIiEENST_ISQ_EESL_SL_SL_SL_NST_ImEESY_NST_IiEENS0_24BitFlagCombinationOfImplIjEEEEES11_EEEEE19ValidateCorpusValueERKNS3_IJS9_bNS3_IJNS3_IJNS0_23ElementOfImplCorpusTypeEiS15_bbbbS15_S15_S15_jEEEjEEEEEE:
  173|  20.6k|  absl::Status ValidateCorpusValue(const corpus_type& corpus_value) const {
  174|  20.6k|    absl::Status result = absl::OkStatus();
  175|  20.6k|    ApplyIndex<sizeof...(Inner)>([&](auto... I) {
  176|  20.6k|      (
  177|  20.6k|          [&] {
  178|  20.6k|            if (!result.ok()) return;
  179|  20.6k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  180|  20.6k|                std::get<I>(corpus_value));
  181|  20.6k|            result = Prefix(s, "Invalid value in aggregate");
  182|  20.6k|          }(),
  183|  20.6k|          ...);
  184|  20.6k|    });
  185|  20.6k|    return result;
  186|  20.6k|  }
_ZZNK8fuzztest8internal15AggregateOfImplINSt3__15tupleIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplIS9_NS0_13ArbitraryImplIcvEEEENSI_IbvEENS0_7MapImplIPFSE_SE_jEJNSM_IPFSE_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISP_EENS0_11InRangeImplIiEENST_ISQ_EESL_SL_SL_SL_NST_ImEESY_NST_IiEENS0_24BitFlagCombinationOfImplIjEEEEES11_EEEEE19ValidateCorpusValueERKNS3_IJS9_bNS3_IJNS3_IJNS0_23ElementOfImplCorpusTypeEiS15_bbbbS15_S15_S15_jEEEjEEEEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENS1F_ImLm1EEENS1F_ImLm2EEEEEEDaS1C_:
  175|  20.6k|    ApplyIndex<sizeof...(Inner)>([&](auto... I) {
  176|  20.6k|      (
  177|  20.6k|          [&] {
  178|  20.6k|            if (!result.ok()) return;
  179|  20.6k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  180|  20.6k|                std::get<I>(corpus_value));
  181|  20.6k|            result = Prefix(s, "Invalid value in aggregate");
  182|  20.6k|          }(),
  183|  20.6k|          ...);
  184|  20.6k|    });
_ZZZNK8fuzztest8internal15AggregateOfImplINSt3__15tupleIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplIS9_NS0_13ArbitraryImplIcvEEEENSI_IbvEENS0_7MapImplIPFSE_SE_jEJNSM_IPFSE_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISP_EENS0_11InRangeImplIiEENST_ISQ_EESL_SL_SL_SL_NST_ImEESY_NST_IiEENS0_24BitFlagCombinationOfImplIjEEEEES11_EEEEE19ValidateCorpusValueERKNS3_IJS9_bNS3_IJNS3_IJNS0_23ElementOfImplCorpusTypeEiS15_bbbbS15_S15_S15_jEEEjEEEEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENS1F_ImLm1EEENS1F_ImLm2EEEEEEDaS1C_ENKUlvE1_clEv:
  177|  20.6k|          [&] {
  178|  20.6k|            if (!result.ok()) return;
  ------------------
  |  Branch (178:17): [True: 0, False: 20.6k]
  ------------------
  179|  20.6k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  180|  20.6k|                std::get<I>(corpus_value));
  181|  20.6k|            result = Prefix(s, "Invalid value in aggregate");
  182|  20.6k|          }(),
_ZZZNK8fuzztest8internal15AggregateOfImplINSt3__15tupleIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplIS9_NS0_13ArbitraryImplIcvEEEENSI_IbvEENS0_7MapImplIPFSE_SE_jEJNSM_IPFSE_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISP_EENS0_11InRangeImplIiEENST_ISQ_EESL_SL_SL_SL_NST_ImEESY_NST_IiEENS0_24BitFlagCombinationOfImplIjEEEEES11_EEEEE19ValidateCorpusValueERKNS3_IJS9_bNS3_IJNS3_IJNS0_23ElementOfImplCorpusTypeEiS15_bbbbS15_S15_S15_jEEEjEEEEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENS1F_ImLm1EEENS1F_ImLm2EEEEEEDaS1C_ENKUlvE0_clEv:
  177|  20.6k|          [&] {
  178|  20.6k|            if (!result.ok()) return;
  ------------------
  |  Branch (178:17): [True: 0, False: 20.6k]
  ------------------
  179|  20.6k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  180|  20.6k|                std::get<I>(corpus_value));
  181|  20.6k|            result = Prefix(s, "Invalid value in aggregate");
  182|  20.6k|          }(),
_ZZZNK8fuzztest8internal15AggregateOfImplINSt3__15tupleIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplIS9_NS0_13ArbitraryImplIcvEEEENSI_IbvEENS0_7MapImplIPFSE_SE_jEJNSM_IPFSE_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISP_EENS0_11InRangeImplIiEENST_ISQ_EESL_SL_SL_SL_NST_ImEESY_NST_IiEENS0_24BitFlagCombinationOfImplIjEEEEES11_EEEEE19ValidateCorpusValueERKNS3_IJS9_bNS3_IJNS3_IJNS0_23ElementOfImplCorpusTypeEiS15_bbbbS15_S15_S15_jEEEjEEEEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENS1F_ImLm1EEENS1F_ImLm2EEEEEEDaS1C_ENKUlvE_clEv:
  177|  20.6k|          [&] {
  178|  20.6k|            if (!result.ok()) return;
  ------------------
  |  Branch (178:17): [True: 0, False: 20.6k]
  ------------------
  179|  20.6k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  180|  20.6k|                std::get<I>(corpus_value));
  181|  20.6k|            result = Prefix(s, "Invalid value in aggregate");
  182|  20.6k|          }(),
_ZNK8fuzztest8internal15AggregateOfImplINSt3__15tupleIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplIS9_NS0_13ArbitraryImplIcvEEEENSI_IbvEENS0_7MapImplIPFSE_SE_jEJNSM_IPFSE_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISP_EENS0_11InRangeImplIiEENST_ISQ_EESL_SL_SL_SL_NST_ImEESY_NST_IiEENS0_24BitFlagCombinationOfImplIjEEEEES11_EEEEE11ParseCorpusERKNS0_8IRObjectE:
  157|  22.1k|  std::optional<corpus_type> ParseCorpus(const IRObject& obj) const {
  158|  22.1k|    if constexpr (has_custom_corpus_type) {
  159|  22.1k|      return ParseWithDomainTuple(inner_, obj);
  160|       |    } else {
  161|       |      return obj.ToCorpus<corpus_type>();
  162|       |    }
  163|  22.1k|  }
_ZNK8fuzztest8internal15AggregateOfImplINSt3__15tupleIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplIS9_NS0_13ArbitraryImplIcvEEEENSI_IbvEENS0_7MapImplIPFSE_SE_jEJNSM_IPFSE_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISP_EENS0_11InRangeImplIiEENST_ISQ_EESL_SL_SL_SL_NST_ImEESY_NST_IiEENS0_24BitFlagCombinationOfImplIjEEEEES11_EEEEE8GetValueERKNS3_IJS9_bNS3_IJNS3_IJNS0_23ElementOfImplCorpusTypeEiS15_bbbbS15_S15_S15_jEEEjEEEEEE:
  115|  20.5k|  value_type GetValue(const corpus_type& value) const {
  116|  20.5k|    if constexpr (has_custom_corpus_type) {
  117|       |      if constexpr (DetectBindableFieldCount<value_type>() ==
  118|  20.5k|                    DetectBraceInitCount<value_type>()) {
  119|  20.5k|        return ApplyIndex<sizeof...(Inner)>([&](auto... I) {
  120|  20.5k|          return T{std::get<I>(inner_).GetValue(std::get<I>(value))...};
  121|  20.5k|        });
  122|       |      } else {
  123|       |        // Right now the only other possibility is that the bindable field count
  124|       |        // is one less than the brace init field count. In that case, that extra
  125|       |        // field is used to initialize an empty base class. We'll need to update
  126|       |        // this if that ever changes.
  127|       |        return ApplyIndex<sizeof...(Inner)>([&](auto... I) {
  128|       |          return T{{}, std::get<I>(inner_).GetValue(std::get<I>(value))...};
  129|       |        });
  130|       |      }
  131|       |    } else {
  132|       |      return value;
  133|       |    }
  134|  20.5k|  }
_ZZNK8fuzztest8internal15AggregateOfImplINSt3__15tupleIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplIS9_NS0_13ArbitraryImplIcvEEEENSI_IbvEENS0_7MapImplIPFSE_SE_jEJNSM_IPFSE_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISP_EENS0_11InRangeImplIiEENST_ISQ_EESL_SL_SL_SL_NST_ImEESY_NST_IiEENS0_24BitFlagCombinationOfImplIjEEEEES11_EEEEE8GetValueERKNS3_IJS9_bNS3_IJNS3_IJNS0_23ElementOfImplCorpusTypeEiS15_bbbbS15_S15_S15_jEEEjEEEEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENS1F_ImLm1EEENS1F_ImLm2EEEEEEDaS1C_:
  119|  20.5k|        return ApplyIndex<sizeof...(Inner)>([&](auto... I) {
  120|  20.5k|          return T{std::get<I>(inner_).GetValue(std::get<I>(value))...};
  121|  20.5k|        });
_ZN8fuzztest8internal15AggregateOfImplINSt3__15tupleIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplIS9_NS0_13ArbitraryImplIcvEEEENSI_IbvEENS0_7MapImplIPFSE_SE_jEJNSM_IPFSE_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISP_EENS0_11InRangeImplIiEENST_ISQ_EESL_SL_SL_SL_NST_ImEESY_NST_IiEENS0_24BitFlagCombinationOfImplIjEEEEES11_EEEEEC2ENS2_10in_place_tESK_SL_S13_:
   58|      4|      : inner_(std::move(inner)...) {}

_ZNK8fuzztest8internal13ArbitraryImplIbvE19ValidateCorpusValueERKb:
  110|   102k|  absl::Status ValidateCorpusValue(const value_type&) const {
  111|   102k|    return absl::OkStatus();  // Nothing to validate.
  112|   102k|  }
_ZNK8fuzztest8internal13ArbitraryImplIcvE19ValidateCorpusValueERKc:
  188|  55.1M|  absl::Status ValidateCorpusValue(const value_type&) const {
  189|  55.1M|    return absl::OkStatus();  // Nothing to validate.
  190|  55.1M|  }

_ZNK8fuzztest8internal24BitFlagCombinationOfImplIjE19ValidateCorpusValueERKj:
   70|  41.0k|  absl::Status ValidateCorpusValue(const value_type& val) const {
   71|  41.0k|    if (BitOr(val, all_flags_combo_) != all_flags_combo_) {
  ------------------
  |  Branch (71:9): [True: 16, False: 41.0k]
  ------------------
   72|     16|      return absl::InvalidArgumentError("Invalid bit flag combination.");
   73|     16|    }
   74|  41.0k|    return absl::OkStatus();
   75|  41.0k|  }
_ZN8fuzztest8internal24BitFlagCombinationOfImplIjE6BitAndIjEEjT_S4_:
   81|     16|  static value_type BitAnd(U a, U b) {
   82|       |    if constexpr (std::is_enum_v<U>) {
   83|       |      return BitAnd(static_cast<std::underlying_type_t<U>>(a),
   84|       |                    static_cast<std::underlying_type_t<U>>(b));
   85|     16|    } else {
   86|     16|      return static_cast<value_type>(a & b);
   87|     16|    }
   88|     16|  }
_ZN8fuzztest8internal24BitFlagCombinationOfImplIjE5BitOrIjEEjT_S4_:
   91|  41.0k|  static value_type BitOr(U a, U b) {
   92|       |    if constexpr (std::is_enum_v<U>) {
   93|       |      return BitOr(static_cast<std::underlying_type_t<U>>(a),
   94|       |                   static_cast<std::underlying_type_t<U>>(b));
   95|  41.0k|    } else {
   96|  41.0k|      return static_cast<value_type>(a | b);
   97|  41.0k|    }
   98|  41.0k|  }
_ZN8fuzztest8internal24BitFlagCombinationOfImplIjEC2EN4absl12lts_202401164SpanIKjEE:
   37|      8|      : flags_(flags.begin(), flags.end()), all_flags_combo_{} {
   38|      8|    FUZZTEST_INTERNAL_CHECK_PRECONDITION(
  ------------------
  |  |   42|      8|  ((P) ? (void)0                                     \
  |  |  ------------------
  |  |  |  Branch (42:4): [True: 8, False: 0]
  |  |  ------------------
  |  |   43|      8|       : ::fuzztest::internal::Abort(                \
  |  |   44|      0|             __FILE__, __LINE__,                     \
  |  |   45|      0|             absl::StrCat("Failed precondition (", #P, "): ", __VA_ARGS__)))
  ------------------
   39|      8|        !flags.empty(), "BitFlagCombinationOf requires a non empty list.");
   40|       |    // Make sure they are mutually exclusive metadata, only_shrink and none are
   41|       |    // empty.
   42|     28|    for (int i = 0; i < flags.size(); ++i) {
  ------------------
  |  Branch (42:21): [True: 20, False: 8]
  ------------------
   43|     20|      T v1 = flags[i];
   44|     20|      FUZZTEST_INTERNAL_CHECK_PRECONDITION(
  ------------------
  |  |   42|     20|  ((P) ? (void)0                                     \
  |  |  ------------------
  |  |  |  Branch (42:4): [True: 20, False: 0]
  |  |  ------------------
  |  |   43|     20|       : ::fuzztest::internal::Abort(                \
  |  |   44|      0|             __FILE__, __LINE__,                     \
  |  |   45|      0|             absl::StrCat("Failed precondition (", #P, "): ", __VA_ARGS__)))
  ------------------
   45|     20|          v1 != T{}, "BitFlagCombinationOf requires non zero flags.");
   46|     36|      for (int j = i + 1; j < flags.size(); ++j) {
  ------------------
  |  Branch (46:27): [True: 16, False: 20]
  ------------------
   47|     16|        T v2 = flags[j];
   48|     16|        FUZZTEST_INTERNAL_CHECK_PRECONDITION(
  ------------------
  |  |   42|     16|  ((P) ? (void)0                                     \
  |  |  ------------------
  |  |  |  Branch (42:4): [True: 16, False: 0]
  |  |  ------------------
  |  |   43|     16|       : ::fuzztest::internal::Abort(                \
  |  |   44|      0|             __FILE__, __LINE__,                     \
  |  |   45|      0|             absl::StrCat("Failed precondition (", #P, "): ", __VA_ARGS__)))
  ------------------
   49|     16|            BitAnd(v1, v2) == T{},
   50|     16|            "BitFlagCombinationOf requires flags to be mutually exclusive.");
   51|     16|      }
   52|     20|      all_flags_combo_ = BitOr(all_flags_combo_, v1);
   53|     20|    }
   54|      8|  }

_ZNK8fuzztest8internal19ContainerOfImplBaseINS0_23SequenceContainerOfImplINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS0_13ArbitraryImplIcvEEEES9_SB_E8min_sizeEv:
  371|  20.8k|  size_t min_size() const { return min_size_; }
_ZNK8fuzztest8internal19ContainerOfImplBaseINS0_23SequenceContainerOfImplINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS0_13ArbitraryImplIcvEEEES9_SB_E8max_sizeEv:
  372|  20.8k|  size_t max_size() const {
  373|  20.8k|    return max_size_.value_or(std::max(min_size_, kDefaultContainerMaxSize));
  374|  20.8k|  }
_ZN8fuzztest8internal19ContainerOfImplBaseINS0_23SequenceContainerOfImplINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS0_13ArbitraryImplIcvEEEES9_SB_E4SelfEv:
  377|      4|  Derived& Self() { return static_cast<Derived&>(*this); }
_ZNK8fuzztest8internal19ContainerOfImplBaseINS0_23SequenceContainerOfImplINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS0_13ArbitraryImplIcvEEEES9_SB_E11ParseCorpusERKNS0_8IRObjectE:
  283|  20.6k|  std::optional<corpus_type> ParseCorpus(const IRObject& obj) const {
  284|       |    // Use the generic serializer when no custom corpus type is used, since it
  285|       |    // is more efficient. Eg a string value can be serialized as a string
  286|       |    // instead of as a sequence of char values.
  287|       |    if constexpr (has_custom_corpus_type) {
  288|       |      auto subs = obj.Subs();
  289|       |      if (!subs) return std::nullopt;
  290|       |      corpus_type res;
  291|       |      for (const auto& elem : *subs) {
  292|       |        if (auto parsed_elem = inner_.ParseCorpus(elem)) {
  293|       |          res.insert(res.end(), std::move(*parsed_elem));
  294|       |        } else {
  295|       |          return std::nullopt;
  296|       |        }
  297|       |      }
  298|       |      return res;
  299|  20.6k|    } else {
  300|  20.6k|      return obj.ToCorpus<corpus_type>();
  301|  20.6k|    }
  302|  20.6k|  }
_ZNK8fuzztest8internal19ContainerOfImplBaseINS0_23SequenceContainerOfImplINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS0_13ArbitraryImplIcvEEEES9_SB_E19ValidateCorpusValueERKS9_:
  317|  20.8k|  absl::Status ValidateCorpusValue(const corpus_type& corpus_value) const {
  318|       |    // Check size.
  319|  20.8k|    if (corpus_value.size() < min_size()) {
  ------------------
  |  Branch (319:9): [True: 0, False: 20.8k]
  ------------------
  320|      0|      return absl::InvalidArgumentError(absl::StrCat(
  321|      0|          "Invalid size: ", corpus_value.size(), ". Min size: ", min_size()));
  322|      0|    }
  323|  20.8k|    if (corpus_value.size() > max_size()) {
  ------------------
  |  Branch (323:9): [True: 0, False: 20.8k]
  ------------------
  324|      0|      return absl::InvalidArgumentError(absl::StrCat(
  325|      0|          "Invalid size: ", corpus_value.size(), ". Max size: ", max_size()));
  326|      0|    }
  327|       |    // Check elements.
  328|  20.8k|    int i = 0;
  329|  55.1M|    for (const auto& elem : corpus_value) {
  ------------------
  |  Branch (329:27): [True: 55.1M, False: 20.8k]
  ------------------
  330|  55.1M|      const absl::Status s = inner_.ValidateCorpusValue(elem);
  331|  55.1M|      if (!s.ok()) {
  ------------------
  |  Branch (331:11): [True: 0, False: 55.1M]
  ------------------
  332|      0|        return Prefix(s,
  333|      0|                      absl::StrCat("Invalid value in container at index ", i));
  334|      0|      }
  335|  55.1M|      i++;
  336|  55.1M|    }
  337|  20.8k|    return absl::OkStatus();
  338|  20.8k|  }
_ZNK8fuzztest8internal19ContainerOfImplBaseINS0_23SequenceContainerOfImplINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS0_13ArbitraryImplIcvEEEES9_SB_E8GetValueERKS9_:
  257|  20.5k|  value_type GetValue(const corpus_type& value) const {
  258|       |    if constexpr (has_custom_corpus_type) {
  259|       |      value_type result;
  260|       |      for (const auto& v : value) {
  261|       |        result.insert(result.end(), inner_.GetValue(v));
  262|       |      }
  263|       |      return result;
  264|  20.5k|    } else {
  265|  20.5k|      return value;
  266|  20.5k|    }
  267|  20.5k|  }
_ZNK8fuzztest8internal19ContainerOfImplBaseINS0_23SequenceContainerOfImplINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS0_13ArbitraryImplIcvEEEES9_SB_E9FromValueERKS9_:
  269|    256|  std::optional<corpus_type> FromValue(const value_type& value) const {
  270|    256|    if constexpr (!has_custom_corpus_type) {
  271|    256|      return value;
  272|       |    } else {
  273|       |      corpus_type copus_value;
  274|       |      for (const auto& elem : value) {
  275|       |        auto inner_value = inner_.FromValue(elem);
  276|       |        if (!inner_value) return std::nullopt;
  277|       |        copus_value.push_back(*std::move(inner_value));
  278|       |      }
  279|       |      return copus_value;
  280|       |    }
  281|    256|  }
_ZN8fuzztest8internal19ContainerOfImplBaseINS0_23SequenceContainerOfImplINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS0_13ArbitraryImplIcvEEEES9_SB_E11WithMaxSizeEm:
  222|      4|  Derived& WithMaxSize(size_t s) {
  223|      4|    FUZZTEST_INTERNAL_CHECK_PRECONDITION(
  ------------------
  |  |   42|      4|  ((P) ? (void)0                                     \
  |  |  ------------------
  |  |  |  Branch (42:4): [True: 4, False: 0]
  |  |  ------------------
  |  |   43|      4|       : ::fuzztest::internal::Abort(                \
  |  |   44|      0|             __FILE__, __LINE__,                     \
  |  |   45|      0|             absl::StrCat("Failed precondition (", #P, "): ", __VA_ARGS__)))
  ------------------
  224|      4|        min_size_ <= s, "Maximal size ", s,
  225|      4|        " cannot be smaller than minimal size ", min_size_);
  226|      4|    max_size_ = s;
  227|      4|    return Self();
  228|      4|  }
_ZN8fuzztest8internal27SequenceContainerOfImplBaseINS0_23SequenceContainerOfImplINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS0_13ArbitraryImplIcvEEEES9_SB_EC2Ev:
  536|      4|  SequenceContainerOfImplBase() = default;
_ZN8fuzztest8internal19ContainerOfImplBaseINS0_23SequenceContainerOfImplINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS0_13ArbitraryImplIcvEEEES9_SB_EC2Ev:
  109|      4|  ContainerOfImplBase() = default;

_ZNK8fuzztest13UntypedDomain8GetValueERKNS_8internal11CopyableAnyE:
  330|  20.5k|  value_type GetValue(const corpus_type& corpus_value) const {
  331|  20.5k|    return inner_->UntypedGetValue(corpus_value);
  332|  20.5k|  }
_ZNK8fuzztest13UntypedDomain11ParseCorpusERKNS_8internal8IRObjectE:
  334|  22.1k|  std::optional<corpus_type> ParseCorpus(const internal::IRObject& obj) const {
  335|  22.1k|    return inner_->UntypedParseCorpus(obj);
  336|  22.1k|  }
_ZNK8fuzztest13UntypedDomain19ValidateCorpusValueERKNS_8internal11CopyableAnyE:
  342|  20.6k|  absl::Status ValidateCorpusValue(const corpus_type& corpus_value) const {
  343|  20.6k|    return inner_->UntypedValidateCorpusValue(corpus_value);
  344|  20.6k|  }
_ZN8fuzztest6DomainINSt3__15tupleIJNS1_12basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEEbNS1_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEEC2INS_8internal15AggregateOfImplISE_LNSH_23RequireCustomCorpusTypeE0EJNSH_23SequenceContainerOfImplIS8_NSH_13ArbitraryImplIcvEEEENSL_IbvEENSH_7MapImplIPFSD_SD_jEJNSP_IPFSD_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNSH_13ElementOfImplISS_EENSH_11InRangeImplIiEENSW_IST_EESO_SO_SO_SO_NSW_ImEES11_NSW_IiEENSH_24BitFlagCombinationOfImplIjEEEEES14_EEEEEENS2_IJS8_bNS2_IJNS2_IJNSH_23ElementOfImplCorpusTypeEiS18_bbbbS18_S18_S18_jEEEjEEEEEEEEONS_18domain_implementor10DomainBaseIT_SE_T0_EE:
  109|      4|      : inner_(std::make_unique<internal::DomainModel<Inner>>(
  110|      4|            static_cast<Inner&&>(inner))) {}
_ZN8fuzztest6DomainINSt3__15tupleIJNS1_12basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEEbNS1_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEEC2ERKSF_:
  112|     18|  Domain(const Domain& other) { *this = other; }
_ZN8fuzztest6DomainINSt3__15tupleIJNS1_12basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEEbNS1_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEEaSERKSF_:
  113|     18|  Domain& operator=(const Domain& other) {
  114|     18|    inner_ = other.inner_->TypedClone();
  115|     18|    return *this;
  116|     18|  }
_ZN8fuzztest13UntypedDomainC2INSt3__15tupleIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEEERKNS_6DomainIT_EE:
  315|      2|      : inner_(domain.inner_->UntypedClone()) {}

_ZN8fuzztest18domain_implementor10DomainBaseINS_8internal23SequenceContainerOfImplINSt3__112basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEENS2_13ArbitraryImplIcvEEEESA_SA_E7derivedEv:
  252|    516|  Derived& derived() { return static_cast<Derived&>(*this); }
_ZNK8fuzztest18domain_implementor10DomainBaseINS_8internal13ArbitraryImplIbvEEbbE11ParseCorpusERKNS2_8IRObjectE:
  159|   103k|  std::optional<CorpusType> ParseCorpus(const internal::IRObject& obj) const {
  160|   103k|    static_assert(!has_custom_corpus_type);
  161|   103k|    return obj.ToCorpus<CorpusType>();
  162|   103k|  }
_ZNK8fuzztest18domain_implementor10DomainBaseINS_8internal13ArbitraryImplIbvEEbbE8GetValueERKb:
  150|   102k|  ValueType GetValue(const ValueType& v) const {
  151|   102k|    static_assert(!has_custom_corpus_type);
  152|   102k|    return v;
  153|   102k|  }
_ZNK8fuzztest18domain_implementor10DomainBaseINS_8internal11InRangeImplIiEEiiE8GetValueERKi:
  150|  20.5k|  ValueType GetValue(const ValueType& v) const {
  151|  20.5k|    static_assert(!has_custom_corpus_type);
  152|  20.5k|    return v;
  153|  20.5k|  }
_ZNK8fuzztest18domain_implementor10DomainBaseINS_8internal11InRangeImplIiEEiiE11ParseCorpusERKNS2_8IRObjectE:
  159|  20.6k|  std::optional<CorpusType> ParseCorpus(const internal::IRObject& obj) const {
  160|  20.6k|    static_assert(!has_custom_corpus_type);
  161|  20.6k|    return obj.ToCorpus<CorpusType>();
  162|  20.6k|  }
_ZNK8fuzztest18domain_implementor10DomainBaseINS_8internal24BitFlagCombinationOfImplIjEEjjE11ParseCorpusERKNS2_8IRObjectE:
  159|  41.2k|  std::optional<CorpusType> ParseCorpus(const internal::IRObject& obj) const {
  160|  41.2k|    static_assert(!has_custom_corpus_type);
  161|  41.2k|    return obj.ToCorpus<CorpusType>();
  162|  41.2k|  }
_ZNK8fuzztest18domain_implementor10DomainBaseINS_8internal24BitFlagCombinationOfImplIjEEjjE8GetValueERKj:
  150|  41.0k|  ValueType GetValue(const ValueType& v) const {
  151|  41.0k|    static_assert(!has_custom_corpus_type);
  152|  41.0k|    return v;
  153|  41.0k|  }
_ZN8fuzztest18domain_implementor10DomainBaseINS_8internal23SequenceContainerOfImplINSt3__112basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEENS2_13ArbitraryImplIcvEEEESA_SA_E9WithSeedsERKNS4_6vectorISA_NS8_ISA_EEEE:
  185|      4|  WithSeeds(const std::vector<ValueType>& seeds) {
  186|      4|    seeds_.reserve(seeds_.size() + seeds.size());
  187|    256|    for (const ValueType& seed : seeds) {
  ------------------
  |  Branch (187:32): [True: 256, False: 4]
  ------------------
  188|    256|      std::optional<CorpusType> corpus_value = derived().FromValue(seed);
  189|    256|      if (!corpus_value.has_value()) {
  ------------------
  |  Branch (189:11): [True: 0, False: 256]
  ------------------
  190|      0|        ReportBadSeedAndExit(
  191|      0|            seed,
  192|      0|            absl::InvalidArgumentError(
  193|      0|                "Seed could not be converted to the internal corpus value"));
  194|      0|      }
  195|       |
  196|    256|      absl::Status valid = derived().ValidateCorpusValue(*corpus_value);
  197|    256|      if (!valid.ok()) ReportBadSeedAndExit(seed, valid);
  ------------------
  |  Branch (197:11): [True: 0, False: 256]
  ------------------
  198|       |
  199|    256|      seeds_.push_back(*std::move(corpus_value));
  200|    256|    }
  201|      4|    return derived();
  202|      4|  }
_ZN8fuzztest18domain_implementor10DomainBaseINS_8internal15AggregateOfImplINSt3__15tupleIJNS4_12basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEEbNS4_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS2_23RequireCustomCorpusTypeE0EJNS2_23SequenceContainerOfImplISB_NS2_13ArbitraryImplIcvEEEENSK_IbvEENS2_7MapImplIPFSG_SG_jEJNSO_IPFSG_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS2_13ElementOfImplISR_EENS2_11InRangeImplIiEENSV_ISS_EESN_SN_SN_SN_NSV_ImEES10_NSV_IiEENS2_24BitFlagCombinationOfImplIjEEEEES13_EEEEEESH_NS5_IJSB_bNS5_IJNS5_IJNS2_23ElementOfImplCorpusTypeEiS17_bbbbS17_S17_S17_jEEEjEEEEEEEC2Ev:
  115|      4|  DomainBase() {
  116|       |    // Check that the interface of `Derived` matches the requirements for a
  117|       |    // domain implementation. We check these inside the constructor of
  118|       |    // `DomainBase`, where `Derived` is already fully defined. If we try to
  119|       |    // check them at class scope we would see an incomplete `Derived` class and
  120|       |    // the checks would not work.
  121|      4|    fuzztest::internal::CheckIsSame<
  122|      4|        ValueType, fuzztest::internal::value_type_t<Derived>>();
  123|      4|    fuzztest::internal::CheckIsSame<
  124|      4|        CorpusType, fuzztest::internal::corpus_type_t<Derived>>();
  125|      4|    static_assert(has_custom_corpus_type == Derived::has_custom_corpus_type);
  126|       |
  127|       |    // Check that `Derived` type-checks against the type-erased `Domain<T>`
  128|       |    // interface by forcing the `DomainModel` template specialization.
  129|      4|    if (Derived* domain = nullptr) {
  ------------------
  |  Branch (129:18): [True: 0, False: 4]
  ------------------
  130|      0|      (void)fuzztest::internal::DomainModel<Derived>{*domain};
  131|      0|    }
  132|      4|  }
_ZN8fuzztest18domain_implementor10DomainBaseINS_8internal23SequenceContainerOfImplINSt3__112basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEENS2_13ArbitraryImplIcvEEEESA_SA_EC2Ev:
  115|      4|  DomainBase() {
  116|       |    // Check that the interface of `Derived` matches the requirements for a
  117|       |    // domain implementation. We check these inside the constructor of
  118|       |    // `DomainBase`, where `Derived` is already fully defined. If we try to
  119|       |    // check them at class scope we would see an incomplete `Derived` class and
  120|       |    // the checks would not work.
  121|      4|    fuzztest::internal::CheckIsSame<
  122|      4|        ValueType, fuzztest::internal::value_type_t<Derived>>();
  123|      4|    fuzztest::internal::CheckIsSame<
  124|      4|        CorpusType, fuzztest::internal::corpus_type_t<Derived>>();
  125|      4|    static_assert(has_custom_corpus_type == Derived::has_custom_corpus_type);
  126|       |
  127|       |    // Check that `Derived` type-checks against the type-erased `Domain<T>`
  128|       |    // interface by forcing the `DomainModel` template specialization.
  129|      4|    if (Derived* domain = nullptr) {
  ------------------
  |  Branch (129:18): [True: 0, False: 4]
  ------------------
  130|      0|      (void)fuzztest::internal::DomainModel<Derived>{*domain};
  131|      0|    }
  132|      4|  }
_ZN8fuzztest18domain_implementor10DomainBaseINS_8internal13ArbitraryImplIcvEEccEC2Ev:
  115|      4|  DomainBase() {
  116|       |    // Check that the interface of `Derived` matches the requirements for a
  117|       |    // domain implementation. We check these inside the constructor of
  118|       |    // `DomainBase`, where `Derived` is already fully defined. If we try to
  119|       |    // check them at class scope we would see an incomplete `Derived` class and
  120|       |    // the checks would not work.
  121|      4|    fuzztest::internal::CheckIsSame<
  122|      4|        ValueType, fuzztest::internal::value_type_t<Derived>>();
  123|      4|    fuzztest::internal::CheckIsSame<
  124|      4|        CorpusType, fuzztest::internal::corpus_type_t<Derived>>();
  125|      4|    static_assert(has_custom_corpus_type == Derived::has_custom_corpus_type);
  126|       |
  127|       |    // Check that `Derived` type-checks against the type-erased `Domain<T>`
  128|       |    // interface by forcing the `DomainModel` template specialization.
  129|      4|    if (Derived* domain = nullptr) {
  ------------------
  |  Branch (129:18): [True: 0, False: 4]
  ------------------
  130|      0|      (void)fuzztest::internal::DomainModel<Derived>{*domain};
  131|      0|    }
  132|      4|  }
_ZN8fuzztest18domain_implementor10DomainBaseINS_8internal13ArbitraryImplIbvEEbbEC2Ev:
  115|     20|  DomainBase() {
  116|       |    // Check that the interface of `Derived` matches the requirements for a
  117|       |    // domain implementation. We check these inside the constructor of
  118|       |    // `DomainBase`, where `Derived` is already fully defined. If we try to
  119|       |    // check them at class scope we would see an incomplete `Derived` class and
  120|       |    // the checks would not work.
  121|     20|    fuzztest::internal::CheckIsSame<
  122|     20|        ValueType, fuzztest::internal::value_type_t<Derived>>();
  123|     20|    fuzztest::internal::CheckIsSame<
  124|     20|        CorpusType, fuzztest::internal::corpus_type_t<Derived>>();
  125|     20|    static_assert(has_custom_corpus_type == Derived::has_custom_corpus_type);
  126|       |
  127|       |    // Check that `Derived` type-checks against the type-erased `Domain<T>`
  128|       |    // interface by forcing the `DomainModel` template specialization.
  129|     20|    if (Derived* domain = nullptr) {
  ------------------
  |  Branch (129:18): [True: 0, False: 20]
  ------------------
  130|      0|      (void)fuzztest::internal::DomainModel<Derived>{*domain};
  131|      0|    }
  132|     20|  }
_ZN8fuzztest18domain_implementor10DomainBaseINS_8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEES9_jEJNS3_IPFS9_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS2_13ElementOfImplISC_EENS2_11InRangeImplIiEENSG_ISD_EENS2_13ArbitraryImplIbvEESM_SM_SM_NSG_ImEESN_NSG_IiEENS2_24BitFlagCombinationOfImplIjEEEEESQ_EEES9_NS4_5tupleIJNST_IJNS2_23ElementOfImplCorpusTypeEiSU_bbbbSU_SU_SU_jEEEjEEEEC2Ev:
  115|      4|  DomainBase() {
  116|       |    // Check that the interface of `Derived` matches the requirements for a
  117|       |    // domain implementation. We check these inside the constructor of
  118|       |    // `DomainBase`, where `Derived` is already fully defined. If we try to
  119|       |    // check them at class scope we would see an incomplete `Derived` class and
  120|       |    // the checks would not work.
  121|      4|    fuzztest::internal::CheckIsSame<
  122|      4|        ValueType, fuzztest::internal::value_type_t<Derived>>();
  123|      4|    fuzztest::internal::CheckIsSame<
  124|      4|        CorpusType, fuzztest::internal::corpus_type_t<Derived>>();
  125|      4|    static_assert(has_custom_corpus_type == Derived::has_custom_corpus_type);
  126|       |
  127|       |    // Check that `Derived` type-checks against the type-erased `Domain<T>`
  128|       |    // interface by forcing the `DomainModel` template specialization.
  129|      4|    if (Derived* domain = nullptr) {
  ------------------
  |  Branch (129:18): [True: 0, False: 4]
  ------------------
  130|      0|      (void)fuzztest::internal::DomainModel<Derived>{*domain};
  131|      0|    }
  132|      4|  }
_ZN8fuzztest18domain_implementor10DomainBaseINS_8internal11InRangeImplIiEEiiEC2Ev:
  115|      4|  DomainBase() {
  116|       |    // Check that the interface of `Derived` matches the requirements for a
  117|       |    // domain implementation. We check these inside the constructor of
  118|       |    // `DomainBase`, where `Derived` is already fully defined. If we try to
  119|       |    // check them at class scope we would see an incomplete `Derived` class and
  120|       |    // the checks would not work.
  121|      4|    fuzztest::internal::CheckIsSame<
  122|      4|        ValueType, fuzztest::internal::value_type_t<Derived>>();
  123|      4|    fuzztest::internal::CheckIsSame<
  124|      4|        CorpusType, fuzztest::internal::corpus_type_t<Derived>>();
  125|      4|    static_assert(has_custom_corpus_type == Derived::has_custom_corpus_type);
  126|       |
  127|       |    // Check that `Derived` type-checks against the type-erased `Domain<T>`
  128|       |    // interface by forcing the `DomainModel` template specialization.
  129|      4|    if (Derived* domain = nullptr) {
  ------------------
  |  Branch (129:18): [True: 0, False: 4]
  ------------------
  130|      0|      (void)fuzztest::internal::DomainModel<Derived>{*domain};
  131|      0|    }
  132|      4|  }
_ZN8fuzztest18domain_implementor10DomainBaseINS_8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS2_13ElementOfImplISA_EENS2_11InRangeImplIiEENSE_ISB_EENS2_13ArbitraryImplIbvEESK_SK_SK_NSE_ImEESL_NSE_IiEENS2_24BitFlagCombinationOfImplIjEEEEES9_NS4_5tupleIJNS2_23ElementOfImplCorpusTypeEiSR_bbbbSR_SR_SR_jEEEEC2Ev:
  115|      4|  DomainBase() {
  116|       |    // Check that the interface of `Derived` matches the requirements for a
  117|       |    // domain implementation. We check these inside the constructor of
  118|       |    // `DomainBase`, where `Derived` is already fully defined. If we try to
  119|       |    // check them at class scope we would see an incomplete `Derived` class and
  120|       |    // the checks would not work.
  121|      4|    fuzztest::internal::CheckIsSame<
  122|      4|        ValueType, fuzztest::internal::value_type_t<Derived>>();
  123|      4|    fuzztest::internal::CheckIsSame<
  124|      4|        CorpusType, fuzztest::internal::corpus_type_t<Derived>>();
  125|      4|    static_assert(has_custom_corpus_type == Derived::has_custom_corpus_type);
  126|       |
  127|       |    // Check that `Derived` type-checks against the type-erased `Domain<T>`
  128|       |    // interface by forcing the `DomainModel` template specialization.
  129|      4|    if (Derived* domain = nullptr) {
  ------------------
  |  Branch (129:18): [True: 0, False: 4]
  ------------------
  130|      0|      (void)fuzztest::internal::DomainModel<Derived>{*domain};
  131|      0|    }
  132|      4|  }
_ZN8fuzztest18domain_implementor10DomainBaseINS_8internal13ElementOfImplI15avifCodecChoiceEES4_NS2_23ElementOfImplCorpusTypeEEC2Ev:
  115|      4|  DomainBase() {
  116|       |    // Check that the interface of `Derived` matches the requirements for a
  117|       |    // domain implementation. We check these inside the constructor of
  118|       |    // `DomainBase`, where `Derived` is already fully defined. If we try to
  119|       |    // check them at class scope we would see an incomplete `Derived` class and
  120|       |    // the checks would not work.
  121|      4|    fuzztest::internal::CheckIsSame<
  122|      4|        ValueType, fuzztest::internal::value_type_t<Derived>>();
  123|      4|    fuzztest::internal::CheckIsSame<
  124|      4|        CorpusType, fuzztest::internal::corpus_type_t<Derived>>();
  125|      4|    static_assert(has_custom_corpus_type == Derived::has_custom_corpus_type);
  126|       |
  127|       |    // Check that `Derived` type-checks against the type-erased `Domain<T>`
  128|       |    // interface by forcing the `DomainModel` template specialization.
  129|      4|    if (Derived* domain = nullptr) {
  ------------------
  |  Branch (129:18): [True: 0, False: 4]
  ------------------
  130|      0|      (void)fuzztest::internal::DomainModel<Derived>{*domain};
  131|      0|    }
  132|      4|  }
_ZN8fuzztest18domain_implementor10DomainBaseINS_8internal13ElementOfImplI17avifDecoderSourceEES4_NS2_23ElementOfImplCorpusTypeEEC2Ev:
  115|      4|  DomainBase() {
  116|       |    // Check that the interface of `Derived` matches the requirements for a
  117|       |    // domain implementation. We check these inside the constructor of
  118|       |    // `DomainBase`, where `Derived` is already fully defined. If we try to
  119|       |    // check them at class scope we would see an incomplete `Derived` class and
  120|       |    // the checks would not work.
  121|      4|    fuzztest::internal::CheckIsSame<
  122|      4|        ValueType, fuzztest::internal::value_type_t<Derived>>();
  123|      4|    fuzztest::internal::CheckIsSame<
  124|      4|        CorpusType, fuzztest::internal::corpus_type_t<Derived>>();
  125|      4|    static_assert(has_custom_corpus_type == Derived::has_custom_corpus_type);
  126|       |
  127|       |    // Check that `Derived` type-checks against the type-erased `Domain<T>`
  128|       |    // interface by forcing the `DomainModel` template specialization.
  129|      4|    if (Derived* domain = nullptr) {
  ------------------
  |  Branch (129:18): [True: 0, False: 4]
  ------------------
  130|      0|      (void)fuzztest::internal::DomainModel<Derived>{*domain};
  131|      0|    }
  132|      4|  }
_ZN8fuzztest18domain_implementor10DomainBaseINS_8internal13ElementOfImplImEEmNS2_23ElementOfImplCorpusTypeEEC2Ev:
  115|      8|  DomainBase() {
  116|       |    // Check that the interface of `Derived` matches the requirements for a
  117|       |    // domain implementation. We check these inside the constructor of
  118|       |    // `DomainBase`, where `Derived` is already fully defined. If we try to
  119|       |    // check them at class scope we would see an incomplete `Derived` class and
  120|       |    // the checks would not work.
  121|      8|    fuzztest::internal::CheckIsSame<
  122|      8|        ValueType, fuzztest::internal::value_type_t<Derived>>();
  123|      8|    fuzztest::internal::CheckIsSame<
  124|      8|        CorpusType, fuzztest::internal::corpus_type_t<Derived>>();
  125|      8|    static_assert(has_custom_corpus_type == Derived::has_custom_corpus_type);
  126|       |
  127|       |    // Check that `Derived` type-checks against the type-erased `Domain<T>`
  128|       |    // interface by forcing the `DomainModel` template specialization.
  129|      8|    if (Derived* domain = nullptr) {
  ------------------
  |  Branch (129:18): [True: 0, False: 8]
  ------------------
  130|      0|      (void)fuzztest::internal::DomainModel<Derived>{*domain};
  131|      0|    }
  132|      8|  }
_ZN8fuzztest18domain_implementor10DomainBaseINS_8internal13ElementOfImplIiEEiNS2_23ElementOfImplCorpusTypeEEC2Ev:
  115|      4|  DomainBase() {
  116|       |    // Check that the interface of `Derived` matches the requirements for a
  117|       |    // domain implementation. We check these inside the constructor of
  118|       |    // `DomainBase`, where `Derived` is already fully defined. If we try to
  119|       |    // check them at class scope we would see an incomplete `Derived` class and
  120|       |    // the checks would not work.
  121|      4|    fuzztest::internal::CheckIsSame<
  122|      4|        ValueType, fuzztest::internal::value_type_t<Derived>>();
  123|      4|    fuzztest::internal::CheckIsSame<
  124|      4|        CorpusType, fuzztest::internal::corpus_type_t<Derived>>();
  125|      4|    static_assert(has_custom_corpus_type == Derived::has_custom_corpus_type);
  126|       |
  127|       |    // Check that `Derived` type-checks against the type-erased `Domain<T>`
  128|       |    // interface by forcing the `DomainModel` template specialization.
  129|      4|    if (Derived* domain = nullptr) {
  ------------------
  |  Branch (129:18): [True: 0, False: 4]
  ------------------
  130|      0|      (void)fuzztest::internal::DomainModel<Derived>{*domain};
  131|      0|    }
  132|      4|  }
_ZN8fuzztest18domain_implementor10DomainBaseINS_8internal24BitFlagCombinationOfImplIjEEjjEC2Ev:
  115|      8|  DomainBase() {
  116|       |    // Check that the interface of `Derived` matches the requirements for a
  117|       |    // domain implementation. We check these inside the constructor of
  118|       |    // `DomainBase`, where `Derived` is already fully defined. If we try to
  119|       |    // check them at class scope we would see an incomplete `Derived` class and
  120|       |    // the checks would not work.
  121|      8|    fuzztest::internal::CheckIsSame<
  122|      8|        ValueType, fuzztest::internal::value_type_t<Derived>>();
  123|      8|    fuzztest::internal::CheckIsSame<
  124|      8|        CorpusType, fuzztest::internal::corpus_type_t<Derived>>();
  125|      8|    static_assert(has_custom_corpus_type == Derived::has_custom_corpus_type);
  126|       |
  127|       |    // Check that `Derived` type-checks against the type-erased `Domain<T>`
  128|       |    // interface by forcing the `DomainModel` template specialization.
  129|      8|    if (Derived* domain = nullptr) {
  ------------------
  |  Branch (129:18): [True: 0, False: 8]
  ------------------
  130|      0|      (void)fuzztest::internal::DomainModel<Derived>{*domain};
  131|      0|    }
  132|      8|  }

_ZN8fuzztest8internal11DomainModelINS0_15AggregateOfImplINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplISA_NS0_13ArbitraryImplIcvEEEENSJ_IbvEENS0_7MapImplIPFSF_SF_jEJNSN_IPFSF_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISQ_EENS0_11InRangeImplIiEENSU_ISR_EESM_SM_SM_SM_NSU_ImEESZ_NSU_IiEENS0_24BitFlagCombinationOfImplIjEEEEES12_EEEEEEEC2EOS15_:
  130|      4|  explicit DomainModel(D&& domain) : domain_(std::forward<D>(domain)) {}
_ZN8fuzztest8internal20UntypedDomainConceptD2Ev:
   56|     16|  virtual ~UntypedDomainConcept() = default;
_ZNK8fuzztest8internal18TypedDomainConceptINSt3__15tupleIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEE12UntypedCloneEv:
  108|      2|  std::unique_ptr<UntypedDomainConcept> UntypedClone() const final {
  109|      2|    return TypedClone();
  110|      2|  }
_ZNK8fuzztest8internal11DomainModelINS0_15AggregateOfImplINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplISA_NS0_13ArbitraryImplIcvEEEENSJ_IbvEENS0_7MapImplIPFSF_SF_jEJNSN_IPFSF_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISQ_EENS0_11InRangeImplIiEENSU_ISR_EESM_SM_SM_SM_NSU_ImEESZ_NSU_IiEENS0_24BitFlagCombinationOfImplIjEEEEES12_EEEEEEE18UntypedParseCorpusERKNS0_8IRObjectE:
  172|  22.1k|      const IRObject& obj) const final {
  173|  22.1k|    if (auto res = domain_.ParseCorpus(obj)) {
  ------------------
  |  Branch (173:14): [True: 20.6k, False: 1.54k]
  ------------------
  174|  20.6k|      return GenericDomainCorpusType(std::in_place_type<CorpusType>,
  175|  20.6k|                                     *std::move(res));
  176|  20.6k|    } else {
  177|  1.54k|      return std::nullopt;
  178|  1.54k|    }
  179|  22.1k|  }
_ZNK8fuzztest8internal11DomainModelINS0_15AggregateOfImplINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplISA_NS0_13ArbitraryImplIcvEEEENSJ_IbvEENS0_7MapImplIPFSF_SF_jEJNSN_IPFSF_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISQ_EENS0_11InRangeImplIiEENSU_ISR_EESM_SM_SM_SM_NSU_ImEESZ_NSU_IiEENS0_24BitFlagCombinationOfImplIjEEEEES12_EEEEEEE26UntypedValidateCorpusValueERKNS0_11CopyableAnyE:
  187|  20.6k|      const GenericDomainCorpusType& corpus_value) const final {
  188|  20.6k|    return domain_.ValidateCorpusValue(corpus_value.GetAs<CorpusType>());
  189|  20.6k|  }
_ZNK8fuzztest8internal18TypedDomainConceptINSt3__15tupleIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEE15UntypedGetValueERKNS0_11CopyableAnyE:
  113|  20.5k|      const GenericDomainCorpusType& v) const final {
  114|  20.5k|    return GenericDomainValueType(std::in_place_type<ValueType>,
  115|  20.5k|                                  TypedGetValue(v));
  116|  20.5k|  }
_ZNK8fuzztest8internal11DomainModelINS0_15AggregateOfImplINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplISA_NS0_13ArbitraryImplIcvEEEENSJ_IbvEENS0_7MapImplIPFSF_SF_jEJNSN_IPFSF_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISQ_EENS0_11InRangeImplIiEENSU_ISR_EESM_SM_SM_SM_NSU_ImEESZ_NSU_IiEENS0_24BitFlagCombinationOfImplIjEEEEES12_EEEEEEE10TypedCloneEv:
  132|     20|  std::unique_ptr<TypedDomainConcept<ValueType>> TypedClone() const final {
  133|     20|    return std::make_unique<DomainModel>(*this);
  134|     20|  }
_ZNK8fuzztest8internal11DomainModelINS0_15AggregateOfImplINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplISA_NS0_13ArbitraryImplIcvEEEENSJ_IbvEENS0_7MapImplIPFSF_SF_jEJNSN_IPFSF_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISQ_EENS0_11InRangeImplIiEENSU_ISR_EESM_SM_SM_SM_NSU_ImEESZ_NSU_IiEENS0_24BitFlagCombinationOfImplIjEEEEES12_EEEEEEE13TypedGetValueERKNS0_11CopyableAnyE:
  157|  20.5k|  ValueType TypedGetValue(const GenericDomainCorpusType& v) const final {
  158|  20.5k|    return domain_.GetValue(v.GetAs<CorpusType>());
  159|  20.5k|  }

_ZNK8fuzztest8internal13ElementOfImplImE19ValidateCorpusValueERKNS0_23ElementOfImplCorpusTypeE:
  108|  41.1k|  absl::Status ValidateCorpusValue(const corpus_type& corpus_value) const {
  109|  41.1k|    if (static_cast<size_t>(corpus_value) < values_.size()) {
  ------------------
  |  Branch (109:9): [True: 41.0k, False: 31]
  ------------------
  110|  41.0k|      return absl::OkStatus();
  111|  41.0k|    }
  112|     31|    return absl::InvalidArgumentError("Invalid ElementOf() value");
  113|  41.1k|  }
_ZNK8fuzztest8internal13ElementOfImplImE11ParseCorpusERKNS0_8IRObjectE:
  100|  41.2k|  std::optional<corpus_type> ParseCorpus(const IRObject& obj) const {
  101|  41.2k|    return obj.ToCorpus<corpus_type>();
  102|  41.2k|  }
_ZNK8fuzztest8internal13ElementOfImplImE8GetValueENS0_23ElementOfImplCorpusTypeE:
   72|  41.0k|  value_type GetValue(corpus_type value) const {
   73|  41.0k|    return values_[static_cast<size_t>(value)];
   74|  41.0k|  }
_ZNK8fuzztest8internal13ElementOfImplIiE19ValidateCorpusValueERKNS0_23ElementOfImplCorpusTypeE:
  108|  20.5k|  absl::Status ValidateCorpusValue(const corpus_type& corpus_value) const {
  109|  20.5k|    if (static_cast<size_t>(corpus_value) < values_.size()) {
  ------------------
  |  Branch (109:9): [True: 20.5k, False: 16]
  ------------------
  110|  20.5k|      return absl::OkStatus();
  111|  20.5k|    }
  112|     16|    return absl::InvalidArgumentError("Invalid ElementOf() value");
  113|  20.5k|  }
_ZNK8fuzztest8internal13ElementOfImplIiE11ParseCorpusERKNS0_8IRObjectE:
  100|  20.6k|  std::optional<corpus_type> ParseCorpus(const IRObject& obj) const {
  101|  20.6k|    return obj.ToCorpus<corpus_type>();
  102|  20.6k|  }
_ZNK8fuzztest8internal13ElementOfImplIiE8GetValueENS0_23ElementOfImplCorpusTypeE:
   72|  20.5k|  value_type GetValue(corpus_type value) const {
   73|  20.5k|    return values_[static_cast<size_t>(value)];
   74|  20.5k|  }
_ZNK8fuzztest8internal13ElementOfImplI15avifCodecChoiceE19ValidateCorpusValueERKNS0_23ElementOfImplCorpusTypeE:
  108|  20.6k|  absl::Status ValidateCorpusValue(const corpus_type& corpus_value) const {
  109|  20.6k|    if (static_cast<size_t>(corpus_value) < values_.size()) {
  ------------------
  |  Branch (109:9): [True: 20.5k, False: 12]
  ------------------
  110|  20.5k|      return absl::OkStatus();
  111|  20.5k|    }
  112|     12|    return absl::InvalidArgumentError("Invalid ElementOf() value");
  113|  20.6k|  }
_ZNK8fuzztest8internal13ElementOfImplI15avifCodecChoiceE11ParseCorpusERKNS0_8IRObjectE:
  100|  20.6k|  std::optional<corpus_type> ParseCorpus(const IRObject& obj) const {
  101|  20.6k|    return obj.ToCorpus<corpus_type>();
  102|  20.6k|  }
_ZNK8fuzztest8internal13ElementOfImplI15avifCodecChoiceE8GetValueENS0_23ElementOfImplCorpusTypeE:
   72|  20.5k|  value_type GetValue(corpus_type value) const {
   73|  20.5k|    return values_[static_cast<size_t>(value)];
   74|  20.5k|  }
_ZNK8fuzztest8internal13ElementOfImplI17avifDecoderSourceE19ValidateCorpusValueERKNS0_23ElementOfImplCorpusTypeE:
  108|  20.5k|  absl::Status ValidateCorpusValue(const corpus_type& corpus_value) const {
  109|  20.5k|    if (static_cast<size_t>(corpus_value) < values_.size()) {
  ------------------
  |  Branch (109:9): [True: 20.5k, False: 13]
  ------------------
  110|  20.5k|      return absl::OkStatus();
  111|  20.5k|    }
  112|     13|    return absl::InvalidArgumentError("Invalid ElementOf() value");
  113|  20.5k|  }
_ZNK8fuzztest8internal13ElementOfImplI17avifDecoderSourceE11ParseCorpusERKNS0_8IRObjectE:
  100|  20.6k|  std::optional<corpus_type> ParseCorpus(const IRObject& obj) const {
  101|  20.6k|    return obj.ToCorpus<corpus_type>();
  102|  20.6k|  }
_ZNK8fuzztest8internal13ElementOfImplI17avifDecoderSourceE8GetValueENS0_23ElementOfImplCorpusTypeE:
   72|  20.5k|  value_type GetValue(corpus_type value) const {
   73|  20.5k|    return values_[static_cast<size_t>(value)];
   74|  20.5k|  }
_ZN8fuzztest8internal13ElementOfImplI15avifCodecChoiceEC2ENSt3__16vectorIS2_NS4_9allocatorIS2_EEEE:
   44|      4|  explicit ElementOfImpl(std::vector<T> values) : values_(values) {
   45|      4|    FUZZTEST_INTERNAL_CHECK_PRECONDITION(
  ------------------
  |  |   42|      4|  ((P) ? (void)0                                     \
  |  |  ------------------
  |  |  |  Branch (42:4): [True: 4, False: 0]
  |  |  ------------------
  |  |   43|      4|       : ::fuzztest::internal::Abort(                \
  |  |   44|      0|             __FILE__, __LINE__,                     \
  |  |   45|      0|             absl::StrCat("Failed precondition (", #P, "): ", __VA_ARGS__)))
  ------------------
   46|      4|        !values.empty(), "ElementOf requires a non empty list.");
   47|      4|  }
_ZN8fuzztest8internal13ElementOfImplI17avifDecoderSourceEC2ENSt3__16vectorIS2_NS4_9allocatorIS2_EEEE:
   44|      4|  explicit ElementOfImpl(std::vector<T> values) : values_(values) {
   45|      4|    FUZZTEST_INTERNAL_CHECK_PRECONDITION(
  ------------------
  |  |   42|      4|  ((P) ? (void)0                                     \
  |  |  ------------------
  |  |  |  Branch (42:4): [True: 4, False: 0]
  |  |  ------------------
  |  |   43|      4|       : ::fuzztest::internal::Abort(                \
  |  |   44|      0|             __FILE__, __LINE__,                     \
  |  |   45|      0|             absl::StrCat("Failed precondition (", #P, "): ", __VA_ARGS__)))
  ------------------
   46|      4|        !values.empty(), "ElementOf requires a non empty list.");
   47|      4|  }
_ZN8fuzztest8internal13ElementOfImplImEC2ENSt3__16vectorImNS3_9allocatorImEEEE:
   44|      8|  explicit ElementOfImpl(std::vector<T> values) : values_(values) {
   45|      8|    FUZZTEST_INTERNAL_CHECK_PRECONDITION(
  ------------------
  |  |   42|      8|  ((P) ? (void)0                                     \
  |  |  ------------------
  |  |  |  Branch (42:4): [True: 8, False: 0]
  |  |  ------------------
  |  |   43|      8|       : ::fuzztest::internal::Abort(                \
  |  |   44|      0|             __FILE__, __LINE__,                     \
  |  |   45|      0|             absl::StrCat("Failed precondition (", #P, "): ", __VA_ARGS__)))
  ------------------
   46|      8|        !values.empty(), "ElementOf requires a non empty list.");
   47|      8|  }
_ZN8fuzztest8internal13ElementOfImplIiEC2ENSt3__16vectorIiNS3_9allocatorIiEEEE:
   44|      4|  explicit ElementOfImpl(std::vector<T> values) : values_(values) {
   45|      4|    FUZZTEST_INTERNAL_CHECK_PRECONDITION(
  ------------------
  |  |   42|      4|  ((P) ? (void)0                                     \
  |  |  ------------------
  |  |  |  Branch (42:4): [True: 4, False: 0]
  |  |  ------------------
  |  |   43|      4|       : ::fuzztest::internal::Abort(                \
  |  |   44|      0|             __FILE__, __LINE__,                     \
  |  |   45|      0|             absl::StrCat("Failed precondition (", #P, "): ", __VA_ARGS__)))
  ------------------
   46|      4|        !values.empty(), "ElementOf requires a non empty list.");
   47|      4|  }

_ZNK8fuzztest8internal11InRangeImplIiE19ValidateCorpusValueERKi:
  175|  20.5k|  absl::Status ValidateCorpusValue(const value_type& corpus_value) const {
  176|  20.5k|    if (min_ <= corpus_value && corpus_value <= max_) return absl::OkStatus();
  ------------------
  |  Branch (176:9): [True: 20.5k, False: 3]
  |  Branch (176:33): [True: 20.5k, False: 9]
  ------------------
  177|       |    // We cannot just absl::StrCat() the error message, because it doesn't
  178|       |    // accept some types (like char).
  179|     12|    std::string error_message;
  180|     12|    absl::Format(&error_message, "The value ");
  181|     12|    domain_implementor::PrintValue(*this, corpus_value, &error_message,
  182|     12|                                   domain_implementor::PrintMode::kSourceCode);
  183|     12|    absl::Format(&error_message, " is not InRange(");
  184|     12|    domain_implementor::PrintValue(*this, min_, &error_message,
  185|     12|                                   domain_implementor::PrintMode::kSourceCode);
  186|     12|    absl::Format(&error_message, ", ");
  187|     12|    domain_implementor::PrintValue(*this, max_, &error_message,
  188|     12|                                   domain_implementor::PrintMode::kSourceCode);
  189|     12|    absl::Format(&error_message, ")");
  190|     12|    return absl::InvalidArgumentError(error_message);
  191|  20.5k|  }
_ZNK8fuzztest8internal11InRangeImplIiE10GetPrinterEv:
  193|     36|  auto GetPrinter() const {
  194|     36|    if constexpr (std::numeric_limits<T>::is_integer) {
  195|     36|      return IntegralPrinter{};
  196|       |    } else {
  197|       |      return FloatingPrinter{};
  198|       |    }
  199|     36|  }
_ZN8fuzztest8internal11InRangeImplIiEC2Eii:
   55|      4|  explicit InRangeImpl(T min, T max) : min_(min), max_(max) {
   56|      4|    FUZZTEST_INTERNAL_CHECK_PRECONDITION(
  ------------------
  |  |   42|      4|  ((P) ? (void)0                                     \
  |  |  ------------------
  |  |  |  Branch (42:4): [True: 4, False: 0]
  |  |  ------------------
  |  |   43|      4|       : ::fuzztest::internal::Abort(                \
  |  |   44|      0|             __FILE__, __LINE__,                     \
  |  |   45|      0|             absl::StrCat("Failed precondition (", #P, "): ", __VA_ARGS__)))
  ------------------
   57|      4|        min <= max, "min must be less than or equal to max!");
   58|       |    if constexpr (!T_is_integer) {
   59|       |      FUZZTEST_INTERNAL_CHECK_PRECONDITION(
   60|       |          !(min == std::numeric_limits<T>::lowest() &&
   61|       |            max == std::numeric_limits<T>::max()),
   62|       |          "Consider using the Finite<T>() domain instead.");
   63|       |      FUZZTEST_INTERNAL_CHECK_PRECONDITION(std::isfinite(max - min),
   64|       |                                           "Range is too large!");
   65|       |    }
   66|      4|    if constexpr (T_is_integer) {
   67|       |      // Find the longest common prefix
   68|       |      // (from the most significant bit to the least significant bit) of
   69|       |      // min_ and max_, and we only mutate the bits after the prefix.
   70|       |      // This way it can somehow restrict the bit flipping range, but it
   71|       |      // may still fail for range like [0b10000000, 0b01111111] which has
   72|       |      // no valid bit flipping mutations.
   73|       |      // We need to split the signed type range to positve range and
   74|       |      // negative range because of their two's complement representation.
   75|      4|      if constexpr (T_is_signed) {
   76|      4|        if (min_ < 0 && max_ >= 0) {
  ------------------
  |  Branch (76:13): [True: 0, False: 4]
  |  Branch (76:25): [True: 0, False: 0]
  ------------------
   77|      0|          largest_mutable_bit_negative = BitWidth(~min);
   78|      0|          largest_mutable_bit_positive = BitWidth(max);
   79|      4|        } else if (min_ >= 0) {
  ------------------
  |  Branch (79:20): [True: 4, False: 0]
  ------------------
   80|      4|          largest_mutable_bit_positive = BitWidth(min ^ max);
   81|      4|        } else if (max_ < 0) {
  ------------------
  |  Branch (81:20): [True: 0, False: 0]
  ------------------
   82|      0|          largest_mutable_bit_negative = BitWidth(min ^ max);
   83|      0|        }
   84|       |      } else {
   85|       |        largest_mutable_bit_positive = BitWidth(min ^ max);
   86|       |      }
   87|      4|    }
   88|      4|  }

_ZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEE19ValidateCorpusValueERKNS2_5tupleIJNS0_23ElementOfImplCorpusTypeEiSP_bbbbSP_SP_SP_jEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENSX_ImLm1EEENSX_ImLm2EEENSX_ImLm3EEENSX_ImLm4EEENSX_ImLm5EEENSX_ImLm6EEENSX_ImLm7EEENSX_ImLm8EEENSX_ImLm9EEENSX_ImLm10EEEEEEDaSU_:
   95|  20.6k|    ApplyIndex<sizeof...(Inner)>([&](auto... I) {
   96|  20.6k|      (
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
   99|  20.6k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.6k|                std::get<I>(corpus_value));
  101|  20.6k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.6k|          }(),
  103|  20.6k|          ...);
  104|  20.6k|    });
_ZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEE19ValidateCorpusValueERKNS2_5tupleIJNS0_23ElementOfImplCorpusTypeEiSP_bbbbSP_SP_SP_jEEE:
   93|  20.6k|  absl::Status ValidateCorpusValue(const corpus_type& corpus_value) const {
   94|  20.6k|    absl::Status result = absl::OkStatus();
   95|  20.6k|    ApplyIndex<sizeof...(Inner)>([&](auto... I) {
   96|  20.6k|      (
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
   99|  20.6k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.6k|                std::get<I>(corpus_value));
  101|  20.6k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.6k|          }(),
  103|  20.6k|          ...);
  104|  20.6k|    });
  105|  20.6k|    return result;
  106|  20.6k|  }
_ZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEE11ParseCorpusERKNS0_8IRObjectE:
   85|  20.6k|  std::optional<corpus_type> ParseCorpus(const IRObject& obj) const {
   86|  20.6k|    return ParseWithDomainTuple(inner_, obj);
   87|  20.6k|  }
_ZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEE8GetValueERKNS2_5tupleIJNS0_23ElementOfImplCorpusTypeEiSP_bbbbSP_SP_SP_jEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENSX_ImLm1EEENSX_ImLm2EEENSX_ImLm3EEENSX_ImLm4EEENSX_ImLm5EEENSX_ImLm6EEENSX_ImLm7EEENSX_ImLm8EEENSX_ImLm9EEENSX_ImLm10EEEEEEDaSU_:
   72|  20.5k|    return ApplyIndex<sizeof...(Inner)>([&](auto... I) {
   73|  20.5k|      return mapper_(std::get<I>(inner_).GetValue(std::get<I>(v))...);
   74|  20.5k|    });
_ZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEE8GetValueERKNS2_5tupleIJNS0_23ElementOfImplCorpusTypeEiSP_bbbbSP_SP_SP_jEEE:
   71|  20.5k|  value_type GetValue(const corpus_type& v) const {
   72|  20.5k|    return ApplyIndex<sizeof...(Inner)>([&](auto... I) {
   73|  20.5k|      return mapper_(std::get<I>(inner_).GetValue(std::get<I>(v))...);
   74|  20.5k|    });
   75|  20.5k|  }
_ZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEES7_jEJNS1_IPFS7_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISA_EENS0_11InRangeImplIiEENSE_ISB_EENS0_13ArbitraryImplIbvEESK_SK_SK_NSE_ImEESL_NSE_IiEENS0_24BitFlagCombinationOfImplIjEEEEESO_EE19ValidateCorpusValueERKNS2_5tupleIJNSR_IJNS0_23ElementOfImplCorpusTypeEiSS_bbbbSS_SS_SS_jEEEjEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENS11_ImLm1EEEEEEDaSY_:
   95|  20.6k|    ApplyIndex<sizeof...(Inner)>([&](auto... I) {
   96|  20.6k|      (
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
   99|  20.6k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.6k|                std::get<I>(corpus_value));
  101|  20.6k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.6k|          }(),
  103|  20.6k|          ...);
  104|  20.6k|    });
_ZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEES7_jEJNS1_IPFS7_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISA_EENS0_11InRangeImplIiEENSE_ISB_EENS0_13ArbitraryImplIbvEESK_SK_SK_NSE_ImEESL_NSE_IiEENS0_24BitFlagCombinationOfImplIjEEEEESO_EE19ValidateCorpusValueERKNS2_5tupleIJNSR_IJNS0_23ElementOfImplCorpusTypeEiSS_bbbbSS_SS_SS_jEEEjEEE:
   93|  20.6k|  absl::Status ValidateCorpusValue(const corpus_type& corpus_value) const {
   94|  20.6k|    absl::Status result = absl::OkStatus();
   95|  20.6k|    ApplyIndex<sizeof...(Inner)>([&](auto... I) {
   96|  20.6k|      (
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
   99|  20.6k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.6k|                std::get<I>(corpus_value));
  101|  20.6k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.6k|          }(),
  103|  20.6k|          ...);
  104|  20.6k|    });
  105|  20.6k|    return result;
  106|  20.6k|  }
_ZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEES7_jEJNS1_IPFS7_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISA_EENS0_11InRangeImplIiEENSE_ISB_EENS0_13ArbitraryImplIbvEESK_SK_SK_NSE_ImEESL_NSE_IiEENS0_24BitFlagCombinationOfImplIjEEEEESO_EE11ParseCorpusERKNS0_8IRObjectE:
   85|  20.6k|  std::optional<corpus_type> ParseCorpus(const IRObject& obj) const {
   86|  20.6k|    return ParseWithDomainTuple(inner_, obj);
   87|  20.6k|  }
_ZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEES7_jEJNS1_IPFS7_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISA_EENS0_11InRangeImplIiEENSE_ISB_EENS0_13ArbitraryImplIbvEESK_SK_SK_NSE_ImEESL_NSE_IiEENS0_24BitFlagCombinationOfImplIjEEEEESO_EE8GetValueERKNS2_5tupleIJNSR_IJNS0_23ElementOfImplCorpusTypeEiSS_bbbbSS_SS_SS_jEEEjEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENS11_ImLm1EEEEEEDaSY_:
   72|  20.5k|    return ApplyIndex<sizeof...(Inner)>([&](auto... I) {
   73|  20.5k|      return mapper_(std::get<I>(inner_).GetValue(std::get<I>(v))...);
   74|  20.5k|    });
_ZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEES7_jEJNS1_IPFS7_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISA_EENS0_11InRangeImplIiEENSE_ISB_EENS0_13ArbitraryImplIbvEESK_SK_SK_NSE_ImEESL_NSE_IiEENS0_24BitFlagCombinationOfImplIjEEEEESO_EE8GetValueERKNS2_5tupleIJNSR_IJNS0_23ElementOfImplCorpusTypeEiSS_bbbbSS_SS_SS_jEEEjEEE:
   71|  20.5k|  value_type GetValue(const corpus_type& v) const {
   72|  20.5k|    return ApplyIndex<sizeof...(Inner)>([&](auto... I) {
   73|  20.5k|      return mapper_(std::get<I>(inner_).GetValue(std::get<I>(v))...);
   74|  20.5k|    });
   75|  20.5k|  }
_ZZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEES7_jEJNS1_IPFS7_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISA_EENS0_11InRangeImplIiEENSE_ISB_EENS0_13ArbitraryImplIbvEESK_SK_SK_NSE_ImEESL_NSE_IiEENS0_24BitFlagCombinationOfImplIjEEEEESO_EE19ValidateCorpusValueERKNS2_5tupleIJNSR_IJNS0_23ElementOfImplCorpusTypeEiSS_bbbbSS_SS_SS_jEEEjEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENS11_ImLm1EEEEEEDaSY_ENKUlvE0_clEv:
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
  ------------------
  |  Branch (98:17): [True: 0, False: 20.6k]
  ------------------
   99|  20.6k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.6k|                std::get<I>(corpus_value));
  101|  20.6k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.6k|          }(),
_ZZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEE19ValidateCorpusValueERKNS2_5tupleIJNS0_23ElementOfImplCorpusTypeEiSP_bbbbSP_SP_SP_jEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENSX_ImLm1EEENSX_ImLm2EEENSX_ImLm3EEENSX_ImLm4EEENSX_ImLm5EEENSX_ImLm6EEENSX_ImLm7EEENSX_ImLm8EEENSX_ImLm9EEENSX_ImLm10EEEEEEDaSU_ENKUlvE9_clEv:
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
  ------------------
  |  Branch (98:17): [True: 0, False: 20.6k]
  ------------------
   99|  20.6k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.6k|                std::get<I>(corpus_value));
  101|  20.6k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.6k|          }(),
_ZZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEE19ValidateCorpusValueERKNS2_5tupleIJNS0_23ElementOfImplCorpusTypeEiSP_bbbbSP_SP_SP_jEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENSX_ImLm1EEENSX_ImLm2EEENSX_ImLm3EEENSX_ImLm4EEENSX_ImLm5EEENSX_ImLm6EEENSX_ImLm7EEENSX_ImLm8EEENSX_ImLm9EEENSX_ImLm10EEEEEEDaSU_ENKUlvE8_clEv:
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
  ------------------
  |  Branch (98:17): [True: 12, False: 20.5k]
  ------------------
   99|  20.5k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.5k|                std::get<I>(corpus_value));
  101|  20.5k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.5k|          }(),
_ZZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEE19ValidateCorpusValueERKNS2_5tupleIJNS0_23ElementOfImplCorpusTypeEiSP_bbbbSP_SP_SP_jEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENSX_ImLm1EEENSX_ImLm2EEENSX_ImLm3EEENSX_ImLm4EEENSX_ImLm5EEENSX_ImLm6EEENSX_ImLm7EEENSX_ImLm8EEENSX_ImLm9EEENSX_ImLm10EEEEEEDaSU_ENKUlvE7_clEv:
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
  ------------------
  |  Branch (98:17): [True: 24, False: 20.5k]
  ------------------
   99|  20.5k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.5k|                std::get<I>(corpus_value));
  101|  20.5k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.5k|          }(),
_ZZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEE19ValidateCorpusValueERKNS2_5tupleIJNS0_23ElementOfImplCorpusTypeEiSP_bbbbSP_SP_SP_jEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENSX_ImLm1EEENSX_ImLm2EEENSX_ImLm3EEENSX_ImLm4EEENSX_ImLm5EEENSX_ImLm6EEENSX_ImLm7EEENSX_ImLm8EEENSX_ImLm9EEENSX_ImLm10EEEEEEDaSU_ENKUlvE6_clEv:
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
  ------------------
  |  Branch (98:17): [True: 37, False: 20.5k]
  ------------------
   99|  20.5k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.5k|                std::get<I>(corpus_value));
  101|  20.5k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.5k|          }(),
_ZZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEE19ValidateCorpusValueERKNS2_5tupleIJNS0_23ElementOfImplCorpusTypeEiSP_bbbbSP_SP_SP_jEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENSX_ImLm1EEENSX_ImLm2EEENSX_ImLm3EEENSX_ImLm4EEENSX_ImLm5EEENSX_ImLm6EEENSX_ImLm7EEENSX_ImLm8EEENSX_ImLm9EEENSX_ImLm10EEEEEEDaSU_ENKUlvE5_clEv:
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
  ------------------
  |  Branch (98:17): [True: 37, False: 20.5k]
  ------------------
   99|  20.5k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.5k|                std::get<I>(corpus_value));
  101|  20.5k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.5k|          }(),
_ZZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEE19ValidateCorpusValueERKNS2_5tupleIJNS0_23ElementOfImplCorpusTypeEiSP_bbbbSP_SP_SP_jEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENSX_ImLm1EEENSX_ImLm2EEENSX_ImLm3EEENSX_ImLm4EEENSX_ImLm5EEENSX_ImLm6EEENSX_ImLm7EEENSX_ImLm8EEENSX_ImLm9EEENSX_ImLm10EEEEEEDaSU_ENKUlvE4_clEv:
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
  ------------------
  |  Branch (98:17): [True: 37, False: 20.5k]
  ------------------
   99|  20.5k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.5k|                std::get<I>(corpus_value));
  101|  20.5k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.5k|          }(),
_ZZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEE19ValidateCorpusValueERKNS2_5tupleIJNS0_23ElementOfImplCorpusTypeEiSP_bbbbSP_SP_SP_jEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENSX_ImLm1EEENSX_ImLm2EEENSX_ImLm3EEENSX_ImLm4EEENSX_ImLm5EEENSX_ImLm6EEENSX_ImLm7EEENSX_ImLm8EEENSX_ImLm9EEENSX_ImLm10EEEEEEDaSU_ENKUlvE3_clEv:
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
  ------------------
  |  Branch (98:17): [True: 37, False: 20.5k]
  ------------------
   99|  20.5k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.5k|                std::get<I>(corpus_value));
  101|  20.5k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.5k|          }(),
_ZZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEE19ValidateCorpusValueERKNS2_5tupleIJNS0_23ElementOfImplCorpusTypeEiSP_bbbbSP_SP_SP_jEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENSX_ImLm1EEENSX_ImLm2EEENSX_ImLm3EEENSX_ImLm4EEENSX_ImLm5EEENSX_ImLm6EEENSX_ImLm7EEENSX_ImLm8EEENSX_ImLm9EEENSX_ImLm10EEEEEEDaSU_ENKUlvE2_clEv:
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
  ------------------
  |  Branch (98:17): [True: 37, False: 20.5k]
  ------------------
   99|  20.5k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.5k|                std::get<I>(corpus_value));
  101|  20.5k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.5k|          }(),
_ZZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEE19ValidateCorpusValueERKNS2_5tupleIJNS0_23ElementOfImplCorpusTypeEiSP_bbbbSP_SP_SP_jEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENSX_ImLm1EEENSX_ImLm2EEENSX_ImLm3EEENSX_ImLm4EEENSX_ImLm5EEENSX_ImLm6EEENSX_ImLm7EEENSX_ImLm8EEENSX_ImLm9EEENSX_ImLm10EEEEEEDaSU_ENKUlvE1_clEv:
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
  ------------------
  |  Branch (98:17): [True: 54, False: 20.5k]
  ------------------
   99|  20.5k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.5k|                std::get<I>(corpus_value));
  101|  20.5k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.5k|          }(),
_ZZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEE19ValidateCorpusValueERKNS2_5tupleIJNS0_23ElementOfImplCorpusTypeEiSP_bbbbSP_SP_SP_jEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENSX_ImLm1EEENSX_ImLm2EEENSX_ImLm3EEENSX_ImLm4EEENSX_ImLm5EEENSX_ImLm6EEENSX_ImLm7EEENSX_ImLm8EEENSX_ImLm9EEENSX_ImLm10EEEEEEDaSU_ENKUlvE0_clEv:
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
  ------------------
  |  Branch (98:17): [True: 68, False: 20.5k]
  ------------------
   99|  20.5k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.5k|                std::get<I>(corpus_value));
  101|  20.5k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.5k|          }(),
_ZZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEE19ValidateCorpusValueERKNS2_5tupleIJNS0_23ElementOfImplCorpusTypeEiSP_bbbbSP_SP_SP_jEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENSX_ImLm1EEENSX_ImLm2EEENSX_ImLm3EEENSX_ImLm4EEENSX_ImLm5EEENSX_ImLm6EEENSX_ImLm7EEENSX_ImLm8EEENSX_ImLm9EEENSX_ImLm10EEEEEEDaSU_ENKUlvE_clEv:
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
  ------------------
  |  Branch (98:17): [True: 84, False: 20.5k]
  ------------------
   99|  20.5k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.5k|                std::get<I>(corpus_value));
  101|  20.5k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.5k|          }(),
_ZZZNK8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEES7_jEJNS1_IPFS7_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISA_EENS0_11InRangeImplIiEENSE_ISB_EENS0_13ArbitraryImplIbvEESK_SK_SK_NSE_ImEESL_NSE_IiEENS0_24BitFlagCombinationOfImplIjEEEEESO_EE19ValidateCorpusValueERKNS2_5tupleIJNSR_IJNS0_23ElementOfImplCorpusTypeEiSS_bbbbSS_SS_SS_jEEEjEEEENKUlDpT_E_clIJNS2_17integral_constantImLm0EEENS11_ImLm1EEEEEEDaSY_ENKUlvE_clEv:
   97|  20.6k|          [&] {
   98|  20.6k|            if (!result.ok()) return;
  ------------------
  |  Branch (98:17): [True: 93, False: 20.5k]
  ------------------
   99|  20.5k|            const absl::Status s = std::get<I>(inner_).ValidateCorpusValue(
  100|  20.5k|                std::get<I>(corpus_value));
  101|  20.5k|            result = Prefix(s, "Invalid value for Map()-ed domain");
  102|  20.5k|          }(),
_ZN8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEES7_jEJNS1_IPFS7_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISA_EENS0_11InRangeImplIiEENSE_ISB_EENS0_13ArbitraryImplIbvEESK_SK_SK_NSE_ImEESL_NSE_IiEENS0_24BitFlagCombinationOfImplIjEEEEESO_EEC2ES9_SP_SO_:
   47|      4|      : mapper_(std::move(mapper)), inner_(std::move(inner)...) {}
_ZN8fuzztest8internal7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS8_EENS0_11InRangeImplIiEENSC_IS9_EENS0_13ArbitraryImplIbvEESI_SI_SI_NSC_ImEESJ_NSC_IiEENS0_24BitFlagCombinationOfImplIjEEEEC2ESB_SD_SF_SG_SI_SI_SI_SI_SJ_SJ_SK_SM_:
   47|      4|      : mapper_(std::move(mapper)), inner_(std::move(inner)...) {}

_ZZZN8fuzztest8internal20ParseWithDomainTupleIJNS0_13ElementOfImplI15avifCodecChoiceEENS0_11InRangeImplIiEENS2_I17avifDecoderSourceEENS0_13ArbitraryImplIbvEESA_SA_SA_NS2_ImEESB_NS2_IiEENS0_24BitFlagCombinationOfImplIjEEEEENSt3__18optionalINSF_5tupleIJDpNT_11corpus_typeEEEEEERKNSH_IJDpSI_EEERKNS0_8IRObjectEiENKUlSN_E_clIJNSF_17integral_constantImLm0EEENSW_ImLm1EEENSW_ImLm2EEENSW_ImLm3EEENSW_ImLm4EEENSW_ImLm5EEENSW_ImLm6EEENSW_ImLm7EEENSW_ImLm8EEENSW_ImLm9EEENSW_ImLm10EEEEEEDaSN_ENKUlSN_E_clIJNSG_INS0_23ElementOfImplCorpusTypeEEENSG_IiEES1B_NSG_IbEES1D_S1D_S1D_S1B_S1B_S1B_NSG_IjEEEEEDaSN_:
  118|  20.6k|    return [](auto... opts) {
  119|   432k|      return (!opts || ...)
  ------------------
  |  Branch (119:15): [True: 1, False: 20.6k]
  |  Branch (119:15): [True: 1, False: 20.6k]
  |  Branch (119:15): [True: 1, False: 20.6k]
  |  Branch (119:15): [True: 1, False: 20.6k]
  |  Branch (119:15): [True: 2, False: 20.6k]
  |  Branch (119:15): [True: 4, False: 20.6k]
  |  Branch (119:15): [True: 2, False: 20.6k]
  |  Branch (119:15): [True: 1, False: 20.6k]
  |  Branch (119:15): [True: 2, False: 20.6k]
  |  Branch (119:15): [True: 1, False: 20.6k]
  |  Branch (119:15): [True: 2, False: 20.6k]
  ------------------
  120|  20.6k|                 ? std::nullopt
  121|  20.6k|                 : std::optional(std::tuple<corpus_type_t<Domain>...>{
  122|  20.6k|                       *std::move(opts)...});
  123|  20.6k|    }(std::get<I>(domains).ParseCorpus((*subs)[I + skip])...);
_ZZN8fuzztest8internal20ParseWithDomainTupleIJNS0_13ElementOfImplI15avifCodecChoiceEENS0_11InRangeImplIiEENS2_I17avifDecoderSourceEENS0_13ArbitraryImplIbvEESA_SA_SA_NS2_ImEESB_NS2_IiEENS0_24BitFlagCombinationOfImplIjEEEEENSt3__18optionalINSF_5tupleIJDpNT_11corpus_typeEEEEEERKNSH_IJDpSI_EEERKNS0_8IRObjectEiENKUlSN_E_clIJNSF_17integral_constantImLm0EEENSW_ImLm1EEENSW_ImLm2EEENSW_ImLm3EEENSW_ImLm4EEENSW_ImLm5EEENSW_ImLm6EEENSW_ImLm7EEENSW_ImLm8EEENSW_ImLm9EEENSW_ImLm10EEEEEEDaSN_:
  117|  20.6k|  return ApplyIndex<sizeof...(Domain)>([&](auto... I) {
  118|  20.6k|    return [](auto... opts) {
  119|  20.6k|      return (!opts || ...)
  120|  20.6k|                 ? std::nullopt
  121|  20.6k|                 : std::optional(std::tuple<corpus_type_t<Domain>...>{
  122|  20.6k|                       *std::move(opts)...});
  123|  20.6k|    }(std::get<I>(domains).ParseCorpus((*subs)[I + skip])...);
  124|  20.6k|  });
_ZN8fuzztest8internal20ParseWithDomainTupleIJNS0_13ElementOfImplI15avifCodecChoiceEENS0_11InRangeImplIiEENS2_I17avifDecoderSourceEENS0_13ArbitraryImplIbvEESA_SA_SA_NS2_ImEESB_NS2_IiEENS0_24BitFlagCombinationOfImplIjEEEEENSt3__18optionalINSF_5tupleIJDpNT_11corpus_typeEEEEEERKNSH_IJDpSI_EEERKNS0_8IRObjectEi:
  114|  20.6k|    const std::tuple<Domain...>& domains, const IRObject& obj, int skip = 0) {
  115|  20.6k|  auto subs = obj.Subs();
  116|  20.6k|  if (!subs || subs->size() != sizeof...(Domain) + skip) return std::nullopt;
  ------------------
  |  Branch (116:7): [True: 1, False: 20.6k]
  |  Branch (116:16): [True: 2, False: 20.6k]
  ------------------
  117|  20.6k|  return ApplyIndex<sizeof...(Domain)>([&](auto... I) {
  118|  20.6k|    return [](auto... opts) {
  119|  20.6k|      return (!opts || ...)
  120|  20.6k|                 ? std::nullopt
  121|  20.6k|                 : std::optional(std::tuple<corpus_type_t<Domain>...>{
  122|  20.6k|                       *std::move(opts)...});
  123|  20.6k|    }(std::get<I>(domains).ParseCorpus((*subs)[I + skip])...);
  124|  20.6k|  });
  125|  20.6k|}
_ZZZN8fuzztest8internal20ParseWithDomainTupleIJNS0_7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS9_EENS0_11InRangeImplIiEENSD_ISA_EENS0_13ArbitraryImplIbvEESJ_SJ_SJ_NSD_ImEESK_NSD_IiEENS0_24BitFlagCombinationOfImplIjEEEEESN_EEENS3_8optionalINS3_5tupleIJDpNT_11corpus_typeEEEEEERKNSQ_IJDpSR_EEERKNS0_8IRObjectEiENKUlSW_E_clIJNS3_17integral_constantImLm0EEENS15_ImLm1EEEEEEDaSW_ENKUlSW_E_clIJNSP_INSQ_IJNS0_23ElementOfImplCorpusTypeEiS1A_bbbbS1A_S1A_S1A_jEEEEENSP_IjEEEEEDaSW_:
  118|  20.6k|    return [](auto... opts) {
  119|  61.8k|      return (!opts || ...)
  ------------------
  |  Branch (119:15): [True: 21, False: 20.6k]
  |  Branch (119:15): [True: 1, False: 20.6k]
  ------------------
  120|  20.6k|                 ? std::nullopt
  121|  20.6k|                 : std::optional(std::tuple<corpus_type_t<Domain>...>{
  122|  20.6k|                       *std::move(opts)...});
  123|  20.6k|    }(std::get<I>(domains).ParseCorpus((*subs)[I + skip])...);
_ZZN8fuzztest8internal20ParseWithDomainTupleIJNS0_7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS9_EENS0_11InRangeImplIiEENSD_ISA_EENS0_13ArbitraryImplIbvEESJ_SJ_SJ_NSD_ImEESK_NSD_IiEENS0_24BitFlagCombinationOfImplIjEEEEESN_EEENS3_8optionalINS3_5tupleIJDpNT_11corpus_typeEEEEEERKNSQ_IJDpSR_EEERKNS0_8IRObjectEiENKUlSW_E_clIJNS3_17integral_constantImLm0EEENS15_ImLm1EEEEEEDaSW_:
  117|  20.6k|  return ApplyIndex<sizeof...(Domain)>([&](auto... I) {
  118|  20.6k|    return [](auto... opts) {
  119|  20.6k|      return (!opts || ...)
  120|  20.6k|                 ? std::nullopt
  121|  20.6k|                 : std::optional(std::tuple<corpus_type_t<Domain>...>{
  122|  20.6k|                       *std::move(opts)...});
  123|  20.6k|    }(std::get<I>(domains).ParseCorpus((*subs)[I + skip])...);
  124|  20.6k|  });
_ZN8fuzztest8internal20ParseWithDomainTupleIJNS0_7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS9_EENS0_11InRangeImplIiEENSD_ISA_EENS0_13ArbitraryImplIbvEESJ_SJ_SJ_NSD_ImEESK_NSD_IiEENS0_24BitFlagCombinationOfImplIjEEEEESN_EEENS3_8optionalINS3_5tupleIJDpNT_11corpus_typeEEEEEERKNSQ_IJDpSR_EEERKNS0_8IRObjectEi:
  114|  20.6k|    const std::tuple<Domain...>& domains, const IRObject& obj, int skip = 0) {
  115|  20.6k|  auto subs = obj.Subs();
  116|  20.6k|  if (!subs || subs->size() != sizeof...(Domain) + skip) return std::nullopt;
  ------------------
  |  Branch (116:7): [True: 30, False: 20.6k]
  |  Branch (116:16): [True: 2, False: 20.6k]
  ------------------
  117|  20.6k|  return ApplyIndex<sizeof...(Domain)>([&](auto... I) {
  118|  20.6k|    return [](auto... opts) {
  119|  20.6k|      return (!opts || ...)
  120|  20.6k|                 ? std::nullopt
  121|  20.6k|                 : std::optional(std::tuple<corpus_type_t<Domain>...>{
  122|  20.6k|                       *std::move(opts)...});
  123|  20.6k|    }(std::get<I>(domains).ParseCorpus((*subs)[I + skip])...);
  124|  20.6k|  });
  125|  20.6k|}
_ZN8fuzztest8internal20ParseWithDomainTupleIJNS0_23SequenceContainerOfImplINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS0_13ArbitraryImplIcvEEEENSA_IbvEENS0_7MapImplIPFNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEESJ_jEJNSE_IPFSJ_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISM_EENS0_11InRangeImplIiEENSQ_ISN_EESD_SD_SD_SD_NSQ_ImEESV_NSQ_IiEENS0_24BitFlagCombinationOfImplIjEEEEESY_EEEEEENS3_8optionalINS3_5tupleIJDpNT_11corpus_typeEEEEEERKNS12_IJDpS13_EEERKNS0_8IRObjectEi:
  114|  22.1k|    const std::tuple<Domain...>& domains, const IRObject& obj, int skip = 0) {
  115|  22.1k|  auto subs = obj.Subs();
  116|  22.1k|  if (!subs || subs->size() != sizeof...(Domain) + skip) return std::nullopt;
  ------------------
  |  Branch (116:7): [True: 1.28k, False: 20.8k]
  |  Branch (116:16): [True: 198, False: 20.6k]
  ------------------
  117|  20.6k|  return ApplyIndex<sizeof...(Domain)>([&](auto... I) {
  118|  20.6k|    return [](auto... opts) {
  119|  20.6k|      return (!opts || ...)
  120|  20.6k|                 ? std::nullopt
  121|  20.6k|                 : std::optional(std::tuple<corpus_type_t<Domain>...>{
  122|  20.6k|                       *std::move(opts)...});
  123|  20.6k|    }(std::get<I>(domains).ParseCorpus((*subs)[I + skip])...);
  124|  20.6k|  });
  125|  22.1k|}
_ZZN8fuzztest8internal20ParseWithDomainTupleIJNS0_23SequenceContainerOfImplINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS0_13ArbitraryImplIcvEEEENSA_IbvEENS0_7MapImplIPFNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEESJ_jEJNSE_IPFSJ_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISM_EENS0_11InRangeImplIiEENSQ_ISN_EESD_SD_SD_SD_NSQ_ImEESV_NSQ_IiEENS0_24BitFlagCombinationOfImplIjEEEEESY_EEEEEENS3_8optionalINS3_5tupleIJDpNT_11corpus_typeEEEEEERKNS12_IJDpS13_EEERKNS0_8IRObjectEiENKUlS18_E_clIJNS3_17integral_constantImLm0EEENS1H_ImLm1EEENS1H_ImLm2EEEEEEDaS18_:
  117|  20.6k|  return ApplyIndex<sizeof...(Domain)>([&](auto... I) {
  118|  20.6k|    return [](auto... opts) {
  119|  20.6k|      return (!opts || ...)
  120|  20.6k|                 ? std::nullopt
  121|  20.6k|                 : std::optional(std::tuple<corpus_type_t<Domain>...>{
  122|  20.6k|                       *std::move(opts)...});
  123|  20.6k|    }(std::get<I>(domains).ParseCorpus((*subs)[I + skip])...);
  124|  20.6k|  });
_ZZZN8fuzztest8internal20ParseWithDomainTupleIJNS0_23SequenceContainerOfImplINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEENS0_13ArbitraryImplIcvEEEENSA_IbvEENS0_7MapImplIPFNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEESJ_jEJNSE_IPFSJ_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISM_EENS0_11InRangeImplIiEENSQ_ISN_EESD_SD_SD_SD_NSQ_ImEESV_NSQ_IiEENS0_24BitFlagCombinationOfImplIjEEEEESY_EEEEEENS3_8optionalINS3_5tupleIJDpNT_11corpus_typeEEEEEERKNS12_IJDpS13_EEERKNS0_8IRObjectEiENKUlS18_E_clIJNS3_17integral_constantImLm0EEENS1H_ImLm1EEENS1H_ImLm2EEEEEEDaS18_ENKUlS18_E_clIJNS11_IS9_EENS11_IbEENS11_INS12_IJNS12_IJNS0_23ElementOfImplCorpusTypeEiS1P_bbbbS1P_S1P_S1P_jEEEjEEEEEEEEDaS18_:
  118|  20.6k|    return [](auto... opts) {
  119|   103k|      return (!opts || ...)
  ------------------
  |  Branch (119:15): [True: 1, False: 20.6k]
  |  Branch (119:15): [True: 25, False: 20.6k]
  |  Branch (119:15): [True: 30, False: 20.6k]
  ------------------
  120|  20.6k|                 ? std::nullopt
  121|  20.6k|                 : std::optional(std::tuple<corpus_type_t<Domain>...>{
  122|  20.6k|                       *std::move(opts)...});
  123|  20.6k|    }(std::get<I>(domains).ParseCorpus((*subs)[I + skip])...);

_ZN8fuzztest8internal8BitWidthIiEEmT_:
   45|      4|size_t BitWidth(T val) {
   46|       |  if constexpr (std::is_same_v<T, absl::int128> ||
   47|       |                std::is_same_v<T, absl::uint128>) {
   48|       |    auto val_unsigned = MakeUnsignedT<T>(val);
   49|       |    size_t res = 0;
   50|       |    while (val_unsigned >>= 1) ++res;
   51|       |    return res;
   52|      4|  } else {
   53|      4|    return absl::bit_width(static_cast<MakeUnsignedT<T>>(val));
   54|      4|  }
   55|      4|}

_ZN8fuzztest8internal20UntypedFixtureDriver13SetUpFuzzTestEv:
   20|      2|void UntypedFixtureDriver::SetUpFuzzTest() {}
_ZN8fuzztest8internal20UntypedFixtureDriver14SetUpIterationEv:
   21|  20.5k|void UntypedFixtureDriver::SetUpIteration() {}
_ZN8fuzztest8internal20UntypedFixtureDriver17TearDownIterationEv:
   22|  20.5k|void UntypedFixtureDriver::TearDownIteration() {}

_ZN8fuzztest8internal13FixtureDriverINS_6DomainINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEEENS0_9NoFixtureEPFvRKSA_bSF_EPvEC2ESM_SH_NS3_6vectorINS0_11CopyableAnyENS8_ISQ_EEEERKSN_:
  287|      2|      : FixtureDriver::TypedFixtureDriver(std::move(domain), std::move(seeds),
  288|      2|                                          seed_provider),
  289|      2|        target_function_(target_function) {}
_ZN8fuzztest8internal18TypedFixtureDriverINSt3__15tupleIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEPvEC2ENS_6DomainISF_EENS2_6vectorINS0_11CopyableAnyENS7_ISL_EEEERKSG_:
  104|      2|      : domain_(std::move(domain)),
  105|      2|        seeds_(std::move(seeds)),
  106|      2|        seed_provider_(seed_provider) {}
_ZNK8fuzztest8internal13FixtureDriverINS_6DomainINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEEENS0_9NoFixtureEPFvRKSA_bSF_EPvE4TestEONS0_11MoveOnlyAnyE:
  291|  20.5k|  void Test(MoveOnlyAny&& args_untyped) const override {
  292|  20.5k|    std::apply(
  293|  20.5k|        [&](auto&&... args) {
  294|  20.5k|          target_function_(ForceVectorForStringView<Args>(std::move(args))...);
  295|  20.5k|        },
  296|  20.5k|        args_untyped.GetAs<value_type_t<DomainT>>());
  297|  20.5k|  }
_ZZNK8fuzztest8internal13FixtureDriverINS_6DomainINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEEENS0_9NoFixtureEPFvRKSA_bSF_EPvE4TestEONS0_11MoveOnlyAnyEENKUlDpOT_E_clIJRSA_RbRSF_EEEDaST_:
  293|  20.5k|        [&](auto&&... args) {
  294|  20.5k|          target_function_(ForceVectorForStringView<Args>(std::move(args))...);
  295|  20.5k|        },
_ZN8fuzztest8internal24ForceVectorForStringViewIRKNSt3__112basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEES8_EEDcOT0_:
  181|  20.5k|decltype(auto) ForceVectorForStringView(Src&& src) {
  182|       |  // We only do this when Src is a std::string. If it's a string view it is
  183|       |  // handled by the string view domain itself.
  184|       |  if constexpr (std::is_same_v<void(std::decay_t<Dest>, std::decay_t<Src>),
  185|       |                               void(std::string_view, std::string)>) {
  186|       |    return ForceVector{std::vector<char>(src.begin(), src.end())};
  187|  20.5k|  } else {
  188|  20.5k|    return std::forward<Src>(src);
  189|  20.5k|  }
  190|  20.5k|}
_ZN8fuzztest8internal24ForceVectorForStringViewIbbEEDcOT0_:
  181|  20.5k|decltype(auto) ForceVectorForStringView(Src&& src) {
  182|       |  // We only do this when Src is a std::string. If it's a string view it is
  183|       |  // handled by the string view domain itself.
  184|       |  if constexpr (std::is_same_v<void(std::decay_t<Dest>, std::decay_t<Src>),
  185|       |                               void(std::string_view, std::string)>) {
  186|       |    return ForceVector{std::vector<char>(src.begin(), src.end())};
  187|  20.5k|  } else {
  188|  20.5k|    return std::forward<Src>(src);
  189|  20.5k|  }
  190|  20.5k|}
_ZN8fuzztest8internal24ForceVectorForStringViewINSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEES7_EEDcOT0_:
  181|  20.5k|decltype(auto) ForceVectorForStringView(Src&& src) {
  182|       |  // We only do this when Src is a std::string. If it's a string view it is
  183|       |  // handled by the string view domain itself.
  184|       |  if constexpr (std::is_same_v<void(std::decay_t<Dest>, std::decay_t<Src>),
  185|       |                               void(std::string_view, std::string)>) {
  186|       |    return ForceVector{std::vector<char>(src.begin(), src.end())};
  187|  20.5k|  } else {
  188|  20.5k|    return std::forward<Src>(src);
  189|  20.5k|  }
  190|  20.5k|}
_ZNK8fuzztest8internal18TypedFixtureDriverINSt3__15tupleIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEPvE10GetDomainsEv:
  120|      2|  UntypedDomain GetDomains() const final { return domain_; }

_ZNK8fuzztest8internal17GTest_TestAdaptor26GetFuzzTestsInCurrentShardEv:
   23|      2|std::vector<std::string> GTest_TestAdaptor::GetFuzzTestsInCurrentShard() const {
   24|      2|  std::vector<std::string> result;
   25|      4|  for (const auto* test : GetRegisteredTests()) {
  ------------------
  |  Branch (25:25): [True: 4, False: 2]
  ------------------
   26|      4|    if (!test->should_run()) continue;
  ------------------
  |  Branch (26:9): [True: 2, False: 2]
  ------------------
   27|      2|    if (test->is_in_another_shard()) continue;
  ------------------
  |  Branch (27:9): [True: 0, False: 2]
  ------------------
   28|      4|    for (const auto& fuzztest : configuration_.fuzz_tests) {
  ------------------
  |  Branch (28:31): [True: 4, False: 0]
  ------------------
   29|      4|      if (fuzztest ==
  ------------------
  |  Branch (29:11): [True: 2, False: 2]
  ------------------
   30|      4|          absl::StrCat(test->test_suite_name(), ".", test->name())) {
   31|      2|        result.push_back(fuzztest);
   32|      2|        break;
   33|      2|      }
   34|      4|    }
   35|      2|  }
   36|      2|  return result;
   37|      2|}
_ZN8fuzztest8internal30RegisterFuzzTestsAsGoogleTestsEPiPPPcRKNS0_13ConfigurationE:
   82|      2|                                    const Configuration& configuration) {
   83|      2|  ::fuzztest::internal::ForEachTest([&](auto& test) {
   84|      2|    if (test.uses_fixture()) {
   85|      2|      RegisterTests<::fuzztest::internal::GTest_TestAdaptor>(argc, argv, test,
   86|      2|                                                             configuration);
   87|      2|    } else {
   88|      2|      RegisterTests<::testing::Test>(argc, argv, test, configuration);
   89|      2|    }
   90|      2|  });
   91|       |
   92|      2|  ::testing::UnitTest::GetInstance()->listeners().Append(
   93|      2|      new ::fuzztest::internal::GTest_EventListener<
   94|      2|          ::testing::EmptyTestEventListener, ::testing::TestPartResult>());
   95|      2|}
_ZN8fuzztest8internal28FuzzTestListingModeValidator26RegisterGoogleTestListenerEv:
  151|      2|void FuzzTestListingModeValidator::RegisterGoogleTestListener() {
  152|      2|  testing::UnitTest::GetInstance()->listeners().Append(
  153|      2|      new ValidatorProxyListener(this));
  154|      2|}
_ZN8fuzztest8internal44SetFuzzTestListingModeValidatorForGoogleTestEb:
  156|      2|void SetFuzzTestListingModeValidatorForGoogleTest(bool listing_mode) {
  157|      2|  FuzzTestListingModeValidator::GetInstance().set_listing_mode(listing_mode);
  158|      2|}
_ZN8fuzztest8internal18GetRegisteredTestsEv:
  160|      2|std::vector<const testing::TestInfo*> GetRegisteredTests() {
  161|      2|  std::vector<const testing::TestInfo*> result;
  162|      2|  auto& unit_test = *testing::UnitTest::GetInstance();
  163|      6|  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
  ------------------
  |  Branch (163:19): [True: 4, False: 2]
  ------------------
  164|      8|    for (int j = 0; j < unit_test.GetTestSuite(i)->total_test_count(); ++j) {
  ------------------
  |  Branch (164:21): [True: 4, False: 4]
  ------------------
  165|      4|      result.push_back(unit_test.GetTestSuite(i)->GetTestInfo(j));
  166|      4|    }
  167|      4|  }
  168|      2|  return result;
  169|      2|}
_ZN8fuzztest8internal22ValidatorProxyListenerC2EPNS0_28FuzzTestListingModeValidatorE:
  146|      2|      : validator_(validator) {}
_ZN8fuzztest8internal22ValidatorProxyListener18OnTestProgramStartERKN7testing8UnitTestE:
  138|      2|  void OnTestProgramStart(const testing::UnitTest& unit_test) override {
  139|      2|    validator_->Run();
  140|      2|  }
_ZN8fuzztest8internal28FuzzTestListingModeValidator3RunEv:
  101|      2|  void Run() {
  102|      2|    if (listing_mode_) {
  ------------------
  |  Branch (102:9): [True: 0, False: 2]
  ------------------
  103|       |      // Since InitFuzzTest calls std::exit after listing the fuzz tests, we
  104|       |      // would not reach here if InitFuzzTest is called before.
  105|       |      absl::FPrintF(stderr,
  106|      0|                    "[!] --" FUZZTEST_FLAG_PREFIX
  107|      0|                    "list_fuzz_tests not handled by InitFuzzTest - fuzz tests "
  108|      0|                    "would not be listed even if defined.\n");
  109|      0|      std::exit(0);
  110|      0|    }
  111|      2|  }
_ZN8fuzztest8internal28FuzzTestListingModeValidator11GetInstanceEv:
  115|      2|  static FuzzTestListingModeValidator& GetInstance() {
  116|      2|    static auto* instance = [] {
  117|      2|      static_assert(
  118|      2|          std::is_trivially_destructible_v<FuzzTestListingModeValidator>);
  119|      2|      static FuzzTestListingModeValidator instance;
  120|      2|      instance.RegisterGoogleTestListener();
  121|      2|      return &instance;
  122|      2|    }();
  123|      2|    return *instance;
  124|      2|  }
_ZZN8fuzztest8internal28FuzzTestListingModeValidator11GetInstanceEvENKUlvE_clEv:
  116|      2|    static auto* instance = [] {
  117|      2|      static_assert(
  118|      2|          std::is_trivially_destructible_v<FuzzTestListingModeValidator>);
  119|      2|      static FuzzTestListingModeValidator instance;
  120|      2|      instance.RegisterGoogleTestListener();
  121|      2|      return &instance;
  122|      2|    }();
_ZN8fuzztest8internal28FuzzTestListingModeValidator16set_listing_modeEb:
  113|      2|  void set_listing_mode(bool listing_mode) { listing_mode_ = listing_mode; }
googletest_adaptor.cc:_ZZN8fuzztest8internal30RegisterFuzzTestsAsGoogleTestsEPiPPPcRKNS0_13ConfigurationEENK3$_0clINS0_8FuzzTestEEEDaRT_:
   83|      4|  ::fuzztest::internal::ForEachTest([&](auto& test) {
   84|      4|    if (test.uses_fixture()) {
  ------------------
  |  Branch (84:9): [True: 0, False: 4]
  ------------------
   85|      0|      RegisterTests<::fuzztest::internal::GTest_TestAdaptor>(argc, argv, test,
   86|      0|                                                             configuration);
   87|      4|    } else {
   88|      4|      RegisterTests<::testing::Test>(argc, argv, test, configuration);
   89|      4|    }
   90|      4|  });
googletest_adaptor.cc:_ZN8fuzztest8internal12_GLOBAL__N_113RegisterTestsIN7testing4TestEEEvPiPPPcRNS0_8FuzzTestERKNS0_13ConfigurationE:
   73|      4|                   const Configuration& configuration) {
   74|      4|  RegisterFuzzTestAsGTest<T>(argc, argv, test, configuration);
   75|      4|  RegisterSeparateRegressionTestForEachCrashingInput<T>(argc, argv, test,
   76|      4|                                                        configuration);
   77|      4|}
googletest_adaptor.cc:_ZN8fuzztest8internal12_GLOBAL__N_123RegisterFuzzTestAsGTestIN7testing4TestEEEvPiPPPcRNS0_8FuzzTestERKNS0_13ConfigurationENSt3__117basic_string_viewIcNSE_11char_traitsIcEEEE:
   43|      4|                             absl::string_view suffix = "") {
   44|      4|  auto fixture_factory = [argc, argv, &test,
   45|      4|                          configuration = configuration]() -> T* {
   46|      4|    return new ::fuzztest::internal::GTest_TestAdaptor(test, argc, argv,
   47|      4|                                                       configuration);
   48|      4|  };
   49|      4|  const std::string test_name_with_suffix =
   50|      4|      absl::StrCat(test.test_name(), suffix);
   51|      4|  ::testing::RegisterTest(
   52|      4|      test.suite_name().c_str(), test_name_with_suffix.c_str(), nullptr,
   53|      4|      nullptr, test.file().c_str(), test.line(), std::move(fixture_factory));
   54|      4|}
googletest_adaptor.cc:_ZZN8fuzztest8internal12_GLOBAL__N_123RegisterFuzzTestAsGTestIN7testing4TestEEEvPiPPPcRNS0_8FuzzTestERKNS0_13ConfigurationENSt3__117basic_string_viewIcNSE_11char_traitsIcEEEEENKUlvE_clEv:
   45|      2|                          configuration = configuration]() -> T* {
   46|      2|    return new ::fuzztest::internal::GTest_TestAdaptor(test, argc, argv,
   47|      2|                                                       configuration);
   48|      2|  };
googletest_adaptor.cc:_ZN8fuzztest8internal12_GLOBAL__N_150RegisterSeparateRegressionTestForEachCrashingInputIN7testing4TestEEEvPiPPPcRNS0_8FuzzTestERKNS0_13ConfigurationE:
   59|      4|    const Configuration& configuration) {
   60|      4|  CorpusDatabase corpus_database(configuration);
   61|      4|  for (const std::string& input :
  ------------------
  |  Branch (61:33): [True: 0, False: 4]
  ------------------
   62|      4|       corpus_database.GetCrashingInputsIfAny(test.full_name())) {
   63|      0|    Configuration updated_configuration = configuration;
   64|      0|    updated_configuration.crashing_input_to_reproduce = input;
   65|      0|    const std::string suffix =
   66|      0|        absl::StrCat("/Regression/", std::string(Basename(input)));
   67|      0|    RegisterFuzzTestAsGTest<T>(argc, argv, test, updated_configuration, suffix);
   68|      0|  }
   69|      4|}

_ZN8fuzztest8internal17GTest_TestAdaptorC2ERNS0_8FuzzTestEPiPPPcNS0_13ConfigurationE:
   35|      2|      : test_(test),
   36|      2|        argc_(argc),
   37|      2|        argv_(argv),
   38|      2|        configuration_(std::move(configuration)) {}
_ZN8fuzztest8internal17GTest_TestAdaptor8TestBodyEv:
   40|      2|  void TestBody() override {
   41|      2|    auto test = test_.make();
   42|      2|    configuration_.fuzz_tests_in_current_shard = GetFuzzTestsInCurrentShard();
   43|      2|    if (Runtime::instance().run_mode() == RunMode::kUnitTest) {
  ------------------
  |  Branch (43:9): [True: 0, False: 2]
  ------------------
   44|       |      // In "bug reproduction" mode, sometimes we need to reproduce multiple
   45|       |      // bugs, i.e., run multiple tests that lead to a crash.
   46|      0|      bool needs_subprocess = false;
   47|      0|#ifdef GTEST_HAS_DEATH_TEST
   48|      0|      needs_subprocess =
   49|      0|          configuration_.crashing_input_to_reproduce.has_value() &&
  ------------------
  |  Branch (49:11): [True: 0, False: 0]
  ------------------
   50|      0|          (
   51|       |              // When only a single test runs, it's okay to crash the process on
   52|       |              // error, as we don't need to run other tests.
   53|      0|              testing::UnitTest::GetInstance()->test_to_run_count() > 1 ||
  ------------------
  |  Branch (53:15): [True: 0, False: 0]
  ------------------
   54|       |              // EXPECT_EXIT is required in the death-test subprocess, but in
   55|       |              // the subprocess there's only one test to run.
   56|      0|              testing::internal::InDeathTestChild());
  ------------------
  |  Branch (56:15): [True: 0, False: 0]
  ------------------
   57|      0|#endif
   58|      0|      if (needs_subprocess) {
  ------------------
  |  Branch (58:11): [True: 0, False: 0]
  ------------------
   59|      0|        configuration_.preprocess_crash_reproducing = [] {
   60|       |          // EXPECT_EXIT disables event forwarding in gtest and as a result,
   61|       |          // EXPECT/ASSERT-s are disabled. Here, we overwrite this option.
   62|      0|          testing::UnitTest::GetInstance()->listeners().SuppressEventForwarding(
   63|      0|              false);
   64|      0|        };
   65|       |        // `RunInUnitTestMode` is supposed to fail and we wish to show the
   66|       |        // failure to the user. Directly running the test would terminate the
   67|       |        // process and using `EXPECT_DEATH` causes the test to pass. We use
   68|       |        // `EXPECT_EXIT` so that the test exit unsuccessfully, meaning that the
   69|       |        // test below fails without terminating the process.
   70|      0|#ifdef GTEST_HAS_DEATH_TEST
   71|      0|        EXPECT_EXIT(
  ------------------
  |  Branch (71:9): [True: 0, False: 0]
  |  Branch (71:9): [True: 0, False: 0]
  |  Branch (71:9): [True: 0, False: 0]
  |  Branch (71:9): [True: 0, False: 0]
  |  Branch (71:9): [True: 0, False: 0]
  |  Branch (71:9): [True: 0, False: 0]
  |  Branch (71:9): [True: 0, False: 0]
  |  Branch (71:9): [True: 0, False: 0]
  |  Branch (71:9): [True: 0, False: 0]
  |  Branch (71:9): [True: 0, False: 0]
  ------------------
   72|      0|            (test->RunInUnitTestMode(configuration_),
   73|      0|             void(
   74|      0|                 R"( FuzzTest failure! Please see 'actual message' below for the crash report. )"),
   75|      0|             std::exit(0)),
   76|      0|            ::testing::ExitedWithCode(0), "");
   77|       |#else
   78|       |        EXPECT_TRUE(false) << "Death test is not supported.";
   79|       |#endif
   80|      0|      } else {
   81|      0|        test->RunInUnitTestMode(configuration_);
   82|      0|      }
   83|      2|    } else {
   84|       |      // TODO(b/245753736): Consider using `tolerate_failure` when FuzzTest can
   85|       |      // tolerate crashes in fuzzing mode.
   86|      6|      ASSERT_EQ(0, test->RunInFuzzingMode(argc_, argv_, configuration_))
  ------------------
  |  Branch (86:7): [True: 0, False: 2]
  |  Branch (86:7): [True: 0, False: 2]
  |  Branch (86:7): [True: 2, False: 0]
  ------------------
   87|      6|          << "Fuzzing failure.";
   88|      2|    }
   89|      2|  }

_ZN8fuzztest8internal8BasenameENSt3__117basic_string_viewIcNS1_11char_traitsIcEEEE:
  183|      2|absl::string_view Basename(absl::string_view filename) {
  184|      2|  auto last_slash_pos = filename.find_last_of("/\\");
  185|       |
  186|      2|  return last_slash_pos == absl::string_view::npos
  ------------------
  |  Branch (186:10): [True: 0, False: 2]
  ------------------
  187|      2|             ? filename
  188|      2|             : filename.substr(last_slash_pos + 1);
  189|      2|}

_ZN8fuzztest8internal9GetStderrEv:
  125|      2|FILE* GetStderr() {
  126|      2|  absl::MutexLock lock(&stderr_file_guard_);
  127|      2|  if (!stderr_file_) {
  ------------------
  |  Branch (127:7): [True: 2, False: 0]
  ------------------
  128|       |    stderr_file_ = stderr;
  129|      2|  }
  130|      2|  return stderr_file_;
  131|      2|}

_ZN8fuzztest8internal11CheckIsSameINS0_23ElementOfImplCorpusTypeES2_EEvv:
  806|     20|constexpr void CheckIsSame() {
  807|     20|  static_assert(std::is_same_v<T, U>);
  808|     20|}
_ZN8fuzztest8internal11CheckIsSameIjjEEvv:
  806|     16|constexpr void CheckIsSame() {
  807|     16|  static_assert(std::is_same_v<T, U>);
  808|     16|}
_ZN8fuzztest8internal11CheckIsSameIccEEvv:
  806|      8|constexpr void CheckIsSame() {
  807|      8|  static_assert(std::is_same_v<T, U>);
  808|      8|}
_ZN8fuzztest8internal11CheckIsSameINSt3__112basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEES8_EEvv:
  806|      8|constexpr void CheckIsSame() {
  807|      8|  static_assert(std::is_same_v<T, U>);
  808|      8|}
_ZN8fuzztest8internal11CheckIsSameImmEEvv:
  806|      8|constexpr void CheckIsSame() {
  807|      8|  static_assert(std::is_same_v<T, U>);
  808|      8|}
_ZN8fuzztest8internal11CheckIsSameIbbEEvv:
  806|     40|constexpr void CheckIsSame() {
  807|     40|  static_assert(std::is_same_v<T, U>);
  808|     40|}
_ZN8fuzztest8internal11CheckIsSameIiiEEvv:
  806|     12|constexpr void CheckIsSame() {
  807|     12|  static_assert(std::is_same_v<T, U>);
  808|     12|}
_ZN8fuzztest8internal11CheckIsSameI15avifCodecChoiceS2_EEvv:
  806|      4|constexpr void CheckIsSame() {
  807|      4|  static_assert(std::is_same_v<T, U>);
  808|      4|}
_ZN8fuzztest8internal11CheckIsSameI17avifDecoderSourceS2_EEvv:
  806|      4|constexpr void CheckIsSame() {
  807|      4|  static_assert(std::is_same_v<T, U>);
  808|      4|}
_ZN8fuzztest8internal11CheckIsSameINSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEES7_EEvv:
  806|      8|constexpr void CheckIsSame() {
  807|      8|  static_assert(std::is_same_v<T, U>);
  808|      8|}
_ZN8fuzztest8internal11CheckIsSameINSt3__15tupleIJNS0_23ElementOfImplCorpusTypeEiS4_bbbbS4_S4_S4_jEEES5_EEvv:
  806|      4|constexpr void CheckIsSame() {
  807|      4|  static_assert(std::is_same_v<T, U>);
  808|      4|}
_ZN8fuzztest8internal14ApplyIndexImplIJLm0ELm1ELm2ELm3ELm4ELm5ELm6ELm7ELm8ELm9ELm10EEZNKS0_7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS9_EENS0_11InRangeImplIiEENSD_ISA_EENS0_13ArbitraryImplIbvEESJ_SJ_SJ_NSD_ImEESK_NSD_IiEENS0_24BitFlagCombinationOfImplIjEEEE19ValidateCorpusValueERKNS3_5tupleIJNS0_23ElementOfImplCorpusTypeEiSQ_bbbbSQ_SQ_SQ_jEEEEUlDpT_E_EEDaT0_NS3_16integer_sequenceImJXspT_EEEE:
   39|  20.6k|constexpr auto ApplyIndexImpl(F f, std::index_sequence<I...>) {
   40|  20.6k|  return f(std::integral_constant<size_t, I>{}...);
   41|  20.6k|}
_ZN8fuzztest8internal10ApplyIndexILm11EZNKS0_7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS9_EENS0_11InRangeImplIiEENSD_ISA_EENS0_13ArbitraryImplIbvEESJ_SJ_SJ_NSD_ImEESK_NSD_IiEENS0_24BitFlagCombinationOfImplIjEEEE19ValidateCorpusValueERKNS3_5tupleIJNS0_23ElementOfImplCorpusTypeEiSQ_bbbbSQ_SQ_SQ_jEEEEUlDpT_E_EEDaT0_:
   54|  20.6k|constexpr auto ApplyIndex(F f) {
   55|  20.6k|  return ApplyIndexImpl(f, std::make_index_sequence<N>{});
   56|  20.6k|}
_ZN8fuzztest8internal14ApplyIndexImplIJLm0ELm1ELm2ELm3ELm4ELm5ELm6ELm7ELm8ELm9ELm10EEZNS0_20ParseWithDomainTupleIJNS0_13ElementOfImplI15avifCodecChoiceEENS0_11InRangeImplIiEENS3_I17avifDecoderSourceEENS0_13ArbitraryImplIbvEESB_SB_SB_NS3_ImEESC_NS3_IiEENS0_24BitFlagCombinationOfImplIjEEEEENSt3__18optionalINSG_5tupleIJDpNT_11corpus_typeEEEEEERKNSI_IJDpSJ_EEERKNS0_8IRObjectEiEUlSO_E_EEDaT0_NSG_16integer_sequenceImJXspT_EEEE:
   39|  20.6k|constexpr auto ApplyIndexImpl(F f, std::index_sequence<I...>) {
   40|  20.6k|  return f(std::integral_constant<size_t, I>{}...);
   41|  20.6k|}
_ZN8fuzztest8internal10ApplyIndexILm11EZNS0_20ParseWithDomainTupleIJNS0_13ElementOfImplI15avifCodecChoiceEENS0_11InRangeImplIiEENS3_I17avifDecoderSourceEENS0_13ArbitraryImplIbvEESB_SB_SB_NS3_ImEESC_NS3_IiEENS0_24BitFlagCombinationOfImplIjEEEEENSt3__18optionalINSG_5tupleIJDpNT_11corpus_typeEEEEEERKNSI_IJDpSJ_EEERKNS0_8IRObjectEiEUlSO_E_EEDaT0_:
   54|  20.6k|constexpr auto ApplyIndex(F f) {
   55|  20.6k|  return ApplyIndexImpl(f, std::make_index_sequence<N>{});
   56|  20.6k|}
_ZN8fuzztest8internal14ApplyIndexImplIJLm0ELm1ELm2ELm3ELm4ELm5ELm6ELm7ELm8ELm9ELm10EEZNKS0_7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS9_EENS0_11InRangeImplIiEENSD_ISA_EENS0_13ArbitraryImplIbvEESJ_SJ_SJ_NSD_ImEESK_NSD_IiEENS0_24BitFlagCombinationOfImplIjEEEE8GetValueERKNS3_5tupleIJNS0_23ElementOfImplCorpusTypeEiSQ_bbbbSQ_SQ_SQ_jEEEEUlDpT_E_EEDaT0_NS3_16integer_sequenceImJXspT_EEEE:
   39|  20.5k|constexpr auto ApplyIndexImpl(F f, std::index_sequence<I...>) {
   40|  20.5k|  return f(std::integral_constant<size_t, I>{}...);
   41|  20.5k|}
_ZN8fuzztest8internal10ApplyIndexILm11EZNKS0_7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplIS9_EENS0_11InRangeImplIiEENSD_ISA_EENS0_13ArbitraryImplIbvEESJ_SJ_SJ_NSD_ImEESK_NSD_IiEENS0_24BitFlagCombinationOfImplIjEEEE8GetValueERKNS3_5tupleIJNS0_23ElementOfImplCorpusTypeEiSQ_bbbbSQ_SQ_SQ_jEEEEUlDpT_E_EEDaT0_:
   54|  20.5k|constexpr auto ApplyIndex(F f) {
   55|  20.5k|  return ApplyIndexImpl(f, std::make_index_sequence<N>{});
   56|  20.5k|}
_ZN8fuzztest8internal11CheckIsSameINSt3__15tupleIJNS3_IJNS0_23ElementOfImplCorpusTypeEiS4_bbbbS4_S4_S4_jEEEjEEES6_EEvv:
  806|      4|constexpr void CheckIsSame() {
  807|      4|  static_assert(std::is_same_v<T, U>);
  808|      4|}
_ZN8fuzztest8internal14ApplyIndexImplIJLm0ELm1EEZNKS0_7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEES8_jEJNS2_IPFS8_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISB_EENS0_11InRangeImplIiEENSF_ISC_EENS0_13ArbitraryImplIbvEESL_SL_SL_NSF_ImEESM_NSF_IiEENS0_24BitFlagCombinationOfImplIjEEEEESP_EE19ValidateCorpusValueERKNS3_5tupleIJNSS_IJNS0_23ElementOfImplCorpusTypeEiST_bbbbST_ST_ST_jEEEjEEEEUlDpT_E_EEDaT0_NS3_16integer_sequenceImJXspT_EEEE:
   39|  20.6k|constexpr auto ApplyIndexImpl(F f, std::index_sequence<I...>) {
   40|  20.6k|  return f(std::integral_constant<size_t, I>{}...);
   41|  20.6k|}
_ZN8fuzztest8internal10ApplyIndexILm2EZNKS0_7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEES8_jEJNS2_IPFS8_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISB_EENS0_11InRangeImplIiEENSF_ISC_EENS0_13ArbitraryImplIbvEESL_SL_SL_NSF_ImEESM_NSF_IiEENS0_24BitFlagCombinationOfImplIjEEEEESP_EE19ValidateCorpusValueERKNS3_5tupleIJNSS_IJNS0_23ElementOfImplCorpusTypeEiST_bbbbST_ST_ST_jEEEjEEEEUlDpT_E_EEDaT0_:
   54|  20.6k|constexpr auto ApplyIndex(F f) {
   55|  20.6k|  return ApplyIndexImpl(f, std::make_index_sequence<N>{});
   56|  20.6k|}
_ZN8fuzztest8internal14ApplyIndexImplIJLm0ELm1EEZNS0_20ParseWithDomainTupleIJNS0_7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISA_EENS0_11InRangeImplIiEENSE_ISB_EENS0_13ArbitraryImplIbvEESK_SK_SK_NSE_ImEESL_NSE_IiEENS0_24BitFlagCombinationOfImplIjEEEEESO_EEENS4_8optionalINS4_5tupleIJDpNT_11corpus_typeEEEEEERKNSR_IJDpSS_EEERKNS0_8IRObjectEiEUlSX_E_EEDaT0_NS4_16integer_sequenceImJXspT_EEEE:
   39|  20.6k|constexpr auto ApplyIndexImpl(F f, std::index_sequence<I...>) {
   40|  20.6k|  return f(std::integral_constant<size_t, I>{}...);
   41|  20.6k|}
_ZN8fuzztest8internal10ApplyIndexILm2EZNS0_20ParseWithDomainTupleIJNS0_7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEE15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISA_EENS0_11InRangeImplIiEENSE_ISB_EENS0_13ArbitraryImplIbvEESK_SK_SK_NSE_ImEESL_NSE_IiEENS0_24BitFlagCombinationOfImplIjEEEEESO_EEENS4_8optionalINS4_5tupleIJDpNT_11corpus_typeEEEEEERKNSR_IJDpSS_EEERKNS0_8IRObjectEiEUlSX_E_EEDaT0_:
   54|  20.6k|constexpr auto ApplyIndex(F f) {
   55|  20.6k|  return ApplyIndexImpl(f, std::make_index_sequence<N>{});
   56|  20.6k|}
_ZN8fuzztest8internal14ApplyIndexImplIJLm0ELm1EEZNKS0_7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEES8_jEJNS2_IPFS8_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISB_EENS0_11InRangeImplIiEENSF_ISC_EENS0_13ArbitraryImplIbvEESL_SL_SL_NSF_ImEESM_NSF_IiEENS0_24BitFlagCombinationOfImplIjEEEEESP_EE8GetValueERKNS3_5tupleIJNSS_IJNS0_23ElementOfImplCorpusTypeEiST_bbbbST_ST_ST_jEEEjEEEEUlDpT_E_EEDaT0_NS3_16integer_sequenceImJXspT_EEEE:
   39|  20.5k|constexpr auto ApplyIndexImpl(F f, std::index_sequence<I...>) {
   40|  20.5k|  return f(std::integral_constant<size_t, I>{}...);
   41|  20.5k|}
_ZN8fuzztest8internal10ApplyIndexILm2EZNKS0_7MapImplIPFNSt3__110unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEES8_jEJNS2_IPFS8_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISB_EENS0_11InRangeImplIiEENSF_ISC_EENS0_13ArbitraryImplIbvEESL_SL_SL_NSF_ImEESM_NSF_IiEENS0_24BitFlagCombinationOfImplIjEEEEESP_EE8GetValueERKNS3_5tupleIJNSS_IJNS0_23ElementOfImplCorpusTypeEiST_bbbbST_ST_ST_jEEEjEEEEUlDpT_E_EEDaT0_:
   54|  20.5k|constexpr auto ApplyIndex(F f) {
   55|  20.5k|  return ApplyIndexImpl(f, std::make_index_sequence<N>{});
   56|  20.5k|}
_ZN8fuzztest8internal10ApplyIndexILm3EZNKS0_15AggregateOfImplINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplISA_NS0_13ArbitraryImplIcvEEEENSJ_IbvEENS0_7MapImplIPFSF_SF_jEJNSN_IPFSF_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISQ_EENS0_11InRangeImplIiEENSU_ISR_EESM_SM_SM_SM_NSU_ImEESZ_NSU_IiEENS0_24BitFlagCombinationOfImplIjEEEEES12_EEEEE19ValidateCorpusValueERKNS4_IJSA_bNS4_IJNS4_IJNS0_23ElementOfImplCorpusTypeEiS16_bbbbS16_S16_S16_jEEEjEEEEEEEUlDpT_E_EEDaT0_:
   54|  20.6k|constexpr auto ApplyIndex(F f) {
   55|  20.6k|  return ApplyIndexImpl(f, std::make_index_sequence<N>{});
   56|  20.6k|}
_ZN8fuzztest8internal14ApplyIndexImplIJLm0ELm1ELm2EEZNKS0_15AggregateOfImplINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplISA_NS0_13ArbitraryImplIcvEEEENSJ_IbvEENS0_7MapImplIPFSF_SF_jEJNSN_IPFSF_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISQ_EENS0_11InRangeImplIiEENSU_ISR_EESM_SM_SM_SM_NSU_ImEESZ_NSU_IiEENS0_24BitFlagCombinationOfImplIjEEEEES12_EEEEE19ValidateCorpusValueERKNS4_IJSA_bNS4_IJNS4_IJNS0_23ElementOfImplCorpusTypeEiS16_bbbbS16_S16_S16_jEEEjEEEEEEEUlDpT_E_EEDaT0_NS3_16integer_sequenceImJXspT_EEEE:
   39|  20.6k|constexpr auto ApplyIndexImpl(F f, std::index_sequence<I...>) {
   40|  20.6k|  return f(std::integral_constant<size_t, I>{}...);
   41|  20.6k|}
_ZN8fuzztest8internal10ApplyIndexILm3EZNS0_20ParseWithDomainTupleIJNS0_23SequenceContainerOfImplINSt3__112basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEENS0_13ArbitraryImplIcvEEEENSB_IbvEENS0_7MapImplIPFNS4_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEESK_jEJNSF_IPFSK_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISN_EENS0_11InRangeImplIiEENSR_ISO_EESE_SE_SE_SE_NSR_ImEESW_NSR_IiEENS0_24BitFlagCombinationOfImplIjEEEEESZ_EEEEEENS4_8optionalINS4_5tupleIJDpNT_11corpus_typeEEEEEERKNS13_IJDpS14_EEERKNS0_8IRObjectEiEUlS19_E_EEDaT0_:
   54|  20.6k|constexpr auto ApplyIndex(F f) {
   55|  20.6k|  return ApplyIndexImpl(f, std::make_index_sequence<N>{});
   56|  20.6k|}
_ZN8fuzztest8internal14ApplyIndexImplIJLm0ELm1ELm2EEZNS0_20ParseWithDomainTupleIJNS0_23SequenceContainerOfImplINSt3__112basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEENS0_13ArbitraryImplIcvEEEENSB_IbvEENS0_7MapImplIPFNS4_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEESK_jEJNSF_IPFSK_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISN_EENS0_11InRangeImplIiEENSR_ISO_EESE_SE_SE_SE_NSR_ImEESW_NSR_IiEENS0_24BitFlagCombinationOfImplIjEEEEESZ_EEEEEENS4_8optionalINS4_5tupleIJDpNT_11corpus_typeEEEEEERKNS13_IJDpS14_EEERKNS0_8IRObjectEiEUlS19_E_EEDaT0_NS4_16integer_sequenceImJXspT_EEEE:
   39|  20.6k|constexpr auto ApplyIndexImpl(F f, std::index_sequence<I...>) {
   40|  20.6k|  return f(std::integral_constant<size_t, I>{}...);
   41|  20.6k|}
_ZN8fuzztest8internal10ApplyIndexILm3EZNKS0_15AggregateOfImplINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplISA_NS0_13ArbitraryImplIcvEEEENSJ_IbvEENS0_7MapImplIPFSF_SF_jEJNSN_IPFSF_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISQ_EENS0_11InRangeImplIiEENSU_ISR_EESM_SM_SM_SM_NSU_ImEESZ_NSU_IiEENS0_24BitFlagCombinationOfImplIjEEEEES12_EEEEE8GetValueERKNS4_IJSA_bNS4_IJNS4_IJNS0_23ElementOfImplCorpusTypeEiS16_bbbbS16_S16_S16_jEEEjEEEEEEEUlDpT_E_EEDaT0_:
   54|  20.5k|constexpr auto ApplyIndex(F f) {
   55|  20.5k|  return ApplyIndexImpl(f, std::make_index_sequence<N>{});
   56|  20.5k|}
_ZN8fuzztest8internal14ApplyIndexImplIJLm0ELm1ELm2EEZNKS0_15AggregateOfImplINSt3__15tupleIJNS3_12basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEELNS0_23RequireCustomCorpusTypeE0EJNS0_23SequenceContainerOfImplISA_NS0_13ArbitraryImplIcvEEEENSJ_IbvEENS0_7MapImplIPFSF_SF_jEJNSN_IPFSF_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISQ_EENS0_11InRangeImplIiEENSU_ISR_EESM_SM_SM_SM_NSU_ImEESZ_NSU_IiEENS0_24BitFlagCombinationOfImplIjEEEEES12_EEEEE8GetValueERKNS4_IJSA_bNS4_IJNS4_IJNS0_23ElementOfImplCorpusTypeEiS16_bbbbS16_S16_S16_jEEEjEEEEEEEUlDpT_E_EEDaT0_NS3_16integer_sequenceImJXspT_EEEE:
   39|  20.5k|constexpr auto ApplyIndexImpl(F f, std::index_sequence<I...>) {
   40|  20.5k|  return f(std::integral_constant<size_t, I>{}...);
   41|  20.5k|}
_ZN8fuzztest8internal11CheckIsSameINSt3__15tupleIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEESF_EEvv:
  806|      4|constexpr void CheckIsSame() {
  807|      4|  static_assert(std::is_same_v<T, U>);
  808|      4|}
_ZN8fuzztest8internal11CheckIsSameINSt3__15tupleIJNS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS3_IJNS3_IJNS0_23ElementOfImplCorpusTypeEiSA_bbbbSA_SA_SA_jEEEjEEEEEESD_EEvv:
  806|      4|constexpr void CheckIsSame() {
  807|      4|  static_assert(std::is_same_v<T, U>);
  808|      4|}

_ZN8fuzztest18domain_implementor10PrintValueINS_8internal11InRangeImplIiEEEEvRKT_RKNS5_11corpus_typeEN4absl12lts_2024011613FormatRawSinkENS0_9PrintModeE:
   47|     36|                RawSink out, PrintMode mode) {
   48|     36|  auto printer = domain.GetPrinter();
   49|       |  if constexpr (internal::Requires<decltype(printer)>(
   50|       |                    [&](auto t) -> decltype(t.PrintCorpusValue(
   51|       |                                    corpus_value, out, mode)) {})) {
   52|       |    printer.PrintCorpusValue(corpus_value, out, mode);
   53|     36|  } else {
   54|     36|    printer.PrintUserValue(domain.GetValue(corpus_value), out, mode);
   55|     36|  }
   56|     36|}

_ZN8fuzztest15GetRegistrationIPFvRKNSt3__112basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEEbNS1_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEEDaS7_S7_S7_iT_:
  339|      4|                     TargetFunction target_function) {
  340|      4|  return ::fuzztest::internal::Registration<::fuzztest::internal::NoFixture,
  341|      4|                                            TargetFunction>(
  342|      4|      ::fuzztest::internal::BasicTestInfo{std::move(suite_name),
  343|      4|                                          std::move(test_name), std::move(file),
  344|      4|                                          line, false},
  345|      4|      target_function);
  346|      4|}
_ZN8fuzztest8internal12RegistrationINS0_9NoFixtureEPFvRKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEENS0_23DefaultRegistrationBaseIJS9_bSG_EEEPvEC2ENS0_13BasicTestInfoESI_:
  134|      4|      : test_info_(std::move(info)), target_function_(target_function) {}
_ZNO8fuzztest8internal12RegistrationINS0_9NoFixtureEPFvRKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEENS0_23DefaultRegistrationBaseIJS9_bSG_EEEPvE11WithDomainsIJNS0_23SequenceContainerOfImplIS9_NS0_13ArbitraryImplIcvEEEENSP_IbvEENS0_7MapImplIPFSG_SG_jEJNST_IPFSG_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISW_EENS0_11InRangeImplIiEENS10_ISX_EESS_SS_SS_SS_NS10_ImEES15_NS10_IiEENS0_24BitFlagCombinationOfImplIjEEEEES18_EEEEEEDaDpOT_:
  172|      4|  auto WithDomains(NewDomains&&... domains) && {
  173|      4|    return std::move(*this).WithDomains(
  174|      4|        TupleOf(std::forward<NewDomains>(domains)...));
  175|      4|  }
_ZNO8fuzztest8internal12RegistrationINS0_9NoFixtureEPFvRKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEENS0_23DefaultRegistrationBaseIJS9_bSG_EEEPvE11WithDomainsIJNS0_23SequenceContainerOfImplIS9_NS0_13ArbitraryImplIcvEEEENSP_IbvEENS0_7MapImplIPFSG_SG_jEJNST_IPFSG_15avifCodecChoicei17avifDecoderSourcebbbbjjjjEJNS0_13ElementOfImplISW_EENS0_11InRangeImplIiEENS10_ISX_EESS_SS_SS_SS_NS10_ImEES15_NS10_IiEENS0_24BitFlagCombinationOfImplIjEEEEES18_EEEEEEDaNS0_15AggregateOfImplINS3_5tupleIJDpNT_10value_typeEEEELNS0_23RequireCustomCorpusTypeE0EJDpS1D_EEE:
  149|      4|                       domain) && {
  150|      4|    static_assert(!Registration::kHasDomain,
  151|      4|                  "WithDomains can only be called once.");
  152|      4|    static_assert(!Registration::kHasSeeds,
  153|      4|                  "WithDomains can not be called after WithSeeds.");
  154|      4|    static_assert(!Registration::kHasSeedProvider,
  155|      4|                  "WithDomains can not be called after WithSeedProvider.");
  156|      4|    static_assert(
  157|      4|        Base::kNumArgs == sizeof...(NewDomains),
  158|      4|        "Number of domains specified in .WithDomains() does not match "
  159|      4|        "the number of function parameters.");
  160|      4|    using NewBase = RegistrationWithDomainsBase<value_type_t<NewDomains>...>;
  161|      4|    return Registration<Fixture, TargetFunction, NewBase, SeedProvider>(
  162|      4|        std::move(test_info_), target_function_, NewBase{std::move(domain)});
  163|      4|  }
_ZN8fuzztest8internal12RegistrationINS0_9NoFixtureEPFvRKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEENS0_27RegistrationWithDomainsBaseIJS9_bSG_EEEPvEC2ENS0_13BasicTestInfoESI_SK_:
  294|      4|      : Base(std::move(base)),
  295|      4|        test_info_(std::move(info)),
  296|      4|        target_function_(target_function) {}
_ZNK8fuzztest8internal27RegistrationWithDomainsBaseIJNSt3__112basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEE10GetDomainsEv:
   95|      4|  const auto& GetDomains() const { return domains_; }
_ZNK8fuzztest8internal12RegistrationINS0_9NoFixtureEPFvRKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEENS0_27RegistrationWithDomainsBaseIJS9_bSG_EEEPvE5seedsEv:
  272|      4|  std::vector<GenericDomainCorpusType> seeds() const {
  273|       |    if constexpr (Base::kHasSeeds) {
  274|       |      return this->seeds_;
  275|      4|    } else {
  276|      4|      return {};
  277|      4|    }
  278|      4|  }
_ZN8fuzztest8internal12RegistrationINS0_9NoFixtureEPFvRKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEbNS3_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEENS0_27RegistrationWithDomainsBaseIJS9_bSG_EEEPvE13seed_providerEv:
  280|      4|  SeedProvider seed_provider() {
  281|       |    if constexpr (Base::kHasSeedProvider) {
  282|       |      return std::move(this->seed_provider_);
  283|      4|    } else {
  284|      4|      return {};
  285|      4|    }
  286|      4|  }

_ZN8fuzztest8internal11ForEachTestEN4absl12lts_2024011611FunctionRefIFvRNS0_8FuzzTestEEEE:
   56|      8|void ForEachTest(absl::FunctionRef<void(FuzzTest&)> func) {
   57|     16|  for (auto& t : Regs()) func(t);
  ------------------
  |  Branch (57:16): [True: 16, False: 8]
  ------------------
   58|      8|}
_ZN8fuzztest8internal12RegisterImplENS0_13BasicTestInfoEN4absl12lts_2024011612AnyInvocableIKFNSt3__110unique_ptrINS0_14FuzzTestFuzzerENS5_14default_deleteIS7_EEEERKNS0_8FuzzTestEEEE:
   60|      4|void RegisterImpl(BasicTestInfo test_info, FuzzTestFuzzerFactory factory) {
   61|      4|  Regs().emplace_back(std::move(test_info), std::move(factory));
   62|      4|}
registry.cc:_ZN8fuzztest8internal12_GLOBAL__N_14RegsEv:
   30|     12|auto& Regs() {
   31|       |  // We use a deque because FuzzTest is not copyable/movable.
   32|     12|  static auto* reg = new std::deque<FuzzTest>;
   33|     12|  return *reg;
   34|     12|}

_ZN8fuzztest8internal17RegistrationTokenaSINS0_27RegistrationWithDomainsBaseIJNSt3__112basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEEbNS4_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEENS0_9NoFixtureEPFvRKSA_bSF_EPvEERS1_ONS0_12RegistrationIT0_T1_T_T2_EE:
   57|      4|      Registration<Fixture, TargetFunction, RegBase, SeedProvider>&& reg) {
   58|       |    if constexpr (std::is_base_of_v<FixtureWithExplicitSetUp, Fixture>) {
   59|       |      RegisterSetUpTearDownTestSuiteFunctions(reg.test_info_.suite_name,
   60|       |                                              &Fixture::SetUpTestSuite,
   61|       |                                              &Fixture::TearDownTestSuite);
   62|       |    }
   63|      4|    BasicTestInfo test_info = reg.test_info_;
   64|      4|    RegisterImpl(std::move(test_info),
   65|      4|                 GetFuzzTestFuzzerFactory(std::move(reg)));
   66|      4|    return *this;
   67|      4|  }
_ZN8fuzztest8internal17RegistrationToken24GetFuzzTestFuzzerFactoryINS0_27RegistrationWithDomainsBaseIJNSt3__112basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEEbNS4_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEENS0_9NoFixtureEPFvRKSA_bSF_EPvEEN4absl12lts_2024011612AnyInvocableIKFNSB_INS0_14FuzzTestFuzzerENS4_14default_deleteISQ_EEEERKNS0_8FuzzTestEEEEONS0_12RegistrationIT0_T1_T_T2_EE:
   72|      4|      Registration<Fixture, TargetFunction, RegBase, SeedProvider>&& reg) {
   73|       |#if defined(FUZZTEST_COMPATIBILITY_MODE) && defined(FUZZTEST_USE_CENTIPEDE)
   74|       |#error FuzzTest compatibility mode cannot work together with Centipede.
   75|       |#endif
   76|      4|#if defined(FUZZTEST_COMPATIBILITY_MODE)
   77|      4|    using FuzzerImpl = FuzzTestExternalEngineAdaptor;
   78|       |#elif defined(FUZZTEST_USE_CENTIPEDE)
   79|       |    using FuzzerImpl = CentipedeFuzzerAdaptor;
   80|       |#else
   81|       |    using FuzzerImpl = FuzzTestFuzzerImpl;
   82|       |#endif
   83|       |
   84|      4|    return [target_function = reg.target_function_, domain = reg.GetDomains(),
   85|      4|            seeds = reg.seeds(), seed_provider = reg.seed_provider()](
   86|      4|               const FuzzTest& test) -> std::unique_ptr<FuzzTestFuzzer> {
   87|      4|      return std::make_unique<FuzzerImpl>(
   88|      4|          test,
   89|      4|          std::make_unique<FixtureDriverImpl<decltype(domain), Fixture,
   90|      4|                                             TargetFunction, SeedProvider>>(
   91|      4|              target_function, domain, seeds, seed_provider));
   92|      4|    };
   93|      4|  }
_ZZN8fuzztest8internal17RegistrationToken24GetFuzzTestFuzzerFactoryINS0_27RegistrationWithDomainsBaseIJNSt3__112basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEEbNS4_10unique_ptrI11avifDecoderN4avif16UniquePtrDeleterEEEEEENS0_9NoFixtureEPFvRKSA_bSF_EPvEEN4absl12lts_2024011612AnyInvocableIKFNSB_INS0_14FuzzTestFuzzerENS4_14default_deleteISQ_EEEERKNS0_8FuzzTestEEEEONS0_12RegistrationIT0_T1_T_T2_EEENKUlSW_E_clESW_:
   86|      2|               const FuzzTest& test) -> std::unique_ptr<FuzzTestFuzzer> {
   87|      2|      return std::make_unique<FuzzerImpl>(
   88|      2|          test,
   89|      2|          std::make_unique<FixtureDriverImpl<decltype(domain), Fixture,
   90|      2|                                             TargetFunction, SeedProvider>>(
   91|      2|              target_function, domain, seeds, seed_provider));
   92|      2|    };

_ZN8fuzztest8internal7RuntimeC2Ev:
  226|      2|Runtime::Runtime() {
  227|      2|  if (const char* crash_metadata_path =
  228|      2|          std::getenv("FUZZTEST_CRASH_METADATA_PATH");
  229|      2|      crash_metadata_path != nullptr) {
  ------------------
  |  Branch (229:7): [True: 0, False: 2]
  ------------------
  230|      0|    RegisterCrashMetadataListener(
  231|      0|        [=](absl::string_view crash_type,
  232|      0|            absl::Span<const std::string> /*stack_frames*/) {
  233|      0|          WriteFile(crash_metadata_path, crash_type);
  234|      0|        });
  235|      0|  }
  236|      2|}
_ZN8fuzztest8internal7Runtime19CheckWatchdogLimitsEv:
  374|  20.5k|void Runtime::CheckWatchdogLimits() {
  375|       |  // Centipede runner has its own watchdog.
  376|  20.5k|#ifndef FUZZTEST_USE_CENTIPEDE
  377|  20.5k|  if (current_configuration_ == nullptr) return;
  ------------------
  |  Branch (377:7): [True: 20.5k, False: 0]
  ------------------
  378|      0|  const absl::Duration run_duration =
  379|      0|      clock_fn_() - current_iteration_start_time_;
  380|      0|  if (current_configuration_->time_limit_per_input > absl::ZeroDuration() &&
  ------------------
  |  Branch (380:7): [True: 0, False: 0]
  ------------------
  381|      0|      run_duration > current_configuration_->time_limit_per_input) {
  ------------------
  |  Branch (381:7): [True: 0, False: 0]
  ------------------
  382|      0|    absl::FPrintF(
  383|      0|        GetStderr(), "[!] Per-input timeout exceeded: %s > %s - aborting\n",
  384|      0|        absl::FormatDuration(run_duration),
  385|      0|        absl::FormatDuration(current_configuration_->time_limit_per_input));
  386|      0|    std::abort();
  387|      0|  }
  388|      0|  const size_t rss_usage = GetPeakRSSBytes();
  389|      0|  if (current_configuration_->rss_limit > 0 &&
  ------------------
  |  Branch (389:7): [True: 0, False: 0]
  ------------------
  390|      0|      rss_usage > current_configuration_->rss_limit) {
  ------------------
  |  Branch (390:7): [True: 0, False: 0]
  ------------------
  391|      0|    absl::FPrintF(GetStderr(),
  392|      0|                  "[!] RSS limit exceeded: %zu > %zu (bytes) - aborting\n",
  393|      0|                  rss_usage, current_configuration_->rss_limit);
  394|      0|    std::abort();
  395|      0|  }
  396|      0|#endif
  397|      0|}
_ZN8fuzztest8internal7Runtime18OnTestIterationEndEv:
  399|  20.5k|void Runtime::OnTestIterationEnd() {
  400|  20.5k|  test_iteration_started_ = false;
  401|  20.5k|  while (watchdog_spinlock_.test_and_set()) std::this_thread::yield();
  ------------------
  |  Branch (401:10): [True: 0, False: 20.5k]
  ------------------
  402|  20.5k|  CheckWatchdogLimits();
  403|  20.5k|  watchdog_spinlock_.clear();
  404|  20.5k|}
_ZN8fuzztest8internal21InstallSignalHandlersEP8_IO_FILE:
  517|      2|void InstallSignalHandlers(FILE* out) {
  518|      2|  if (signal_out != nullptr) {
  ------------------
  |  Branch (518:7): [True: 0, False: 2]
  ------------------
  519|       |    // Already installed. Noop.
  520|      0|    return;
  521|      0|  }
  522|      2|  signal_out = out;
  523|       |
  524|       |#if defined(FUZZTEST_HAS_SANITIZER)
  525|       |  // An ASan failure might come without a signal.
  526|       |  // Eg a divide by zero is intercepted by ASan and it terminates the process
  527|       |  // after printing its output. This handler helps us print our output
  528|       |  // afterwards.
  529|       |  __sanitizer_set_death_callback([](auto...) {
  530|       |    Runtime& runtime = Runtime::instance();
  531|       |#if defined(ADDRESS_SANITIZER)
  532|       |    runtime.SetCrashTypeIfUnset(__asan_get_report_description());
  533|       |#else
  534|       |    runtime.SetCrashTypeIfUnset("Sanitizer crash");
  535|       |#endif
  536|       |    runtime.PrintReport(&signal_out_sink);
  537|       |  });
  538|       |#endif
  539|       |
  540|     12|  for (OldSignalHandler& h : crash_handlers) {
  ------------------
  |  Branch (540:28): [True: 12, False: 2]
  ------------------
  541|     12|    SetNewSigAction(h.signum, &HandleCrash, &h.action);
  542|     12|  }
  543|       |
  544|      6|  for (OldSignalHandler& h : termination_handlers) {
  ------------------
  |  Branch (544:28): [True: 6, False: 2]
  ------------------
  545|      6|    SetNewSigAction(h.signum, &HandleTermination, nullptr);
  546|      6|  }
  547|      2|}
_ZN8fuzztest8internal18FuzzTestFuzzerImplC2ERKNS0_8FuzzTestENSt3__110unique_ptrINS0_20UntypedFixtureDriverENS5_14default_deleteIS7_EEEE:
  570|      2|    : test_(test),
  571|      2|      fixture_driver_(std::move(fixture_driver)),
  572|      2|      params_domain_(fixture_driver_->GetDomains()),
  573|      2|      execution_coverage_(internal::GetExecutionCoverage()),
  574|      2|      corpus_coverage_(execution_coverage_ != nullptr
  ------------------
  |  Branch (574:24): [True: 0, False: 2]
  ------------------
  575|      2|                           ? execution_coverage_->GetCounterMap().size()
  576|      2|                           : 0) {
  577|      2|  FUZZTEST_INTERNAL_CHECK_PRECONDITION(fixture_driver_ != nullptr,
  ------------------
  |  |   42|      2|  ((P) ? (void)0                                     \
  |  |  ------------------
  |  |  |  Branch (42:4): [True: 2, False: 0]
  |  |  ------------------
  |  |   43|      2|       : ::fuzztest::internal::Abort(                \
  |  |   44|      0|             __FILE__, __LINE__,                     \
  |  |   45|      0|             absl::StrCat("Failed precondition (", #P, "): ", __VA_ARGS__)))
  ------------------
  578|      2|                                       "Invalid fixture driver!");
  579|      2|  stats_.start_time = absl::Now();
  580|      2|  const char* corpus_out_dir_chars = getenv("FUZZTEST_TESTSUITE_OUT_DIR");
  581|      2|  if (corpus_out_dir_chars) corpus_out_dir_ = corpus_out_dir_chars;
  ------------------
  |  Branch (581:7): [True: 0, False: 2]
  ------------------
  582|       |
  583|      2|  std::vector<double> weights = {100.};
  584|      2|  corpus_distribution_ =
  585|      2|      absl::discrete_distribution<>(weights.begin(), weights.end());
  586|      2|}
_ZN8fuzztest8internal18FuzzTestFuzzerImpl8TryParseENSt3__117basic_string_viewIcNS2_11char_traitsIcEEEE:
  593|  23.3k|    absl::string_view data) {
  594|  23.3k|  auto ir_value = IRObject::FromString(data);
  595|  23.3k|  if (!ir_value) {
  ------------------
  |  Branch (595:7): [True: 1.19k, False: 22.1k]
  ------------------
  596|  1.19k|    return absl::InvalidArgumentError("Unexpected file format");
  597|  1.19k|  }
  598|  22.1k|  auto corpus_value = params_domain_.ParseCorpus(*ir_value);
  599|  22.1k|  if (!corpus_value) {
  ------------------
  |  Branch (599:7): [True: 1.54k, False: 20.6k]
  ------------------
  600|  1.54k|    return absl::InvalidArgumentError("Unexpected intermediate representation");
  601|  1.54k|  }
  602|  20.6k|  absl::Status is_valid = params_domain_.ValidateCorpusValue(*corpus_value);
  603|  20.6k|  if (!is_valid.ok()) {
  ------------------
  |  Branch (603:7): [True: 100, False: 20.5k]
  ------------------
  604|    100|    return Prefix(is_valid, "Invalid corpus value");
  605|    100|  }
  606|  20.5k|  return *corpus_value;
  607|  20.6k|}
_ZN8fuzztest8internal18FuzzTestFuzzerImpl10ShouldStopEv:
  882|  23.3k|bool FuzzTestFuzzerImpl::ShouldStop() {
  883|  23.3k|  if (runs_limit_.has_value() && stats_.runs >= *runs_limit_) return true;
  ------------------
  |  Branch (883:7): [True: 0, False: 23.3k]
  |  Branch (883:34): [True: 0, False: 0]
  ------------------
  884|  23.3k|  if (time_limit_ != absl::InfiniteFuture() && absl::Now() > time_limit_)
  ------------------
  |  Branch (884:7): [True: 0, False: 23.3k]
  |  Branch (884:48): [True: 0, False: 0]
  ------------------
  885|      0|    return true;
  886|  23.3k|  return runtime_.termination_requested();
  887|  23.3k|}
_ZN8fuzztest8internal18FuzzTestFuzzerImpl11RunOneInputERKNS1_5InputE:
 1020|  20.5k|    const Input& input) {
 1021|  20.5k|  ++stats_.runs;
 1022|  20.5k|  auto untyped_args = params_domain_.GetValue(input.args);
 1023|  20.5k|  Runtime::Args debug_args{input.args, params_domain_};
 1024|  20.5k|  runtime_.SetCurrentArgs(&debug_args);
 1025|       |
 1026|       |  // Reset and observe the coverage map and start tracing in
 1027|       |  // the tightest scope possible. In particular, we can't include the call
 1028|       |  // to GetValue in the scope as it will run user code.
 1029|  20.5k|  if (execution_coverage_ != nullptr) {
  ------------------
  |  Branch (1029:7): [True: 0, False: 20.5k]
  ------------------
 1030|      0|    execution_coverage_->ResetState();
 1031|      0|  }
 1032|  20.5k|  absl::Time start = absl::Now();
 1033|  20.5k|  runtime_.OnTestIterationStart(start);
 1034|       |  // Set tracing after absl::Now(), otherwise it will make
 1035|       |  // FuzzingModeTest.MinimizesDuplicatedCorpustest flaky because
 1036|       |  // randomness in absl::Now() being traced by cmp coverage.
 1037|  20.5k|  if (execution_coverage_ != nullptr) {
  ------------------
  |  Branch (1037:7): [True: 0, False: 20.5k]
  ------------------
 1038|      0|    execution_coverage_->SetIsTracing(true);
 1039|      0|  }
 1040|       |
 1041|  20.5k|  runtime_.SetSkippingRequested(false);
 1042|  20.5k|  fixture_driver_->SetUpIteration();
 1043|  20.5k|  if (!runtime_.skipping_requested()) {
  ------------------
  |  Branch (1043:7): [True: 20.5k, False: 0]
  ------------------
 1044|  20.5k|    fixture_driver_->Test(std::move(untyped_args));
 1045|  20.5k|  }
 1046|  20.5k|  fixture_driver_->TearDownIteration();
 1047|  20.5k|  if (execution_coverage_ != nullptr) {
  ------------------
  |  Branch (1047:7): [True: 0, False: 20.5k]
  ------------------
 1048|      0|    execution_coverage_->SetIsTracing(false);
 1049|      0|  }
 1050|  20.5k|  const absl::Duration run_time = absl::Now() - start;
 1051|       |
 1052|  20.5k|  bool new_coverage = false;
 1053|  20.5k|  if (execution_coverage_ != nullptr && !runtime_.skipping_requested()) {
  ------------------
  |  Branch (1053:7): [True: 0, False: 20.5k]
  |  Branch (1053:41): [True: 0, False: 0]
  ------------------
 1054|      0|    new_coverage = corpus_coverage_.Update(execution_coverage_);
 1055|      0|    stats_.max_stack_used =
 1056|      0|        std::max(stats_.max_stack_used, execution_coverage_->MaxStackUsed());
 1057|      0|  }
 1058|       |
 1059|  20.5k|  runtime_.OnTestIterationEnd();
 1060|  20.5k|  runtime_.UnsetCurrentArgs();
 1061|  20.5k|  return {new_coverage, run_time};
 1062|  20.5k|}
runtime.cc:_ZN8fuzztest8internalL15SetNewSigActionEiPFviP9siginfo_tPvEP9sigaction:
  500|     18|                            struct sigaction* old_sigact) {
  501|     18|  struct sigaction new_sigact = {};
  502|     18|  sigemptyset(&new_sigact.sa_mask);
  503|     18|  new_sigact.sa_sigaction = handler;
  504|       |
  505|       |  // We make use of the SA_ONSTACK flag so that signal handlers are executed on
  506|       |  // a separate stack. This is needed to properly handle cases where stack space
  507|       |  // is limited and the delivery of a signal needs to be properly handled.
  508|     18|  new_sigact.sa_flags = SA_SIGINFO | SA_ONSTACK;
  509|       |
  510|     18|  if (sigaction(signum, &new_sigact, old_sigact) == -1) {
  ------------------
  |  Branch (510:7): [True: 0, False: 18]
  ------------------
  511|      0|    fprintf(GetStderr(), "Error installing signal handler: %s\n",
  512|       |            strerror(errno));
  513|      0|    exit(1);
  514|      0|  }
  515|     18|}

_ZN8fuzztest8internal8FuzzTestC2ENS0_13BasicTestInfoEN4absl12lts_2024011612AnyInvocableIKFNSt3__110unique_ptrINS0_14FuzzTestFuzzerENS6_14default_deleteIS8_EEEERKS1_EEE:
   80|      4|      : test_info_(std::move(test_info)), make_(std::move(factory)) {}
_ZNK8fuzztest8internal8FuzzTest10suite_nameEv:
   82|      4|  const std::string& suite_name() const { return test_info_.suite_name; }
_ZNK8fuzztest8internal8FuzzTest9test_nameEv:
   83|      4|  const std::string& test_name() const { return test_info_.test_name; }
_ZNK8fuzztest8internal8FuzzTest9full_nameEv:
   84|     16|  std::string full_name() const {
   85|     16|    return absl::StrCat(test_info_.suite_name, ".", test_info_.test_name);
   86|     16|  }
_ZNK8fuzztest8internal8FuzzTest4fileEv:
   87|      4|  const std::string& file() const { return test_info_.file; }
_ZNK8fuzztest8internal8FuzzTest4lineEv:
   88|      4|  int line() const { return test_info_.line; }
_ZNK8fuzztest8internal8FuzzTest12uses_fixtureEv:
   89|      4|  bool uses_fixture() const { return test_info_.uses_fixture; }
_ZNK8fuzztest8internal8FuzzTest4makeEv:
   90|      2|  auto make() const { return make_(*this); }
_ZN8fuzztest8internal7Runtime8instanceEv:
  122|      8|  static Runtime& instance() {
  123|      8|    static auto* runtime = new Runtime();
  124|      8|    return *runtime;
  125|      8|  }
_ZN8fuzztest8internal7Runtime20SetSkippingRequestedEb:
  134|  20.5k|  void SetSkippingRequested(bool requested) {
  135|  20.5k|    skipping_requested_.store(requested, std::memory_order_relaxed);
  136|  20.5k|  }
_ZNK8fuzztest8internal7Runtime18skipping_requestedEv:
  138|  20.5k|  bool skipping_requested() const {
  139|  20.5k|    return skipping_requested_.load(std::memory_order_relaxed);
  140|  20.5k|  }
_ZNK8fuzztest8internal7Runtime21termination_requestedEv:
  154|  23.3k|  bool termination_requested() const {
  155|  23.3k|    return termination_requested_.load(std::memory_order_relaxed);
  156|  23.3k|  }
_ZN8fuzztest8internal7Runtime10SetRunModeENS_7RunModeE:
  160|      4|  void SetRunMode(RunMode run_mode) { run_mode_ = run_mode; }
_ZNK8fuzztest8internal7Runtime8run_modeEv:
  161|      2|  RunMode run_mode() const { return run_mode_; }
_ZN8fuzztest8internal7Runtime14EnableReporterEPKNS0_12RuntimeStatsEPFN4absl12lts_202401164TimeEvE:
  163|      2|  void EnableReporter(const RuntimeStats* stats, absl::Time (*clock_fn)()) {
  164|      2|    reporter_enabled_ = true;
  165|      2|    stats_ = stats;
  166|      2|    clock_fn_ = clock_fn;
  167|       |    // In case we have not installed them yet, do so now.
  168|      2|    InstallSignalHandlers(GetStderr());
  169|      2|    ResetCrashType();
  170|      2|  }
_ZN8fuzztest8internal7Runtime14SetCurrentTestEPKNS0_8FuzzTestEPKNS0_13ConfigurationE:
  179|  23.3k|                      const Configuration* configuration) {
  180|  23.3k|    current_test_ = test;
  181|  23.3k|    current_configuration_ = configuration;
  182|  23.3k|  }
_ZN8fuzztest8internal7Runtime20OnTestIterationStartERKN4absl12lts_202401164TimeE:
  183|  20.5k|  void OnTestIterationStart(const absl::Time& start_time) {
  184|  20.5k|    current_iteration_start_time_ = start_time;
  185|  20.5k|    test_iteration_started_ = true;
  186|  20.5k|  }
_ZN8fuzztest8internal7Runtime14SetCurrentArgsEPNS1_4ArgsE:
  189|  20.5k|  void SetCurrentArgs(Args* args) { current_args_ = args; }
_ZN8fuzztest8internal7Runtime16UnsetCurrentArgsEv:
  190|  20.5k|  void UnsetCurrentArgs() { current_args_ = nullptr; }
_ZN8fuzztest8internal7Runtime14ResetCrashTypeEv:
  207|      2|  void ResetCrashType() { crash_type_ = std::nullopt; }

_ZN8fuzztest8internal23GetFromEnvOrMakeSeedSeqERNSt3__113basic_ostreamIcNS1_11char_traitsIcEEEENS1_17basic_string_viewIcS4_EE:
   63|      2|                                      absl::string_view env_var) {
   64|      2|  const std::vector<uint32_t> seed_material =
   65|      2|      GetFromEnvOrMakeSeedMaterial(env_var);
   66|      2|  const std::string encoded_seed_material = EncodeSeedMaterial(seed_material);
   67|      2|  out << env_var << "=" << encoded_seed_material << '\n';
   68|      2|  return std::seed_seq(seed_material.begin(), seed_material.end());
   69|      2|}
_ZN8fuzztest8internal18EncodeSeedMaterialEN4absl12lts_202401164SpanIKjEE:
   71|      2|std::string EncodeSeedMaterial(absl::Span<const uint32_t> seed_material) {
   72|      2|  return absl::WebSafeBase64Escape(
   73|      2|      absl::string_view(reinterpret_cast<const char*>(seed_material.data()),
   74|      2|                        seed_material.size() * sizeof(uint32_t)));
   75|      2|}
seed_seq.cc:_ZN8fuzztest8internal12_GLOBAL__N_128GetFromEnvOrMakeSeedMaterialENSt3__117basic_string_viewIcNS2_11char_traitsIcEEEE:
   47|      2|std::vector<uint32_t> GetFromEnvOrMakeSeedMaterial(absl::string_view env_var) {
   48|      2|  const char* encoded_seed_material = std::getenv(env_var.data());
   49|      2|  if (encoded_seed_material == nullptr) {
  ------------------
  |  Branch (49:7): [True: 2, False: 0]
  ------------------
   50|      2|    return MakeSeedMaterial();
   51|      2|  }
   52|      0|  std::optional<std::vector<uint32_t>> seed_material =
   53|      0|      DecodeSeedMaterial(encoded_seed_material);
   54|      0|  FUZZTEST_INTERNAL_CHECK_PRECONDITION(
  ------------------
  |  |   42|      0|  ((P) ? (void)0                                     \
  |  |  ------------------
  |  |  |  Branch (42:4): [True: 0, False: 0]
  |  |  ------------------
  |  |   43|      0|       : ::fuzztest::internal::Abort(                \
  |  |   44|      0|             __FILE__, __LINE__,                     \
  |  |   45|      0|             absl::StrCat("Failed precondition (", #P, "): ", __VA_ARGS__)))
  ------------------
   55|      0|      seed_material.has_value(),
   56|      0|      "Failed to decode seed material from the environment variable ", env_var);
   57|      0|  return *std::move(seed_material);
   58|      2|}
seed_seq.cc:_ZN8fuzztest8internal12_GLOBAL__N_116MakeSeedMaterialEv:
   37|      2|std::vector<uint32_t> MakeSeedMaterial() {
   38|      2|  absl::BitGen gen;
   39|      2|  static constexpr int kNumberOfEntropyBits = 256;
   40|      2|  std::vector<uint32_t> seed_material(kNumberOfEntropyBits /
   41|      2|                                      (8 * sizeof(uint32_t)));
   42|      2|  std::generate(seed_material.begin(), seed_material.end(),
   43|      2|                [&gen] { return absl::Uniform<uint32_t>(gen); });
   44|      2|  return seed_material;
   45|      2|}
seed_seq.cc:_ZZN8fuzztest8internal12_GLOBAL__N_116MakeSeedMaterialEvENK3$_0clEv:
   43|     16|                [&gen] { return absl::Uniform<uint32_t>(gen); });

_ZN8fuzztest8internal8IRObject10FromStringENSt3__117basic_string_viewIcNS2_11char_traitsIcEEEE:
  331|  23.3k|std::optional<IRObject> IRObject::FromString(absl::string_view str) {
  332|  23.3k|  IRObject object;
  333|  23.3k|  if (IsInBinaryFormat(str)) {
  ------------------
  |  Branch (333:7): [True: 20.6k, False: 2.70k]
  ------------------
  334|  20.6k|    BinaryParseBuf buf = {str.data(), str.size()};
  335|  20.6k|    buf.Advance(kBinaryHeader.size());
  336|  20.6k|    if (!BinaryParse(object, buf, /*recursion_depth=*/0) || !buf.empty())
  ------------------
  |  Branch (336:9): [True: 1, False: 20.6k]
  |  Branch (336:61): [True: 1, False: 20.6k]
  ------------------
  337|      2|      return std::nullopt;
  338|  20.6k|  } else {
  339|  2.70k|    if (ReadToken(str) != kHeader) return std::nullopt;
  ------------------
  |  Branch (339:9): [True: 75, False: 2.63k]
  ------------------
  340|  2.63k|    if (!ParseImpl(object, str, /*recursion_depth=*/0) ||
  ------------------
  |  Branch (340:9): [True: 964, False: 1.66k]
  |  Branch (340:9): [True: 1.11k, False: 1.51k]
  ------------------
  341|  1.66k|        !ReadToken(str).empty())
  ------------------
  |  Branch (341:9): [True: 155, False: 1.51k]
  ------------------
  342|  1.11k|      return std::nullopt;
  343|  2.63k|  }
  344|  22.1k|  return object;
  345|  23.3k|}
serialization.cc:_ZN8fuzztest8internal12_GLOBAL__N_116IsInBinaryFormatENSt3__117basic_string_viewIcNS2_11char_traitsIcEEEE:
  302|  23.3k|bool IsInBinaryFormat(absl::string_view str) {
  303|       |  // Not using absl::string_view or std::memcmp because they could be
  304|       |  // instrumented and using them could pollute coverage.
  305|  23.3k|  return str.size() >= kBinaryHeader.size() &&
  ------------------
  |  Branch (305:10): [True: 23.3k, False: 40]
  ------------------
  306|  23.3k|         __builtin_memcmp(str.data(), kBinaryHeader.data(),
  ------------------
  |  Branch (306:10): [True: 20.6k, False: 2.66k]
  ------------------
  307|  23.3k|                          kBinaryHeader.size()) == 0;
  308|  23.3k|}
serialization.cc:_ZN8fuzztest8internal12_GLOBAL__N_114BinaryParseBuf7AdvanceEm:
  242|   742k|  inline void Advance(size_t s) {
  243|   742k|    if (s > size) s = size;
  ------------------
  |  Branch (243:9): [True: 0, False: 742k]
  ------------------
  244|   742k|    str += s;
  245|   742k|    size -= s;
  246|   742k|  }
serialization.cc:_ZN8fuzztest8internal12_GLOBAL__N_111BinaryParseERNS0_8IRObjectERNS1_14BinaryParseBufEi:
  249|   350k|bool BinaryParse(IRObject& obj, BinaryParseBuf& buf, int recursion_depth) {
  250|   350k|  if (recursion_depth > kMaxParseRecursionDepth) return false;
  ------------------
  |  Branch (250:7): [True: 0, False: 350k]
  ------------------
  251|   350k|  if (buf.empty()) return false;
  ------------------
  |  Branch (251:7): [True: 0, False: 350k]
  ------------------
  252|   350k|  const auto h = static_cast<BinaryFormatHeader>(buf.str[0]);
  253|   350k|  buf.Advance(1);
  254|   350k|  switch (h) {
  ------------------
  |  Branch (254:11): [True: 350k, False: 0]
  ------------------
  255|     66|    case BinaryFormatHeader::kEmpty: {
  ------------------
  |  Branch (255:5): [True: 66, False: 350k]
  ------------------
  256|     66|      return true;
  257|      0|    }
  258|   268k|    case BinaryFormatHeader::kUInt64: {
  ------------------
  |  Branch (258:5): [True: 268k, False: 82.5k]
  ------------------
  259|   268k|      if (buf.size < sizeof(uint64_t)) return false;
  ------------------
  |  Branch (259:11): [True: 0, False: 268k]
  ------------------
  260|   268k|      auto& t = obj.value.emplace<uint64_t>();
  261|   268k|      std::memcpy(&t, buf.str, sizeof(uint64_t));
  262|   268k|      buf.Advance(sizeof(uint64_t));
  263|   268k|      return true;
  264|   268k|    }
  265|      4|    case BinaryFormatHeader::kDouble: {
  ------------------
  |  Branch (265:5): [True: 4, False: 350k]
  ------------------
  266|      4|      if (buf.size < sizeof(double)) return false;
  ------------------
  |  Branch (266:11): [True: 0, False: 4]
  ------------------
  267|      4|      auto& t = obj.value.emplace<double>();
  268|      4|      std::memcpy(&t, buf.str, sizeof(t));
  269|      4|      buf.Advance(sizeof(double));
  270|      4|      return true;
  271|      4|    }
  272|  20.6k|    case BinaryFormatHeader::kString: {
  ------------------
  |  Branch (272:5): [True: 20.6k, False: 330k]
  ------------------
  273|  20.6k|      if (buf.size < sizeof(uint64_t)) return false;
  ------------------
  |  Branch (273:11): [True: 0, False: 20.6k]
  ------------------
  274|  20.6k|      uint64_t str_size;
  275|  20.6k|      std::memcpy(&str_size, buf.str, sizeof(str_size));
  276|  20.6k|      buf.Advance(sizeof(uint64_t));
  277|  20.6k|      if (buf.size < str_size) return false;
  ------------------
  |  Branch (277:11): [True: 0, False: 20.6k]
  ------------------
  278|  20.6k|      obj.value.emplace<std::string>() = {buf.str,
  279|  20.6k|                                          static_cast<size_t>(str_size)};
  280|  20.6k|      buf.Advance(str_size);
  281|  20.6k|      return true;
  282|  20.6k|    }
  283|  61.8k|    case BinaryFormatHeader::kObject: {
  ------------------
  |  Branch (283:5): [True: 61.8k, False: 288k]
  ------------------
  284|  61.8k|      if (buf.size < sizeof(uint64_t)) return false;
  ------------------
  |  Branch (284:11): [True: 0, False: 61.8k]
  ------------------
  285|  61.8k|      uint64_t vec_size;
  286|  61.8k|      std::memcpy(&vec_size, buf.str, sizeof(vec_size));
  287|  61.8k|      buf.Advance(sizeof(vec_size));
  288|       |      // This could happen for malformed inputs.
  289|  61.8k|      if (vec_size > buf.size) return false;
  ------------------
  |  Branch (289:11): [True: 1, False: 61.8k]
  ------------------
  290|  61.8k|      auto& v = obj.value.emplace<std::vector<IRObject>>();
  291|  61.8k|      v.reserve(vec_size);
  292|   391k|      for (uint64_t i = 0; i < vec_size; ++i) {
  ------------------
  |  Branch (292:28): [True: 330k, False: 61.8k]
  ------------------
  293|   330k|        if (!BinaryParse(v.emplace_back(), buf, recursion_depth + 1))
  ------------------
  |  Branch (293:13): [True: 0, False: 330k]
  ------------------
  294|      0|          return false;
  295|   330k|      }
  296|  61.8k|      return true;
  297|  61.8k|    }
  298|   350k|  }
  299|      0|  return false;
  300|   350k|}
serialization.cc:_ZNK8fuzztest8internal12_GLOBAL__N_114BinaryParseBuf5emptyEv:
  241|   371k|  inline bool empty() const { return size == 0; }
serialization.cc:_ZN8fuzztest8internal12_GLOBAL__N_19ReadTokenERNSt3__117basic_string_viewIcNS2_11char_traitsIcEEEE:
   78|   147k|absl::string_view ReadToken(absl::string_view& in) {
   79|   215k|  while (!in.empty() && std::isspace(in[0])) in.remove_prefix(1);
  ------------------
  |  Branch (79:10): [True: 212k, False: 2.19k]
  |  Branch (79:25): [True: 67.3k, False: 145k]
  ------------------
   80|   147k|  if (in.empty()) return in;
  ------------------
  |  Branch (80:7): [True: 2.19k, False: 145k]
  ------------------
   81|   145k|  size_t end = 1;
   82|   145k|  const auto is_literal = [](char c) {
   83|   145k|    return std::isalnum(c) != 0 || c == '+' || c == '-' || c == '.';
   84|   145k|  };
   85|   145k|  if (is_literal(in[0])) {
  ------------------
  |  Branch (85:7): [True: 73.9k, False: 71.5k]
  ------------------
   86|  8.26M|    while (end < in.size() && is_literal(in[end])) ++end;
  ------------------
  |  Branch (86:12): [True: 8.25M, False: 1.83k]
  |  Branch (86:31): [True: 8.18M, False: 72.1k]
  ------------------
   87|  73.9k|  } else if (in[0] == '"') {
  ------------------
  |  Branch (87:14): [True: 232, False: 71.3k]
  ------------------
   88|  14.8k|    while (end < in.size() && in[end] != '"') ++end;
  ------------------
  |  Branch (88:12): [True: 14.8k, False: 17]
  |  Branch (88:31): [True: 14.6k, False: 215]
  ------------------
   89|    232|    if (end < in.size()) ++end;
  ------------------
  |  Branch (89:9): [True: 215, False: 17]
  ------------------
   90|    232|  }
   91|   145k|  absl::string_view res = in.substr(0, end);
   92|   145k|  in.remove_prefix(end);
   93|   145k|  return res;
   94|   147k|}
serialization.cc:_ZZN8fuzztest8internal12_GLOBAL__N_19ReadTokenERNSt3__117basic_string_viewIcNS2_11char_traitsIcEEEEENK3$_0clEc:
   82|  8.40M|  const auto is_literal = [](char c) {
   83|  8.40M|    return std::isalnum(c) != 0 || c == '+' || c == '-' || c == '.';
  ------------------
  |  Branch (83:12): [True: 8.23M, False: 165k]
  |  Branch (83:36): [True: 2.09k, False: 163k]
  |  Branch (83:48): [True: 9.54k, False: 154k]
  |  Branch (83:60): [True: 10.2k, False: 143k]
  ------------------
   84|  8.40M|  };
serialization.cc:_ZN8fuzztest8internal12_GLOBAL__N_19ParseImplERNS0_8IRObjectERNSt3__117basic_string_viewIcNS4_11char_traitsIcEEEEi:
  136|  27.4k|bool ParseImpl(IRObject& obj, absl::string_view& str, int recursion_depth) {
  137|  27.4k|  if (recursion_depth > kMaxParseRecursionDepth) return false;
  ------------------
  |  Branch (137:7): [True: 2, False: 27.4k]
  ------------------
  138|  27.4k|  absl::string_view key = ReadToken(str);
  139|  27.4k|  if (key.empty() || key == "}") {
  ------------------
  |  Branch (139:7): [True: 60, False: 27.3k]
  |  Branch (139:22): [True: 707, False: 26.6k]
  ------------------
  140|       |    // The object is empty. Put the token back and return.
  141|    767|    str = absl::string_view(key.data(), str.data() + str.size() - key.data());
  142|    767|    return true;
  143|    767|  }
  144|       |
  145|  26.6k|  if (key == "sub") {
  ------------------
  |  Branch (145:7): [True: 3.32k, False: 23.3k]
  ------------------
  146|  3.32k|    auto& v = obj.value.emplace<std::vector<IRObject>>();
  147|  24.8k|    do {
  148|  24.8k|      if (ReadToken(str) != "{") return false;
  ------------------
  |  Branch (148:11): [True: 83, False: 24.7k]
  ------------------
  149|  24.7k|      if (!ParseImpl(v.emplace_back(), str, recursion_depth + 1)) return false;
  ------------------
  |  Branch (149:11): [True: 2.26k, False: 22.5k]
  ------------------
  150|  22.5k|      if (ReadToken(str) != "}") return false;
  ------------------
  |  Branch (150:11): [True: 440, False: 22.0k]
  ------------------
  151|  22.0k|      key = ReadToken(str);
  152|  22.0k|    } while (key == "sub");
  ------------------
  |  Branch (152:14): [True: 21.5k, False: 530]
  ------------------
  153|       |    // We are done reading this repeated sub.
  154|       |    // Put the token back for the caller.
  155|    530|    str = absl::string_view(key.data(), str.data() + str.size() - key.data());
  156|    530|    return true;
  157|  23.3k|  } else {
  158|  23.3k|    if (ReadToken(str) != ":") return false;
  ------------------
  |  Branch (158:9): [True: 37, False: 23.2k]
  ------------------
  159|  23.2k|    auto value = ReadToken(str);
  160|  23.2k|    auto& v = obj.value;
  161|  23.2k|    if (key == "i") {
  ------------------
  |  Branch (161:9): [True: 1.49k, False: 21.7k]
  ------------------
  162|  1.49k|      return ReadScalar(v.emplace<uint64_t>(), value);
  163|  21.7k|    } else if (key == "d") {
  ------------------
  |  Branch (163:16): [True: 21.5k, False: 224]
  ------------------
  164|  21.5k|      return ReadScalar(v.emplace<double>(), value);
  165|  21.5k|    } else if (key == "s") {
  ------------------
  |  Branch (165:16): [True: 218, False: 6]
  ------------------
  166|    218|      return ReadScalar(v.emplace<std::string>(), value);
  167|    218|    } else {
  168|       |      // Unrecognized key
  169|      6|      return false;
  170|      6|    }
  171|  23.2k|  }
  172|  26.6k|}
serialization.cc:_ZN8fuzztest8internal12_GLOBAL__N_110ReadScalarERmNSt3__117basic_string_viewIcNS3_11char_traitsIcEEEE:
   96|  1.49k|bool ReadScalar(uint64_t& out, absl::string_view value) {
   97|  1.49k|  return absl::SimpleAtoi(value, &out);
   98|  1.49k|}
serialization.cc:_ZN8fuzztest8internal12_GLOBAL__N_110ReadScalarERdNSt3__117basic_string_viewIcNS3_11char_traitsIcEEEE:
  100|  21.5k|bool ReadScalar(double& out, absl::string_view value) {
  101|  21.5k|  return absl::SimpleAtod(value, &out);
  102|  21.5k|}
serialization.cc:_ZN8fuzztest8internal12_GLOBAL__N_110ReadScalarERNSt3__112basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEENS2_17basic_string_viewIcS5_EE:
  104|    218|bool ReadScalar(std::string& out, absl::string_view value) {
  105|    218|  if (value.empty() || value[0] != '"') return false;
  ------------------
  |  Branch (105:7): [True: 4, False: 214]
  |  Branch (105:24): [True: 1, False: 213]
  ------------------
  106|    213|  value.remove_prefix(1);
  107|       |
  108|    213|  if (value.empty() || value.back() != '"') return false;
  ------------------
  |  Branch (108:7): [True: 0, False: 213]
  |  Branch (108:24): [True: 0, False: 213]
  ------------------
  109|    213|  value.remove_suffix(1);
  110|       |
  111|  10.8k|  while (!value.empty()) {
  ------------------
  |  Branch (111:10): [True: 10.6k, False: 212]
  ------------------
  112|  10.6k|    if (value[0] != '\\') {
  ------------------
  |  Branch (112:9): [True: 9.46k, False: 1.21k]
  ------------------
  113|  9.46k|      out += value[0];
  114|  9.46k|      value.remove_prefix(1);
  115|  9.46k|    } else {
  116|  1.21k|      uint32_t v = 0;
  117|       |
  118|  1.21k|      if (value.size() < 4) return false;
  ------------------
  |  Branch (118:11): [True: 0, False: 1.21k]
  ------------------
  119|  4.87k|      for (int i = 1; i < 4; ++i) {
  ------------------
  |  Branch (119:23): [True: 3.65k, False: 1.21k]
  ------------------
  120|  3.65k|        if (value[i] < '0' || value[i] > '7') {
  ------------------
  |  Branch (120:13): [True: 1, False: 3.65k]
  |  Branch (120:31): [True: 0, False: 3.65k]
  ------------------
  121|      1|          return false;
  122|      1|        }
  123|  3.65k|        v = 8 * v + value[i] - '0';
  124|  3.65k|      }
  125|  1.21k|      if (v > 255) return false;
  ------------------
  |  Branch (125:11): [True: 0, False: 1.21k]
  ------------------
  126|       |
  127|  1.21k|      out += static_cast<char>(v);
  128|  1.21k|      value.remove_prefix(4);
  129|  1.21k|    }
  130|  10.6k|  }
  131|    212|  return true;
  132|    213|}

_ZNK8fuzztest8internal8IRObject4SubsEv:
  130|  63.4k|  std::optional<absl::Span<const IRObject>> Subs() const {
  131|  63.4k|    if (const auto* i = std::get_if<std::vector<IRObject>>(&value)) {
  ------------------
  |  Branch (131:21): [True: 62.1k, False: 1.32k]
  ------------------
  132|  62.1k|      return *i;
  133|  62.1k|    }
  134|       |    // The empty vector is serialized the same way as the monostate: nothing.
  135|       |    // Handle that case too.
  136|  1.32k|    if (std::holds_alternative<std::monostate>(value)) {
  ------------------
  |  Branch (136:9): [True: 5, False: 1.31k]
  ------------------
  137|      5|      return absl::Span<const IRObject>{};
  138|      5|    }
  139|  1.31k|    return std::nullopt;
  140|  1.32k|  }
_ZNK8fuzztest8internal8IRObject9GetScalarImEEDav:
   90|   103k|  auto GetScalar() const {
   91|       |    if constexpr (std::is_enum_v<T>) {
   92|       |      auto inner = GetScalar<std::underlying_type_t<T>>();
   93|       |      return inner ? std::optional(static_cast<T>(*inner)) : std::nullopt;
   94|   103k|    } else if constexpr (std::is_integral_v<T>) {
   95|   103k|      const uint64_t* i = std::get_if<uint64_t>(&value);
   96|   103k|      return i != nullptr ? std::optional(static_cast<T>(*i)) : std::nullopt;
  ------------------
  |  Branch (96:14): [True: 103k, False: 25]
  ------------------
   97|       |    } else if constexpr (std::is_same_v<float, T> ||
   98|       |                         std::is_same_v<double, T>) {
   99|       |      const double* i = std::get_if<double>(&value);
  100|       |      return i != nullptr ? std::optional(static_cast<T>(*i)) : std::nullopt;
  101|       |    } else if constexpr (std::is_same_v<std::string, T>) {
  102|       |      std::optional<absl::string_view> out;
  103|       |      if (const auto* s = std::get_if<std::string>(&value)) {
  104|       |        out = *s;
  105|       |      }
  106|       |      return out;
  107|       |    }
  108|   103k|  }
_ZNK8fuzztest8internal8IRObject9GetScalarIiEEDav:
   90|  20.6k|  auto GetScalar() const {
   91|       |    if constexpr (std::is_enum_v<T>) {
   92|       |      auto inner = GetScalar<std::underlying_type_t<T>>();
   93|       |      return inner ? std::optional(static_cast<T>(*inner)) : std::nullopt;
   94|  20.6k|    } else if constexpr (std::is_integral_v<T>) {
   95|  20.6k|      const uint64_t* i = std::get_if<uint64_t>(&value);
   96|  20.6k|      return i != nullptr ? std::optional(static_cast<T>(*i)) : std::nullopt;
  ------------------
  |  Branch (96:14): [True: 20.6k, False: 2]
  ------------------
   97|       |    } else if constexpr (std::is_same_v<float, T> ||
   98|       |                         std::is_same_v<double, T>) {
   99|       |      const double* i = std::get_if<double>(&value);
  100|       |      return i != nullptr ? std::optional(static_cast<T>(*i)) : std::nullopt;
  101|       |    } else if constexpr (std::is_same_v<std::string, T>) {
  102|       |      std::optional<absl::string_view> out;
  103|       |      if (const auto* s = std::get_if<std::string>(&value)) {
  104|       |        out = *s;
  105|       |      }
  106|       |      return out;
  107|       |    }
  108|  20.6k|  }
_ZNK8fuzztest8internal8IRObject9GetScalarINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEEEDav:
   90|  20.6k|  auto GetScalar() const {
   91|       |    if constexpr (std::is_enum_v<T>) {
   92|       |      auto inner = GetScalar<std::underlying_type_t<T>>();
   93|       |      return inner ? std::optional(static_cast<T>(*inner)) : std::nullopt;
   94|       |    } else if constexpr (std::is_integral_v<T>) {
   95|       |      const uint64_t* i = std::get_if<uint64_t>(&value);
   96|       |      return i != nullptr ? std::optional(static_cast<T>(*i)) : std::nullopt;
   97|       |    } else if constexpr (std::is_same_v<float, T> ||
   98|       |                         std::is_same_v<double, T>) {
   99|       |      const double* i = std::get_if<double>(&value);
  100|       |      return i != nullptr ? std::optional(static_cast<T>(*i)) : std::nullopt;
  101|  20.6k|    } else if constexpr (std::is_same_v<std::string, T>) {
  102|  20.6k|      std::optional<absl::string_view> out;
  103|  20.6k|      if (const auto* s = std::get_if<std::string>(&value)) {
  ------------------
  |  Branch (103:23): [True: 20.6k, False: 30]
  ------------------
  104|  20.6k|        out = *s;
  105|  20.6k|      }
  106|  20.6k|      return out;
  107|  20.6k|    }
  108|  20.6k|  }
_ZNK8fuzztest8internal8IRObject9GetScalarINS0_23ElementOfImplCorpusTypeEEEDav:
   90|   103k|  auto GetScalar() const {
   91|   103k|    if constexpr (std::is_enum_v<T>) {
   92|   103k|      auto inner = GetScalar<std::underlying_type_t<T>>();
   93|   103k|      return inner ? std::optional(static_cast<T>(*inner)) : std::nullopt;
  ------------------
  |  Branch (93:14): [True: 103k, False: 25]
  ------------------
   94|       |    } else if constexpr (std::is_integral_v<T>) {
   95|       |      const uint64_t* i = std::get_if<uint64_t>(&value);
   96|       |      return i != nullptr ? std::optional(static_cast<T>(*i)) : std::nullopt;
   97|       |    } else if constexpr (std::is_same_v<float, T> ||
   98|       |                         std::is_same_v<double, T>) {
   99|       |      const double* i = std::get_if<double>(&value);
  100|       |      return i != nullptr ? std::optional(static_cast<T>(*i)) : std::nullopt;
  101|       |    } else if constexpr (std::is_same_v<std::string, T>) {
  102|       |      std::optional<absl::string_view> out;
  103|       |      if (const auto* s = std::get_if<std::string>(&value)) {
  104|       |        out = *s;
  105|       |      }
  106|       |      return out;
  107|       |    }
  108|   103k|  }
_ZNK8fuzztest8internal8IRObject8ToCorpusINS0_23ElementOfImplCorpusTypeEEENSt3__18optionalIT_EEv:
  219|   103k|  std::optional<T> ToCorpus() const {
  220|       |    if constexpr (std::is_const_v<T>) {
  221|       |      return ToCorpus<std::remove_const_t<T>>();
  222|       |    } else if constexpr (is_monostate_v<T>) {
  223|       |      if (std::holds_alternative<std::monostate>(value)) return T{};
  224|       |      return std::nullopt;
  225|       |    } else if constexpr (std::is_same_v<T, IRObject>) {
  226|       |      return *this;
  227|   103k|    } else if constexpr (std::is_constructible_v<IRObject, T>) {
  228|   103k|      if (auto v = GetScalar<T>()) {
  ------------------
  |  Branch (228:16): [True: 103k, False: 25]
  ------------------
  229|   103k|        return static_cast<T>(*v);
  230|   103k|      }
  231|     25|      return std::nullopt;
  232|       |    } else if constexpr (is_variant_v<T>) {
  233|       |      auto elems = Subs();
  234|       |      if (!elems || elems->size() != 2) return std::nullopt;
  235|       |      auto index = (*elems)[0].ToCorpus<size_t>();
  236|       |      if (!index || *index >= std::variant_size_v<T>) return std::nullopt;
  237|       |      return Switch<std::variant_size_v<T>>(
  238|       |          *index, [&](auto I) -> std::optional<T> {
  239|       |            auto inner =
  240|       |                (*elems)[1].ToCorpus<std::variant_alternative_t<I, T>>();
  241|       |            if (inner) return T(std::in_place_index<I>, *std::move(inner));
  242|       |            return std::nullopt;
  243|       |          });
  244|       |    } else if constexpr (std::is_same_v<T, absl::int128> ||
  245|       |                         std::is_same_v<T, absl::uint128>) {
  246|       |      if (auto res = ToCorpus<std::pair<uint64_t, uint64_t>>()) {
  247|       |        return static_cast<T>(absl::MakeUint128(res->first, res->second));
  248|       |      }
  249|       |      return std::nullopt;
  250|       |    } else if constexpr (is_protocol_buffer_v<T>) {
  251|       |      const std::string* v = std::get_if<std::string>(&value);
  252|       |      T out;
  253|       |      if (v && out.ParseFromString(*v)) return out;
  254|       |      return std::nullopt;
  255|       |    } else if constexpr (is_dynamic_container_v<T>) {
  256|       |      if constexpr (is_bytevector_v<T>) {
  257|       |        const std::string* v = std::get_if<std::string>(&value);
  258|       |        if (v) {
  259|       |          T out;
  260|       |          out.resize(v->size());
  261|       |          std::memcpy(out.data(), v->data(), v->size());
  262|       |          return out;
  263|       |        }
  264|       |      }
  265|       |
  266|       |      auto elems = Subs();
  267|       |      if (!elems) return std::nullopt;
  268|       |
  269|       |      T out;
  270|       |      for (const auto& elem : *elems) {
  271|       |        if (auto inner = elem.ToCorpus<typename T::value_type>()) {
  272|       |          out.insert(out.end(), *std::move(inner));
  273|       |        } else {
  274|       |          return std::nullopt;
  275|       |        }
  276|       |      }
  277|       |      return out;
  278|       |    } else {
  279|       |      // Must be a tuple like object.
  280|       |      auto elems = Subs();
  281|       |      if (!elems || elems->size() != std::tuple_size_v<T>) return std::nullopt;
  282|       |      auto it = elems->begin();
  283|       |      auto parts = ApplyIndex<std::tuple_size_v<T>>([&](auto... I) {
  284|       |        return std::tuple{it++->ToCorpus<std::tuple_element_t<I, T>>()...};
  285|       |      });
  286|       |      return std::apply(
  287|       |          [&](auto&... part) -> std::optional<T> {
  288|       |            if ((!part || ...)) return std::nullopt;
  289|       |            return T{*std::move(part)...};
  290|       |          },
  291|       |          parts);
  292|       |    }
  293|   103k|  }
_ZNK8fuzztest8internal8IRObject9GetScalarIjEEDav:
   90|  41.2k|  auto GetScalar() const {
   91|       |    if constexpr (std::is_enum_v<T>) {
   92|       |      auto inner = GetScalar<std::underlying_type_t<T>>();
   93|       |      return inner ? std::optional(static_cast<T>(*inner)) : std::nullopt;
   94|  41.2k|    } else if constexpr (std::is_integral_v<T>) {
   95|  41.2k|      const uint64_t* i = std::get_if<uint64_t>(&value);
   96|  41.2k|      return i != nullptr ? std::optional(static_cast<T>(*i)) : std::nullopt;
  ------------------
  |  Branch (96:14): [True: 41.2k, False: 8]
  ------------------
   97|       |    } else if constexpr (std::is_same_v<float, T> ||
   98|       |                         std::is_same_v<double, T>) {
   99|       |      const double* i = std::get_if<double>(&value);
  100|       |      return i != nullptr ? std::optional(static_cast<T>(*i)) : std::nullopt;
  101|       |    } else if constexpr (std::is_same_v<std::string, T>) {
  102|       |      std::optional<absl::string_view> out;
  103|       |      if (const auto* s = std::get_if<std::string>(&value)) {
  104|       |        out = *s;
  105|       |      }
  106|       |      return out;
  107|       |    }
  108|  41.2k|  }
_ZNK8fuzztest8internal8IRObject8ToCorpusIjEENSt3__18optionalIT_EEv:
  219|  41.2k|  std::optional<T> ToCorpus() const {
  220|       |    if constexpr (std::is_const_v<T>) {
  221|       |      return ToCorpus<std::remove_const_t<T>>();
  222|       |    } else if constexpr (is_monostate_v<T>) {
  223|       |      if (std::holds_alternative<std::monostate>(value)) return T{};
  224|       |      return std::nullopt;
  225|       |    } else if constexpr (std::is_same_v<T, IRObject>) {
  226|       |      return *this;
  227|  41.2k|    } else if constexpr (std::is_constructible_v<IRObject, T>) {
  228|  41.2k|      if (auto v = GetScalar<T>()) {
  ------------------
  |  Branch (228:16): [True: 41.2k, False: 8]
  ------------------
  229|  41.2k|        return static_cast<T>(*v);
  230|  41.2k|      }
  231|      8|      return std::nullopt;
  232|       |    } else if constexpr (is_variant_v<T>) {
  233|       |      auto elems = Subs();
  234|       |      if (!elems || elems->size() != 2) return std::nullopt;
  235|       |      auto index = (*elems)[0].ToCorpus<size_t>();
  236|       |      if (!index || *index >= std::variant_size_v<T>) return std::nullopt;
  237|       |      return Switch<std::variant_size_v<T>>(
  238|       |          *index, [&](auto I) -> std::optional<T> {
  239|       |            auto inner =
  240|       |                (*elems)[1].ToCorpus<std::variant_alternative_t<I, T>>();
  241|       |            if (inner) return T(std::in_place_index<I>, *std::move(inner));
  242|       |            return std::nullopt;
  243|       |          });
  244|       |    } else if constexpr (std::is_same_v<T, absl::int128> ||
  245|       |                         std::is_same_v<T, absl::uint128>) {
  246|       |      if (auto res = ToCorpus<std::pair<uint64_t, uint64_t>>()) {
  247|       |        return static_cast<T>(absl::MakeUint128(res->first, res->second));
  248|       |      }
  249|       |      return std::nullopt;
  250|       |    } else if constexpr (is_protocol_buffer_v<T>) {
  251|       |      const std::string* v = std::get_if<std::string>(&value);
  252|       |      T out;
  253|       |      if (v && out.ParseFromString(*v)) return out;
  254|       |      return std::nullopt;
  255|       |    } else if constexpr (is_dynamic_container_v<T>) {
  256|       |      if constexpr (is_bytevector_v<T>) {
  257|       |        const std::string* v = std::get_if<std::string>(&value);
  258|       |        if (v) {
  259|       |          T out;
  260|       |          out.resize(v->size());
  261|       |          std::memcpy(out.data(), v->data(), v->size());
  262|       |          return out;
  263|       |        }
  264|       |      }
  265|       |
  266|       |      auto elems = Subs();
  267|       |      if (!elems) return std::nullopt;
  268|       |
  269|       |      T out;
  270|       |      for (const auto& elem : *elems) {
  271|       |        if (auto inner = elem.ToCorpus<typename T::value_type>()) {
  272|       |          out.insert(out.end(), *std::move(inner));
  273|       |        } else {
  274|       |          return std::nullopt;
  275|       |        }
  276|       |      }
  277|       |      return out;
  278|       |    } else {
  279|       |      // Must be a tuple like object.
  280|       |      auto elems = Subs();
  281|       |      if (!elems || elems->size() != std::tuple_size_v<T>) return std::nullopt;
  282|       |      auto it = elems->begin();
  283|       |      auto parts = ApplyIndex<std::tuple_size_v<T>>([&](auto... I) {
  284|       |        return std::tuple{it++->ToCorpus<std::tuple_element_t<I, T>>()...};
  285|       |      });
  286|       |      return std::apply(
  287|       |          [&](auto&... part) -> std::optional<T> {
  288|       |            if ((!part || ...)) return std::nullopt;
  289|       |            return T{*std::move(part)...};
  290|       |          },
  291|       |          parts);
  292|       |    }
  293|  41.2k|  }
_ZNK8fuzztest8internal8IRObject8ToCorpusINSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEEENS3_8optionalIT_EEv:
  219|  20.6k|  std::optional<T> ToCorpus() const {
  220|       |    if constexpr (std::is_const_v<T>) {
  221|       |      return ToCorpus<std::remove_const_t<T>>();
  222|       |    } else if constexpr (is_monostate_v<T>) {
  223|       |      if (std::holds_alternative<std::monostate>(value)) return T{};
  224|       |      return std::nullopt;
  225|       |    } else if constexpr (std::is_same_v<T, IRObject>) {
  226|       |      return *this;
  227|  20.6k|    } else if constexpr (std::is_constructible_v<IRObject, T>) {
  228|  20.6k|      if (auto v = GetScalar<T>()) {
  ------------------
  |  Branch (228:16): [True: 20.6k, False: 30]
  ------------------
  229|  20.6k|        return static_cast<T>(*v);
  230|  20.6k|      }
  231|     30|      return std::nullopt;
  232|       |    } else if constexpr (is_variant_v<T>) {
  233|       |      auto elems = Subs();
  234|       |      if (!elems || elems->size() != 2) return std::nullopt;
  235|       |      auto index = (*elems)[0].ToCorpus<size_t>();
  236|       |      if (!index || *index >= std::variant_size_v<T>) return std::nullopt;
  237|       |      return Switch<std::variant_size_v<T>>(
  238|       |          *index, [&](auto I) -> std::optional<T> {
  239|       |            auto inner =
  240|       |                (*elems)[1].ToCorpus<std::variant_alternative_t<I, T>>();
  241|       |            if (inner) return T(std::in_place_index<I>, *std::move(inner));
  242|       |            return std::nullopt;
  243|       |          });
  244|       |    } else if constexpr (std::is_same_v<T, absl::int128> ||
  245|       |                         std::is_same_v<T, absl::uint128>) {
  246|       |      if (auto res = ToCorpus<std::pair<uint64_t, uint64_t>>()) {
  247|       |        return static_cast<T>(absl::MakeUint128(res->first, res->second));
  248|       |      }
  249|       |      return std::nullopt;
  250|       |    } else if constexpr (is_protocol_buffer_v<T>) {
  251|       |      const std::string* v = std::get_if<std::string>(&value);
  252|       |      T out;
  253|       |      if (v && out.ParseFromString(*v)) return out;
  254|       |      return std::nullopt;
  255|       |    } else if constexpr (is_dynamic_container_v<T>) {
  256|       |      if constexpr (is_bytevector_v<T>) {
  257|       |        const std::string* v = std::get_if<std::string>(&value);
  258|       |        if (v) {
  259|       |          T out;
  260|       |          out.resize(v->size());
  261|       |          std::memcpy(out.data(), v->data(), v->size());
  262|       |          return out;
  263|       |        }
  264|       |      }
  265|       |
  266|       |      auto elems = Subs();
  267|       |      if (!elems) return std::nullopt;
  268|       |
  269|       |      T out;
  270|       |      for (const auto& elem : *elems) {
  271|       |        if (auto inner = elem.ToCorpus<typename T::value_type>()) {
  272|       |          out.insert(out.end(), *std::move(inner));
  273|       |        } else {
  274|       |          return std::nullopt;
  275|       |        }
  276|       |      }
  277|       |      return out;
  278|       |    } else {
  279|       |      // Must be a tuple like object.
  280|       |      auto elems = Subs();
  281|       |      if (!elems || elems->size() != std::tuple_size_v<T>) return std::nullopt;
  282|       |      auto it = elems->begin();
  283|       |      auto parts = ApplyIndex<std::tuple_size_v<T>>([&](auto... I) {
  284|       |        return std::tuple{it++->ToCorpus<std::tuple_element_t<I, T>>()...};
  285|       |      });
  286|       |      return std::apply(
  287|       |          [&](auto&... part) -> std::optional<T> {
  288|       |            if ((!part || ...)) return std::nullopt;
  289|       |            return T{*std::move(part)...};
  290|       |          },
  291|       |          parts);
  292|       |    }
  293|  20.6k|  }
_ZNK8fuzztest8internal8IRObject8ToCorpusIiEENSt3__18optionalIT_EEv:
  219|  20.6k|  std::optional<T> ToCorpus() const {
  220|       |    if constexpr (std::is_const_v<T>) {
  221|       |      return ToCorpus<std::remove_const_t<T>>();
  222|       |    } else if constexpr (is_monostate_v<T>) {
  223|       |      if (std::holds_alternative<std::monostate>(value)) return T{};
  224|       |      return std::nullopt;
  225|       |    } else if constexpr (std::is_same_v<T, IRObject>) {
  226|       |      return *this;
  227|  20.6k|    } else if constexpr (std::is_constructible_v<IRObject, T>) {
  228|  20.6k|      if (auto v = GetScalar<T>()) {
  ------------------
  |  Branch (228:16): [True: 20.6k, False: 2]
  ------------------
  229|  20.6k|        return static_cast<T>(*v);
  230|  20.6k|      }
  231|      2|      return std::nullopt;
  232|       |    } else if constexpr (is_variant_v<T>) {
  233|       |      auto elems = Subs();
  234|       |      if (!elems || elems->size() != 2) return std::nullopt;
  235|       |      auto index = (*elems)[0].ToCorpus<size_t>();
  236|       |      if (!index || *index >= std::variant_size_v<T>) return std::nullopt;
  237|       |      return Switch<std::variant_size_v<T>>(
  238|       |          *index, [&](auto I) -> std::optional<T> {
  239|       |            auto inner =
  240|       |                (*elems)[1].ToCorpus<std::variant_alternative_t<I, T>>();
  241|       |            if (inner) return T(std::in_place_index<I>, *std::move(inner));
  242|       |            return std::nullopt;
  243|       |          });
  244|       |    } else if constexpr (std::is_same_v<T, absl::int128> ||
  245|       |                         std::is_same_v<T, absl::uint128>) {
  246|       |      if (auto res = ToCorpus<std::pair<uint64_t, uint64_t>>()) {
  247|       |        return static_cast<T>(absl::MakeUint128(res->first, res->second));
  248|       |      }
  249|       |      return std::nullopt;
  250|       |    } else if constexpr (is_protocol_buffer_v<T>) {
  251|       |      const std::string* v = std::get_if<std::string>(&value);
  252|       |      T out;
  253|       |      if (v && out.ParseFromString(*v)) return out;
  254|       |      return std::nullopt;
  255|       |    } else if constexpr (is_dynamic_container_v<T>) {
  256|       |      if constexpr (is_bytevector_v<T>) {
  257|       |        const std::string* v = std::get_if<std::string>(&value);
  258|       |        if (v) {
  259|       |          T out;
  260|       |          out.resize(v->size());
  261|       |          std::memcpy(out.data(), v->data(), v->size());
  262|       |          return out;
  263|       |        }
  264|       |      }
  265|       |
  266|       |      auto elems = Subs();
  267|       |      if (!elems) return std::nullopt;
  268|       |
  269|       |      T out;
  270|       |      for (const auto& elem : *elems) {
  271|       |        if (auto inner = elem.ToCorpus<typename T::value_type>()) {
  272|       |          out.insert(out.end(), *std::move(inner));
  273|       |        } else {
  274|       |          return std::nullopt;
  275|       |        }
  276|       |      }
  277|       |      return out;
  278|       |    } else {
  279|       |      // Must be a tuple like object.
  280|       |      auto elems = Subs();
  281|       |      if (!elems || elems->size() != std::tuple_size_v<T>) return std::nullopt;
  282|       |      auto it = elems->begin();
  283|       |      auto parts = ApplyIndex<std::tuple_size_v<T>>([&](auto... I) {
  284|       |        return std::tuple{it++->ToCorpus<std::tuple_element_t<I, T>>()...};
  285|       |      });
  286|       |      return std::apply(
  287|       |          [&](auto&... part) -> std::optional<T> {
  288|       |            if ((!part || ...)) return std::nullopt;
  289|       |            return T{*std::move(part)...};
  290|       |          },
  291|       |          parts);
  292|       |    }
  293|  20.6k|  }
_ZNK8fuzztest8internal8IRObject9GetScalarIbEEDav:
   90|   103k|  auto GetScalar() const {
   91|       |    if constexpr (std::is_enum_v<T>) {
   92|       |      auto inner = GetScalar<std::underlying_type_t<T>>();
   93|       |      return inner ? std::optional(static_cast<T>(*inner)) : std::nullopt;
   94|   103k|    } else if constexpr (std::is_integral_v<T>) {
   95|   103k|      const uint64_t* i = std::get_if<uint64_t>(&value);
   96|   103k|      return i != nullptr ? std::optional(static_cast<T>(*i)) : std::nullopt;
  ------------------
  |  Branch (96:14): [True: 103k, False: 60]
  ------------------
   97|       |    } else if constexpr (std::is_same_v<float, T> ||
   98|       |                         std::is_same_v<double, T>) {
   99|       |      const double* i = std::get_if<double>(&value);
  100|       |      return i != nullptr ? std::optional(static_cast<T>(*i)) : std::nullopt;
  101|       |    } else if constexpr (std::is_same_v<std::string, T>) {
  102|       |      std::optional<absl::string_view> out;
  103|       |      if (const auto* s = std::get_if<std::string>(&value)) {
  104|       |        out = *s;
  105|       |      }
  106|       |      return out;
  107|       |    }
  108|   103k|  }
_ZNK8fuzztest8internal8IRObject8ToCorpusIbEENSt3__18optionalIT_EEv:
  219|   103k|  std::optional<T> ToCorpus() const {
  220|       |    if constexpr (std::is_const_v<T>) {
  221|       |      return ToCorpus<std::remove_const_t<T>>();
  222|       |    } else if constexpr (is_monostate_v<T>) {
  223|       |      if (std::holds_alternative<std::monostate>(value)) return T{};
  224|       |      return std::nullopt;
  225|       |    } else if constexpr (std::is_same_v<T, IRObject>) {
  226|       |      return *this;
  227|   103k|    } else if constexpr (std::is_constructible_v<IRObject, T>) {
  228|   103k|      if (auto v = GetScalar<T>()) {
  ------------------
  |  Branch (228:16): [True: 103k, False: 60]
  ------------------
  229|   103k|        return static_cast<T>(*v);
  230|   103k|      }
  231|     60|      return std::nullopt;
  232|       |    } else if constexpr (is_variant_v<T>) {
  233|       |      auto elems = Subs();
  234|       |      if (!elems || elems->size() != 2) return std::nullopt;
  235|       |      auto index = (*elems)[0].ToCorpus<size_t>();
  236|       |      if (!index || *index >= std::variant_size_v<T>) return std::nullopt;
  237|       |      return Switch<std::variant_size_v<T>>(
  238|       |          *index, [&](auto I) -> std::optional<T> {
  239|       |            auto inner =
  240|       |                (*elems)[1].ToCorpus<std::variant_alternative_t<I, T>>();
  241|       |            if (inner) return T(std::in_place_index<I>, *std::move(inner));
  242|       |            return std::nullopt;
  243|       |          });
  244|       |    } else if constexpr (std::is_same_v<T, absl::int128> ||
  245|       |                         std::is_same_v<T, absl::uint128>) {
  246|       |      if (auto res = ToCorpus<std::pair<uint64_t, uint64_t>>()) {
  247|       |        return static_cast<T>(absl::MakeUint128(res->first, res->second));
  248|       |      }
  249|       |      return std::nullopt;
  250|       |    } else if constexpr (is_protocol_buffer_v<T>) {
  251|       |      const std::string* v = std::get_if<std::string>(&value);
  252|       |      T out;
  253|       |      if (v && out.ParseFromString(*v)) return out;
  254|       |      return std::nullopt;
  255|       |    } else if constexpr (is_dynamic_container_v<T>) {
  256|       |      if constexpr (is_bytevector_v<T>) {
  257|       |        const std::string* v = std::get_if<std::string>(&value);
  258|       |        if (v) {
  259|       |          T out;
  260|       |          out.resize(v->size());
  261|       |          std::memcpy(out.data(), v->data(), v->size());
  262|       |          return out;
  263|       |        }
  264|       |      }
  265|       |
  266|       |      auto elems = Subs();
  267|       |      if (!elems) return std::nullopt;
  268|       |
  269|       |      T out;
  270|       |      for (const auto& elem : *elems) {
  271|       |        if (auto inner = elem.ToCorpus<typename T::value_type>()) {
  272|       |          out.insert(out.end(), *std::move(inner));
  273|       |        } else {
  274|       |          return std::nullopt;
  275|       |        }
  276|       |      }
  277|       |      return out;
  278|       |    } else {
  279|       |      // Must be a tuple like object.
  280|       |      auto elems = Subs();
  281|       |      if (!elems || elems->size() != std::tuple_size_v<T>) return std::nullopt;
  282|       |      auto it = elems->begin();
  283|       |      auto parts = ApplyIndex<std::tuple_size_v<T>>([&](auto... I) {
  284|       |        return std::tuple{it++->ToCorpus<std::tuple_element_t<I, T>>()...};
  285|       |      });
  286|       |      return std::apply(
  287|       |          [&](auto&... part) -> std::optional<T> {
  288|       |            if ((!part || ...)) return std::nullopt;
  289|       |            return T{*std::move(part)...};
  290|       |          },
  291|       |          parts);
  292|       |    }
  293|   103k|  }
_ZN8fuzztest8internal8IRObjectC2Ev:
   76|   378k|  IRObject() = default;

_Z10SetMessageRKN4absl12lts_202401166StatusENSt3__117basic_string_viewIcNS4_11char_traitsIcEEEE:
    8|    393|absl::Status SetMessage(const absl::Status& status, absl::string_view message) {
    9|    393|  absl::Status result(status.code(), message);
   10|    393|  status.ForEachPayload(
   11|    393|      [&](absl::string_view type_url, const absl::Cord& payload) {
   12|    393|        result.SetPayload(type_url, payload);
   13|    393|      });
   14|    393|  return result;
   15|    393|}
_Z6PrefixRKN4absl12lts_202401166StatusENSt3__117basic_string_viewIcNS4_11char_traitsIcEEEE:
   17|   329k|absl::Status Prefix(const absl::Status& status, absl::string_view prefix) {
   18|   329k|  if (status.ok() || prefix.empty()) return status;
  ------------------
  |  Branch (18:7): [True: 328k, False: 393]
  |  Branch (18:22): [True: 0, False: 393]
  ------------------
   19|    393|  return SetMessage(status, absl::StrCat(prefix, " >> ", status.message()));
   20|   329k|}

_ZNK8fuzztest8internal15IntegralPrinter14PrintUserValueEN4absl12lts_202401166int128ENS3_13FormatRawSinkENS_18domain_implementor9PrintModeE:
   65|     36|                                     PrintMode) const {
   66|     36|  absl::Format(out, "%d", v);
   67|     36|}

_ZNK8fuzztest8internal15IntegralPrinter14PrintUserValueIiEEvT_N4absl12lts_2024011613FormatRawSinkENS_18domain_implementor9PrintModeE:
  124|     36|                      domain_implementor::PrintMode mode) const {
  125|       |    if constexpr (std::is_enum_v<T>) {
  126|       |      // TODO(sbenzaquen): Try to use enum labels where possible.
  127|       |      // Use static_cast<> when printing source code to avoid init conversion.
  128|       |      switch (mode) {
  129|       |        case domain_implementor::PrintMode::kHumanReadable:
  130|       |          absl::Format(out, "%s{", GetTypeName<T>());
  131|       |          break;
  132|       |        case domain_implementor::PrintMode::kSourceCode:
  133|       |          absl::Format(out, "static_cast<%s>(", GetTypeName<T>());
  134|       |          break;
  135|       |      }
  136|       |      PrintUserValue(static_cast<std::underlying_type_t<T>>(v), out, mode);
  137|       |      switch (mode) {
  138|       |        case domain_implementor::PrintMode::kHumanReadable:
  139|       |          absl::Format(out, "}");
  140|       |          break;
  141|       |        case domain_implementor::PrintMode::kSourceCode:
  142|       |          absl::Format(out, ")");
  143|       |          break;
  144|       |      }
  145|     36|    } else if constexpr (std::is_signed_v<T>) {
  146|       |      // Cast to [u]int128 to cover all integral types.
  147|     36|      PrintUserValue(static_cast<absl::int128>(v), out, mode);
  148|       |    } else {
  149|       |      PrintUserValue(static_cast<absl::uint128>(v), out, mode);
  150|       |    }
  151|     36|  }

_ZN7testing15AssertionResultC2IbEERKT_PNSt3__19enable_ifIXntsr3std14is_convertibleIS2_S0_EE5valueEvE4typeE:
  161|  96.8k|      : success_(success) {}

_ZN7testing7MessagelsIKcEERS0_RKPT_:
  182|     42|  inline Message& operator<<(T* const& pointer) {  // NOLINT
  183|     42|    if (pointer == nullptr) {
  ------------------
  |  Branch (183:9): [True: 0, False: 42]
  ------------------
  184|      0|      *ss_ << "(null)";
  185|     42|    } else {
  186|     42|      *ss_ << pointer;
  187|     42|    }
  188|     42|    return *this;
  189|     42|  }
_ZN7testing8internal18StreamableToStringIiEENSt3__112basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEERKT_:
  243|      6|std::string StreamableToString(const T& streamable) {
  244|      6|  return (Message() << streamable).GetString();
  245|      6|}
_ZN7testing7MessagelsIiEERS0_RKT_:
  132|      6|  inline Message& operator<<(const T& val) {
  133|       |        // Some libraries overload << for STL containers.  These
  134|       |    // overloads are defined in the global namespace instead of ::std.
  135|       |    //
  136|       |    // C++'s symbol lookup rule (i.e. Koenig lookup) says that these
  137|       |    // overloads are visible in either the std namespace or the global
  138|       |    // namespace, but not other namespaces, including the testing
  139|       |    // namespace which Google Test's Message class is in.
  140|       |    //
  141|       |    // To allow STL containers (and other types that has a << operator
  142|       |    // defined in the global namespace) to be used in Google Test
  143|       |    // assertions, testing::Message must access the custom << operator
  144|       |    // from the global namespace.  With this using declaration,
  145|       |    // overloads of << defined in the global namespace and those
  146|       |    // visible via Koenig lookup are both exposed in this function.
  147|      6|    using ::operator<<;
  148|      6|    *ss_ << val;
  149|      6|    return *this;
  150|      6|  }
_ZN7testing7MessagelsIA7_cEERS0_RKT_:
  132|     42|  inline Message& operator<<(const T& val) {
  133|       |        // Some libraries overload << for STL containers.  These
  134|       |    // overloads are defined in the global namespace instead of ::std.
  135|       |    //
  136|       |    // C++'s symbol lookup rule (i.e. Koenig lookup) says that these
  137|       |    // overloads are visible in either the std namespace or the global
  138|       |    // namespace, but not other namespaces, including the testing
  139|       |    // namespace which Google Test's Message class is in.
  140|       |    //
  141|       |    // To allow STL containers (and other types that has a << operator
  142|       |    // defined in the global namespace) to be used in Google Test
  143|       |    // assertions, testing::Message must access the custom << operator
  144|       |    // from the global namespace.  With this using declaration,
  145|       |    // overloads of << defined in the global namespace and those
  146|       |    // visible via Koenig lookup are both exposed in this function.
  147|     42|    using ::operator<<;
  148|     42|    *ss_ << val;
  149|     42|    return *this;
  150|     42|  }
_ZN7testing7MessagelsIcEERS0_RKT_:
  132|    828|  inline Message& operator<<(const T& val) {
  133|       |        // Some libraries overload << for STL containers.  These
  134|       |    // overloads are defined in the global namespace instead of ::std.
  135|       |    //
  136|       |    // C++'s symbol lookup rule (i.e. Koenig lookup) says that these
  137|       |    // overloads are visible in either the std namespace or the global
  138|       |    // namespace, but not other namespaces, including the testing
  139|       |    // namespace which Google Test's Message class is in.
  140|       |    //
  141|       |    // To allow STL containers (and other types that has a << operator
  142|       |    // defined in the global namespace) to be used in Google Test
  143|       |    // assertions, testing::Message must access the custom << operator
  144|       |    // from the global namespace.  With this using declaration,
  145|       |    // overloads of << defined in the global namespace and those
  146|       |    // visible via Koenig lookup are both exposed in this function.
  147|    828|    using ::operator<<;
  148|    828|    *ss_ << val;
  149|    828|    return *this;
  150|    828|  }
_ZN7testing8internal18StreamableToStringIPcEENSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEERKT_:
  243|     32|std::string StreamableToString(const T& streamable) {
  244|     32|  return (Message() << streamable).GetString();
  245|     32|}
_ZN7testing7MessagelsIcEERS0_RKPT_:
  182|     32|  inline Message& operator<<(T* const& pointer) {  // NOLINT
  183|     32|    if (pointer == nullptr) {
  ------------------
  |  Branch (183:9): [True: 0, False: 32]
  ------------------
  184|      0|      *ss_ << "(null)";
  185|     32|    } else {
  186|     32|      *ss_ << pointer;
  187|     32|    }
  188|     32|    return *this;
  189|     32|  }

_ZNK7testing8TestInfo15test_suite_nameEv:
  541|      6|  const char* test_suite_name() const { return test_suite_name_.c_str(); }
_ZNK7testing8TestInfo4nameEv:
  549|     10|  const char* name() const { return name_.c_str(); }
_ZN7testing8UnitTest4implEv:
 1267|     50|  internal::UnitTestImpl* impl() { return impl_; }
_ZNK7testing9TestSuite14test_info_listEv:
  762|      2|  const std::vector<TestInfo*>& test_info_list() const {
  763|      2|    return test_info_list_;
  764|      2|  }
_ZN7testing10TestResult19set_start_timestampEl:
  468|      2|  void set_start_timestamp(TimeInMillis start) { start_timestamp_ = start; }
_ZN7testing9TestSuite13ShouldRunTestEPKNS_8TestInfoE:
  839|      6|  static bool ShouldRunTest(const TestInfo* test_info) {
  840|      6|    return test_info->should_run();
  841|      6|  }
_ZN7testing9TestSuite17RunSetUpTestSuiteEv:
  793|      2|  void RunSetUpTestSuite() {
  794|      2|    if (set_up_tc_ != nullptr) {
  ------------------
  |  Branch (794:9): [True: 0, False: 2]
  ------------------
  795|      0|      (*set_up_tc_)();
  796|      0|    }
  797|      2|  }
_ZNK7testing9TestSuite18ad_hoc_test_resultEv:
  752|      4|  const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; }
_ZN7testing8TestInfo15ClearTestResultEPS0_:
  639|      4|  static void ClearTestResult(TestInfo* test_info) {
  640|      4|    test_info->result_.Clear();
  641|      4|  }
_ZNK7testing9TestSuite4nameEv:
  695|      8|  const char* name() const { return name_.c_str(); }
_ZNK7testing9TestSuite10type_paramEv:
  699|      2|  const char* type_param() const {
  700|      2|    if (type_param_ != nullptr) return type_param_->c_str();
  ------------------
  |  Branch (700:9): [True: 0, False: 2]
  ------------------
  701|      2|    return nullptr;
  702|      2|  }
_ZNK7testing9TestSuite10should_runEv:
  705|      8|  bool should_run() const { return should_run_; }
_ZNK7testing8TestInfo10should_runEv:
  590|      6|  bool should_run() const { return should_run_; }
_ZNK7testing8TestInfo10type_paramEv:
  553|      4|  const char* type_param() const {
  554|      4|    if (type_param_ != nullptr) return type_param_->c_str();
  ------------------
  |  Branch (554:9): [True: 0, False: 4]
  ------------------
  555|      4|    return nullptr;
  556|      4|  }
_ZNK7testing8UnitTest4implEv:
 1268|     22|  const internal::UnitTestImpl* impl() const { return impl_; }
_ZN7testing9TestSuite20ClearTestSuiteResultEPS0_:
  781|      4|  static void ClearTestSuiteResult(TestSuite* test_suite) {
  782|      4|    test_suite->ClearResult();
  783|      4|  }
_ZN7testing9TestSuite14set_should_runEb:
  771|      8|  void set_should_run(bool should) { should_run_ = should; }
_ZN7testing9TestSuite14test_info_listEv:
  759|     12|  std::vector<TestInfo*>& test_info_list() { return test_info_list_; }
_ZN7testing17TestEventListener16OnTestSuiteStartERKNS_9TestSuiteE:
  946|      2|  virtual void OnTestSuiteStart(const TestSuite& /*test_suite*/) {}

_ZN7testing8internal8FilePathC2ERKNSt3__112basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEE:
   74|      2|  explicit FilePath(const std::string& pathname) : pathname_(pathname) {
   75|      2|    Normalize();
   76|      2|  }
_ZNK7testing8internal8FilePath7IsEmptyEv:
  118|      6|  bool IsEmpty() const { return pathname_.empty(); }
_ZN7testing8internal8FilePathC2Ev:
   71|      2|  FilePath() : pathname_("") {}
_ZN7testing8internal8FilePath3SetERKS1_:
   83|      2|  void Set(const FilePath& rhs) { pathname_ = rhs.pathname_; }

_ZN7testing8internal6RandomC2Ej:
  859|      2|  explicit Random(uint32_t seed) : state_(seed) {}
_ZN7testing8internal9GetTypeIdINS_4TestEEEPKvv:
  419|      2|TypeId GetTypeId() {
  420|       |  // The compiler is required to allocate a different
  421|       |  // TypeIdHelper<T>::dummy_ variable for each T used to instantiate
  422|       |  // the template.  Therefore, the address of dummy_ is guaranteed to
  423|       |  // be unique.
  424|      2|  return &(TypeIdHelper<T>::dummy_);
  425|      2|}

_ZN7testing8internal30ParameterizedTestSuiteRegistry13RegisterTestsEv:
  732|      2|  void RegisterTests() {
  733|      2|    for (auto& test_suite_info : test_suite_infos_) {
  ------------------
  |  Branch (733:32): [True: 0, False: 2]
  ------------------
  734|      0|      test_suite_info->RegisterTests();
  735|      0|    }
  736|      2|  }
_ZN7testing8internal30ParameterizedTestSuiteRegistryC2Ev:
  694|      2|  ParameterizedTestSuiteRegistry() = default;

_ZN7testing8internal7ToUpperEc:
 1971|    828|inline char ToUpper(char ch) {
 1972|    828|  return static_cast<char>(toupper(static_cast<unsigned char>(ch)));
 1973|    828|}
_ZN7testing8internal5posix6GetEnvEPKc:
 2122|     62|inline const char* GetEnv(const char* name) {
 2123|       |#if defined(GTEST_OS_WINDOWS_MOBILE) || defined(GTEST_OS_WINDOWS_PHONE) || \
 2124|       |    defined(GTEST_OS_ESP8266) || defined(GTEST_OS_XTENSA) ||               \
 2125|       |    defined(GTEST_OS_QURT)
 2126|       |  // We are on an embedded platform, which has no environment variables.
 2127|       |  static_cast<void>(name);  // To prevent 'unused argument' warning.
 2128|       |  return nullptr;
 2129|       |#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
 2130|       |  // Environment variables which we programmatically clear will be set to the
 2131|       |  // empty string rather than unset (NULL).  Handle that case.
 2132|       |  const char* const env = getenv(name);
 2133|       |  return (env != nullptr && env[0] != '\0') ? env : nullptr;
 2134|       |#else
 2135|     62|  return getenv(name);
 2136|     62|#endif
 2137|     62|}
_ZN7testing8internal5posix10StrCaseCmpEPKcS3_:
 2059|      2|inline int StrCaseCmp(const char* s1, const char* s2) {
 2060|      2|  return strcasecmp(s1, s2);
 2061|      2|}
_ZN7testing8internal5MutexC2Ev:
 1706|     14|  Mutex() {
 1707|     14|    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
  ------------------
  |  | 1092|     14|  if (const int gtest_error = (posix_call))    \
  |  |  ------------------
  |  |  |  Branch (1092:17): [True: 0, False: 14]
  |  |  ------------------
  |  | 1093|     14|  GTEST_LOG_(FATAL) << #posix_call << "failed with error " << gtest_error
  |  |  ------------------
  |  |  |  | 1054|      0|  ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
  |  |  |  | 1055|      0|                                __FILE__, __LINE__)                    \
  |  |  |  | 1056|      0|      .GetStream()
  |  |  ------------------
  ------------------
 1708|     14|    has_owner_ = false;
 1709|     14|  }
_ZN7testing8internal5posix6IsATTYEi:
 2065|      2|inline int IsATTY(int fd) {
 2066|       |  // DoIsATTY might change errno (for example ENOTTY in case you redirect stdout
 2067|       |  // to a file on Linux), which is unexpected, so save the previous value, and
 2068|       |  // restore it after the call.
 2069|      2|  int savedErrno = errno;
 2070|      2|  int isAttyValue = DoIsATTY(fd);
 2071|      2|  errno = savedErrno;
 2072|       |
 2073|      2|  return isAttyValue;
 2074|      2|}
_ZN7testing8internal5posix8DoIsATTYEi:
 2058|      2|inline int DoIsATTY(int fd) { return isatty(fd); }
_ZN7testing8internal5posix6FileNoEP8_IO_FILE:
 2021|      2|inline int FileNo(FILE* file) { return fileno(file); }
_ZN7testing8internal11ThreadLocalIPNS_31TestPartResultReporterInterfaceEEC2ERKS3_:
 1761|      2|      : key_(CreateKey()),
 1762|      2|        default_factory_(new InstanceValueHolderFactory(value)) {}
_ZN7testing8internal11ThreadLocalIPNS_31TestPartResultReporterInterfaceEE9CreateKeyEv:
 1793|      2|  static pthread_key_t CreateKey() {
 1794|      2|    pthread_key_t key;
 1795|       |    // When a thread exits, DeleteThreadLocalValue() will be called on
 1796|       |    // the object managed for that thread.
 1797|      2|    GTEST_CHECK_POSIX_SUCCESS_(
  ------------------
  |  | 1092|      2|  if (const int gtest_error = (posix_call))    \
  |  |  ------------------
  |  |  |  Branch (1092:17): [True: 0, False: 2]
  |  |  ------------------
  |  | 1093|      2|  GTEST_LOG_(FATAL) << #posix_call << "failed with error " << gtest_error
  |  |  ------------------
  |  |  |  | 1054|      0|  ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
  |  |  |  | 1055|      0|                                __FILE__, __LINE__)                    \
  |  |  |  | 1056|      0|      .GetStream()
  |  |  ------------------
  ------------------
 1798|      2|        pthread_key_create(&key, &DeleteThreadLocalValue));
 1799|      2|    return key;
 1800|      2|  }
_ZN7testing8internal11ThreadLocalIPNS_31TestPartResultReporterInterfaceEE26InstanceValueHolderFactoryC2ERKS3_:
 1839|      2|    explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
_ZN7testing8internal11ThreadLocalIPNS_31TestPartResultReporterInterfaceEE18ValueHolderFactoryC2Ev:
 1817|      2|    ValueHolderFactory() = default;
_ZN7testing8internal11ThreadLocalINSt3__16vectorINS0_9TraceInfoENS2_9allocatorIS4_EEEEEC2Ev:
 1759|      2|      : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {}
_ZN7testing8internal11ThreadLocalINSt3__16vectorINS0_9TraceInfoENS2_9allocatorIS4_EEEEE9CreateKeyEv:
 1793|      2|  static pthread_key_t CreateKey() {
 1794|      2|    pthread_key_t key;
 1795|       |    // When a thread exits, DeleteThreadLocalValue() will be called on
 1796|       |    // the object managed for that thread.
 1797|      2|    GTEST_CHECK_POSIX_SUCCESS_(
  ------------------
  |  | 1092|      2|  if (const int gtest_error = (posix_call))    \
  |  |  ------------------
  |  |  |  Branch (1092:17): [True: 0, False: 2]
  |  |  ------------------
  |  | 1093|      2|  GTEST_LOG_(FATAL) << #posix_call << "failed with error " << gtest_error
  |  |  ------------------
  |  |  |  | 1054|      0|  ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
  |  |  |  | 1055|      0|                                __FILE__, __LINE__)                    \
  |  |  |  | 1056|      0|      .GetStream()
  |  |  ------------------
  ------------------
 1798|      2|        pthread_key_create(&key, &DeleteThreadLocalValue));
 1799|      2|    return key;
 1800|      2|  }
_ZN7testing8internal11ThreadLocalINSt3__16vectorINS0_9TraceInfoENS2_9allocatorIS4_EEEEE25DefaultValueHolderFactoryC2Ev:
 1828|      2|    DefaultValueHolderFactory() = default;
_ZN7testing8internal11ThreadLocalINSt3__16vectorINS0_9TraceInfoENS2_9allocatorIS4_EEEEE18ValueHolderFactoryC2Ev:
 1817|      2|    ValueHolderFactory() = default;

_ZN7testing16AssertionSuccessEv:
   66|  96.8k|AssertionResult AssertionSuccess() { return AssertionResult(true); }

_ZN7testing8internal29ParseInternalRunDeathTestFlagEv:
 1531|      2|InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
 1532|      2|  if (GTEST_FLAG_GET(internal_run_death_test).empty()) return nullptr;
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  |  Branch (1532:7): [True: 2, False: 0]
  ------------------
 1533|       |
 1534|       |  // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
 1535|       |  // can use it here.
 1536|      0|  int line = -1;
 1537|      0|  int index = -1;
 1538|      0|  ::std::vector< ::std::string> fields;
 1539|      0|  SplitString(GTEST_FLAG_GET(internal_run_death_test), '|', &fields);
  ------------------
  |  | 2293|      0|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
 1540|      0|  int write_fd = -1;
 1541|       |
 1542|       |#ifdef GTEST_OS_WINDOWS
 1543|       |
 1544|       |  unsigned int parent_process_id = 0;
 1545|       |  size_t write_handle_as_size_t = 0;
 1546|       |  size_t event_handle_as_size_t = 0;
 1547|       |
 1548|       |  if (fields.size() != 6 || !ParseNaturalNumber(fields[1], &line) ||
 1549|       |      !ParseNaturalNumber(fields[2], &index) ||
 1550|       |      !ParseNaturalNumber(fields[3], &parent_process_id) ||
 1551|       |      !ParseNaturalNumber(fields[4], &write_handle_as_size_t) ||
 1552|       |      !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
 1553|       |    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
 1554|       |                   GTEST_FLAG_GET(internal_run_death_test));
 1555|       |  }
 1556|       |  write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t,
 1557|       |                                     event_handle_as_size_t);
 1558|       |
 1559|       |#elif defined(GTEST_OS_FUCHSIA)
 1560|       |
 1561|       |  if (fields.size() != 3 || !ParseNaturalNumber(fields[1], &line) ||
 1562|       |      !ParseNaturalNumber(fields[2], &index)) {
 1563|       |    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
 1564|       |                   GTEST_FLAG_GET(internal_run_death_test));
 1565|       |  }
 1566|       |
 1567|       |#else
 1568|       |
 1569|      0|  if (fields.size() != 4 || !ParseNaturalNumber(fields[1], &line) ||
  ------------------
  |  Branch (1569:7): [True: 0, False: 0]
  |  Branch (1569:29): [True: 0, False: 0]
  ------------------
 1570|      0|      !ParseNaturalNumber(fields[2], &index) ||
  ------------------
  |  Branch (1570:7): [True: 0, False: 0]
  ------------------
 1571|      0|      !ParseNaturalNumber(fields[3], &write_fd)) {
  ------------------
  |  Branch (1571:7): [True: 0, False: 0]
  ------------------
 1572|      0|    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
 1573|      0|                   GTEST_FLAG_GET(internal_run_death_test));
  ------------------
  |  | 2293|      0|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
 1574|      0|  }
 1575|       |
 1576|      0|#endif  // GTEST_OS_WINDOWS
 1577|       |
 1578|      0|  return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
 1579|      2|}

_ZN7testing8internal8FilePath13GetCurrentDirEv:
  101|      2|FilePath FilePath::GetCurrentDir() {
  102|       |#if defined(GTEST_OS_WINDOWS_MOBILE) || defined(GTEST_OS_WINDOWS_PHONE) || \
  103|       |    defined(GTEST_OS_WINDOWS_RT) || defined(GTEST_OS_ESP8266) ||           \
  104|       |    defined(GTEST_OS_ESP32) || defined(GTEST_OS_XTENSA) ||                 \
  105|       |    defined(GTEST_OS_QURT) || defined(GTEST_OS_NXP_QN9090) ||              \
  106|       |    defined(GTEST_OS_NRF52)
  107|       |  // These platforms do not have a current directory, so we just return
  108|       |  // something reasonable.
  109|       |  return FilePath(kCurrentDirectoryString);
  110|       |#elif defined(GTEST_OS_WINDOWS)
  111|       |  char cwd[GTEST_PATH_MAX_ + 1] = {'\0'};
  112|       |  return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd);
  113|       |#else
  114|      2|  char cwd[GTEST_PATH_MAX_ + 1] = {'\0'};
  115|      2|  char* result = getcwd(cwd, sizeof(cwd));
  116|       |#ifdef GTEST_OS_NACL
  117|       |  // getcwd will likely fail in NaCl due to the sandbox, so return something
  118|       |  // reasonable. The user may have provided a shim implementation for getcwd,
  119|       |  // however, so fallback only when failure is detected.
  120|       |  return FilePath(result == nullptr ? kCurrentDirectoryString : cwd);
  121|       |#endif  // GTEST_OS_NACL
  122|      2|  return FilePath(result == nullptr ? "" : cwd);
  ------------------
  |  Branch (122:19): [True: 0, False: 2]
  ------------------
  123|      2|#endif  // GTEST_OS_WINDOWS_MOBILE
  124|      2|}
_ZN7testing8internal8FilePath9NormalizeEv:
  386|      2|void FilePath::Normalize() {
  387|      2|  auto out = pathname_.begin();
  388|       |
  389|      2|  auto i = pathname_.cbegin();
  390|       |#ifdef GTEST_OS_WINDOWS
  391|       |  // UNC paths are treated specially
  392|       |  if (pathname_.end() - i >= 3 && IsPathSeparator(*i) &&
  393|       |      IsPathSeparator(*(i + 1)) && !IsPathSeparator(*(i + 2))) {
  394|       |    *(out++) = kPathSeparator;
  395|       |    *(out++) = kPathSeparator;
  396|       |  }
  397|       |#endif
  398|     82|  while (i != pathname_.end()) {
  ------------------
  |  Branch (398:10): [True: 80, False: 2]
  ------------------
  399|     80|    const char character = *i;
  400|     80|    if (!IsPathSeparator(character)) {
  ------------------
  |  Branch (400:9): [True: 74, False: 6]
  ------------------
  401|     74|      *(out++) = character;
  402|     74|    } else if (out == pathname_.begin() || *std::prev(out) != kPathSeparator) {
  ------------------
  |  Branch (402:16): [True: 2, False: 4]
  |  Branch (402:16): [True: 6, False: 0]
  |  Branch (402:44): [True: 4, False: 0]
  ------------------
  403|      6|      *(out++) = kPathSeparator;
  404|      6|    }
  405|     80|    ++i;
  406|     80|  }
  407|       |
  408|      2|  pathname_.erase(out, pathname_.end());
  409|      2|}
gtest-all.cc:_ZN7testing8internalL15IsPathSeparatorEc:
   92|     80|static bool IsPathSeparator(char c) {
   93|       |#if GTEST_HAS_ALT_PATH_SEP_
   94|       |  return (c == kPathSeparator) || (c == kAlternatePathSeparator);
   95|       |#else
   96|     80|  return c == kPathSeparator;
   97|     80|#endif
   98|     80|}

_ZN7testing8internal15GetUnitTestImplEv:
  966|     36|inline UnitTestImpl* GetUnitTestImpl() {
  967|     36|  return UnitTest::GetInstance()->impl();
  968|     36|}
_ZN7testing8internal12UnitTestImpl17current_test_infoEv:
  766|      2|  TestInfo* current_test_info() { return current_test_info_; }
_ZN7testing8internal12UnitTestImpl9listenersEv:
  608|     12|  TestEventListeners* listeners() { return &listeners_; }
_ZN7testing8internal12UnitTestImpl33ignored_parameterized_test_suitesEv:
  701|      2|  std::set<std::string>* ignored_parameterized_test_suites() {
  702|      2|    return &ignored_parameterized_test_suites_;
  703|      2|  }
_ZN7testing8internal14GTestFlagSaverC2Ev:
  143|      2|  GTestFlagSaver() {
  144|      2|    also_run_disabled_tests_ = GTEST_FLAG_GET(also_run_disabled_tests);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  145|      2|    break_on_failure_ = GTEST_FLAG_GET(break_on_failure);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  146|      2|    catch_exceptions_ = GTEST_FLAG_GET(catch_exceptions);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  147|      2|    color_ = GTEST_FLAG_GET(color);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  148|      2|    death_test_style_ = GTEST_FLAG_GET(death_test_style);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  149|      2|    death_test_use_fork_ = GTEST_FLAG_GET(death_test_use_fork);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  150|      2|    fail_fast_ = GTEST_FLAG_GET(fail_fast);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  151|      2|    filter_ = GTEST_FLAG_GET(filter);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  152|      2|    internal_run_death_test_ = GTEST_FLAG_GET(internal_run_death_test);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  153|      2|    list_tests_ = GTEST_FLAG_GET(list_tests);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  154|      2|    output_ = GTEST_FLAG_GET(output);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  155|      2|    brief_ = GTEST_FLAG_GET(brief);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  156|      2|    print_time_ = GTEST_FLAG_GET(print_time);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  157|      2|    print_utf8_ = GTEST_FLAG_GET(print_utf8);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  158|      2|    random_seed_ = GTEST_FLAG_GET(random_seed);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  159|      2|    repeat_ = GTEST_FLAG_GET(repeat);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  160|      2|    recreate_environments_when_repeating_ =
  161|      2|        GTEST_FLAG_GET(recreate_environments_when_repeating);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  162|      2|    shuffle_ = GTEST_FLAG_GET(shuffle);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  163|      2|    stack_trace_depth_ = GTEST_FLAG_GET(stack_trace_depth);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  164|      2|    stream_result_to_ = GTEST_FLAG_GET(stream_result_to);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  165|      2|    throw_on_failure_ = GTEST_FLAG_GET(throw_on_failure);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  166|      2|  }
_ZNK7testing8internal12UnitTestImpl18current_test_suiteEv:
  765|      2|  const TestSuite* current_test_suite() const { return current_test_suite_; }
_ZN7testing8internal12UnitTestImpl11AddTestInfoEPFvvES3_PNS_8TestInfoE:
  674|      4|                   TestInfo* test_info) {
  675|      4|#if GTEST_HAS_FILE_SYSTEM
  676|       |    // In order to support thread-safe death tests, we need to
  677|       |    // remember the original working directory when the test program
  678|       |    // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
  679|       |    // the user may have changed the current directory before calling
  680|       |    // RUN_ALL_TESTS().  Therefore we capture the current directory in
  681|       |    // AddTestInfo(), which is called to register a TEST or TEST_F
  682|       |    // before main() is reached.
  683|      4|    if (original_working_dir_.IsEmpty()) {
  ------------------
  |  Branch (683:9): [True: 2, False: 2]
  ------------------
  684|      2|      original_working_dir_.Set(FilePath::GetCurrentDir());
  685|      4|      GTEST_CHECK_(!original_working_dir_.IsEmpty())
  ------------------
  |  | 1079|      2|  GTEST_AMBIGUOUS_ELSE_BLOCKER_               \
  |  |  ------------------
  |  |  |  |  712|      4|  switch (0)                          \
  |  |  |  |  713|      4|  case 0:                             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (713:3): [True: 2, False: 0]
  |  |  |  |  ------------------
  |  |  |  |  714|      2|  default:  // NOLINT
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (714:3): [True: 0, False: 2]
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 1080|      2|  if (::testing::internal::IsTrue(condition)) \
  |  |  ------------------
  |  |  |  Branch (1080:7): [True: 2, False: 0]
  |  |  ------------------
  |  | 1081|      2|    ;                                         \
  |  | 1082|      2|  else                                        \
  |  | 1083|      2|    GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
  |  |  ------------------
  |  |  |  | 1054|      0|  ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
  |  |  |  | 1055|      0|                                __FILE__, __LINE__)                    \
  |  |  |  | 1056|      0|      .GetStream()
  |  |  ------------------
  ------------------
  686|      4|          << "Failed to get the current working directory.";
  687|      2|    }
  688|      4|#endif  // GTEST_HAS_FILE_SYSTEM
  689|       |
  690|      4|    GetTestSuite(test_info->test_suite_name(), test_info->type_param(),
  691|      4|                 set_up_tc, tear_down_tc)
  692|      4|        ->AddTestInfo(test_info);
  693|      4|  }
_ZN7testing8internal12UnitTestImpl21set_current_test_infoEPNS_8TestInfoE:
  720|      2|  void set_current_test_info(TestInfo* a_current_test_info) {
  721|      2|    current_test_info_ = a_current_test_info;
  722|      2|  }
_ZN7testing8internal12UnitTestImpl22set_current_test_suiteEPNS_9TestSuiteE:
  713|      2|  void set_current_test_suite(TestSuite* a_current_test_suite) {
  714|      2|    current_test_suite_ = a_current_test_suite;
  715|      2|  }
_ZNK7testing8internal12UnitTestImpl12GetTestSuiteEi:
  590|     12|  const TestSuite* GetTestSuite(int i) const {
  591|     12|    const int index = GetElementOr(test_suite_indices_, i, -1);
  592|     12|    return index < 0 ? nullptr : test_suites_[static_cast<size_t>(i)];
  ------------------
  |  Branch (592:12): [True: 0, False: 12]
  ------------------
  593|     12|  }
_ZN7testing8internal12UnitTestImpl19GetMutableSuiteCaseEi:
  602|      4|  TestSuite* GetMutableSuiteCase(int i) {
  603|      4|    const int index = GetElementOr(test_suite_indices_, i, -1);
  604|      4|    return index < 0 ? nullptr : test_suites_[static_cast<size_t>(index)];
  ------------------
  |  Branch (604:12): [True: 0, False: 4]
  ------------------
  605|      4|  }
_ZN7testing8internal12UnitTestImpl20set_catch_exceptionsEb:
  842|      2|  void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
_ZN7testing8internal12UnitTestImpl34InitDeathTestSubprocessControlInfoEv:
  782|      2|  void InitDeathTestSubprocessControlInfo() {
  783|      2|    internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
  784|      2|  }
_ZN7testing8internal21GetRandomSeedFromFlagEi:
  113|      2|inline int GetRandomSeedFromFlag(int32_t random_seed_flag) {
  114|      2|  const unsigned int raw_seed =
  115|      2|      (random_seed_flag == 0) ? static_cast<unsigned int>(GetTimeInMillis())
  ------------------
  |  Branch (115:7): [True: 2, False: 0]
  ------------------
  116|      2|                              : static_cast<unsigned int>(random_seed_flag);
  117|       |
  118|       |  // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
  119|       |  // it's easy to type.
  120|      2|  const int normalized_seed =
  121|      2|      static_cast<int>((raw_seed - 1U) %
  122|      2|                       static_cast<unsigned int>(kMaxRandomSeed)) +
  123|      2|      1;
  124|      2|  return normalized_seed;
  125|      2|}
_ZN7testing8internal12UnitTestImpl23ClearNonAdHocTestResultEv:
  739|      2|  void ClearNonAdHocTestResult() {
  740|      2|    ForEach(test_suites_, TestSuite::ClearTestSuiteResult);
  741|      2|  }
_ZN7testing8internal18OsStackTraceGetterC2Ev:
  439|      2|  OsStackTraceGetter() = default;
_ZN7testing8internal27OsStackTraceGetterInterfaceC2Ev:
  410|      2|  OsStackTraceGetterInterface() = default;
_ZN7testing8internal12GetElementOrIiEET_RKNSt3__16vectorIS2_NS3_9allocatorIS2_EEEEiS2_:
  294|     22|inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
  295|     22|  return (i < 0 || i >= static_cast<int>(v.size())) ? default_value
  ------------------
  |  Branch (295:11): [True: 0, False: 22]
  |  Branch (295:20): [True: 0, False: 22]
  ------------------
  296|     22|                                                    : v[static_cast<size_t>(i)];
  297|     22|}
_ZN7testing8internal7ForEachINSt3__16vectorIPNS_9TestSuiteENS2_9allocatorIS5_EEEEPFvS5_EEEvRKT_T0_:
  287|      2|void ForEach(const Container& c, Functor functor) {
  288|      2|  std::for_each(c.begin(), c.end(), functor);
  289|      2|}
_ZN7testing8internal7CountIfINSt3__16vectorIPNS_9TestSuiteENS2_9allocatorIS5_EEEEPFbPKS4_EEEiRKT_T0_:
  275|      2|inline int CountIf(const Container& c, Predicate predicate) {
  276|       |  // Implemented as an explicit loop since std::count_if() in libCstd on
  277|       |  // Solaris has a non-standard signature.
  278|      2|  int count = 0;
  279|      6|  for (auto it = c.begin(); it != c.end(); ++it) {
  ------------------
  |  Branch (279:29): [True: 4, False: 2]
  ------------------
  280|      4|    if (predicate(*it)) ++count;
  ------------------
  |  Branch (280:9): [True: 2, False: 2]
  ------------------
  281|      4|  }
  282|      2|  return count;
  283|      2|}
_ZN7testing8internal7CountIfINSt3__16vectorINS_14TestPartResultENS2_9allocatorIS4_EEEEPFbRKS4_EEEiRKT_T0_:
  275|     14|inline int CountIf(const Container& c, Predicate predicate) {
  276|       |  // Implemented as an explicit loop since std::count_if() in libCstd on
  277|       |  // Solaris has a non-standard signature.
  278|     14|  int count = 0;
  279|     14|  for (auto it = c.begin(); it != c.end(); ++it) {
  ------------------
  |  Branch (279:29): [True: 0, False: 14]
  ------------------
  280|      0|    if (predicate(*it)) ++count;
  ------------------
  |  Branch (280:9): [True: 0, False: 0]
  ------------------
  281|      0|  }
  282|     14|  return count;
  283|     14|}
_ZNK7testing8internal12UnitTestImpl16catch_exceptionsEv:
  835|     10|  bool catch_exceptions() const { return catch_exceptions_; }
_ZN7testing8internal7CountIfINSt3__16vectorIPNS_8TestInfoENS2_9allocatorIS5_EEEEPFbPKS4_EEEiRKT_T0_:
  275|      6|inline int CountIf(const Container& c, Predicate predicate) {
  276|       |  // Implemented as an explicit loop since std::count_if() in libCstd on
  277|       |  // Solaris has a non-standard signature.
  278|      6|  int count = 0;
  279|     12|  for (auto it = c.begin(); it != c.end(); ++it) {
  ------------------
  |  Branch (279:29): [True: 6, False: 6]
  ------------------
  280|      6|    if (predicate(*it)) ++count;
  ------------------
  |  Branch (280:9): [True: 4, False: 2]
  ------------------
  281|      6|  }
  282|      6|  return count;
  283|      6|}
_ZN7testing8internal7ForEachINSt3__16vectorIPNS_8TestInfoENS2_9allocatorIS5_EEEEPFvS5_EEEvRKT_T0_:
  287|      4|void ForEach(const Container& c, Functor functor) {
  288|      4|  std::for_each(c.begin(), c.end(), functor);
  289|      4|}
_ZN7testing8internal7ForEachINSt3__16vectorIPNS_11EnvironmentENS2_9allocatorIS5_EEEEPFvS5_EEEvRKT_T0_:
  287|      2|void ForEach(const Container& c, Functor functor) {
  288|      2|  std::for_each(c.begin(), c.end(), functor);
  289|      2|}

_ZN7testing8internal16BoolFromGTestEnvEPKcb:
 1324|     24|bool BoolFromGTestEnv(const char* flag, bool default_value) {
 1325|       |#if defined(GTEST_GET_BOOL_FROM_ENV_)
 1326|       |  return GTEST_GET_BOOL_FROM_ENV_(flag, default_value);
 1327|       |#else
 1328|     24|  const std::string env_var = FlagToEnvVar(flag);
 1329|     24|  const char* const string_value = posix::GetEnv(env_var.c_str());
 1330|     24|  return string_value == nullptr ? default_value
  ------------------
  |  Branch (1330:10): [True: 24, False: 0]
  ------------------
 1331|     24|                                 : strcmp(string_value, "0") != 0;
 1332|     24|#endif  // defined(GTEST_GET_BOOL_FROM_ENV_)
 1333|     24|}
_ZN7testing8internal17Int32FromGTestEnvEPKci:
 1338|      6|int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) {
 1339|       |#if defined(GTEST_GET_INT32_FROM_ENV_)
 1340|       |  return GTEST_GET_INT32_FROM_ENV_(flag, default_value);
 1341|       |#else
 1342|      6|  const std::string env_var = FlagToEnvVar(flag);
 1343|      6|  const char* const string_value = posix::GetEnv(env_var.c_str());
 1344|      6|  if (string_value == nullptr) {
  ------------------
  |  Branch (1344:7): [True: 6, False: 0]
  ------------------
 1345|       |    // The environment variable is not set.
 1346|      6|    return default_value;
 1347|      6|  }
 1348|       |
 1349|      0|  int32_t result = default_value;
 1350|      0|  if (!ParseInt32(Message() << "Environment variable " << env_var, string_value,
  ------------------
  |  Branch (1350:7): [True: 0, False: 0]
  ------------------
 1351|      0|                  &result)) {
 1352|      0|    printf("The default value %s is used.\n",
 1353|      0|           (Message() << default_value).GetString().c_str());
 1354|      0|    fflush(stdout);
 1355|      0|    return default_value;
 1356|      0|  }
 1357|       |
 1358|      0|  return result;
 1359|      0|#endif  // defined(GTEST_GET_INT32_FROM_ENV_)
 1360|      0|}
_ZN7testing8internal25OutputFlagAlsoCheckEnvVarEv:
 1370|      2|std::string OutputFlagAlsoCheckEnvVar() {
 1371|      2|  std::string default_value_for_output_flag = "";
 1372|      2|  const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE");
 1373|      2|  if (nullptr != xml_output_file_env) {
  ------------------
  |  Branch (1373:7): [True: 0, False: 2]
  ------------------
 1374|      0|    default_value_for_output_flag = std::string("xml:") + xml_output_file_env;
 1375|      0|  }
 1376|      2|  return default_value_for_output_flag;
 1377|      2|}
_ZN7testing8internal18StringFromGTestEnvEPKcS2_:
 1381|     12|const char* StringFromGTestEnv(const char* flag, const char* default_value) {
 1382|       |#if defined(GTEST_GET_STRING_FROM_ENV_)
 1383|       |  return GTEST_GET_STRING_FROM_ENV_(flag, default_value);
 1384|       |#else
 1385|     12|  const std::string env_var = FlagToEnvVar(flag);
 1386|     12|  const char* const value = posix::GetEnv(env_var.c_str());
 1387|     12|  return value == nullptr ? default_value : value;
  ------------------
  |  Branch (1387:10): [True: 12, False: 0]
  ------------------
 1388|     12|#endif  // defined(GTEST_GET_STRING_FROM_ENV_)
 1389|     12|}
gtest-all.cc:_ZN7testing8internalL12FlagToEnvVarEPKc:
 1267|     42|static std::string FlagToEnvVar(const char* flag) {
 1268|     42|  const std::string full_flag =
 1269|     42|      (Message() << GTEST_FLAG_PREFIX_ << flag).GetString();
  ------------------
  |  |  331|     42|#define GTEST_FLAG_PREFIX_ "gtest_"
  ------------------
 1270|       |
 1271|     42|  Message env_var;
 1272|    870|  for (size_t i = 0; i != full_flag.length(); i++) {
  ------------------
  |  Branch (1272:22): [True: 828, False: 42]
  ------------------
 1273|    828|    env_var << ToUpper(full_flag.c_str()[i]);
 1274|    828|  }
 1275|       |
 1276|     42|  return env_var.GetString();
 1277|     42|}

_ZN7testing8internal33GetIgnoredParameterizedTestSuitesEv:
  502|      2|std::set<std::string>* GetIgnoredParameterizedTestSuites() {
  503|      2|  return UnitTest::GetInstance()->impl()->ignored_parameterized_test_suites();
  504|      2|}
_ZN7testing8internal34TypeParameterizedTestSuiteRegistry22CheckForInstantiationsEv:
  585|      2|void TypeParameterizedTestSuiteRegistry::CheckForInstantiations() {
  586|      2|  const auto& ignored = *GetIgnoredParameterizedTestSuites();
  587|      2|  for (const auto& testcase : suites_) {
  ------------------
  |  Branch (587:29): [True: 0, False: 2]
  ------------------
  588|      0|    if (testcase.second.instantiated) continue;
  ------------------
  |  Branch (588:9): [True: 0, False: 0]
  ------------------
  589|      0|    if (ignored.find(testcase.first) != ignored.end()) continue;
  ------------------
  |  Branch (589:9): [True: 0, False: 0]
  ------------------
  590|       |
  591|      0|    std::string message =
  592|      0|        "Type parameterized test suite " + testcase.first +
  593|      0|        " is defined via REGISTER_TYPED_TEST_SUITE_P, but never instantiated "
  594|      0|        "via INSTANTIATE_TYPED_TEST_SUITE_P. None of the test cases will run."
  595|      0|        "\n\n"
  596|      0|        "Ideally, TYPED_TEST_P definitions should only ever be included as "
  597|      0|        "part of binaries that intend to use them. (As opposed to, for "
  598|      0|        "example, being placed in a library that may be linked in to get other "
  599|      0|        "utilities.)"
  600|      0|        "\n\n"
  601|      0|        "To suppress this error for this test suite, insert the following line "
  602|      0|        "(in a non-header) in the namespace it is defined in:"
  603|      0|        "\n\n"
  604|      0|        "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" +
  605|      0|        testcase.first + ");";
  606|       |
  607|      0|    std::string full_name =
  608|      0|        "UninstantiatedTypeParameterizedTestSuite<" + testcase.first + ">";
  609|      0|    RegisterTest(  //
  610|      0|        "GoogleTestVerification", full_name.c_str(),
  611|      0|        nullptr,  // No type parameter.
  612|      0|        nullptr,  // No value parameter.
  613|      0|        testcase.second.code_location.file.c_str(),
  614|      0|        testcase.second.code_location.line, [message, testcase] {
  615|      0|          return new FailureTest(testcase.second.code_location, message,
  616|      0|                                 kErrorOnUninstantiatedTypeParameterizedTest);
  617|      0|        });
  618|      0|  }
  619|      2|}
_ZN7testing8internal8GetArgvsEv:
  624|      4|::std::vector<std::string> GetArgvs() {
  625|       |#if defined(GTEST_CUSTOM_GET_ARGVS_)
  626|       |  // GTEST_CUSTOM_GET_ARGVS_() may return a container of std::string or
  627|       |  // ::string. This code converts it to the appropriate type.
  628|       |  const auto& custom = GTEST_CUSTOM_GET_ARGVS_();
  629|       |  return ::std::vector<std::string>(custom.begin(), custom.end());
  630|       |#else   // defined(GTEST_CUSTOM_GET_ARGVS_)
  631|      4|  return g_argvs;
  632|      4|#endif  // defined(GTEST_CUSTOM_GET_ARGVS_)
  633|      4|}
_ZN7testing8internal15UnitTestOptions15GetOutputFormatEv:
  654|      2|std::string UnitTestOptions::GetOutputFormat() {
  655|      2|  std::string s = GTEST_FLAG_GET(output);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  656|      2|  const char* const gtest_output_flag = s.c_str();
  657|      2|  const char* const colon = strchr(gtest_output_flag, ':');
  658|      2|  return (colon == nullptr)
  ------------------
  |  Branch (658:10): [True: 2, False: 0]
  ------------------
  659|      2|             ? std::string(gtest_output_flag)
  660|      2|             : std::string(gtest_output_flag,
  661|      0|                           static_cast<size_t>(colon - gtest_output_flag));
  662|      2|}
_ZN7testing8internal13GetTestTypeIdEv:
  962|      2|TypeId GetTestTypeId() { return GetTypeId<Test>(); }
_ZN7testing8internal35DefaultGlobalTestPartResultReporterC2EPNS0_12UnitTestImplE:
 1025|      2|    : unit_test_(unit_test) {}
_ZN7testing8internal38DefaultPerThreadTestPartResultReporterC2EPNS0_12UnitTestImplE:
 1035|      2|    : unit_test_(unit_test) {}
_ZNK7testing8internal12UnitTestImpl22total_test_suite_countEv:
 1079|     10|int UnitTestImpl::total_test_suite_count() const {
 1080|     10|  return static_cast<int>(test_suites_.size());
 1081|     10|}
_ZNK7testing8internal12UnitTestImpl23test_suite_to_run_countEv:
 1085|      2|int UnitTestImpl::test_suite_to_run_count() const {
 1086|      2|  return CountIf(test_suites_, ShouldRunTestSuite);
 1087|      2|}
_ZNK7testing8internal12UnitTestImpl17test_to_run_countEv:
 1126|      2|int UnitTestImpl::test_to_run_count() const {
 1127|      2|  return SumOverTestSuiteList(test_suites_, &TestSuite::test_to_run_count);
 1128|      2|}
_ZN7testing8internal15GetTimeInMillisEv:
 1174|      8|TimeInMillis GetTimeInMillis() {
 1175|      8|  return std::chrono::duration_cast<std::chrono::milliseconds>(
 1176|      8|             std::chrono::system_clock::now() -
 1177|      8|             std::chrono::system_clock::from_time_t(0))
 1178|      8|      .count();
 1179|      8|}
_ZN7testing8internal6String13CStringEqualsEPKcS3_:
 1224|      2|bool String::CStringEquals(const char* lhs, const char* rhs) {
 1225|      2|  if (lhs == nullptr) return rhs == nullptr;
  ------------------
  |  Branch (1225:7): [True: 0, False: 2]
  ------------------
 1226|       |
 1227|      2|  if (rhs == nullptr) return false;
  ------------------
  |  Branch (1227:7): [True: 0, False: 2]
  ------------------
 1228|       |
 1229|      2|  return strcmp(lhs, rhs) == 0;
 1230|      2|}
_ZN7testing8internal11SplitStringERKNSt3__112basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEEcPNS1_6vectorIS7_NS5_IS7_EEEE:
 1252|     10|                 ::std::vector< ::std::string>* dest) {
 1253|     10|  ::std::vector< ::std::string> parsed;
 1254|     10|  ::std::string::size_type pos = 0;
 1255|     16|  while (::testing::internal::AlwaysTrue()) {
  ------------------
  |  Branch (1255:10): [True: 16, False: 0]
  ------------------
 1256|     16|    const ::std::string::size_type colon = str.find(delimiter, pos);
 1257|     16|    if (colon == ::std::string::npos) {
  ------------------
  |  Branch (1257:9): [True: 10, False: 6]
  ------------------
 1258|     10|      parsed.push_back(str.substr(pos));
 1259|     10|      break;
 1260|     10|    } else {
 1261|      6|      parsed.push_back(str.substr(pos, colon - pos));
 1262|      6|      pos = colon + 1;
 1263|      6|    }
 1264|     16|  }
 1265|     10|  dest->swap(parsed);
 1266|     10|}
_ZN7testing7MessageC2Ev:
 1275|    122|Message::Message() : ss_(new ::std::stringstream) {
 1276|       |  // By default, we want there to be enough precision when printing
 1277|       |  // a double to a Message.
 1278|    122|  *ss_ << std::setprecision(std::numeric_limits<double>::digits10 + 2);
 1279|    122|}
_ZNK7testing7Message9GetStringEv:
 1301|    122|std::string Message::GetString() const {
 1302|    122|  return internal::StringStreamToString(ss_.get());
 1303|    122|}
_ZN7testing8internal6String28CaseInsensitiveCStringEqualsEPKcS3_:
 2131|      2|bool String::CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) {
 2132|      2|  if (lhs == nullptr) return rhs == nullptr;
  ------------------
  |  Branch (2132:7): [True: 0, False: 2]
  ------------------
 2133|      2|  if (rhs == nullptr) return false;
  ------------------
  |  Branch (2133:7): [True: 0, False: 2]
  ------------------
 2134|      2|  return posix::StrCaseCmp(lhs, rhs) == 0;
 2135|      2|}
_ZN7testing8internal20StringStreamToStringEPNSt3__118basic_stringstreamIcNS1_11char_traitsIcEENS1_9allocatorIcEEEE:
 2216|    122|std::string StringStreamToString(::std::stringstream* ss) {
 2217|    122|  const ::std::string& str = ss->str();
 2218|    122|  const char* const start = str.c_str();
 2219|    122|  const char* const end = start + str.length();
 2220|       |
 2221|    122|  std::string result;
 2222|    122|  result.reserve(static_cast<size_t>(2 * (end - start)));
 2223|  2.88k|  for (const char* ch = start; ch != end; ++ch) {
  ------------------
  |  Branch (2223:32): [True: 2.76k, False: 122]
  ------------------
 2224|  2.76k|    if (*ch == '\0') {
  ------------------
  |  Branch (2224:9): [True: 0, False: 2.76k]
  ------------------
 2225|      0|      result += "\\0";  // Replaces NUL with "\\0";
 2226|  2.76k|    } else {
 2227|  2.76k|      result += *ch;
 2228|  2.76k|    }
 2229|  2.76k|  }
 2230|       |
 2231|    122|  return result;
 2232|    122|}
_ZN7testing10TestResultC2Ev:
 2254|     10|    : death_test_count_(0), start_timestamp_(0), elapsed_time_(0) {}
_ZN7testing10TestResult5ClearEv:
 2400|      8|void TestResult::Clear() {
 2401|      8|  test_part_results_.clear();
 2402|      8|  test_properties_.clear();
 2403|      8|  death_test_count_ = 0;
 2404|      8|  elapsed_time_ = 0;
 2405|      8|}
_ZNK7testing10TestResult7SkippedEv:
 2413|      8|bool TestResult::Skipped() const {
 2414|      8|  return !Failed() && CountIf(test_part_results_, TestPartSkipped) > 0;
  ------------------
  |  Branch (2414:10): [True: 8, False: 0]
  |  Branch (2414:23): [True: 0, False: 8]
  ------------------
 2415|      8|}
_ZNK7testing10TestResult6FailedEv:
 2418|     10|bool TestResult::Failed() const {
 2419|     10|  for (int i = 0; i < total_part_count(); ++i) {
  ------------------
  |  Branch (2419:19): [True: 0, False: 10]
  ------------------
 2420|      0|    if (GetTestPartResult(i).failed()) return true;
  ------------------
  |  Branch (2420:9): [True: 0, False: 0]
  ------------------
 2421|      0|  }
 2422|     10|  return false;
 2423|     10|}
_ZNK7testing10TestResult15HasFatalFailureEv:
 2431|      6|bool TestResult::HasFatalFailure() const {
 2432|      6|  return CountIf(test_part_results_, TestPartFatallyFailed) > 0;
 2433|      6|}
_ZNK7testing10TestResult16total_part_countEv:
 2447|     10|int TestResult::total_part_count() const {
 2448|     10|  return static_cast<int>(test_part_results_.size());
 2449|     10|}
_ZN7testing4TestC2Ev:
 2461|      2|Test::Test() : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {}
  ------------------
  |  | 2291|      2|#define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
  ------------------
_ZN7testing4Test5SetUpEv:
 2471|      2|void Test::SetUp() {}
_ZN7testing4Test19HasSameFixtureClassEv:
 2504|      2|bool Test::HasSameFixtureClass() {
 2505|      2|  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
 2506|      2|  const TestSuite* const test_suite = impl->current_test_suite();
 2507|       |
 2508|       |  // Info about the first test in the current test suite.
 2509|      2|  const TestInfo* const first_test_info = test_suite->test_info_list()[0];
 2510|      2|  const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_;
 2511|      2|  const char* const first_test_name = first_test_info->name();
 2512|       |
 2513|       |  // Info about the current test.
 2514|      2|  const TestInfo* const this_test_info = impl->current_test_info();
 2515|      2|  const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_;
 2516|      2|  const char* const this_test_name = this_test_info->name();
 2517|       |
 2518|      2|  if (this_fixture_id != first_fixture_id) {
  ------------------
  |  Branch (2518:7): [True: 0, False: 2]
  ------------------
 2519|       |    // Is the first test defined using TEST?
 2520|      0|    const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId();
 2521|       |    // Is this test defined using TEST?
 2522|      0|    const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId();
 2523|       |
 2524|      0|    if (first_is_TEST || this_is_TEST) {
  ------------------
  |  Branch (2524:9): [True: 0, False: 0]
  |  Branch (2524:26): [True: 0, False: 0]
  ------------------
 2525|       |      // Both TEST and TEST_F appear in same test suite, which is incorrect.
 2526|       |      // Tell the user how to fix this.
 2527|       |
 2528|       |      // Gets the name of the TEST and the name of the TEST_F.  Note
 2529|       |      // that first_is_TEST and this_is_TEST cannot both be true, as
 2530|       |      // the fixture IDs are different for the two tests.
 2531|      0|      const char* const TEST_name =
 2532|      0|          first_is_TEST ? first_test_name : this_test_name;
  ------------------
  |  Branch (2532:11): [True: 0, False: 0]
  ------------------
 2533|      0|      const char* const TEST_F_name =
 2534|      0|          first_is_TEST ? this_test_name : first_test_name;
  ------------------
  |  Branch (2534:11): [True: 0, False: 0]
  ------------------
 2535|       |
 2536|      0|      ADD_FAILURE()
  ------------------
  |  | 1735|      0|#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed")
  |  |  ------------------
  |  |  |  | 1339|      0|  GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure)
  |  |  |  |  ------------------
  |  |  |  |  |  | 1333|      0|  GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 1329|      0|  ::testing::internal::AssertHelper(result_type, file, line, message) = \
  |  |  |  |  |  |  |  | 1330|      0|      ::testing::Message()
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 2537|      0|          << "All tests in the same test suite must use the same test fixture\n"
 2538|      0|          << "class, so mixing TEST_F and TEST in the same test suite is\n"
 2539|      0|          << "illegal.  In test suite " << this_test_info->test_suite_name()
 2540|      0|          << ",\n"
 2541|      0|          << "test " << TEST_F_name << " is defined using TEST_F but\n"
 2542|      0|          << "test " << TEST_name << " is defined using TEST.  You probably\n"
 2543|      0|          << "want to change the TEST to TEST_F or move it to another test\n"
 2544|      0|          << "case.";
 2545|      0|    } else {
 2546|       |      // Two fixture classes with the same name appear in two different
 2547|       |      // namespaces, which is not allowed. Tell the user how to fix this.
 2548|      0|      ADD_FAILURE()
  ------------------
  |  | 1735|      0|#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed")
  |  |  ------------------
  |  |  |  | 1339|      0|  GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure)
  |  |  |  |  ------------------
  |  |  |  |  |  | 1333|      0|  GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 1329|      0|  ::testing::internal::AssertHelper(result_type, file, line, message) = \
  |  |  |  |  |  |  |  | 1330|      0|      ::testing::Message()
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 2549|      0|          << "All tests in the same test suite must use the same test fixture\n"
 2550|      0|          << "class.  However, in test suite "
 2551|      0|          << this_test_info->test_suite_name() << ",\n"
 2552|      0|          << "you defined test " << first_test_name << " and test "
 2553|      0|          << this_test_name << "\n"
 2554|      0|          << "using two different test fixture classes.  This can happen if\n"
 2555|      0|          << "the two classes are from different namespaces or translation\n"
 2556|      0|          << "units and have the same name.  You should probably rename one\n"
 2557|      0|          << "of the classes to put the tests into different test suites.";
 2558|      0|    }
 2559|      0|    return false;
 2560|      0|  }
 2561|       |
 2562|      2|  return true;
 2563|      2|}
_ZN7testing4Test3RunEv:
 2677|      2|void Test::Run() {
 2678|      2|  if (!HasSameFixtureClass()) return;
  ------------------
  |  Branch (2678:7): [True: 0, False: 2]
  ------------------
 2679|       |
 2680|      2|  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
 2681|      2|  impl->os_stack_trace_getter()->UponLeavingGTest();
 2682|      2|  internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
 2683|       |  // We will run the test only if SetUp() was successful and didn't call
 2684|       |  // GTEST_SKIP().
 2685|      2|  if (!HasFatalFailure() && !IsSkipped()) {
  ------------------
  |  Branch (2685:7): [True: 2, False: 0]
  |  Branch (2685:29): [True: 2, False: 0]
  ------------------
 2686|      2|    impl->os_stack_trace_getter()->UponLeavingGTest();
 2687|      2|    internal::HandleExceptionsInMethodIfSupported(this, &Test::TestBody,
 2688|      2|                                                  "the test body");
 2689|      2|  }
 2690|       |
 2691|       |  // However, we want to clean up as much as possible.  Hence we will
 2692|       |  // always call TearDown(), even if SetUp() or the test body has
 2693|       |  // failed.
 2694|      2|  impl->os_stack_trace_getter()->UponLeavingGTest();
 2695|      2|  internal::HandleExceptionsInMethodIfSupported(this, &Test::TearDown,
 2696|      2|                                                "TearDown()");
 2697|      2|}
_ZN7testing4Test15HasFatalFailureEv:
 2700|      6|bool Test::HasFatalFailure() {
 2701|      6|  return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure();
 2702|      6|}
_ZN7testing4Test9IsSkippedEv:
 2712|      6|bool Test::IsSkipped() {
 2713|      6|  return internal::GetUnitTestImpl()->current_test_result()->Skipped();
 2714|      6|}
_ZN7testing8TestInfoC2ERKNSt3__112basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEES9_PKcSB_NS_8internal12CodeLocationEPKvPNSC_15TestFactoryBaseE:
 2726|      4|    : test_suite_name_(a_test_suite_name),
 2727|       |      // begin()/end() is MSVC 17.3.3 ASAN crash workaround (GitHub issue #3997)
 2728|      4|      name_(a_name.begin(), a_name.end()),
 2729|      4|      type_param_(a_type_param ? new std::string(a_type_param) : nullptr),
  ------------------
  |  Branch (2729:19): [True: 0, False: 4]
  ------------------
 2730|      4|      value_param_(a_value_param ? new std::string(a_value_param) : nullptr),
  ------------------
  |  Branch (2730:20): [True: 0, False: 4]
  ------------------
 2731|      4|      location_(a_code_location),
 2732|      4|      fixture_class_id_(fixture_class_id),
 2733|      4|      should_run_(false),
 2734|      4|      is_disabled_(false),
 2735|      4|      matches_filter_(false),
 2736|      4|      is_in_another_shard_(false),
 2737|      4|      factory_(factory),
 2738|      4|      result_() {}
_ZN7testing8internal23MakeAndRegisterTestInfoEPKcS2_S2_S2_NS0_12CodeLocationEPKvPFvvES7_PNS0_15TestFactoryBaseE:
 2767|      4|    TearDownTestSuiteFunc tear_down_tc, TestFactoryBase* factory) {
 2768|      4|  TestInfo* const test_info =
 2769|      4|      new TestInfo(test_suite_name, name, type_param, value_param,
 2770|      4|                   code_location, fixture_class_id, factory);
 2771|      4|  GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info);
 2772|      4|  return test_info;
 2773|      4|}
_ZN7testing8internal12UnitTestImpl26RegisterParameterizedTestsEv:
 2796|      2|void UnitTestImpl::RegisterParameterizedTests() {
 2797|      2|  if (!parameterized_tests_registered_) {
  ------------------
  |  Branch (2797:7): [True: 2, False: 0]
  ------------------
 2798|      2|    parameterized_test_registry_.RegisterTests();
 2799|      2|    type_parameterized_test_registry_.CheckForInstantiations();
 2800|      2|    parameterized_tests_registered_ = true;
 2801|      2|  }
 2802|      2|}
_ZN7testing8TestInfo3RunEv:
 2808|      2|void TestInfo::Run() {
 2809|      2|  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
 2810|      2|  if (!should_run_) {
  ------------------
  |  Branch (2810:7): [True: 0, False: 2]
  ------------------
 2811|      0|    if (is_disabled_ && matches_filter_) repeater->OnTestDisabled(*this);
  ------------------
  |  Branch (2811:9): [True: 0, False: 0]
  |  Branch (2811:25): [True: 0, False: 0]
  ------------------
 2812|      0|    return;
 2813|      0|  }
 2814|       |
 2815|       |  // Tells UnitTest where to store test result.
 2816|      2|  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
 2817|      2|  impl->set_current_test_info(this);
 2818|       |
 2819|       |  // Notifies the unit test event listeners that a test is about to start.
 2820|      2|  repeater->OnTestStart(*this);
 2821|      2|  result_.set_start_timestamp(internal::GetTimeInMillis());
 2822|      2|  internal::Timer timer;
 2823|      2|  impl->os_stack_trace_getter()->UponLeavingGTest();
 2824|       |
 2825|       |  // Creates the test object.
 2826|      2|  Test* const test = internal::HandleExceptionsInMethodIfSupported(
 2827|      2|      factory_, &internal::TestFactoryBase::CreateTest,
 2828|      2|      "the test fixture's constructor");
 2829|       |
 2830|       |  // Runs the test if the constructor didn't generate a fatal failure or invoke
 2831|       |  // GTEST_SKIP().
 2832|       |  // Note that the object will not be null
 2833|      2|  if (!Test::HasFatalFailure() && !Test::IsSkipped()) {
  ------------------
  |  Branch (2833:7): [True: 2, False: 0]
  |  Branch (2833:35): [True: 2, False: 0]
  ------------------
 2834|       |    // This doesn't throw as all user code that can throw are wrapped into
 2835|       |    // exception handling code.
 2836|      2|    test->Run();
 2837|      2|  }
 2838|       |
 2839|      2|  if (test != nullptr) {
  ------------------
  |  Branch (2839:7): [True: 0, False: 2]
  ------------------
 2840|       |    // Deletes the test object.
 2841|      0|    impl->os_stack_trace_getter()->UponLeavingGTest();
 2842|      0|    internal::HandleExceptionsInMethodIfSupported(
 2843|      0|        test, &Test::DeleteSelf_, "the test fixture's destructor");
 2844|      0|  }
 2845|       |
 2846|      2|  result_.set_elapsed_time(timer.Elapsed());
 2847|       |
 2848|       |  // Notifies the unit test event listener that a test has just finished.
 2849|      2|  repeater->OnTestEnd(*this);
 2850|       |
 2851|       |  // Tells UnitTest to stop associating assertion results to this
 2852|       |  // test.
 2853|      2|  impl->set_current_test_info(nullptr);
 2854|      2|}
_ZNK7testing9TestSuite17test_to_run_countEv:
 2911|      6|int TestSuite::test_to_run_count() const {
 2912|      6|  return CountIf(test_info_list_, ShouldRunTest);
 2913|      6|}
_ZNK7testing9TestSuite16total_test_countEv:
 2916|     10|int TestSuite::total_test_count() const {
 2917|     10|  return static_cast<int>(test_info_list_.size());
 2918|     10|}
_ZN7testing9TestSuiteC2EPKcS2_PFvvES4_:
 2932|      4|    : name_(a_name),
 2933|      4|      type_param_(a_type_param ? new std::string(a_type_param) : nullptr),
  ------------------
  |  Branch (2933:19): [True: 0, False: 4]
  ------------------
 2934|      4|      set_up_tc_(set_up_tc),
 2935|      4|      tear_down_tc_(tear_down_tc),
 2936|      4|      should_run_(false),
 2937|      4|      start_timestamp_(0),
 2938|      4|      elapsed_time_(0) {}
_ZNK7testing9TestSuite11GetTestInfoEi:
 2948|      4|const TestInfo* TestSuite::GetTestInfo(int i) const {
 2949|      4|  const int index = GetElementOr(test_indices_, i, -1);
 2950|      4|  return index < 0 ? nullptr : test_info_list_[static_cast<size_t>(index)];
  ------------------
  |  Branch (2950:10): [True: 0, False: 4]
  ------------------
 2951|      4|}
_ZN7testing9TestSuite18GetMutableTestInfoEi:
 2955|      2|TestInfo* TestSuite::GetMutableTestInfo(int i) {
 2956|      2|  const int index = GetElementOr(test_indices_, i, -1);
 2957|      2|  return index < 0 ? nullptr : test_info_list_[static_cast<size_t>(index)];
  ------------------
  |  Branch (2957:10): [True: 0, False: 2]
  ------------------
 2958|      2|}
_ZN7testing9TestSuite11AddTestInfoEPNS_8TestInfoE:
 2962|      4|void TestSuite::AddTestInfo(TestInfo* test_info) {
 2963|      4|  test_info_list_.push_back(test_info);
 2964|      4|  test_indices_.push_back(static_cast<int>(test_indices_.size()));
 2965|      4|}
_ZN7testing9TestSuite3RunEv:
 2968|      4|void TestSuite::Run() {
 2969|      4|  if (!should_run_) return;
  ------------------
  |  Branch (2969:7): [True: 2, False: 2]
  ------------------
 2970|       |
 2971|      2|  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
 2972|      2|  impl->set_current_test_suite(this);
 2973|       |
 2974|      2|  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
 2975|       |
 2976|       |  // Ensure our tests are in a deterministic order.
 2977|       |  //
 2978|       |  // We do this by sorting lexicographically on (file, line number), providing
 2979|       |  // an order matching what the user can see in the source code.
 2980|       |  //
 2981|       |  // In the common case the line number comparison shouldn't be necessary,
 2982|       |  // because the registrations made by the TEST macro are executed in order
 2983|       |  // within a translation unit. But this is not true of the manual registration
 2984|       |  // API, and in more exotic scenarios a single file may be part of multiple
 2985|       |  // translation units.
 2986|      2|  std::stable_sort(test_info_list_.begin(), test_info_list_.end(),
 2987|      2|                   [](const TestInfo* const a, const TestInfo* const b) {
 2988|      2|                     if (const int result = std::strcmp(a->file(), b->file())) {
 2989|      2|                       return result < 0;
 2990|      2|                     }
 2991|       |
 2992|      2|                     return a->line() < b->line();
 2993|      2|                   });
 2994|       |
 2995|       |  // Call both legacy and the new API
 2996|      2|  repeater->OnTestSuiteStart(*this);
 2997|       |//  Legacy API is deprecated but still available
 2998|      2|#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 2999|      2|  repeater->OnTestCaseStart(*this);
 3000|      2|#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 3001|       |
 3002|      2|  impl->os_stack_trace_getter()->UponLeavingGTest();
 3003|      2|  internal::HandleExceptionsInMethodIfSupported(
 3004|      2|      this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()");
 3005|       |
 3006|      2|  const bool skip_all =
 3007|      2|      ad_hoc_test_result().Failed() || ad_hoc_test_result().Skipped();
  ------------------
  |  Branch (3007:7): [True: 0, False: 2]
  |  Branch (3007:40): [True: 0, False: 2]
  ------------------
 3008|       |
 3009|      2|  start_timestamp_ = internal::GetTimeInMillis();
 3010|      2|  internal::Timer timer;
 3011|      4|  for (int i = 0; i < total_test_count(); i++) {
  ------------------
  |  Branch (3011:19): [True: 2, False: 2]
  ------------------
 3012|      2|    if (skip_all) {
  ------------------
  |  Branch (3012:9): [True: 0, False: 2]
  ------------------
 3013|      0|      GetMutableTestInfo(i)->Skip();
 3014|      2|    } else {
 3015|      2|      GetMutableTestInfo(i)->Run();
 3016|      2|    }
 3017|      2|    if (GTEST_FLAG_GET(fail_fast) &&
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  |  |  |  Branch (2293:30): [True: 0, False: 2]
  |  |  ------------------
  ------------------
 3018|      0|        GetMutableTestInfo(i)->result()->Failed()) {
  ------------------
  |  Branch (3018:9): [True: 0, False: 0]
  ------------------
 3019|      0|      for (int j = i + 1; j < total_test_count(); j++) {
  ------------------
  |  Branch (3019:27): [True: 0, False: 0]
  ------------------
 3020|      0|        GetMutableTestInfo(j)->Skip();
 3021|      0|      }
 3022|      0|      break;
 3023|      0|    }
 3024|      2|  }
 3025|      2|  elapsed_time_ = timer.Elapsed();
 3026|       |
 3027|      2|  impl->os_stack_trace_getter()->UponLeavingGTest();
 3028|      2|  internal::HandleExceptionsInMethodIfSupported(
 3029|      2|      this, &TestSuite::RunTearDownTestSuite, "TearDownTestSuite()");
 3030|       |
 3031|       |  // Call both legacy and the new API
 3032|      2|  repeater->OnTestSuiteEnd(*this);
 3033|       |//  Legacy API is deprecated but still available
 3034|      2|#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 3035|      2|  repeater->OnTestCaseEnd(*this);
 3036|      2|#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 3037|       |
 3038|      2|  impl->set_current_test_suite(nullptr);
 3039|      2|}
_ZN7testing9TestSuite11ClearResultEv:
 3072|      4|void TestSuite::ClearResult() {
 3073|      4|  ad_hoc_test_result_.Clear();
 3074|      4|  ForEach(test_info_list_, TestInfo::ClearTestResult);
 3075|      4|}
_ZN7testing8internal14ShouldUseColorEb:
 3238|      2|bool ShouldUseColor(bool stdout_is_tty) {
 3239|      2|  std::string c = GTEST_FLAG_GET(color);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
 3240|      2|  const char* const gtest_color = c.c_str();
 3241|       |
 3242|      2|  if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
  ------------------
  |  Branch (3242:7): [True: 2, False: 0]
  ------------------
 3243|       |#if defined(GTEST_OS_WINDOWS) && !defined(GTEST_OS_WINDOWS_MINGW)
 3244|       |    // On Windows the TERM variable is usually not set, but the
 3245|       |    // console there does support colors.
 3246|       |    return stdout_is_tty;
 3247|       |#else
 3248|       |    // On non-Windows platforms, we rely on the TERM variable.
 3249|      2|    const char* const term = posix::GetEnv("TERM");
 3250|      2|    const bool term_supports_color =
 3251|      2|        term != nullptr && (String::CStringEquals(term, "xterm") ||
  ------------------
  |  Branch (3251:9): [True: 0, False: 2]
  |  Branch (3251:29): [True: 0, False: 0]
  ------------------
 3252|      0|                            String::CStringEquals(term, "xterm-color") ||
  ------------------
  |  Branch (3252:29): [True: 0, False: 0]
  ------------------
 3253|      0|                            String::CStringEquals(term, "xterm-kitty") ||
  ------------------
  |  Branch (3253:29): [True: 0, False: 0]
  ------------------
 3254|      0|                            String::CStringEquals(term, "screen") ||
  ------------------
  |  Branch (3254:29): [True: 0, False: 0]
  ------------------
 3255|      0|                            String::CStringEquals(term, "tmux") ||
  ------------------
  |  Branch (3255:29): [True: 0, False: 0]
  ------------------
 3256|      0|                            String::CStringEquals(term, "rxvt-unicode") ||
  ------------------
  |  Branch (3256:29): [True: 0, False: 0]
  ------------------
 3257|      0|                            String::CStringEquals(term, "linux") ||
  ------------------
  |  Branch (3257:29): [True: 0, False: 0]
  ------------------
 3258|      0|                            String::CStringEquals(term, "cygwin") ||
  ------------------
  |  Branch (3258:29): [True: 0, False: 0]
  ------------------
 3259|      0|                            String::EndsWithCaseInsensitive(term, "-256color"));
  ------------------
  |  Branch (3259:29): [True: 0, False: 0]
  ------------------
 3260|      2|    return stdout_is_tty && term_supports_color;
  ------------------
  |  Branch (3260:12): [True: 0, False: 2]
  |  Branch (3260:29): [True: 0, False: 0]
  ------------------
 3261|      2|#endif  // GTEST_OS_WINDOWS
 3262|      2|  }
 3263|       |
 3264|      0|  return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
  ------------------
  |  Branch (3264:10): [True: 0, False: 0]
  ------------------
 3265|      0|         String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
  ------------------
  |  Branch (3265:10): [True: 0, False: 0]
  ------------------
 3266|      0|         String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
  ------------------
  |  Branch (3266:10): [True: 0, False: 0]
  ------------------
 3267|      0|         String::CStringEquals(gtest_color, "1");
  ------------------
  |  Branch (3267:10): [True: 0, False: 0]
  ------------------
 3268|       |  // We take "yes", "true", "t", and "1" as meaning "yes".  If the
 3269|       |  // value is neither one of these nor "auto", we treat it as "no" to
 3270|       |  // be conservative.
 3271|      2|}
_ZN7testing8internal27PrettyUnitTestResultPrinter20OnTestIterationStartERKNS_8UnitTestEi:
 3394|      2|    const UnitTest& unit_test, int iteration) {
 3395|      2|  if (GTEST_FLAG_GET(repeat) != 1)
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  |  Branch (3395:7): [True: 0, False: 2]
  ------------------
 3396|      0|    printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
 3397|       |
 3398|      2|  std::string f = GTEST_FLAG_GET(filter);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
 3399|      2|  const char* const filter = f.c_str();
 3400|       |
 3401|       |  // Prints the filter if it's not *.  This reminds the user that some
 3402|       |  // tests may be skipped.
 3403|      2|  if (!String::CStringEquals(filter, kUniversalFilter)) {
  ------------------
  |  Branch (3403:7): [True: 2, False: 0]
  ------------------
 3404|      2|    ColoredPrintf(GTestColor::kYellow, "Note: %s filter = %s\n", GTEST_NAME_,
  ------------------
  |  |  334|      2|#define GTEST_NAME_ "Google Test"
  ------------------
 3405|      2|                  filter);
 3406|      2|  }
 3407|       |
 3408|      2|  if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
  ------------------
  |  Branch (3408:7): [True: 0, False: 2]
  ------------------
 3409|      0|    const int32_t shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
 3410|      0|    ColoredPrintf(GTestColor::kYellow, "Note: This is test shard %d of %s.\n",
 3411|      0|                  static_cast<int>(shard_index) + 1,
 3412|      0|                  internal::posix::GetEnv(kTestTotalShards));
 3413|      0|  }
 3414|       |
 3415|      2|  if (GTEST_FLAG_GET(shuffle)) {
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  |  |  |  Branch (2293:30): [True: 0, False: 2]
  |  |  ------------------
  ------------------
 3416|      0|    ColoredPrintf(GTestColor::kYellow,
 3417|      0|                  "Note: Randomizing tests' orders with a seed of %d .\n",
 3418|      0|                  unit_test.random_seed());
 3419|      0|  }
 3420|       |
 3421|      2|  ColoredPrintf(GTestColor::kGreen, "[==========] ");
 3422|      2|  printf("Running %s from %s.\n",
 3423|      2|         FormatTestCount(unit_test.test_to_run_count()).c_str(),
 3424|      2|         FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
 3425|       |  fflush(stdout);
 3426|      2|}
_ZN7testing8internal27PrettyUnitTestResultPrinter24OnEnvironmentsSetUpStartERKNS_8UnitTestE:
 3429|      2|    const UnitTest& /*unit_test*/) {
 3430|      2|  ColoredPrintf(GTestColor::kGreen, "[----------] ");
 3431|      2|  printf("Global test environment set-up.\n");
 3432|       |  fflush(stdout);
 3433|      2|}
_ZN7testing8internal27PrettyUnitTestResultPrinter15OnTestCaseStartERKNS_9TestSuiteE:
 3436|      2|void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) {
 3437|      2|  const std::string counts =
 3438|      2|      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
 3439|      2|  ColoredPrintf(GTestColor::kGreen, "[----------] ");
 3440|      2|  printf("%s from %s", counts.c_str(), test_case.name());
 3441|      2|  if (test_case.type_param() == nullptr) {
  ------------------
  |  Branch (3441:7): [True: 2, False: 0]
  ------------------
 3442|      2|    printf("\n");
 3443|      2|  } else {
 3444|      0|    printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param());
 3445|      0|  }
 3446|       |  fflush(stdout);
 3447|      2|}
_ZN7testing8internal27PrettyUnitTestResultPrinter11OnTestStartERKNS_8TestInfoE:
 3464|      2|void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
 3465|      2|  ColoredPrintf(GTestColor::kGreen, "[ RUN      ] ");
 3466|      2|  PrintTestName(test_info.test_suite_name(), test_info.name());
 3467|      2|  printf("\n");
 3468|       |  fflush(stdout);
 3469|      2|}
_ZN7testing8internal17TestEventRepeater6AppendEPNS_17TestEventListenerE:
 3815|      6|void TestEventRepeater::Append(TestEventListener* listener) {
 3816|      6|  listeners_.push_back(listener);
 3817|      6|}
_ZN7testing8internal17TestEventRepeater7ReleaseEPNS_17TestEventListenerE:
 3819|      2|TestEventListener* TestEventRepeater::Release(TestEventListener* listener) {
 3820|      2|  for (size_t i = 0; i < listeners_.size(); ++i) {
  ------------------
  |  Branch (3820:22): [True: 0, False: 2]
  ------------------
 3821|      0|    if (listeners_[i] == listener) {
  ------------------
  |  Branch (3821:9): [True: 0, False: 0]
  ------------------
 3822|      0|      listeners_.erase(listeners_.begin() + static_cast<int>(i));
 3823|      0|      return listener;
 3824|      0|    }
 3825|      0|  }
 3826|       |
 3827|      2|  return nullptr;
 3828|      2|}
_ZN7testing8internal17TestEventRepeater18OnTestProgramStartERKNS_8UnitTestE:
 3833|      2|  void TestEventRepeater::Name(const Type& parameter) { \
 3834|      2|    if (forwarding_enabled_) {                          \
  ------------------
  |  Branch (3834:9): [True: 2, False: 0]
  ------------------
 3835|      8|      for (size_t i = 0; i < listeners_.size(); i++) {  \
  ------------------
  |  Branch (3835:26): [True: 6, False: 2]
  ------------------
 3836|      6|        listeners_[i]->Name(parameter);                 \
 3837|      6|      }                                                 \
 3838|      2|    }                                                   \
 3839|      2|  }
_ZN7testing8internal17TestEventRepeater24OnEnvironmentsSetUpStartERKNS_8UnitTestE:
 3833|      2|  void TestEventRepeater::Name(const Type& parameter) { \
 3834|      2|    if (forwarding_enabled_) {                          \
  ------------------
  |  Branch (3834:9): [True: 2, False: 0]
  ------------------
 3835|      8|      for (size_t i = 0; i < listeners_.size(); i++) {  \
  ------------------
  |  Branch (3835:26): [True: 6, False: 2]
  ------------------
 3836|      6|        listeners_[i]->Name(parameter);                 \
 3837|      6|      }                                                 \
 3838|      2|    }                                                   \
 3839|      2|  }
_ZN7testing8internal17TestEventRepeater15OnTestCaseStartERKNS_9TestSuiteE:
 3833|      2|  void TestEventRepeater::Name(const Type& parameter) { \
 3834|      2|    if (forwarding_enabled_) {                          \
  ------------------
  |  Branch (3834:9): [True: 2, False: 0]
  ------------------
 3835|      8|      for (size_t i = 0; i < listeners_.size(); i++) {  \
  ------------------
  |  Branch (3835:26): [True: 6, False: 2]
  ------------------
 3836|      6|        listeners_[i]->Name(parameter);                 \
 3837|      6|      }                                                 \
 3838|      2|    }                                                   \
 3839|      2|  }
_ZN7testing8internal17TestEventRepeater16OnTestSuiteStartERKNS_9TestSuiteE:
 3833|      2|  void TestEventRepeater::Name(const Type& parameter) { \
 3834|      2|    if (forwarding_enabled_) {                          \
  ------------------
  |  Branch (3834:9): [True: 2, False: 0]
  ------------------
 3835|      8|      for (size_t i = 0; i < listeners_.size(); i++) {  \
  ------------------
  |  Branch (3835:26): [True: 6, False: 2]
  ------------------
 3836|      6|        listeners_[i]->Name(parameter);                 \
 3837|      6|      }                                                 \
 3838|      2|    }                                                   \
 3839|      2|  }
_ZN7testing8internal17TestEventRepeater11OnTestStartERKNS_8TestInfoE:
 3833|      2|  void TestEventRepeater::Name(const Type& parameter) { \
 3834|      2|    if (forwarding_enabled_) {                          \
  ------------------
  |  Branch (3834:9): [True: 2, False: 0]
  ------------------
 3835|      8|      for (size_t i = 0; i < listeners_.size(); i++) {  \
  ------------------
  |  Branch (3835:26): [True: 6, False: 2]
  ------------------
 3836|      6|        listeners_[i]->Name(parameter);                 \
 3837|      6|      }                                                 \
 3838|      2|    }                                                   \
 3839|      2|  }
_ZN7testing8internal17TestEventRepeater22OnEnvironmentsSetUpEndERKNS_8UnitTestE:
 3843|      2|  void TestEventRepeater::Name(const Type& parameter) { \
 3844|      2|    if (forwarding_enabled_) {                          \
  ------------------
  |  Branch (3844:9): [True: 2, False: 0]
  ------------------
 3845|      8|      for (size_t i = listeners_.size(); i != 0; i--) { \
  ------------------
  |  Branch (3845:42): [True: 6, False: 2]
  ------------------
 3846|      6|        listeners_[i - 1]->Name(parameter);             \
 3847|      6|      }                                                 \
 3848|      2|    }                                                   \
 3849|      2|  }
_ZN7testing8internal17TestEventRepeater20OnTestIterationStartERKNS_8UnitTestEi:
 3876|      2|                                             int iteration) {
 3877|      2|  if (forwarding_enabled_) {
  ------------------
  |  Branch (3877:7): [True: 2, False: 0]
  ------------------
 3878|      8|    for (size_t i = 0; i < listeners_.size(); i++) {
  ------------------
  |  Branch (3878:24): [True: 6, False: 2]
  ------------------
 3879|      6|      listeners_[i]->OnTestIterationStart(unit_test, iteration);
 3880|      6|    }
 3881|      2|  }
 3882|      2|}
_ZN7testing8internal18OsStackTraceGetter16UponLeavingGTestEv:
 5021|      8|void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) {
 5022|       |#ifdef GTEST_HAS_ABSL
 5023|       |  void* caller_frame = nullptr;
 5024|       |  if (absl::GetStackTrace(&caller_frame, 1, 3) <= 0) {
 5025|       |    caller_frame = nullptr;
 5026|       |  }
 5027|       |
 5028|       |  MutexLock lock(&mutex_);
 5029|       |  caller_frame_ = caller_frame;
 5030|       |#endif  // GTEST_HAS_ABSL
 5031|      8|}
_ZN7testing18TestEventListenersC2Ev:
 5078|      2|    : repeater_(new internal::TestEventRepeater()),
 5079|      2|      default_result_printer_(nullptr),
 5080|      2|      default_xml_generator_(nullptr) {}
_ZN7testing18TestEventListeners6AppendEPNS_17TestEventListenerE:
 5088|      6|void TestEventListeners::Append(TestEventListener* listener) {
 5089|      6|  repeater_->Append(listener);
 5090|      6|}
_ZN7testing18TestEventListeners7ReleaseEPNS_17TestEventListenerE:
 5095|      2|TestEventListener* TestEventListeners::Release(TestEventListener* listener) {
 5096|      2|  if (listener == default_result_printer_)
  ------------------
  |  Branch (5096:7): [True: 2, False: 0]
  ------------------
 5097|      2|    default_result_printer_ = nullptr;
 5098|      0|  else if (listener == default_xml_generator_)
  ------------------
  |  Branch (5098:12): [True: 0, False: 0]
  ------------------
 5099|      0|    default_xml_generator_ = nullptr;
 5100|      2|  return repeater_->Release(listener);
 5101|      2|}
_ZN7testing18TestEventListeners8repeaterEv:
 5105|      6|TestEventListener* TestEventListeners::repeater() { return repeater_; }
_ZN7testing18TestEventListeners23SetDefaultResultPrinterEPNS_17TestEventListenerE:
 5112|      2|void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) {
 5113|      2|  if (default_result_printer_ != listener) {
  ------------------
  |  Branch (5113:7): [True: 2, False: 0]
  ------------------
 5114|       |    // It is an error to pass this method a listener that is already in the
 5115|       |    // list.
 5116|      2|    delete Release(default_result_printer_);
 5117|      2|    default_result_printer_ = listener;
 5118|      2|    if (listener != nullptr) Append(listener);
  ------------------
  |  Branch (5118:9): [True: 2, False: 0]
  ------------------
 5119|      2|  }
 5120|      2|}
_ZN7testing8UnitTest11GetInstanceEv:
 5156|     50|UnitTest* UnitTest::GetInstance() {
 5157|       |  // CodeGear C++Builder insists on a public destructor for the
 5158|       |  // default implementation.  Use this implementation to keep good OO
 5159|       |  // design with private destructor.
 5160|       |
 5161|       |#if defined(__BORLANDC__)
 5162|       |  static UnitTest* const instance = new UnitTest;
 5163|       |  return instance;
 5164|       |#else
 5165|     50|  static UnitTest instance;
 5166|     50|  return &instance;
 5167|     50|#endif  // defined(__BORLANDC__)
 5168|     50|}
_ZNK7testing8UnitTest22total_test_suite_countEv:
 5181|      6|int UnitTest::total_test_suite_count() const {
 5182|      6|  return impl()->total_test_suite_count();
 5183|      6|}
_ZNK7testing8UnitTest23test_suite_to_run_countEv:
 5187|      2|int UnitTest::test_suite_to_run_count() const {
 5188|      2|  return impl()->test_suite_to_run_count();
 5189|      2|}
_ZNK7testing8UnitTest17test_to_run_countEv:
 5239|      2|int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
_ZNK7testing8UnitTest12GetTestSuiteEi:
 5262|     12|const TestSuite* UnitTest::GetTestSuite(int i) const {
 5263|     12|  return impl()->GetTestSuite(i);
 5264|     12|}
_ZN7testing8UnitTest9listenersEv:
 5287|      8|TestEventListeners& UnitTest::listeners() { return *impl()->listeners(); }
_ZN7testing8UnitTest3RunEv:
 5399|      2|int UnitTest::Run() {
 5400|      2|#ifdef GTEST_HAS_DEATH_TEST
 5401|      2|  const bool in_death_test_child_process =
 5402|      2|      GTEST_FLAG_GET(internal_run_death_test).length() > 0;
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
 5403|       |
 5404|       |  // Google Test implements this protocol for catching that a test
 5405|       |  // program exits before returning control to Google Test:
 5406|       |  //
 5407|       |  //   1. Upon start, Google Test creates a file whose absolute path
 5408|       |  //      is specified by the environment variable
 5409|       |  //      TEST_PREMATURE_EXIT_FILE.
 5410|       |  //   2. When Google Test has finished its work, it deletes the file.
 5411|       |  //
 5412|       |  // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before
 5413|       |  // running a Google-Test-based test program and check the existence
 5414|       |  // of the file at the end of the test execution to see if it has
 5415|       |  // exited prematurely.
 5416|       |
 5417|       |  // If we are in the child process of a death test, don't
 5418|       |  // create/delete the premature exit file, as doing so is unnecessary
 5419|       |  // and will confuse the parent process.  Otherwise, create/delete
 5420|       |  // the file upon entering/leaving this function.  If the program
 5421|       |  // somehow exits before this function has a chance to return, the
 5422|       |  // premature-exit file will be left undeleted, causing a test runner
 5423|       |  // that understands the premature-exit-file protocol to report the
 5424|       |  // test as having failed.
 5425|      2|  const internal::ScopedPrematureExitFile premature_exit_file(
 5426|      2|      in_death_test_child_process
  ------------------
  |  Branch (5426:7): [True: 0, False: 2]
  ------------------
 5427|      2|          ? nullptr
 5428|      2|          : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE"));
 5429|       |#else
 5430|       |  const bool in_death_test_child_process = false;
 5431|       |#endif  // GTEST_HAS_DEATH_TEST
 5432|       |
 5433|       |  // Captures the value of GTEST_FLAG(catch_exceptions).  This value will be
 5434|       |  // used for the duration of the program.
 5435|      2|  impl()->set_catch_exceptions(GTEST_FLAG_GET(catch_exceptions));
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
 5436|       |
 5437|       |#ifdef GTEST_OS_WINDOWS
 5438|       |  // Either the user wants Google Test to catch exceptions thrown by the
 5439|       |  // tests or this is executing in the context of death test child
 5440|       |  // process. In either case the user does not want to see pop-up dialogs
 5441|       |  // about crashes - they are expected.
 5442|       |  if (impl()->catch_exceptions() || in_death_test_child_process) {
 5443|       |#if !defined(GTEST_OS_WINDOWS_MOBILE) && !defined(GTEST_OS_WINDOWS_PHONE) && \
 5444|       |    !defined(GTEST_OS_WINDOWS_RT)
 5445|       |    // SetErrorMode doesn't exist on CE.
 5446|       |    SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
 5447|       |                 SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
 5448|       |#endif  // !GTEST_OS_WINDOWS_MOBILE
 5449|       |
 5450|       |#if (defined(_MSC_VER) || defined(GTEST_OS_WINDOWS_MINGW)) && \
 5451|       |    !defined(GTEST_OS_WINDOWS_MOBILE)
 5452|       |    // Death test children can be terminated with _abort().  On Windows,
 5453|       |    // _abort() can show a dialog with a warning message.  This forces the
 5454|       |    // abort message to go to stderr instead.
 5455|       |    _set_error_mode(_OUT_TO_STDERR);
 5456|       |#endif
 5457|       |
 5458|       |#if defined(_MSC_VER) && !defined(GTEST_OS_WINDOWS_MOBILE)
 5459|       |    // In the debug version, Visual Studio pops up a separate dialog
 5460|       |    // offering a choice to debug the aborted program. We need to suppress
 5461|       |    // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
 5462|       |    // executed. Google Test will notify the user of any unexpected
 5463|       |    // failure via stderr.
 5464|       |    if (!GTEST_FLAG_GET(break_on_failure))
 5465|       |      _set_abort_behavior(
 5466|       |          0x0,                                    // Clear the following flags:
 5467|       |          _WRITE_ABORT_MSG | _CALL_REPORTFAULT);  // pop-up window, core dump.
 5468|       |
 5469|       |    // In debug mode, the Windows CRT can crash with an assertion over invalid
 5470|       |    // input (e.g. passing an invalid file descriptor).  The default handling
 5471|       |    // for these assertions is to pop up a dialog and wait for user input.
 5472|       |    // Instead ask the CRT to dump such assertions to stderr non-interactively.
 5473|       |    if (!IsDebuggerPresent()) {
 5474|       |      (void)_CrtSetReportMode(_CRT_ASSERT,
 5475|       |                              _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG);
 5476|       |      (void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR);
 5477|       |    }
 5478|       |#endif
 5479|       |  }
 5480|       |#else
 5481|      2|  (void)in_death_test_child_process;  // Needed inside the #if block above
 5482|      2|#endif  // GTEST_OS_WINDOWS
 5483|       |
 5484|      2|  return internal::HandleExceptionsInMethodIfSupported(
  ------------------
  |  Branch (5484:10): [True: 0, False: 2]
  ------------------
 5485|      2|             impl(), &internal::UnitTestImpl::RunAllTests,
 5486|      2|             "auxiliary test code (environments or event listeners)")
 5487|      2|             ? 0
 5488|      2|             : 1;
 5489|      2|}
_ZN7testing8UnitTestC2Ev:
 5535|      2|UnitTest::UnitTest() { impl_ = new internal::UnitTestImpl(this); }
_ZN7testing8internal12UnitTestImplC2EPNS_8UnitTestE:
 5557|      2|    : parent_(parent),
 5558|       |      GTEST_DISABLE_MSC_WARNINGS_PUSH_(4355 /* using this in initializer */)
 5559|      2|          default_global_test_part_result_reporter_(this),
 5560|      2|      default_per_thread_test_part_result_reporter_(this),
 5561|       |      GTEST_DISABLE_MSC_WARNINGS_POP_() global_test_part_result_reporter_(
 5562|      2|          &default_global_test_part_result_reporter_),
 5563|      2|      per_thread_test_part_result_reporter_(
 5564|      2|          &default_per_thread_test_part_result_reporter_),
 5565|      2|      parameterized_test_registry_(),
 5566|      2|      parameterized_tests_registered_(false),
 5567|      2|      last_death_test_suite_(-1),
 5568|      2|      current_test_suite_(nullptr),
 5569|      2|      current_test_info_(nullptr),
 5570|      2|      ad_hoc_test_result_(),
 5571|      2|      os_stack_trace_getter_(nullptr),
 5572|      2|      post_flag_parse_init_performed_(false),
 5573|      2|      random_seed_(0),  // Will be overridden by the flag before first use.
 5574|      2|      random_(0),       // Will be reseeded before first use.
 5575|      2|      start_timestamp_(0),
 5576|      2|      elapsed_time_(0),
 5577|       |#ifdef GTEST_HAS_DEATH_TEST
 5578|      2|      death_test_factory_(new DefaultDeathTestFactory),
 5579|       |#endif
 5580|       |      // Will be overridden by the flag before first use.
 5581|      2|      catch_exceptions_(false) {
 5582|      2|  listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter);
 5583|      2|}
_ZN7testing8internal12UnitTestImpl32SuppressTestEventsIfInSubprocessEv:
 5620|      2|void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
 5621|      2|  if (internal_run_death_test_flag_ != nullptr)
  ------------------
  |  Branch (5621:7): [True: 0, False: 2]
  ------------------
 5622|      0|    listeners()->SuppressEventForwarding(true);
 5623|      2|}
_ZN7testing8internal12UnitTestImpl18ConfigureXmlOutputEv:
 5628|      2|void UnitTestImpl::ConfigureXmlOutput() {
 5629|      2|  const std::string& output_format = UnitTestOptions::GetOutputFormat();
 5630|      2|#if GTEST_HAS_FILE_SYSTEM
 5631|      2|  if (output_format == "xml") {
  ------------------
  |  Branch (5631:7): [True: 0, False: 2]
  ------------------
 5632|      0|    listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter(
 5633|      0|        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
 5634|      2|  } else if (output_format == "json") {
  ------------------
  |  Branch (5634:14): [True: 0, False: 2]
  ------------------
 5635|      0|    listeners()->SetDefaultXmlGenerator(new JsonUnitTestResultPrinter(
 5636|      0|        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
 5637|      2|  } else if (!output_format.empty()) {
  ------------------
  |  Branch (5637:14): [True: 0, False: 2]
  ------------------
 5638|      0|    GTEST_LOG_(WARNING) << "WARNING: unrecognized output format \""
  ------------------
  |  | 1054|      0|  ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
  |  | 1055|      0|                                __FILE__, __LINE__)                    \
  |  | 1056|      0|      .GetStream()
  ------------------
 5639|      0|                        << output_format << "\" ignored.";
 5640|      0|  }
 5641|       |#else
 5642|       |  if (!output_format.empty()) {
 5643|       |    GTEST_LOG_(ERROR) << "ERROR: alternative output formats require "
 5644|       |                      << "GTEST_HAS_FILE_SYSTEM to be enabled";
 5645|       |  }
 5646|       |#endif  // GTEST_HAS_FILE_SYSTEM
 5647|      2|}
_ZN7testing8internal12UnitTestImpl24ConfigureStreamingOutputEv:
 5652|      2|void UnitTestImpl::ConfigureStreamingOutput() {
 5653|      2|  const std::string& target = GTEST_FLAG_GET(stream_result_to);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
 5654|      2|  if (!target.empty()) {
  ------------------
  |  Branch (5654:7): [True: 0, False: 2]
  ------------------
 5655|      0|    const size_t pos = target.find(':');
 5656|      0|    if (pos != std::string::npos) {
  ------------------
  |  Branch (5656:9): [True: 0, False: 0]
  ------------------
 5657|      0|      listeners()->Append(
 5658|      0|          new StreamingListener(target.substr(0, pos), target.substr(pos + 1)));
 5659|      0|    } else {
 5660|      0|      GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target
  ------------------
  |  | 1054|      0|  ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
  |  | 1055|      0|                                __FILE__, __LINE__)                    \
  |  | 1056|      0|      .GetStream()
  ------------------
 5661|      0|                          << "\" ignored.";
 5662|      0|    }
 5663|      0|  }
 5664|      2|}
_ZN7testing8internal12UnitTestImpl19PostFlagParsingInitEv:
 5672|      4|void UnitTestImpl::PostFlagParsingInit() {
 5673|       |  // Ensures that this function does not execute more than once.
 5674|      4|  if (!post_flag_parse_init_performed_) {
  ------------------
  |  Branch (5674:7): [True: 2, False: 2]
  ------------------
 5675|      2|    post_flag_parse_init_performed_ = true;
 5676|       |
 5677|       |#if defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_)
 5678|       |    // Register to send notifications about key process state changes.
 5679|       |    listeners()->Append(new GTEST_CUSTOM_TEST_EVENT_LISTENER_());
 5680|       |#endif  // defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_)
 5681|       |
 5682|      2|#ifdef GTEST_HAS_DEATH_TEST
 5683|      2|    InitDeathTestSubprocessControlInfo();
 5684|      2|    SuppressTestEventsIfInSubprocess();
 5685|      2|#endif  // GTEST_HAS_DEATH_TEST
 5686|       |
 5687|       |    // Registers parameterized tests. This makes parameterized tests
 5688|       |    // available to the UnitTest reflection API without running
 5689|       |    // RUN_ALL_TESTS.
 5690|      2|    RegisterParameterizedTests();
 5691|       |
 5692|       |    // Configures listeners for XML output. This makes it possible for users
 5693|       |    // to shut down the default XML output before invoking RUN_ALL_TESTS.
 5694|      2|    ConfigureXmlOutput();
 5695|       |
 5696|      2|    if (GTEST_FLAG_GET(brief)) {
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  |  |  |  Branch (2293:30): [True: 0, False: 2]
  |  |  ------------------
  ------------------
 5697|      0|      listeners()->SetDefaultResultPrinter(new BriefUnitTestResultPrinter);
 5698|      0|    }
 5699|       |
 5700|      2|#if GTEST_CAN_STREAM_RESULTS_
 5701|       |    // Configures listeners for streaming test results to the specified server.
 5702|      2|    ConfigureStreamingOutput();
 5703|      2|#endif  // GTEST_CAN_STREAM_RESULTS_
 5704|       |
 5705|       |#ifdef GTEST_HAS_ABSL
 5706|       |    if (GTEST_FLAG_GET(install_failure_signal_handler)) {
 5707|       |      absl::FailureSignalHandlerOptions options;
 5708|       |      absl::InstallFailureSignalHandler(options);
 5709|       |    }
 5710|       |#endif  // GTEST_HAS_ABSL
 5711|      2|  }
 5712|      4|}
_ZN7testing8internal12UnitTestImpl12GetTestSuiteEPKcS3_PFvvES5_:
 5752|      4|    internal::TearDownTestSuiteFunc tear_down_tc) {
 5753|       |  // Can we find a TestSuite with the given name?
 5754|      4|  const auto test_suite =
 5755|      4|      std::find_if(test_suites_.rbegin(), test_suites_.rend(),
 5756|      4|                   TestSuiteNameIs(test_suite_name));
 5757|       |
 5758|      4|  if (test_suite != test_suites_.rend()) return *test_suite;
  ------------------
  |  Branch (5758:7): [True: 0, False: 4]
  ------------------
 5759|       |
 5760|       |  // No.  Let's create one.
 5761|      4|  auto* const new_test_suite =
 5762|      4|      new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc);
 5763|       |
 5764|      4|  const UnitTestFilter death_test_suite_filter(kDeathTestSuiteFilter);
 5765|       |  // Is this a death test suite?
 5766|      4|  if (death_test_suite_filter.MatchesName(test_suite_name)) {
  ------------------
  |  Branch (5766:7): [True: 0, False: 4]
  ------------------
 5767|       |    // Yes.  Inserts the test suite after the last death test suite
 5768|       |    // defined so far.  This only works when the test suites haven't
 5769|       |    // been shuffled.  Otherwise we may end up running a death test
 5770|       |    // after a non-death test.
 5771|      0|    ++last_death_test_suite_;
 5772|      0|    test_suites_.insert(test_suites_.begin() + last_death_test_suite_,
 5773|      0|                        new_test_suite);
 5774|      4|  } else {
 5775|       |    // No.  Appends to the end of the list.
 5776|      4|    test_suites_.push_back(new_test_suite);
 5777|      4|  }
 5778|       |
 5779|      4|  test_suite_indices_.push_back(static_cast<int>(test_suite_indices_.size()));
 5780|      4|  return new_test_suite;
 5781|      4|}
_ZN7testing8internal12UnitTestImpl11RunAllTestsEv:
 5797|      2|bool UnitTestImpl::RunAllTests() {
 5798|       |  // True if and only if Google Test is initialized before RUN_ALL_TESTS() is
 5799|       |  // called.
 5800|      2|  const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized();
 5801|       |
 5802|       |  // Do not run any test if the --help flag was specified.
 5803|      2|  if (g_help_flag) return true;
  ------------------
  |  Branch (5803:7): [True: 0, False: 2]
  ------------------
 5804|       |
 5805|       |  // Repeats the call to the post-flag parsing initialization in case the
 5806|       |  // user didn't call InitGoogleTest.
 5807|      2|  PostFlagParsingInit();
 5808|       |
 5809|      2|#if GTEST_HAS_FILE_SYSTEM
 5810|       |  // Even if sharding is not on, test runners may want to use the
 5811|       |  // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding
 5812|       |  // protocol.
 5813|      2|  internal::WriteToShardStatusFileIfNeeded();
 5814|      2|#endif  // GTEST_HAS_FILE_SYSTEM
 5815|       |
 5816|       |  // True if and only if we are in a subprocess for running a thread-safe-style
 5817|       |  // death test.
 5818|      2|  bool in_subprocess_for_death_test = false;
 5819|       |
 5820|      2|#ifdef GTEST_HAS_DEATH_TEST
 5821|      2|  in_subprocess_for_death_test = (internal_run_death_test_flag_ != nullptr);
 5822|       |#if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
 5823|       |  if (in_subprocess_for_death_test) {
 5824|       |    GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_();
 5825|       |  }
 5826|       |#endif  // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
 5827|      2|#endif  // GTEST_HAS_DEATH_TEST
 5828|       |
 5829|      2|  const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
 5830|      2|                                        in_subprocess_for_death_test);
 5831|       |
 5832|       |  // Compares the full test names with the filter to decide which
 5833|       |  // tests to run.
 5834|      2|  const bool has_tests_to_run =
 5835|      2|      FilterTests(should_shard ? HONOR_SHARDING_PROTOCOL
  ------------------
  |  Branch (5835:19): [True: 0, False: 2]
  ------------------
 5836|      2|                               : IGNORE_SHARDING_PROTOCOL) > 0;
 5837|       |
 5838|       |  // Lists the tests and exits if the --gtest_list_tests flag was specified.
 5839|      2|  if (GTEST_FLAG_GET(list_tests)) {
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  |  |  |  Branch (2293:30): [True: 0, False: 2]
  |  |  ------------------
  ------------------
 5840|       |    // This must be called *after* FilterTests() has been called.
 5841|      0|    ListTestsMatchingFilter();
 5842|      0|    return true;
 5843|      0|  }
 5844|       |
 5845|      2|  random_seed_ = GetRandomSeedFromFlag(GTEST_FLAG_GET(random_seed));
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
 5846|       |
 5847|       |  // True if and only if at least one test has failed.
 5848|      2|  bool failed = false;
 5849|       |
 5850|      2|  TestEventListener* repeater = listeners()->repeater();
 5851|       |
 5852|      2|  start_timestamp_ = GetTimeInMillis();
 5853|      2|  repeater->OnTestProgramStart(*parent_);
 5854|       |
 5855|       |  // How many times to repeat the tests?  We don't want to repeat them
 5856|       |  // when we are inside the subprocess of a death test.
 5857|      2|  const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG_GET(repeat);
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
  |  Branch (5857:22): [True: 0, False: 2]
  ------------------
 5858|       |
 5859|       |  // Repeats forever if the repeat count is negative.
 5860|      2|  const bool gtest_repeat_forever = repeat < 0;
 5861|       |
 5862|       |  // Should test environments be set up and torn down for each repeat, or only
 5863|       |  // set up on the first and torn down on the last iteration? If there is no
 5864|       |  // "last" iteration because the tests will repeat forever, always recreate the
 5865|       |  // environments to avoid leaks in case one of the environments is using
 5866|       |  // resources that are external to this process. Without this check there would
 5867|       |  // be no way to clean up those external resources automatically.
 5868|      2|  const bool recreate_environments_when_repeating =
 5869|      2|      GTEST_FLAG_GET(recreate_environments_when_repeating) ||
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  |  |  |  Branch (2293:30): [True: 0, False: 2]
  |  |  ------------------
  ------------------
 5870|      2|      gtest_repeat_forever;
  ------------------
  |  Branch (5870:7): [True: 0, False: 2]
  ------------------
 5871|       |
 5872|      4|  for (int i = 0; gtest_repeat_forever || i != repeat; i++) {
  ------------------
  |  Branch (5872:19): [True: 2, False: 2]
  |  Branch (5872:43): [True: 2, False: 0]
  ------------------
 5873|       |    // We want to preserve failures generated by ad-hoc test
 5874|       |    // assertions executed before RUN_ALL_TESTS().
 5875|      2|    ClearNonAdHocTestResult();
 5876|       |
 5877|      2|    Timer timer;
 5878|       |
 5879|       |    // Shuffles test suites and tests if requested.
 5880|      2|    if (has_tests_to_run && GTEST_FLAG_GET(shuffle)) {
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  |  |  |  Branch (2293:30): [True: 0, False: 2]
  |  |  ------------------
  ------------------
  |  Branch (5880:9): [True: 2, False: 0]
  ------------------
 5881|      0|      random()->Reseed(static_cast<uint32_t>(random_seed_));
 5882|       |      // This should be done before calling OnTestIterationStart(),
 5883|       |      // such that a test event listener can see the actual test order
 5884|       |      // in the event.
 5885|      0|      ShuffleTests();
 5886|      0|    }
 5887|       |
 5888|       |    // Tells the unit test event listeners that the tests are about to start.
 5889|      2|    repeater->OnTestIterationStart(*parent_, i);
 5890|       |
 5891|       |    // Runs each test suite if there is at least one test to run.
 5892|      2|    if (has_tests_to_run) {
  ------------------
  |  Branch (5892:9): [True: 2, False: 0]
  ------------------
 5893|       |      // Sets up all environments beforehand. If test environments aren't
 5894|       |      // recreated for each iteration, only do so on the first iteration.
 5895|      2|      if (i == 0 || recreate_environments_when_repeating) {
  ------------------
  |  Branch (5895:11): [True: 2, False: 0]
  |  Branch (5895:21): [True: 0, False: 0]
  ------------------
 5896|      2|        repeater->OnEnvironmentsSetUpStart(*parent_);
 5897|      2|        ForEach(environments_, SetUpEnvironment);
 5898|      2|        repeater->OnEnvironmentsSetUpEnd(*parent_);
 5899|      2|      }
 5900|       |
 5901|       |      // Runs the tests only if there was no fatal failure or skip triggered
 5902|       |      // during global set-up.
 5903|      2|      if (Test::IsSkipped()) {
  ------------------
  |  Branch (5903:11): [True: 0, False: 2]
  ------------------
 5904|       |        // Emit diagnostics when global set-up calls skip, as it will not be
 5905|       |        // emitted by default.
 5906|      0|        TestResult& test_result =
 5907|      0|            *internal::GetUnitTestImpl()->current_test_result();
 5908|      0|        for (int j = 0; j < test_result.total_part_count(); ++j) {
  ------------------
  |  Branch (5908:25): [True: 0, False: 0]
  ------------------
 5909|      0|          const TestPartResult& test_part_result =
 5910|      0|              test_result.GetTestPartResult(j);
 5911|      0|          if (test_part_result.type() == TestPartResult::kSkip) {
  ------------------
  |  Branch (5911:15): [True: 0, False: 0]
  ------------------
 5912|      0|            const std::string& result = test_part_result.message();
 5913|      0|            printf("%s\n", result.c_str());
 5914|      0|          }
 5915|      0|        }
 5916|      0|        fflush(stdout);
 5917|      2|      } else if (!Test::HasFatalFailure()) {
  ------------------
  |  Branch (5917:18): [True: 2, False: 0]
  ------------------
 5918|      6|        for (int test_index = 0; test_index < total_test_suite_count();
  ------------------
  |  Branch (5918:34): [True: 4, False: 2]
  ------------------
 5919|      4|             test_index++) {
 5920|      4|          GetMutableSuiteCase(test_index)->Run();
 5921|      4|          if (GTEST_FLAG_GET(fail_fast) &&
  ------------------
  |  | 2293|      4|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      4|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  |  |  |  Branch (2293:30): [True: 0, False: 4]
  |  |  ------------------
  ------------------
 5922|      0|              GetMutableSuiteCase(test_index)->Failed()) {
  ------------------
  |  Branch (5922:15): [True: 0, False: 0]
  ------------------
 5923|      0|            for (int j = test_index + 1; j < total_test_suite_count(); j++) {
  ------------------
  |  Branch (5923:42): [True: 0, False: 0]
  ------------------
 5924|      0|              GetMutableSuiteCase(j)->Skip();
 5925|      0|            }
 5926|      0|            break;
 5927|      0|          }
 5928|      4|        }
 5929|      2|      } else if (Test::HasFatalFailure()) {
  ------------------
  |  Branch (5929:18): [True: 0, False: 0]
  ------------------
 5930|       |        // If there was a fatal failure during the global setup then we know we
 5931|       |        // aren't going to run any tests. Explicitly mark all of the tests as
 5932|       |        // skipped to make this obvious in the output.
 5933|      0|        for (int test_index = 0; test_index < total_test_suite_count();
  ------------------
  |  Branch (5933:34): [True: 0, False: 0]
  ------------------
 5934|      0|             test_index++) {
 5935|      0|          GetMutableSuiteCase(test_index)->Skip();
 5936|      0|        }
 5937|      0|      }
 5938|       |
 5939|       |      // Tears down all environments in reverse order afterwards. If test
 5940|       |      // environments aren't recreated for each iteration, only do so on the
 5941|       |      // last iteration.
 5942|      2|      if (i == repeat - 1 || recreate_environments_when_repeating) {
  ------------------
  |  Branch (5942:11): [True: 2, False: 0]
  |  Branch (5942:30): [True: 0, False: 0]
  ------------------
 5943|      0|        repeater->OnEnvironmentsTearDownStart(*parent_);
 5944|      0|        std::for_each(environments_.rbegin(), environments_.rend(),
 5945|      0|                      TearDownEnvironment);
 5946|      0|        repeater->OnEnvironmentsTearDownEnd(*parent_);
 5947|      0|      }
 5948|      2|    }
 5949|       |
 5950|      2|    elapsed_time_ = timer.Elapsed();
 5951|       |
 5952|       |    // Tells the unit test event listener that the tests have just finished.
 5953|      2|    repeater->OnTestIterationEnd(*parent_, i);
 5954|       |
 5955|       |    // Gets the result and clears it.
 5956|      2|    if (!Passed()) {
  ------------------
  |  Branch (5956:9): [True: 0, False: 2]
  ------------------
 5957|      0|      failed = true;
 5958|      0|    }
 5959|       |
 5960|       |    // Restores the original test order after the iteration.  This
 5961|       |    // allows the user to quickly repro a failure that happens in the
 5962|       |    // N-th iteration without repeating the first (N - 1) iterations.
 5963|       |    // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in
 5964|       |    // case the user somehow changes the value of the flag somewhere
 5965|       |    // (it's always safe to unshuffle the tests).
 5966|      2|    UnshuffleTests();
 5967|       |
 5968|      2|    if (GTEST_FLAG_GET(shuffle)) {
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  |  |  |  Branch (2293:30): [True: 0, False: 2]
  |  |  ------------------
  ------------------
 5969|       |      // Picks a new random seed for each iteration.
 5970|      0|      random_seed_ = GetNextRandomSeed(random_seed_);
 5971|      0|    }
 5972|      2|  }
 5973|       |
 5974|      2|  repeater->OnTestProgramEnd(*parent_);
 5975|       |
 5976|      2|  if (!gtest_is_initialized_before_run_all_tests) {
  ------------------
  |  Branch (5976:7): [True: 0, False: 2]
  ------------------
 5977|      0|    ColoredPrintf(
 5978|      0|        GTestColor::kRed,
 5979|      0|        "\nIMPORTANT NOTICE - DO NOT IGNORE:\n"
 5980|      0|        "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_
 5981|      0|        "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_
 5982|      0|        " will start to enforce the valid usage. "
 5983|      0|        "Please fix it ASAP, or IT WILL START TO FAIL.\n");  // NOLINT
 5984|      0|  }
 5985|       |
 5986|      2|  return !failed;
 5987|      2|}
_ZN7testing8internal30WriteToShardStatusFileIfNeededEv:
 5994|      2|void WriteToShardStatusFileIfNeeded() {
 5995|      2|  const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile);
 5996|      2|  if (test_shard_file != nullptr) {
  ------------------
  |  Branch (5996:7): [True: 0, False: 2]
  ------------------
 5997|      0|    FILE* const file = posix::FOpen(test_shard_file, "w");
 5998|      0|    if (file == nullptr) {
  ------------------
  |  Branch (5998:9): [True: 0, False: 0]
  ------------------
 5999|      0|      ColoredPrintf(GTestColor::kRed,
 6000|      0|                    "Could not write to the test shard status file \"%s\" "
 6001|      0|                    "specified by the %s environment variable.\n",
 6002|      0|                    test_shard_file, kTestShardStatusFile);
 6003|      0|      fflush(stdout);
 6004|      0|      exit(EXIT_FAILURE);
 6005|      0|    }
 6006|      0|    fclose(file);
 6007|      0|  }
 6008|      2|}
_ZN7testing8internal11ShouldShardEPKcS2_b:
 6018|      4|                 bool in_subprocess_for_death_test) {
 6019|      4|  if (in_subprocess_for_death_test) {
  ------------------
  |  Branch (6019:7): [True: 0, False: 4]
  ------------------
 6020|      0|    return false;
 6021|      0|  }
 6022|       |
 6023|      4|  const int32_t total_shards = Int32FromEnvOrDie(total_shards_env, -1);
 6024|      4|  const int32_t shard_index = Int32FromEnvOrDie(shard_index_env, -1);
 6025|       |
 6026|      4|  if (total_shards == -1 && shard_index == -1) {
  ------------------
  |  Branch (6026:7): [True: 4, False: 0]
  |  Branch (6026:29): [True: 4, False: 0]
  ------------------
 6027|      4|    return false;
 6028|      4|  } else if (total_shards == -1 && shard_index != -1) {
  ------------------
  |  Branch (6028:14): [True: 0, False: 0]
  |  Branch (6028:36): [True: 0, False: 0]
  ------------------
 6029|      0|    const Message msg = Message() << "Invalid environment variables: you have "
 6030|      0|                                  << kTestShardIndex << " = " << shard_index
 6031|      0|                                  << ", but have left " << kTestTotalShards
 6032|      0|                                  << " unset.\n";
 6033|      0|    ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
 6034|      0|    fflush(stdout);
 6035|      0|    exit(EXIT_FAILURE);
 6036|      0|  } else if (total_shards != -1 && shard_index == -1) {
  ------------------
  |  Branch (6036:14): [True: 0, False: 0]
  |  Branch (6036:36): [True: 0, False: 0]
  ------------------
 6037|      0|    const Message msg = Message()
 6038|      0|                        << "Invalid environment variables: you have "
 6039|      0|                        << kTestTotalShards << " = " << total_shards
 6040|      0|                        << ", but have left " << kTestShardIndex << " unset.\n";
 6041|      0|    ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
 6042|      0|    fflush(stdout);
 6043|      0|    exit(EXIT_FAILURE);
 6044|      0|  } else if (shard_index < 0 || shard_index >= total_shards) {
  ------------------
  |  Branch (6044:14): [True: 0, False: 0]
  |  Branch (6044:33): [True: 0, False: 0]
  ------------------
 6045|      0|    const Message msg =
 6046|      0|        Message() << "Invalid environment variables: we require 0 <= "
 6047|      0|                  << kTestShardIndex << " < " << kTestTotalShards
 6048|      0|                  << ", but you have " << kTestShardIndex << "=" << shard_index
 6049|      0|                  << ", " << kTestTotalShards << "=" << total_shards << ".\n";
 6050|      0|    ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
 6051|      0|    fflush(stdout);
 6052|      0|    exit(EXIT_FAILURE);
 6053|      0|  }
 6054|       |
 6055|      0|  return total_shards > 1;
 6056|      4|}
_ZN7testing8internal17Int32FromEnvOrDieEPKci:
 6061|      8|int32_t Int32FromEnvOrDie(const char* var, int32_t default_val) {
 6062|      8|  const char* str_val = posix::GetEnv(var);
 6063|      8|  if (str_val == nullptr) {
  ------------------
  |  Branch (6063:7): [True: 8, False: 0]
  ------------------
 6064|      8|    return default_val;
 6065|      8|  }
 6066|       |
 6067|      0|  int32_t result;
 6068|      0|  if (!ParseInt32(Message() << "The value of environment variable " << var,
  ------------------
  |  Branch (6068:7): [True: 0, False: 0]
  ------------------
 6069|      0|                  str_val, &result)) {
 6070|      0|    exit(EXIT_FAILURE);
 6071|      0|  }
 6072|      0|  return result;
 6073|      0|}
_ZN7testing8internal12UnitTestImpl11FilterTestsENS1_18ReactionToShardingE:
 6090|      2|int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
 6091|      2|  const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL
  ------------------
  |  Branch (6091:32): [True: 0, False: 2]
  ------------------
 6092|      2|                                   ? Int32FromEnvOrDie(kTestTotalShards, -1)
 6093|      2|                                   : -1;
 6094|      2|  const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL
  ------------------
  |  Branch (6094:31): [True: 0, False: 2]
  ------------------
 6095|      2|                                  ? Int32FromEnvOrDie(kTestShardIndex, -1)
 6096|      2|                                  : -1;
 6097|       |
 6098|      2|  const PositiveAndNegativeUnitTestFilter gtest_flag_filter(
 6099|      2|      GTEST_FLAG_GET(filter));
  ------------------
  |  | 2293|      2|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      2|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
 6100|      2|  const UnitTestFilter disable_test_filter(kDisableTestFilter);
 6101|       |  // num_runnable_tests are the number of tests that will
 6102|       |  // run across all shards (i.e., match filter and are not disabled).
 6103|       |  // num_selected_tests are the number of tests to be run on
 6104|       |  // this shard.
 6105|      2|  int num_runnable_tests = 0;
 6106|      2|  int num_selected_tests = 0;
 6107|      4|  for (auto* test_suite : test_suites_) {
  ------------------
  |  Branch (6107:25): [True: 4, False: 2]
  ------------------
 6108|      4|    const std::string& test_suite_name = test_suite->name();
 6109|      4|    test_suite->set_should_run(false);
 6110|       |
 6111|      8|    for (size_t j = 0; j < test_suite->test_info_list().size(); j++) {
  ------------------
  |  Branch (6111:24): [True: 4, False: 4]
  ------------------
 6112|      4|      TestInfo* const test_info = test_suite->test_info_list()[j];
 6113|      4|      const std::string test_name(test_info->name());
 6114|       |      // A test is disabled if test suite name or test name matches
 6115|       |      // kDisableTestFilter.
 6116|      4|      const bool is_disabled =
 6117|      4|          disable_test_filter.MatchesName(test_suite_name) ||
  ------------------
  |  Branch (6117:11): [True: 0, False: 4]
  ------------------
 6118|      4|          disable_test_filter.MatchesName(test_name);
  ------------------
  |  Branch (6118:11): [True: 0, False: 4]
  ------------------
 6119|      4|      test_info->is_disabled_ = is_disabled;
 6120|       |
 6121|      4|      const bool matches_filter =
 6122|      4|          gtest_flag_filter.MatchesTest(test_suite_name, test_name);
 6123|      4|      test_info->matches_filter_ = matches_filter;
 6124|       |
 6125|      4|      const bool is_runnable =
 6126|      4|          (GTEST_FLAG_GET(also_run_disabled_tests) || !is_disabled) &&
  ------------------
  |  | 2293|      4|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  ------------------
  |  |  |  | 2226|      4|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  |  |  |  Branch (2293:30): [True: 0, False: 4]
  |  |  ------------------
  ------------------
  |  Branch (6126:55): [True: 4, False: 0]
  ------------------
 6127|      4|          matches_filter;
  ------------------
  |  Branch (6127:11): [True: 2, False: 2]
  ------------------
 6128|       |
 6129|      4|      const bool is_in_another_shard =
 6130|      4|          shard_tests != IGNORE_SHARDING_PROTOCOL &&
  ------------------
  |  Branch (6130:11): [True: 0, False: 4]
  ------------------
 6131|      0|          !ShouldRunTestOnShard(total_shards, shard_index, num_runnable_tests);
  ------------------
  |  Branch (6131:11): [True: 0, False: 0]
  ------------------
 6132|      4|      test_info->is_in_another_shard_ = is_in_another_shard;
 6133|      4|      const bool is_selected = is_runnable && !is_in_another_shard;
  ------------------
  |  Branch (6133:32): [True: 2, False: 2]
  |  Branch (6133:47): [True: 2, False: 0]
  ------------------
 6134|       |
 6135|      4|      num_runnable_tests += is_runnable;
 6136|      4|      num_selected_tests += is_selected;
 6137|       |
 6138|      4|      test_info->should_run_ = is_selected;
 6139|      4|      test_suite->set_should_run(test_suite->should_run() || is_selected);
  ------------------
  |  Branch (6139:34): [True: 0, False: 4]
  |  Branch (6139:62): [True: 2, False: 2]
  ------------------
 6140|      4|    }
 6141|      4|  }
 6142|      2|  return num_selected_tests;
 6143|      2|}
_ZN7testing8internal12UnitTestImpl21os_stack_trace_getterEv:
 6238|      8|OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
 6239|      8|  if (os_stack_trace_getter_ == nullptr) {
  ------------------
  |  Branch (6239:7): [True: 2, False: 6]
  ------------------
 6240|       |#ifdef GTEST_OS_STACK_TRACE_GETTER_
 6241|       |    os_stack_trace_getter_ = new GTEST_OS_STACK_TRACE_GETTER_;
 6242|       |#else
 6243|      2|    os_stack_trace_getter_ = new OsStackTraceGetter;
 6244|      2|#endif  // GTEST_OS_STACK_TRACE_GETTER_
 6245|      2|  }
 6246|       |
 6247|      8|  return os_stack_trace_getter_;
 6248|      8|}
_ZN7testing8internal12UnitTestImpl19current_test_resultEv:
 6251|     12|TestResult* UnitTestImpl::current_test_result() {
 6252|     12|  if (current_test_info_ != nullptr) {
  ------------------
  |  Branch (6252:7): [True: 8, False: 4]
  ------------------
 6253|      8|    return &current_test_info_->result_;
 6254|      8|  }
 6255|      4|  if (current_test_suite_ != nullptr) {
  ------------------
  |  Branch (6255:7): [True: 0, False: 4]
  ------------------
 6256|      0|    return &current_test_suite_->ad_hoc_test_result_;
 6257|      0|  }
 6258|      4|  return &ad_hoc_test_result_;
 6259|      4|}
_ZN7testing8internal6IsTrueEb:
 6310|     26|bool IsTrue(bool condition) { return condition; }
_ZN7testing8internal10AlwaysTrueEv:
 6312|     16|bool AlwaysTrue() {
 6313|     16|#if GTEST_HAS_EXCEPTIONS
 6314|       |  // This condition is always false so AlwaysTrue() never actually throws,
 6315|       |  // but it makes the compiler think that it may throw.
 6316|     16|  if (IsTrue(false)) throw ClassUniqueToAlwaysTrue();
  ------------------
  |  Branch (6316:7): [True: 0, False: 16]
  ------------------
 6317|     16|#endif  // GTEST_HAS_EXCEPTIONS
 6318|     16|  return true;
 6319|     16|}
_ZN7testing8internal10SkipPrefixEPKcPS2_:
 6324|     73|bool SkipPrefix(const char* prefix, const char** pstr) {
 6325|     73|  const size_t prefix_len = strlen(prefix);
 6326|     73|  if (strncmp(*pstr, prefix, prefix_len) == 0) {
  ------------------
  |  Branch (6326:7): [True: 15, False: 58]
  ------------------
 6327|     15|    *pstr += prefix_len;
 6328|     15|    return true;
 6329|     15|  }
 6330|     58|  return false;
 6331|     73|}
_ZN7testing8internal9ParseFlagEPKcS2_Pi:
 6392|     45|bool ParseFlag(const char* str, const char* flag_name, int32_t* value) {
 6393|       |  // Gets the value of the flag as a string.
 6394|     45|  const char* const value_str = ParseFlagValue(str, flag_name, false);
 6395|       |
 6396|       |  // Aborts if the parsing failed.
 6397|     45|  if (value_str == nullptr) return false;
  ------------------
  |  Branch (6397:7): [True: 45, False: 0]
  ------------------
 6398|       |
 6399|       |  // Sets *value to the value of the flag.
 6400|      0|  return ParseInt32(Message() << "The value of flag --" << flag_name, value_str,
 6401|      0|                    value);
 6402|     45|}
_ZN7testing8internal24ParseGoogleTestFlagsOnlyEPiPPc:
 6685|      2|void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
 6686|       |#ifdef GTEST_HAS_ABSL
 6687|       |  if (*argc <= 0) return;
 6688|       |
 6689|       |  std::vector<char*> positional_args;
 6690|       |  std::vector<absl::UnrecognizedFlag> unrecognized_flags;
 6691|       |  absl::ParseAbseilFlagsOnly(*argc, argv, positional_args, unrecognized_flags);
 6692|       |  absl::flat_hash_set<absl::string_view> unrecognized;
 6693|       |  for (const auto& flag : unrecognized_flags) {
 6694|       |    unrecognized.insert(flag.flag_name);
 6695|       |  }
 6696|       |  absl::flat_hash_set<char*> positional;
 6697|       |  for (const auto& arg : positional_args) {
 6698|       |    positional.insert(arg);
 6699|       |  }
 6700|       |
 6701|       |  int out_pos = 1;
 6702|       |  int in_pos = 1;
 6703|       |  for (; in_pos < *argc; ++in_pos) {
 6704|       |    char* arg = argv[in_pos];
 6705|       |    absl::string_view arg_str(arg);
 6706|       |    if (absl::ConsumePrefix(&arg_str, "--")) {
 6707|       |      // Flag-like argument. If the flag was unrecognized, keep it.
 6708|       |      // If it was a GoogleTest flag, remove it.
 6709|       |      if (unrecognized.contains(arg_str)) {
 6710|       |        argv[out_pos++] = argv[in_pos];
 6711|       |        continue;
 6712|       |      }
 6713|       |    }
 6714|       |
 6715|       |    if (arg_str.empty()) {
 6716|       |      ++in_pos;
 6717|       |      break;  // '--' indicates that the rest of the arguments are positional
 6718|       |    }
 6719|       |
 6720|       |    // Probably a positional argument. If it is in fact positional, keep it.
 6721|       |    // If it was a value for the flag argument, remove it.
 6722|       |    if (positional.contains(arg)) {
 6723|       |      argv[out_pos++] = arg;
 6724|       |    }
 6725|       |  }
 6726|       |
 6727|       |  // The rest are positional args for sure.
 6728|       |  while (in_pos < *argc) {
 6729|       |    argv[out_pos++] = argv[in_pos++];
 6730|       |  }
 6731|       |
 6732|       |  *argc = out_pos;
 6733|       |  argv[out_pos] = nullptr;
 6734|       |#else
 6735|      2|  ParseGoogleTestFlagsOnlyImpl(argc, argv);
 6736|      2|#endif
 6737|       |
 6738|       |  // Fix the value of *_NSGetArgc() on macOS, but if and only if
 6739|       |  // *_NSGetArgv() == argv
 6740|       |  // Only applicable to char** version of argv
 6741|       |#ifdef GTEST_OS_MAC
 6742|       |#ifndef GTEST_OS_IOS
 6743|       |  if (*_NSGetArgv() == argv) {
 6744|       |    *_NSGetArgc() = *argc;
 6745|       |  }
 6746|       |#endif
 6747|       |#endif
 6748|      2|}
_ZN7testing14InitGoogleTestEPiPPc:
 6794|      2|void InitGoogleTest(int* argc, char** argv) {
 6795|       |#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
 6796|       |  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
 6797|       |#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
 6798|      2|  internal::InitGoogleTestImpl(argc, argv);
 6799|      2|#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
 6800|      2|}
gtest-all.cc:_ZN7testingL18GetDefaultFailFastEv:
  239|      2|static bool GetDefaultFailFast() {
  240|      2|  const char* const testbridge_test_runner_fail_fast =
  241|      2|      internal::posix::GetEnv("TESTBRIDGE_TEST_RUNNER_FAIL_FAST");
  242|      2|  if (testbridge_test_runner_fail_fast != nullptr) {
  ------------------
  |  Branch (242:7): [True: 0, False: 2]
  ------------------
  243|      0|    return strcmp(testbridge_test_runner_fail_fast, "1") == 0;
  244|      0|  }
  245|      2|  return false;
  246|      2|}
gtest-all.cc:_ZN7testingL16GetDefaultFilterEv:
  228|      2|static const char* GetDefaultFilter() {
  229|      2|  const char* const testbridge_test_only =
  230|      2|      internal::posix::GetEnv("TESTBRIDGE_TEST_ONLY");
  231|      2|  if (testbridge_test_only != nullptr) {
  ------------------
  |  Branch (231:7): [True: 0, False: 2]
  ------------------
  232|      0|    return testbridge_test_only;
  233|      0|  }
  234|      2|  return kUniversalFilter;
  235|      2|}
gtest-all.cc:_ZN7testing8internal12_GLOBAL__N_114UnitTestFilterC2ERKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEE:
  762|      8|  explicit UnitTestFilter(const std::string& filter) {
  763|       |    // By design "" filter matches "" string.
  764|      8|    std::vector<std::string> all_patterns;
  765|      8|    SplitString(filter, ':', &all_patterns);
  766|      8|    const auto exact_match_patterns_begin = std::partition(
  767|      8|        all_patterns.begin(), all_patterns.end(), &IsGlobPattern);
  768|       |
  769|      8|    glob_patterns_.reserve(static_cast<size_t>(
  770|      8|        std::distance(all_patterns.begin(), exact_match_patterns_begin)));
  771|      8|    std::move(all_patterns.begin(), exact_match_patterns_begin,
  772|      8|              std::inserter(glob_patterns_, glob_patterns_.begin()));
  773|      8|    std::move(
  774|      8|        exact_match_patterns_begin, all_patterns.end(),
  775|      8|        std::inserter(exact_match_patterns_, exact_match_patterns_.begin()));
  776|      8|  }
gtest-all.cc:_ZN7testing8internal12_GLOBAL__N_113IsGlobPatternERKNSt3__112basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEE:
  752|     14|bool IsGlobPattern(const std::string& pattern) {
  753|     14|  return std::any_of(pattern.begin(), pattern.end(),
  754|     14|                     [](const char c) { return c == '?' || c == '*'; });
  755|     14|}
gtest-all.cc:_ZZN7testing8internal12_GLOBAL__N_113IsGlobPatternERKNSt3__112basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEENK3$_0clEc:
  754|     72|                     [](const char c) { return c == '?' || c == '*'; });
  ------------------
  |  Branch (754:48): [True: 0, False: 72]
  |  Branch (754:60): [True: 12, False: 60]
  ------------------
gtest-all.cc:_ZNK7testing8internal12_GLOBAL__N_114UnitTestFilter11MatchesNameERKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEE:
  780|     18|  bool MatchesName(const std::string& name) const {
  781|     18|    return exact_match_patterns_.count(name) > 0 ||
  ------------------
  |  Branch (781:12): [True: 2, False: 16]
  ------------------
  782|     16|           std::any_of(glob_patterns_.begin(), glob_patterns_.end(),
  ------------------
  |  Branch (782:12): [True: 0, False: 16]
  ------------------
  783|     16|                       [&name](const std::string& pattern) {
  784|     16|                         return PatternMatchesString(
  785|     16|                             name, pattern.c_str(),
  786|     16|                             pattern.c_str() + pattern.size());
  787|     16|                       });
  788|     18|  }
gtest-all.cc:_ZZNK7testing8internal12_GLOBAL__N_114UnitTestFilter11MatchesNameERKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEEENKUlSB_E_clESB_:
  783|     24|                       [&name](const std::string& pattern) {
  784|     24|                         return PatternMatchesString(
  785|     24|                             name, pattern.c_str(),
  786|     24|                             pattern.c_str() + pattern.size());
  787|     24|                       });
gtest-all.cc:_ZN7testing8internalL20PatternMatchesStringERKNSt3__112basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEEPKcSB_:
  704|     24|                                 const char* pattern, const char* pattern_end) {
  705|     24|  const char* name = name_str.c_str();
  706|     24|  const char* const name_begin = name;
  707|     24|  const char* const name_end = name + name_str.size();
  708|       |
  709|     24|  const char* pattern_next = pattern;
  710|     24|  const char* name_next = name;
  711|       |
  712|    420|  while (pattern < pattern_end || name < name_end) {
  ------------------
  |  Branch (712:10): [True: 420, False: 0]
  |  Branch (712:35): [True: 0, False: 0]
  ------------------
  713|    420|    if (pattern < pattern_end) {
  ------------------
  |  Branch (713:9): [True: 420, False: 0]
  ------------------
  714|    420|      switch (*pattern) {
  715|    220|        default:  // Match an ordinary character.
  ------------------
  |  Branch (715:9): [True: 220, False: 200]
  ------------------
  716|    220|          if (name < name_end && *name == *pattern) {
  ------------------
  |  Branch (716:15): [True: 204, False: 16]
  |  Branch (716:34): [True: 12, False: 192]
  ------------------
  717|     12|            ++pattern;
  718|     12|            ++name;
  719|     12|            continue;
  720|     12|          }
  721|    208|          break;
  722|    208|        case '?':  // Match any single character.
  ------------------
  |  Branch (722:9): [True: 0, False: 420]
  ------------------
  723|      0|          if (name < name_end) {
  ------------------
  |  Branch (723:15): [True: 0, False: 0]
  ------------------
  724|      0|            ++pattern;
  725|      0|            ++name;
  726|      0|            continue;
  727|      0|          }
  728|      0|          break;
  729|    200|        case '*':
  ------------------
  |  Branch (729:9): [True: 200, False: 220]
  ------------------
  730|       |          // Match zero or more characters. Start by skipping over the wildcard
  731|       |          // and matching zero characters from name. If that fails, restart and
  732|       |          // match one more character than the last attempt.
  733|    200|          pattern_next = pattern;
  734|    200|          name_next = name + 1;
  735|    200|          ++pattern;
  736|    200|          continue;
  737|    420|      }
  738|    420|    }
  739|       |    // Failed to match a character. Restart if possible.
  740|    208|    if (name_begin < name_next && name_next <= name_end) {
  ------------------
  |  Branch (740:9): [True: 200, False: 8]
  |  Branch (740:35): [True: 184, False: 16]
  ------------------
  741|    184|      pattern = pattern_next;
  742|    184|      name = name_next;
  743|    184|      continue;
  744|    184|    }
  745|     24|    return false;
  746|    208|  }
  747|      0|  return true;
  748|     24|}
gtest-all.cc:_ZN7testing8internal12_GLOBAL__N_133PositiveAndNegativeUnitTestFilterC2ERKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEE:
  802|      2|  explicit PositiveAndNegativeUnitTestFilter(const std::string& filter) {
  803|      2|    std::vector<std::string> positive_and_negative_filters;
  804|       |
  805|       |    // NOTE: `SplitString` always returns a non-empty container.
  806|      2|    SplitString(filter, '-', &positive_and_negative_filters);
  807|      2|    const auto& positive_filter = positive_and_negative_filters.front();
  808|       |
  809|      2|    if (positive_and_negative_filters.size() > 1) {
  ------------------
  |  Branch (809:9): [True: 0, False: 2]
  ------------------
  810|      0|      positive_filter_ = UnitTestFilter(
  811|      0|          positive_filter.empty() ? kUniversalFilter : positive_filter);
  ------------------
  |  Branch (811:11): [True: 0, False: 0]
  ------------------
  812|       |
  813|       |      // TODO(b/214626361): Fail on multiple '-' characters
  814|       |      // For the moment to preserve old behavior we concatenate the rest of the
  815|       |      // string parts with `-` as separator to generate the negative filter.
  816|      0|      auto negative_filter_string = positive_and_negative_filters[1];
  817|      0|      for (std::size_t i = 2; i < positive_and_negative_filters.size(); i++)
  ------------------
  |  Branch (817:31): [True: 0, False: 0]
  ------------------
  818|      0|        negative_filter_string =
  819|      0|            negative_filter_string + '-' + positive_and_negative_filters[i];
  820|      0|      negative_filter_ = UnitTestFilter(negative_filter_string);
  821|      2|    } else {
  822|       |      // In case we don't have a negative filter and positive filter is ""
  823|       |      // we do not use kUniversalFilter by design as opposed to when we have a
  824|       |      // negative filter.
  825|      2|      positive_filter_ = UnitTestFilter(positive_filter);
  826|      2|    }
  827|      2|  }
gtest-all.cc:_ZN7testing8internal12_GLOBAL__N_114UnitTestFilterC2Ev:
  759|      4|  UnitTestFilter() = default;
gtest-all.cc:_ZNK7testing8internal12_GLOBAL__N_133PositiveAndNegativeUnitTestFilter11MatchesTestERKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEESB_:
  833|      4|                   const std::string& test_name) const {
  834|      4|    return MatchesName(test_suite_name + "." + test_name);
  835|      4|  }
gtest-all.cc:_ZNK7testing8internal12_GLOBAL__N_133PositiveAndNegativeUnitTestFilter11MatchesNameERKNSt3__112basic_stringIcNS3_11char_traitsIcEENS3_9allocatorIcEEEE:
  839|      4|  bool MatchesName(const std::string& name) const {
  840|      4|    return positive_filter_.MatchesName(name) &&
  ------------------
  |  Branch (840:12): [True: 2, False: 2]
  ------------------
  841|      2|           !negative_filter_.MatchesName(name);
  ------------------
  |  Branch (841:12): [True: 2, False: 0]
  ------------------
  842|      4|  }
gtest-all.cc:_ZN7testing8internalL18ShouldRunTestSuiteEPKNS_9TestSuiteE:
  445|      4|static bool ShouldRunTestSuite(const TestSuite* test_suite) {
  446|      4|  return test_suite->should_run();
  447|      4|}
gtest-all.cc:_ZN7testing8internalL20SumOverTestSuiteListERKNSt3__16vectorIPNS_9TestSuiteENS1_9allocatorIS4_EEEEMS3_KFivE:
  425|      2|                                int (TestSuite::*method)() const) {
  426|      2|  int sum = 0;
  427|      6|  for (size_t i = 0; i < case_list.size(); i++) {
  ------------------
  |  Branch (427:22): [True: 4, False: 2]
  ------------------
  428|      4|    sum += (case_list[i]->*method)();
  429|      4|  }
  430|      2|  return sum;
  431|      2|}
_ZN7testing8internal5TimerC2Ev:
 1151|      6|  Timer() : start_(clock::now()) {}
gtest-all.cc:_ZN7testing8internalL13ColoredPrintfENS0_12_GLOBAL__N_110GTestColorEPKcz:
 3279|     10|static void ColoredPrintf(GTestColor color, const char* fmt, ...) {
 3280|     10|  va_list args;
 3281|     10|  va_start(args, fmt);
 3282|       |
 3283|     10|  static const bool in_color_mode =
 3284|     10|#if GTEST_HAS_FILE_SYSTEM
 3285|     10|      ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
 3286|       |#else
 3287|       |      false;
 3288|       |#endif  // GTEST_HAS_FILE_SYSTEM
 3289|       |
 3290|     10|  const bool use_color = in_color_mode && (color != GTestColor::kDefault);
  ------------------
  |  Branch (3290:26): [True: 0, False: 10]
  |  Branch (3290:43): [True: 0, False: 0]
  ------------------
 3291|       |
 3292|     10|  if (!use_color) {
  ------------------
  |  Branch (3292:7): [True: 10, False: 0]
  ------------------
 3293|     10|    vprintf(fmt, args);
 3294|     10|    va_end(args);
 3295|     10|    return;
 3296|     10|  }
 3297|       |
 3298|       |#if defined(GTEST_OS_WINDOWS) && !defined(GTEST_OS_WINDOWS_MOBILE) &&    \
 3299|       |    !defined(GTEST_OS_WINDOWS_PHONE) && !defined(GTEST_OS_WINDOWS_RT) && \
 3300|       |    !defined(GTEST_OS_WINDOWS_MINGW)
 3301|       |  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
 3302|       |
 3303|       |  // Gets the current text color.
 3304|       |  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
 3305|       |  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
 3306|       |  const WORD old_color_attrs = buffer_info.wAttributes;
 3307|       |  const WORD new_color = GetNewColor(color, old_color_attrs);
 3308|       |
 3309|       |  // We need to flush the stream buffers into the console before each
 3310|       |  // SetConsoleTextAttribute call lest it affect the text that is already
 3311|       |  // printed but has not yet reached the console.
 3312|       |  fflush(stdout);
 3313|       |  SetConsoleTextAttribute(stdout_handle, new_color);
 3314|       |
 3315|       |  vprintf(fmt, args);
 3316|       |
 3317|       |  fflush(stdout);
 3318|       |  // Restores the text color.
 3319|       |  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
 3320|       |#else
 3321|      0|  printf("\033[0;3%sm", GetAnsiColorCode(color));
 3322|      0|  vprintf(fmt, args);
 3323|      0|  printf("\033[m");  // Resets the terminal to default.
 3324|      0|#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
 3325|       |  va_end(args);
 3326|      0|}
gtest-all.cc:_ZN7testingL15FormatTestCountEi:
 3101|      2|static std::string FormatTestCount(int test_count) {
 3102|      2|  return FormatCountableNoun(test_count, "test", "tests");
 3103|      2|}
gtest-all.cc:_ZN7testingL20FormatTestSuiteCountEi:
 3106|      2|static std::string FormatTestSuiteCount(int test_suite_count) {
 3107|      2|  return FormatCountableNoun(test_suite_count, "test suite", "test suites");
 3108|      2|}
gtest-all.cc:_ZN7testingL19FormatCountableNounEiPKcS1_:
 3095|      6|                                       const char* plural_form) {
 3096|      6|  return internal::StreamableToString(count) + " " +
 3097|      6|         (count == 1 ? singular_form : plural_form);
  ------------------
  |  Branch (3097:11): [True: 6, False: 0]
  ------------------
 3098|      6|}
_ZN7testing8internal27PrettyUnitTestResultPrinter13PrintTestNameEPKcS3_:
 3355|      2|  static void PrintTestName(const char* test_suite, const char* test) {
 3356|      2|    printf("%s.%s", test_suite, test);
 3357|      2|  }
_ZN7testing8internal17TestEventRepeaterC2Ev:
 3767|      2|  TestEventRepeater() : forwarding_enabled_(true) {}
_ZN7testing8internal23ScopedPrematureExitFileC2EPKc:
 5039|      2|      : premature_exit_filepath_(
 5040|      2|            premature_exit_filepath ? premature_exit_filepath : "") {
  ------------------
  |  Branch (5040:13): [True: 0, False: 2]
  ------------------
 5041|       |    // If a path to the premature-exit file is specified...
 5042|      2|    if (!premature_exit_filepath_.empty()) {
  ------------------
  |  Branch (5042:9): [True: 0, False: 2]
  ------------------
 5043|       |      // create the file with a single "0" character in it.  I/O
 5044|       |      // errors are ignored as there's nothing better we can do and we
 5045|       |      // don't want to fail the test because of this.
 5046|      0|      FILE* pfile = posix::FOpen(premature_exit_filepath_.c_str(), "w");
 5047|      0|      fwrite("0", 1, 1, pfile);
 5048|      0|      fclose(pfile);
 5049|      0|    }
 5050|      2|  }
_ZN7testing8internal27PrettyUnitTestResultPrinterC2Ev:
 3354|      2|  PrettyUnitTestResultPrinter() = default;
_ZN7testing8internal15TestSuiteNameIsC2ERKNSt3__112basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEE:
 5725|      4|  explicit TestSuiteNameIs(const std::string& name) : name_(name) {}
gtest-all.cc:_ZN7testing8internalL18GTestIsInitializedEv:
  419|      4|static bool GTestIsInitialized() { return !GetArgvs().empty(); }
gtest-all.cc:_ZN7testing8internalL14ParseFlagValueEPKcS2_b:
 6339|    330|                                  bool def_optional) {
 6340|       |  // str and flag must not be NULL.
 6341|    330|  if (str == nullptr || flag_name == nullptr) return nullptr;
  ------------------
  |  Branch (6341:7): [True: 0, False: 330]
  |  Branch (6341:25): [True: 0, False: 330]
  ------------------
 6342|       |
 6343|       |  // The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
 6344|    330|  const std::string flag_str =
 6345|    330|      std::string("--") + GTEST_FLAG_PREFIX_ + flag_name;
  ------------------
  |  |  331|    330|#define GTEST_FLAG_PREFIX_ "gtest_"
  ------------------
 6346|    330|  const size_t flag_len = flag_str.length();
 6347|    330|  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
  ------------------
  |  Branch (6347:7): [True: 330, False: 0]
  ------------------
 6348|       |
 6349|       |  // Skips the flag name.
 6350|      0|  const char* flag_end = str + flag_len;
 6351|       |
 6352|       |  // When def_optional is true, it's OK to not have a "=value" part.
 6353|      0|  if (def_optional && (flag_end[0] == '\0')) {
  ------------------
  |  Branch (6353:7): [True: 0, False: 0]
  |  Branch (6353:23): [True: 0, False: 0]
  ------------------
 6354|      0|    return flag_end;
 6355|      0|  }
 6356|       |
 6357|       |  // If def_optional is true and there are more characters after the
 6358|       |  // flag name, or if def_optional is false, there must be a '=' after
 6359|       |  // the flag name.
 6360|      0|  if (flag_end[0] != '=') return nullptr;
  ------------------
  |  Branch (6360:7): [True: 0, False: 0]
  ------------------
 6361|       |
 6362|       |  // Returns the string after "=".
 6363|      0|  return flag_end + 1;
 6364|      0|}
_ZN7testing8internal27PrettyUnitTestResultPrinter18OnTestProgramStartERKNS_8UnitTestE:
 3360|      2|  void OnTestProgramStart(const UnitTest& /*unit_test*/) override {}
_ZN7testing8internal27PrettyUnitTestResultPrinter22OnEnvironmentsSetUpEndERKNS_8UnitTestE:
 3363|      2|  void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {}
_ZN7testing8internal35HandleExceptionsInMethodIfSupportedINS_4TestEvEET0_PT_MS4_FS3_vEPKc:
 2621|      4|                                           const char* location) {
 2622|       |  // NOTE: The user code can affect the way in which Google Test handles
 2623|       |  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
 2624|       |  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
 2625|       |  // after the exception is caught and either report or re-throw the
 2626|       |  // exception based on the flag's value:
 2627|       |  //
 2628|       |  // try {
 2629|       |  //   // Perform the test method.
 2630|       |  // } catch (...) {
 2631|       |  //   if (GTEST_FLAG_GET(catch_exceptions))
 2632|       |  //     // Report the exception as failure.
 2633|       |  //   else
 2634|       |  //     throw;  // Re-throws the original exception.
 2635|       |  // }
 2636|       |  //
 2637|       |  // However, the purpose of this flag is to allow the program to drop into
 2638|       |  // the debugger when the exception is thrown. On most platforms, once the
 2639|       |  // control enters the catch block, the exception origin information is
 2640|       |  // lost and the debugger will stop the program at the point of the
 2641|       |  // re-throw in this function -- instead of at the point of the original
 2642|       |  // throw statement in the code under test.  For this reason, we perform
 2643|       |  // the check early, sacrificing the ability to affect Google Test's
 2644|       |  // exception handling in the method where the exception is thrown.
 2645|      4|  if (internal::GetUnitTestImpl()->catch_exceptions()) {
  ------------------
  |  Branch (2645:7): [True: 4, False: 0]
  ------------------
 2646|      4|#if GTEST_HAS_EXCEPTIONS
 2647|      4|    try {
 2648|      4|      return HandleSehExceptionsInMethodIfSupported(object, method, location);
 2649|      4|    } catch (const AssertionException&) {  // NOLINT
 2650|       |      // This failure was reported already.
 2651|      0|    } catch (const internal::GoogleTestFailureException&) {  // NOLINT
 2652|       |      // This exception type can only be thrown by a failed Google
 2653|       |      // Test assertion with the intention of letting another testing
 2654|       |      // framework catch it.  Therefore we just re-throw it.
 2655|      0|      throw;
 2656|      0|    } catch (const std::exception& e) {  // NOLINT
 2657|      0|      internal::ReportFailureInUnknownLocation(
 2658|      0|          TestPartResult::kFatalFailure,
 2659|      0|          FormatCxxExceptionMessage(e.what(), location));
 2660|      0|    } catch (...) {  // NOLINT
 2661|      0|      internal::ReportFailureInUnknownLocation(
 2662|      0|          TestPartResult::kFatalFailure,
 2663|      0|          FormatCxxExceptionMessage(nullptr, location));
 2664|      0|    }
 2665|      0|    return static_cast<Result>(0);
 2666|       |#else
 2667|       |    return HandleSehExceptionsInMethodIfSupported(object, method, location);
 2668|       |#endif  // GTEST_HAS_EXCEPTIONS
 2669|      4|  } else {
 2670|      0|    return (object->*method)();
 2671|      0|  }
 2672|      4|}
_ZN7testing8internal38HandleSehExceptionsInMethodIfSupportedINS_4TestEvEET0_PT_MS4_FS3_vEPKc:
 2602|      4|                                              const char* location) {
 2603|       |#if GTEST_HAS_SEH
 2604|       |  __try {
 2605|       |    return (object->*method)();
 2606|       |  } __except (internal::UnitTestOptions::GTestProcessSEH(  // NOLINT
 2607|       |      GetExceptionCode(), location)) {
 2608|       |    return static_cast<Result>(0);
 2609|       |  }
 2610|       |#else
 2611|      4|  (void)location;
 2612|      4|  return (object->*method)();
 2613|      4|#endif  // GTEST_HAS_SEH
 2614|      4|}
_ZN7testing8internal35HandleExceptionsInMethodIfSupportedINS0_15TestFactoryBaseEPNS_4TestEEET0_PT_MS6_FS5_vEPKc:
 2621|      2|                                           const char* location) {
 2622|       |  // NOTE: The user code can affect the way in which Google Test handles
 2623|       |  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
 2624|       |  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
 2625|       |  // after the exception is caught and either report or re-throw the
 2626|       |  // exception based on the flag's value:
 2627|       |  //
 2628|       |  // try {
 2629|       |  //   // Perform the test method.
 2630|       |  // } catch (...) {
 2631|       |  //   if (GTEST_FLAG_GET(catch_exceptions))
 2632|       |  //     // Report the exception as failure.
 2633|       |  //   else
 2634|       |  //     throw;  // Re-throws the original exception.
 2635|       |  // }
 2636|       |  //
 2637|       |  // However, the purpose of this flag is to allow the program to drop into
 2638|       |  // the debugger when the exception is thrown. On most platforms, once the
 2639|       |  // control enters the catch block, the exception origin information is
 2640|       |  // lost and the debugger will stop the program at the point of the
 2641|       |  // re-throw in this function -- instead of at the point of the original
 2642|       |  // throw statement in the code under test.  For this reason, we perform
 2643|       |  // the check early, sacrificing the ability to affect Google Test's
 2644|       |  // exception handling in the method where the exception is thrown.
 2645|      2|  if (internal::GetUnitTestImpl()->catch_exceptions()) {
  ------------------
  |  Branch (2645:7): [True: 2, False: 0]
  ------------------
 2646|      2|#if GTEST_HAS_EXCEPTIONS
 2647|      2|    try {
 2648|      2|      return HandleSehExceptionsInMethodIfSupported(object, method, location);
 2649|      2|    } catch (const AssertionException&) {  // NOLINT
 2650|       |      // This failure was reported already.
 2651|      0|    } catch (const internal::GoogleTestFailureException&) {  // NOLINT
 2652|       |      // This exception type can only be thrown by a failed Google
 2653|       |      // Test assertion with the intention of letting another testing
 2654|       |      // framework catch it.  Therefore we just re-throw it.
 2655|      0|      throw;
 2656|      0|    } catch (const std::exception& e) {  // NOLINT
 2657|      0|      internal::ReportFailureInUnknownLocation(
 2658|      0|          TestPartResult::kFatalFailure,
 2659|      0|          FormatCxxExceptionMessage(e.what(), location));
 2660|      0|    } catch (...) {  // NOLINT
 2661|      0|      internal::ReportFailureInUnknownLocation(
 2662|      0|          TestPartResult::kFatalFailure,
 2663|      0|          FormatCxxExceptionMessage(nullptr, location));
 2664|      0|    }
 2665|      0|    return static_cast<Result>(0);
 2666|       |#else
 2667|       |    return HandleSehExceptionsInMethodIfSupported(object, method, location);
 2668|       |#endif  // GTEST_HAS_EXCEPTIONS
 2669|      2|  } else {
 2670|      0|    return (object->*method)();
 2671|      0|  }
 2672|      2|}
_ZN7testing8internal38HandleSehExceptionsInMethodIfSupportedINS0_15TestFactoryBaseEPNS_4TestEEET0_PT_MS6_FS5_vEPKc:
 2602|      2|                                              const char* location) {
 2603|       |#if GTEST_HAS_SEH
 2604|       |  __try {
 2605|       |    return (object->*method)();
 2606|       |  } __except (internal::UnitTestOptions::GTestProcessSEH(  // NOLINT
 2607|       |      GetExceptionCode(), location)) {
 2608|       |    return static_cast<Result>(0);
 2609|       |  }
 2610|       |#else
 2611|      2|  (void)location;
 2612|      2|  return (object->*method)();
 2613|      2|#endif  // GTEST_HAS_SEH
 2614|      2|}
_ZN7testing8internal35HandleExceptionsInMethodIfSupportedINS_9TestSuiteEvEET0_PT_MS4_FS3_vEPKc:
 2621|      2|                                           const char* location) {
 2622|       |  // NOTE: The user code can affect the way in which Google Test handles
 2623|       |  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
 2624|       |  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
 2625|       |  // after the exception is caught and either report or re-throw the
 2626|       |  // exception based on the flag's value:
 2627|       |  //
 2628|       |  // try {
 2629|       |  //   // Perform the test method.
 2630|       |  // } catch (...) {
 2631|       |  //   if (GTEST_FLAG_GET(catch_exceptions))
 2632|       |  //     // Report the exception as failure.
 2633|       |  //   else
 2634|       |  //     throw;  // Re-throws the original exception.
 2635|       |  // }
 2636|       |  //
 2637|       |  // However, the purpose of this flag is to allow the program to drop into
 2638|       |  // the debugger when the exception is thrown. On most platforms, once the
 2639|       |  // control enters the catch block, the exception origin information is
 2640|       |  // lost and the debugger will stop the program at the point of the
 2641|       |  // re-throw in this function -- instead of at the point of the original
 2642|       |  // throw statement in the code under test.  For this reason, we perform
 2643|       |  // the check early, sacrificing the ability to affect Google Test's
 2644|       |  // exception handling in the method where the exception is thrown.
 2645|      2|  if (internal::GetUnitTestImpl()->catch_exceptions()) {
  ------------------
  |  Branch (2645:7): [True: 2, False: 0]
  ------------------
 2646|      2|#if GTEST_HAS_EXCEPTIONS
 2647|      2|    try {
 2648|      2|      return HandleSehExceptionsInMethodIfSupported(object, method, location);
 2649|      2|    } catch (const AssertionException&) {  // NOLINT
 2650|       |      // This failure was reported already.
 2651|      0|    } catch (const internal::GoogleTestFailureException&) {  // NOLINT
 2652|       |      // This exception type can only be thrown by a failed Google
 2653|       |      // Test assertion with the intention of letting another testing
 2654|       |      // framework catch it.  Therefore we just re-throw it.
 2655|      0|      throw;
 2656|      0|    } catch (const std::exception& e) {  // NOLINT
 2657|      0|      internal::ReportFailureInUnknownLocation(
 2658|      0|          TestPartResult::kFatalFailure,
 2659|      0|          FormatCxxExceptionMessage(e.what(), location));
 2660|      0|    } catch (...) {  // NOLINT
 2661|      0|      internal::ReportFailureInUnknownLocation(
 2662|      0|          TestPartResult::kFatalFailure,
 2663|      0|          FormatCxxExceptionMessage(nullptr, location));
 2664|      0|    }
 2665|      0|    return static_cast<Result>(0);
 2666|       |#else
 2667|       |    return HandleSehExceptionsInMethodIfSupported(object, method, location);
 2668|       |#endif  // GTEST_HAS_EXCEPTIONS
 2669|      2|  } else {
 2670|      0|    return (object->*method)();
 2671|      0|  }
 2672|      2|}
_ZN7testing8internal38HandleSehExceptionsInMethodIfSupportedINS_9TestSuiteEvEET0_PT_MS4_FS3_vEPKc:
 2602|      2|                                              const char* location) {
 2603|       |#if GTEST_HAS_SEH
 2604|       |  __try {
 2605|       |    return (object->*method)();
 2606|       |  } __except (internal::UnitTestOptions::GTestProcessSEH(  // NOLINT
 2607|       |      GetExceptionCode(), location)) {
 2608|       |    return static_cast<Result>(0);
 2609|       |  }
 2610|       |#else
 2611|      2|  (void)location;
 2612|      2|  return (object->*method)();
 2613|      2|#endif  // GTEST_HAS_SEH
 2614|      2|}
_ZN7testing8internal35HandleExceptionsInMethodIfSupportedINS0_12UnitTestImplEbEET0_PT_MS4_FS3_vEPKc:
 2621|      2|                                           const char* location) {
 2622|       |  // NOTE: The user code can affect the way in which Google Test handles
 2623|       |  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
 2624|       |  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
 2625|       |  // after the exception is caught and either report or re-throw the
 2626|       |  // exception based on the flag's value:
 2627|       |  //
 2628|       |  // try {
 2629|       |  //   // Perform the test method.
 2630|       |  // } catch (...) {
 2631|       |  //   if (GTEST_FLAG_GET(catch_exceptions))
 2632|       |  //     // Report the exception as failure.
 2633|       |  //   else
 2634|       |  //     throw;  // Re-throws the original exception.
 2635|       |  // }
 2636|       |  //
 2637|       |  // However, the purpose of this flag is to allow the program to drop into
 2638|       |  // the debugger when the exception is thrown. On most platforms, once the
 2639|       |  // control enters the catch block, the exception origin information is
 2640|       |  // lost and the debugger will stop the program at the point of the
 2641|       |  // re-throw in this function -- instead of at the point of the original
 2642|       |  // throw statement in the code under test.  For this reason, we perform
 2643|       |  // the check early, sacrificing the ability to affect Google Test's
 2644|       |  // exception handling in the method where the exception is thrown.
 2645|      2|  if (internal::GetUnitTestImpl()->catch_exceptions()) {
  ------------------
  |  Branch (2645:7): [True: 2, False: 0]
  ------------------
 2646|      2|#if GTEST_HAS_EXCEPTIONS
 2647|      2|    try {
 2648|      2|      return HandleSehExceptionsInMethodIfSupported(object, method, location);
 2649|      2|    } catch (const AssertionException&) {  // NOLINT
 2650|       |      // This failure was reported already.
 2651|      0|    } catch (const internal::GoogleTestFailureException&) {  // NOLINT
 2652|       |      // This exception type can only be thrown by a failed Google
 2653|       |      // Test assertion with the intention of letting another testing
 2654|       |      // framework catch it.  Therefore we just re-throw it.
 2655|      0|      throw;
 2656|      0|    } catch (const std::exception& e) {  // NOLINT
 2657|      0|      internal::ReportFailureInUnknownLocation(
 2658|      0|          TestPartResult::kFatalFailure,
 2659|      0|          FormatCxxExceptionMessage(e.what(), location));
 2660|      0|    } catch (...) {  // NOLINT
 2661|      0|      internal::ReportFailureInUnknownLocation(
 2662|      0|          TestPartResult::kFatalFailure,
 2663|      0|          FormatCxxExceptionMessage(nullptr, location));
 2664|      0|    }
 2665|      0|    return static_cast<Result>(0);
 2666|       |#else
 2667|       |    return HandleSehExceptionsInMethodIfSupported(object, method, location);
 2668|       |#endif  // GTEST_HAS_EXCEPTIONS
 2669|      2|  } else {
 2670|      0|    return (object->*method)();
 2671|      0|  }
 2672|      2|}
_ZN7testing8internal38HandleSehExceptionsInMethodIfSupportedINS0_12UnitTestImplEbEET0_PT_MS4_FS3_vEPKc:
 2602|      2|                                              const char* location) {
 2603|       |#if GTEST_HAS_SEH
 2604|       |  __try {
 2605|       |    return (object->*method)();
 2606|       |  } __except (internal::UnitTestOptions::GTestProcessSEH(  // NOLINT
 2607|       |      GetExceptionCode(), location)) {
 2608|       |    return static_cast<Result>(0);
 2609|       |  }
 2610|       |#else
 2611|      2|  (void)location;
 2612|      2|  return (object->*method)();
 2613|      2|#endif  // GTEST_HAS_SEH
 2614|      2|}
_ZNK7testing8internal15TestSuiteNameIsclEPKNS_9TestSuiteE:
 5728|      2|  bool operator()(const TestSuite* test_suite) const {
 5729|      2|    return test_suite != nullptr &&
  ------------------
  |  Branch (5729:12): [True: 2, False: 0]
  ------------------
 5730|      2|           strcmp(test_suite->name(), name_.c_str()) == 0;
  ------------------
  |  Branch (5730:12): [True: 0, False: 2]
  ------------------
 5731|      2|  }
_ZN7testing8internal28ParseGoogleTestFlagsOnlyImplIcEEvPiPPT_:
 6630|      2|void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
 6631|      2|  std::string flagfile_value;
 6632|     17|  for (int i = 1; i < *argc; i++) {
  ------------------
  |  Branch (6632:19): [True: 15, False: 2]
  ------------------
 6633|     15|    const std::string arg_string = StreamableToString(argv[i]);
 6634|     15|    const char* const arg = arg_string.c_str();
 6635|       |
 6636|     15|    using internal::ParseFlag;
 6637|       |
 6638|     15|    bool remove_flag = false;
 6639|     15|    if (ParseGoogleTestFlag(arg)) {
  ------------------
  |  Branch (6639:9): [True: 0, False: 15]
  ------------------
 6640|      0|      remove_flag = true;
 6641|      0|#if GTEST_USE_OWN_FLAGFILE_FLAG_ && GTEST_HAS_FILE_SYSTEM
 6642|     15|    } else if (ParseFlag(arg, "flagfile", &flagfile_value)) {
  ------------------
  |  Branch (6642:16): [True: 0, False: 15]
  ------------------
 6643|      0|      GTEST_FLAG_SET(flagfile, flagfile_value);
  ------------------
  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  ------------------
  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  ------------------
  ------------------
 6644|      0|      LoadFlagsFromFile(flagfile_value);
 6645|      0|      remove_flag = true;
 6646|      0|#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_ && GTEST_HAS_FILE_SYSTEM
 6647|     15|    } else if (arg_string == "--help" || HasGoogleTestFlagPrefix(arg)) {
  ------------------
  |  Branch (6647:16): [True: 0, False: 15]
  |  Branch (6647:42): [True: 0, False: 15]
  ------------------
 6648|       |      // Both help flag and unrecognized Google Test flags (excluding
 6649|       |      // internal ones) trigger help display.
 6650|      0|      g_help_flag = true;
 6651|      0|    }
 6652|       |
 6653|     15|    if (remove_flag) {
  ------------------
  |  Branch (6653:9): [True: 0, False: 15]
  ------------------
 6654|       |      // Shift the remainder of the argv list left by one.  Note
 6655|       |      // that argv has (*argc + 1) elements, the last one always being
 6656|       |      // NULL.  The following loop moves the trailing NULL element as
 6657|       |      // well.
 6658|      0|      for (int j = i; j != *argc; j++) {
  ------------------
  |  Branch (6658:23): [True: 0, False: 0]
  ------------------
 6659|      0|        argv[j] = argv[j + 1];
 6660|      0|      }
 6661|       |
 6662|       |      // Decrements the argument count.
 6663|      0|      (*argc)--;
 6664|       |
 6665|       |      // We also need to decrement the iterator as we just removed
 6666|       |      // an element.
 6667|      0|      i--;
 6668|      0|    }
 6669|     15|  }
 6670|       |
 6671|      2|  if (g_help_flag) {
  ------------------
  |  Branch (6671:7): [True: 0, False: 2]
  ------------------
 6672|       |    // We print the help here instead of in RUN_ALL_TESTS(), as the
 6673|       |    // latter may not be called at all if the user is using Google
 6674|       |    // Test with another testing framework.
 6675|      0|    PrintColorEncoded(kColorEncodedHelpMessage);
 6676|      0|  }
 6677|      2|}
gtest-all.cc:_ZN7testing8internalL19ParseGoogleTestFlagEPKc:
 6574|     15|static bool ParseGoogleTestFlag(const char* const arg) {
 6575|     15|#define GTEST_INTERNAL_PARSE_FLAG(flag_name)  \
 6576|     15|  do {                                        \
 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
 6579|     15|      GTEST_FLAG_SET(flag_name, value);       \
 6580|     15|      return true;                            \
 6581|     15|    }                                         \
 6582|     15|  } while (false)
 6583|       |
 6584|     15|  GTEST_INTERNAL_PARSE_FLAG(also_run_disabled_tests);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6585|     15|  GTEST_INTERNAL_PARSE_FLAG(break_on_failure);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6586|     15|  GTEST_INTERNAL_PARSE_FLAG(catch_exceptions);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6587|     15|  GTEST_INTERNAL_PARSE_FLAG(color);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6588|     15|  GTEST_INTERNAL_PARSE_FLAG(death_test_style);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6589|     15|  GTEST_INTERNAL_PARSE_FLAG(death_test_use_fork);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6590|     15|  GTEST_INTERNAL_PARSE_FLAG(fail_fast);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6591|     15|  GTEST_INTERNAL_PARSE_FLAG(filter);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6592|     15|  GTEST_INTERNAL_PARSE_FLAG(internal_run_death_test);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6593|     15|  GTEST_INTERNAL_PARSE_FLAG(list_tests);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6594|     15|  GTEST_INTERNAL_PARSE_FLAG(output);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6595|     15|  GTEST_INTERNAL_PARSE_FLAG(brief);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6596|     15|  GTEST_INTERNAL_PARSE_FLAG(print_time);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6597|     15|  GTEST_INTERNAL_PARSE_FLAG(print_utf8);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6598|     15|  GTEST_INTERNAL_PARSE_FLAG(random_seed);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6599|     15|  GTEST_INTERNAL_PARSE_FLAG(repeat);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6600|     15|  GTEST_INTERNAL_PARSE_FLAG(recreate_environments_when_repeating);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6601|     15|  GTEST_INTERNAL_PARSE_FLAG(shuffle);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6602|     15|  GTEST_INTERNAL_PARSE_FLAG(stack_trace_depth);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6603|     15|  GTEST_INTERNAL_PARSE_FLAG(stream_result_to);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6604|     15|  GTEST_INTERNAL_PARSE_FLAG(throw_on_failure);
  ------------------
  |  | 6576|     15|  do {                                        \
  |  | 6577|     15|    auto value = GTEST_FLAG_GET(flag_name);   \
  |  |  ------------------
  |  |  |  | 2293|     15|#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|     15|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6578|     15|    if (ParseFlag(arg, #flag_name, &value)) { \
  |  |  ------------------
  |  |  |  Branch (6578:9): [True: 0, False: 15]
  |  |  ------------------
  |  | 6579|      0|      GTEST_FLAG_SET(flag_name, value);       \
  |  |  ------------------
  |  |  |  | 2294|      0|#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
  |  |  |  |  ------------------
  |  |  |  |  |  | 2226|      0|#define GTEST_FLAG(name) FLAGS_gtest_##name
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 6580|      0|      return true;                            \
  |  | 6581|      0|    }                                         \
  |  | 6582|     15|  } while (false)
  |  |  ------------------
  |  |  |  Branch (6582:12): [Folded, False: 15]
  |  |  ------------------
  ------------------
 6605|     15|  return false;
 6606|     15|}
gtest-all.cc:_ZN7testing8internalL9ParseFlagEPKcS2_Pb:
 6376|    180|static bool ParseFlag(const char* str, const char* flag_name, bool* value) {
 6377|       |  // Gets the value of the flag as a string.
 6378|    180|  const char* const value_str = ParseFlagValue(str, flag_name, true);
 6379|       |
 6380|       |  // Aborts if the parsing failed.
 6381|    180|  if (value_str == nullptr) return false;
  ------------------
  |  Branch (6381:7): [True: 180, False: 0]
  ------------------
 6382|       |
 6383|       |  // Converts the string value to a bool.
 6384|      0|  *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
  ------------------
  |  Branch (6384:14): [True: 0, False: 0]
  |  Branch (6384:35): [True: 0, False: 0]
  |  Branch (6384:56): [True: 0, False: 0]
  ------------------
 6385|      0|  return true;
 6386|    180|}
gtest-all.cc:_ZN7testing8internalL9ParseFlagINSt3__112basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEEEbPKcSA_PT_:
 6409|    105|static bool ParseFlag(const char* str, const char* flag_name, String* value) {
 6410|       |  // Gets the value of the flag as a string.
 6411|    105|  const char* const value_str = ParseFlagValue(str, flag_name, false);
 6412|       |
 6413|       |  // Aborts if the parsing failed.
 6414|    105|  if (value_str == nullptr) return false;
  ------------------
  |  Branch (6414:7): [True: 105, False: 0]
  ------------------
 6415|       |
 6416|       |  // Sets *value to the value of the flag.
 6417|      0|  *value = value_str;
 6418|      0|  return true;
 6419|    105|}
gtest-all.cc:_ZN7testing8internalL23HasGoogleTestFlagPrefixEPKc:
 6427|     15|static bool HasGoogleTestFlagPrefix(const char* str) {
 6428|     15|  return (SkipPrefix("--", &str) || SkipPrefix("-", &str) ||
  ------------------
  |  Branch (6428:11): [True: 6, False: 9]
  |  Branch (6428:37): [True: 5, False: 4]
  ------------------
 6429|      4|          SkipPrefix("/", &str)) &&
  ------------------
  |  Branch (6429:11): [True: 4, False: 0]
  ------------------
 6430|     15|         !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
  ------------------
  |  |  331|     15|#define GTEST_FLAG_PREFIX_ "gtest_"
  ------------------
  |  Branch (6430:10): [True: 15, False: 0]
  ------------------
 6431|     15|         (SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
  ------------------
  |  |  331|     15|#define GTEST_FLAG_PREFIX_ "gtest_"
  ------------------
  |  Branch (6431:11): [True: 0, False: 15]
  ------------------
 6432|     15|          SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str));
  ------------------
  |  |  332|     15|#define GTEST_FLAG_PREFIX_DASH_ "gtest-"
  ------------------
  |  Branch (6432:11): [True: 0, False: 15]
  ------------------
 6433|     15|}
_ZN7testing8internal18InitGoogleTestImplIcEEvPiPPT_:
 6758|      2|void InitGoogleTestImpl(int* argc, CharType** argv) {
 6759|       |  // We don't want to run the initialization code twice.
 6760|      2|  if (GTestIsInitialized()) return;
  ------------------
  |  Branch (6760:7): [True: 0, False: 2]
  ------------------
 6761|       |
 6762|      2|  if (*argc <= 0) return;
  ------------------
  |  Branch (6762:7): [True: 0, False: 2]
  ------------------
 6763|       |
 6764|      2|  g_argvs.clear();
 6765|     19|  for (int i = 0; i != *argc; i++) {
  ------------------
  |  Branch (6765:19): [True: 17, False: 2]
  ------------------
 6766|     17|    g_argvs.push_back(StreamableToString(argv[i]));
 6767|     17|  }
 6768|       |
 6769|       |#ifdef GTEST_HAS_ABSL
 6770|       |  absl::InitializeSymbolizer(g_argvs[0].c_str());
 6771|       |
 6772|       |  // When using the Abseil Flags library, set the program usage message to the
 6773|       |  // help message, but remove the color-encoding from the message first.
 6774|       |  absl::SetProgramUsageMessage(absl::StrReplaceAll(
 6775|       |      kColorEncodedHelpMessage,
 6776|       |      {{"@D", ""}, {"@R", ""}, {"@G", ""}, {"@Y", ""}, {"@@", "@"}}));
 6777|       |#endif  // GTEST_HAS_ABSL
 6778|       |
 6779|      2|  ParseGoogleTestFlagsOnly(argc, argv);
 6780|      2|  GetUnitTestImpl()->PostFlagParsingInit();
 6781|      2|}

aom_codec.c:at_ctrl_map_end:
  191|   376k|static inline int at_ctrl_map_end(aom_codec_ctrl_fn_map_t *e) {
  192|   376k|  return e->ctrl_id == 0 && e->fn == NULL;
  ------------------
  |  Branch (192:10): [True: 0, False: 376k]
  |  Branch (192:29): [True: 0, False: 0]
  ------------------
  193|   376k|}

aom_codec_err_to_string:
   36|  16.2k|const char *aom_codec_err_to_string(aom_codec_err_t err) {
   37|  16.2k|  switch (err) {
  ------------------
  |  Branch (37:11): [True: 16.2k, False: 0]
  ------------------
   38|      0|    case AOM_CODEC_OK: return "Success";
  ------------------
  |  Branch (38:5): [True: 0, False: 16.2k]
  ------------------
   39|     19|    case AOM_CODEC_ERROR: return "Unspecified internal error";
  ------------------
  |  Branch (39:5): [True: 19, False: 16.2k]
  ------------------
   40|      0|    case AOM_CODEC_MEM_ERROR: return "Memory allocation error";
  ------------------
  |  Branch (40:5): [True: 0, False: 16.2k]
  ------------------
   41|      0|    case AOM_CODEC_ABI_MISMATCH: return "ABI version mismatch";
  ------------------
  |  Branch (41:5): [True: 0, False: 16.2k]
  ------------------
   42|      0|    case AOM_CODEC_INCAPABLE:
  ------------------
  |  Branch (42:5): [True: 0, False: 16.2k]
  ------------------
   43|      0|      return "Codec does not implement requested capability";
   44|    704|    case AOM_CODEC_UNSUP_BITSTREAM:
  ------------------
  |  Branch (44:5): [True: 704, False: 15.5k]
  ------------------
   45|    704|      return "Bitstream not supported by this decoder";
   46|      0|    case AOM_CODEC_UNSUP_FEATURE:
  ------------------
  |  Branch (46:5): [True: 0, False: 16.2k]
  ------------------
   47|      0|      return "Bitstream required feature not supported by this decoder";
   48|  15.5k|    case AOM_CODEC_CORRUPT_FRAME: return "Corrupt frame detected";
  ------------------
  |  Branch (48:5): [True: 15.5k, False: 723]
  ------------------
   49|      0|    case AOM_CODEC_INVALID_PARAM: return "Invalid parameter";
  ------------------
  |  Branch (49:5): [True: 0, False: 16.2k]
  ------------------
   50|      0|    case AOM_CODEC_LIST_END: return "End of iterated list";
  ------------------
  |  Branch (50:5): [True: 0, False: 16.2k]
  ------------------
   51|  16.2k|  }
   52|       |
   53|      0|  return "Unrecognized error code";
   54|  16.2k|}
aom_codec_error:
   56|  15.2k|const char *aom_codec_error(const aom_codec_ctx_t *ctx) {
   57|  15.2k|  return (ctx) ? aom_codec_err_to_string(ctx->err)
  ------------------
  |  Branch (57:10): [True: 15.2k, False: 0]
  ------------------
   58|  15.2k|               : aom_codec_err_to_string(AOM_CODEC_INVALID_PARAM);
   59|  15.2k|}
aom_codec_error_detail:
   61|  15.2k|const char *aom_codec_error_detail(const aom_codec_ctx_t *ctx) {
   62|  15.2k|  if (ctx && ctx->err)
  ------------------
  |  Branch (62:7): [True: 15.2k, False: 0]
  |  Branch (62:14): [True: 15.2k, False: 0]
  ------------------
   63|  15.2k|    return ctx->priv ? ctx->priv->err_detail : ctx->err_detail;
  ------------------
  |  Branch (63:12): [True: 15.2k, False: 0]
  ------------------
   64|       |
   65|      0|  return NULL;
   66|  15.2k|}
aom_codec_destroy:
   68|  17.9k|aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx) {
   69|  17.9k|  if (!ctx) {
  ------------------
  |  Branch (69:7): [True: 0, False: 17.9k]
  ------------------
   70|      0|    return AOM_CODEC_INVALID_PARAM;
   71|      0|  }
   72|  17.9k|  if (!ctx->iface || !ctx->priv) {
  ------------------
  |  Branch (72:7): [True: 0, False: 17.9k]
  |  Branch (72:22): [True: 0, False: 17.9k]
  ------------------
   73|      0|    ctx->err = AOM_CODEC_ERROR;
   74|      0|    return AOM_CODEC_ERROR;
   75|      0|  }
   76|  17.9k|  ctx->iface->destroy((aom_codec_alg_priv_t *)ctx->priv);
   77|  17.9k|  ctx->iface = NULL;
   78|  17.9k|  ctx->name = NULL;
   79|       |  ctx->priv = NULL;
   80|  17.9k|  ctx->err = AOM_CODEC_OK;
   81|  17.9k|  return AOM_CODEC_OK;
   82|  17.9k|}
aom_codec_control:
   88|  35.8k|aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) {
   89|  35.8k|  if (!ctx) {
  ------------------
  |  Branch (89:7): [True: 0, False: 35.8k]
  ------------------
   90|      0|    return AOM_CODEC_INVALID_PARAM;
   91|      0|  }
   92|       |  // Control ID must be non-zero.
   93|  35.8k|  if (!ctrl_id) {
  ------------------
  |  Branch (93:7): [True: 0, False: 35.8k]
  ------------------
   94|      0|    ctx->err = AOM_CODEC_INVALID_PARAM;
   95|      0|    return AOM_CODEC_INVALID_PARAM;
   96|      0|  }
   97|  35.8k|  if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps) {
  ------------------
  |  Branch (97:7): [True: 0, False: 35.8k]
  |  Branch (97:22): [True: 0, False: 35.8k]
  |  Branch (97:36): [True: 0, False: 35.8k]
  ------------------
   98|      0|    ctx->err = AOM_CODEC_ERROR;
   99|      0|    return AOM_CODEC_ERROR;
  100|      0|  }
  101|       |
  102|       |  // "ctrl_maps" is an array of (control ID, function pointer) elements,
  103|       |  // with CTRL_MAP_END as a sentinel.
  104|  35.8k|  for (aom_codec_ctrl_fn_map_t *entry = ctx->iface->ctrl_maps;
  105|   376k|       !at_ctrl_map_end(entry); ++entry) {
  ------------------
  |  Branch (105:8): [True: 376k, False: 0]
  ------------------
  106|   376k|    if (entry->ctrl_id == ctrl_id) {
  ------------------
  |  Branch (106:9): [True: 35.8k, False: 340k]
  ------------------
  107|  35.8k|      va_list ap;
  108|  35.8k|      va_start(ap, ctrl_id);
  109|  35.8k|      ctx->err = entry->fn((aom_codec_alg_priv_t *)ctx->priv, ap);
  110|  35.8k|      va_end(ap);
  111|  35.8k|      return ctx->err;
  112|  35.8k|    }
  113|   376k|  }
  114|      0|  ctx->err = AOM_CODEC_ERROR;
  115|      0|  ctx->priv->err_detail = "Invalid control ID";
  116|      0|  return AOM_CODEC_ERROR;
  117|  35.8k|}
aom_internal_error:
  160|  14.6k|                        aom_codec_err_t error, const char *fmt, ...) {
  161|  14.6k|  va_list ap;
  162|       |
  163|  14.6k|  va_start(ap, fmt);
  164|  14.6k|  set_error(info, error, fmt, ap);
  165|  14.6k|  va_end(ap);
  166|       |
  167|  14.6k|  if (info->setjmp) longjmp(info->jmp, info->error_code);
  ------------------
  |  Branch (167:7): [True: 14.6k, False: 0]
  ------------------
  168|  14.6k|}
aom_merge_corrupted_flag:
  182|   138k|void aom_merge_corrupted_flag(int *corrupted, int value) {
  183|   138k|  *corrupted |= value;
  184|   138k|}
aom_codec.c:set_error:
  135|  14.6k|                      aom_codec_err_t error, const char *fmt, va_list ap) {
  136|  14.6k|  info->error_code = error;
  137|  14.6k|  info->has_detail = 0;
  138|       |
  139|  14.6k|  if (fmt) {
  ------------------
  |  Branch (139:7): [True: 14.6k, False: 0]
  ------------------
  140|  14.6k|    size_t sz = sizeof(info->detail);
  141|       |
  142|  14.6k|    info->has_detail = 1;
  143|  14.6k|    vsnprintf(info->detail, sz - 1, fmt, ap);
  144|  14.6k|    info->detail[sz - 1] = '\0';
  145|  14.6k|  }
  146|  14.6k|}

aom_codec_dec_init_ver:
   28|  17.9k|                                       aom_codec_flags_t flags, int ver) {
   29|  17.9k|  aom_codec_err_t res;
   30|       |
   31|  17.9k|  if (ver != AOM_DECODER_ABI_VERSION)
  ------------------
  |  |   45|  17.9k|  (6 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/
  |  |  ------------------
  |  |  |  |  152|  17.9k|#define AOM_CODEC_ABI_VERSION (7 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/
  |  |  |  |  ------------------
  |  |  |  |  |  |   33|  17.9k|#define AOM_IMAGE_ABI_VERSION (9) /**<\hideinitializer*/
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (31:7): [True: 0, False: 17.9k]
  ------------------
   32|      0|    res = AOM_CODEC_ABI_MISMATCH;
   33|  17.9k|  else if (!ctx || !iface)
  ------------------
  |  Branch (33:12): [True: 0, False: 17.9k]
  |  Branch (33:20): [True: 0, False: 17.9k]
  ------------------
   34|      0|    res = AOM_CODEC_INVALID_PARAM;
   35|  17.9k|  else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION)
  ------------------
  |  |   65|  17.9k|#define AOM_CODEC_INTERNAL_ABI_VERSION (7) /**<\hideinitializer*/
  ------------------
  |  Branch (35:12): [True: 0, False: 17.9k]
  ------------------
   36|      0|    res = AOM_CODEC_ABI_MISMATCH;
   37|  17.9k|  else if (!(iface->caps & AOM_CODEC_CAP_DECODER))
  ------------------
  |  |  218|  17.9k|#define AOM_CODEC_CAP_DECODER 0x1 /**< Is a decoder */
  ------------------
  |  Branch (37:12): [True: 0, False: 17.9k]
  ------------------
   38|      0|    res = AOM_CODEC_INCAPABLE;
   39|  17.9k|  else {
   40|  17.9k|    memset(ctx, 0, sizeof(*ctx));
   41|  17.9k|    ctx->iface = iface;
   42|  17.9k|    ctx->name = iface->name;
   43|  17.9k|    ctx->priv = NULL;
   44|  17.9k|    ctx->init_flags = flags;
   45|  17.9k|    ctx->config.dec = cfg;
   46|       |
   47|  17.9k|    res = ctx->iface->init(ctx);
   48|  17.9k|    if (res) {
  ------------------
  |  Branch (48:9): [True: 0, False: 17.9k]
  ------------------
   49|      0|      ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
  ------------------
  |  Branch (49:25): [True: 0, False: 0]
  ------------------
   50|      0|      aom_codec_destroy(ctx);
   51|      0|    }
   52|  17.9k|  }
   53|       |
   54|  17.9k|  return SAVE_STATUS(ctx, res);
  ------------------
  |  |   19|  17.9k|#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
  |  |  ------------------
  |  |  |  Branch (19:32): [True: 17.9k, False: 0]
  |  |  ------------------
  ------------------
   55|  17.9k|}
aom_codec_peek_stream_info:
   59|  23.4k|                                           aom_codec_stream_info_t *si) {
   60|  23.4k|  aom_codec_err_t res;
   61|       |
   62|  23.4k|  if (!iface || !data || !data_sz || !si) {
  ------------------
  |  Branch (62:7): [True: 0, False: 23.4k]
  |  Branch (62:17): [True: 0, False: 23.4k]
  |  Branch (62:26): [True: 0, False: 23.4k]
  |  Branch (62:38): [True: 0, False: 23.4k]
  ------------------
   63|      0|    res = AOM_CODEC_INVALID_PARAM;
   64|  23.4k|  } else {
   65|       |    /* Set default/unknown values */
   66|  23.4k|    si->w = 0;
   67|  23.4k|    si->h = 0;
   68|       |
   69|  23.4k|    res = iface->dec.peek_si(data, data_sz, si);
   70|  23.4k|  }
   71|       |
   72|  23.4k|  return res;
   73|  23.4k|}
aom_codec_decode:
   95|  22.4k|                                 size_t data_sz, void *user_priv) {
   96|  22.4k|  aom_codec_err_t res;
   97|       |
   98|  22.4k|  if (!ctx)
  ------------------
  |  Branch (98:7): [True: 0, False: 22.4k]
  ------------------
   99|      0|    res = AOM_CODEC_INVALID_PARAM;
  100|  22.4k|  else if (!ctx->iface || !ctx->priv)
  ------------------
  |  Branch (100:12): [True: 0, False: 22.4k]
  |  Branch (100:27): [True: 0, False: 22.4k]
  ------------------
  101|      0|    res = AOM_CODEC_ERROR;
  102|  22.4k|  else {
  103|  22.4k|    res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv);
  104|  22.4k|  }
  105|       |
  106|  22.4k|  return SAVE_STATUS(ctx, res);
  ------------------
  |  |   19|  22.4k|#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
  |  |  ------------------
  |  |  |  Branch (19:32): [True: 22.4k, False: 0]
  |  |  ------------------
  ------------------
  107|  22.4k|}
aom_codec_get_frame:
  109|  29.6k|aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter) {
  110|  29.6k|  aom_image_t *img;
  111|       |
  112|  29.6k|  if (!ctx || !iter || !ctx->iface || !ctx->priv)
  ------------------
  |  Branch (112:7): [True: 0, False: 29.6k]
  |  Branch (112:15): [True: 0, False: 29.6k]
  |  Branch (112:24): [True: 0, False: 29.6k]
  |  Branch (112:39): [True: 0, False: 29.6k]
  ------------------
  113|      0|    img = NULL;
  114|  29.6k|  else
  115|  29.6k|    img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter);
  116|       |
  117|  29.6k|  return img;
  118|  29.6k|}
aom_decoder.c:get_alg_priv:
   21|  52.0k|static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) {
   22|  52.0k|  return (aom_codec_alg_priv_t *)ctx->priv;
   23|  52.0k|}

aom_img_free:
  303|  17.9k|void aom_img_free(aom_image_t *img) {
  304|  17.9k|  if (img) {
  ------------------
  |  Branch (304:7): [True: 17.9k, False: 0]
  ------------------
  305|  17.9k|    aom_img_remove_metadata(img);
  306|  17.9k|    if (img->img_data && img->img_data_owner) aom_free(img->img_data);
  ------------------
  |  Branch (306:9): [True: 4.97k, False: 12.9k]
  |  Branch (306:26): [True: 0, False: 4.97k]
  ------------------
  307|       |
  308|  17.9k|    if (img->self_allocd) free(img);
  ------------------
  |  Branch (308:9): [True: 0, False: 17.9k]
  ------------------
  309|  17.9k|  }
  310|  17.9k|}
aom_img_metadata_alloc:
  328|      6|    aom_metadata_insert_flags_t insert_flag) {
  329|      6|  if (!data || sz == 0) return NULL;
  ------------------
  |  Branch (329:7): [True: 0, False: 6]
  |  Branch (329:16): [True: 0, False: 6]
  ------------------
  330|      6|  aom_metadata_t *metadata = (aom_metadata_t *)malloc(sizeof(aom_metadata_t));
  331|      6|  if (!metadata) return NULL;
  ------------------
  |  Branch (331:7): [True: 0, False: 6]
  ------------------
  332|      6|  metadata->type = type;
  333|      6|  metadata->payload = (uint8_t *)malloc(sz);
  334|      6|  if (!metadata->payload) {
  ------------------
  |  Branch (334:7): [True: 0, False: 6]
  ------------------
  335|      0|    free(metadata);
  336|      0|    return NULL;
  337|      0|  }
  338|      6|  memcpy(metadata->payload, data, sz);
  339|      6|  metadata->sz = sz;
  340|      6|  metadata->insert_flag = insert_flag;
  341|      6|  return metadata;
  342|      6|}
aom_img_metadata_free:
  344|      6|void aom_img_metadata_free(aom_metadata_t *metadata) {
  345|      6|  if (metadata) {
  ------------------
  |  Branch (345:7): [True: 6, False: 0]
  ------------------
  346|      6|    if (metadata->payload) free(metadata->payload);
  ------------------
  |  Branch (346:9): [True: 6, False: 0]
  ------------------
  347|      6|    free(metadata);
  348|      6|  }
  349|      6|}
aom_img_metadata_array_alloc:
  351|      6|aom_metadata_array_t *aom_img_metadata_array_alloc(size_t sz) {
  352|      6|  aom_metadata_array_t *arr =
  353|      6|      (aom_metadata_array_t *)calloc(1, sizeof(aom_metadata_array_t));
  354|      6|  if (!arr) return NULL;
  ------------------
  |  Branch (354:7): [True: 0, False: 6]
  ------------------
  355|      6|  if (sz > 0) {
  ------------------
  |  Branch (355:7): [True: 0, False: 6]
  ------------------
  356|      0|    arr->metadata_array =
  357|      0|        (aom_metadata_t **)calloc(sz, sizeof(aom_metadata_t *));
  358|      0|    if (!arr->metadata_array) {
  ------------------
  |  Branch (358:9): [True: 0, False: 0]
  ------------------
  359|      0|      aom_img_metadata_array_free(arr);
  360|      0|      return NULL;
  361|      0|    }
  362|      0|    arr->sz = sz;
  363|      0|  }
  364|      6|  return arr;
  365|      6|}
aom_img_metadata_array_free:
  367|  17.9k|void aom_img_metadata_array_free(aom_metadata_array_t *arr) {
  368|  17.9k|  if (arr) {
  ------------------
  |  Branch (368:7): [True: 6, False: 17.9k]
  ------------------
  369|      6|    if (arr->metadata_array) {
  ------------------
  |  Branch (369:9): [True: 6, False: 0]
  ------------------
  370|     12|      for (size_t i = 0; i < arr->sz; i++) {
  ------------------
  |  Branch (370:26): [True: 6, False: 6]
  ------------------
  371|      6|        aom_img_metadata_free(arr->metadata_array[i]);
  372|      6|      }
  373|      6|      free(arr->metadata_array);
  374|      6|    }
  375|      6|    free(arr);
  376|      6|  }
  377|  17.9k|}
aom_img_remove_metadata:
  402|  24.9k|void aom_img_remove_metadata(aom_image_t *img) {
  403|  24.9k|  if (img && img->metadata) {
  ------------------
  |  Branch (403:7): [True: 24.9k, False: 0]
  |  Branch (403:14): [True: 0, False: 24.9k]
  ------------------
  404|      0|    aom_img_metadata_array_free(img->metadata);
  405|       |    img->metadata = NULL;
  406|      0|  }
  407|  24.9k|}

aom_uleb_decode:
   32|   175k|                    size_t *length) {
   33|   175k|  if (buffer && value) {
  ------------------
  |  Branch (33:7): [True: 175k, False: 0]
  |  Branch (33:17): [True: 175k, False: 0]
  ------------------
   34|   175k|    *value = 0;
   35|   201k|    for (size_t i = 0; i < kMaximumLeb128Size && i < available; ++i) {
  ------------------
  |  Branch (35:24): [True: 201k, False: 13]
  |  Branch (35:50): [True: 201k, False: 169]
  ------------------
   36|   201k|      const uint8_t decoded_byte = *(buffer + i) & kLeb128ByteMask;
   37|   201k|      *value |= ((uint64_t)decoded_byte) << (i * 7);
   38|   201k|      if ((*(buffer + i) >> 7) == 0) {
  ------------------
  |  Branch (38:11): [True: 175k, False: 26.1k]
  ------------------
   39|   175k|        if (length) {
  ------------------
  |  Branch (39:13): [True: 175k, False: 0]
  ------------------
   40|   175k|          *length = i + 1;
   41|   175k|        }
   42|       |
   43|       |        // Fail on values larger than 32-bits to ensure consistent behavior on
   44|       |        // 32 and 64 bit targets: value is typically used to determine buffer
   45|       |        // allocation size.
   46|   175k|        if (*value > UINT32_MAX) return -1;
  ------------------
  |  Branch (46:13): [True: 50, False: 175k]
  ------------------
   47|       |
   48|   175k|        return 0;
   49|   175k|      }
   50|   201k|    }
   51|   175k|  }
   52|       |
   53|       |  // If we get here, either the buffer/value pointers were invalid,
   54|       |  // or we ran over the available space
   55|    182|  return -1;
   56|   175k|}

decodeframe.c:clamp:
   74|   508k|static inline int clamp(int value, int low, int high) {
   75|   508k|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 16.5k, False: 492k]
  |  Branch (75:31): [True: 2.15k, False: 489k]
  ------------------
   76|   508k|}
decodemv.c:clamp:
   74|  1.84M|static inline int clamp(int value, int low, int high) {
   75|  1.84M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 1.00k, False: 1.84M]
  |  Branch (75:31): [True: 38.4k, False: 1.80M]
  ------------------
   76|  1.84M|}
decodetxb.c:clamp:
   74|  26.2M|static inline int clamp(int value, int low, int high) {
   75|  26.2M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 1.92k, False: 26.2M]
  |  Branch (75:31): [True: 1.33k, False: 26.2M]
  ------------------
   76|  26.2M|}
blend_a64_mask.c:negative_to_zero:
   99|  9.47k|static inline unsigned int negative_to_zero(int value) {
  100|  9.47k|  return value & ~(value >> (sizeof(value) * 8 - 1));
  101|  9.47k|}
av1_inv_txfm2d.c:clip_pixel_highbd:
   86|  1.21M|static inline uint16_t clip_pixel_highbd(int val, int bd) {
   87|  1.21M|  switch (bd) {
   88|   754k|    case 8:
  ------------------
  |  Branch (88:5): [True: 754k, False: 462k]
  ------------------
   89|   754k|    default: return (uint16_t)clamp(val, 0, 255);
  ------------------
  |  Branch (89:5): [True: 0, False: 1.21M]
  ------------------
   90|   455k|    case 10: return (uint16_t)clamp(val, 0, 1023);
  ------------------
  |  Branch (90:5): [True: 455k, False: 761k]
  ------------------
   91|  6.52k|    case 12: return (uint16_t)clamp(val, 0, 4095);
  ------------------
  |  Branch (91:5): [True: 6.52k, False: 1.21M]
  ------------------
   92|  1.21M|  }
   93|  1.21M|}
av1_inv_txfm2d.c:clamp:
   74|  1.21M|static inline int clamp(int value, int low, int high) {
   75|  1.21M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 5.95k, False: 1.21M]
  |  Branch (75:31): [True: 5.19k, False: 1.20M]
  ------------------
   76|  1.21M|}
av1_loopfilter.c:clamp:
   74|  2.38M|static inline int clamp(int value, int low, int high) {
   75|  2.38M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 782k, False: 1.60M]
  |  Branch (75:31): [True: 219k, False: 1.38M]
  ------------------
   76|  2.38M|}
mvref_common.c:clamp:
   74|   570k|static inline int clamp(int value, int low, int high) {
   75|   570k|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 21.4k, False: 549k]
  |  Branch (75:31): [True: 687, False: 548k]
  ------------------
   76|   570k|}
quant_common.c:clamp:
   74|  28.7M|static inline int clamp(int value, int low, int high) {
   75|  28.7M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 2.28M, False: 26.4M]
  |  Branch (75:31): [True: 486k, False: 25.9M]
  ------------------
   76|  28.7M|}
reconinter.c:clamp:
   74|  11.0k|static inline int clamp(int value, int low, int high) {
   75|  11.0k|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 0, False: 11.0k]
  |  Branch (75:31): [True: 192, False: 10.8k]
  ------------------
   76|  11.0k|}
reconintra.c:clip_pixel_highbd:
   86|  14.9M|static inline uint16_t clip_pixel_highbd(int val, int bd) {
   87|  14.9M|  switch (bd) {
   88|      0|    case 8:
  ------------------
  |  Branch (88:5): [True: 0, False: 14.9M]
  ------------------
   89|      0|    default: return (uint16_t)clamp(val, 0, 255);
  ------------------
  |  Branch (89:5): [True: 0, False: 14.9M]
  ------------------
   90|  14.8M|    case 10: return (uint16_t)clamp(val, 0, 1023);
  ------------------
  |  Branch (90:5): [True: 14.8M, False: 55.0k]
  ------------------
   91|  55.3k|    case 12: return (uint16_t)clamp(val, 0, 4095);
  ------------------
  |  Branch (91:5): [True: 55.3k, False: 14.8M]
  ------------------
   92|  14.9M|  }
   93|  14.9M|}
reconintra.c:clamp:
   74|  14.9M|static inline int clamp(int value, int low, int high) {
   75|  14.9M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 9.27k, False: 14.9M]
  |  Branch (75:31): [True: 4.88k, False: 14.9M]
  ------------------
   76|  14.9M|}
warped_motion.c:clamp:
   74|   240k|static inline int clamp(int value, int low, int high) {
   75|   240k|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 0, False: 240k]
  |  Branch (75:31): [True: 18, False: 240k]
  ------------------
   76|   240k|}
warped_motion.c:clamp64:
   78|  13.6k|static inline int64_t clamp64(int64_t value, int64_t low, int64_t high) {
   79|  13.6k|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (79:10): [True: 396, False: 13.2k]
  |  Branch (79:31): [True: 358, False: 12.9k]
  ------------------
   80|  13.6k|}
warp_plane_avx2.c:clamp:
   74|   220k|static inline int clamp(int value, int low, int high) {
   75|   220k|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 7.53k, False: 213k]
  |  Branch (75:31): [True: 5.02k, False: 207k]
  ------------------
   76|   220k|}
highbd_warp_affine_avx2.c:clamp:
   74|   758k|static inline int clamp(int value, int low, int high) {
   75|   758k|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 46.8k, False: 711k]
  |  Branch (75:31): [True: 163k, False: 548k]
  ------------------
   76|   758k|}

aom_dsp_rtcd:
   18|  17.9k|void aom_dsp_rtcd(void) { aom_once(setup_rtcd_internal); }

aom_read_primitive_refsubexpfin_:
   57|  41.8k|                                          uint16_t ref ACCT_STR_PARAM) {
   58|  41.8k|  return inv_recenter_finite_nonneg(
   59|  41.8k|      n, ref, read_primitive_subexpfin(r, n, k, ACCT_STR_NAME));
  ------------------
  |  |   18|  41.8k|  read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   60|  41.8k|}
binary_codes_reader.c:read_primitive_subexpfin_:
   32|  41.8k|                                          uint16_t k ACCT_STR_PARAM) {
   33|  41.8k|  int i = 0;
   34|  41.8k|  int mk = 0;
   35|       |
   36|  73.4k|  while (1) {
  ------------------
  |  Branch (36:10): [True: 73.4k, Folded]
  ------------------
   37|  73.4k|    int b = (i ? k + i - 1 : k);
  ------------------
  |  Branch (37:14): [True: 31.6k, False: 41.8k]
  ------------------
   38|  73.4k|    int a = (1 << b);
   39|       |
   40|  73.4k|    if (n <= mk + 3 * a) {
  ------------------
  |  Branch (40:9): [True: 10.6k, False: 62.8k]
  ------------------
   41|  10.6k|      return read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk;
  ------------------
  |  |   16|  10.6k|  read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   42|  10.6k|    }
   43|       |
   44|  62.8k|    if (!aom_read_bit(r, ACCT_STR_NAME)) {
  ------------------
  |  |   43|  62.8k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  |  Branch (44:9): [True: 31.1k, False: 31.6k]
  ------------------
   45|  31.1k|      return aom_read_literal(r, b, ACCT_STR_NAME) + mk;
  ------------------
  |  |   47|  31.1k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   46|  31.1k|    }
   47|       |
   48|  31.6k|    i = i + 1;
   49|  31.6k|    mk += a;
   50|  31.6k|  }
   51|       |
   52|  41.8k|  assert(0);
   53|      0|  return 0;
   54|  41.8k|}
binary_codes_reader.c:read_primitive_quniform_:
   21|  10.6k|                                         uint16_t n ACCT_STR_PARAM) {
   22|  10.6k|  if (n <= 1) return 0;
  ------------------
  |  Branch (22:7): [True: 0, False: 10.6k]
  ------------------
   23|  10.6k|  const int l = get_msb(n) + 1;
   24|  10.6k|  const int m = (1 << l) - n;
   25|  10.6k|  const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME);
  ------------------
  |  |   47|  10.6k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   26|  10.6k|  return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME);
  ------------------
  |  |   43|  15.9k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  |  Branch (26:10): [True: 5.33k, False: 5.31k]
  ------------------
   27|  10.6k|}

aom_reader_init:
   14|  27.3k|int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size) {
   15|  27.3k|  if (size && !buffer) {
  ------------------
  |  Branch (15:7): [True: 27.3k, False: 18.4E]
  |  Branch (15:15): [True: 0, False: 27.3k]
  ------------------
   16|      0|    return 1;
   17|      0|  }
   18|  27.3k|  r->buffer_end = buffer + size;
   19|  27.3k|  r->buffer = buffer;
   20|  27.3k|  od_ec_dec_init(&r->ec, buffer, (uint32_t)size);
   21|       |#if CONFIG_ACCOUNTING
   22|       |  r->accounting = NULL;
   23|       |#endif
   24|  27.3k|  return 0;
   25|  27.3k|}
aom_reader_find_begin:
   27|  17.6k|const uint8_t *aom_reader_find_begin(aom_reader *r) { return r->buffer; }
aom_reader_find_end:
   29|  25.8k|const uint8_t *aom_reader_find_end(aom_reader *r) { return r->buffer_end; }
aom_reader_tell:
   31|   187k|uint32_t aom_reader_tell(const aom_reader *r) { return od_ec_dec_tell(&r->ec); }
aom_reader_has_overflowed:
   37|   170k|int aom_reader_has_overflowed(const aom_reader *r) {
   38|   170k|  const uint32_t tell_bits = aom_reader_tell(r);
   39|   170k|  const uint32_t tell_bytes = (tell_bits + 7) >> 3;
   40|   170k|  return ((ptrdiff_t)tell_bytes > r->buffer_end - r->buffer);
   41|   170k|}

decodeframe.c:aom_read_symbol_:
  221|  1.92M|                                   int nsymbs ACCT_STR_PARAM) {
  222|  1.92M|  int ret;
  223|  1.92M|  ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
  ------------------
  |  |   49|  1.92M|  aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  224|  1.92M|  if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs);
  ------------------
  |  Branch (224:7): [True: 1.75M, False: 164k]
  ------------------
  225|  1.92M|  return ret;
  226|  1.92M|}
decodeframe.c:aom_read_cdf_:
  169|  2.01M|                                int nsymbs ACCT_STR_PARAM) {
  170|  2.01M|  int symb;
  171|  2.01M|  assert(cdf != NULL);
  172|  2.01M|  symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
  173|       |
  174|       |#if CONFIG_BITSTREAM_DEBUG
  175|       |  {
  176|       |    int i;
  177|       |    int cdf_error = 0;
  178|       |    int ref_symb, ref_nsymbs;
  179|       |    aom_cdf_prob ref_cdf[16];
  180|       |    const int queue_r = bitstream_queue_get_read();
  181|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  182|       |    bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
  183|       |    if (nsymbs != ref_nsymbs) {
  184|       |      fprintf(stderr,
  185|       |              "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d "
  186|       |              "queue_r %d\n",
  187|       |              frame_idx, nsymbs, ref_nsymbs, queue_r);
  188|       |      cdf_error = 0;
  189|       |      assert(0);
  190|       |    } else {
  191|       |      for (i = 0; i < nsymbs; ++i)
  192|       |        if (cdf[i] != ref_cdf[i]) cdf_error = 1;
  193|       |    }
  194|       |    if (cdf_error) {
  195|       |      fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx,
  196|       |              cdf[0]);
  197|       |      for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]);
  198|       |      fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]);
  199|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  200|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  201|       |      assert(0);
  202|       |    }
  203|       |    if (symb != ref_symb) {
  204|       |      fprintf(
  205|       |          stderr,
  206|       |          "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
  207|       |          frame_idx, symb, ref_symb, queue_r);
  208|       |      assert(0);
  209|       |    }
  210|       |  }
  211|       |#endif
  212|       |
  213|       |#if CONFIG_ACCOUNTING
  214|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  215|       |  aom_update_symb_counts(r, (nsymbs == 2));
  216|       |#endif
  217|  2.01M|  return symb;
  218|  2.01M|}
decodeframe.c:aom_read_literal_:
  158|  5.36k|static inline int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
  159|  5.36k|  int literal = 0, bit;
  160|       |
  161|  26.8k|  for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
  ------------------
  |  |   43|  21.4k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  |  Branch (161:24): [True: 21.4k, False: 5.36k]
  ------------------
  162|       |#if CONFIG_ACCOUNTING
  163|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  164|       |#endif
  165|  5.36k|  return literal;
  166|  5.36k|}
decodeframe.c:aom_read_bit_:
  149|  21.4k|static inline int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
  150|  21.4k|  int ret;
  151|  21.4k|  ret = aom_read(r, 128, NULL);  // aom_prob_half
  ------------------
  |  |   41|  21.4k|  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  152|       |#if CONFIG_ACCOUNTING
  153|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  154|       |#endif
  155|  21.4k|  return ret;
  156|  21.4k|}
decodeframe.c:aom_read_:
  104|  21.4k|static inline int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
  105|  21.4k|  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
  106|  21.4k|  int bit = od_ec_decode_bool_q15(&r->ec, p);
  107|       |
  108|       |#if CONFIG_BITSTREAM_DEBUG
  109|       |  {
  110|       |    int i;
  111|       |    int ref_bit, ref_nsymbs;
  112|       |    aom_cdf_prob ref_cdf[16];
  113|       |    const int queue_r = bitstream_queue_get_read();
  114|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  115|       |    bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
  116|       |    if (ref_nsymbs != 2) {
  117|       |      fprintf(stderr,
  118|       |              "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
  119|       |              "%d queue_r %d\n",
  120|       |              frame_idx, 2, ref_nsymbs, queue_r);
  121|       |      assert(0);
  122|       |    }
  123|       |    if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
  124|       |        (ref_cdf[1] != 32767)) {
  125|       |      fprintf(stderr,
  126|       |              "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
  127|       |              frame_idx, p, 32767, ref_cdf[0]);
  128|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  129|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  130|       |      assert(0);
  131|       |    }
  132|       |    if (bit != ref_bit) {
  133|       |      fprintf(stderr,
  134|       |              "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
  135|       |              "queue_r %d\n",
  136|       |              frame_idx, bit, ref_bit, queue_r);
  137|       |      assert(0);
  138|       |    }
  139|       |  }
  140|       |#endif
  141|       |
  142|       |#if CONFIG_ACCOUNTING
  143|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  144|       |  aom_update_symb_counts(r, 1);
  145|       |#endif
  146|  21.4k|  return bit;
  147|  21.4k|}
decodemv.c:aom_read_symbol_:
  221|  9.81M|                                   int nsymbs ACCT_STR_PARAM) {
  222|  9.81M|  int ret;
  223|  9.81M|  ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
  ------------------
  |  |   49|  9.81M|  aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  224|  9.81M|  if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs);
  ------------------
  |  Branch (224:7): [True: 9.12M, False: 694k]
  ------------------
  225|  9.81M|  return ret;
  226|  9.81M|}
decodemv.c:aom_read_cdf_:
  169|  9.81M|                                int nsymbs ACCT_STR_PARAM) {
  170|  9.81M|  int symb;
  171|  9.81M|  assert(cdf != NULL);
  172|  9.81M|  symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
  173|       |
  174|       |#if CONFIG_BITSTREAM_DEBUG
  175|       |  {
  176|       |    int i;
  177|       |    int cdf_error = 0;
  178|       |    int ref_symb, ref_nsymbs;
  179|       |    aom_cdf_prob ref_cdf[16];
  180|       |    const int queue_r = bitstream_queue_get_read();
  181|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  182|       |    bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
  183|       |    if (nsymbs != ref_nsymbs) {
  184|       |      fprintf(stderr,
  185|       |              "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d "
  186|       |              "queue_r %d\n",
  187|       |              frame_idx, nsymbs, ref_nsymbs, queue_r);
  188|       |      cdf_error = 0;
  189|       |      assert(0);
  190|       |    } else {
  191|       |      for (i = 0; i < nsymbs; ++i)
  192|       |        if (cdf[i] != ref_cdf[i]) cdf_error = 1;
  193|       |    }
  194|       |    if (cdf_error) {
  195|       |      fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx,
  196|       |              cdf[0]);
  197|       |      for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]);
  198|       |      fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]);
  199|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  200|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  201|       |      assert(0);
  202|       |    }
  203|       |    if (symb != ref_symb) {
  204|       |      fprintf(
  205|       |          stderr,
  206|       |          "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
  207|       |          frame_idx, symb, ref_symb, queue_r);
  208|       |      assert(0);
  209|       |    }
  210|       |  }
  211|       |#endif
  212|       |
  213|       |#if CONFIG_ACCOUNTING
  214|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  215|       |  aom_update_symb_counts(r, (nsymbs == 2));
  216|       |#endif
  217|  9.81M|  return symb;
  218|  9.81M|}
decodemv.c:aom_read_literal_:
  158|   404k|static inline int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
  159|   404k|  int literal = 0, bit;
  160|       |
  161|  2.04M|  for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
  ------------------
  |  |   43|  1.64M|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  |  Branch (161:24): [True: 1.64M, False: 404k]
  ------------------
  162|       |#if CONFIG_ACCOUNTING
  163|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  164|       |#endif
  165|   404k|  return literal;
  166|   404k|}
decodemv.c:aom_read_bit_:
  149|  1.83M|static inline int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
  150|  1.83M|  int ret;
  151|  1.83M|  ret = aom_read(r, 128, NULL);  // aom_prob_half
  ------------------
  |  |   41|  1.83M|  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  152|       |#if CONFIG_ACCOUNTING
  153|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  154|       |#endif
  155|  1.83M|  return ret;
  156|  1.83M|}
decodemv.c:aom_read_:
  104|  1.83M|static inline int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
  105|  1.83M|  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
  106|  1.83M|  int bit = od_ec_decode_bool_q15(&r->ec, p);
  107|       |
  108|       |#if CONFIG_BITSTREAM_DEBUG
  109|       |  {
  110|       |    int i;
  111|       |    int ref_bit, ref_nsymbs;
  112|       |    aom_cdf_prob ref_cdf[16];
  113|       |    const int queue_r = bitstream_queue_get_read();
  114|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  115|       |    bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
  116|       |    if (ref_nsymbs != 2) {
  117|       |      fprintf(stderr,
  118|       |              "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
  119|       |              "%d queue_r %d\n",
  120|       |              frame_idx, 2, ref_nsymbs, queue_r);
  121|       |      assert(0);
  122|       |    }
  123|       |    if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
  124|       |        (ref_cdf[1] != 32767)) {
  125|       |      fprintf(stderr,
  126|       |              "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
  127|       |              frame_idx, p, 32767, ref_cdf[0]);
  128|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  129|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  130|       |      assert(0);
  131|       |    }
  132|       |    if (bit != ref_bit) {
  133|       |      fprintf(stderr,
  134|       |              "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
  135|       |              "queue_r %d\n",
  136|       |              frame_idx, bit, ref_bit, queue_r);
  137|       |      assert(0);
  138|       |    }
  139|       |  }
  140|       |#endif
  141|       |
  142|       |#if CONFIG_ACCOUNTING
  143|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  144|       |  aom_update_symb_counts(r, 1);
  145|       |#endif
  146|  1.83M|  return bit;
  147|  1.83M|}
decodetxb.c:aom_read_symbol_:
  221|  83.0M|                                   int nsymbs ACCT_STR_PARAM) {
  222|  83.0M|  int ret;
  223|  83.0M|  ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
  ------------------
  |  |   49|  83.0M|  aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  224|  83.0M|  if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs);
  ------------------
  |  Branch (224:7): [True: 77.6M, False: 5.37M]
  ------------------
  225|  83.0M|  return ret;
  226|  83.0M|}
decodetxb.c:aom_read_cdf_:
  169|  83.0M|                                int nsymbs ACCT_STR_PARAM) {
  170|  83.0M|  int symb;
  171|  83.0M|  assert(cdf != NULL);
  172|  83.0M|  symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
  173|       |
  174|       |#if CONFIG_BITSTREAM_DEBUG
  175|       |  {
  176|       |    int i;
  177|       |    int cdf_error = 0;
  178|       |    int ref_symb, ref_nsymbs;
  179|       |    aom_cdf_prob ref_cdf[16];
  180|       |    const int queue_r = bitstream_queue_get_read();
  181|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  182|       |    bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
  183|       |    if (nsymbs != ref_nsymbs) {
  184|       |      fprintf(stderr,
  185|       |              "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d "
  186|       |              "queue_r %d\n",
  187|       |              frame_idx, nsymbs, ref_nsymbs, queue_r);
  188|       |      cdf_error = 0;
  189|       |      assert(0);
  190|       |    } else {
  191|       |      for (i = 0; i < nsymbs; ++i)
  192|       |        if (cdf[i] != ref_cdf[i]) cdf_error = 1;
  193|       |    }
  194|       |    if (cdf_error) {
  195|       |      fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx,
  196|       |              cdf[0]);
  197|       |      for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]);
  198|       |      fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]);
  199|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  200|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  201|       |      assert(0);
  202|       |    }
  203|       |    if (symb != ref_symb) {
  204|       |      fprintf(
  205|       |          stderr,
  206|       |          "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
  207|       |          frame_idx, symb, ref_symb, queue_r);
  208|       |      assert(0);
  209|       |    }
  210|       |  }
  211|       |#endif
  212|       |
  213|       |#if CONFIG_ACCOUNTING
  214|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  215|       |  aom_update_symb_counts(r, (nsymbs == 2));
  216|       |#endif
  217|  83.0M|  return symb;
  218|  83.0M|}
decodetxb.c:aom_read_bit_:
  149|  31.4M|static inline int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
  150|  31.4M|  int ret;
  151|  31.4M|  ret = aom_read(r, 128, NULL);  // aom_prob_half
  ------------------
  |  |   41|  31.4M|  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  152|       |#if CONFIG_ACCOUNTING
  153|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  154|       |#endif
  155|  31.4M|  return ret;
  156|  31.4M|}
decodetxb.c:aom_read_:
  104|  31.4M|static inline int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
  105|  31.4M|  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
  106|  31.4M|  int bit = od_ec_decode_bool_q15(&r->ec, p);
  107|       |
  108|       |#if CONFIG_BITSTREAM_DEBUG
  109|       |  {
  110|       |    int i;
  111|       |    int ref_bit, ref_nsymbs;
  112|       |    aom_cdf_prob ref_cdf[16];
  113|       |    const int queue_r = bitstream_queue_get_read();
  114|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  115|       |    bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
  116|       |    if (ref_nsymbs != 2) {
  117|       |      fprintf(stderr,
  118|       |              "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
  119|       |              "%d queue_r %d\n",
  120|       |              frame_idx, 2, ref_nsymbs, queue_r);
  121|       |      assert(0);
  122|       |    }
  123|       |    if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
  124|       |        (ref_cdf[1] != 32767)) {
  125|       |      fprintf(stderr,
  126|       |              "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
  127|       |              frame_idx, p, 32767, ref_cdf[0]);
  128|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  129|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  130|       |      assert(0);
  131|       |    }
  132|       |    if (bit != ref_bit) {
  133|       |      fprintf(stderr,
  134|       |              "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
  135|       |              "queue_r %d\n",
  136|       |              frame_idx, bit, ref_bit, queue_r);
  137|       |      assert(0);
  138|       |    }
  139|       |  }
  140|       |#endif
  141|       |
  142|       |#if CONFIG_ACCOUNTING
  143|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  144|       |  aom_update_symb_counts(r, 1);
  145|       |#endif
  146|  31.4M|  return bit;
  147|  31.4M|}
detokenize.c:aom_read_literal_:
  158|  76.2k|static inline int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
  159|  76.2k|  int literal = 0, bit;
  160|       |
  161|   188k|  for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
  ------------------
  |  |   43|   112k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  |  Branch (161:24): [True: 112k, False: 76.2k]
  ------------------
  162|       |#if CONFIG_ACCOUNTING
  163|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  164|       |#endif
  165|  76.2k|  return literal;
  166|  76.2k|}
detokenize.c:aom_read_bit_:
  149|   112k|static inline int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
  150|   112k|  int ret;
  151|   112k|  ret = aom_read(r, 128, NULL);  // aom_prob_half
  ------------------
  |  |   41|   112k|  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  152|       |#if CONFIG_ACCOUNTING
  153|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  154|       |#endif
  155|   112k|  return ret;
  156|   112k|}
detokenize.c:aom_read_:
  104|   112k|static inline int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
  105|   112k|  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
  106|   112k|  int bit = od_ec_decode_bool_q15(&r->ec, p);
  107|       |
  108|       |#if CONFIG_BITSTREAM_DEBUG
  109|       |  {
  110|       |    int i;
  111|       |    int ref_bit, ref_nsymbs;
  112|       |    aom_cdf_prob ref_cdf[16];
  113|       |    const int queue_r = bitstream_queue_get_read();
  114|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  115|       |    bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
  116|       |    if (ref_nsymbs != 2) {
  117|       |      fprintf(stderr,
  118|       |              "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
  119|       |              "%d queue_r %d\n",
  120|       |              frame_idx, 2, ref_nsymbs, queue_r);
  121|       |      assert(0);
  122|       |    }
  123|       |    if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
  124|       |        (ref_cdf[1] != 32767)) {
  125|       |      fprintf(stderr,
  126|       |              "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
  127|       |              frame_idx, p, 32767, ref_cdf[0]);
  128|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  129|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  130|       |      assert(0);
  131|       |    }
  132|       |    if (bit != ref_bit) {
  133|       |      fprintf(stderr,
  134|       |              "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
  135|       |              "queue_r %d\n",
  136|       |              frame_idx, bit, ref_bit, queue_r);
  137|       |      assert(0);
  138|       |    }
  139|       |  }
  140|       |#endif
  141|       |
  142|       |#if CONFIG_ACCOUNTING
  143|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  144|       |  aom_update_symb_counts(r, 1);
  145|       |#endif
  146|   112k|  return bit;
  147|   112k|}
detokenize.c:aom_read_symbol_:
  221|  10.8M|                                   int nsymbs ACCT_STR_PARAM) {
  222|  10.8M|  int ret;
  223|  10.8M|  ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
  ------------------
  |  |   49|  10.8M|  aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  224|  10.8M|  if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs);
  ------------------
  |  Branch (224:7): [True: 10.1M, False: 693k]
  ------------------
  225|  10.8M|  return ret;
  226|  10.8M|}
detokenize.c:aom_read_cdf_:
  169|  10.8M|                                int nsymbs ACCT_STR_PARAM) {
  170|  10.8M|  int symb;
  171|  10.8M|  assert(cdf != NULL);
  172|  10.8M|  symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
  173|       |
  174|       |#if CONFIG_BITSTREAM_DEBUG
  175|       |  {
  176|       |    int i;
  177|       |    int cdf_error = 0;
  178|       |    int ref_symb, ref_nsymbs;
  179|       |    aom_cdf_prob ref_cdf[16];
  180|       |    const int queue_r = bitstream_queue_get_read();
  181|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  182|       |    bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
  183|       |    if (nsymbs != ref_nsymbs) {
  184|       |      fprintf(stderr,
  185|       |              "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d "
  186|       |              "queue_r %d\n",
  187|       |              frame_idx, nsymbs, ref_nsymbs, queue_r);
  188|       |      cdf_error = 0;
  189|       |      assert(0);
  190|       |    } else {
  191|       |      for (i = 0; i < nsymbs; ++i)
  192|       |        if (cdf[i] != ref_cdf[i]) cdf_error = 1;
  193|       |    }
  194|       |    if (cdf_error) {
  195|       |      fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx,
  196|       |              cdf[0]);
  197|       |      for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]);
  198|       |      fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]);
  199|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  200|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  201|       |      assert(0);
  202|       |    }
  203|       |    if (symb != ref_symb) {
  204|       |      fprintf(
  205|       |          stderr,
  206|       |          "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
  207|       |          frame_idx, symb, ref_symb, queue_r);
  208|       |      assert(0);
  209|       |    }
  210|       |  }
  211|       |#endif
  212|       |
  213|       |#if CONFIG_ACCOUNTING
  214|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  215|       |  aom_update_symb_counts(r, (nsymbs == 2));
  216|       |#endif
  217|  10.8M|  return symb;
  218|  10.8M|}
binary_codes_reader.c:aom_read_bit_:
  149|   199k|static inline int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
  150|   199k|  int ret;
  151|   199k|  ret = aom_read(r, 128, NULL);  // aom_prob_half
  ------------------
  |  |   41|   199k|  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  152|       |#if CONFIG_ACCOUNTING
  153|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  154|       |#endif
  155|   199k|  return ret;
  156|   199k|}
binary_codes_reader.c:aom_read_:
  104|   199k|static inline int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
  105|   199k|  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
  106|   199k|  int bit = od_ec_decode_bool_q15(&r->ec, p);
  107|       |
  108|       |#if CONFIG_BITSTREAM_DEBUG
  109|       |  {
  110|       |    int i;
  111|       |    int ref_bit, ref_nsymbs;
  112|       |    aom_cdf_prob ref_cdf[16];
  113|       |    const int queue_r = bitstream_queue_get_read();
  114|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  115|       |    bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
  116|       |    if (ref_nsymbs != 2) {
  117|       |      fprintf(stderr,
  118|       |              "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
  119|       |              "%d queue_r %d\n",
  120|       |              frame_idx, 2, ref_nsymbs, queue_r);
  121|       |      assert(0);
  122|       |    }
  123|       |    if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
  124|       |        (ref_cdf[1] != 32767)) {
  125|       |      fprintf(stderr,
  126|       |              "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
  127|       |              frame_idx, p, 32767, ref_cdf[0]);
  128|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  129|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  130|       |      assert(0);
  131|       |    }
  132|       |    if (bit != ref_bit) {
  133|       |      fprintf(stderr,
  134|       |              "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
  135|       |              "queue_r %d\n",
  136|       |              frame_idx, bit, ref_bit, queue_r);
  137|       |      assert(0);
  138|       |    }
  139|       |  }
  140|       |#endif
  141|       |
  142|       |#if CONFIG_ACCOUNTING
  143|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  144|       |  aom_update_symb_counts(r, 1);
  145|       |#endif
  146|   199k|  return bit;
  147|   199k|}
binary_codes_reader.c:aom_read_literal_:
  158|  41.8k|static inline int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
  159|  41.8k|  int literal = 0, bit;
  160|       |
  161|   172k|  for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
  ------------------
  |  |   43|   130k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  |  Branch (161:24): [True: 130k, False: 41.8k]
  ------------------
  162|       |#if CONFIG_ACCOUNTING
  163|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  164|       |#endif
  165|  41.8k|  return literal;
  166|  41.8k|}

aom_rb_bytes_read:
   21|  26.3k|size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb) {
   22|  26.3k|  return (rb->bit_offset + 7) >> 3;
   23|  26.3k|}
aom_rb_read_bit:
   25|  7.84M|int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
   26|  7.84M|  const uint32_t off = rb->bit_offset;
   27|  7.84M|  const uint32_t p = off >> 3;
   28|  7.84M|  const int q = 7 - (int)(off & 0x7);
   29|  7.84M|  if (rb->bit_buffer + p < rb->bit_buffer_end) {
  ------------------
  |  Branch (29:7): [True: 7.78M, False: 64.3k]
  ------------------
   30|  7.78M|    const int bit = (rb->bit_buffer[p] >> q) & 1;
   31|  7.78M|    rb->bit_offset = off + 1;
   32|  7.78M|    return bit;
   33|  7.78M|  } else {
   34|  64.3k|    if (rb->error_handler) rb->error_handler(rb->error_handler_data);
  ------------------
  |  Branch (34:9): [True: 471, False: 63.9k]
  ------------------
   35|  64.3k|    return 0;
   36|  64.3k|  }
   37|  7.84M|}
aom_rb_read_literal:
   39|  1.12M|int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
   40|  1.12M|  assert(bits <= 31);
   41|  1.12M|  int value = 0, bit;
   42|  6.75M|  for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit;
  ------------------
  |  Branch (42:24): [True: 5.62M, False: 1.12M]
  ------------------
   43|  1.12M|  return value;
   44|  1.12M|}
aom_rb_read_unsigned_literal:
   48|  3.49k|                                      int bits) {
   49|  3.49k|  assert(bits <= 32);
   50|  3.49k|  uint32_t value = 0;
   51|  3.49k|  int bit;
   52|  73.5k|  for (bit = bits - 1; bit >= 0; bit--)
  ------------------
  |  Branch (52:24): [True: 70.0k, False: 3.49k]
  ------------------
   53|  70.0k|    value |= (uint32_t)aom_rb_read_bit(rb) << bit;
   54|  3.49k|  return value;
   55|  3.49k|}
aom_rb_read_inv_signed_literal:
   57|  65.3k|int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
   58|  65.3k|  const int nbits = sizeof(unsigned) * 8 - bits - 1;
   59|  65.3k|  const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits;
   60|  65.3k|  return ((int)value) >> nbits;
   61|  65.3k|}
aom_rb_read_uvlc:
   64|    288|uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) {
   65|    288|  int leading_zeros = 0;
   66|  1.01k|  while (leading_zeros < 32 && !aom_rb_read_bit(rb)) ++leading_zeros;
  ------------------
  |  Branch (66:10): [True: 1.01k, False: 0]
  |  Branch (66:32): [True: 727, False: 288]
  ------------------
   67|       |  // Maximum 32 bits.
   68|    288|  if (leading_zeros == 32) return UINT32_MAX;  // Error.
  ------------------
  |  Branch (68:7): [True: 0, False: 288]
  ------------------
   69|    288|  const uint32_t base = (1u << leading_zeros) - 1;
   70|    288|  const uint32_t value = aom_rb_read_literal(rb, leading_zeros);
   71|    288|  return base + value;
   72|    288|}
aom_rb_read_signed_primitive_refsubexpfin:
  116|  9.21k|    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) {
  117|  9.21k|  ref += n - 1;
  118|  9.21k|  const uint16_t scaled_n = (n << 1) - 1;
  119|  9.21k|  return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1;
  120|  9.21k|}
bitreader_buffer.c:aom_rb_read_primitive_refsubexpfin:
  110|  9.21k|    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, uint16_t ref) {
  111|  9.21k|  return inv_recenter_finite_nonneg(n, ref,
  112|  9.21k|                                    aom_rb_read_primitive_subexpfin(rb, n, k));
  113|  9.21k|}
bitreader_buffer.c:aom_rb_read_primitive_subexpfin:
   85|  9.21k|                                                uint16_t n, uint16_t k) {
   86|  9.21k|  int i = 0;
   87|  9.21k|  int mk = 0;
   88|       |
   89|  17.5k|  while (1) {
  ------------------
  |  Branch (89:10): [True: 17.4k, Folded]
  ------------------
   90|  17.4k|    int b = (i ? k + i - 1 : k);
  ------------------
  |  Branch (90:14): [True: 8.27k, False: 9.21k]
  ------------------
   91|  17.4k|    int a = (1 << b);
   92|       |
   93|  17.4k|    if (n <= mk + 3 * a) {
  ------------------
  |  Branch (93:9): [True: 66, False: 17.4k]
  ------------------
   94|     66|      return aom_rb_read_primitive_quniform(rb, n - mk) + mk;
   95|     66|    }
   96|       |
   97|  17.4k|    if (!aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (97:9): [True: 9.13k, False: 8.29k]
  ------------------
   98|  9.13k|      return aom_rb_read_literal(rb, b) + mk;
   99|  9.13k|    }
  100|       |
  101|  8.29k|    i = i + 1;
  102|  8.29k|    mk += a;
  103|  8.29k|  }
  104|       |
  105|  9.21k|  assert(0);
  106|     17|  return 0;
  107|  9.21k|}
bitreader_buffer.c:aom_rb_read_primitive_quniform:
   76|     66|                                               uint16_t n) {
   77|     66|  if (n <= 1) return 0;
  ------------------
  |  Branch (77:7): [True: 0, False: 66]
  ------------------
   78|     66|  const int l = get_msb(n) + 1;
   79|     66|  const int m = (1 << l) - n;
   80|     66|  const int v = aom_rb_read_literal(rb, l - 1);
   81|     66|  return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb);
  ------------------
  |  Branch (81:10): [True: 29, False: 37]
  ------------------
   82|     66|}

aom_highbd_blend_a64_d16_mask_c:
  128|     40|    ConvolveParams *conv_params, const int bd) {
  129|     40|  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  ------------------
  |  |   21|     40|#define FILTER_BITS 7
  ------------------
  130|     40|  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
  131|     40|                           (1 << (offset_bits - conv_params->round_1 - 1));
  132|     40|  const int round_bits =
  133|     40|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|     40|#define FILTER_BITS 7
  ------------------
  134|     40|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
  ------------------
  |  |   75|     40|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  135|       |
  136|     40|  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  137|     40|  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
  138|       |
  139|     40|  assert(h >= 1);
  140|     40|  assert(w >= 1);
  141|     40|  assert(IS_POWER_OF_TWO(h));
  142|     40|  assert(IS_POWER_OF_TWO(w));
  143|       |
  144|       |  // excerpt from clip_pixel_highbd()
  145|       |  // set saturation_value to (1 << bd) - 1
  146|     40|  unsigned int saturation_value;
  147|     40|  switch (bd) {
  148|      0|    case 8:
  ------------------
  |  Branch (148:5): [True: 0, False: 40]
  ------------------
  149|      0|    default: saturation_value = 255; break;
  ------------------
  |  Branch (149:5): [True: 0, False: 40]
  ------------------
  150|      0|    case 10: saturation_value = 1023; break;
  ------------------
  |  Branch (150:5): [True: 0, False: 40]
  ------------------
  151|     40|    case 12: saturation_value = 4095; break;
  ------------------
  |  Branch (151:5): [True: 40, False: 0]
  ------------------
  152|     40|  }
  153|       |
  154|     40|  if (subw == 0 && subh == 0) {
  ------------------
  |  Branch (154:7): [True: 0, False: 40]
  |  Branch (154:20): [True: 0, False: 0]
  ------------------
  155|      0|    for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (155:21): [True: 0, False: 0]
  ------------------
  156|      0|      for (int j = 0; j < w; ++j) {
  ------------------
  |  Branch (156:23): [True: 0, False: 0]
  ------------------
  157|      0|        int32_t res;
  158|      0|        const int m = mask[j];
  159|      0|        res = ((m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
  ------------------
  |  |   24|      0|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|      0|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  160|      0|               AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|      0|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  161|      0|        res -= round_offset;
  162|      0|        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  163|      0|        dst[j] = AOMMIN(v, saturation_value);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  164|      0|      }
  165|      0|      mask += mask_stride;
  166|      0|      src0 += src0_stride;
  167|      0|      src1 += src1_stride;
  168|      0|      dst += dst_stride;
  169|      0|    }
  170|     40|  } else if (subw == 1 && subh == 1) {
  ------------------
  |  Branch (170:14): [True: 40, False: 0]
  |  Branch (170:27): [True: 0, False: 40]
  ------------------
  171|      0|    for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (171:21): [True: 0, False: 0]
  ------------------
  172|      0|      for (int j = 0; j < w; ++j) {
  ------------------
  |  Branch (172:23): [True: 0, False: 0]
  ------------------
  173|      0|        int32_t res;
  174|      0|        const int m = ROUND_POWER_OF_TWO(
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  175|      0|            mask[2 * j] + mask[mask_stride + 2 * j] + mask[2 * j + 1] +
  176|      0|                mask[mask_stride + 2 * j + 1],
  177|      0|            2);
  178|      0|        res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
  ------------------
  |  |   24|      0|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|      0|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  179|      0|              AOM_BLEND_A64_ROUND_BITS;
  ------------------
  |  |   23|      0|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  180|      0|        res -= round_offset;
  181|      0|        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  182|      0|        dst[j] = AOMMIN(v, saturation_value);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  183|      0|      }
  184|      0|      mask += 2 * mask_stride;
  185|      0|      src0 += src0_stride;
  186|      0|      src1 += src1_stride;
  187|      0|      dst += dst_stride;
  188|      0|    }
  189|     40|  } else if (subw == 1 && subh == 0) {
  ------------------
  |  Branch (189:14): [True: 40, False: 0]
  |  Branch (189:27): [True: 40, False: 0]
  ------------------
  190|    616|    for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (190:21): [True: 576, False: 40]
  ------------------
  191|  10.0k|      for (int j = 0; j < w; ++j) {
  ------------------
  |  Branch (191:23): [True: 9.47k, False: 576]
  ------------------
  192|  9.47k|        int32_t res;
  193|  9.47k|        const int m = AOM_BLEND_AVG(mask[2 * j], mask[2 * j + 1]);
  ------------------
  |  |   40|  9.47k|#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
  |  |  ------------------
  |  |  |  |   41|  9.47k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  194|  9.47k|        res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
  ------------------
  |  |   24|  9.47k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  9.47k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  195|  9.47k|              AOM_BLEND_A64_ROUND_BITS;
  ------------------
  |  |   23|  9.47k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  196|  9.47k|        res -= round_offset;
  197|  9.47k|        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
  ------------------
  |  |   41|  9.47k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  198|  9.47k|        dst[j] = AOMMIN(v, saturation_value);
  ------------------
  |  |   34|  9.47k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 9.47k, False: 0]
  |  |  ------------------
  ------------------
  199|  9.47k|      }
  200|    576|      mask += mask_stride;
  201|    576|      src0 += src0_stride;
  202|    576|      src1 += src1_stride;
  203|    576|      dst += dst_stride;
  204|    576|    }
  205|     40|  } else {
  206|      0|    for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (206:21): [True: 0, False: 0]
  ------------------
  207|      0|      for (int j = 0; j < w; ++j) {
  ------------------
  |  Branch (207:23): [True: 0, False: 0]
  ------------------
  208|      0|        int32_t res;
  209|      0|        const int m = AOM_BLEND_AVG(mask[j], mask[mask_stride + j]);
  ------------------
  |  |   40|      0|#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  210|      0|        res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
  ------------------
  |  |   24|      0|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|      0|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  211|      0|              AOM_BLEND_A64_ROUND_BITS;
  ------------------
  |  |   23|      0|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  212|      0|        res -= round_offset;
  213|      0|        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  214|      0|        dst[j] = AOMMIN(v, saturation_value);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  215|      0|      }
  216|      0|      mask += 2 * mask_stride;
  217|      0|      src0 += src0_stride;
  218|      0|      src1 += src1_stride;
  219|      0|      dst += dst_stride;
  220|      0|    }
  221|      0|  }
  222|     40|}
aom_blend_a64_mask_c:
  233|  5.60k|                          int h, int subw, int subh) {
  234|  5.60k|  int i, j;
  235|       |
  236|  5.60k|  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  237|  5.60k|  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
  238|       |
  239|  5.60k|  assert(h >= 1);
  240|  5.60k|  assert(w >= 1);
  241|  5.60k|  assert(IS_POWER_OF_TWO(h));
  242|  5.60k|  assert(IS_POWER_OF_TWO(w));
  243|       |
  244|  5.60k|  if (subw == 0 && subh == 0) {
  ------------------
  |  Branch (244:7): [True: 5.60k, False: 0]
  |  Branch (244:20): [True: 5.60k, False: 0]
  ------------------
  245|  34.9k|    for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (245:17): [True: 29.3k, False: 5.60k]
  ------------------
  246|  88.0k|      for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (246:19): [True: 58.7k, False: 29.3k]
  ------------------
  247|  58.7k|        const int m = mask[i * mask_stride + j];
  248|  58.7k|        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|  58.7k|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|  58.7k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|  58.7k|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
  249|  58.7k|                                                src1[i * src1_stride + j]);
  250|  58.7k|      }
  251|  29.3k|    }
  252|  5.60k|  } else if (subw == 1 && subh == 1) {
  ------------------
  |  Branch (252:14): [True: 0, False: 0]
  |  Branch (252:27): [True: 0, False: 0]
  ------------------
  253|      0|    for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (253:17): [True: 0, False: 0]
  ------------------
  254|      0|      for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (254:19): [True: 0, False: 0]
  ------------------
  255|      0|        const int m = ROUND_POWER_OF_TWO(
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  256|      0|            mask[(2 * i) * mask_stride + (2 * j)] +
  257|      0|                mask[(2 * i + 1) * mask_stride + (2 * j)] +
  258|      0|                mask[(2 * i) * mask_stride + (2 * j + 1)] +
  259|      0|                mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
  260|      0|            2);
  261|      0|        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|      0|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|      0|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
  262|      0|                                                src1[i * src1_stride + j]);
  263|      0|      }
  264|      0|    }
  265|      0|  } else if (subw == 1 && subh == 0) {
  ------------------
  |  Branch (265:14): [True: 0, False: 0]
  |  Branch (265:27): [True: 0, False: 0]
  ------------------
  266|      0|    for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (266:17): [True: 0, False: 0]
  ------------------
  267|      0|      for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (267:19): [True: 0, False: 0]
  ------------------
  268|      0|        const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
  ------------------
  |  |   40|      0|#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  269|      0|                                    mask[i * mask_stride + (2 * j + 1)]);
  270|      0|        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|      0|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|      0|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
  271|      0|                                                src1[i * src1_stride + j]);
  272|      0|      }
  273|      0|    }
  274|      0|  } else {
  275|      0|    for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (275:17): [True: 0, False: 0]
  ------------------
  276|      0|      for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (276:19): [True: 0, False: 0]
  ------------------
  277|      0|        const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
  ------------------
  |  |   40|      0|#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  278|      0|                                    mask[(2 * i + 1) * mask_stride + j]);
  279|      0|        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|      0|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|      0|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
  280|      0|                                                src1[i * src1_stride + j]);
  281|      0|      }
  282|      0|    }
  283|      0|  }
  284|  5.60k|}
aom_highbd_blend_a64_mask_c:
  291|  1.44k|                                 int w, int h, int subw, int subh, int bd) {
  292|  1.44k|  int i, j;
  293|  1.44k|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
  ------------------
  |  |   75|  1.44k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  294|  1.44k|  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
  ------------------
  |  |   75|  1.44k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  295|  1.44k|  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
  ------------------
  |  |   75|  1.44k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  296|  1.44k|  (void)bd;
  297|       |
  298|  1.44k|  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  299|  1.44k|  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
  300|       |
  301|  1.44k|  assert(h >= 1);
  302|  1.44k|  assert(w >= 1);
  303|  1.44k|  assert(IS_POWER_OF_TWO(h));
  304|  1.44k|  assert(IS_POWER_OF_TWO(w));
  305|       |
  306|  1.44k|  assert(bd == 8 || bd == 10 || bd == 12);
  307|       |
  308|  1.44k|  if (subw == 0 && subh == 0) {
  ------------------
  |  Branch (308:7): [True: 1.44k, False: 0]
  |  Branch (308:20): [True: 1.44k, False: 0]
  ------------------
  309|  9.28k|    for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (309:17): [True: 7.84k, False: 1.44k]
  ------------------
  310|  23.5k|      for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (310:19): [True: 15.6k, False: 7.84k]
  ------------------
  311|  15.6k|        const int m = mask[i * mask_stride + j];
  312|  15.6k|        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|  15.6k|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|  15.6k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|  15.6k|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
  313|  15.6k|                                                src1[i * src1_stride + j]);
  314|  15.6k|      }
  315|  7.84k|    }
  316|  1.44k|  } else if (subw == 1 && subh == 1) {
  ------------------
  |  Branch (316:14): [True: 0, False: 0]
  |  Branch (316:27): [True: 0, False: 0]
  ------------------
  317|      0|    for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (317:17): [True: 0, False: 0]
  ------------------
  318|      0|      for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (318:19): [True: 0, False: 0]
  ------------------
  319|      0|        const int m = ROUND_POWER_OF_TWO(
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  320|      0|            mask[(2 * i) * mask_stride + (2 * j)] +
  321|      0|                mask[(2 * i + 1) * mask_stride + (2 * j)] +
  322|      0|                mask[(2 * i) * mask_stride + (2 * j + 1)] +
  323|      0|                mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
  324|      0|            2);
  325|      0|        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|      0|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|      0|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
  326|      0|                                                src1[i * src1_stride + j]);
  327|      0|      }
  328|      0|    }
  329|      0|  } else if (subw == 1 && subh == 0) {
  ------------------
  |  Branch (329:14): [True: 0, False: 0]
  |  Branch (329:27): [True: 0, False: 0]
  ------------------
  330|      0|    for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (330:17): [True: 0, False: 0]
  ------------------
  331|      0|      for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (331:19): [True: 0, False: 0]
  ------------------
  332|      0|        const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
  ------------------
  |  |   40|      0|#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  333|      0|                                    mask[i * mask_stride + (2 * j + 1)]);
  334|      0|        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|      0|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|      0|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
  335|      0|                                                src1[i * src1_stride + j]);
  336|      0|      }
  337|      0|    }
  338|      0|  } else {
  339|      0|    for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (339:17): [True: 0, False: 0]
  ------------------
  340|      0|      for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (340:19): [True: 0, False: 0]
  ------------------
  341|      0|        const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
  ------------------
  |  |   40|      0|#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  342|      0|                                    mask[(2 * i + 1) * mask_stride + j]);
  343|      0|        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|      0|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|      0|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
  344|      0|                                                src1[i * src1_stride + j]);
  345|      0|      }
  346|      0|    }
  347|      0|  }
  348|  1.44k|}

aom_highbd_blend_a64_vmask_c:
   48|     68|                                  const uint8_t *mask, int w, int h, int bd) {
   49|     68|  int i, j;
   50|     68|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
  ------------------
  |  |   75|     68|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   51|     68|  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
  ------------------
  |  |   75|     68|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   52|     68|  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
  ------------------
  |  |   75|     68|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   53|     68|  (void)bd;
   54|       |
   55|     68|  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
   56|     68|  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
   57|       |
   58|     68|  assert(h >= 1);
   59|     68|  assert(w >= 1);
   60|     68|  assert(IS_POWER_OF_TWO(h));
   61|     68|  assert(IS_POWER_OF_TWO(w));
   62|       |
   63|     68|  assert(bd == 8 || bd == 10 || bd == 12);
   64|       |
   65|    204|  for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (65:15): [True: 136, False: 68]
  ------------------
   66|    136|    const int m = mask[i];
   67|  2.02k|    for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (67:17): [True: 1.88k, False: 136]
  ------------------
   68|  1.88k|      dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|  1.88k|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|  1.88k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|  1.88k|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
   69|  1.88k|                                              src1[i * src1_stride + j]);
   70|  1.88k|    }
   71|    136|  }
   72|     68|}

od_ec_dec_init:
  144|  27.3k|                    uint32_t storage) {
  145|  27.3k|  dec->buf = buf;
  146|  27.3k|  dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
  ------------------
  |  |   28|  27.3k|#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
  ------------------
  147|  27.3k|  dec->end = buf + storage;
  148|  27.3k|  dec->bptr = buf;
  149|       |  dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1;
  ------------------
  |  |   28|  27.3k|#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
  ------------------
  150|  27.3k|  dec->rng = 0x8000;
  151|  27.3k|  dec->cnt = -15;
  152|  27.3k|  od_ec_dec_refill(dec);
  153|  27.3k|}
od_ec_decode_bool_q15:
  158|  33.6M|int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) {
  159|  33.6M|  od_ec_window dif;
  160|  33.6M|  od_ec_window vw;
  161|  33.6M|  unsigned r;
  162|  33.6M|  unsigned r_new;
  163|  33.6M|  unsigned v;
  164|  33.6M|  int ret;
  165|  33.6M|  assert(0 < f);
  166|  33.6M|  assert(f < 32768U);
  167|  33.6M|  dif = dec->dif;
  168|  33.6M|  r = dec->rng;
  169|  33.6M|  assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
  170|  33.6M|  assert(32768U <= r);
  171|  33.6M|  v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
  ------------------
  |  |   20|  33.6M|#define EC_PROB_SHIFT 6
  ------------------
                v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
  ------------------
  |  |   20|  33.6M|#define EC_PROB_SHIFT 6
  ------------------
  172|  33.6M|  v += EC_MIN_PROB;
  ------------------
  |  |   21|  33.6M|#define EC_MIN_PROB 4  // must be <= (1<<EC_PROB_SHIFT)/16
  ------------------
  173|  33.6M|  vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
  ------------------
  |  |   28|  33.6M|#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
  ------------------
  174|  33.6M|  ret = 1;
  175|  33.6M|  r_new = v;
  176|  33.6M|  if (dif >= vw) {
  ------------------
  |  Branch (176:7): [True: 17.0M, False: 16.5M]
  ------------------
  177|  17.0M|    r_new = r - v;
  178|  17.0M|    dif -= vw;
  179|  17.0M|    ret = 0;
  180|  17.0M|  }
  181|  33.6M|  return od_ec_dec_normalize(dec, dif, r_new, ret);
  182|  33.6M|}
od_ec_decode_cdf_q15:
  193|   104M|int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) {
  194|   104M|  od_ec_window dif;
  195|   104M|  unsigned r;
  196|   104M|  unsigned c;
  197|   104M|  unsigned u;
  198|   104M|  unsigned v;
  199|   104M|  int ret;
  200|   104M|  (void)nsyms;
  201|   104M|  dif = dec->dif;
  202|   104M|  r = dec->rng;
  203|   104M|  const int N = nsyms - 1;
  204|       |
  205|   104M|  assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
  206|   104M|  assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
  207|   104M|  assert(32768U <= r);
  208|   104M|  assert(7 - EC_PROB_SHIFT >= 0);
  209|   104M|  c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
  ------------------
  |  |   28|   104M|#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
  ------------------
  210|   104M|  v = r;
  211|   104M|  ret = -1;
  212|   206M|  do {
  213|   206M|    u = v;
  214|   206M|    v = ((r >> 8) * (uint32_t)(icdf[++ret] >> EC_PROB_SHIFT) >>
  ------------------
  |  |   20|   206M|#define EC_PROB_SHIFT 6
  ------------------
  215|   206M|         (7 - EC_PROB_SHIFT));
  ------------------
  |  |   20|   206M|#define EC_PROB_SHIFT 6
  ------------------
  216|   206M|    v += EC_MIN_PROB * (N - ret);
  ------------------
  |  |   21|   206M|#define EC_MIN_PROB 4  // must be <= (1<<EC_PROB_SHIFT)/16
  ------------------
  217|   206M|  } while (c < v);
  ------------------
  |  Branch (217:12): [True: 102M, False: 104M]
  ------------------
  218|   104M|  assert(v < u);
  219|   104M|  assert(u <= r);
  220|   104M|  r = u - v;
  221|       |  dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
  ------------------
  |  |   28|   104M|#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
  ------------------
  222|   104M|  return od_ec_dec_normalize(dec, dif, r, ret);
  223|   104M|}
od_ec_dec_tell:
  231|   187k|int od_ec_dec_tell(const od_ec_dec *dec) {
  232|       |  /*There is a window of bits stored in dec->dif. The difference
  233|       |     (dec->bptr - dec->buf) tells us how many bytes have been read into this
  234|       |     window. The difference (dec->cnt - dec->tell_offs) tells us how many of
  235|       |     the bits in that window remain unconsumed.*/
  236|   187k|  return (int)((dec->bptr - dec->buf) * 8 - dec->cnt + dec->tell_offs);
  237|   187k|}
entdec.c:od_ec_dec_refill:
   78|  8.79M|static void od_ec_dec_refill(od_ec_dec *dec) {
   79|  8.79M|  int s;
   80|  8.79M|  od_ec_window dif;
   81|  8.79M|  int16_t cnt;
   82|  8.79M|  const unsigned char *bptr;
   83|  8.79M|  const unsigned char *end;
   84|  8.79M|  dif = dec->dif;
   85|  8.79M|  cnt = dec->cnt;
   86|  8.79M|  bptr = dec->bptr;
   87|  8.79M|  end = dec->end;
   88|  8.79M|  s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15);
  ------------------
  |  |   28|  8.79M|#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
  ------------------
   89|  26.3M|  for (; s >= 0 && bptr < end; s -= 8, bptr++) {
  ------------------
  |  Branch (89:10): [True: 17.6M, False: 8.77M]
  |  Branch (89:20): [True: 17.6M, False: 13.7k]
  ------------------
   90|       |    /*Each time a byte is inserted into the window (dif), bptr advances and cnt
   91|       |       is incremented by 8, so the total number of consumed bits (the return
   92|       |       value of od_ec_dec_tell) does not change.*/
   93|  17.6M|    assert(s <= OD_EC_WINDOW_SIZE - 8);
   94|  17.6M|    dif ^= (od_ec_window)bptr[0] << s;
   95|  17.6M|    cnt += 8;
   96|  17.6M|  }
   97|  8.79M|  if (bptr >= end) {
  ------------------
  |  Branch (97:7): [True: 21.9k, False: 8.76M]
  ------------------
   98|       |    /*We've reached the end of the buffer. It is perfectly valid for us to need
   99|       |       to fill the window with additional bits past the end of the buffer (and
  100|       |       this happens in normal operation). These bits should all just be taken
  101|       |       as zero. But we cannot increment bptr past 'end' (this is undefined
  102|       |       behavior), so we start to increment dec->tell_offs. We also don't want
  103|       |       to keep testing bptr against 'end', so we set cnt to OD_EC_LOTS_OF_BITS
  104|       |       and adjust dec->tell_offs so that the total number of unconsumed bits in
  105|       |       the window (dec->cnt - dec->tell_offs) does not change. This effectively
  106|       |       puts lots of zero bits into the window, and means we won't try to refill
  107|       |       it from the buffer for a very long time (at which point we'll put lots
  108|       |       of zero bits into the window again).*/
  109|  21.9k|    dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt;
  ------------------
  |  |   74|  21.9k|#define OD_EC_LOTS_OF_BITS (0x4000)
  ------------------
  110|  21.9k|    cnt = OD_EC_LOTS_OF_BITS;
  ------------------
  |  |   74|  21.9k|#define OD_EC_LOTS_OF_BITS (0x4000)
  ------------------
  111|  21.9k|  }
  112|  8.79M|  dec->dif = dif;
  113|  8.79M|  dec->cnt = cnt;
  114|  8.79M|  dec->bptr = bptr;
  115|  8.79M|}
entdec.c:od_ec_dec_normalize:
  126|   136M|                               int ret) {
  127|   136M|  int d;
  128|   136M|  assert(rng <= 65535U);
  129|       |  /*The number of leading zeros in the 16-bit binary representation of rng.*/
  130|   136M|  d = 16 - OD_ILOG_NZ(rng);
  ------------------
  |  |   50|   136M|#define OD_ILOG_NZ(x) (1 + get_msb(x))
  ------------------
  131|       |  /*d bits in dec->dif are consumed.*/
  132|   136M|  dec->cnt -= d;
  133|       |  /*This is equivalent to shifting in 1's instead of 0's.*/
  134|   136M|  dec->dif = ((dif + 1) << d) - 1;
  135|   136M|  dec->rng = rng << d;
  136|   136M|  if (dec->cnt < 0) od_ec_dec_refill(dec);
  ------------------
  |  Branch (136:7): [True: 8.76M, False: 127M]
  ------------------
  137|   136M|  return ret;
  138|   136M|}

av1_invalidate_corner_list:
  168|  29.0k|void av1_invalidate_corner_list(CornerList *corners) {
  169|  29.0k|  if (corners) {
  ------------------
  |  Branch (169:7): [True: 0, False: 29.0k]
  ------------------
  170|      0|#if CONFIG_MULTITHREAD
  171|      0|    pthread_mutex_lock(&corners->mutex);
  172|      0|#endif  // CONFIG_MULTITHREAD
  173|       |    corners->valid = false;
  174|      0|#if CONFIG_MULTITHREAD
  175|      0|    pthread_mutex_unlock(&corners->mutex);
  176|      0|#endif  // CONFIG_MULTITHREAD
  177|      0|  }
  178|  29.0k|}

aom_highbd_dc_predictor_4x16_c:
  632|  12.5k|                                    int bd) {
  633|  12.5k|  highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2,
  634|  12.5k|                           HIGHBD_DC_MULTIPLIER_1X4);
  ------------------
  |  |  584|  12.5k|#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
  ------------------
  635|  12.5k|}
aom_highbd_dc_predictor_16x4_c:
  639|  30.0k|                                    int bd) {
  640|  30.0k|  highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2,
  641|  30.0k|                           HIGHBD_DC_MULTIPLIER_1X4);
  ------------------
  |  |  584|  30.0k|#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
  ------------------
  642|  30.0k|}
aom_highbd_dc_predictor_8x32_c:
  662|  6.37k|                                    int bd) {
  663|  6.37k|  highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3,
  664|  6.37k|                           HIGHBD_DC_MULTIPLIER_1X4);
  ------------------
  |  |  584|  6.37k|#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
  ------------------
  665|  6.37k|}
aom_highbd_dc_predictor_32x8_c:
  669|  10.2k|                                    int bd) {
  670|  10.2k|  highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3,
  671|  10.2k|                           HIGHBD_DC_MULTIPLIER_1X4);
  ------------------
  |  |  584|  10.2k|#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
  ------------------
  672|  10.2k|}
aom_highbd_dc_predictor_16x64_c:
  692|    499|                                     const uint16_t *left, int bd) {
  693|    499|  highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4,
  694|    499|                           HIGHBD_DC_MULTIPLIER_1X4);
  ------------------
  |  |  584|    499|#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
  ------------------
  695|    499|}
aom_highbd_dc_predictor_64x16_c:
  699|    554|                                     const uint16_t *left, int bd) {
  700|    554|  highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4,
  701|    554|                           HIGHBD_DC_MULTIPLIER_1X4);
  ------------------
  |  |  584|    554|#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
  ------------------
  702|    554|}
aom_highbd_dc_predictor_32x64_c:
  707|    686|                                     const uint16_t *left, int bd) {
  708|    686|  highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5,
  709|    686|                           HIGHBD_DC_MULTIPLIER_1X2);
  ------------------
  |  |  581|    686|#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
  ------------------
  710|    686|}
aom_highbd_dc_predictor_64x32_c:
  714|  1.37k|                                     const uint16_t *left, int bd) {
  715|  1.37k|  highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5,
  716|  1.37k|                           HIGHBD_DC_MULTIPLIER_1X2);
  ------------------
  |  |  581|  1.37k|#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
  ------------------
  717|  1.37k|}
aom_highbd_v_predictor_64x64_c:
  737|     62|      const uint16_t *left, int bd) {                                       \
  738|     62|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     62|  }
aom_highbd_v_predictor_32x64_c:
  737|     47|      const uint16_t *left, int bd) {                                       \
  738|     47|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     47|  }
aom_highbd_v_predictor_64x32_c:
  737|     73|      const uint16_t *left, int bd) {                                       \
  738|     73|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     73|  }
aom_highbd_v_predictor_4x16_c:
  737|  1.34k|      const uint16_t *left, int bd) {                                       \
  738|  1.34k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.34k|  }
aom_highbd_v_predictor_16x4_c:
  737|  2.72k|      const uint16_t *left, int bd) {                                       \
  738|  2.72k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  2.72k|  }
aom_highbd_v_predictor_8x32_c:
  737|    474|      const uint16_t *left, int bd) {                                       \
  738|    474|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    474|  }
aom_highbd_v_predictor_32x8_c:
  737|    889|      const uint16_t *left, int bd) {                                       \
  738|    889|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    889|  }
aom_highbd_v_predictor_16x64_c:
  737|    100|      const uint16_t *left, int bd) {                                       \
  738|    100|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    100|  }
aom_highbd_v_predictor_64x16_c:
  737|    160|      const uint16_t *left, int bd) {                                       \
  738|    160|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    160|  }
aom_highbd_h_predictor_64x64_c:
  737|     95|      const uint16_t *left, int bd) {                                       \
  738|     95|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     95|  }
aom_highbd_h_predictor_32x64_c:
  737|     88|      const uint16_t *left, int bd) {                                       \
  738|     88|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     88|  }
aom_highbd_h_predictor_64x32_c:
  737|     80|      const uint16_t *left, int bd) {                                       \
  738|     80|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     80|  }
aom_highbd_h_predictor_4x16_c:
  737|  1.80k|      const uint16_t *left, int bd) {                                       \
  738|  1.80k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.80k|  }
aom_highbd_h_predictor_16x4_c:
  737|  4.34k|      const uint16_t *left, int bd) {                                       \
  738|  4.34k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  4.34k|  }
aom_highbd_h_predictor_8x32_c:
  737|    965|      const uint16_t *left, int bd) {                                       \
  738|    965|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    965|  }
aom_highbd_h_predictor_32x8_c:
  737|  1.61k|      const uint16_t *left, int bd) {                                       \
  738|  1.61k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.61k|  }
aom_highbd_h_predictor_16x64_c:
  737|    129|      const uint16_t *left, int bd) {                                       \
  738|    129|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    129|  }
aom_highbd_h_predictor_64x16_c:
  737|    132|      const uint16_t *left, int bd) {                                       \
  738|    132|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    132|  }
aom_highbd_smooth_predictor_4x4_c:
  737|  43.6k|      const uint16_t *left, int bd) {                                       \
  738|  43.6k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  43.6k|  }
aom_highbd_smooth_predictor_8x8_c:
  737|  43.0k|      const uint16_t *left, int bd) {                                       \
  738|  43.0k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  43.0k|  }
aom_highbd_smooth_predictor_16x16_c:
  737|  17.2k|      const uint16_t *left, int bd) {                                       \
  738|  17.2k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  17.2k|  }
aom_highbd_smooth_predictor_32x32_c:
  737|  5.47k|      const uint16_t *left, int bd) {                                       \
  738|  5.47k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  5.47k|  }
aom_highbd_smooth_predictor_64x64_c:
  737|    455|      const uint16_t *left, int bd) {                                       \
  738|    455|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    455|  }
aom_highbd_smooth_predictor_4x8_c:
  737|  8.02k|      const uint16_t *left, int bd) {                                       \
  738|  8.02k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  8.02k|  }
aom_highbd_smooth_predictor_8x4_c:
  737|  12.7k|      const uint16_t *left, int bd) {                                       \
  738|  12.7k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  12.7k|  }
aom_highbd_smooth_predictor_8x16_c:
  737|  7.77k|      const uint16_t *left, int bd) {                                       \
  738|  7.77k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  7.77k|  }
aom_highbd_smooth_predictor_16x8_c:
  737|  10.9k|      const uint16_t *left, int bd) {                                       \
  738|  10.9k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  10.9k|  }
aom_highbd_smooth_predictor_16x32_c:
  737|  2.66k|      const uint16_t *left, int bd) {                                       \
  738|  2.66k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  2.66k|  }
aom_highbd_smooth_predictor_32x16_c:
  737|  2.56k|      const uint16_t *left, int bd) {                                       \
  738|  2.56k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  2.56k|  }
aom_highbd_smooth_predictor_32x64_c:
  737|    173|      const uint16_t *left, int bd) {                                       \
  738|    173|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    173|  }
aom_highbd_smooth_predictor_64x32_c:
  737|    177|      const uint16_t *left, int bd) {                                       \
  738|    177|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    177|  }
aom_highbd_smooth_predictor_4x16_c:
  737|  3.98k|      const uint16_t *left, int bd) {                                       \
  738|  3.98k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  3.98k|  }
aom_highbd_smooth_predictor_16x4_c:
  737|  10.1k|      const uint16_t *left, int bd) {                                       \
  738|  10.1k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  10.1k|  }
aom_highbd_smooth_predictor_8x32_c:
  737|  2.42k|      const uint16_t *left, int bd) {                                       \
  738|  2.42k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  2.42k|  }
aom_highbd_smooth_predictor_32x8_c:
  737|  3.06k|      const uint16_t *left, int bd) {                                       \
  738|  3.06k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  3.06k|  }
aom_highbd_smooth_predictor_16x64_c:
  737|    301|      const uint16_t *left, int bd) {                                       \
  738|    301|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    301|  }
aom_highbd_smooth_predictor_64x16_c:
  737|    234|      const uint16_t *left, int bd) {                                       \
  738|    234|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    234|  }
aom_highbd_smooth_v_predictor_4x4_c:
  737|  14.1k|      const uint16_t *left, int bd) {                                       \
  738|  14.1k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  14.1k|  }
aom_highbd_smooth_v_predictor_8x8_c:
  737|  12.8k|      const uint16_t *left, int bd) {                                       \
  738|  12.8k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  12.8k|  }
aom_highbd_smooth_v_predictor_16x16_c:
  737|  3.98k|      const uint16_t *left, int bd) {                                       \
  738|  3.98k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  3.98k|  }
aom_highbd_smooth_v_predictor_32x32_c:
  737|  1.47k|      const uint16_t *left, int bd) {                                       \
  738|  1.47k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.47k|  }
aom_highbd_smooth_v_predictor_64x64_c:
  737|    163|      const uint16_t *left, int bd) {                                       \
  738|    163|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    163|  }
aom_highbd_smooth_v_predictor_4x8_c:
  737|  2.35k|      const uint16_t *left, int bd) {                                       \
  738|  2.35k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  2.35k|  }
aom_highbd_smooth_v_predictor_8x4_c:
  737|  3.58k|      const uint16_t *left, int bd) {                                       \
  738|  3.58k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  3.58k|  }
aom_highbd_smooth_v_predictor_8x16_c:
  737|  2.25k|      const uint16_t *left, int bd) {                                       \
  738|  2.25k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  2.25k|  }
aom_highbd_smooth_v_predictor_16x8_c:
  737|  3.13k|      const uint16_t *left, int bd) {                                       \
  738|  3.13k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  3.13k|  }
aom_highbd_smooth_v_predictor_16x32_c:
  737|    647|      const uint16_t *left, int bd) {                                       \
  738|    647|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    647|  }
aom_highbd_smooth_v_predictor_32x16_c:
  737|    822|      const uint16_t *left, int bd) {                                       \
  738|    822|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    822|  }
aom_highbd_smooth_v_predictor_32x64_c:
  737|     47|      const uint16_t *left, int bd) {                                       \
  738|     47|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     47|  }
aom_highbd_smooth_v_predictor_64x32_c:
  737|     69|      const uint16_t *left, int bd) {                                       \
  738|     69|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     69|  }
aom_highbd_smooth_v_predictor_4x16_c:
  737|  1.29k|      const uint16_t *left, int bd) {                                       \
  738|  1.29k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.29k|  }
aom_highbd_smooth_v_predictor_16x4_c:
  737|  2.98k|      const uint16_t *left, int bd) {                                       \
  738|  2.98k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  2.98k|  }
aom_highbd_smooth_v_predictor_8x32_c:
  737|    688|      const uint16_t *left, int bd) {                                       \
  738|    688|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    688|  }
aom_highbd_smooth_v_predictor_32x8_c:
  737|  1.10k|      const uint16_t *left, int bd) {                                       \
  738|  1.10k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.10k|  }
aom_highbd_smooth_v_predictor_16x64_c:
  737|     61|      const uint16_t *left, int bd) {                                       \
  738|     61|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     61|  }
aom_highbd_smooth_v_predictor_64x16_c:
  737|    180|      const uint16_t *left, int bd) {                                       \
  738|    180|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    180|  }
aom_highbd_smooth_h_predictor_4x4_c:
  737|  17.6k|      const uint16_t *left, int bd) {                                       \
  738|  17.6k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  17.6k|  }
aom_highbd_smooth_h_predictor_8x8_c:
  737|  17.4k|      const uint16_t *left, int bd) {                                       \
  738|  17.4k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  17.4k|  }
aom_highbd_smooth_h_predictor_16x16_c:
  737|  6.73k|      const uint16_t *left, int bd) {                                       \
  738|  6.73k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  6.73k|  }
aom_highbd_smooth_h_predictor_32x32_c:
  737|  3.19k|      const uint16_t *left, int bd) {                                       \
  738|  3.19k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  3.19k|  }
aom_highbd_smooth_h_predictor_64x64_c:
  737|    153|      const uint16_t *left, int bd) {                                       \
  738|    153|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    153|  }
aom_highbd_smooth_h_predictor_4x8_c:
  737|  3.10k|      const uint16_t *left, int bd) {                                       \
  738|  3.10k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  3.10k|  }
aom_highbd_smooth_h_predictor_8x4_c:
  737|  5.10k|      const uint16_t *left, int bd) {                                       \
  738|  5.10k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  5.10k|  }
aom_highbd_smooth_h_predictor_8x16_c:
  737|  2.80k|      const uint16_t *left, int bd) {                                       \
  738|  2.80k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  2.80k|  }
aom_highbd_smooth_h_predictor_16x8_c:
  737|  4.41k|      const uint16_t *left, int bd) {                                       \
  738|  4.41k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  4.41k|  }
aom_highbd_smooth_h_predictor_16x32_c:
  737|  1.11k|      const uint16_t *left, int bd) {                                       \
  738|  1.11k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.11k|  }
aom_highbd_smooth_h_predictor_32x16_c:
  737|  1.03k|      const uint16_t *left, int bd) {                                       \
  738|  1.03k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.03k|  }
aom_highbd_smooth_h_predictor_32x64_c:
  737|     43|      const uint16_t *left, int bd) {                                       \
  738|     43|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     43|  }
aom_highbd_smooth_h_predictor_64x32_c:
  737|     78|      const uint16_t *left, int bd) {                                       \
  738|     78|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     78|  }
aom_highbd_smooth_h_predictor_4x16_c:
  737|  1.75k|      const uint16_t *left, int bd) {                                       \
  738|  1.75k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.75k|  }
aom_highbd_smooth_h_predictor_16x4_c:
  737|  4.56k|      const uint16_t *left, int bd) {                                       \
  738|  4.56k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  4.56k|  }
aom_highbd_smooth_h_predictor_8x32_c:
  737|  1.09k|      const uint16_t *left, int bd) {                                       \
  738|  1.09k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.09k|  }
aom_highbd_smooth_h_predictor_32x8_c:
  737|  1.48k|      const uint16_t *left, int bd) {                                       \
  738|  1.48k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.48k|  }
aom_highbd_smooth_h_predictor_16x64_c:
  737|    108|      const uint16_t *left, int bd) {                                       \
  738|    108|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    108|  }
aom_highbd_smooth_h_predictor_64x16_c:
  737|    148|      const uint16_t *left, int bd) {                                       \
  738|    148|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    148|  }
aom_highbd_paeth_predictor_4x4_c:
  737|  28.1k|      const uint16_t *left, int bd) {                                       \
  738|  28.1k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  28.1k|  }
aom_highbd_paeth_predictor_8x8_c:
  737|  18.4k|      const uint16_t *left, int bd) {                                       \
  738|  18.4k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  18.4k|  }
aom_highbd_paeth_predictor_16x16_c:
  737|  7.78k|      const uint16_t *left, int bd) {                                       \
  738|  7.78k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  7.78k|  }
aom_highbd_paeth_predictor_32x32_c:
  737|  3.41k|      const uint16_t *left, int bd) {                                       \
  738|  3.41k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  3.41k|  }
aom_highbd_paeth_predictor_64x64_c:
  737|    139|      const uint16_t *left, int bd) {                                       \
  738|    139|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    139|  }
aom_highbd_paeth_predictor_4x8_c:
  737|  4.58k|      const uint16_t *left, int bd) {                                       \
  738|  4.58k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  4.58k|  }
aom_highbd_paeth_predictor_8x4_c:
  737|  6.94k|      const uint16_t *left, int bd) {                                       \
  738|  6.94k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  6.94k|  }
aom_highbd_paeth_predictor_8x16_c:
  737|  4.53k|      const uint16_t *left, int bd) {                                       \
  738|  4.53k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  4.53k|  }
aom_highbd_paeth_predictor_16x8_c:
  737|  5.79k|      const uint16_t *left, int bd) {                                       \
  738|  5.79k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  5.79k|  }
aom_highbd_paeth_predictor_16x32_c:
  737|  1.23k|      const uint16_t *left, int bd) {                                       \
  738|  1.23k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.23k|  }
aom_highbd_paeth_predictor_32x16_c:
  737|  1.21k|      const uint16_t *left, int bd) {                                       \
  738|  1.21k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.21k|  }
aom_highbd_paeth_predictor_32x64_c:
  737|     59|      const uint16_t *left, int bd) {                                       \
  738|     59|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     59|  }
aom_highbd_paeth_predictor_64x32_c:
  737|     81|      const uint16_t *left, int bd) {                                       \
  738|     81|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     81|  }
aom_highbd_paeth_predictor_4x16_c:
  737|  2.49k|      const uint16_t *left, int bd) {                                       \
  738|  2.49k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  2.49k|  }
aom_highbd_paeth_predictor_16x4_c:
  737|  5.46k|      const uint16_t *left, int bd) {                                       \
  738|  5.46k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  5.46k|  }
aom_highbd_paeth_predictor_8x32_c:
  737|  1.35k|      const uint16_t *left, int bd) {                                       \
  738|  1.35k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.35k|  }
aom_highbd_paeth_predictor_32x8_c:
  737|  1.88k|      const uint16_t *left, int bd) {                                       \
  738|  1.88k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.88k|  }
aom_highbd_paeth_predictor_16x64_c:
  737|    196|      const uint16_t *left, int bd) {                                       \
  738|    196|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    196|  }
aom_highbd_paeth_predictor_64x16_c:
  737|    113|      const uint16_t *left, int bd) {                                       \
  738|    113|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    113|  }
aom_highbd_dc_128_predictor_64x64_c:
  737|  2.24k|      const uint16_t *left, int bd) {                                       \
  738|  2.24k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  2.24k|  }
aom_highbd_dc_128_predictor_32x64_c:
  737|     67|      const uint16_t *left, int bd) {                                       \
  738|     67|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     67|  }
aom_highbd_dc_128_predictor_64x32_c:
  737|    290|      const uint16_t *left, int bd) {                                       \
  738|    290|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    290|  }
aom_highbd_dc_128_predictor_4x16_c:
  737|      6|      const uint16_t *left, int bd) {                                       \
  738|      6|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|      6|  }
aom_highbd_dc_128_predictor_16x4_c:
  737|     13|      const uint16_t *left, int bd) {                                       \
  738|     13|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     13|  }
aom_highbd_dc_128_predictor_8x32_c:
  737|     68|      const uint16_t *left, int bd) {                                       \
  738|     68|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     68|  }
aom_highbd_dc_128_predictor_32x8_c:
  737|     73|      const uint16_t *left, int bd) {                                       \
  738|     73|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     73|  }
aom_highbd_dc_128_predictor_16x64_c:
  737|     86|      const uint16_t *left, int bd) {                                       \
  738|     86|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     86|  }
aom_highbd_dc_128_predictor_64x16_c:
  737|     14|      const uint16_t *left, int bd) {                                       \
  738|     14|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     14|  }
aom_highbd_dc_left_predictor_64x64_c:
  737|  1.40k|      const uint16_t *left, int bd) {                                       \
  738|  1.40k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.40k|  }
aom_highbd_dc_left_predictor_32x64_c:
  737|    692|      const uint16_t *left, int bd) {                                       \
  738|    692|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    692|  }
aom_highbd_dc_left_predictor_64x32_c:
  737|    119|      const uint16_t *left, int bd) {                                       \
  738|    119|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    119|  }
aom_highbd_dc_left_predictor_4x16_c:
  737|    679|      const uint16_t *left, int bd) {                                       \
  738|    679|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    679|  }
aom_highbd_dc_left_predictor_16x4_c:
  737|    820|      const uint16_t *left, int bd) {                                       \
  738|    820|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    820|  }
aom_highbd_dc_left_predictor_8x32_c:
  737|  1.13k|      const uint16_t *left, int bd) {                                       \
  738|  1.13k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.13k|  }
aom_highbd_dc_left_predictor_32x8_c:
  737|    695|      const uint16_t *left, int bd) {                                       \
  738|    695|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    695|  }
aom_highbd_dc_left_predictor_16x64_c:
  737|    325|      const uint16_t *left, int bd) {                                       \
  738|    325|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    325|  }
aom_highbd_dc_left_predictor_64x16_c:
  737|    131|      const uint16_t *left, int bd) {                                       \
  738|    131|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    131|  }
aom_highbd_dc_top_predictor_64x64_c:
  737|  1.38k|      const uint16_t *left, int bd) {                                       \
  738|  1.38k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.38k|  }
aom_highbd_dc_top_predictor_32x64_c:
  737|    198|      const uint16_t *left, int bd) {                                       \
  738|    198|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    198|  }
aom_highbd_dc_top_predictor_64x32_c:
  737|  1.41k|      const uint16_t *left, int bd) {                                       \
  738|  1.41k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.41k|  }
aom_highbd_dc_top_predictor_4x16_c:
  737|    416|      const uint16_t *left, int bd) {                                       \
  738|    416|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    416|  }
aom_highbd_dc_top_predictor_16x4_c:
  737|    809|      const uint16_t *left, int bd) {                                       \
  738|    809|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    809|  }
aom_highbd_dc_top_predictor_8x32_c:
  737|    602|      const uint16_t *left, int bd) {                                       \
  738|    602|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    602|  }
aom_highbd_dc_top_predictor_32x8_c:
  737|    810|      const uint16_t *left, int bd) {                                       \
  738|    810|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    810|  }
aom_highbd_dc_top_predictor_16x64_c:
  737|     57|      const uint16_t *left, int bd) {                                       \
  738|     57|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|     57|  }
aom_highbd_dc_top_predictor_64x16_c:
  737|    164|      const uint16_t *left, int bd) {                                       \
  738|    164|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    164|  }
aom_highbd_dc_predictor_64x64_c:
  737|  1.42k|      const uint16_t *left, int bd) {                                       \
  738|  1.42k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.42k|  }
intrapred.c:divide_using_multiply_shift:
  237|  62.3k|                                              int multiplier, int shift2) {
  238|  62.3k|  const int interm = num >> shift1;
  239|  62.3k|  return interm * multiplier >> shift2;
  240|  62.3k|}
intrapred.c:highbd_dc_predictor_rect:
  592|  62.3k|                                            int shift1, uint32_t multiplier) {
  593|  62.3k|  int sum = 0;
  594|  62.3k|  (void)bd;
  595|       |
  596|  1.12M|  for (int i = 0; i < bw; i++) {
  ------------------
  |  Branch (596:19): [True: 1.06M, False: 62.3k]
  ------------------
  597|  1.06M|    sum += above[i];
  598|  1.06M|  }
  599|   798k|  for (int i = 0; i < bh; i++) {
  ------------------
  |  Branch (599:19): [True: 736k, False: 62.3k]
  ------------------
  600|   736k|    sum += left[i];
  601|   736k|  }
  602|       |
  603|  62.3k|  const int expected_dc = divide_using_multiply_shift(
  604|  62.3k|      sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2);
  ------------------
  |  |  586|  62.3k|#define HIGHBD_DC_SHIFT2 17
  ------------------
  605|  62.3k|  assert(expected_dc < (1 << bd));
  606|       |
  607|   798k|  for (int r = 0; r < bh; r++) {
  ------------------
  |  Branch (607:19): [True: 736k, False: 62.3k]
  ------------------
  608|   736k|    aom_memset16(dst, expected_dc, bw);
  609|   736k|    dst += stride;
  610|   736k|  }
  611|  62.3k|}
intrapred.c:highbd_v_predictor:
  373|  5.87k|                                      const uint16_t *left, int bd) {
  374|  5.87k|  int r;
  375|  5.87k|  (void)left;
  376|  5.87k|  (void)bd;
  377|  78.8k|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (377:15): [True: 72.9k, False: 5.87k]
  ------------------
  378|  72.9k|    memcpy(dst, above, bw * sizeof(uint16_t));
  379|  72.9k|    dst += stride;
  380|  72.9k|  }
  381|  5.87k|}
intrapred.c:highbd_h_predictor:
  385|  9.25k|                                      const uint16_t *left, int bd) {
  386|  9.25k|  int r;
  387|  9.25k|  (void)above;
  388|  9.25k|  (void)bd;
  389|   123k|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (389:15): [True: 114k, False: 9.25k]
  ------------------
  390|   114k|    aom_memset16(dst, left[r], bw);
  391|   114k|    dst += stride;
  392|   114k|  }
  393|  9.25k|}
intrapred.c:highbd_smooth_predictor:
  412|   175k|                                           const uint16_t *left, int bd) {
  413|   175k|  (void)bd;
  414|   175k|  const uint16_t below_pred = left[bh - 1];   // estimated by bottom-left pixel
  415|   175k|  const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
  416|   175k|  const uint8_t *const sm_weights_w = smooth_weights + bw - 4;
  417|   175k|  const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
  418|       |  // scale = 2 * 2^SMOOTH_WEIGHT_LOG2_SCALE
  419|   175k|  const int log2_scale = 1 + SMOOTH_WEIGHT_LOG2_SCALE;
  ------------------
  |  |   19|   175k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  420|   175k|  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|   175k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  421|   175k|  sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
  ------------------
  |  |   76|   175k|  assert(weights_w[0] < weights_scale);                               \
  |  |   77|   175k|  assert(weights_h[0] < weights_scale);                               \
  |  |   78|   175k|  assert(weights_scale - weights_w[bw - 1] < weights_scale);          \
  |  |   79|   175k|  assert(weights_scale - weights_h[bh - 1] < weights_scale);          \
  |  |   80|   175k|  assert(pred_scale < 31)  // ensures no overflow when calculating predictor.
  ------------------
  422|   175k|                           log2_scale + sizeof(*dst));
  423|   175k|  int r;
  424|  1.87M|  for (r = 0; r < bh; ++r) {
  ------------------
  |  Branch (424:15): [True: 1.69M, False: 175k]
  ------------------
  425|  1.69M|    int c;
  426|  26.3M|    for (c = 0; c < bw; ++c) {
  ------------------
  |  Branch (426:17): [True: 24.6M, False: 1.69M]
  ------------------
  427|  24.6M|      const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred };
  428|  24.6M|      const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
  429|  24.6M|                                  sm_weights_w[c], scale - sm_weights_w[c] };
  430|  24.6M|      uint32_t this_pred = 0;
  431|  24.6M|      int i;
  432|  24.6M|      assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
  433|   123M|      for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (433:19): [True: 98.5M, False: 24.6M]
  ------------------
  434|  98.5M|        this_pred += weights[i] * pixels[i];
  435|  98.5M|      }
  436|  24.6M|      dst[c] = divide_round(this_pred, log2_scale);
  ------------------
  |  |   82|  24.6M|#define divide_round(value, bits) (((value) + (1 << ((bits) - 1))) >> (bits))
  ------------------
  437|  24.6M|    }
  438|  1.69M|    dst += stride;
  439|  1.69M|  }
  440|   175k|}
intrapred.c:highbd_smooth_v_predictor:
  445|  51.9k|                                             const uint16_t *left, int bd) {
  446|  51.9k|  (void)bd;
  447|  51.9k|  const uint16_t below_pred = left[bh - 1];  // estimated by bottom-left pixel
  448|  51.9k|  const uint8_t *const sm_weights = smooth_weights + bh - 4;
  449|       |  // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
  450|  51.9k|  const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
  ------------------
  |  |   19|  51.9k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  451|  51.9k|  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  51.9k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  452|  51.9k|  sm_weights_sanity_checks(sm_weights, sm_weights, scale,
  ------------------
  |  |   76|  51.9k|  assert(weights_w[0] < weights_scale);                               \
  |  |   77|  51.9k|  assert(weights_h[0] < weights_scale);                               \
  |  |   78|  51.9k|  assert(weights_scale - weights_w[bw - 1] < weights_scale);          \
  |  |   79|  51.9k|  assert(weights_scale - weights_h[bh - 1] < weights_scale);          \
  |  |   80|  51.9k|  assert(pred_scale < 31)  // ensures no overflow when calculating predictor.
  ------------------
  453|  51.9k|                           log2_scale + sizeof(*dst));
  454|       |
  455|  51.9k|  int r;
  456|   536k|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (456:15): [True: 485k, False: 51.9k]
  ------------------
  457|   485k|    int c;
  458|  7.58M|    for (c = 0; c < bw; ++c) {
  ------------------
  |  Branch (458:17): [True: 7.10M, False: 485k]
  ------------------
  459|  7.10M|      const uint16_t pixels[] = { above[c], below_pred };
  460|  7.10M|      const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };
  461|  7.10M|      uint32_t this_pred = 0;
  462|  7.10M|      assert(scale >= sm_weights[r]);
  463|  7.10M|      int i;
  464|  21.3M|      for (i = 0; i < 2; ++i) {
  ------------------
  |  Branch (464:19): [True: 14.2M, False: 7.10M]
  ------------------
  465|  14.2M|        this_pred += weights[i] * pixels[i];
  466|  14.2M|      }
  467|  7.10M|      dst[c] = divide_round(this_pred, log2_scale);
  ------------------
  |  |   82|  7.10M|#define divide_round(value, bits) (((value) + (1 << ((bits) - 1))) >> (bits))
  ------------------
  468|  7.10M|    }
  469|   485k|    dst += stride;
  470|   485k|  }
  471|  51.9k|}
intrapred.c:highbd_smooth_h_predictor:
  476|  72.0k|                                             const uint16_t *left, int bd) {
  477|  72.0k|  (void)bd;
  478|  72.0k|  const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
  479|  72.0k|  const uint8_t *const sm_weights = smooth_weights + bw - 4;
  480|       |  // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
  481|  72.0k|  const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
  ------------------
  |  |   19|  72.0k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  482|  72.0k|  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  72.0k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  483|  72.0k|  sm_weights_sanity_checks(sm_weights, sm_weights, scale,
  ------------------
  |  |   76|  72.0k|  assert(weights_w[0] < weights_scale);                               \
  |  |   77|  72.0k|  assert(weights_h[0] < weights_scale);                               \
  |  |   78|  72.0k|  assert(weights_scale - weights_w[bw - 1] < weights_scale);          \
  |  |   79|  72.0k|  assert(weights_scale - weights_h[bh - 1] < weights_scale);          \
  |  |   80|  72.0k|  assert(pred_scale < 31)  // ensures no overflow when calculating predictor.
  ------------------
  484|  72.0k|                           log2_scale + sizeof(*dst));
  485|       |
  486|  72.0k|  int r;
  487|   787k|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (487:15): [True: 715k, False: 72.0k]
  ------------------
  488|   715k|    int c;
  489|  11.5M|    for (c = 0; c < bw; ++c) {
  ------------------
  |  Branch (489:17): [True: 10.8M, False: 715k]
  ------------------
  490|  10.8M|      const uint16_t pixels[] = { left[r], right_pred };
  491|  10.8M|      const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };
  492|  10.8M|      uint32_t this_pred = 0;
  493|  10.8M|      assert(scale >= sm_weights[c]);
  494|  10.8M|      int i;
  495|  32.6M|      for (i = 0; i < 2; ++i) {
  ------------------
  |  Branch (495:19): [True: 21.7M, False: 10.8M]
  ------------------
  496|  21.7M|        this_pred += weights[i] * pixels[i];
  497|  21.7M|      }
  498|  10.8M|      dst[c] = divide_round(this_pred, log2_scale);
  ------------------
  |  |   82|  10.8M|#define divide_round(value, bits) (((value) + (1 << ((bits) - 1))) >> (bits))
  ------------------
  499|  10.8M|    }
  500|   715k|    dst += stride;
  501|   715k|  }
  502|  72.0k|}
intrapred.c:paeth_predictor_single:
   48|  12.5M|                                              uint16_t top_left) {
   49|  12.5M|  const int base = top + left - top_left;
   50|  12.5M|  const int p_left = abs_diff(base, left);
   51|  12.5M|  const int p_top = abs_diff(base, top);
   52|  12.5M|  const int p_top_left = abs_diff(base, top_left);
   53|       |
   54|       |  // Return nearest to base of left, top and top_left.
   55|  12.5M|  return (p_left <= p_top && p_left <= p_top_left) ? left
  ------------------
  |  Branch (55:11): [True: 8.64M, False: 3.92M]
  |  Branch (55:30): [True: 7.74M, False: 904k]
  ------------------
   56|  12.5M|         : (p_top <= p_top_left)                   ? top
  ------------------
  |  Branch (56:12): [True: 3.68M, False: 1.14M]
  ------------------
   57|  4.83M|                                                   : top_left;
   58|  12.5M|}
intrapred.c:abs_diff:
   45|  37.7M|static inline int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; }
  ------------------
  |  Branch (45:51): [True: 11.5M, False: 26.2M]
  ------------------
intrapred.c:highbd_paeth_predictor:
  397|  93.9k|                                          const uint16_t *left, int bd) {
  398|  93.9k|  int r, c;
  399|  93.9k|  const uint16_t ytop_left = above[-1];
  400|  93.9k|  (void)bd;
  401|       |
  402|   979k|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (402:15): [True: 885k, False: 93.9k]
  ------------------
  403|  13.4M|    for (c = 0; c < bw; c++)
  ------------------
  |  Branch (403:17): [True: 12.5M, False: 885k]
  ------------------
  404|  12.5M|      dst[c] = paeth_predictor_single(left[r], above[c], ytop_left);
  405|   885k|    dst += stride;
  406|   885k|  }
  407|  93.9k|}
intrapred.c:highbd_dc_128_predictor:
  507|  2.85k|                                           const uint16_t *left, int bd) {
  508|  2.85k|  int r;
  509|  2.85k|  (void)above;
  510|  2.85k|  (void)left;
  511|       |
  512|   168k|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (512:15): [True: 165k, False: 2.85k]
  ------------------
  513|   165k|    aom_memset16(dst, 128 << (bd - 8), bw);
  514|   165k|    dst += stride;
  515|   165k|  }
  516|  2.85k|}
intrapred.c:highbd_dc_left_predictor:
  521|  5.99k|                                            const uint16_t *left, int bd) {
  522|  5.99k|  int i, r, expected_dc, sum = 0;
  523|  5.99k|  (void)above;
  524|  5.99k|  (void)bd;
  525|       |
  526|   222k|  for (i = 0; i < bh; i++) sum += left[i];
  ------------------
  |  Branch (526:15): [True: 216k, False: 5.99k]
  ------------------
  527|  5.99k|  expected_dc = (sum + (bh >> 1)) / bh;
  528|       |
  529|   222k|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (529:15): [True: 216k, False: 5.99k]
  ------------------
  530|   216k|    aom_memset16(dst, expected_dc, bw);
  531|   216k|    dst += stride;
  532|   216k|  }
  533|  5.99k|}
intrapred.c:highbd_dc_top_predictor:
  538|  5.85k|                                           const uint16_t *left, int bd) {
  539|  5.85k|  int i, r, expected_dc, sum = 0;
  540|  5.85k|  (void)left;
  541|  5.85k|  (void)bd;
  542|       |
  543|   248k|  for (i = 0; i < bw; i++) sum += above[i];
  ------------------
  |  Branch (543:15): [True: 242k, False: 5.85k]
  ------------------
  544|  5.85k|  expected_dc = (sum + (bw >> 1)) / bw;
  545|       |
  546|   194k|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (546:15): [True: 188k, False: 5.85k]
  ------------------
  547|   188k|    aom_memset16(dst, expected_dc, bw);
  548|   188k|    dst += stride;
  549|   188k|  }
  550|  5.85k|}
intrapred.c:highbd_dc_predictor:
  554|  1.42k|                                       const uint16_t *left, int bd) {
  555|  1.42k|  int i, r, expected_dc, sum = 0;
  556|  1.42k|  const int count = bw + bh;
  557|  1.42k|  (void)bd;
  558|       |
  559|  92.8k|  for (i = 0; i < bw; i++) {
  ------------------
  |  Branch (559:15): [True: 91.3k, False: 1.42k]
  ------------------
  560|  91.3k|    sum += above[i];
  561|  91.3k|  }
  562|  92.8k|  for (i = 0; i < bh; i++) {
  ------------------
  |  Branch (562:15): [True: 91.3k, False: 1.42k]
  ------------------
  563|  91.3k|    sum += left[i];
  564|  91.3k|  }
  565|       |
  566|  1.42k|  expected_dc = (sum + (count >> 1)) / count;
  567|       |
  568|  92.8k|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (568:15): [True: 91.3k, False: 1.42k]
  ------------------
  569|  91.3k|    aom_memset16(dst, expected_dc, bw);
  570|  91.3k|    dst += stride;
  571|  91.3k|  }
  572|  1.42k|}

decodeframe.c:update_cdf:
  110|  1.75M|static inline void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) {
  111|  1.75M|  assert(nsymbs < 17);
  112|  1.75M|  const int count = cdf[nsymbs];
  113|       |
  114|       |  // rate is computed in the spec as:
  115|       |  //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
  116|       |  // In this case cdf[N] is |count|.
  117|       |  // Min(FloorLog2(N), 2) is 1 for nsymbs == {2, 3} and 2 for all
  118|       |  // nsymbs > 3. So the equation becomes:
  119|       |  //  4 + (count > 15) + (count > 31) + (nsymbs > 3).
  120|       |  // Note that the largest value for count is 32 (it is not incremented beyond
  121|       |  // 32). So using that information:
  122|       |  //  count >> 4 is 0 for count from 0 to 15.
  123|       |  //  count >> 4 is 1 for count from 16 to 31.
  124|       |  //  count >> 4 is 2 for count == 32.
  125|       |  // Now, the equation becomes:
  126|       |  //  4 + (count >> 4) + (nsymbs > 3).
  127|  1.75M|  const int rate = 4 + (count >> 4) + (nsymbs > 3);
  128|       |
  129|  1.75M|  int i = 0;
  130|  8.89M|  do {
  131|  8.89M|    if (i < val) {
  ------------------
  |  Branch (131:9): [True: 2.50M, False: 6.38M]
  ------------------
  132|  2.50M|      cdf[i] += (CDF_PROB_TOP - cdf[i]) >> rate;
  ------------------
  |  |   33|  2.50M|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  ------------------
  |  |  |  |   32|  2.50M|#define CDF_PROB_BITS 15
  |  |  ------------------
  ------------------
  133|  6.38M|    } else {
  134|  6.38M|      cdf[i] -= cdf[i] >> rate;
  135|  6.38M|    }
  136|  8.89M|  } while (++i < nsymbs - 1);
  ------------------
  |  Branch (136:12): [True: 7.14M, False: 1.75M]
  ------------------
  137|  1.75M|  cdf[nsymbs] += (count < 32);
  138|  1.75M|}
decodemv.c:update_cdf:
  110|  9.12M|static inline void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) {
  111|  9.12M|  assert(nsymbs < 17);
  112|  9.12M|  const int count = cdf[nsymbs];
  113|       |
  114|       |  // rate is computed in the spec as:
  115|       |  //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
  116|       |  // In this case cdf[N] is |count|.
  117|       |  // Min(FloorLog2(N), 2) is 1 for nsymbs == {2, 3} and 2 for all
  118|       |  // nsymbs > 3. So the equation becomes:
  119|       |  //  4 + (count > 15) + (count > 31) + (nsymbs > 3).
  120|       |  // Note that the largest value for count is 32 (it is not incremented beyond
  121|       |  // 32). So using that information:
  122|       |  //  count >> 4 is 0 for count from 0 to 15.
  123|       |  //  count >> 4 is 1 for count from 16 to 31.
  124|       |  //  count >> 4 is 2 for count == 32.
  125|       |  // Now, the equation becomes:
  126|       |  //  4 + (count >> 4) + (nsymbs > 3).
  127|  9.12M|  const int rate = 4 + (count >> 4) + (nsymbs > 3);
  128|       |
  129|  9.12M|  int i = 0;
  130|  55.3M|  do {
  131|  55.3M|    if (i < val) {
  ------------------
  |  Branch (131:9): [True: 18.8M, False: 36.4M]
  ------------------
  132|  18.8M|      cdf[i] += (CDF_PROB_TOP - cdf[i]) >> rate;
  ------------------
  |  |   33|  18.8M|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  ------------------
  |  |  |  |   32|  18.8M|#define CDF_PROB_BITS 15
  |  |  ------------------
  ------------------
  133|  36.4M|    } else {
  134|  36.4M|      cdf[i] -= cdf[i] >> rate;
  135|  36.4M|    }
  136|  55.3M|  } while (++i < nsymbs - 1);
  ------------------
  |  Branch (136:12): [True: 46.2M, False: 9.12M]
  ------------------
  137|  9.12M|  cdf[nsymbs] += (count < 32);
  138|  9.12M|}
decodetxb.c:update_cdf:
  110|  77.6M|static inline void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) {
  111|  77.6M|  assert(nsymbs < 17);
  112|  77.6M|  const int count = cdf[nsymbs];
  113|       |
  114|       |  // rate is computed in the spec as:
  115|       |  //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
  116|       |  // In this case cdf[N] is |count|.
  117|       |  // Min(FloorLog2(N), 2) is 1 for nsymbs == {2, 3} and 2 for all
  118|       |  // nsymbs > 3. So the equation becomes:
  119|       |  //  4 + (count > 15) + (count > 31) + (nsymbs > 3).
  120|       |  // Note that the largest value for count is 32 (it is not incremented beyond
  121|       |  // 32). So using that information:
  122|       |  //  count >> 4 is 0 for count from 0 to 15.
  123|       |  //  count >> 4 is 1 for count from 16 to 31.
  124|       |  //  count >> 4 is 2 for count == 32.
  125|       |  // Now, the equation becomes:
  126|       |  //  4 + (count >> 4) + (nsymbs > 3).
  127|  77.6M|  const int rate = 4 + (count >> 4) + (nsymbs > 3);
  128|       |
  129|  77.6M|  int i = 0;
  130|   220M|  do {
  131|   220M|    if (i < val) {
  ------------------
  |  Branch (131:9): [True: 69.1M, False: 151M]
  ------------------
  132|  69.1M|      cdf[i] += (CDF_PROB_TOP - cdf[i]) >> rate;
  ------------------
  |  |   33|  69.1M|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  ------------------
  |  |  |  |   32|  69.1M|#define CDF_PROB_BITS 15
  |  |  ------------------
  ------------------
  133|   151M|    } else {
  134|   151M|      cdf[i] -= cdf[i] >> rate;
  135|   151M|    }
  136|   220M|  } while (++i < nsymbs - 1);
  ------------------
  |  Branch (136:12): [True: 143M, False: 77.6M]
  ------------------
  137|  77.6M|  cdf[nsymbs] += (count < 32);
  138|  77.6M|}
detokenize.c:update_cdf:
  110|  10.1M|static inline void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) {
  111|  10.1M|  assert(nsymbs < 17);
  112|  10.1M|  const int count = cdf[nsymbs];
  113|       |
  114|       |  // rate is computed in the spec as:
  115|       |  //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
  116|       |  // In this case cdf[N] is |count|.
  117|       |  // Min(FloorLog2(N), 2) is 1 for nsymbs == {2, 3} and 2 for all
  118|       |  // nsymbs > 3. So the equation becomes:
  119|       |  //  4 + (count > 15) + (count > 31) + (nsymbs > 3).
  120|       |  // Note that the largest value for count is 32 (it is not incremented beyond
  121|       |  // 32). So using that information:
  122|       |  //  count >> 4 is 0 for count from 0 to 15.
  123|       |  //  count >> 4 is 1 for count from 16 to 31.
  124|       |  //  count >> 4 is 2 for count == 32.
  125|       |  // Now, the equation becomes:
  126|       |  //  4 + (count >> 4) + (nsymbs > 3).
  127|  10.1M|  const int rate = 4 + (count >> 4) + (nsymbs > 3);
  128|       |
  129|  10.1M|  int i = 0;
  130|  30.9M|  do {
  131|  30.9M|    if (i < val) {
  ------------------
  |  Branch (131:9): [True: 5.99M, False: 24.9M]
  ------------------
  132|  5.99M|      cdf[i] += (CDF_PROB_TOP - cdf[i]) >> rate;
  ------------------
  |  |   33|  5.99M|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  ------------------
  |  |  |  |   32|  5.99M|#define CDF_PROB_BITS 15
  |  |  ------------------
  ------------------
  133|  24.9M|    } else {
  134|  24.9M|      cdf[i] -= cdf[i] >> rate;
  135|  24.9M|    }
  136|  30.9M|  } while (++i < nsymbs - 1);
  ------------------
  |  Branch (136:12): [True: 20.8M, False: 10.1M]
  ------------------
  137|  10.1M|  cdf[nsymbs] += (count < 32);
  138|  10.1M|}

aom_invalidate_pyramid:
  446|  29.0k|void aom_invalidate_pyramid(ImagePyramid *pyr) {
  447|  29.0k|  if (pyr) {
  ------------------
  |  Branch (447:7): [True: 0, False: 29.0k]
  ------------------
  448|      0|#if CONFIG_MULTITHREAD
  449|      0|    pthread_mutex_lock(&pyr->mutex);
  450|      0|#endif  // CONFIG_MULTITHREAD
  451|      0|    pyr->filled_levels = 0;
  452|      0|#if CONFIG_MULTITHREAD
  453|      0|    pthread_mutex_unlock(&pyr->mutex);
  454|      0|#endif  // CONFIG_MULTITHREAD
  455|      0|  }
  456|  29.0k|}

bitreader_buffer.c:inv_recenter_finite_nonneg:
   32|  9.13k|                                                  uint16_t v) {
   33|  9.13k|  if ((r << 1) <= n) {
  ------------------
  |  Branch (33:7): [True: 9.13k, False: 0]
  ------------------
   34|  9.13k|    return inv_recenter_nonneg(r, v);
   35|  9.13k|  } else {
   36|      0|    return n - 1 - inv_recenter_nonneg(n - 1 - r, v);
   37|      0|  }
   38|  9.13k|}
bitreader_buffer.c:inv_recenter_nonneg:
   20|  9.13k|static inline uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) {
   21|  9.13k|  if (v > (r << 1))
  ------------------
  |  Branch (21:7): [True: 0, False: 9.13k]
  ------------------
   22|      0|    return v;
   23|  9.13k|  else if ((v & 1) == 0)
  ------------------
  |  Branch (23:12): [True: 5.69k, False: 3.43k]
  ------------------
   24|  5.69k|    return (v >> 1) + r;
   25|  3.43k|  else
   26|  3.43k|    return r - ((v + 1) >> 1);
   27|  9.13k|}
binary_codes_reader.c:inv_recenter_finite_nonneg:
   32|  41.8k|                                                  uint16_t v) {
   33|  41.8k|  if ((r << 1) <= n) {
  ------------------
  |  Branch (33:7): [True: 30.8k, False: 11.0k]
  ------------------
   34|  30.8k|    return inv_recenter_nonneg(r, v);
   35|  30.8k|  } else {
   36|  11.0k|    return n - 1 - inv_recenter_nonneg(n - 1 - r, v);
   37|  11.0k|  }
   38|  41.8k|}
binary_codes_reader.c:inv_recenter_nonneg:
   20|  41.8k|static inline uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) {
   21|  41.8k|  if (v > (r << 1))
  ------------------
  |  Branch (21:7): [True: 3.50k, False: 38.3k]
  ------------------
   22|  3.50k|    return v;
   23|  38.3k|  else if ((v & 1) == 0)
  ------------------
  |  Branch (23:12): [True: 19.6k, False: 18.7k]
  ------------------
   24|  19.6k|    return (v >> 1) + r;
   25|  18.7k|  else
   26|  18.7k|    return r - ((v + 1) >> 1);
   27|  41.8k|}

cdef_block_avx2.c:v128_load_unaligned:
   46|  25.1M|SIMD_INLINE v128 v128_load_unaligned(const void *p) {
   47|  25.1M|#if defined(__SSSE3__)
   48|  25.1M|  return _mm_lddqu_si128((__m128i *)p);
   49|       |#else
   50|       |  return _mm_loadu_si128((__m128i *)p);
   51|       |#endif
   52|  25.1M|}
cdef_block_avx2.c:v128_sub_16:
  120|  12.7k|SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); }
cdef_block_avx2.c:v128_shr_s16:
  562|  12.7k|SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
  563|  12.7k|  return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
  564|  12.7k|}
cdef_block_avx2.c:v128_dup_16:
   86|  12.7k|SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); }
cdef_block_avx2.c:v128_add_16:
   98|   102k|SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }
cdef_block_avx2.c:v128_shuffle_8:
  300|  9.60k|SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
  301|  9.60k|#if defined(__SSSE3__)
  302|  9.60k|  return _mm_shuffle_epi8(x, pattern);
  303|       |#else
  304|       |  v128 output;
  305|       |  unsigned char *input = (unsigned char *)&x;
  306|       |  unsigned char *index = (unsigned char *)&pattern;
  307|       |  unsigned char *selected = (unsigned char *)&output;
  308|       |  int counter;
  309|       |
  310|       |  for (counter = 0; counter < 16; counter++) {
  311|       |    selected[counter] = input[index[counter] & 15];
  312|       |  }
  313|       |
  314|       |  return output;
  315|       |#endif
  316|  9.60k|}
cdef_block_avx2.c:v128_ziplo_16:
  155|  16.0k|SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) {
  156|  16.0k|  return _mm_unpacklo_epi16(b, a);
  157|  16.0k|}
cdef_block_avx2.c:v128_ziphi_16:
  159|  16.0k|SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) {
  160|  16.0k|  return _mm_unpackhi_epi16(b, a);
  161|  16.0k|}
cdef_block_avx2.c:v128_add_32:
  106|  19.2k|SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }
cdef_block_avx2.c:v128_from_32:
   38|  28.8k|SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
   39|  28.8k|  return _mm_set_epi32((int)a, (int)b, (int)c, (int)d);
   40|  28.8k|}
cdef_block_avx2.c:v128_madd_s16:
  425|  22.3k|SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }
cdef_block_avx2.c:v128_mullo_s32:
  406|  22.4k|SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
  407|  22.4k|#if defined(__SSE4_1__)
  408|  22.4k|  return _mm_mullo_epi32(a, b);
  409|       |#else
  410|       |  return _mm_unpacklo_epi32(
  411|       |      _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8),
  412|       |      _mm_shuffle_epi32(
  413|       |          _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8));
  414|       |#endif
  415|  22.4k|}
cdef_block_avx2.c:v128_dup_32:
   88|  3.20k|SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); }
cdef_block_avx2.c:v128_ziplo_32:
  163|  12.8k|SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) {
  164|  12.8k|  return _mm_unpacklo_epi32(b, a);
  165|  12.8k|}
cdef_block_avx2.c:v128_ziphi_32:
  167|  12.7k|SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) {
  168|  12.7k|  return _mm_unpackhi_epi32(b, a);
  169|  12.7k|}
cdef_block_avx2.c:v128_ziplo_64:
  171|  12.8k|SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
  172|  12.8k|  return _mm_unpacklo_epi64(b, a);
  173|  12.8k|}
cdef_block_avx2.c:v128_ziphi_64:
  175|  12.7k|SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
  176|  12.7k|  return _mm_unpackhi_epi64(b, a);
  177|  12.7k|}
cdef_block_avx2.c:v128_max_s32:
  503|  4.80k|SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) {
  504|  4.80k|#if defined(__SSE4_1__)
  505|  4.80k|  return _mm_max_epi32(a, b);
  506|       |#else
  507|       |  v128 mask = _mm_cmplt_epi32(b, a);
  508|       |  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
  509|       |#endif
  510|  4.80k|}
cdef_block_avx2.c:v128_low_u32:
   20|  1.59k|SIMD_INLINE uint32_t v128_low_u32(v128 a) {
   21|  1.59k|  return (uint32_t)_mm_cvtsi128_si32(a);
   22|  1.59k|}
cdef_block_avx2.c:v128_pack_s32_s16:
  255|  1.60k|SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
  256|  1.60k|  return _mm_packs_epi32(b, a);
  257|  1.60k|}
cdef_block_avx2.c:v128_cmpeq_32:
  526|  3.20k|SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); }
cdef_block_avx2.c:v128_movemask_8:
  470|  1.60k|SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); }
cdef_block_avx2.c:v128_pack_s16_s8:
  272|  1.60k|SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
  273|  1.60k|  return _mm_packs_epi16(b, a);
  274|  1.60k|}
cdef_block_avx2.c:v128_load_aligned:
   42|  5.29M|SIMD_INLINE v128 v128_load_aligned(const void *p) {
   43|  5.29M|  return _mm_load_si128((__m128i *)p);
   44|  5.29M|}
cdef_block_avx2.c:v128_high_v64:
   28|  3.12M|SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); }
cdef_block_avx2.c:v128_low_v64:
   24|  3.17M|SIMD_INLINE v64 v128_low_v64(v128 a) {
   25|  3.17M|  return _mm_unpacklo_epi64(a, v64_zero());
   26|  3.17M|}
cdef_block_avx2.c:v128_from_v64:
   30|  10.0M|SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
   31|  10.0M|  return _mm_unpacklo_epi64(b, a);
   32|  10.0M|}
cdef_block_avx2.c:v128_pack_s16_u8:
  268|   873k|SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
  269|   873k|  return _mm_packus_epi16(b, a);
  270|   873k|}
cdef_block_avx2.c:v128_store_unaligned:
   58|  4.31M|SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
   59|  4.31M|  _mm_storeu_si128((__m128i *)p, a);
   60|  4.31M|}

cdef_block_avx2.c:v256_dup_16:
   83|  23.2M|SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16((short)x); }
cdef_block_avx2.c:v256_zero:
   79|  4.24M|SIMD_INLINE v256 v256_zero(void) { return _mm256_setzero_si256(); }
cdef_block_avx2.c:v256_from_v128:
   50|  17.4M|SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
   51|       |  // gcc seems to be missing _mm256_set_m128i()
   52|       |  return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1);
   53|  17.4M|}
cdef_block_avx2.c:v256_sub_16:
  121|  15.3M|SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); }
cdef_block_avx2.c:v256_abs_s16:
  135|  15.4M|SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }
cdef_block_avx2.c:v256_ssub_u16:
  127|  15.5M|SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
  128|  15.5M|  return _mm256_subs_epu16(a, b);
  129|  15.5M|}
cdef_block_avx2.c:v256_shr_u16:
  622|  15.5M|SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
  623|  15.5M|  return _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c));
  624|  15.5M|}
cdef_block_avx2.c:v256_xor:
  493|  15.5M|SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); }
cdef_block_avx2.c:v256_add_16:
   93|  36.6M|SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }
cdef_block_avx2.c:v256_mullo_s16:
  506|  5.17M|SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
  507|  5.17M|  return _mm256_mullo_epi16(a, b);
  508|  5.17M|}
cdef_block_avx2.c:v256_max_u8:
  544|  3.37M|SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }
cdef_block_avx2.c:v256_and:
  495|  3.40M|SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); }
cdef_block_avx2.c:v256_min_s16:
  558|  22.0M|SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }
cdef_block_avx2.c:v256_cmplt_s16:
  582|  2.17M|SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
  583|  2.17M|  return _mm256_cmpgt_epi16(b, a);
  584|  2.17M|}
cdef_block_avx2.c:v256_max_s16:
  560|  3.95M|SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }
cdef_block_avx2.c:v256_low_v128:
   44|  2.56M|SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); }
cdef_block_avx2.c:v256_pack_s16_u8:
  303|  1.17M|SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
  304|       |  return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a),
  305|  1.17M|                                  _MM_SHUFFLE(3, 1, 2, 0));
  306|  1.17M|}
cdef_block_avx2.c:v256_high_v128:
   46|  1.38M|SIMD_INLINE v128 v256_high_v128(v256 a) {
   47|       |  return _mm256_extracti128_si256(a, 1);
   48|  1.38M|}
cdef_block_avx2.c:v256_from_v64:
   55|  5.04M|SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
   56|  5.04M|  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
   57|  5.04M|}

cdef_block_avx2.c:v64_store_aligned:
   85|  5.04M|SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
   86|  5.04M|  _mm_storel_epi64((__m128i *)p, a);
   87|  5.04M|}
cdef_block_avx2.c:v64_zero:
  102|  3.17M|SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); }
cdef_block_avx2.c:v64_load_aligned:
   77|  2.97M|SIMD_INLINE v64 v64_load_aligned(const void *p) {
   78|  2.97M|  return _mm_loadl_epi64((__m128i *)p);
   79|  2.97M|}
cdef_block_avx2.c:v64_load_unaligned:
   81|  17.3M|SIMD_INLINE v64 v64_load_unaligned(const void *p) {
   82|  17.3M|  return _mm_loadl_epi64((__m128i *)p);
   83|  17.3M|}
cdef_block_avx2.c:u32_store_aligned:
   69|  1.26M|SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
   70|  1.26M|  *((uint32_t *)p) = a;
   71|  1.26M|}
cdef_block_avx2.c:v64_high_u32:
   29|   635k|SIMD_INLINE uint32_t v64_high_u32(v64 a) {
   30|       |  return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
   31|   635k|}
cdef_block_avx2.c:v64_low_u32:
   25|   656k|SIMD_INLINE uint32_t v64_low_u32(v64 a) {
   26|   656k|  return (uint32_t)_mm_cvtsi128_si32(a);
   27|   656k|}

aom_convolve_copy_avx2:
   29|  45.8k|                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
   30|       |  // The w == 16 case uses _mm_store_si128(), which requires its output address
   31|       |  // be aligned on a 16-byte boundary.
   32|  45.8k|  if (w == 16) {
  ------------------
  |  Branch (32:7): [True: 10.3k, False: 35.5k]
  ------------------
   33|  10.3k|    assert(!((intptr_t)dst % 16));
   34|  10.3k|    assert(!(dst_stride % 16));
   35|  10.3k|  }
   36|       |
   37|  45.8k|  if (w == 2) {
  ------------------
  |  Branch (37:7): [True: 2.08k, False: 43.7k]
  ------------------
   38|  4.88k|    do {
   39|  4.88k|      memmove(dst, src, 2 * sizeof(*src));
   40|  4.88k|      src += src_stride;
   41|  4.88k|      dst += dst_stride;
   42|  4.88k|      memmove(dst, src, 2 * sizeof(*src));
   43|  4.88k|      src += src_stride;
   44|  4.88k|      dst += dst_stride;
   45|  4.88k|      h -= 2;
   46|  4.88k|    } while (h);
  ------------------
  |  Branch (46:14): [True: 2.80k, False: 2.08k]
  ------------------
   47|  43.7k|  } else if (w == 4) {
  ------------------
  |  Branch (47:14): [True: 12.3k, False: 31.4k]
  ------------------
   48|  41.0k|    do {
   49|  41.0k|      memmove(dst, src, 4 * sizeof(*src));
   50|  41.0k|      src += src_stride;
   51|  41.0k|      dst += dst_stride;
   52|  41.0k|      memmove(dst, src, 4 * sizeof(*src));
   53|  41.0k|      src += src_stride;
   54|  41.0k|      dst += dst_stride;
   55|  41.0k|      h -= 2;
   56|  41.0k|    } while (h);
  ------------------
  |  Branch (56:14): [True: 28.7k, False: 12.3k]
  ------------------
   57|  31.4k|  } else if (w == 8) {
  ------------------
  |  Branch (57:14): [True: 15.3k, False: 16.1k]
  ------------------
   58|  61.3k|    do {
   59|  61.3k|      __m128i s[2];
   60|  61.3k|      s[0] = _mm_loadl_epi64((__m128i *)src);
   61|  61.3k|      src += src_stride;
   62|  61.3k|      s[1] = _mm_loadl_epi64((__m128i *)src);
   63|  61.3k|      src += src_stride;
   64|  61.3k|      _mm_storel_epi64((__m128i *)dst, s[0]);
   65|  61.3k|      dst += dst_stride;
   66|  61.3k|      _mm_storel_epi64((__m128i *)dst, s[1]);
   67|  61.3k|      dst += dst_stride;
   68|  61.3k|      h -= 2;
   69|  61.3k|    } while (h);
  ------------------
  |  Branch (69:14): [True: 46.0k, False: 15.3k]
  ------------------
   70|  16.1k|  } else if (w == 16) {
  ------------------
  |  Branch (70:14): [True: 10.3k, False: 5.73k]
  ------------------
   71|  61.6k|    do {
   72|  61.6k|      __m128i s[2];
   73|  61.6k|      s[0] = _mm_loadu_si128((__m128i *)src);
   74|  61.6k|      src += src_stride;
   75|  61.6k|      s[1] = _mm_loadu_si128((__m128i *)src);
   76|  61.6k|      src += src_stride;
   77|  61.6k|      _mm_store_si128((__m128i *)dst, s[0]);
   78|  61.6k|      dst += dst_stride;
   79|  61.6k|      _mm_store_si128((__m128i *)dst, s[1]);
   80|  61.6k|      dst += dst_stride;
   81|  61.6k|      h -= 2;
   82|  61.6k|    } while (h);
  ------------------
  |  Branch (82:14): [True: 51.2k, False: 10.3k]
  ------------------
   83|  10.3k|  } else if (w == 32) {
  ------------------
  |  Branch (83:14): [True: 4.18k, False: 1.55k]
  ------------------
   84|  50.4k|    do {
   85|  50.4k|      __m256i s[2];
   86|  50.4k|      s[0] = _mm256_loadu_si256((__m256i *)src);
   87|  50.4k|      src += src_stride;
   88|  50.4k|      s[1] = _mm256_loadu_si256((__m256i *)src);
   89|  50.4k|      src += src_stride;
   90|  50.4k|      _mm256_storeu_si256((__m256i *)dst, s[0]);
   91|  50.4k|      dst += dst_stride;
   92|  50.4k|      _mm256_storeu_si256((__m256i *)dst, s[1]);
   93|  50.4k|      dst += dst_stride;
   94|  50.4k|      h -= 2;
   95|  50.4k|    } while (h);
  ------------------
  |  Branch (95:14): [True: 46.2k, False: 4.18k]
  ------------------
   96|  4.18k|  } else if (w == 64) {
  ------------------
  |  Branch (96:14): [True: 1.52k, False: 26]
  ------------------
   97|  34.8k|    do {
   98|  34.8k|      __m256i s[4];
   99|  34.8k|      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
  100|  34.8k|      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
  101|  34.8k|      src += src_stride;
  102|  34.8k|      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
  103|  34.8k|      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
  104|  34.8k|      src += src_stride;
  105|  34.8k|      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
  106|  34.8k|      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
  107|  34.8k|      dst += dst_stride;
  108|  34.8k|      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]);
  109|  34.8k|      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]);
  110|  34.8k|      dst += dst_stride;
  111|  34.8k|      h -= 2;
  112|  34.8k|    } while (h);
  ------------------
  |  Branch (112:14): [True: 33.3k, False: 1.52k]
  ------------------
  113|  1.52k|  } else {
  114|  1.60k|    do {
  115|  1.60k|      copy_128(src, dst);
  116|  1.60k|      src += src_stride;
  117|  1.60k|      dst += dst_stride;
  118|  1.60k|      copy_128(src, dst);
  119|  1.60k|      src += src_stride;
  120|  1.60k|      dst += dst_stride;
  121|  1.60k|      h -= 2;
  122|  1.60k|    } while (h);
  ------------------
  |  Branch (122:14): [True: 1.57k, False: 26]
  ------------------
  123|     26|  }
  124|  45.8k|}
aom_highbd_convolve_copy_avx2:
  163|  14.3k|                                   int h) {
  164|       |  // The w == 8 case uses _mm_store_si128(), which requires its output address
  165|       |  // be aligned on a 16-byte boundary.
  166|  14.3k|  if (w == 8) {
  ------------------
  |  Branch (166:7): [True: 4.43k, False: 9.87k]
  ------------------
  167|  4.43k|    assert(!((intptr_t)dst % 16));
  168|  4.43k|    assert(!(dst_stride % 8));
  169|  4.43k|  }
  170|       |
  171|  14.3k|  if (w == 2) {
  ------------------
  |  Branch (171:7): [True: 392, False: 13.9k]
  ------------------
  172|    880|    do {
  173|    880|      memmove(dst, src, 2 * sizeof(*src));
  174|    880|      src += src_stride;
  175|    880|      dst += dst_stride;
  176|    880|      memmove(dst, src, 2 * sizeof(*src));
  177|    880|      src += src_stride;
  178|    880|      dst += dst_stride;
  179|    880|      h -= 2;
  180|    880|    } while (h);
  ------------------
  |  Branch (180:14): [True: 488, False: 392]
  ------------------
  181|  13.9k|  } else if (w == 4) {
  ------------------
  |  Branch (181:14): [True: 3.58k, False: 10.3k]
  ------------------
  182|  12.9k|    do {
  183|  12.9k|      __m128i s[2];
  184|  12.9k|      s[0] = _mm_loadl_epi64((__m128i *)src);
  185|  12.9k|      src += src_stride;
  186|  12.9k|      s[1] = _mm_loadl_epi64((__m128i *)src);
  187|  12.9k|      src += src_stride;
  188|  12.9k|      _mm_storel_epi64((__m128i *)dst, s[0]);
  189|  12.9k|      dst += dst_stride;
  190|  12.9k|      _mm_storel_epi64((__m128i *)dst, s[1]);
  191|  12.9k|      dst += dst_stride;
  192|  12.9k|      h -= 2;
  193|  12.9k|    } while (h);
  ------------------
  |  Branch (193:14): [True: 9.35k, False: 3.58k]
  ------------------
  194|  10.3k|  } else if (w == 8) {
  ------------------
  |  Branch (194:14): [True: 4.43k, False: 5.90k]
  ------------------
  195|  17.9k|    do {
  196|  17.9k|      __m128i s[2];
  197|  17.9k|      s[0] = _mm_loadu_si128((__m128i *)src);
  198|  17.9k|      src += src_stride;
  199|  17.9k|      s[1] = _mm_loadu_si128((__m128i *)src);
  200|  17.9k|      src += src_stride;
  201|  17.9k|      _mm_store_si128((__m128i *)dst, s[0]);
  202|  17.9k|      dst += dst_stride;
  203|  17.9k|      _mm_store_si128((__m128i *)dst, s[1]);
  204|  17.9k|      dst += dst_stride;
  205|  17.9k|      h -= 2;
  206|  17.9k|    } while (h);
  ------------------
  |  Branch (206:14): [True: 13.4k, False: 4.43k]
  ------------------
  207|  5.90k|  } else if (w == 16) {
  ------------------
  |  Branch (207:14): [True: 3.71k, False: 2.18k]
  ------------------
  208|  23.1k|    do {
  209|  23.1k|      __m256i s[2];
  210|  23.1k|      s[0] = _mm256_loadu_si256((__m256i *)src);
  211|  23.1k|      src += src_stride;
  212|  23.1k|      s[1] = _mm256_loadu_si256((__m256i *)src);
  213|  23.1k|      src += src_stride;
  214|  23.1k|      _mm256_storeu_si256((__m256i *)dst, s[0]);
  215|  23.1k|      dst += dst_stride;
  216|  23.1k|      _mm256_storeu_si256((__m256i *)dst, s[1]);
  217|  23.1k|      dst += dst_stride;
  218|  23.1k|      h -= 2;
  219|  23.1k|    } while (h);
  ------------------
  |  Branch (219:14): [True: 19.4k, False: 3.71k]
  ------------------
  220|  3.71k|  } else if (w == 32) {
  ------------------
  |  Branch (220:14): [True: 1.68k, False: 501]
  ------------------
  221|  19.1k|    do {
  222|  19.1k|      __m256i s[4];
  223|  19.1k|      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
  224|  19.1k|      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
  225|  19.1k|      src += src_stride;
  226|  19.1k|      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
  227|  19.1k|      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
  228|  19.1k|      src += src_stride;
  229|  19.1k|      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
  230|  19.1k|      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
  231|  19.1k|      dst += dst_stride;
  232|  19.1k|      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
  233|  19.1k|      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
  234|  19.1k|      dst += dst_stride;
  235|  19.1k|      h -= 2;
  236|  19.1k|    } while (h);
  ------------------
  |  Branch (236:14): [True: 17.4k, False: 1.68k]
  ------------------
  237|  1.68k|  } else if (w == 64) {
  ------------------
  |  Branch (237:14): [True: 501, False: 0]
  ------------------
  238|  15.1k|    do {
  239|  15.1k|      highbd_copy_64(src, dst);
  240|  15.1k|      src += src_stride;
  241|  15.1k|      dst += dst_stride;
  242|  15.1k|      highbd_copy_64(src, dst);
  243|  15.1k|      src += src_stride;
  244|  15.1k|      dst += dst_stride;
  245|  15.1k|      h -= 2;
  246|  15.1k|    } while (h);
  ------------------
  |  Branch (246:14): [True: 14.6k, False: 501]
  ------------------
  247|    501|  } else {
  248|      0|    assert(w == 128);
  249|      0|    do {
  250|      0|      highbd_copy_128(src, dst);
  251|      0|      src += src_stride;
  252|      0|      dst += dst_stride;
  253|      0|      highbd_copy_128(src, dst);
  254|      0|      src += src_stride;
  255|      0|      dst += dst_stride;
  256|      0|      h -= 2;
  257|      0|    } while (h);
  ------------------
  |  Branch (257:14): [True: 0, False: 0]
  ------------------
  258|      0|  }
  259|  14.3k|}
aom_convolve_copy_avx2.c:copy_128:
   16|  3.20k|static inline void copy_128(const uint8_t *src, uint8_t *dst) {
   17|  3.20k|  __m256i s[4];
   18|  3.20k|  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
   19|  3.20k|  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
   20|  3.20k|  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32));
   21|  3.20k|  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32));
   22|  3.20k|  _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
   23|  3.20k|  _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
   24|  3.20k|  _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]);
   25|  3.20k|  _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]);
   26|  3.20k|}
aom_convolve_copy_avx2.c:highbd_copy_64:
  128|  30.3k|static inline void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
  129|  30.3k|  __m256i s[4];
  130|  30.3k|  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
  131|  30.3k|  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
  132|  30.3k|  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
  133|  30.3k|  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
  134|  30.3k|  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
  135|  30.3k|  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
  136|  30.3k|  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
  137|  30.3k|  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
  138|  30.3k|}

aom_blend_a64_hmask_sse4_1:
   22|  16.8k|                                const uint8_t *mask, int w, int h) {
   23|  16.8k|  aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
   24|  16.8k|                            src1_stride, mask, 0, w, h, 0, 0);
   25|  16.8k|}
aom_highbd_blend_a64_hmask_sse4_1:
   31|  4.48k|    const uint8_t *mask, int w, int h, int bd) {
   32|  4.48k|  aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
   33|  4.48k|                                   src1_8, src1_stride, mask, 0, w, h, 0, 0,
   34|  4.48k|                                   bd);
   35|  4.48k|}

aom_lowbd_blend_a64_d16_mask_avx2:
  288|  2.69k|    ConvolveParams *conv_params) {
  289|  2.69k|  const int bd = 8;
  290|  2.69k|  const int round_bits =
  291|  2.69k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  2.69k|#define FILTER_BITS 7
  ------------------
  292|       |
  293|  2.69k|  const int round_offset =
  294|  2.69k|      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
  295|  2.69k|       (1 << (round_bits - 1)))
  296|  2.69k|      << AOM_BLEND_A64_ROUND_BITS;
  ------------------
  |  |   23|  2.69k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  297|       |
  298|  2.69k|  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
  ------------------
  |  |   23|  2.69k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  299|  2.69k|  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
  300|  2.69k|  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
  301|       |
  302|  2.69k|  assert(h >= 4);
  303|  2.69k|  assert(w >= 4);
  304|  2.69k|  assert(IS_POWER_OF_TWO(h));
  305|  2.69k|  assert(IS_POWER_OF_TWO(w));
  306|  2.69k|  const __m128i v_round_offset = _mm_set1_epi32(round_offset);
  307|  2.69k|  const __m256i y_round_offset = _mm256_set1_epi32(round_offset);
  308|       |
  309|  2.69k|  if (subw == 0 && subh == 0) {
  ------------------
  |  Branch (309:7): [True: 1.13k, False: 1.56k]
  |  Branch (309:20): [True: 1.13k, False: 0]
  ------------------
  310|  1.13k|    switch (w) {
  311|      0|      case 4:
  ------------------
  |  Branch (311:7): [True: 0, False: 1.13k]
  ------------------
  312|      0|        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
  313|      0|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  314|      0|            mask_stride, h, &v_round_offset, shift);
  315|      0|        break;
  316|    548|      case 8:
  ------------------
  |  Branch (316:7): [True: 548, False: 583]
  ------------------
  317|    548|        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
  318|    548|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  319|    548|            mask_stride, h, &v_round_offset, shift);
  320|    548|        break;
  321|    410|      case 16:
  ------------------
  |  Branch (321:7): [True: 410, False: 721]
  ------------------
  322|    410|        lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
  323|    410|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  324|    410|            mask_stride, h, &y_round_offset, shift);
  325|    410|        break;
  326|    173|      default:
  ------------------
  |  Branch (326:7): [True: 173, False: 958]
  ------------------
  327|    173|        lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
  328|    173|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  329|    173|            mask_stride, h, w, &y_round_offset, shift);
  330|    173|        break;
  331|  1.13k|    }
  332|  1.56k|  } else if (subw == 1 && subh == 1) {
  ------------------
  |  Branch (332:14): [True: 1.56k, False: 0]
  |  Branch (332:27): [True: 1.56k, False: 0]
  ------------------
  333|  1.56k|    switch (w) {
  334|    604|      case 4:
  ------------------
  |  Branch (334:7): [True: 604, False: 960]
  ------------------
  335|    604|        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
  336|    604|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  337|    604|            mask_stride, h, &v_round_offset, shift);
  338|    604|        break;
  339|    620|      case 8:
  ------------------
  |  Branch (339:7): [True: 620, False: 944]
  ------------------
  340|    620|        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
  341|    620|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  342|    620|            mask_stride, h, &v_round_offset, shift);
  343|    620|        break;
  344|    246|      case 16:
  ------------------
  |  Branch (344:7): [True: 246, False: 1.31k]
  ------------------
  345|    246|        lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
  346|    246|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  347|    246|            mask_stride, h, &y_round_offset, shift);
  348|    246|        break;
  349|     94|      default:
  ------------------
  |  Branch (349:7): [True: 94, False: 1.47k]
  ------------------
  350|     94|        lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
  351|     94|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  352|     94|            mask_stride, h, w, &y_round_offset, shift);
  353|     94|        break;
  354|  1.56k|    }
  355|  1.56k|  } else if (subw == 1 && subh == 0) {
  ------------------
  |  Branch (355:14): [True: 0, False: 0]
  |  Branch (355:27): [True: 0, False: 0]
  ------------------
  356|      0|    switch (w) {
  357|      0|      case 4:
  ------------------
  |  Branch (357:7): [True: 0, False: 0]
  ------------------
  358|      0|        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
  359|      0|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  360|      0|            mask_stride, h, &v_round_offset, shift);
  361|      0|        break;
  362|      0|      case 8:
  ------------------
  |  Branch (362:7): [True: 0, False: 0]
  ------------------
  363|      0|        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
  364|      0|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  365|      0|            mask_stride, h, &v_round_offset, shift);
  366|      0|        break;
  367|      0|      case 16:
  ------------------
  |  Branch (367:7): [True: 0, False: 0]
  ------------------
  368|      0|        lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
  369|      0|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  370|      0|            mask_stride, h, w, &y_round_offset, shift);
  371|      0|        break;
  372|      0|      default:
  ------------------
  |  Branch (372:7): [True: 0, False: 0]
  ------------------
  373|      0|        lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
  374|      0|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  375|      0|            mask_stride, h, w, &y_round_offset, shift);
  376|      0|        break;
  377|      0|    }
  378|      0|  } else {
  379|      0|    switch (w) {
  380|      0|      case 4:
  ------------------
  |  Branch (380:7): [True: 0, False: 0]
  ------------------
  381|      0|        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
  382|      0|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  383|      0|            mask_stride, h, &v_round_offset, shift);
  384|      0|        break;
  385|      0|      case 8:
  ------------------
  |  Branch (385:7): [True: 0, False: 0]
  ------------------
  386|      0|        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
  387|      0|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  388|      0|            mask_stride, h, &v_round_offset, shift);
  389|      0|        break;
  390|      0|      case 16:
  ------------------
  |  Branch (390:7): [True: 0, False: 0]
  ------------------
  391|      0|        lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
  392|      0|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  393|      0|            mask_stride, h, w, &y_round_offset, shift);
  394|      0|        break;
  395|      0|      default:
  ------------------
  |  Branch (395:7): [True: 0, False: 0]
  ------------------
  396|      0|        lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
  397|      0|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  398|      0|            mask_stride, h, w, &y_round_offset, shift);
  399|      0|        break;
  400|      0|    }
  401|      0|  }
  402|  2.69k|}
aom_blend_a64_mask_avx2:
  873|  4.42k|                             int h, int subw, int subh) {
  874|  4.42k|  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  875|  4.42k|  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
  876|       |
  877|  4.42k|  assert(h >= 1);
  878|  4.42k|  assert(w >= 1);
  879|  4.42k|  assert(IS_POWER_OF_TWO(h));
  880|  4.42k|  assert(IS_POWER_OF_TWO(w));
  881|       |
  882|  4.42k|  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
  ------------------
  |  |   55|  4.42k|#define UNLIKELY(v) __builtin_expect(v, 0)
  |  |  ------------------
  |  |  |  Branch (55:21): [True: 0, False: 4.42k]
  |  |  ------------------
  ------------------
  883|      0|    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
  884|      0|                         mask, mask_stride, w, h, subw, subh);
  885|  4.42k|  } else {
  886|  4.42k|    if (subw & subh) {
  ------------------
  |  Branch (886:9): [True: 796, False: 3.62k]
  ------------------
  887|    796|      blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
  888|    796|                                src1_stride, mask, mask_stride, w, h);
  889|  3.62k|    } else if (subw) {
  ------------------
  |  Branch (889:16): [True: 0, False: 3.62k]
  ------------------
  890|      0|      blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
  891|      0|                             src1_stride, mask, mask_stride, w, h);
  892|  3.62k|    } else if (subh) {
  ------------------
  |  Branch (892:16): [True: 0, False: 3.62k]
  ------------------
  893|      0|      blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
  894|      0|                             src1_stride, mask, mask_stride, w, h);
  895|  3.62k|    } else {
  896|  3.62k|      blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
  897|  3.62k|                          mask, mask_stride, w, h);
  898|  3.62k|    }
  899|  4.42k|  }
  900|  4.42k|}
aom_highbd_blend_a64_d16_mask_avx2:
 1297|    498|    ConvolveParams *conv_params, const int bd) {
 1298|    498|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  ------------------
  |  |   75|    498|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1299|    498|  const int round_bits =
 1300|    498|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|    498|#define FILTER_BITS 7
  ------------------
 1301|    498|  const int32_t round_offset =
 1302|    498|      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
 1303|    498|       (1 << (round_bits - 1)))
 1304|    498|      << AOM_BLEND_A64_ROUND_BITS;
  ------------------
  |  |   23|    498|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
 1305|    498|  const __m256i v_round_offset = _mm256_set1_epi32(round_offset);
 1306|    498|  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
  ------------------
  |  |   23|    498|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
 1307|       |
 1308|    498|  const __m256i clip_low = _mm256_setzero_si256();
 1309|    498|  const __m256i clip_high = _mm256_set1_epi16((1 << bd) - 1);
 1310|    498|  const __m256i mask_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|    498|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|    498|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
 1311|       |
 1312|    498|  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
 1313|    498|  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
 1314|       |
 1315|    498|  assert(h >= 4);
 1316|    498|  assert(w >= 4);
 1317|    498|  assert(IS_POWER_OF_TWO(h));
 1318|    498|  assert(IS_POWER_OF_TWO(w));
 1319|       |
 1320|    498|  if (subw == 0 && subh == 0) {
  ------------------
  |  Branch (1320:7): [True: 166, False: 332]
  |  Branch (1320:20): [True: 166, False: 0]
  ------------------
 1321|    166|    switch (w) {
 1322|      0|      case 4:
  ------------------
  |  Branch (1322:7): [True: 0, False: 166]
  ------------------
 1323|      0|        highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2(
 1324|      0|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
 1325|      0|            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
 1326|      0|            &mask_max);
 1327|      0|        break;
 1328|     50|      case 8:
  ------------------
  |  Branch (1328:7): [True: 50, False: 116]
  ------------------
 1329|     50|        highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2(
 1330|     50|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
 1331|     50|            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
 1332|     50|            &mask_max);
 1333|     50|        break;
 1334|    116|      default:  // >= 16
  ------------------
  |  Branch (1334:7): [True: 116, False: 50]
  ------------------
 1335|    116|        highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
 1336|    116|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
 1337|    116|            mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
 1338|    116|            &mask_max);
 1339|    116|        break;
 1340|    166|    }
 1341|       |
 1342|    332|  } else if (subw == 1 && subh == 1) {
  ------------------
  |  Branch (1342:14): [True: 332, False: 0]
  |  Branch (1342:27): [True: 292, False: 40]
  ------------------
 1343|    292|    switch (w) {
 1344|     92|      case 4:
  ------------------
  |  Branch (1344:7): [True: 92, False: 200]
  ------------------
 1345|     92|        highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2(
 1346|     92|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
 1347|     92|            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
 1348|     92|            &mask_max);
 1349|     92|        break;
 1350|    116|      case 8:
  ------------------
  |  Branch (1350:7): [True: 116, False: 176]
  ------------------
 1351|    116|        highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2(
 1352|    116|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
 1353|    116|            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
 1354|    116|            &mask_max);
 1355|    116|        break;
 1356|     84|      default:  // >= 16
  ------------------
  |  Branch (1356:7): [True: 84, False: 208]
  ------------------
 1357|     84|        highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
 1358|     84|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
 1359|     84|            mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
 1360|     84|            &mask_max);
 1361|     84|        break;
 1362|    292|    }
 1363|    292|  } else {
 1364|       |    // Sub-sampling in only one axis doesn't seem to happen very much, so fall
 1365|       |    // back to the vanilla C implementation instead of having all the optimised
 1366|       |    // code for these.
 1367|     40|    aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1,
 1368|     40|                                    src1_stride, mask, mask_stride, w, h, subw,
 1369|     40|                                    subh, conv_params, bd);
 1370|     40|  }
 1371|    498|}
blend_a64_mask_avx2.c:lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2:
   86|    410|    const __m256i *round_offset, int shift) {
   87|    410|  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|    410|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|    410|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
   88|  6.85k|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (88:19): [True: 6.44k, False: 410]
  ------------------
   89|  6.44k|    const __m128i m = xx_loadu_128(mask);
   90|  6.44k|    const __m256i m0 = _mm256_cvtepu8_epi16(m);
   91|       |
   92|  6.44k|    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
   93|  6.44k|                                shift);
   94|  6.44k|    mask += mask_stride;
   95|  6.44k|    dst += dst_stride;
   96|  6.44k|    src0 += src0_stride;
   97|  6.44k|    src1 += src1_stride;
   98|  6.44k|  }
   99|    410|}
blend_a64_mask_avx2.c:blend_a64_d16_mask_w16_avx2:
   31|  9.93k|    int shift) {
   32|  9.93k|  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
   33|  9.93k|  const __m256i s0_0 = yy_loadu_256(src0);
   34|  9.93k|  const __m256i s1_0 = yy_loadu_256(src1);
   35|  9.93k|  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
   36|  9.93k|                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
   37|  9.93k|  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
   38|  9.93k|                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
   39|  9.93k|  res0_lo =
   40|  9.93k|      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
   41|  9.93k|  res0_hi =
   42|  9.93k|      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
   43|  9.93k|  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
   44|  9.93k|  __m256i res = _mm256_packus_epi16(res0, res0);
   45|       |  res = _mm256_permute4x64_epi64(res, 0xd8);
   46|  9.93k|  _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
   47|  9.93k|}
blend_a64_mask_avx2.c:lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2:
  105|    173|    const __m256i *round_offset, int shift) {
  106|    173|  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|    173|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|    173|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  107|  6.66k|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (107:19): [True: 6.49k, False: 173]
  ------------------
  108|  17.2k|    for (int j = 0; j < w; j += 32) {
  ------------------
  |  Branch (108:21): [True: 10.7k, False: 6.49k]
  ------------------
  109|  10.7k|      const __m256i m = yy_loadu_256(mask + j);
  110|  10.7k|      const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m));
  111|  10.7k|      const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1));
  112|       |
  113|  10.7k|      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
  114|  10.7k|                                  round_offset, &v_maxval, shift);
  115|  10.7k|    }
  116|  6.49k|    mask += mask_stride;
  117|  6.49k|    dst += dst_stride;
  118|  6.49k|    src0 += src0_stride;
  119|  6.49k|    src1 += src1_stride;
  120|  6.49k|  }
  121|    173|}
blend_a64_mask_avx2.c:blend_a64_d16_mask_w32_avx2:
   52|  14.3k|    const __m256i *v_maxval, int shift) {
   53|  14.3k|  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
   54|  14.3k|  const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
   55|  14.3k|  const __m256i s0_0 = yy_loadu_256(src0);
   56|  14.3k|  const __m256i s0_1 = yy_loadu_256(src0 + 16);
   57|  14.3k|  const __m256i s1_0 = yy_loadu_256(src1);
   58|  14.3k|  const __m256i s1_1 = yy_loadu_256(src1 + 16);
   59|  14.3k|  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
   60|  14.3k|                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
   61|  14.3k|  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
   62|  14.3k|                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
   63|  14.3k|  __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
   64|  14.3k|                                      _mm256_unpacklo_epi16(*m1, max_minus_m1));
   65|  14.3k|  __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
   66|  14.3k|                                      _mm256_unpackhi_epi16(*m1, max_minus_m1));
   67|  14.3k|  res0_lo =
   68|  14.3k|      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
   69|  14.3k|  res0_hi =
   70|  14.3k|      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
   71|  14.3k|  res1_lo =
   72|  14.3k|      _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
   73|  14.3k|  res1_hi =
   74|  14.3k|      _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
   75|  14.3k|  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
   76|  14.3k|  const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
   77|  14.3k|  __m256i res = _mm256_packus_epi16(res0, res1);
   78|       |  res = _mm256_permute4x64_epi64(res, 0xd8);
   79|  14.3k|  _mm256_storeu_si256((__m256i *)(dst), res);
   80|  14.3k|}
blend_a64_mask_avx2.c:lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2:
  127|    246|    const __m256i *round_offset, int shift) {
  128|    246|  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|    246|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|    246|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  129|    246|  const __m256i one_b = _mm256_set1_epi8(1);
  130|    246|  const __m256i two_w = _mm256_set1_epi16(2);
  131|  3.73k|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (131:19): [True: 3.48k, False: 246]
  ------------------
  132|  3.48k|    const __m256i m_i00 = yy_loadu_256(mask);
  133|  3.48k|    const __m256i m_i10 = yy_loadu_256(mask + mask_stride);
  134|       |
  135|  3.48k|    const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
  136|  3.48k|    const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
  137|  3.48k|    const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
  138|       |
  139|  3.48k|    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
  140|  3.48k|                                shift);
  141|  3.48k|    mask += mask_stride << 1;
  142|  3.48k|    dst += dst_stride;
  143|  3.48k|    src0 += src0_stride;
  144|  3.48k|    src1 += src1_stride;
  145|  3.48k|  }
  146|    246|}
blend_a64_mask_avx2.c:lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2:
  152|     94|    const __m256i *round_offset, int shift) {
  153|     94|  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|     94|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|     94|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  154|     94|  const __m256i one_b = _mm256_set1_epi8(1);
  155|     94|  const __m256i two_w = _mm256_set1_epi16(2);
  156|  3.00k|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (156:19): [True: 2.91k, False: 94]
  ------------------
  157|  6.46k|    for (int j = 0; j < w; j += 32) {
  ------------------
  |  Branch (157:21): [True: 3.55k, False: 2.91k]
  ------------------
  158|  3.55k|      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
  159|  3.55k|      const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
  160|  3.55k|      const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j);
  161|  3.55k|      const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32);
  162|       |
  163|  3.55k|      const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
  164|  3.55k|      const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11);
  165|  3.55k|      const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
  166|  3.55k|      const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b);
  167|  3.55k|      const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
  168|  3.55k|      const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2);
  169|       |
  170|  3.55k|      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
  171|  3.55k|                                  round_offset, &v_maxval, shift);
  172|  3.55k|    }
  173|  2.91k|    mask += mask_stride << 1;
  174|  2.91k|    dst += dst_stride;
  175|  2.91k|    src0 += src0_stride;
  176|  2.91k|    src1 += src1_stride;
  177|  2.91k|  }
  178|     94|}
blend_a64_mask_avx2.c:blend_a64_mask_sx_sy_avx2:
  518|    796|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  519|    796|  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  520|    796|  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|    796|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|    796|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  521|    796|  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  ------------------
  |  |   23|    796|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  522|    796|  switch (w) {
  523|    572|    case 4:
  ------------------
  |  Branch (523:5): [True: 572, False: 224]
  ------------------
  524|  2.62k|      do {
  525|  2.62k|        const __m128i v_ra_b = xx_loadl_64(mask);
  526|  2.62k|        const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
  527|  2.62k|        const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
  528|  2.62k|        const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
  529|  2.62k|        const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
  530|  2.62k|        const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
  531|  2.62k|        const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
  532|  2.62k|        const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
  533|  2.62k|        const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
  534|  2.62k|        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
  535|       |
  536|  2.62k|        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
  537|       |
  538|  2.62k|        xx_storel_32(dst, v_res_b);
  539|       |
  540|  2.62k|        dst += dst_stride;
  541|  2.62k|        src0 += src0_stride;
  542|  2.62k|        src1 += src1_stride;
  543|  2.62k|        mask += 2 * mask_stride;
  544|  2.62k|      } while (--h);
  ------------------
  |  Branch (544:16): [True: 2.05k, False: 572]
  ------------------
  545|    572|      break;
  546|    204|    case 8:
  ------------------
  |  Branch (546:5): [True: 204, False: 592]
  ------------------
  547|  1.34k|      do {
  548|  1.34k|        const __m128i v_ra_b = xx_loadu_128(mask);
  549|  1.34k|        const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
  550|  1.34k|        const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
  551|  1.34k|        const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
  552|  1.34k|        const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
  553|  1.34k|        const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
  554|  1.34k|        const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
  555|  1.34k|        const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
  556|  1.34k|        const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
  557|  1.34k|        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
  558|       |
  559|  1.34k|        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
  560|       |
  561|  1.34k|        xx_storel_64(dst, v_res_b);
  562|       |
  563|  1.34k|        dst += dst_stride;
  564|  1.34k|        src0 += src0_stride;
  565|  1.34k|        src1 += src1_stride;
  566|  1.34k|        mask += 2 * mask_stride;
  567|  1.34k|      } while (--h);
  ------------------
  |  Branch (567:16): [True: 1.14k, False: 204]
  ------------------
  568|    204|      break;
  569|     20|    case 16:
  ------------------
  |  Branch (569:5): [True: 20, False: 776]
  ------------------
  570|     20|      blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
  571|     20|                                    src1_stride, mask, mask_stride, h);
  572|     20|      break;
  573|      0|    default:
  ------------------
  |  Branch (573:5): [True: 0, False: 796]
  ------------------
  574|      0|      blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
  575|      0|                                     src1_stride, mask, mask_stride, w, h);
  576|      0|      break;
  577|    796|  }
  578|    796|}
blend_a64_mask_avx2.c:blend_a64_mask_sx_sy_w16_avx2:
  446|     20|    const uint8_t *mask, uint32_t mask_stride, int h) {
  447|     20|  const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
  448|     20|  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|     20|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|     20|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  449|    192|  do {
  450|    192|    const __m256i v_ral_b = yy_loadu_256(mask);
  451|    192|    const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride);
  452|    192|    const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
  453|    192|    const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
  454|    192|    const __m256i v_rvsbl_w =
  455|    192|        _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
  456|    192|    const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
  457|       |
  458|    192|    const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2);
  459|    192|    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w);
  460|    192|    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
  461|       |
  462|    192|    const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
  463|    192|                                             AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|    192|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  464|       |
  465|    192|    xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b));
  466|    192|    dst += dst_stride;
  467|    192|    src0 += src0_stride;
  468|    192|    src1 += src1_stride;
  469|    192|    mask += 2 * mask_stride;
  470|    192|  } while (--h);
  ------------------
  |  Branch (470:12): [True: 172, False: 20]
  ------------------
  471|     20|}
blend_a64_mask_avx2.c:blend_16_u8_avx2:
  407|    192|                                       const int32_t bits) {
  408|    192|  const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0));
  409|    192|  const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1));
  410|    192|  const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8);
  411|    192|  const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8);
  412|       |
  413|    192|  const __m256i v_p0_w =
  414|    192|      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b),
  415|    192|                           _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
  416|       |
  417|    192|  const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
  418|    192|  const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w);
  419|       |  const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8);
  420|    192|  return v_res;
  421|    192|}
blend_a64_mask_avx2.c:blend_32_u8_avx2:
  426|  1.53k|                                       const int32_t bits) {
  427|  1.53k|  const __m256i v_s0_b = yy_loadu_256(src0);
  428|  1.53k|  const __m256i v_s1_b = yy_loadu_256(src1);
  429|       |
  430|  1.53k|  const __m256i v_p0_w =
  431|  1.53k|      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b),
  432|  1.53k|                           _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
  433|  1.53k|  const __m256i v_p1_w =
  434|  1.53k|      _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b),
  435|  1.53k|                           _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
  436|       |
  437|  1.53k|  const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
  438|  1.53k|  const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits);
  439|  1.53k|  const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w);
  440|  1.53k|  return v_res;
  441|  1.53k|}
blend_a64_mask_avx2.c:blend_a64_mask_avx2:
  818|  3.62k|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  819|  3.62k|  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  3.62k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  3.62k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  820|  3.62k|  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  ------------------
  |  |   23|  3.62k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  821|  3.62k|  switch (w) {
  822|    884|    case 4:
  ------------------
  |  Branch (822:5): [True: 884, False: 2.74k]
  ------------------
  823|  4.62k|      do {
  824|  4.62k|        const __m128i v_m0_b = xx_loadl_32(mask);
  825|  4.62k|        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
  826|  4.62k|        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
  827|       |
  828|  4.62k|        xx_storel_32(dst, v_res_b);
  829|       |
  830|  4.62k|        dst += dst_stride;
  831|  4.62k|        src0 += src0_stride;
  832|  4.62k|        src1 += src1_stride;
  833|  4.62k|        mask += mask_stride;
  834|  4.62k|      } while (--h);
  ------------------
  |  Branch (834:16): [True: 3.74k, False: 884]
  ------------------
  835|    884|      break;
  836|  1.88k|    case 8:
  ------------------
  |  Branch (836:5): [True: 1.88k, False: 1.74k]
  ------------------
  837|  14.8k|      do {
  838|  14.8k|        const __m128i v_m0_b = xx_loadl_64(mask);
  839|  14.8k|        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
  840|  14.8k|        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
  841|       |
  842|  14.8k|        xx_storel_64(dst, v_res_b);
  843|       |
  844|  14.8k|        dst += dst_stride;
  845|  14.8k|        src0 += src0_stride;
  846|  14.8k|        src1 += src1_stride;
  847|  14.8k|        mask += mask_stride;
  848|  14.8k|      } while (--h);
  ------------------
  |  Branch (848:16): [True: 12.9k, False: 1.88k]
  ------------------
  849|  1.88k|      break;
  850|    791|    case 16:
  ------------------
  |  Branch (850:5): [True: 791, False: 2.83k]
  ------------------
  851|  9.94k|      do {
  852|  9.94k|        const __m128i v_m0_b = xx_loadu_128(mask);
  853|  9.94k|        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
  854|  9.94k|        const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
  855|       |
  856|  9.94k|        xx_storeu_128(dst, v_res_b);
  857|  9.94k|        dst += dst_stride;
  858|  9.94k|        src0 += src0_stride;
  859|  9.94k|        src1 += src1_stride;
  860|  9.94k|        mask += mask_stride;
  861|  9.94k|      } while (--h);
  ------------------
  |  Branch (861:16): [True: 9.15k, False: 791]
  ------------------
  862|    791|      break;
  863|     70|    default:
  ------------------
  |  Branch (863:5): [True: 70, False: 3.55k]
  ------------------
  864|     70|      blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
  865|     70|                               src1_stride, mask, mask_stride, w, h);
  866|  3.62k|  }
  867|  3.62k|}
blend_a64_mask_avx2.c:blend_a64_mask_w32n_avx2:
  795|     70|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  796|     70|  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|     70|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|     70|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  797|  1.53k|  do {
  798|  1.53k|    int c;
  799|  3.07k|    for (c = 0; c < w; c += 32) {
  ------------------
  |  Branch (799:17): [True: 1.53k, False: 1.53k]
  ------------------
  800|  1.53k|      const __m256i v_m0_b = yy_loadu_256(mask + c);
  801|  1.53k|      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
  802|       |
  803|  1.53k|      const __m256i v_res_b = blend_32_u8_avx2(
  804|  1.53k|          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|  1.53k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  805|       |
  806|  1.53k|      yy_storeu_256(dst + c, v_res_b);
  807|  1.53k|    }
  808|  1.53k|    dst += dst_stride;
  809|  1.53k|    src0 += src0_stride;
  810|  1.53k|    src1 += src1_stride;
  811|  1.53k|    mask += mask_stride;
  812|  1.53k|  } while (--h);
  ------------------
  |  Branch (812:12): [True: 1.46k, False: 70]
  ------------------
  813|     70|}
blend_a64_mask_avx2.c:highbd_blend_a64_d16_mask_w4_avx2:
  911|    172|    const __m256i *clip_high, const __m256i *mask_max) {
  912|       |  // Load 4x u16 pixels from each of 4 rows from each source
  913|    172|  const __m256i s0 =
  914|    172|      yy_loadu_4x64(src0 + 3 * src0_stride, src0 + 2 * src0_stride,
  915|    172|                    src0 + 1 * src0_stride, src0 + 0 * src0_stride);
  916|    172|  const __m256i s1 =
  917|    172|      yy_loadu_4x64(src1 + 3 * src1_stride, src1 + 2 * src1_stride,
  918|    172|                    src1 + 1 * src1_stride, src1 + 0 * src1_stride);
  919|       |  // Generate the inverse mask
  920|    172|  const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0);
  921|       |
  922|       |  // Multiply each mask by the respective source
  923|    172|  const __m256i mul0_highs = _mm256_mulhi_epu16(*mask0, s0);
  924|    172|  const __m256i mul0_lows = _mm256_mullo_epi16(*mask0, s0);
  925|    172|  const __m256i mul0h = _mm256_unpackhi_epi16(mul0_lows, mul0_highs);
  926|    172|  const __m256i mul0l = _mm256_unpacklo_epi16(mul0_lows, mul0_highs);
  927|       |  // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
  928|       |  // lanes Later, packs does the same again which cancels this out with no need
  929|       |  // for a permute.  The intermediate values being reordered makes no difference
  930|       |
  931|    172|  const __m256i mul1_highs = _mm256_mulhi_epu16(mask1, s1);
  932|    172|  const __m256i mul1_lows = _mm256_mullo_epi16(mask1, s1);
  933|    172|  const __m256i mul1h = _mm256_unpackhi_epi16(mul1_lows, mul1_highs);
  934|    172|  const __m256i mul1l = _mm256_unpacklo_epi16(mul1_lows, mul1_highs);
  935|       |
  936|    172|  const __m256i sumh = _mm256_add_epi32(mul0h, mul1h);
  937|    172|  const __m256i suml = _mm256_add_epi32(mul0l, mul1l);
  938|       |
  939|    172|  const __m256i roundh =
  940|    172|      _mm256_srai_epi32(_mm256_sub_epi32(sumh, *round_offset), shift);
  941|    172|  const __m256i roundl =
  942|    172|      _mm256_srai_epi32(_mm256_sub_epi32(suml, *round_offset), shift);
  943|       |
  944|    172|  const __m256i pack = _mm256_packs_epi32(roundl, roundh);
  945|    172|  const __m256i clip =
  946|    172|      _mm256_min_epi16(_mm256_max_epi16(pack, *clip_low), *clip_high);
  947|       |
  948|       |  // _mm256_extract_epi64 doesn't exist on x86, so do it the old-fashioned way:
  949|    172|  const __m128i cliph = _mm256_extracti128_si256(clip, 1);
  950|    172|  xx_storel_64(dst + 3 * dst_stride, _mm_srli_si128(cliph, 8));
  951|    172|  xx_storel_64(dst + 2 * dst_stride, cliph);
  952|    172|  const __m128i clipl = _mm256_castsi256_si128(clip);
  953|       |  xx_storel_64(dst + 1 * dst_stride, _mm_srli_si128(clipl, 8));
  954|    172|  xx_storel_64(dst + 0 * dst_stride, clipl);
  955|    172|}
blend_a64_mask_avx2.c:highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2:
 1098|     50|    const __m256i *mask_max) {
 1099|    180|  do {
 1100|       |    // Load 8x u8 pixels from each of 4 rows in the mask
 1101|    180|    const __m128i mask0a8 =
 1102|    180|        _mm_set_epi64x(*(int64_t *)mask, *(uint64_t *)(mask + mask_stride));
 1103|    180|    const __m128i mask0b8 =
 1104|    180|        _mm_set_epi64x(*(int64_t *)(mask + 2 * mask_stride),
 1105|    180|                       *(int64_t *)(mask + 3 * mask_stride));
 1106|    180|    const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8);
 1107|    180|    const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8);
 1108|       |
 1109|    180|    highbd_blend_a64_d16_mask_w8_avx2(
 1110|    180|        dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
 1111|    180|        round_offset, shift, clip_low, clip_high, mask_max);
 1112|       |
 1113|    180|    dst += dst_stride * 4;
 1114|    180|    src0 += src0_stride * 4;
 1115|    180|    src1 += src1_stride * 4;
 1116|    180|    mask += mask_stride * 4;
 1117|    180|  } while (h -= 4);
  ------------------
  |  Branch (1117:12): [True: 130, False: 50]
  ------------------
 1118|     50|}
blend_a64_mask_avx2.c:highbd_blend_a64_d16_mask_w8_avx2:
 1025|    392|    const __m256i *mask_max) {
 1026|       |  // Load 8x u16 pixels from each of 4 rows from each source
 1027|    392|  const __m256i s0a =
 1028|    392|      yy_loadu2_128(src0 + 0 * src0_stride, src0 + 1 * src0_stride);
 1029|    392|  const __m256i s0b =
 1030|    392|      yy_loadu2_128(src0 + 2 * src0_stride, src0 + 3 * src0_stride);
 1031|    392|  const __m256i s1a =
 1032|    392|      yy_loadu2_128(src1 + 0 * src1_stride, src1 + 1 * src1_stride);
 1033|    392|  const __m256i s1b =
 1034|    392|      yy_loadu2_128(src1 + 2 * src1_stride, src1 + 3 * src1_stride);
 1035|       |
 1036|       |  // Generate inverse masks
 1037|    392|  const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a);
 1038|    392|  const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b);
 1039|       |
 1040|       |  // Multiply sources by respective masks
 1041|    392|  const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a);
 1042|    392|  const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a);
 1043|    392|  const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs);
 1044|    392|  const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs);
 1045|       |  // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
 1046|       |  // lanes Later, packs does the same again which cancels this out with no need
 1047|       |  // for a permute.  The intermediate values being reordered makes no difference
 1048|       |
 1049|    392|  const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a);
 1050|    392|  const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a);
 1051|    392|  const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs);
 1052|    392|  const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs);
 1053|       |
 1054|    392|  const __m256i sumah = _mm256_add_epi32(mul0ah, mul1ah);
 1055|    392|  const __m256i sumal = _mm256_add_epi32(mul0al, mul1al);
 1056|       |
 1057|    392|  const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b);
 1058|    392|  const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b);
 1059|    392|  const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs);
 1060|    392|  const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs);
 1061|       |
 1062|    392|  const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b);
 1063|    392|  const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b);
 1064|    392|  const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs);
 1065|    392|  const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs);
 1066|       |
 1067|    392|  const __m256i sumbh = _mm256_add_epi32(mul0bh, mul1bh);
 1068|    392|  const __m256i sumbl = _mm256_add_epi32(mul0bl, mul1bl);
 1069|       |
 1070|       |  // Divide down each result, with rounding
 1071|    392|  const __m256i roundah =
 1072|    392|      _mm256_srai_epi32(_mm256_sub_epi32(sumah, *round_offset), shift);
 1073|    392|  const __m256i roundal =
 1074|    392|      _mm256_srai_epi32(_mm256_sub_epi32(sumal, *round_offset), shift);
 1075|    392|  const __m256i roundbh =
 1076|    392|      _mm256_srai_epi32(_mm256_sub_epi32(sumbh, *round_offset), shift);
 1077|    392|  const __m256i roundbl =
 1078|    392|      _mm256_srai_epi32(_mm256_sub_epi32(sumbl, *round_offset), shift);
 1079|       |
 1080|       |  // Pack each i32 down to an i16 with saturation, then clip to valid range
 1081|    392|  const __m256i packa = _mm256_packs_epi32(roundal, roundah);
 1082|    392|  const __m256i clipa =
 1083|    392|      _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high);
 1084|    392|  const __m256i packb = _mm256_packs_epi32(roundbl, roundbh);
 1085|    392|  const __m256i clipb =
 1086|    392|      _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high);
 1087|       |
 1088|       |  // Store 8x u16 pixels to each of 4 rows in the destination
 1089|    392|  yy_storeu2_128(dst + 0 * dst_stride, dst + 1 * dst_stride, clipa);
 1090|    392|  yy_storeu2_128(dst + 2 * dst_stride, dst + 3 * dst_stride, clipb);
 1091|    392|}
blend_a64_mask_avx2.c:highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2:
 1235|    116|    const __m256i *mask_max) {
 1236|  1.08k|  for (int i = 0; i < h; i += 2) {
  ------------------
  |  Branch (1236:19): [True: 968, False: 116]
  ------------------
 1237|  2.76k|    for (int j = 0; j < w; j += 16) {
  ------------------
  |  Branch (1237:21): [True: 1.80k, False: 968]
  ------------------
 1238|       |      // Load 16x u8 alpha-mask values from each of two rows and pad to u16
 1239|  1.80k|      const __m128i masks_a8 = xx_loadu_128(mask + j);
 1240|  1.80k|      const __m128i masks_b8 = xx_loadu_128(mask + mask_stride + j);
 1241|  1.80k|      const __m256i mask0a = _mm256_cvtepu8_epi16(masks_a8);
 1242|  1.80k|      const __m256i mask0b = _mm256_cvtepu8_epi16(masks_b8);
 1243|       |
 1244|  1.80k|      highbd_blend_a64_d16_mask_w16_avx2(
 1245|  1.80k|          dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride,
 1246|  1.80k|          &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max);
 1247|  1.80k|    }
 1248|    968|    dst += dst_stride * 2;
 1249|    968|    src0 += src0_stride * 2;
 1250|    968|    src1 += src1_stride * 2;
 1251|    968|    mask += mask_stride * 2;
 1252|    968|  }
 1253|    116|}
blend_a64_mask_avx2.c:highbd_blend_a64_d16_mask_w16_avx2:
 1165|  2.34k|    const __m256i *mask_max) {
 1166|       |  // Load 16x pixels from each of 2 rows from each source
 1167|  2.34k|  const __m256i s0a = yy_loadu_256(src0);
 1168|  2.34k|  const __m256i s0b = yy_loadu_256(src0 + src0_stride);
 1169|  2.34k|  const __m256i s1a = yy_loadu_256(src1);
 1170|  2.34k|  const __m256i s1b = yy_loadu_256(src1 + src1_stride);
 1171|       |
 1172|       |  // Calculate inverse masks
 1173|  2.34k|  const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a);
 1174|  2.34k|  const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b);
 1175|       |
 1176|       |  // Multiply each source by appropriate mask
 1177|  2.34k|  const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a);
 1178|  2.34k|  const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a);
 1179|  2.34k|  const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs);
 1180|  2.34k|  const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs);
 1181|       |  // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
 1182|       |  // lanes Later, packs does the same again which cancels this out with no need
 1183|       |  // for a permute.  The intermediate values being reordered makes no difference
 1184|       |
 1185|  2.34k|  const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a);
 1186|  2.34k|  const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a);
 1187|  2.34k|  const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs);
 1188|  2.34k|  const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs);
 1189|       |
 1190|  2.34k|  const __m256i mulah = _mm256_add_epi32(mul0ah, mul1ah);
 1191|  2.34k|  const __m256i mulal = _mm256_add_epi32(mul0al, mul1al);
 1192|       |
 1193|  2.34k|  const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b);
 1194|  2.34k|  const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b);
 1195|  2.34k|  const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs);
 1196|  2.34k|  const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs);
 1197|       |
 1198|  2.34k|  const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b);
 1199|  2.34k|  const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b);
 1200|  2.34k|  const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs);
 1201|  2.34k|  const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs);
 1202|       |
 1203|  2.34k|  const __m256i mulbh = _mm256_add_epi32(mul0bh, mul1bh);
 1204|  2.34k|  const __m256i mulbl = _mm256_add_epi32(mul0bl, mul1bl);
 1205|       |
 1206|  2.34k|  const __m256i resah =
 1207|  2.34k|      _mm256_srai_epi32(_mm256_sub_epi32(mulah, *round_offset), shift);
 1208|  2.34k|  const __m256i resal =
 1209|  2.34k|      _mm256_srai_epi32(_mm256_sub_epi32(mulal, *round_offset), shift);
 1210|  2.34k|  const __m256i resbh =
 1211|  2.34k|      _mm256_srai_epi32(_mm256_sub_epi32(mulbh, *round_offset), shift);
 1212|  2.34k|  const __m256i resbl =
 1213|  2.34k|      _mm256_srai_epi32(_mm256_sub_epi32(mulbl, *round_offset), shift);
 1214|       |
 1215|       |  // Signed saturating pack from i32 to i16:
 1216|  2.34k|  const __m256i packa = _mm256_packs_epi32(resal, resah);
 1217|  2.34k|  const __m256i packb = _mm256_packs_epi32(resbl, resbh);
 1218|       |
 1219|       |  // Clip the values to the valid range
 1220|  2.34k|  const __m256i clipa =
 1221|  2.34k|      _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high);
 1222|  2.34k|  const __m256i clipb =
 1223|  2.34k|      _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high);
 1224|       |
 1225|       |  // Store 16 pixels
 1226|  2.34k|  yy_storeu_256(dst, clipa);
 1227|  2.34k|  yy_storeu_256(dst + dst_stride, clipb);
 1228|  2.34k|}
blend_a64_mask_avx2.c:highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2:
  987|     92|    const __m256i *clip_high, const __m256i *mask_max) {
  988|     92|  const __m256i one_b = _mm256_set1_epi8(1);
  989|     92|  const __m256i two_w = _mm256_set1_epi16(2);
  990|    172|  do {
  991|       |    // Load 8 pixels from each of 8 rows of mask,
  992|       |    // (saturating) add together rows then use madd to add adjacent pixels
  993|       |    // Finally, divide each value by 4 (with rounding)
  994|    172|    const __m256i m0246 =
  995|    172|        _mm256_set_epi64x(*(int64_t *)(mask + 6 * mask_stride),
  996|    172|                          *(int64_t *)(mask + 4 * mask_stride),
  997|    172|                          *(int64_t *)(mask + 2 * mask_stride),
  998|    172|                          *(int64_t *)(mask + 0 * mask_stride));
  999|    172|    const __m256i m1357 =
 1000|    172|        _mm256_set_epi64x(*(int64_t *)(mask + 7 * mask_stride),
 1001|    172|                          *(int64_t *)(mask + 5 * mask_stride),
 1002|    172|                          *(int64_t *)(mask + 3 * mask_stride),
 1003|    172|                          *(int64_t *)(mask + 1 * mask_stride));
 1004|    172|    const __m256i addrows = _mm256_adds_epu8(m0246, m1357);
 1005|    172|    const __m256i adjacent = _mm256_maddubs_epi16(addrows, one_b);
 1006|    172|    const __m256i mask0 =
 1007|    172|        _mm256_srli_epi16(_mm256_add_epi16(adjacent, two_w), 2);
 1008|       |
 1009|    172|    highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1,
 1010|    172|                                      src1_stride, &mask0, round_offset, shift,
 1011|    172|                                      clip_low, clip_high, mask_max);
 1012|       |
 1013|    172|    dst += dst_stride * 4;
 1014|    172|    src0 += src0_stride * 4;
 1015|    172|    src1 += src1_stride * 4;
 1016|    172|    mask += mask_stride * 8;
 1017|    172|  } while (h -= 4);
  ------------------
  |  Branch (1017:12): [True: 80, False: 92]
  ------------------
 1018|     92|}
blend_a64_mask_avx2.c:highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2:
 1125|    116|    const __m256i *mask_max) {
 1126|    116|  const __m256i one_b = _mm256_set1_epi8(1);
 1127|    116|  const __m256i two_w = _mm256_set1_epi16(2);
 1128|    212|  do {
 1129|       |    // Load 16x u8 pixels from each of 8 rows in the mask,
 1130|       |    // (saturating) add together rows then use madd to add adjacent pixels
 1131|       |    // Finally, divide each value by 4 (with rounding)
 1132|    212|    const __m256i m02 =
 1133|    212|        yy_loadu2_128(mask + 0 * mask_stride, mask + 2 * mask_stride);
 1134|    212|    const __m256i m13 =
 1135|    212|        yy_loadu2_128(mask + 1 * mask_stride, mask + 3 * mask_stride);
 1136|    212|    const __m256i m0123 =
 1137|    212|        _mm256_maddubs_epi16(_mm256_adds_epu8(m02, m13), one_b);
 1138|    212|    const __m256i mask_0a =
 1139|    212|        _mm256_srli_epi16(_mm256_add_epi16(m0123, two_w), 2);
 1140|    212|    const __m256i m46 =
 1141|    212|        yy_loadu2_128(mask + 4 * mask_stride, mask + 6 * mask_stride);
 1142|    212|    const __m256i m57 =
 1143|    212|        yy_loadu2_128(mask + 5 * mask_stride, mask + 7 * mask_stride);
 1144|    212|    const __m256i m4567 =
 1145|    212|        _mm256_maddubs_epi16(_mm256_adds_epu8(m46, m57), one_b);
 1146|    212|    const __m256i mask_0b =
 1147|    212|        _mm256_srli_epi16(_mm256_add_epi16(m4567, two_w), 2);
 1148|       |
 1149|    212|    highbd_blend_a64_d16_mask_w8_avx2(
 1150|    212|        dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a,
 1151|    212|        &mask_0b, round_offset, shift, clip_low, clip_high, mask_max);
 1152|       |
 1153|    212|    dst += dst_stride * 4;
 1154|    212|    src0 += src0_stride * 4;
 1155|    212|    src1 += src1_stride * 4;
 1156|    212|    mask += mask_stride * 8;
 1157|    212|  } while (h -= 4);
  ------------------
  |  Branch (1157:12): [True: 96, False: 116]
  ------------------
 1158|    116|}
blend_a64_mask_avx2.c:highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2:
 1260|     84|    const __m256i *mask_max) {
 1261|     84|  const __m256i one_b = _mm256_set1_epi8(1);
 1262|     84|  const __m256i two_w = _mm256_set1_epi16(2);
 1263|    500|  for (int i = 0; i < h; i += 2) {
  ------------------
  |  Branch (1263:19): [True: 416, False: 84]
  ------------------
 1264|    960|    for (int j = 0; j < w; j += 16) {
  ------------------
  |  Branch (1264:21): [True: 544, False: 416]
  ------------------
 1265|       |      // Load 32x u8 alpha-mask values from each of four rows
 1266|       |      // (saturating) add pairs of rows, then use madd to add adjacent values
 1267|       |      // Finally, divide down each result with rounding
 1268|    544|      const __m256i m0 = yy_loadu_256(mask + 0 * mask_stride + 2 * j);
 1269|    544|      const __m256i m1 = yy_loadu_256(mask + 1 * mask_stride + 2 * j);
 1270|    544|      const __m256i m2 = yy_loadu_256(mask + 2 * mask_stride + 2 * j);
 1271|    544|      const __m256i m3 = yy_loadu_256(mask + 3 * mask_stride + 2 * j);
 1272|       |
 1273|    544|      const __m256i m01_8 = _mm256_adds_epu8(m0, m1);
 1274|    544|      const __m256i m23_8 = _mm256_adds_epu8(m2, m3);
 1275|       |
 1276|    544|      const __m256i m01 = _mm256_maddubs_epi16(m01_8, one_b);
 1277|    544|      const __m256i m23 = _mm256_maddubs_epi16(m23_8, one_b);
 1278|       |
 1279|    544|      const __m256i mask0a = _mm256_srli_epi16(_mm256_add_epi16(m01, two_w), 2);
 1280|    544|      const __m256i mask0b = _mm256_srli_epi16(_mm256_add_epi16(m23, two_w), 2);
 1281|       |
 1282|    544|      highbd_blend_a64_d16_mask_w16_avx2(
 1283|    544|          dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride,
 1284|    544|          &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max);
 1285|    544|    }
 1286|    416|    dst += dst_stride * 2;
 1287|    416|    src0 += src0_stride * 2;
 1288|    416|    src1 += src1_stride * 2;
 1289|    416|    mask += mask_stride * 4;
 1290|    416|  }
 1291|     84|}

aom_blend_a64_mask_sse4_1:
  389|  16.8k|                               int h, int subw, int subh) {
  390|  16.8k|  typedef void (*blend_fn)(
  391|  16.8k|      uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
  392|  16.8k|      uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
  393|  16.8k|      const uint8_t *mask, uint32_t mask_stride, int w, int h);
  394|       |
  395|       |  // Dimensions are: width_index X subx X suby
  396|  16.8k|  static const blend_fn blend[3][2][2] = {
  397|  16.8k|    { // w % 16 == 0
  398|  16.8k|      { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
  399|  16.8k|      { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
  400|  16.8k|    { // w == 4
  401|  16.8k|      { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
  402|  16.8k|      { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
  403|  16.8k|    { // w == 8
  404|  16.8k|      { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
  405|  16.8k|      { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
  406|  16.8k|  };
  407|       |
  408|  16.8k|  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  409|  16.8k|  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
  410|       |
  411|  16.8k|  assert(h >= 1);
  412|  16.8k|  assert(w >= 1);
  413|  16.8k|  assert(IS_POWER_OF_TWO(h));
  414|  16.8k|  assert(IS_POWER_OF_TWO(w));
  415|       |
  416|  16.8k|  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
  ------------------
  |  |   55|  16.8k|#define UNLIKELY(v) __builtin_expect(v, 0)
  |  |  ------------------
  |  |  |  Branch (55:21): [True: 5.60k, False: 11.2k]
  |  |  ------------------
  ------------------
  417|  5.60k|    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
  418|  5.60k|                         mask, mask_stride, w, h, subw, subh);
  419|  11.2k|  } else {
  420|  11.2k|    blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0,
  421|  11.2k|                                              src0_stride, src1, src1_stride,
  422|  11.2k|                                              mask, mask_stride, w, h);
  423|  11.2k|  }
  424|  16.8k|}
aom_highbd_blend_a64_mask_sse4_1:
  822|  4.95k|                                      int subw, int subh, int bd) {
  823|  4.95k|  typedef void (*blend_fn)(
  824|  4.95k|      uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
  825|  4.95k|      uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
  826|  4.95k|      const uint8_t *mask, uint32_t mask_stride, int w, int h);
  827|       |
  828|       |  // Dimensions are: bd_index X width_index X subw X subh
  829|  4.95k|  static const blend_fn blend[2][2][2][2] = {
  830|  4.95k|    {   // bd == 8 or 10
  831|  4.95k|      { // w % 8 == 0
  832|  4.95k|        { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
  833|  4.95k|        { blend_a64_mask_b10_sx_w8n_sse4_1,
  834|  4.95k|          blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
  835|  4.95k|      { // w == 4
  836|  4.95k|        { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
  837|  4.95k|        { blend_a64_mask_b10_sx_w4_sse4_1,
  838|  4.95k|          blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
  839|  4.95k|    {   // bd == 12
  840|  4.95k|      { // w % 8 == 0
  841|  4.95k|        { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
  842|  4.95k|        { blend_a64_mask_b12_sx_w8n_sse4_1,
  843|  4.95k|          blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
  844|  4.95k|      { // w == 4
  845|  4.95k|        { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
  846|  4.95k|        { blend_a64_mask_b12_sx_w4_sse4_1,
  847|  4.95k|          blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
  848|  4.95k|  };
  849|       |
  850|  4.95k|  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
  851|  4.95k|  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
  852|       |
  853|  4.95k|  assert(h >= 1);
  854|  4.95k|  assert(w >= 1);
  855|  4.95k|  assert(IS_POWER_OF_TWO(h));
  856|  4.95k|  assert(IS_POWER_OF_TWO(w));
  857|       |
  858|  4.95k|  assert(bd == 8 || bd == 10 || bd == 12);
  859|  4.95k|  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
  ------------------
  |  |   55|  4.95k|#define UNLIKELY(v) __builtin_expect(v, 0)
  |  |  ------------------
  |  |  |  Branch (55:21): [True: 1.44k, False: 3.51k]
  |  |  ------------------
  ------------------
  860|  1.44k|    aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
  861|  1.44k|                                src1_stride, mask, mask_stride, w, h, subw,
  862|  1.44k|                                subh, bd);
  863|  3.51k|  } else {
  864|  3.51k|    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
  ------------------
  |  |   75|  3.51k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  865|  3.51k|    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
  ------------------
  |  |   75|  3.51k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  866|  3.51k|    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
  ------------------
  |  |   75|  3.51k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  867|       |
  868|  3.51k|    blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0](
  869|  3.51k|        dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  870|  3.51k|        mask_stride, w, h);
  871|  3.51k|  }
  872|  4.95k|}
blend_a64_mask_sse4.c:blend_a64_mask_w16n_sse4_1:
   76|    744|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   77|    744|  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|    744|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|    744|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
   78|    744|  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  ------------------
  |  |   23|    744|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
   79|       |
   80|  14.0k|  do {
   81|  14.0k|    int c;
   82|  30.0k|    for (c = 0; c < w; c += 16) {
  ------------------
  |  Branch (82:17): [True: 16.0k, False: 14.0k]
  ------------------
   83|  16.0k|      const __m128i v_m0_b = xx_loadu_128(mask + c);
   84|  16.0k|      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
   85|       |
   86|  16.0k|      const __m128i v_res_b =
   87|  16.0k|          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
   88|       |
   89|  16.0k|      xx_storeu_128(dst + c, v_res_b);
   90|  16.0k|    }
   91|  14.0k|    dst += dst_stride;
   92|  14.0k|    src0 += src0_stride;
   93|  14.0k|    src1 += src1_stride;
   94|  14.0k|    mask += mask_stride;
   95|  14.0k|  } while (--h);
  ------------------
  |  Branch (95:12): [True: 13.3k, False: 744]
  ------------------
   96|    744|}
blend_a64_mask_sse4.c:blend_a64_mask_w4_sse4_1:
   35|  7.32k|                                     int w, int h) {
   36|  7.32k|  (void)w;
   37|  7.32k|  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  7.32k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  7.32k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
   38|  7.32k|  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  ------------------
  |  |   23|  7.32k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
   39|  59.4k|  do {
   40|  59.4k|    const __m128i v_m0_b = xx_loadl_32(mask);
   41|  59.4k|    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
   42|  59.4k|    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
   43|  59.4k|    xx_storel_32(dst, v_res_b);
   44|       |
   45|  59.4k|    dst += dst_stride;
   46|  59.4k|    src0 += src0_stride;
   47|  59.4k|    src1 += src1_stride;
   48|  59.4k|    mask += mask_stride;
   49|  59.4k|  } while (--h);
  ------------------
  |  Branch (49:12): [True: 52.1k, False: 7.32k]
  ------------------
   50|  7.32k|}
blend_a64_mask_sse4.c:blend_a64_mask_w8_sse4_1:
   56|  3.21k|                                     int w, int h) {
   57|  3.21k|  (void)w;
   58|  3.21k|  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  3.21k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  3.21k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
   59|  3.21k|  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  ------------------
  |  |   23|  3.21k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
   60|  36.1k|  do {
   61|  36.1k|    const __m128i v_m0_b = xx_loadl_64(mask);
   62|  36.1k|    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
   63|  36.1k|    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
   64|  36.1k|    xx_storel_64(dst, v_res_b);
   65|       |
   66|  36.1k|    dst += dst_stride;
   67|  36.1k|    src0 += src0_stride;
   68|  36.1k|    src1 += src1_stride;
   69|  36.1k|    mask += mask_stride;
   70|  36.1k|  } while (--h);
  ------------------
  |  Branch (70:12): [True: 32.9k, False: 3.21k]
  ------------------
   71|  3.21k|}
blend_a64_mask_sse4.c:blend_a64_mask_b10_w8n_sse4_1:
  499|    909|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  500|    909|  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  501|    909|                               src1_stride, mask, mask_stride, w, h,
  502|    909|                               blend_8_b10);
  503|    909|}
blend_a64_mask_sse4.c:blend_a64_mask_bn_w8n_sse4_1:
  475|  1.44k|    blend_unit_fn blend) {
  476|  1.44k|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  1.44k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  1.44k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  477|       |
  478|  20.5k|  do {
  479|  20.5k|    int c;
  480|  51.0k|    for (c = 0; c < w; c += 8) {
  ------------------
  |  Branch (480:17): [True: 30.4k, False: 20.5k]
  ------------------
  481|  30.4k|      const __m128i v_m0_b = xx_loadl_64(mask + c);
  482|  30.4k|      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
  483|  30.4k|      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
  484|       |
  485|  30.4k|      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
  486|       |
  487|  30.4k|      xx_storeu_128(dst + c, v_res_w);
  488|  30.4k|    }
  489|  20.5k|    dst += dst_stride;
  490|  20.5k|    src0 += src0_stride;
  491|  20.5k|    src1 += src1_stride;
  492|  20.5k|    mask += mask_stride;
  493|  20.5k|  } while (--h);
  ------------------
  |  Branch (493:12): [True: 19.0k, False: 1.44k]
  ------------------
  494|  1.44k|}
blend_a64_mask_sse4.c:blend_a64_mask_bn_sx_w8n_sse4_1:
  568|     32|    blend_unit_fn blend) {
  569|     32|  const __m128i v_zmask_b =
  570|     32|      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  571|     32|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|     32|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|     32|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  572|       |
  573|    544|  do {
  574|    544|    int c;
  575|  1.34k|    for (c = 0; c < w; c += 8) {
  ------------------
  |  Branch (575:17): [True: 800, False: 544]
  ------------------
  576|    800|      const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
  577|    800|      const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
  578|       |
  579|    800|      const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
  580|    800|      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
  581|       |
  582|    800|      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
  583|       |
  584|    800|      xx_storeu_128(dst + c, v_res_w);
  585|    800|    }
  586|    544|    dst += dst_stride;
  587|    544|    src0 += src0_stride;
  588|    544|    src1 += src1_stride;
  589|    544|    mask += mask_stride;
  590|    544|  } while (--h);
  ------------------
  |  Branch (590:12): [True: 512, False: 32]
  ------------------
  591|     32|}
blend_a64_mask_sse4.c:blend_a64_mask_b10_sx_sy_w8n_sse4_1:
  798|      8|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  799|      8|  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  800|      8|                                     src1_stride, mask, mask_stride, w, h,
  801|      8|                                     blend_8_b10);
  802|      8|}
blend_a64_mask_sse4.c:blend_a64_mask_bn_sx_sy_w8n_sse4_1:
  765|      8|    blend_unit_fn blend) {
  766|      8|  const __m128i v_zmask_b =
  767|      8|      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  768|      8|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|      8|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|      8|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  769|       |
  770|     48|  do {
  771|     48|    int c;
  772|     96|    for (c = 0; c < w; c += 8) {
  ------------------
  |  Branch (772:17): [True: 48, False: 48]
  ------------------
  773|     48|      const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
  774|     48|      const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
  775|     48|      const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
  776|     48|      const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
  777|     48|      const __m128i v_rvsb_w =
  778|     48|          _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
  779|     48|      const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
  780|       |
  781|     48|      const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
  782|     48|      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
  783|       |
  784|     48|      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
  785|       |
  786|     48|      xx_storeu_128(dst + c, v_res_w);
  787|     48|    }
  788|     48|    dst += dst_stride;
  789|     48|    src0 += src0_stride;
  790|     48|    src1 += src1_stride;
  791|     48|    mask += 2 * mask_stride;
  792|     48|  } while (--h);
  ------------------
  |  Branch (792:12): [True: 40, False: 8]
  ------------------
  793|      8|}
blend_a64_mask_sse4.c:blend_a64_mask_b10_w4_sse4_1:
  456|  1.67k|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  457|  1.67k|  (void)w;
  458|  1.67k|  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  459|  1.67k|                              src1_stride, mask, mask_stride, h, blend_4_b10);
  460|  1.67k|}
blend_a64_mask_sse4.c:blend_a64_mask_bn_w4_sse4_1:
  434|  2.01k|    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  435|  2.01k|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  2.01k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  2.01k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  436|       |
  437|  16.1k|  do {
  438|  16.1k|    const __m128i v_m0_b = xx_loadl_32(mask);
  439|  16.1k|    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
  440|  16.1k|    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
  441|       |
  442|  16.1k|    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
  443|       |
  444|  16.1k|    xx_storel_64(dst, v_res_w);
  445|       |
  446|  16.1k|    dst += dst_stride;
  447|  16.1k|    src0 += src0_stride;
  448|  16.1k|    src1 += src1_stride;
  449|  16.1k|    mask += mask_stride;
  450|  16.1k|  } while (--h);
  ------------------
  |  Branch (450:12): [True: 14.1k, False: 2.01k]
  ------------------
  451|  2.01k|}
blend_a64_mask_sse4.c:blend_a64_mask_bn_sx_w4_sse4_1:
  521|      4|    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  522|      4|  const __m128i v_zmask_b =
  523|      4|      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  524|      4|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|      4|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|      4|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  525|       |
  526|     32|  do {
  527|     32|    const __m128i v_r_b = xx_loadl_64(mask);
  528|     32|    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
  529|       |
  530|     32|    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
  531|     32|    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
  532|       |
  533|     32|    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
  534|       |
  535|     32|    xx_storel_64(dst, v_res_w);
  536|       |
  537|     32|    dst += dst_stride;
  538|     32|    src0 += src0_stride;
  539|     32|    src1 += src1_stride;
  540|     32|    mask += mask_stride;
  541|     32|  } while (--h);
  ------------------
  |  Branch (541:12): [True: 28, False: 4]
  ------------------
  542|      4|}
blend_a64_mask_sse4.c:blend_a64_mask_b10_sx_sy_w4_sse4_1:
  744|      8|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  745|      8|  (void)w;
  746|      8|  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  747|      8|                                    src1_stride, mask, mask_stride, h,
  748|      8|                                    blend_4_b10);
  749|      8|}
blend_a64_mask_sse4.c:blend_a64_mask_bn_sx_sy_w4_sse4_1:
  713|      8|    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  714|      8|  const __m128i v_zmask_b =
  715|      8|      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  716|      8|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|      8|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|      8|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  717|       |
  718|     48|  do {
  719|     48|    const __m128i v_ra_b = xx_loadl_64(mask);
  720|     48|    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
  721|     48|    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
  722|     48|    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
  723|     48|    const __m128i v_rvsb_w =
  724|     48|        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
  725|     48|    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
  726|       |
  727|     48|    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
  728|     48|    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
  729|       |
  730|     48|    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
  731|       |
  732|     48|    xx_storel_64(dst, v_res_w);
  733|       |
  734|     48|    dst += dst_stride;
  735|     48|    src0 += src0_stride;
  736|     48|    src1 += src1_stride;
  737|     48|    mask += 2 * mask_stride;
  738|     48|  } while (--h);
  ------------------
  |  Branch (738:12): [True: 40, False: 8]
  ------------------
  739|      8|}
blend_a64_mask_sse4.c:blend_a64_mask_b12_w8n_sse4_1:
  508|    540|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  509|    540|  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  510|    540|                               src1_stride, mask, mask_stride, w, h,
  511|    540|                               blend_8_b12);
  512|    540|}
blend_a64_mask_sse4.c:blend_a64_mask_b12_sx_w8n_sse4_1:
  605|     32|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  606|     32|  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  607|     32|                                  src1_stride, mask, mask_stride, w, h,
  608|     32|                                  blend_8_b12);
  609|     32|}
blend_a64_mask_sse4.c:blend_a64_mask_b12_w4_sse4_1:
  465|    338|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  466|    338|  (void)w;
  467|    338|  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  468|    338|                              src1_stride, mask, mask_stride, h, blend_4_b12);
  469|    338|}
blend_a64_mask_sse4.c:blend_a64_mask_b12_sx_w4_sse4_1:
  557|      4|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  558|      4|  (void)w;
  559|      4|  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  560|      4|                                 src1_stride, mask, mask_stride, h,
  561|      4|                                 blend_4_b12);
  562|      4|}

aom_blend_a64_vmask_sse4_1:
  115|  8.56k|                                const uint8_t *mask, int w, int h) {
  116|  8.56k|  typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride,
  117|  8.56k|                           const uint8_t *src0, uint32_t src0_stride,
  118|  8.56k|                           const uint8_t *src1, uint32_t src1_stride,
  119|  8.56k|                           const uint8_t *mask, int w, int h);
  120|       |
  121|       |  // Dimension: width_index
  122|  8.56k|  static const blend_fn blend[9] = {
  123|  8.56k|    blend_a64_vmask_w16n_sse4_1,  // w % 16 == 0
  124|  8.56k|    aom_blend_a64_vmask_c,        // w == 1
  125|  8.56k|    aom_blend_a64_vmask_c,        // w == 2
  126|  8.56k|    NULL,                         // INVALID
  127|  8.56k|    blend_a64_vmask_w4_sse4_1,    // w == 4
  128|  8.56k|    NULL,                         // INVALID
  129|  8.56k|    NULL,                         // INVALID
  130|  8.56k|    NULL,                         // INVALID
  131|  8.56k|    blend_a64_vmask_w8_sse4_1,    // w == 8
  132|  8.56k|  };
  133|       |
  134|  8.56k|  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  135|  8.56k|  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
  136|       |
  137|  8.56k|  assert(h >= 1);
  138|  8.56k|  assert(w >= 1);
  139|  8.56k|  assert(IS_POWER_OF_TWO(h));
  140|  8.56k|  assert(IS_POWER_OF_TWO(w));
  141|       |
  142|  8.56k|  blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w,
  143|  8.56k|                 h);
  144|  8.56k|}
aom_highbd_blend_a64_vmask_sse4_1:
  243|  2.38k|    const uint8_t *mask, int w, int h, int bd) {
  244|  2.38k|  typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
  245|  2.38k|                           const uint16_t *src0, uint32_t src0_stride,
  246|  2.38k|                           const uint16_t *src1, uint32_t src1_stride,
  247|  2.38k|                           const uint8_t *mask, int w, int h);
  248|       |
  249|       |  // Dimensions are: bd_index X width_index
  250|  2.38k|  static const blend_fn blend[2][2] = {
  251|  2.38k|    {
  252|       |        // bd == 8 or 10
  253|  2.38k|        blend_a64_vmask_b10_w8n_sse4_1,  // w % 8 == 0
  254|  2.38k|        blend_a64_vmask_b10_w4_sse4_1,   // w == 4
  255|  2.38k|    },
  256|  2.38k|    {
  257|       |        // bd == 12
  258|  2.38k|        blend_a64_vmask_b12_w8n_sse4_1,  // w % 8 == 0
  259|  2.38k|        blend_a64_vmask_b12_w4_sse4_1,   // w == 4
  260|  2.38k|    }
  261|  2.38k|  };
  262|       |
  263|  2.38k|  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
  264|  2.38k|  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
  265|       |
  266|  2.38k|  assert(h >= 1);
  267|  2.38k|  assert(w >= 1);
  268|  2.38k|  assert(IS_POWER_OF_TWO(h));
  269|  2.38k|  assert(IS_POWER_OF_TWO(w));
  270|       |
  271|  2.38k|  assert(bd == 8 || bd == 10 || bd == 12);
  272|       |
  273|  2.38k|  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
  ------------------
  |  |   55|  2.38k|#define UNLIKELY(v) __builtin_expect(v, 0)
  |  |  ------------------
  |  |  |  Branch (55:21): [True: 68, False: 2.32k]
  |  |  ------------------
  ------------------
  274|     68|    aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
  275|     68|                                 src1_stride, mask, w, h, bd);
  276|  2.32k|  } else {
  277|  2.32k|    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
  ------------------
  |  |   75|  2.32k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  278|  2.32k|    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
  ------------------
  |  |   75|  2.32k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  279|  2.32k|    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
  ------------------
  |  |   75|  2.32k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  280|       |
  281|  2.32k|    blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
  282|  2.32k|                                  src1_stride, mask, w, h);
  283|  2.32k|  }
  284|  2.38k|}
blend_a64_vmask_sse4.c:blend_a64_vmask_w16n_sse4_1:
   85|  2.92k|                                        const uint8_t *mask, int w, int h) {
   86|  2.92k|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  2.92k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  2.92k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
   87|       |
   88|  18.6k|  do {
   89|  18.6k|    int c;
   90|  18.6k|    const __m128i v_m0_w = _mm_set1_epi16(*mask);
   91|  18.6k|    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
   92|  43.5k|    for (c = 0; c < w; c += 16) {
  ------------------
  |  Branch (92:17): [True: 24.8k, False: 18.6k]
  ------------------
   93|  24.8k|      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
   94|  24.8k|      const __m128i v_resh_w =
   95|  24.8k|          blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
   96|       |
   97|  24.8k|      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
   98|       |
   99|  24.8k|      xx_storeu_128(dst + c, v_res_b);
  100|  24.8k|    }
  101|  18.6k|    dst += dst_stride;
  102|  18.6k|    src0 += src0_stride;
  103|  18.6k|    src1 += src1_stride;
  104|  18.6k|    mask += 1;
  105|  18.6k|  } while (--h);
  ------------------
  |  Branch (105:12): [True: 15.7k, False: 2.92k]
  ------------------
  106|  2.92k|}
blend_a64_vmask_sse4.c:blend_a64_vmask_w4_sse4_1:
   33|    784|                                      const uint8_t *mask, int w, int h) {
   34|    784|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|    784|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|    784|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
   35|       |
   36|    784|  (void)w;
   37|       |
   38|  4.65k|  do {
   39|  4.65k|    const __m128i v_m0_w = _mm_set1_epi16(*mask);
   40|  4.65k|    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
   41|       |
   42|  4.65k|    const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
   43|       |
   44|  4.65k|    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
   45|       |
   46|  4.65k|    xx_storel_32(dst, v_res_b);
   47|       |
   48|  4.65k|    dst += dst_stride;
   49|  4.65k|    src0 += src0_stride;
   50|  4.65k|    src1 += src1_stride;
   51|  4.65k|    mask += 1;
   52|  4.65k|  } while (--h);
  ------------------
  |  Branch (52:12): [True: 3.87k, False: 784]
  ------------------
   53|    784|}
blend_a64_vmask_sse4.c:blend_a64_vmask_w8_sse4_1:
   58|  4.85k|                                      const uint8_t *mask, int w, int h) {
   59|  4.85k|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  4.85k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  4.85k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
   60|       |
   61|  4.85k|  (void)w;
   62|       |
   63|  26.5k|  do {
   64|  26.5k|    const __m128i v_m0_w = _mm_set1_epi16(*mask);
   65|  26.5k|    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
   66|       |
   67|  26.5k|    const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
   68|       |
   69|  26.5k|    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
   70|       |
   71|  26.5k|    xx_storel_64(dst, v_res_b);
   72|       |
   73|  26.5k|    dst += dst_stride;
   74|  26.5k|    src0 += src0_stride;
   75|  26.5k|    src1 += src1_stride;
   76|  26.5k|    mask += 1;
   77|  26.5k|  } while (--h);
  ------------------
  |  Branch (77:12): [True: 21.7k, False: 4.85k]
  ------------------
   78|  4.85k|}
blend_a64_vmask_sse4.c:blend_a64_vmask_b10_w8n_sse4_1:
  221|  1.59k|                                           const uint8_t *mask, int w, int h) {
  222|  1.59k|  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  223|  1.59k|                                src1_stride, mask, w, h, blend_8_b10);
  224|  1.59k|}
blend_a64_vmask_sse4.c:blend_a64_vmask_bn_w8n_sse4_1:
  197|  2.16k|    const uint8_t *mask, int w, int h, blend_unit_fn blend) {
  198|  2.16k|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  2.16k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  2.16k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  199|       |
  200|  12.3k|  do {
  201|  12.3k|    int c;
  202|  12.3k|    const __m128i v_m0_w = _mm_set1_epi16(*mask);
  203|  12.3k|    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
  204|  32.0k|    for (c = 0; c < w; c += 8) {
  ------------------
  |  Branch (204:17): [True: 19.7k, False: 12.3k]
  ------------------
  205|  19.7k|      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
  206|       |
  207|  19.7k|      xx_storeu_128(dst + c, v_res_w);
  208|  19.7k|    }
  209|  12.3k|    dst += dst_stride;
  210|  12.3k|    src0 += src0_stride;
  211|  12.3k|    src1 += src1_stride;
  212|  12.3k|    mask += 1;
  213|  12.3k|  } while (--h);
  ------------------
  |  Branch (213:12): [True: 10.2k, False: 2.16k]
  ------------------
  214|  2.16k|}
blend_a64_vmask_sse4.c:blend_a64_vmask_b10_w4_sse4_1:
  177|    128|                                          const uint8_t *mask, int w, int h) {
  178|    128|  (void)w;
  179|    128|  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  180|    128|                               src1_stride, mask, h, blend_4_b10);
  181|    128|}
blend_a64_vmask_sse4.c:blend_a64_vmask_bn_w4_sse4_1:
  154|    160|    const uint8_t *mask, int h, blend_unit_fn blend) {
  155|    160|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|    160|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|    160|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  156|       |
  157|    912|  do {
  158|    912|    const __m128i v_m0_w = _mm_set1_epi16(*mask);
  159|    912|    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
  160|       |
  161|    912|    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
  162|       |
  163|    912|    xx_storel_64(dst, v_res_w);
  164|       |
  165|    912|    dst += dst_stride;
  166|    912|    src0 += src0_stride;
  167|    912|    src1 += src1_stride;
  168|    912|    mask += 1;
  169|    912|  } while (--h);
  ------------------
  |  Branch (169:12): [True: 752, False: 160]
  ------------------
  170|    160|}
blend_a64_vmask_sse4.c:blend_a64_vmask_b12_w8n_sse4_1:
  231|    564|                                           const uint8_t *mask, int w, int h) {
  232|    564|  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  233|    564|                                src1_stride, mask, w, h, blend_8_b12);
  234|    564|}
blend_a64_vmask_sse4.c:blend_a64_vmask_b12_w4_sse4_1:
  188|     32|                                          const uint8_t *mask, int w, int h) {
  189|     32|  (void)w;
  190|     32|  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  191|     32|                               src1_stride, mask, h, blend_4_b12);
  192|     32|}

blend_a64_mask_avx2.c:blend_a64_d16_mask_w4_sse41:
   30|  3.88k|    int shift) {
   31|  3.88k|  const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
   32|  3.88k|  const __m128i s0 = xx_loadl_64(src0);
   33|  3.88k|  const __m128i s1 = xx_loadl_64(src1);
   34|  3.88k|  const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
   35|  3.88k|  const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
   36|  3.88k|  const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
   37|  3.88k|  const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset);
   38|  3.88k|  const __m128i res_d = _mm_srai_epi32(res_c, shift);
   39|  3.88k|  const __m128i res_e = _mm_packs_epi32(res_d, res_d);
   40|  3.88k|  const __m128i res = _mm_packus_epi16(res_e, res_e);
   41|       |
   42|  3.88k|  xx_storel_32(dst, res);
   43|  3.88k|}
blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1:
   87|    548|    const __m128i *round_offset, int shift) {
   88|    548|  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|    548|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|    548|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
   89|  6.89k|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (89:19): [True: 6.34k, False: 548]
  ------------------
   90|  6.34k|    const __m128i m0 = xx_loadl_64(mask);
   91|  6.34k|    const __m128i m = _mm_cvtepu8_epi16(m0);
   92|  6.34k|    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
   93|  6.34k|                                shift);
   94|  6.34k|    mask += mask_stride;
   95|  6.34k|    dst += dst_stride;
   96|  6.34k|    src0 += src0_stride;
   97|  6.34k|    src1 += src1_stride;
   98|  6.34k|  }
   99|    548|}
blend_a64_mask_avx2.c:blend_a64_d16_mask_w8_sse41:
   48|  11.5k|    int shift) {
   49|  11.5k|  const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
   50|  11.5k|  const __m128i s0 = xx_loadu_128(src0);
   51|  11.5k|  const __m128i s1 = xx_loadu_128(src1);
   52|  11.5k|  __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1),
   53|  11.5k|                                  _mm_unpacklo_epi16(*m, max_minus_m));
   54|  11.5k|  __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1),
   55|  11.5k|                                  _mm_unpackhi_epi16(*m, max_minus_m));
   56|  11.5k|  res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift);
   57|  11.5k|  res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift);
   58|  11.5k|  const __m128i res_e = _mm_packs_epi32(res_lo, res_hi);
   59|  11.5k|  const __m128i res = _mm_packus_epi16(res_e, res_e);
   60|       |
   61|  11.5k|  _mm_storel_epi64((__m128i *)(dst), res);
   62|  11.5k|}
blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1:
  105|    604|    const __m128i *round_offset, int shift) {
  106|    604|  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|    604|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|    604|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  107|    604|  const __m128i one_b = _mm_set1_epi8(1);
  108|    604|  const __m128i two_w = _mm_set1_epi16(2);
  109|  4.49k|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (109:19): [True: 3.88k, False: 604]
  ------------------
  110|  3.88k|    const __m128i m_i0 = xx_loadl_64(mask);
  111|  3.88k|    const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
  112|  3.88k|    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
  113|  3.88k|    const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
  114|  3.88k|    const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
  115|  3.88k|    const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
  116|       |
  117|  3.88k|    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
  118|  3.88k|                                shift);
  119|  3.88k|    mask += mask_stride << 1;
  120|  3.88k|    dst += dst_stride;
  121|  3.88k|    src0 += src0_stride;
  122|  3.88k|    src1 += src1_stride;
  123|  3.88k|  }
  124|    604|}
blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1:
  130|    620|    const __m128i *round_offset, int shift) {
  131|    620|  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|    620|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|    620|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  132|    620|  const __m128i one_b = _mm_set1_epi8(1);
  133|    620|  const __m128i two_w = _mm_set1_epi16(2);
  134|  5.78k|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (134:19): [True: 5.16k, False: 620]
  ------------------
  135|  5.16k|    const __m128i m_i0 = xx_loadu_128(mask);
  136|  5.16k|    const __m128i m_i1 = xx_loadu_128(mask + mask_stride);
  137|  5.16k|    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
  138|  5.16k|    const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
  139|  5.16k|    const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
  140|  5.16k|    const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
  141|       |
  142|  5.16k|    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
  143|  5.16k|                                shift);
  144|  5.16k|    mask += mask_stride << 1;
  145|  5.16k|    dst += dst_stride;
  146|  5.16k|    src0 += src0_stride;
  147|  5.16k|    src1 += src1_stride;
  148|  5.16k|  }
  149|    620|}

blend_a64_mask_sse4.c:blend_16_u8:
   88|  16.0k|                                  const __m128i *rounding) {
   89|  16.0k|  const __m128i v_s0_b = xx_loadu_128(src0);
   90|  16.0k|  const __m128i v_s1_b = xx_loadu_128(src1);
   91|       |
   92|  16.0k|  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
   93|  16.0k|                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
   94|  16.0k|  const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
   95|  16.0k|                                           _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
   96|       |
   97|  16.0k|  const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
   98|  16.0k|  const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
   99|  16.0k|  const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
  100|  16.0k|  return v_res;
  101|  16.0k|}
blend_a64_mask_sse4.c:blend_4_u8:
   60|  59.4k|                                 const __m128i *rounding) {
   61|  59.4k|  const __m128i v_s0_b = xx_loadl_32(src0);
   62|  59.4k|  const __m128i v_s1_b = xx_loadl_32(src1);
   63|       |
   64|  59.4k|  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
   65|  59.4k|                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
   66|       |
   67|  59.4k|  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
   68|  59.4k|  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
   69|  59.4k|  return v_res;
   70|  59.4k|}
blend_a64_mask_sse4.c:blend_8_u8:
   74|  36.1k|                                 const __m128i *rounding) {
   75|  36.1k|  const __m128i v_s0_b = xx_loadl_64(src0);
   76|  36.1k|  const __m128i v_s1_b = xx_loadl_64(src1);
   77|       |
   78|  36.1k|  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
   79|  36.1k|                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
   80|       |
   81|  36.1k|  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
   82|  36.1k|  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
   83|  36.1k|  return v_res;
   84|  36.1k|}
blend_a64_mask_sse4.c:blend_8_b10:
  122|  16.0k|                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  123|  16.0k|  const __m128i v_s0_w = xx_loadu_128(src0);
  124|  16.0k|  const __m128i v_s1_w = xx_loadu_128(src1);
  125|       |
  126|  16.0k|  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
  127|  16.0k|  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
  128|       |
  129|  16.0k|  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
  130|       |
  131|  16.0k|  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|  16.0k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  132|       |
  133|  16.0k|  return v_res_w;
  134|  16.0k|}
blend_a64_mask_sse4.c:blend_4_b10:
  107|  13.0k|                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  108|  13.0k|  const __m128i v_s0_w = xx_loadl_64(src0);
  109|  13.0k|  const __m128i v_s1_w = xx_loadl_64(src1);
  110|       |
  111|  13.0k|  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
  112|  13.0k|  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
  113|       |
  114|  13.0k|  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
  115|       |
  116|  13.0k|  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|  13.0k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  117|       |
  118|  13.0k|  return v_res_w;
  119|  13.0k|}
blend_a64_mask_sse4.c:blend_8_b12:
  162|  15.3k|                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  163|  15.3k|  const __m128i v_s0_w = xx_loadu_128(src0);
  164|  15.3k|  const __m128i v_s1_w = xx_loadu_128(src1);
  165|       |
  166|       |  // Interleave
  167|  15.3k|  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
  168|  15.3k|  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
  169|  15.3k|  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
  170|  15.3k|  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
  171|       |
  172|       |  // Multiply-Add
  173|  15.3k|  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
  174|  15.3k|  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
  175|       |
  176|       |  // Scale
  177|  15.3k|  const __m128i v_ssuml_d =
  178|  15.3k|      _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
  ------------------
  |  |   23|  15.3k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  179|  15.3k|  const __m128i v_ssumh_d =
  180|  15.3k|      _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
  ------------------
  |  |   23|  15.3k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  181|       |
  182|       |  // Pack
  183|  15.3k|  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
  184|       |
  185|       |  // Round
  186|  15.3k|  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
  187|       |
  188|  15.3k|  return v_res_w;
  189|  15.3k|}
blend_a64_mask_sse4.c:blend_4_b12:
  137|  3.21k|                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  138|  3.21k|  const __m128i v_s0_w = xx_loadl_64(src0);
  139|  3.21k|  const __m128i v_s1_w = xx_loadl_64(src1);
  140|       |
  141|       |  // Interleave
  142|  3.21k|  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
  143|  3.21k|  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
  144|       |
  145|       |  // Multiply-Add
  146|  3.21k|  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
  147|       |
  148|       |  // Scale
  149|  3.21k|  const __m128i v_ssum_d =
  150|  3.21k|      _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
  ------------------
  |  |   23|  3.21k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  151|       |
  152|       |  // Pack
  153|  3.21k|  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
  154|       |
  155|       |  // Round
  156|  3.21k|  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
  157|       |
  158|  3.21k|  return v_res_w;
  159|  3.21k|}
blend_a64_vmask_sse4.c:blend_8:
   42|  76.3k|                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
   43|  76.3k|  const __m128i v_s0_b = xx_loadl_64(src0);
   44|  76.3k|  const __m128i v_s1_b = xx_loadl_64(src1);
   45|  76.3k|  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
   46|  76.3k|  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
   47|       |
   48|  76.3k|  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
   49|  76.3k|  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
   50|       |
   51|  76.3k|  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
   52|       |
   53|  76.3k|  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|  76.3k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
   54|       |
   55|  76.3k|  return v_res_w;
   56|  76.3k|}
blend_a64_vmask_sse4.c:blend_4:
   27|  4.65k|                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
   28|  4.65k|  const __m128i v_s0_b = xx_loadl_32(src0);
   29|  4.65k|  const __m128i v_s1_b = xx_loadl_32(src1);
   30|  4.65k|  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
   31|  4.65k|  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
   32|       |
   33|  4.65k|  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
   34|  4.65k|  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
   35|  4.65k|  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
   36|  4.65k|  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|  4.65k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
   37|       |
   38|  4.65k|  return v_res_w;
   39|  4.65k|}
blend_a64_vmask_sse4.c:blend_8_b10:
  122|  14.1k|                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  123|  14.1k|  const __m128i v_s0_w = xx_loadu_128(src0);
  124|  14.1k|  const __m128i v_s1_w = xx_loadu_128(src1);
  125|       |
  126|  14.1k|  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
  127|  14.1k|  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
  128|       |
  129|  14.1k|  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
  130|       |
  131|  14.1k|  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|  14.1k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  132|       |
  133|  14.1k|  return v_res_w;
  134|  14.1k|}
blend_a64_vmask_sse4.c:blend_4_b10:
  107|    720|                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  108|    720|  const __m128i v_s0_w = xx_loadl_64(src0);
  109|    720|  const __m128i v_s1_w = xx_loadl_64(src1);
  110|       |
  111|    720|  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
  112|    720|  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
  113|       |
  114|    720|  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
  115|       |
  116|    720|  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|    720|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  117|       |
  118|    720|  return v_res_w;
  119|    720|}
blend_a64_vmask_sse4.c:blend_8_b12:
  162|  5.56k|                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  163|  5.56k|  const __m128i v_s0_w = xx_loadu_128(src0);
  164|  5.56k|  const __m128i v_s1_w = xx_loadu_128(src1);
  165|       |
  166|       |  // Interleave
  167|  5.56k|  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
  168|  5.56k|  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
  169|  5.56k|  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
  170|  5.56k|  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
  171|       |
  172|       |  // Multiply-Add
  173|  5.56k|  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
  174|  5.56k|  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
  175|       |
  176|       |  // Scale
  177|  5.56k|  const __m128i v_ssuml_d =
  178|  5.56k|      _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
  ------------------
  |  |   23|  5.56k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  179|  5.56k|  const __m128i v_ssumh_d =
  180|  5.56k|      _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
  ------------------
  |  |   23|  5.56k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  181|       |
  182|       |  // Pack
  183|  5.56k|  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
  184|       |
  185|       |  // Round
  186|  5.56k|  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
  187|       |
  188|  5.56k|  return v_res_w;
  189|  5.56k|}
blend_a64_vmask_sse4.c:blend_4_b12:
  137|    192|                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  138|    192|  const __m128i v_s0_w = xx_loadl_64(src0);
  139|    192|  const __m128i v_s1_w = xx_loadl_64(src1);
  140|       |
  141|       |  // Interleave
  142|    192|  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
  143|    192|  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
  144|       |
  145|       |  // Multiply-Add
  146|    192|  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
  147|       |
  148|       |  // Scale
  149|    192|  const __m128i v_ssum_d =
  150|    192|      _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
  ------------------
  |  |   23|    192|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  151|       |
  152|       |  // Pack
  153|    192|  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
  154|       |
  155|       |  // Round
  156|    192|  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
  157|       |
  158|    192|  return v_res_w;
  159|    192|}
blend_a64_mask_avx2.c:blend_4_u8:
   60|  7.24k|                                 const __m128i *rounding) {
   61|  7.24k|  const __m128i v_s0_b = xx_loadl_32(src0);
   62|  7.24k|  const __m128i v_s1_b = xx_loadl_32(src1);
   63|       |
   64|  7.24k|  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
   65|  7.24k|                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
   66|       |
   67|  7.24k|  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
   68|  7.24k|  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
   69|  7.24k|  return v_res;
   70|  7.24k|}
blend_a64_mask_avx2.c:blend_8_u8:
   74|  16.2k|                                 const __m128i *rounding) {
   75|  16.2k|  const __m128i v_s0_b = xx_loadl_64(src0);
   76|  16.2k|  const __m128i v_s1_b = xx_loadl_64(src1);
   77|       |
   78|  16.2k|  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
   79|  16.2k|                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
   80|       |
   81|  16.2k|  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
   82|  16.2k|  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
   83|  16.2k|  return v_res;
   84|  16.2k|}
blend_a64_mask_avx2.c:blend_16_u8:
   88|  9.94k|                                  const __m128i *rounding) {
   89|  9.94k|  const __m128i v_s0_b = xx_loadu_128(src0);
   90|  9.94k|  const __m128i v_s1_b = xx_loadu_128(src1);
   91|       |
   92|  9.94k|  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
   93|  9.94k|                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
   94|  9.94k|  const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
   95|  9.94k|                                           _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
   96|       |
   97|  9.94k|  const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
   98|  9.94k|  const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
   99|  9.94k|  const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
  100|  9.94k|  return v_res;
  101|  9.94k|}

highbd_convolve_avx2.c:prepare_coeffs:
  683|  7.35k|                                  __m256i *const coeffs /* [4] */) {
  684|  7.35k|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  685|  7.35k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  7.35k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  7.35k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  686|       |
  687|  7.35k|  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  688|  7.35k|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  689|       |
  690|       |  // coeffs 0 1 0 1 0 1 0 1
  691|  7.35k|  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  692|       |  // coeffs 2 3 2 3 2 3 2 3
  693|  7.35k|  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
  694|       |  // coeffs 4 5 4 5 4 5 4 5
  695|  7.35k|  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
  696|       |  // coeffs 6 7 6 7 6 7 6 7
  697|  7.35k|  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
  698|  7.35k|}
highbd_convolve_avx2.c:convolve:
  790|   167k|                               const __m256i *const coeffs) {
  791|   167k|  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
  792|   167k|  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
  793|   167k|  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
  794|   167k|  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
  795|       |
  796|   167k|  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
  797|   167k|                                       _mm256_add_epi32(res_2, res_3));
  798|       |
  799|   167k|  return res;
  800|   167k|}
convolve_avx2.c:prepare_coeffs_lowbd:
  612|    616|    __m256i *const coeffs /* [4] */) {
  613|    616|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  614|    616|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|    616|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|    616|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  615|    616|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  616|    616|  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
  617|       |
  618|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  619|       |  // This extra right shift will be taken care of at the end while rounding
  620|       |  // the result.
  621|       |  // Since all filter co-efficients are even, this change will not affect the
  622|       |  // end result
  623|    616|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  624|    616|                            _mm_set1_epi16((short)0xffff)));
  625|       |
  626|    616|  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
  627|       |
  628|       |  // coeffs 0 1 0 1 0 1 0 1
  629|    616|  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
  630|       |  // coeffs 2 3 2 3 2 3 2 3
  631|    616|  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
  632|       |  // coeffs 4 5 4 5 4 5 4 5
  633|    616|  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
  634|       |  // coeffs 6 7 6 7 6 7 6 7
  635|    616|  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
  636|    616|}
convolve_avx2.c:convolve_lowbd_4tap:
  752|  10.3k|                                          const __m256i *const coeffs) {
  753|  10.3k|  const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
  754|  10.3k|  const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
  755|       |
  756|       |  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  757|  10.3k|  const __m256i res = _mm256_add_epi16(res_45, res_23);
  758|       |
  759|  10.3k|  return res;
  760|  10.3k|}
convolve_avx2.c:convolve_lowbd_x_4tap:
  838|  10.3k|                                            const __m256i *const filt) {
  839|  10.3k|  __m256i s[2];
  840|       |
  841|  10.3k|  s[0] = _mm256_shuffle_epi8(data, filt[0]);
  842|  10.3k|  s[1] = _mm256_shuffle_epi8(data, filt[1]);
  843|       |
  844|  10.3k|  return convolve_lowbd_4tap(s, coeffs);
  845|  10.3k|}
jnt_convolve_avx2.c:prepare_coeffs_lowbd:
  612|  16.5k|    __m256i *const coeffs /* [4] */) {
  613|  16.5k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  614|  16.5k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  16.5k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  16.5k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  615|  16.5k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  616|  16.5k|  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
  617|       |
  618|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  619|       |  // This extra right shift will be taken care of at the end while rounding
  620|       |  // the result.
  621|       |  // Since all filter co-efficients are even, this change will not affect the
  622|       |  // end result
  623|  16.5k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  624|  16.5k|                            _mm_set1_epi16((short)0xffff)));
  625|       |
  626|  16.5k|  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
  627|       |
  628|       |  // coeffs 0 1 0 1 0 1 0 1
  629|  16.5k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
  630|       |  // coeffs 2 3 2 3 2 3 2 3
  631|  16.5k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
  632|       |  // coeffs 4 5 4 5 4 5 4 5
  633|  16.5k|  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
  634|       |  // coeffs 6 7 6 7 6 7 6 7
  635|  16.5k|  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
  636|  16.5k|}
jnt_convolve_avx2.c:convolve_lowbd_x_4tap:
  838|  44.4k|                                            const __m256i *const filt) {
  839|  44.4k|  __m256i s[2];
  840|       |
  841|  44.4k|  s[0] = _mm256_shuffle_epi8(data, filt[0]);
  842|  44.4k|  s[1] = _mm256_shuffle_epi8(data, filt[1]);
  843|       |
  844|  44.4k|  return convolve_lowbd_4tap(s, coeffs);
  845|  44.4k|}
jnt_convolve_avx2.c:comp_avg:
  864|   225k|                               const int use_dist_wtd_comp_avg) {
  865|   225k|  __m256i res;
  866|   225k|  if (use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (866:7): [True: 42.0k, False: 183k]
  ------------------
  867|  42.0k|    const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
  868|  42.0k|    const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
  869|       |
  870|  42.0k|    const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
  871|  42.0k|    const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
  872|       |
  873|  42.0k|    const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
  ------------------
  |  |   76|  42.0k|#define DIST_PRECISION_BITS 4
  ------------------
  874|  42.0k|    const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
  ------------------
  |  |   76|  42.0k|#define DIST_PRECISION_BITS 4
  ------------------
  875|       |
  876|  42.0k|    res = _mm256_packs_epi32(res_lo, res_hi);
  877|   183k|  } else {
  878|   183k|    const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
  879|   183k|    res = _mm256_srai_epi16(wt_res, 1);
  880|   183k|  }
  881|   225k|  return res;
  882|   225k|}
jnt_convolve_avx2.c:convolve_rounding:
  887|   225k|                                        const int round_shift) {
  888|   225k|  const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
  889|   225k|  const __m256i res_round = _mm256_srai_epi16(
  890|   225k|      _mm256_add_epi16(res_signed, *round_const), round_shift);
  891|   225k|  return res_round;
  892|   225k|}
jnt_convolve_avx2.c:convolve_lowbd_x:
  813|   181k|                                       const __m256i *const filt) {
  814|   181k|  __m256i s[4];
  815|       |
  816|   181k|  s[0] = _mm256_shuffle_epi8(data, filt[0]);
  817|   181k|  s[1] = _mm256_shuffle_epi8(data, filt[1]);
  818|   181k|  s[2] = _mm256_shuffle_epi8(data, filt[2]);
  819|   181k|  s[3] = _mm256_shuffle_epi8(data, filt[3]);
  820|       |
  821|   181k|  return convolve_lowbd(s, coeffs);
  822|   181k|}
jnt_convolve_avx2.c:convolve_lowbd_4tap:
  752|  50.3k|                                          const __m256i *const coeffs) {
  753|  50.3k|  const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
  754|  50.3k|  const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
  755|       |
  756|       |  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  757|  50.3k|  const __m256i res = _mm256_add_epi16(res_45, res_23);
  758|       |
  759|  50.3k|  return res;
  760|  50.3k|}
jnt_convolve_avx2.c:convolve_lowbd:
  725|   216k|                                     const __m256i *const coeffs) {
  726|   216k|  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
  727|   216k|  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
  728|   216k|  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
  729|   216k|  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
  730|       |
  731|       |  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  732|   216k|  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
  733|   216k|                                       _mm256_add_epi16(res_23, res_67));
  734|       |
  735|   216k|  return res;
  736|   216k|}
jnt_convolve_avx2.c:prepare_coeffs:
  683|  9.41k|                                  __m256i *const coeffs /* [4] */) {
  684|  9.41k|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  685|  9.41k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  9.41k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  9.41k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  686|       |
  687|  9.41k|  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  688|  9.41k|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  689|       |
  690|       |  // coeffs 0 1 0 1 0 1 0 1
  691|  9.41k|  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  692|       |  // coeffs 2 3 2 3 2 3 2 3
  693|  9.41k|  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
  694|       |  // coeffs 4 5 4 5 4 5 4 5
  695|  9.41k|  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
  696|       |  // coeffs 6 7 6 7 6 7 6 7
  697|       |  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
  698|  9.41k|}
jnt_convolve_avx2.c:convolve:
  790|   198k|                               const __m256i *const coeffs) {
  791|   198k|  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
  792|   198k|  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
  793|   198k|  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
  794|   198k|  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
  795|       |
  796|   198k|  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
  797|   198k|                                       _mm256_add_epi32(res_2, res_3));
  798|       |
  799|   198k|  return res;
  800|   198k|}
jnt_convolve_avx2.c:convolve_4tap:
  803|  22.3k|                                    const __m256i *const coeffs) {
  804|  22.3k|  const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]);
  805|  22.3k|  const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]);
  806|       |
  807|  22.3k|  const __m256i res = _mm256_add_epi32(res_1, res_2);
  808|  22.3k|  return res;
  809|  22.3k|}
wiener_convolve_avx2.c:convolve_lowbd_x:
  813|   288k|                                       const __m256i *const filt) {
  814|   288k|  __m256i s[4];
  815|       |
  816|   288k|  s[0] = _mm256_shuffle_epi8(data, filt[0]);
  817|   288k|  s[1] = _mm256_shuffle_epi8(data, filt[1]);
  818|   288k|  s[2] = _mm256_shuffle_epi8(data, filt[2]);
  819|   288k|  s[3] = _mm256_shuffle_epi8(data, filt[3]);
  820|       |
  821|   288k|  return convolve_lowbd(s, coeffs);
  822|   288k|}
wiener_convolve_avx2.c:convolve_lowbd:
  725|   288k|                                     const __m256i *const coeffs) {
  726|   288k|  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
  727|   288k|  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
  728|   288k|  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
  729|   288k|  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
  730|       |
  731|       |  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  732|   288k|  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
  733|   288k|                                       _mm256_add_epi16(res_23, res_67));
  734|       |
  735|   288k|  return res;
  736|   288k|}
wiener_convolve_avx2.c:convolve:
  790|   514k|                               const __m256i *const coeffs) {
  791|   514k|  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
  792|   514k|  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
  793|   514k|  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
  794|   514k|  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
  795|       |
  796|   514k|  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
  797|   514k|                                       _mm256_add_epi32(res_2, res_3));
  798|       |
  799|   514k|  return res;
  800|   514k|}
highbd_convolve_2d_avx2.c:prepare_coeffs:
  683|  18.7k|                                  __m256i *const coeffs /* [4] */) {
  684|  18.7k|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  685|  18.7k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  18.7k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  18.7k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  686|       |
  687|  18.7k|  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  688|  18.7k|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  689|       |
  690|       |  // coeffs 0 1 0 1 0 1 0 1
  691|  18.7k|  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  692|       |  // coeffs 2 3 2 3 2 3 2 3
  693|  18.7k|  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
  694|       |  // coeffs 4 5 4 5 4 5 4 5
  695|  18.7k|  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
  696|       |  // coeffs 6 7 6 7 6 7 6 7
  697|       |  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
  698|  18.7k|}
highbd_convolve_2d_avx2.c:convolve:
  790|   366k|                               const __m256i *const coeffs) {
  791|   366k|  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
  792|   366k|  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
  793|   366k|  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
  794|   366k|  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
  795|       |
  796|   366k|  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
  797|   366k|                                       _mm256_add_epi32(res_2, res_3));
  798|       |
  799|   366k|  return res;
  800|   366k|}
highbd_jnt_convolve_avx2.c:highbd_comp_avg:
  898|  61.2k|                                      const int use_dist_wtd_comp_avg) {
  899|  61.2k|  __m256i res;
  900|  61.2k|  if (use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (900:7): [True: 6.72k, False: 54.4k]
  ------------------
  901|  6.72k|    const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
  902|  6.72k|    const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
  903|  6.72k|    const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
  904|  6.72k|    res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
  ------------------
  |  |   76|  6.72k|#define DIST_PRECISION_BITS 4
  ------------------
  905|  54.4k|  } else {
  906|  54.4k|    const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
  907|  54.4k|    res = _mm256_srai_epi32(wt_res, 1);
  908|  54.4k|  }
  909|  61.2k|  return res;
  910|  61.2k|}
highbd_jnt_convolve_avx2.c:highbd_convolve_rounding:
  914|  61.2k|    const __m256i *const round_const, const int round_shift) {
  915|  61.2k|  const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
  916|  61.2k|  const __m256i res_round = _mm256_srai_epi32(
  917|  61.2k|      _mm256_add_epi32(res_signed, *round_const), round_shift);
  918|       |
  919|  61.2k|  return res_round;
  920|  61.2k|}
highbd_jnt_convolve_avx2.c:prepare_coeffs:
  683|  6.24k|                                  __m256i *const coeffs /* [4] */) {
  684|  6.24k|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  685|  6.24k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  6.24k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  6.24k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  686|       |
  687|  6.24k|  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  688|  6.24k|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  689|       |
  690|       |  // coeffs 0 1 0 1 0 1 0 1
  691|  6.24k|  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  692|       |  // coeffs 2 3 2 3 2 3 2 3
  693|  6.24k|  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
  694|       |  // coeffs 4 5 4 5 4 5 4 5
  695|  6.24k|  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
  696|       |  // coeffs 6 7 6 7 6 7 6 7
  697|       |  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
  698|  6.24k|}
highbd_jnt_convolve_avx2.c:convolve:
  790|   194k|                               const __m256i *const coeffs) {
  791|   194k|  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
  792|   194k|  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
  793|   194k|  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
  794|   194k|  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
  795|       |
  796|   194k|  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
  797|   194k|                                       _mm256_add_epi32(res_2, res_3));
  798|       |
  799|   194k|  return res;
  800|   194k|}

av1_highbd_convolve_y_sr_avx2:
   44|  3.03k|                                   const int subpel_y_qn, int bd) {
   45|  3.03k|  if (filter_params_y->taps == 12) {
  ------------------
  |  Branch (45:7): [True: 0, False: 3.03k]
  ------------------
   46|      0|    av1_highbd_convolve_y_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
   47|      0|                                   filter_params_y, subpel_y_qn, bd);
   48|      0|    return;
   49|      0|  }
   50|  3.03k|  int i, j;
   51|  3.03k|  const int fo_vert = filter_params_y->taps / 2 - 1;
   52|  3.03k|  const uint16_t *const src_ptr = src - fo_vert * src_stride;
   53|       |
   54|  3.03k|  __m256i s[8], coeffs_y[4];
   55|       |
   56|  3.03k|  const int bits = FILTER_BITS;
  ------------------
  |  |   21|  3.03k|#define FILTER_BITS 7
  ------------------
   57|       |
   58|  3.03k|  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
   59|  3.03k|  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
   60|  3.03k|  const __m256i clip_pixel =
   61|  3.03k|      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (61:25): [True: 2.56k, False: 476]
  |  Branch (61:44): [True: 476, False: 0]
  ------------------
   62|  3.03k|  const __m256i zero = _mm256_setzero_si256();
   63|       |
   64|  3.03k|  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
   65|       |
   66|  7.25k|  for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (66:15): [True: 4.21k, False: 3.03k]
  ------------------
   67|  4.21k|    const uint16_t *data = &src_ptr[j];
   68|       |    /* Vertical filter */
   69|  4.21k|    {
   70|  4.21k|      __m256i src6;
   71|  4.21k|      __m256i s01 = _mm256_permute2x128_si256(
   72|  4.21k|          _mm256_castsi128_si256(
   73|  4.21k|              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
   74|  4.21k|          _mm256_castsi128_si256(
   75|  4.21k|              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
   76|  4.21k|          0x20);
   77|  4.21k|      __m256i s12 = _mm256_permute2x128_si256(
   78|  4.21k|          _mm256_castsi128_si256(
   79|  4.21k|              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
   80|  4.21k|          _mm256_castsi128_si256(
   81|  4.21k|              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
   82|  4.21k|          0x20);
   83|  4.21k|      __m256i s23 = _mm256_permute2x128_si256(
   84|  4.21k|          _mm256_castsi128_si256(
   85|  4.21k|              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
   86|  4.21k|          _mm256_castsi128_si256(
   87|  4.21k|              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
   88|  4.21k|          0x20);
   89|  4.21k|      __m256i s34 = _mm256_permute2x128_si256(
   90|  4.21k|          _mm256_castsi128_si256(
   91|  4.21k|              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
   92|  4.21k|          _mm256_castsi128_si256(
   93|  4.21k|              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
   94|  4.21k|          0x20);
   95|  4.21k|      __m256i s45 = _mm256_permute2x128_si256(
   96|  4.21k|          _mm256_castsi128_si256(
   97|  4.21k|              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
   98|  4.21k|          _mm256_castsi128_si256(
   99|  4.21k|              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
  100|  4.21k|          0x20);
  101|  4.21k|      src6 = _mm256_castsi128_si256(
  102|  4.21k|          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
  103|  4.21k|      __m256i s56 = _mm256_permute2x128_si256(
  104|  4.21k|          _mm256_castsi128_si256(
  105|  4.21k|              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
  106|  4.21k|          src6, 0x20);
  107|       |
  108|  4.21k|      s[0] = _mm256_unpacklo_epi16(s01, s12);
  109|  4.21k|      s[1] = _mm256_unpacklo_epi16(s23, s34);
  110|  4.21k|      s[2] = _mm256_unpacklo_epi16(s45, s56);
  111|       |
  112|  4.21k|      s[4] = _mm256_unpackhi_epi16(s01, s12);
  113|  4.21k|      s[5] = _mm256_unpackhi_epi16(s23, s34);
  114|  4.21k|      s[6] = _mm256_unpackhi_epi16(s45, s56);
  115|       |
  116|  29.2k|      for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (116:19): [True: 25.0k, False: 4.21k]
  ------------------
  117|  25.0k|        data = &src_ptr[i * src_stride + j];
  118|       |
  119|  25.0k|        const __m256i s67 = _mm256_permute2x128_si256(
  120|  25.0k|            src6,
  121|  25.0k|            _mm256_castsi128_si256(
  122|  25.0k|                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
  123|  25.0k|            0x20);
  124|       |
  125|  25.0k|        src6 = _mm256_castsi128_si256(
  126|  25.0k|            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
  127|       |
  128|  25.0k|        const __m256i s78 = _mm256_permute2x128_si256(
  129|  25.0k|            _mm256_castsi128_si256(
  130|  25.0k|                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
  131|  25.0k|            src6, 0x20);
  132|       |
  133|  25.0k|        s[3] = _mm256_unpacklo_epi16(s67, s78);
  134|  25.0k|        s[7] = _mm256_unpackhi_epi16(s67, s78);
  135|       |
  136|  25.0k|        const __m256i res_a = convolve(s, coeffs_y);
  137|       |
  138|  25.0k|        __m256i res_a_round = _mm256_sra_epi32(
  139|  25.0k|            _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
  140|       |
  141|  25.0k|        if (w - j > 4) {
  ------------------
  |  Branch (141:13): [True: 21.7k, False: 3.36k]
  ------------------
  142|  21.7k|          const __m256i res_b = convolve(s + 4, coeffs_y);
  143|  21.7k|          __m256i res_b_round = _mm256_sra_epi32(
  144|  21.7k|              _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);
  145|       |
  146|  21.7k|          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
  147|  21.7k|          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
  148|  21.7k|          res_16bit = _mm256_max_epi16(res_16bit, zero);
  149|       |
  150|  21.7k|          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
  151|  21.7k|                           _mm256_castsi256_si128(res_16bit));
  152|  21.7k|          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
  153|  21.7k|                           _mm256_extracti128_si256(res_16bit, 1));
  154|  21.7k|        } else if (w == 4) {
  ------------------
  |  Branch (154:20): [True: 2.76k, False: 608]
  ------------------
  155|  2.76k|          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
  156|  2.76k|          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
  157|  2.76k|          res_a_round = _mm256_max_epi16(res_a_round, zero);
  158|       |
  159|  2.76k|          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
  160|  2.76k|                           _mm256_castsi256_si128(res_a_round));
  161|  2.76k|          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
  162|  2.76k|                           _mm256_extracti128_si256(res_a_round, 1));
  163|  2.76k|        } else {
  164|    608|          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
  165|    608|          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
  166|    608|          res_a_round = _mm256_max_epi16(res_a_round, zero);
  167|       |
  168|    608|          xx_storel_32(&dst[i * dst_stride + j],
  169|    608|                       _mm256_castsi256_si128(res_a_round));
  170|    608|          xx_storel_32(&dst[i * dst_stride + j + dst_stride],
  171|    608|                       _mm256_extracti128_si256(res_a_round, 1));
  172|    608|        }
  173|       |
  174|  25.0k|        s[0] = s[1];
  175|  25.0k|        s[1] = s[2];
  176|  25.0k|        s[2] = s[3];
  177|       |
  178|  25.0k|        s[4] = s[5];
  179|  25.0k|        s[5] = s[6];
  180|  25.0k|        s[6] = s[7];
  181|  25.0k|      }
  182|  4.21k|    }
  183|  4.21k|  }
  184|  3.03k|}
av1_highbd_convolve_x_sr_avx2:
  190|  4.32k|                                   ConvolveParams *conv_params, int bd) {
  191|  4.32k|  if (filter_params_x->taps == 12) {
  ------------------
  |  Branch (191:7): [True: 0, False: 4.32k]
  ------------------
  192|      0|    av1_highbd_convolve_x_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
  193|      0|                                   filter_params_x, subpel_x_qn, conv_params,
  194|      0|                                   bd);
  195|      0|    return;
  196|      0|  }
  197|  4.32k|  int i, j;
  198|  4.32k|  const int fo_horiz = filter_params_x->taps / 2 - 1;
  199|  4.32k|  const uint16_t *const src_ptr = src - fo_horiz;
  200|       |
  201|       |  // Check that, even with 12-bit input, the intermediate values will fit
  202|       |  // into an unsigned 16-bit intermediate array.
  203|  4.32k|  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
  204|       |
  205|  4.32k|  __m256i s[4], coeffs_x[4];
  206|       |
  207|  4.32k|  const __m256i round_const_x =
  208|  4.32k|      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
  209|  4.32k|  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
  210|       |
  211|  4.32k|  const int bits = FILTER_BITS - conv_params->round_0;
  ------------------
  |  |   21|  4.32k|#define FILTER_BITS 7
  ------------------
  212|  4.32k|  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  213|  4.32k|  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  214|  4.32k|  const __m256i clip_pixel =
  215|  4.32k|      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (215:25): [True: 3.08k, False: 1.24k]
  |  Branch (215:44): [True: 1.24k, False: 0]
  ------------------
  216|  4.32k|  const __m256i zero = _mm256_setzero_si256();
  217|       |
  218|  4.32k|  assert(bits >= 0);
  219|  4.32k|  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
  220|  4.32k|         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
  221|       |
  222|  4.32k|  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
  223|       |
  224|  11.0k|  for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (224:15): [True: 6.73k, False: 4.32k]
  ------------------
  225|       |    /* Horizontal filter */
  226|  67.2k|    for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (226:17): [True: 60.5k, False: 6.73k]
  ------------------
  227|  60.5k|      const __m256i row0 =
  228|  60.5k|          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
  229|  60.5k|      __m256i row1 =
  230|  60.5k|          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
  231|       |
  232|  60.5k|      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
  233|  60.5k|      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
  234|       |
  235|       |      // even pixels
  236|  60.5k|      s[0] = _mm256_alignr_epi8(r1, r0, 0);
  237|  60.5k|      s[1] = _mm256_alignr_epi8(r1, r0, 4);
  238|  60.5k|      s[2] = _mm256_alignr_epi8(r1, r0, 8);
  239|  60.5k|      s[3] = _mm256_alignr_epi8(r1, r0, 12);
  240|       |
  241|  60.5k|      __m256i res_even = convolve(s, coeffs_x);
  242|  60.5k|      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
  243|  60.5k|                                  round_shift_x);
  244|       |
  245|       |      // odd pixels
  246|  60.5k|      s[0] = _mm256_alignr_epi8(r1, r0, 2);
  247|  60.5k|      s[1] = _mm256_alignr_epi8(r1, r0, 6);
  248|  60.5k|      s[2] = _mm256_alignr_epi8(r1, r0, 10);
  249|  60.5k|      s[3] = _mm256_alignr_epi8(r1, r0, 14);
  250|       |
  251|  60.5k|      __m256i res_odd = convolve(s, coeffs_x);
  252|  60.5k|      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
  253|  60.5k|                                 round_shift_x);
  254|       |
  255|  60.5k|      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),
  256|  60.5k|                                  round_shift_bits);
  257|  60.5k|      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
  258|  60.5k|                                 round_shift_bits);
  259|       |
  260|  60.5k|      __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
  261|  60.5k|      __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
  262|       |
  263|  60.5k|      __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
  264|  60.5k|      res = _mm256_min_epi16(res, clip_pixel);
  265|  60.5k|      res = _mm256_max_epi16(res, zero);
  266|       |
  267|  60.5k|      if (w - j > 4) {
  ------------------
  |  Branch (267:11): [True: 54.8k, False: 5.64k]
  ------------------
  268|  54.8k|        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
  269|  54.8k|                         _mm256_castsi256_si128(res));
  270|  54.8k|        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
  271|  54.8k|                         _mm256_extracti128_si256(res, 1));
  272|  54.8k|      } else if (w == 4) {
  ------------------
  |  Branch (272:18): [True: 4.96k, False: 680]
  ------------------
  273|  4.96k|        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
  274|  4.96k|                         _mm256_castsi256_si128(res));
  275|  4.96k|        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
  276|  4.96k|                         _mm256_extracti128_si256(res, 1));
  277|  4.96k|      } else {
  278|    680|        xx_storel_32(&dst[i * dst_stride + j], _mm256_castsi256_si128(res));
  279|    680|        xx_storel_32(&dst[i * dst_stride + j + dst_stride],
  280|       |                     _mm256_extracti128_si256(res, 1));
  281|    680|      }
  282|  60.5k|    }
  283|  6.73k|  }
  284|  4.32k|}

aom_highbd_h_predictor_4x4_sse2:
   21|  50.2k|                                     const uint16_t *left, int bd) {
   22|  50.2k|  const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
   23|  50.2k|  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
   24|  50.2k|  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
   25|  50.2k|  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
   26|       |  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
   27|  50.2k|  (void)above;
   28|  50.2k|  (void)bd;
   29|  50.2k|  _mm_storel_epi64((__m128i *)dst, row0);
   30|  50.2k|  dst += stride;
   31|  50.2k|  _mm_storel_epi64((__m128i *)dst, row1);
   32|  50.2k|  dst += stride;
   33|  50.2k|  _mm_storel_epi64((__m128i *)dst, row2);
   34|  50.2k|  dst += stride;
   35|  50.2k|  _mm_storel_epi64((__m128i *)dst, row3);
   36|  50.2k|}
aom_highbd_h_predictor_4x8_sse2:
   40|  8.55k|                                     const uint16_t *left, int bd) {
   41|  8.55k|  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
   42|  8.55k|  dst += stride << 2;
   43|  8.55k|  left += 4;
   44|  8.55k|  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
   45|  8.55k|}
aom_highbd_h_predictor_8x4_sse2:
   49|  12.5k|                                     const uint16_t *left, int bd) {
   50|  12.5k|  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
   51|  12.5k|  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
   52|  12.5k|  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
   53|  12.5k|  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
   54|       |  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
   55|  12.5k|  (void)above;
   56|  12.5k|  (void)bd;
   57|  12.5k|  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
   58|  12.5k|  dst += stride;
   59|  12.5k|  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
   60|  12.5k|  dst += stride;
   61|  12.5k|  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
   62|  12.5k|  dst += stride;
   63|  12.5k|  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
   64|  12.5k|}
aom_highbd_h_predictor_8x8_sse2:
   68|  27.9k|                                     const uint16_t *left, int bd) {
   69|  27.9k|  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
   70|  27.9k|  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
   71|  27.9k|  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
   72|  27.9k|  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
   73|  27.9k|  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
   74|  27.9k|  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
   75|  27.9k|  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
   76|  27.9k|  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
   77|       |  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
   78|  27.9k|  (void)above;
   79|  27.9k|  (void)bd;
   80|  27.9k|  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
   81|  27.9k|  dst += stride;
   82|  27.9k|  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
   83|  27.9k|  dst += stride;
   84|  27.9k|  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
   85|  27.9k|  dst += stride;
   86|  27.9k|  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
   87|  27.9k|  dst += stride;
   88|  27.9k|  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
   89|  27.9k|  dst += stride;
   90|  27.9k|  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
   91|  27.9k|  dst += stride;
   92|  27.9k|  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
   93|  27.9k|  dst += stride;
   94|  27.9k|  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
   95|  27.9k|}
aom_highbd_h_predictor_8x16_sse2:
   99|  3.07k|                                      const uint16_t *left, int bd) {
  100|  3.07k|  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
  101|  3.07k|  dst += stride << 3;
  102|  3.07k|  left += 8;
  103|  3.07k|  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
  104|  3.07k|}
aom_highbd_h_predictor_16x8_sse2:
  145|  4.66k|                                      const uint16_t *left, int bd) {
  146|  4.66k|  (void)above;
  147|  4.66k|  (void)bd;
  148|  4.66k|  h_predictor_16x8(dst, stride, left);
  149|  4.66k|}
aom_highbd_h_predictor_16x16_sse2:
  153|  7.69k|                                       const uint16_t *left, int bd) {
  154|  7.69k|  int i;
  155|  7.69k|  (void)above;
  156|  7.69k|  (void)bd;
  157|       |
  158|  23.0k|  for (i = 0; i < 2; i++, left += 8) {
  ------------------
  |  Branch (158:15): [True: 15.3k, False: 7.69k]
  ------------------
  159|  15.3k|    h_predictor_16x8(dst, stride, left);
  160|  15.3k|    dst += stride << 3;
  161|  15.3k|  }
  162|  7.69k|}
aom_highbd_h_predictor_16x32_sse2:
  166|  1.16k|                                       const uint16_t *left, int bd) {
  167|  1.16k|  int i;
  168|  1.16k|  (void)above;
  169|  1.16k|  (void)bd;
  170|       |
  171|  5.84k|  for (i = 0; i < 4; i++, left += 8) {
  ------------------
  |  Branch (171:15): [True: 4.67k, False: 1.16k]
  ------------------
  172|  4.67k|    h_predictor_16x8(dst, stride, left);
  173|  4.67k|    dst += stride << 3;
  174|  4.67k|  }
  175|  1.16k|}
aom_highbd_h_predictor_32x16_sse2:
  220|  1.00k|                                       const uint16_t *left, int bd) {
  221|  1.00k|  int i;
  222|  1.00k|  (void)above;
  223|  1.00k|  (void)bd;
  224|       |
  225|  3.00k|  for (i = 0; i < 2; i++, left += 8) {
  ------------------
  |  Branch (225:15): [True: 2.00k, False: 1.00k]
  ------------------
  226|  2.00k|    h_predictor_32x8(dst, stride, left);
  227|  2.00k|    dst += stride << 3;
  228|  2.00k|  }
  229|  1.00k|}
aom_highbd_h_predictor_32x32_sse2:
  233|  1.80k|                                       const uint16_t *left, int bd) {
  234|  1.80k|  int i;
  235|  1.80k|  (void)above;
  236|  1.80k|  (void)bd;
  237|       |
  238|  9.02k|  for (i = 0; i < 4; i++, left += 8) {
  ------------------
  |  Branch (238:15): [True: 7.22k, False: 1.80k]
  ------------------
  239|  7.22k|    h_predictor_32x8(dst, stride, left);
  240|  7.22k|    dst += stride << 3;
  241|  7.22k|  }
  242|  1.80k|}
aom_highbd_dc_left_predictor_4x4_sse2:
  267|  4.54k|                                           const uint16_t *left, int bd) {
  268|  4.54k|  const __m128i two = _mm_cvtsi32_si128(2);
  269|  4.54k|  const __m128i sum = dc_sum_4(left);
  270|  4.54k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
  271|  4.54k|  (void)above;
  272|  4.54k|  (void)bd;
  273|  4.54k|  dc_store_4x4(dst, stride, &dc);
  274|  4.54k|}
aom_highbd_dc_top_predictor_4x4_sse2:
  278|  5.24k|                                          const uint16_t *left, int bd) {
  279|  5.24k|  const __m128i two = _mm_cvtsi32_si128(2);
  280|  5.24k|  const __m128i sum = dc_sum_4(above);
  281|  5.24k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
  282|  5.24k|  (void)left;
  283|  5.24k|  (void)bd;
  284|  5.24k|  dc_store_4x4(dst, stride, &dc);
  285|  5.24k|}
aom_highbd_dc_128_predictor_4x4_sse2:
  289|    351|                                          const uint16_t *left, int bd) {
  290|    351|  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  291|       |  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  292|    351|  (void)above;
  293|    351|  (void)left;
  294|    351|  dc_store_4x4(dst, stride, &dc_dup);
  295|    351|}
aom_highbd_dc_left_predictor_4x8_sse2:
  321|  1.04k|                                           const uint16_t *left, int bd) {
  322|  1.04k|  const __m128i sum = dc_sum_8(left);
  323|  1.04k|  const __m128i four = _mm_cvtsi32_si128(4);
  324|  1.04k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
  325|  1.04k|  (void)above;
  326|  1.04k|  (void)bd;
  327|  1.04k|  dc_store_4x8(dst, stride, &dc);
  328|  1.04k|}
aom_highbd_dc_top_predictor_4x8_sse2:
  332|    720|                                          const uint16_t *left, int bd) {
  333|    720|  const __m128i two = _mm_cvtsi32_si128(2);
  334|    720|  const __m128i sum = dc_sum_4(above);
  335|    720|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
  336|    720|  (void)left;
  337|    720|  (void)bd;
  338|    720|  dc_store_4x8(dst, stride, &dc);
  339|    720|}
aom_highbd_dc_128_predictor_4x8_sse2:
  343|     10|                                          const uint16_t *left, int bd) {
  344|     10|  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  345|       |  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  346|     10|  (void)above;
  347|     10|  (void)left;
  348|     10|  dc_store_4x8(dst, stride, &dc_dup);
  349|     10|}
aom_highbd_dc_top_predictor_8x4_sse2:
  377|  1.99k|                                          const uint16_t *left, int bd) {
  378|  1.99k|  (void)left;
  379|  1.99k|  (void)bd;
  380|  1.99k|  dc_top_predictor_8xh(dst, stride, 4, above);
  381|  1.99k|}
aom_highbd_dc_top_predictor_8x8_sse2:
  385|  5.44k|                                          const uint16_t *left, int bd) {
  386|  5.44k|  (void)left;
  387|  5.44k|  (void)bd;
  388|  5.44k|  dc_top_predictor_8xh(dst, stride, 8, above);
  389|  5.44k|}
aom_highbd_dc_top_predictor_8x16_sse2:
  393|  1.85k|                                           const uint16_t *left, int bd) {
  394|  1.85k|  (void)left;
  395|  1.85k|  (void)bd;
  396|  1.85k|  dc_top_predictor_8xh(dst, stride, 16, above);
  397|  1.85k|}
aom_highbd_dc_left_predictor_8x4_sse2:
  404|    846|                                           const uint16_t *left, int bd) {
  405|    846|  const __m128i two = _mm_cvtsi32_si128(2);
  406|    846|  const __m128i sum = dc_sum_4(left);
  407|    846|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
  408|    846|  (void)above;
  409|    846|  (void)bd;
  410|    846|  dc_store_8xh(dst, stride, 4, &dc);
  411|    846|}
aom_highbd_dc_left_predictor_8x8_sse2:
  415|  5.41k|                                           const uint16_t *left, int bd) {
  416|  5.41k|  const __m128i four = _mm_cvtsi32_si128(4);
  417|  5.41k|  const __m128i sum = dc_sum_8(left);
  418|  5.41k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
  419|  5.41k|  (void)above;
  420|  5.41k|  (void)bd;
  421|  5.41k|  dc_store_8xh(dst, stride, 8, &dc);
  422|  5.41k|}
aom_highbd_dc_left_predictor_8x16_sse2:
  433|  2.88k|                                            const uint16_t *left, int bd) {
  434|  2.88k|  const __m128i eight = _mm_cvtsi32_si128(8);
  435|  2.88k|  const __m128i sum = dc_sum_16(left);
  436|  2.88k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  437|  2.88k|  (void)above;
  438|  2.88k|  (void)bd;
  439|  2.88k|  dc_store_8xh(dst, stride, 16, &dc);
  440|  2.88k|}
aom_highbd_dc_128_predictor_8x4_sse2:
  454|     18|                                          const uint16_t *left, int bd) {
  455|     18|  (void)above;
  456|     18|  (void)left;
  457|     18|  dc_128_predictor_8xh(dst, stride, 4, bd);
  458|     18|}
aom_highbd_dc_128_predictor_8x8_sse2:
  462|  1.21k|                                          const uint16_t *left, int bd) {
  463|  1.21k|  (void)above;
  464|  1.21k|  (void)left;
  465|  1.21k|  dc_128_predictor_8xh(dst, stride, 8, bd);
  466|  1.21k|}
aom_highbd_dc_128_predictor_8x16_sse2:
  470|     58|                                           const uint16_t *left, int bd) {
  471|     58|  (void)above;
  472|     58|  (void)left;
  473|     58|  dc_128_predictor_8xh(dst, stride, 16, bd);
  474|     58|}
aom_highbd_dc_left_predictor_16x8_sse2:
  495|  1.74k|                                            const uint16_t *left, int bd) {
  496|  1.74k|  const __m128i four = _mm_cvtsi32_si128(4);
  497|  1.74k|  const __m128i sum = dc_sum_8(left);
  498|  1.74k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
  499|  1.74k|  (void)above;
  500|  1.74k|  (void)bd;
  501|  1.74k|  dc_store_16xh(dst, stride, 8, &dc);
  502|  1.74k|}
aom_highbd_dc_left_predictor_16x16_sse2:
  506|  8.07k|                                             const uint16_t *left, int bd) {
  507|  8.07k|  const __m128i eight = _mm_cvtsi32_si128(8);
  508|  8.07k|  const __m128i sum = dc_sum_16(left);
  509|  8.07k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  510|  8.07k|  (void)above;
  511|  8.07k|  (void)bd;
  512|  8.07k|  dc_store_16xh(dst, stride, 16, &dc);
  513|  8.07k|}
aom_highbd_dc_left_predictor_16x32_sse2:
  527|  2.85k|                                             const uint16_t *left, int bd) {
  528|  2.85k|  const __m128i sixteen = _mm_cvtsi32_si128(16);
  529|  2.85k|  const __m128i sum = dc_sum_32(left);
  530|  2.85k|  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
  531|  2.85k|  (void)above;
  532|  2.85k|  (void)bd;
  533|  2.85k|  dc_store_16xh(dst, stride, 32, &dc);
  534|  2.85k|}
aom_highbd_dc_top_predictor_16x8_sse2:
  541|  2.34k|                                           const uint16_t *left, int bd) {
  542|  2.34k|  const __m128i eight = _mm_cvtsi32_si128(8);
  543|  2.34k|  const __m128i sum = dc_sum_16(above);
  544|  2.34k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  545|  2.34k|  (void)left;
  546|  2.34k|  (void)bd;
  547|  2.34k|  dc_store_16xh(dst, stride, 8, &dc);
  548|  2.34k|}
aom_highbd_dc_top_predictor_16x16_sse2:
  552|  5.41k|                                            const uint16_t *left, int bd) {
  553|  5.41k|  const __m128i eight = _mm_cvtsi32_si128(8);
  554|  5.41k|  const __m128i sum = dc_sum_16(above);
  555|  5.41k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  556|  5.41k|  (void)left;
  557|  5.41k|  (void)bd;
  558|  5.41k|  dc_store_16xh(dst, stride, 16, &dc);
  559|  5.41k|}
aom_highbd_dc_top_predictor_16x32_sse2:
  563|  1.89k|                                            const uint16_t *left, int bd) {
  564|  1.89k|  const __m128i eight = _mm_cvtsi32_si128(8);
  565|  1.89k|  const __m128i sum = dc_sum_16(above);
  566|  1.89k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  567|  1.89k|  (void)left;
  568|  1.89k|  (void)bd;
  569|  1.89k|  dc_store_16xh(dst, stride, 32, &dc);
  570|  1.89k|}
aom_highbd_dc_128_predictor_16x8_sse2:
  577|    132|                                           const uint16_t *left, int bd) {
  578|    132|  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  579|       |  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  580|    132|  (void)above;
  581|    132|  (void)left;
  582|    132|  dc_store_16xh(dst, stride, 8, &dc_dup);
  583|    132|}
aom_highbd_dc_128_predictor_16x16_sse2:
  587|  3.60k|                                            const uint16_t *left, int bd) {
  588|  3.60k|  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  589|       |  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  590|  3.60k|  (void)above;
  591|  3.60k|  (void)left;
  592|  3.60k|  dc_store_16xh(dst, stride, 16, &dc_dup);
  593|  3.60k|}
aom_highbd_dc_128_predictor_16x32_sse2:
  597|    608|                                            const uint16_t *left, int bd) {
  598|    608|  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  599|       |  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  600|    608|  (void)above;
  601|    608|  (void)left;
  602|    608|  dc_store_16xh(dst, stride, 32, &dc_dup);
  603|    608|}
aom_highbd_dc_left_predictor_32x16_sse2:
  623|  1.28k|                                             const uint16_t *left, int bd) {
  624|  1.28k|  const __m128i eight = _mm_cvtsi32_si128(8);
  625|  1.28k|  const __m128i sum = dc_sum_16(left);
  626|  1.28k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  627|  1.28k|  (void)above;
  628|  1.28k|  (void)bd;
  629|  1.28k|  dc_store_32xh(dst, stride, 16, &dc);
  630|  1.28k|}
aom_highbd_dc_left_predictor_32x32_sse2:
  634|  11.5k|                                             const uint16_t *left, int bd) {
  635|  11.5k|  const __m128i sixteen = _mm_cvtsi32_si128(16);
  636|  11.5k|  const __m128i sum = dc_sum_32(left);
  637|  11.5k|  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
  638|  11.5k|  (void)above;
  639|  11.5k|  (void)bd;
  640|  11.5k|  dc_store_32xh(dst, stride, 32, &dc);
  641|  11.5k|}
aom_highbd_dc_top_predictor_32x16_sse2:
  645|  3.27k|                                            const uint16_t *left, int bd) {
  646|  3.27k|  const __m128i sixteen = _mm_cvtsi32_si128(16);
  647|  3.27k|  const __m128i sum = dc_sum_32(above);
  648|  3.27k|  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
  649|  3.27k|  (void)left;
  650|  3.27k|  (void)bd;
  651|  3.27k|  dc_store_32xh(dst, stride, 16, &dc);
  652|  3.27k|}
aom_highbd_dc_128_predictor_32x16_sse2:
  656|    126|                                            const uint16_t *left, int bd) {
  657|    126|  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  658|       |  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  659|    126|  (void)above;
  660|    126|  (void)left;
  661|    126|  dc_store_32xh(dst, stride, 16, &dc_dup);
  662|    126|}
aom_highbd_dc_top_predictor_32x32_sse2:
  666|  8.80k|                                            const uint16_t *left, int bd) {
  667|  8.80k|  const __m128i sixteen = _mm_cvtsi32_si128(16);
  668|  8.80k|  const __m128i sum = dc_sum_32(above);
  669|  8.80k|  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
  670|  8.80k|  (void)left;
  671|  8.80k|  (void)bd;
  672|  8.80k|  dc_store_32xh(dst, stride, 32, &dc);
  673|  8.80k|}
aom_highbd_dc_128_predictor_32x32_sse2:
  677|  5.71k|                                            const uint16_t *left, int bd) {
  678|  5.71k|  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  679|       |  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  680|  5.71k|  (void)above;
  681|  5.71k|  (void)left;
  682|  5.71k|  dc_store_32xh(dst, stride, 32, &dc_dup);
  683|  5.71k|}
aom_highbd_v_predictor_4x8_sse2:
  690|  4.65k|                                     const uint16_t *left, int bd) {
  691|  4.65k|  (void)left;
  692|  4.65k|  (void)bd;
  693|  4.65k|  const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above);
  694|  4.65k|  int i;
  695|  13.9k|  for (i = 0; i < 2; ++i) {
  ------------------
  |  Branch (695:15): [True: 9.31k, False: 4.65k]
  ------------------
  696|  9.31k|    _mm_storel_epi64((__m128i *)dst, above_u16);
  697|  9.31k|    _mm_storel_epi64((__m128i *)(dst + stride), above_u16);
  698|  9.31k|    _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16);
  699|  9.31k|    _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16);
  700|  9.31k|    dst += stride << 2;
  701|  9.31k|  }
  702|  4.65k|}
aom_highbd_v_predictor_8x4_sse2:
  706|  7.28k|                                     const uint16_t *left, int bd) {
  707|  7.28k|  (void)left;
  708|  7.28k|  (void)bd;
  709|  7.28k|  const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
  710|  7.28k|  _mm_store_si128((__m128i *)dst, above_u16);
  711|  7.28k|  _mm_store_si128((__m128i *)(dst + stride), above_u16);
  712|  7.28k|  _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
  713|  7.28k|  _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
  714|  7.28k|}
aom_highbd_v_predictor_8x16_sse2:
  718|  1.95k|                                      const uint16_t *left, int bd) {
  719|  1.95k|  (void)left;
  720|  1.95k|  (void)bd;
  721|  1.95k|  const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
  722|  1.95k|  int i;
  723|  9.75k|  for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (723:15): [True: 7.80k, False: 1.95k]
  ------------------
  724|  7.80k|    _mm_store_si128((__m128i *)dst, above_u16);
  725|  7.80k|    _mm_store_si128((__m128i *)(dst + stride), above_u16);
  726|  7.80k|    _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
  727|  7.80k|    _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
  728|  7.80k|    dst += stride << 2;
  729|  7.80k|  }
  730|  1.95k|}
aom_highbd_v_predictor_16x8_sse2:
  734|  2.63k|                                      const uint16_t *left, int bd) {
  735|  2.63k|  (void)left;
  736|  2.63k|  (void)bd;
  737|  2.63k|  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
  738|  2.63k|  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
  739|  2.63k|  int i;
  740|  7.91k|  for (i = 0; i < 2; ++i) {
  ------------------
  |  Branch (740:15): [True: 5.27k, False: 2.63k]
  ------------------
  741|  5.27k|    _mm_store_si128((__m128i *)dst, above0_u16);
  742|  5.27k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  743|  5.27k|    dst += stride;
  744|  5.27k|    _mm_store_si128((__m128i *)dst, above0_u16);
  745|  5.27k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  746|  5.27k|    dst += stride;
  747|  5.27k|    _mm_store_si128((__m128i *)dst, above0_u16);
  748|  5.27k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  749|  5.27k|    dst += stride;
  750|  5.27k|    _mm_store_si128((__m128i *)dst, above0_u16);
  751|  5.27k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  752|  5.27k|    dst += stride;
  753|  5.27k|  }
  754|  2.63k|}
aom_highbd_v_predictor_16x32_sse2:
  758|    617|                                       const uint16_t *left, int bd) {
  759|    617|  (void)left;
  760|    617|  (void)bd;
  761|    617|  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
  762|    617|  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
  763|    617|  int i;
  764|  5.55k|  for (i = 0; i < 8; ++i) {
  ------------------
  |  Branch (764:15): [True: 4.93k, False: 617]
  ------------------
  765|  4.93k|    _mm_store_si128((__m128i *)dst, above0_u16);
  766|  4.93k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  767|  4.93k|    dst += stride;
  768|  4.93k|    _mm_store_si128((__m128i *)dst, above0_u16);
  769|  4.93k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  770|  4.93k|    dst += stride;
  771|  4.93k|    _mm_store_si128((__m128i *)dst, above0_u16);
  772|  4.93k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  773|  4.93k|    dst += stride;
  774|  4.93k|    _mm_store_si128((__m128i *)dst, above0_u16);
  775|  4.93k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  776|  4.93k|    dst += stride;
  777|  4.93k|  }
  778|    617|}
aom_highbd_v_predictor_32x16_sse2:
  782|    601|                                       const uint16_t *left, int bd) {
  783|    601|  (void)left;
  784|    601|  (void)bd;
  785|    601|  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
  786|    601|  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
  787|    601|  const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16));
  788|    601|  const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24));
  789|    601|  int i;
  790|  3.00k|  for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (790:15): [True: 2.40k, False: 601]
  ------------------
  791|  2.40k|    _mm_store_si128((__m128i *)dst, above0_u16);
  792|  2.40k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  793|  2.40k|    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
  794|  2.40k|    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
  795|  2.40k|    dst += stride;
  796|  2.40k|    _mm_store_si128((__m128i *)dst, above0_u16);
  797|  2.40k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  798|  2.40k|    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
  799|  2.40k|    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
  800|  2.40k|    dst += stride;
  801|  2.40k|    _mm_store_si128((__m128i *)dst, above0_u16);
  802|  2.40k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  803|  2.40k|    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
  804|  2.40k|    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
  805|  2.40k|    dst += stride;
  806|  2.40k|    _mm_store_si128((__m128i *)dst, above0_u16);
  807|  2.40k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  808|  2.40k|    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
  809|  2.40k|    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
  810|  2.40k|    dst += stride;
  811|  2.40k|  }
  812|    601|}
aom_highbd_dc_predictor_4x8_sse2:
  819|  27.5k|                                      const uint16_t *left, int bd) {
  820|  27.5k|  (void)bd;
  821|  27.5k|  const __m128i sum_above = dc_sum_4(above);
  822|  27.5k|  const __m128i sum_left = dc_sum_8(left);
  823|  27.5k|  const __m128i sum = _mm_add_epi16(sum_above, sum_left);
  824|  27.5k|  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  825|  27.5k|  sum32 >>= 16;
  826|  27.5k|  sum32 += 6;
  827|  27.5k|  sum32 /= 12;
  828|  27.5k|  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  829|  27.5k|  int i;
  830|   137k|  for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (830:15): [True: 110k, False: 27.5k]
  ------------------
  831|   110k|    _mm_storel_epi64((__m128i *)dst, row);
  832|   110k|    dst += stride;
  833|   110k|    _mm_storel_epi64((__m128i *)dst, row);
  834|   110k|    dst += stride;
  835|   110k|  }
  836|  27.5k|}
aom_highbd_dc_predictor_8x4_sse2:
  840|  39.5k|                                      const uint16_t *left, int bd) {
  841|  39.5k|  (void)bd;
  842|  39.5k|  const __m128i sum_left = dc_sum_4(left);
  843|  39.5k|  const __m128i sum_above = dc_sum_8(above);
  844|  39.5k|  const __m128i sum = _mm_add_epi16(sum_above, sum_left);
  845|  39.5k|  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  846|  39.5k|  sum32 >>= 16;
  847|  39.5k|  sum32 += 6;
  848|  39.5k|  sum32 /= 12;
  849|  39.5k|  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  850|       |
  851|  39.5k|  _mm_store_si128((__m128i *)dst, row);
  852|  39.5k|  dst += stride;
  853|  39.5k|  _mm_store_si128((__m128i *)dst, row);
  854|  39.5k|  dst += stride;
  855|  39.5k|  _mm_store_si128((__m128i *)dst, row);
  856|  39.5k|  dst += stride;
  857|  39.5k|  _mm_store_si128((__m128i *)dst, row);
  858|  39.5k|}
aom_highbd_dc_predictor_8x16_sse2:
  862|  20.4k|                                       const uint16_t *left, int bd) {
  863|  20.4k|  (void)bd;
  864|  20.4k|  __m128i sum_left = dc_sum_16(left);
  865|  20.4k|  __m128i sum_above = dc_sum_8(above);
  866|  20.4k|  const __m128i zero = _mm_setzero_si128();
  867|  20.4k|  sum_left = _mm_unpacklo_epi16(sum_left, zero);
  868|  20.4k|  sum_above = _mm_unpacklo_epi16(sum_above, zero);
  869|  20.4k|  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
  870|  20.4k|  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  871|  20.4k|  sum32 += 12;
  872|  20.4k|  sum32 /= 24;
  873|  20.4k|  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  874|  20.4k|  int i;
  875|   102k|  for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (875:15): [True: 81.8k, False: 20.4k]
  ------------------
  876|  81.8k|    _mm_store_si128((__m128i *)dst, row);
  877|  81.8k|    dst += stride;
  878|  81.8k|    _mm_store_si128((__m128i *)dst, row);
  879|  81.8k|    dst += stride;
  880|  81.8k|    _mm_store_si128((__m128i *)dst, row);
  881|  81.8k|    dst += stride;
  882|  81.8k|    _mm_store_si128((__m128i *)dst, row);
  883|  81.8k|    dst += stride;
  884|  81.8k|  }
  885|  20.4k|}
aom_highbd_dc_predictor_16x8_sse2:
  889|  30.3k|                                       const uint16_t *left, int bd) {
  890|  30.3k|  (void)bd;
  891|  30.3k|  __m128i sum_left = dc_sum_8(left);
  892|  30.3k|  __m128i sum_above = dc_sum_16(above);
  893|  30.3k|  const __m128i zero = _mm_setzero_si128();
  894|  30.3k|  sum_left = _mm_unpacklo_epi16(sum_left, zero);
  895|  30.3k|  sum_above = _mm_unpacklo_epi16(sum_above, zero);
  896|  30.3k|  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
  897|  30.3k|  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  898|  30.3k|  sum32 += 12;
  899|  30.3k|  sum32 /= 24;
  900|  30.3k|  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  901|  30.3k|  int i;
  902|  91.1k|  for (i = 0; i < 2; ++i) {
  ------------------
  |  Branch (902:15): [True: 60.7k, False: 30.3k]
  ------------------
  903|  60.7k|    _mm_store_si128((__m128i *)dst, row);
  904|  60.7k|    _mm_store_si128((__m128i *)(dst + 8), row);
  905|  60.7k|    dst += stride;
  906|  60.7k|    _mm_store_si128((__m128i *)dst, row);
  907|  60.7k|    _mm_store_si128((__m128i *)(dst + 8), row);
  908|  60.7k|    dst += stride;
  909|  60.7k|    _mm_store_si128((__m128i *)dst, row);
  910|  60.7k|    _mm_store_si128((__m128i *)(dst + 8), row);
  911|  60.7k|    dst += stride;
  912|  60.7k|    _mm_store_si128((__m128i *)dst, row);
  913|  60.7k|    _mm_store_si128((__m128i *)(dst + 8), row);
  914|  60.7k|    dst += stride;
  915|  60.7k|  }
  916|  30.3k|}
aom_highbd_dc_predictor_16x32_sse2:
  920|  7.61k|                                        const uint16_t *left, int bd) {
  921|  7.61k|  (void)bd;
  922|  7.61k|  __m128i sum_left = dc_sum_32(left);
  923|  7.61k|  __m128i sum_above = dc_sum_16(above);
  924|  7.61k|  const __m128i zero = _mm_setzero_si128();
  925|  7.61k|  sum_above = _mm_unpacklo_epi16(sum_above, zero);
  926|  7.61k|  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
  927|  7.61k|  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  928|  7.61k|  sum32 += 24;
  929|  7.61k|  sum32 /= 48;
  930|  7.61k|  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  931|  7.61k|  int i;
  932|  68.5k|  for (i = 0; i < 8; ++i) {
  ------------------
  |  Branch (932:15): [True: 60.9k, False: 7.61k]
  ------------------
  933|  60.9k|    _mm_store_si128((__m128i *)dst, row);
  934|  60.9k|    _mm_store_si128((__m128i *)(dst + 8), row);
  935|  60.9k|    dst += stride;
  936|  60.9k|    _mm_store_si128((__m128i *)dst, row);
  937|  60.9k|    _mm_store_si128((__m128i *)(dst + 8), row);
  938|  60.9k|    dst += stride;
  939|  60.9k|    _mm_store_si128((__m128i *)dst, row);
  940|  60.9k|    _mm_store_si128((__m128i *)(dst + 8), row);
  941|  60.9k|    dst += stride;
  942|  60.9k|    _mm_store_si128((__m128i *)dst, row);
  943|  60.9k|    _mm_store_si128((__m128i *)(dst + 8), row);
  944|  60.9k|    dst += stride;
  945|  60.9k|  }
  946|  7.61k|}
aom_highbd_dc_predictor_32x16_sse2:
  950|  10.0k|                                        const uint16_t *left, int bd) {
  951|  10.0k|  (void)bd;
  952|  10.0k|  __m128i sum_left = dc_sum_16(left);
  953|  10.0k|  __m128i sum_above = dc_sum_32(above);
  954|  10.0k|  const __m128i zero = _mm_setzero_si128();
  955|  10.0k|  sum_left = _mm_unpacklo_epi16(sum_left, zero);
  956|  10.0k|  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
  957|  10.0k|  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  958|  10.0k|  sum32 += 24;
  959|  10.0k|  sum32 /= 48;
  960|  10.0k|  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  961|  10.0k|  int i;
  962|  50.3k|  for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (962:15): [True: 40.3k, False: 10.0k]
  ------------------
  963|  40.3k|    _mm_store_si128((__m128i *)dst, row);
  964|  40.3k|    _mm_store_si128((__m128i *)(dst + 8), row);
  965|  40.3k|    _mm_store_si128((__m128i *)(dst + 16), row);
  966|  40.3k|    _mm_store_si128((__m128i *)(dst + 24), row);
  967|  40.3k|    dst += stride;
  968|  40.3k|    _mm_store_si128((__m128i *)dst, row);
  969|  40.3k|    _mm_store_si128((__m128i *)(dst + 8), row);
  970|  40.3k|    _mm_store_si128((__m128i *)(dst + 16), row);
  971|  40.3k|    _mm_store_si128((__m128i *)(dst + 24), row);
  972|  40.3k|    dst += stride;
  973|  40.3k|    _mm_store_si128((__m128i *)dst, row);
  974|  40.3k|    _mm_store_si128((__m128i *)(dst + 8), row);
  975|  40.3k|    _mm_store_si128((__m128i *)(dst + 16), row);
  976|  40.3k|    _mm_store_si128((__m128i *)(dst + 24), row);
  977|  40.3k|    dst += stride;
  978|  40.3k|    _mm_store_si128((__m128i *)dst, row);
  979|  40.3k|    _mm_store_si128((__m128i *)(dst + 8), row);
  980|  40.3k|    _mm_store_si128((__m128i *)(dst + 16), row);
  981|  40.3k|    _mm_store_si128((__m128i *)(dst + 24), row);
  982|  40.3k|    dst += stride;
  983|  40.3k|  }
  984|  10.0k|}
highbd_intrapred_sse2.c:h_predictor_16x8:
  123|  24.7k|                                    const uint16_t *left) {
  124|  24.7k|  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  125|  24.7k|  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
  126|  24.7k|  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
  127|  24.7k|  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
  128|  24.7k|  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
  129|  24.7k|  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
  130|  24.7k|  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
  131|  24.7k|  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
  132|       |  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
  133|  24.7k|  h_store_16_unpacklo(&dst, stride, &row0);
  134|  24.7k|  h_store_16_unpacklo(&dst, stride, &row1);
  135|  24.7k|  h_store_16_unpacklo(&dst, stride, &row2);
  136|  24.7k|  h_store_16_unpacklo(&dst, stride, &row3);
  137|  24.7k|  h_store_16_unpackhi(&dst, stride, &row4);
  138|  24.7k|  h_store_16_unpackhi(&dst, stride, &row5);
  139|  24.7k|  h_store_16_unpackhi(&dst, stride, &row6);
  140|  24.7k|  h_store_16_unpackhi(&dst, stride, &row7);
  141|  24.7k|}
highbd_intrapred_sse2.c:h_store_16_unpacklo:
  107|  98.8k|                                       const __m128i *row) {
  108|  98.8k|  const __m128i val = _mm_unpacklo_epi64(*row, *row);
  109|  98.8k|  _mm_store_si128((__m128i *)*dst, val);
  110|  98.8k|  _mm_store_si128((__m128i *)(*dst + 8), val);
  111|  98.8k|  *dst += stride;
  112|  98.8k|}
highbd_intrapred_sse2.c:h_store_16_unpackhi:
  115|  98.8k|                                       const __m128i *row) {
  116|  98.8k|  const __m128i val = _mm_unpackhi_epi64(*row, *row);
  117|  98.8k|  _mm_store_si128((__m128i *)(*dst), val);
  118|  98.8k|  _mm_store_si128((__m128i *)(*dst + 8), val);
  119|  98.8k|  *dst += stride;
  120|  98.8k|}
highbd_intrapred_sse2.c:h_predictor_32x8:
  198|  9.22k|                                    const uint16_t *left) {
  199|  9.22k|  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  200|  9.22k|  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
  201|  9.22k|  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
  202|  9.22k|  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
  203|  9.22k|  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
  204|  9.22k|  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
  205|  9.22k|  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
  206|  9.22k|  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
  207|       |  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
  208|  9.22k|  h_store_32_unpacklo(&dst, stride, &row0);
  209|  9.22k|  h_store_32_unpacklo(&dst, stride, &row1);
  210|  9.22k|  h_store_32_unpacklo(&dst, stride, &row2);
  211|  9.22k|  h_store_32_unpacklo(&dst, stride, &row3);
  212|  9.22k|  h_store_32_unpackhi(&dst, stride, &row4);
  213|  9.22k|  h_store_32_unpackhi(&dst, stride, &row5);
  214|  9.22k|  h_store_32_unpackhi(&dst, stride, &row6);
  215|  9.22k|  h_store_32_unpackhi(&dst, stride, &row7);
  216|  9.22k|}
highbd_intrapred_sse2.c:h_store_32_unpacklo:
  178|  36.8k|                                       const __m128i *row) {
  179|  36.8k|  const __m128i val = _mm_unpacklo_epi64(*row, *row);
  180|  36.8k|  _mm_store_si128((__m128i *)(*dst), val);
  181|  36.8k|  _mm_store_si128((__m128i *)(*dst + 8), val);
  182|  36.8k|  _mm_store_si128((__m128i *)(*dst + 16), val);
  183|  36.8k|  _mm_store_si128((__m128i *)(*dst + 24), val);
  184|  36.8k|  *dst += stride;
  185|  36.8k|}
highbd_intrapred_sse2.c:h_store_32_unpackhi:
  188|  36.8k|                                       const __m128i *row) {
  189|  36.8k|  const __m128i val = _mm_unpackhi_epi64(*row, *row);
  190|  36.8k|  _mm_store_si128((__m128i *)(*dst), val);
  191|  36.8k|  _mm_store_si128((__m128i *)(*dst + 8), val);
  192|  36.8k|  _mm_store_si128((__m128i *)(*dst + 16), val);
  193|  36.8k|  _mm_store_si128((__m128i *)(*dst + 24), val);
  194|  36.8k|  *dst += stride;
  195|  36.8k|}
highbd_intrapred_sse2.c:dc_sum_4:
  249|  78.4k|static inline __m128i dc_sum_4(const uint16_t *ref) {
  250|  78.4k|  const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
  251|  78.4k|  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
  252|  78.4k|  const __m128i a = _mm_add_epi16(_dcba, _xxdc);
  253|       |  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
  254|  78.4k|}
highbd_intrapred_sse2.c:dc_store_4x4:
  257|  10.1k|                                const __m128i *dc) {
  258|  10.1k|  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
  259|  10.1k|  int i;
  260|  50.7k|  for (i = 0; i < 4; ++i, dst += stride) {
  ------------------
  |  Branch (260:15): [True: 40.5k, False: 10.1k]
  ------------------
  261|  40.5k|    _mm_storel_epi64((__m128i *)dst, dc_dup);
  262|  40.5k|  }
  263|  10.1k|}
highbd_intrapred_sse2.c:dc_sum_8:
  310|   492k|static inline __m128i dc_sum_8(const uint16_t *ref) {
  311|   492k|  const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
  312|   492k|  const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
  313|   492k|  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
  314|   492k|  const __m128i a = _mm_add_epi16(_dcba, _xxdc);
  315|       |
  316|       |  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
  317|   492k|}
highbd_intrapred_sse2.c:dc_store_4x8:
  301|  1.77k|                                const __m128i *dc) {
  302|  1.77k|  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
  303|  1.77k|  int i;
  304|  15.9k|  for (i = 0; i < 8; ++i, dst += stride) {
  ------------------
  |  Branch (304:15): [True: 14.1k, False: 1.77k]
  ------------------
  305|  14.1k|    _mm_storel_epi64((__m128i *)dst, dc_dup);
  306|  14.1k|  }
  307|  1.77k|}
highbd_intrapred_sse2.c:dc_top_predictor_8xh:
  368|  9.29k|                                        int height, const uint16_t *above) {
  369|  9.29k|  const __m128i four = _mm_cvtsi32_si128(4);
  370|  9.29k|  const __m128i sum = dc_sum_8(above);
  371|  9.29k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
  372|  9.29k|  dc_store_8xh(dst, stride, height, &dc);
  373|  9.29k|}
highbd_intrapred_sse2.c:dc_store_8xh:
  355|  19.7k|                                const __m128i *dc) {
  356|  19.7k|  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
  357|  19.7k|  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
  358|  19.7k|  int i;
  359|   204k|  for (i = 0; i < height; ++i, dst += stride) {
  ------------------
  |  Branch (359:15): [True: 184k, False: 19.7k]
  ------------------
  360|   184k|    _mm_store_si128((__m128i *)dst, dc_dup);
  361|   184k|  }
  362|  19.7k|}
highbd_intrapred_sse2.c:dc_sum_16:
  425|   178k|static inline __m128i dc_sum_16(const uint16_t *ref) {
  426|   178k|  const __m128i sum_lo = dc_sum_8(ref);
  427|   178k|  const __m128i sum_hi = dc_sum_8(ref + 8);
  428|   178k|  return _mm_add_epi16(sum_lo, sum_hi);
  429|   178k|}
highbd_intrapred_sse2.c:dc_128_predictor_8xh:
  446|  1.29k|                                        int height, int bd) {
  447|  1.29k|  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  448|       |  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  449|  1.29k|  dc_store_8xh(dst, stride, height, &dc_dup);
  450|  1.29k|}
highbd_intrapred_sse2.c:dc_store_16xh:
  480|  26.6k|                                 const __m128i *dc) {
  481|  26.6k|  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
  482|  26.6k|  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
  483|  26.6k|  int i;
  484|   505k|  for (i = 0; i < height; ++i, dst += stride) {
  ------------------
  |  Branch (484:15): [True: 478k, False: 26.6k]
  ------------------
  485|   478k|    _mm_store_si128((__m128i *)dst, dc_dup);
  486|   478k|    _mm_store_si128((__m128i *)(dst + 8), dc_dup);
  487|   478k|  }
  488|  26.6k|}
highbd_intrapred_sse2.c:dc_sum_32:
  516|  44.1k|static inline __m128i dc_sum_32(const uint16_t *ref) {
  517|  44.1k|  const __m128i zero = _mm_setzero_si128();
  518|  44.1k|  const __m128i sum_a = dc_sum_16(ref);
  519|  44.1k|  const __m128i sum_b = dc_sum_16(ref + 16);
  520|       |  // 12 bit bd will outrange, so expand to 32 bit before adding final total
  521|  44.1k|  return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
  522|  44.1k|                       _mm_unpacklo_epi16(sum_b, zero));
  523|  44.1k|}
highbd_intrapred_sse2.c:dc_store_32xh:
  609|  30.7k|                                 const __m128i *dc) {
  610|  30.7k|  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
  611|  30.7k|  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
  612|  30.7k|  int i;
  613|   938k|  for (i = 0; i < height; ++i, dst += stride) {
  ------------------
  |  Branch (613:15): [True: 907k, False: 30.7k]
  ------------------
  614|   907k|    _mm_store_si128((__m128i *)dst, dc_dup);
  615|   907k|    _mm_store_si128((__m128i *)(dst + 8), dc_dup);
  616|   907k|    _mm_store_si128((__m128i *)(dst + 16), dc_dup);
  617|   907k|    _mm_store_si128((__m128i *)(dst + 24), dc_dup);
  618|   907k|  }
  619|  30.7k|}

aom_highbd_lpf_horizontal_14_sse2:
  502|  49.2k|                                       const uint8_t *thresh, int bd) {
  503|  49.2k|  __m128i p[7], q[7], pq[7];
  504|  49.2k|  int i;
  505|       |
  506|   393k|  for (i = 0; i < 7; i++) {
  ------------------
  |  Branch (506:15): [True: 344k, False: 49.2k]
  ------------------
  507|   344k|    p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch));
  508|   344k|    q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
  509|   344k|  }
  510|       |
  511|  49.2k|  highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
  512|       |
  513|   344k|  for (i = 0; i < 6; i++) {
  ------------------
  |  Branch (513:15): [True: 294k, False: 49.2k]
  ------------------
  514|   294k|    _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]);
  515|       |    _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8));
  516|   294k|  }
  517|  49.2k|}
aom_highbd_lpf_horizontal_6_sse2:
  952|   144k|                                      const uint8_t *_thresh, int bd) {
  953|   144k|  __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out;
  954|       |
  955|   144k|  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  956|   144k|  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  957|   144k|  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  958|   144k|  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
  959|   144k|  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
  960|   144k|  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
  961|       |
  962|   144k|  highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out,
  963|   144k|                             _blimit, _limit, _thresh, bd);
  964|       |
  965|   144k|  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8));
  966|   144k|  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out);
  967|   144k|  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out);
  968|       |  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8));
  969|   144k|}
aom_highbd_lpf_horizontal_8_sse2:
 1223|  50.2k|                                      const uint8_t *_thresh, int bd) {
 1224|  50.2k|  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
 1225|  50.2k|  __m128i q1q0, p1p0;
 1226|       |
 1227|  50.2k|  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
 1228|  50.2k|  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
 1229|  50.2k|  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
 1230|  50.2k|  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
 1231|  50.2k|  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
 1232|  50.2k|  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
 1233|  50.2k|  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
 1234|  50.2k|  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
 1235|       |
 1236|  50.2k|  highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0,
 1237|  50.2k|                             &p1p0, _blimit, _limit, _thresh, bd);
 1238|       |
 1239|  50.2k|  _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
 1240|  50.2k|  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
 1241|  50.2k|  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
 1242|  50.2k|  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
 1243|       |  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
 1244|  50.2k|  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
 1245|  50.2k|}
aom_highbd_lpf_horizontal_4_sse2:
 1348|  88.4k|                                      const uint8_t *_thresh, int bd) {
 1349|  88.4k|  __m128i p1p0, q1q0;
 1350|  88.4k|  __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
 1351|  88.4k|  __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
 1352|  88.4k|  __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
 1353|  88.4k|  __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
 1354|       |
 1355|  88.4k|  highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit,
 1356|  88.4k|                             _thresh, bd);
 1357|       |
 1358|  88.4k|  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
 1359|  88.4k|  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
 1360|  88.4k|  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
 1361|       |  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
 1362|  88.4k|}
aom_highbd_lpf_vertical_4_sse2:
 1385|  72.9k|                                    int bd) {
 1386|  72.9k|  __m128i x0, x1, x2, x3, d0, d1, d2, d3;
 1387|  72.9k|  __m128i p1p0, q1q0;
 1388|  72.9k|  __m128i p1, q1;
 1389|       |
 1390|  72.9k|  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
 1391|  72.9k|  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
 1392|  72.9k|  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
 1393|  72.9k|  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
 1394|       |
 1395|  72.9k|  highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3);
 1396|       |
 1397|  72.9k|  highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit,
 1398|  72.9k|                             thresh, bd);
 1399|       |
 1400|  72.9k|  p1 = _mm_srli_si128(p1p0, 8);
 1401|  72.9k|  q1 = _mm_srli_si128(q1q0, 8);
 1402|       |
 1403|       |  // transpose from 8x4 to 4x8
 1404|  72.9k|  highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
 1405|       |
 1406|  72.9k|  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
 1407|  72.9k|  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
 1408|  72.9k|  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
 1409|  72.9k|  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
 1410|  72.9k|}
aom_highbd_lpf_vertical_6_sse2:
 1450|   131k|                                    int bd) {
 1451|   131k|  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
 1452|   131k|  __m128i x3, x2, x1, x0, p0, q0;
 1453|   131k|  __m128i p1p0, q1q0;
 1454|       |
 1455|   131k|  x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
 1456|   131k|  x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
 1457|   131k|  x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
 1458|   131k|  x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
 1459|       |
 1460|   131k|  highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5,
 1461|   131k|                               &d6, &d7);
 1462|       |
 1463|   131k|  highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit,
 1464|   131k|                             limit, thresh, bd);
 1465|       |
 1466|   131k|  p0 = _mm_srli_si128(p1p0, 8);
 1467|   131k|  q0 = _mm_srli_si128(q1q0, 8);
 1468|       |
 1469|   131k|  highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
 1470|       |
 1471|   131k|  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
 1472|   131k|  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
 1473|   131k|  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
 1474|   131k|  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
 1475|   131k|}
aom_highbd_lpf_vertical_8_sse2:
 1516|  46.8k|                                    int bd) {
 1517|  46.8k|  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
 1518|  46.8k|  __m128i p2, p1, p0, p3, q0;
 1519|  46.8k|  __m128i q1q0, p1p0;
 1520|       |
 1521|  46.8k|  p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p));
 1522|  46.8k|  p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p));
 1523|  46.8k|  p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p));
 1524|  46.8k|  p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p));
 1525|       |
 1526|  46.8k|  highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5,
 1527|  46.8k|                               &d6, &d7);
 1528|       |
 1529|       |  // Loop filtering
 1530|  46.8k|  highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0,
 1531|  46.8k|                             &p1p0, blimit, limit, thresh, bd);
 1532|       |
 1533|  46.8k|  p0 = _mm_srli_si128(p1p0, 8);
 1534|  46.8k|  q0 = _mm_srli_si128(q1q0, 8);
 1535|       |
 1536|  46.8k|  highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0,
 1537|  46.8k|                               &d1, &d2, &d3);
 1538|       |
 1539|  46.8k|  _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0);
 1540|  46.8k|  _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1);
 1541|  46.8k|  _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2);
 1542|  46.8k|  _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3);
 1543|  46.8k|}
aom_highbd_lpf_vertical_14_sse2:
 1584|  43.7k|                                     const uint8_t *thresh, int bd) {
 1585|  43.7k|  __m128i q[7], p[7], pq[7];
 1586|  43.7k|  __m128i p6, p5, p4, p3;
 1587|  43.7k|  __m128i p6_2, p5_2, p4_2, p3_2;
 1588|  43.7k|  __m128i d0, d1, d2, d3;
 1589|  43.7k|  __m128i d0_2, d1_2, d2_2, d3_2, d7_2;
 1590|       |
 1591|  43.7k|  p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
 1592|  43.7k|  p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
 1593|  43.7k|  p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
 1594|  43.7k|  p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
 1595|       |
 1596|  43.7k|  highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4],
 1597|  43.7k|                               &p[3], &p[2], &p[1], &p[0]);
 1598|       |
 1599|  43.7k|  p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
 1600|  43.7k|  p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
 1601|  43.7k|  p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
 1602|  43.7k|  p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
 1603|       |
 1604|  43.7k|  highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2],
 1605|  43.7k|                               &q[3], &q[4], &q[5], &q[6], &d7_2);
 1606|       |
 1607|  43.7k|  highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
 1608|       |
 1609|  43.7k|  highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2],
 1610|  43.7k|                               &pq[1], &pq[0], &d0, &d1, &d2, &d3);
 1611|       |
 1612|  43.7k|  q[0] = _mm_srli_si128(pq[0], 8);
 1613|  43.7k|  q[1] = _mm_srli_si128(pq[1], 8);
 1614|  43.7k|  q[2] = _mm_srli_si128(pq[2], 8);
 1615|  43.7k|  q[3] = _mm_srli_si128(pq[3], 8);
 1616|  43.7k|  q[4] = _mm_srli_si128(pq[4], 8);
 1617|  43.7k|  q[5] = _mm_srli_si128(pq[5], 8);
 1618|       |
 1619|  43.7k|  highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6],
 1620|  43.7k|                               &d7_2, &d0_2, &d1_2, &d2_2, &d3_2);
 1621|       |
 1622|  43.7k|  _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0);
 1623|  43.7k|  _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2);
 1624|       |
 1625|  43.7k|  _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1);
 1626|  43.7k|  _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2);
 1627|       |
 1628|  43.7k|  _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2);
 1629|  43.7k|  _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2);
 1630|       |
 1631|  43.7k|  _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3);
 1632|  43.7k|  _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
 1633|  43.7k|}
highbd_loopfilter_sse2.c:highbd_lpf_internal_14_sse2:
  328|  92.7k|    const unsigned char *lt, const unsigned char *thr, int bd) {
  329|  92.7k|  int i;
  330|  92.7k|  const __m128i zero = _mm_setzero_si128();
  331|  92.7k|  __m128i blimit, limit, thresh;
  332|  92.7k|  __m128i t80;
  333|  92.7k|  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
  334|       |
  335|   738k|  for (i = 0; i < 7; i++) {
  ------------------
  |  Branch (335:15): [True: 645k, False: 92.7k]
  ------------------
  336|   645k|    pq[i] = _mm_unpacklo_epi64(p[i], q[i]);
  337|   645k|  }
  338|  92.7k|  __m128i mask, hevhev;
  339|  92.7k|  __m128i p1p0, q1q0, abs_p1p0;
  340|       |
  341|  92.7k|  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
  342|  92.7k|                                &thresh, &hevhev, &mask);
  343|       |
  344|  92.7k|  __m128i ps0ps1, qs0qs1;
  345|       |  // filter4
  346|  92.7k|  highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd);
  347|       |
  348|  92.7k|  __m128i flat, flat2;
  349|  92.7k|  highbd_flat_mask4_sse2(pq, &flat, &flat2, bd);
  350|       |
  351|  92.7k|  flat = _mm_and_si128(flat, mask);
  352|  92.7k|  flat2 = _mm_and_si128(flat2, flat);
  353|       |
  354|       |  // replicate for the further "merged variables" usage
  355|  92.7k|  flat = _mm_unpacklo_epi64(flat, flat);
  356|  92.7k|  flat2 = _mm_unpacklo_epi64(flat2, flat2);
  357|       |
  358|       |  // flat and wide flat calculations
  359|       |
  360|       |  // if flat ==0 then flat2 is zero as well and we don't need any calc below
  361|       |  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
  362|  92.7k|  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
  ------------------
  |  Branch (362:7): [True: 31.7k, False: 61.0k]
  ------------------
  363|  31.7k|    __m128i flat_p[3], flat_q[3], flat_pq[3];
  364|  31.7k|    __m128i flat2_p[6], flat2_q[6];
  365|  31.7k|    __m128i flat2_pq[6];
  366|  31.7k|    __m128i sum_p6, sum_p3;
  367|  31.7k|    const __m128i eight = _mm_set1_epi16(8);
  368|  31.7k|    const __m128i four = _mm_set1_epi16(4);
  369|       |
  370|  31.7k|    __m128i work0, work0_0, work0_1, sum_p_0;
  371|  31.7k|    __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
  372|  31.7k|    __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
  373|  31.7k|    sum_p = _mm_add_epi16(sum_p, sum_lp);
  374|       |
  375|  31.7k|    __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
  376|  31.7k|    __m128i sum_q = _mm_srli_si128(sum_p, 8);
  377|       |
  378|  31.7k|    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
  379|  31.7k|    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
  380|       |
  381|  31.7k|    flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0]));
  382|  31.7k|    flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
  383|       |
  384|  31.7k|    sum_p6 = _mm_add_epi16(pq[6], pq[6]);
  385|  31.7k|    sum_p3 = _mm_add_epi16(pq[3], pq[3]);
  386|       |
  387|  31.7k|    sum_q = _mm_sub_epi16(sum_p_0, pq[5]);
  388|  31.7k|    sum_p = _mm_sub_epi16(sum_p_0, q[5]);
  389|       |
  390|  31.7k|    work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
  391|  31.7k|    work0_1 = _mm_add_epi16(sum_p6,
  392|  31.7k|                            _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
  393|       |
  394|  31.7k|    sum_lq = _mm_sub_epi16(sum_lp, pq[2]);
  395|  31.7k|    sum_lp = _mm_sub_epi16(sum_lp, q[2]);
  396|       |
  397|  31.7k|    work0 = _mm_add_epi16(sum_p3, pq[1]);
  398|  31.7k|    flat_p[1] = _mm_add_epi16(sum_lp, work0);
  399|  31.7k|    flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
  400|       |
  401|  31.7k|    flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
  402|  31.7k|    flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
  403|       |
  404|  31.7k|    sum_lp = _mm_sub_epi16(sum_lp, q[1]);
  405|  31.7k|    sum_lq = _mm_sub_epi16(sum_lq, pq[1]);
  406|       |
  407|  31.7k|    sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
  408|  31.7k|    work0 = _mm_add_epi16(sum_p3, pq[2]);
  409|       |
  410|  31.7k|    flat_p[2] = _mm_add_epi16(sum_lp, work0);
  411|  31.7k|    flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
  412|  31.7k|    flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
  413|       |
  414|  31.7k|    int flat2_mask =
  415|  31.7k|        (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
  416|  31.7k|    if (flat2_mask) {
  ------------------
  |  Branch (416:9): [True: 24.9k, False: 6.79k]
  ------------------
  417|  24.9k|      flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0]));
  418|  24.9k|      flat2_q[0] = _mm_add_epi16(
  419|  24.9k|          sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0]));
  420|       |
  421|  24.9k|      flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
  422|  24.9k|      flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
  423|       |
  424|  24.9k|      flat2_pq[0] =
  425|  24.9k|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
  426|  24.9k|      flat2_pq[1] =
  427|  24.9k|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
  428|       |
  429|  24.9k|      sum_p = _mm_sub_epi16(sum_p, q[4]);
  430|  24.9k|      sum_q = _mm_sub_epi16(sum_q, pq[4]);
  431|       |
  432|  24.9k|      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
  433|  24.9k|      work0 = _mm_add_epi16(sum_p6,
  434|  24.9k|                            _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
  435|  24.9k|      flat2_p[2] = _mm_add_epi16(sum_p, work0);
  436|  24.9k|      flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
  437|  24.9k|      flat2_pq[2] =
  438|  24.9k|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
  439|       |
  440|  24.9k|      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
  441|  24.9k|      sum_p = _mm_sub_epi16(sum_p, q[3]);
  442|  24.9k|      sum_q = _mm_sub_epi16(sum_q, pq[3]);
  443|       |
  444|  24.9k|      work0 = _mm_add_epi16(sum_p6,
  445|  24.9k|                            _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
  446|  24.9k|      flat2_p[3] = _mm_add_epi16(sum_p, work0);
  447|  24.9k|      flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
  448|  24.9k|      flat2_pq[3] =
  449|  24.9k|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
  450|       |
  451|  24.9k|      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
  452|  24.9k|      sum_p = _mm_sub_epi16(sum_p, q[2]);
  453|  24.9k|      sum_q = _mm_sub_epi16(sum_q, pq[2]);
  454|       |
  455|  24.9k|      work0 = _mm_add_epi16(sum_p6,
  456|  24.9k|                            _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
  457|  24.9k|      flat2_p[4] = _mm_add_epi16(sum_p, work0);
  458|  24.9k|      flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
  459|  24.9k|      flat2_pq[4] =
  460|  24.9k|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
  461|       |
  462|  24.9k|      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
  463|  24.9k|      sum_p = _mm_sub_epi16(sum_p, q[1]);
  464|  24.9k|      sum_q = _mm_sub_epi16(sum_q, pq[1]);
  465|       |
  466|  24.9k|      work0 = _mm_add_epi16(sum_p6,
  467|  24.9k|                            _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
  468|  24.9k|      flat2_p[5] = _mm_add_epi16(sum_p, work0);
  469|  24.9k|      flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
  470|  24.9k|      flat2_pq[5] =
  471|  24.9k|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
  472|  24.9k|    }  // flat2
  473|       |       // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  474|       |    // highbd_filter8
  475|  31.7k|    pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
  476|  31.7k|    pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
  477|       |
  478|   126k|    for (i = 0; i < 3; i++) {
  ------------------
  |  Branch (478:17): [True: 95.1k, False: 31.7k]
  ------------------
  479|  95.1k|      pq[i] = _mm_andnot_si128(flat, pq[i]);
  480|  95.1k|      flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
  481|  95.1k|      pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
  482|  95.1k|    }
  483|       |
  484|       |    // wide flat
  485|       |    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  486|  31.7k|    if (flat2_mask) {
  ------------------
  |  Branch (486:9): [True: 24.9k, False: 6.79k]
  ------------------
  487|   174k|      for (i = 0; i < 6; i++) {
  ------------------
  |  Branch (487:19): [True: 149k, False: 24.9k]
  ------------------
  488|   149k|        pq[i] = _mm_andnot_si128(flat2, pq[i]);
  489|   149k|        flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
  490|   149k|        pq[i] = _mm_or_si128(pq[i], flat2_pq[i]);  // full list of pq values
  491|   149k|      }
  492|  24.9k|    }
  493|  61.0k|  } else {
  494|  61.0k|    pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
  495|  61.0k|    pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
  496|  61.0k|  }
  497|  92.7k|}
highbd_loopfilter_sse2.c:get_limit:
   30|   578k|                             __m128i *lt, __m128i *thr, __m128i *t80_out) {
   31|   578k|  const int shift = bd - 8;
   32|   578k|  const __m128i zero = _mm_setzero_si128();
   33|       |
   34|   578k|  __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
   35|   578k|  *blt = _mm_slli_epi16(x, shift);
   36|       |
   37|   578k|  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
   38|   578k|  *lt = _mm_slli_epi16(x, shift);
   39|       |
   40|   578k|  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
   41|   578k|  *thr = _mm_slli_epi16(x, shift);
   42|       |
   43|   578k|  *t80_out = _mm_set1_epi16(1 << (bd - 1));
   44|   578k|}
highbd_loopfilter_sse2.c:highbd_hev_filter_mask_x_sse2:
  112|   546k|                                                 __m128i *hev, __m128i *mask) {
  113|   546k|  const __m128i zero = _mm_setzero_si128();
  114|   546k|  const __m128i one = _mm_set1_epi16(1);
  115|   546k|  const __m128i ffff = _mm_set1_epi16((short)0xFFFF);
  116|   546k|  __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
  117|   546k|  __m128i max, max01, h;
  118|       |
  119|   546k|  *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]);
  120|   546k|  *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]);
  121|       |
  122|   546k|  abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0);
  123|   546k|  abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1);
  124|   546k|  abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
  125|       |
  126|   546k|  abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8);
  127|   546k|  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);  // divide by 2
  128|       |
  129|   546k|  max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
  130|   546k|  max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
  131|       |  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
  132|       |  // So taking maximums continues to work:
  133|   546k|  max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
  134|       |
  135|   546k|  *abs_p1p0 = abs_diff16(pq[0], pq[1]);
  136|   546k|  abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);
  137|   546k|  max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0);
  138|       |  // mask |= (abs(*p1 - *p0) > limit) * -1;
  139|       |  // mask |= (abs(*q1 - *q0) > limit) * -1;
  140|   546k|  h = _mm_subs_epu16(max01, *t);
  141|       |
  142|   546k|  *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
  143|       |  // replicate for the further "merged variables" usage
  144|   546k|  *hev = _mm_unpacklo_epi64(*hev, *hev);
  145|       |
  146|   546k|  max = _mm_max_epi16(max, max01);
  147|   546k|  int i;
  148|  1.13M|  for (i = 2; i < x; ++i) {
  ------------------
  |  Branch (148:15): [True: 587k, False: 546k]
  ------------------
  149|   587k|    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1]));
  150|   587k|  }
  151|   546k|  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
  152|       |
  153|   546k|  max = _mm_subs_epu16(max, *l);
  154|   546k|  *mask = _mm_cmpeq_epi16(max, zero);  //  ~mask
  155|   546k|}
highbd_loopfilter_sse2.c:abs_diff16:
   24|  2.59M|static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) {
   25|  2.59M|  return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
   26|  2.59M|}
highbd_loopfilter_sse2.c:highbd_filter4_sse2:
  217|   563k|                                                 int bd) {
  218|   563k|  const __m128i zero = _mm_setzero_si128();
  219|   563k|  const __m128i one = _mm_set1_epi16(1);
  220|   563k|  const __m128i pmax =
  221|   563k|      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
  222|   563k|  const __m128i pmin = _mm_subs_epi16(zero, *t80);
  223|       |
  224|   563k|  const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4);
  225|   563k|  __m128i ps1ps0_work, qs1qs0_work, work;
  226|   563k|  __m128i filt, filter2filter1, filter2filt, filter1filt;
  227|       |
  228|   563k|  ps1ps0_work = _mm_subs_epi16(*p1p0, *t80);
  229|   563k|  qs1qs0_work = _mm_subs_epi16(*q1q0, *t80);
  230|       |
  231|   563k|  work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work);
  232|   563k|  pixel_clamp(&pmin, &pmax, &work);
  233|   563k|  filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
  234|       |
  235|   563k|  filt = _mm_subs_epi16(filt, work);
  236|   563k|  filt = _mm_subs_epi16(filt, work);
  237|   563k|  filt = _mm_subs_epi16(filt, work);
  238|       |  // (aom_filter + 3 * (qs0 - ps0)) & mask
  239|   563k|  pixel_clamp(&pmin, &pmax, &filt);
  240|   563k|  filt = _mm_and_si128(filt, *mask);
  241|   563k|  filt = _mm_unpacklo_epi64(filt, filt);
  242|       |
  243|   563k|  filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */
  244|   563k|  pixel_clamp(&pmin, &pmax, &filter2filter1);
  245|   563k|  filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */
  246|       |
  247|   563k|  filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1);
  248|       |
  249|       |  // filt >> 1
  250|   563k|  filt = _mm_adds_epi16(filt, one);
  251|   563k|  filt = _mm_srai_epi16(filt, 1);
  252|   563k|  filt = _mm_andnot_si128(*hev, filt);
  253|       |
  254|   563k|  filter2filt = _mm_unpackhi_epi64(filter2filter1, filt);
  255|   563k|  filter1filt = _mm_unpacklo_epi64(filter2filter1, filt);
  256|       |
  257|   563k|  qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt);
  258|   563k|  ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt);
  259|       |
  260|   563k|  pixel_clamp(&pmin, &pmax, &qs1qs0_work);
  261|   563k|  pixel_clamp(&pmin, &pmax, &ps1ps0_work);
  262|       |
  263|   563k|  *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80);
  264|   563k|  *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80);
  265|   563k|}
highbd_loopfilter_sse2.c:pixel_clamp:
   19|  2.81M|                                         __m128i *pixel) {
   20|  2.81M|  *pixel = _mm_min_epi16(*pixel, *max);
   21|  2.81M|  *pixel = _mm_max_epi16(*pixel, *min);
   22|  2.81M|}
highbd_loopfilter_sse2.c:highbd_flat_mask4_sse2:
  195|  92.3k|                                          __m128i *flat2, int bd) {
  196|       |  // check the distance 1,2,3 against 0
  197|  92.3k|  __m128i th = _mm_set1_epi16(1);
  198|  92.3k|  th = _mm_slli_epi16(th, bd - 8);
  199|  92.3k|  flat_mask_internal(&th, pq, 1, 4, flat);
  200|  92.3k|  flat_mask_internal(&th, pq, 4, 7, flat2);
  201|  92.3k|}
highbd_loopfilter_sse2.c:flat_mask_internal:
  158|   184k|                                      int start, int end, __m128i *flat) {
  159|   184k|  int i;
  160|   184k|  __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]),
  161|   184k|                              abs_diff16(pq[start + 1], pq[0]));
  162|       |
  163|   368k|  for (i = start + 2; i < end; ++i) {
  ------------------
  |  Branch (163:23): [True: 184k, False: 184k]
  ------------------
  164|   184k|    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0]));
  165|   184k|  }
  166|   184k|  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
  167|       |
  168|   184k|  __m128i ft;
  169|   184k|  ft = _mm_subs_epu16(max, *th);
  170|       |
  171|   184k|  const __m128i zero = _mm_setzero_si128();
  172|   184k|  *flat = _mm_cmpeq_epi16(ft, zero);
  173|   184k|}
highbd_loopfilter_sse2.c:highbd_lpf_internal_6_sse2:
  739|   272k|    const uint8_t *_limit, const uint8_t *_thresh, int bd) {
  740|   272k|  __m128i blimit, limit, thresh;
  741|   272k|  __m128i mask, hev, flat;
  742|   272k|  __m128i pq[3];
  743|   272k|  __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0;
  744|   272k|  __m128i flat_p1p0, flat_q0q1;
  745|       |
  746|   272k|  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
  747|   272k|  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
  748|   272k|  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
  749|       |
  750|   272k|  const __m128i zero = _mm_setzero_si128();
  751|   272k|  const __m128i four = _mm_set1_epi16(4);
  752|   272k|  __m128i t80;
  753|   272k|  const __m128i one = _mm_set1_epi16(0x1);
  754|       |
  755|   272k|  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
  756|       |
  757|   272k|  highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
  758|   272k|                                &thresh, &hev, &mask);
  759|       |
  760|       |  // lp filter
  761|   272k|  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
  762|       |
  763|       |  // flat_mask
  764|   272k|  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
  765|   272k|  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
  766|       |
  767|   272k|  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
  768|       |
  769|   272k|  flat = _mm_cmpeq_epi16(flat, zero);
  770|   272k|  flat = _mm_and_si128(flat, mask);
  771|       |  // replicate for the further "merged variables" usage
  772|   272k|  flat = _mm_unpacklo_epi64(flat, flat);
  773|       |
  774|       |  // 5 tap filter
  775|       |  // need it only if flat !=0
  776|   272k|  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
  ------------------
  |  Branch (776:7): [True: 232k, False: 40.2k]
  ------------------
  777|   232k|    __m128i workp_a, workp_b, workp_c;
  778|   232k|    __m128i pq0x2_pq1, pq1_pq2;
  779|       |
  780|       |    // op1
  781|   232k|    pq0x2_pq1 =
  782|   232k|        _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]);  // p0 *2 + p1
  783|   232k|    pq1_pq2 = _mm_add_epi16(pq[1], pq[2]);                  // p1 + p2
  784|   232k|    workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
  785|   232k|                            pq1_pq2);  // p2 + p0 * 2 + p1 * 2 + 4
  786|       |
  787|   232k|    workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0);
  788|   232k|    workp_b =
  789|   232k|        _mm_add_epi16(workp_a, workp_b);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
  790|       |
  791|       |    // op0
  792|   232k|    workp_c = _mm_srli_si128(pq0x2_pq1, 8);  // q0 * 2 + q1
  793|   232k|    workp_a = _mm_add_epi16(workp_a,
  794|   232k|                            workp_c);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
  795|   232k|    workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
  796|   232k|    flat_p1p0 = _mm_srli_epi16(workp_b, 3);
  797|       |
  798|       |    // oq0
  799|   232k|    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]),
  800|   232k|                            pq[1]);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
  801|   232k|    workp_b = _mm_srli_si128(pq1_pq2, 8);
  802|   232k|    workp_a = _mm_add_epi16(
  803|   232k|        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
  804|       |    // workp_shft0 = _mm_srli_epi16(workp_a, 3);
  805|       |
  806|       |    // oq1
  807|   232k|    workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]),
  808|   232k|                            pq[0]);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
  809|   232k|    workp_b = _mm_add_epi16(*q2, *q2);
  810|   232k|    workp_b =
  811|   232k|        _mm_add_epi16(workp_c, workp_b);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
  812|       |
  813|   232k|    workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
  814|   232k|    flat_q0q1 = _mm_srli_epi16(workp_a, 3);
  815|       |
  816|   232k|    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
  817|   232k|    q1q0 = _mm_and_si128(flat, flat_q0q1);
  818|   232k|    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
  819|       |
  820|   232k|    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
  821|   232k|    p1p0 = _mm_and_si128(flat, flat_p1p0);
  822|   232k|    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
  823|   232k|  }
  824|   272k|}
highbd_loopfilter_sse2.c:highbd_lpf_internal_8_sse2:
  998|  96.2k|    const unsigned char *_thresh, int bd) {
  999|  96.2k|  const __m128i zero = _mm_setzero_si128();
 1000|  96.2k|  __m128i blimit, limit, thresh;
 1001|  96.2k|  __m128i mask, hev, flat;
 1002|  96.2k|  __m128i pq[4];
 1003|  96.2k|  __m128i p1p0, q1q0, ps1ps0, qs1qs0;
 1004|  96.2k|  __m128i work_a, opq2, flat_p1p0, flat_q0q1;
 1005|       |
 1006|  96.2k|  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
 1007|  96.2k|  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
 1008|  96.2k|  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
 1009|  96.2k|  pq[3] = _mm_unpacklo_epi64(*p3, *q3);
 1010|       |
 1011|  96.2k|  __m128i abs_p1p0;
 1012|       |
 1013|  96.2k|  const __m128i four = _mm_set1_epi16(4);
 1014|  96.2k|  __m128i t80;
 1015|  96.2k|  const __m128i one = _mm_set1_epi16(0x1);
 1016|       |
 1017|  96.2k|  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
 1018|       |
 1019|  96.2k|  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
 1020|  96.2k|                                &thresh, &hev, &mask);
 1021|       |
 1022|       |  // lp filter
 1023|  96.2k|  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
 1024|       |
 1025|       |  // flat_mask4
 1026|  96.2k|  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
 1027|  96.2k|  flat = _mm_max_epi16(abs_p1p0, flat);
 1028|  96.2k|  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
 1029|       |
 1030|  96.2k|  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
 1031|       |
 1032|  96.2k|  flat = _mm_cmpeq_epi16(flat, zero);
 1033|  96.2k|  flat = _mm_and_si128(flat, mask);
 1034|       |  // replicate for the further "merged variables" usage
 1035|  96.2k|  flat = _mm_unpacklo_epi64(flat, flat);
 1036|       |
 1037|  96.2k|  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
  ------------------
  |  Branch (1037:7): [True: 27.0k, False: 69.1k]
  ------------------
 1038|  27.0k|    __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1;
 1039|       |    // Added before shift for rounding part of ROUND_POWER_OF_TWO
 1040|       |
 1041|       |    // o*p2
 1042|  27.0k|    workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
 1043|  27.0k|    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
 1044|  27.0k|    workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
 1045|  27.0k|    workp_c = _mm_add_epi16(workp_a, workp_c);
 1046|       |
 1047|       |    // o*p1
 1048|  27.0k|    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
 1049|  27.0k|    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
 1050|       |
 1051|       |    // o*p0
 1052|  27.0k|    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
 1053|  27.0k|    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
 1054|  27.0k|    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
 1055|       |
 1056|  27.0k|    flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3);
 1057|       |
 1058|       |    // oq0
 1059|  27.0k|    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
 1060|  27.0k|    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
 1061|  27.0k|    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
 1062|       |
 1063|       |    // oq1
 1064|  27.0k|    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
 1065|  27.0k|    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
 1066|  27.0k|    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
 1067|       |
 1068|  27.0k|    flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
 1069|       |
 1070|       |    // oq2
 1071|  27.0k|    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
 1072|  27.0k|    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
 1073|  27.0k|    workp_a = _mm_add_epi16(workp_a, workp_b);
 1074|  27.0k|    opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3);
 1075|       |
 1076|  27.0k|    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
 1077|  27.0k|    q1q0 = _mm_and_si128(flat, flat_q0q1);
 1078|  27.0k|    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
 1079|       |
 1080|  27.0k|    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
 1081|  27.0k|    p1p0 = _mm_and_si128(flat, flat_p1p0);
 1082|  27.0k|    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
 1083|       |
 1084|  27.0k|    work_a = _mm_andnot_si128(flat, pq[2]);
 1085|  27.0k|    *p2 = _mm_and_si128(flat, opq2);
 1086|  27.0k|    *p2 = _mm_or_si128(work_a, *p2);
 1087|       |    *q2 = _mm_srli_si128(*p2, 8);
 1088|  27.0k|  }
 1089|  96.2k|}
highbd_loopfilter_sse2.c:highbd_lpf_internal_4_sse2:
 1277|   159k|    const uint8_t *_thresh, int bd) {
 1278|   159k|  __m128i blimit, limit, thresh;
 1279|   159k|  __m128i mask, hev;
 1280|   159k|  __m128i p1p0, q1q0;
 1281|   159k|  __m128i pq[2];
 1282|       |
 1283|   159k|  __m128i abs_p1p0;
 1284|       |
 1285|   159k|  __m128i t80;
 1286|   159k|  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
 1287|       |
 1288|   159k|  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
 1289|   159k|  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
 1290|       |
 1291|   159k|  highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
 1292|   159k|                                &thresh, &hev, &mask);
 1293|       |
 1294|   159k|  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
 1295|   159k|}

aom_dc_predictor_32x32_avx2:
  323|  54.4k|                                 const uint8_t *above, const uint8_t *left) {
  324|  54.4k|  const __m256i sum_above = dc_sum_32(above);
  325|  54.4k|  __m256i sum_left = dc_sum_32(left);
  326|  54.4k|  sum_left = _mm256_add_epi16(sum_left, sum_above);
  327|  54.4k|  const __m256i thirtytwo = _mm256_set1_epi16(32);
  328|  54.4k|  sum_left = _mm256_add_epi16(sum_left, thirtytwo);
  329|  54.4k|  sum_left = _mm256_srai_epi16(sum_left, 6);
  330|  54.4k|  const __m256i zero = _mm256_setzero_si256();
  331|  54.4k|  __m256i row = _mm256_shuffle_epi8(sum_left, zero);
  332|  54.4k|  row_store_32xh(&row, 32, dst, stride);
  333|  54.4k|}
aom_dc_top_predictor_32x32_avx2:
  337|  16.3k|                                     const uint8_t *left) {
  338|  16.3k|  __m256i sum = dc_sum_32(above);
  339|  16.3k|  (void)left;
  340|       |
  341|  16.3k|  const __m256i sixteen = _mm256_set1_epi16(16);
  342|  16.3k|  sum = _mm256_add_epi16(sum, sixteen);
  343|  16.3k|  sum = _mm256_srai_epi16(sum, 5);
  344|  16.3k|  const __m256i zero = _mm256_setzero_si256();
  345|  16.3k|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  346|  16.3k|  row_store_32xh(&row, 32, dst, stride);
  347|  16.3k|}
aom_dc_left_predictor_32x32_avx2:
  351|  21.0k|                                      const uint8_t *left) {
  352|  21.0k|  __m256i sum = dc_sum_32(left);
  353|  21.0k|  (void)above;
  354|       |
  355|  21.0k|  const __m256i sixteen = _mm256_set1_epi16(16);
  356|  21.0k|  sum = _mm256_add_epi16(sum, sixteen);
  357|  21.0k|  sum = _mm256_srai_epi16(sum, 5);
  358|  21.0k|  const __m256i zero = _mm256_setzero_si256();
  359|  21.0k|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  360|  21.0k|  row_store_32xh(&row, 32, dst, stride);
  361|  21.0k|}
aom_dc_128_predictor_32x32_avx2:
  365|  10.6k|                                     const uint8_t *left) {
  366|  10.6k|  (void)above;
  367|  10.6k|  (void)left;
  368|  10.6k|  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  369|  10.6k|  row_store_32xh(&row, 32, dst, stride);
  370|  10.6k|}
aom_v_predictor_32x32_avx2:
  373|  1.41k|                                const uint8_t *above, const uint8_t *left) {
  374|  1.41k|  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
  375|  1.41k|  (void)left;
  376|  1.41k|  row_store_32xh(&row, 32, dst, stride);
  377|  1.41k|}
aom_h_predictor_32x32_avx2:
  402|  3.97k|                                const uint8_t *above, const uint8_t *left) {
  403|  3.97k|  (void)above;
  404|  3.97k|  const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
  405|       |
  406|  3.97k|  __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
  407|       |
  408|  3.97k|  __m256i v = _mm256_unpacklo_epi8(u, u);
  409|  3.97k|  h_predictor_32x8line(&v, dst, stride);
  410|  3.97k|  dst += stride << 2;
  411|       |
  412|  3.97k|  v = _mm256_unpackhi_epi8(u, u);
  413|  3.97k|  h_predictor_32x8line(&v, dst, stride);
  414|  3.97k|  dst += stride << 2;
  415|       |
  416|  3.97k|  u = _mm256_unpackhi_epi8(left_col, left_col);
  417|       |
  418|  3.97k|  v = _mm256_unpacklo_epi8(u, u);
  419|  3.97k|  h_predictor_32x8line(&v, dst, stride);
  420|  3.97k|  dst += stride << 2;
  421|       |
  422|  3.97k|  v = _mm256_unpackhi_epi8(u, u);
  423|  3.97k|  h_predictor_32x8line(&v, dst, stride);
  424|  3.97k|}
aom_dc_predictor_32x16_avx2:
  429|  16.4k|                                 const uint8_t *above, const uint8_t *left) {
  430|  16.4k|  const __m128i top_sum = dc_sum_32_sse2(above);
  431|  16.4k|  __m128i left_sum = dc_sum_16_sse2(left);
  432|  16.4k|  left_sum = _mm_add_epi16(top_sum, left_sum);
  433|  16.4k|  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(left_sum);
  434|  16.4k|  sum += 24;
  435|  16.4k|  sum /= 48;
  436|  16.4k|  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  437|  16.4k|  row_store_32xh(&row, 16, dst, stride);
  438|  16.4k|}
aom_dc_predictor_32x64_avx2:
  441|  2.16k|                                 const uint8_t *above, const uint8_t *left) {
  442|  2.16k|  const __m256i sum_above = dc_sum_32(above);
  443|  2.16k|  __m256i sum_left = dc_sum_64(left);
  444|  2.16k|  sum_left = _mm256_add_epi16(sum_left, sum_above);
  445|  2.16k|  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  446|  2.16k|  sum += 48;
  447|  2.16k|  sum /= 96;
  448|  2.16k|  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  449|  2.16k|  row_store_32xh(&row, 64, dst, stride);
  450|  2.16k|}
aom_dc_predictor_64x64_avx2:
  453|  5.58k|                                 const uint8_t *above, const uint8_t *left) {
  454|  5.58k|  const __m256i sum_above = dc_sum_64(above);
  455|  5.58k|  __m256i sum_left = dc_sum_64(left);
  456|  5.58k|  sum_left = _mm256_add_epi16(sum_left, sum_above);
  457|  5.58k|  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  458|  5.58k|  sum += 64;
  459|  5.58k|  sum /= 128;
  460|  5.58k|  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  461|  5.58k|  row_store_64xh(&row, 64, dst, stride);
  462|  5.58k|}
aom_dc_predictor_64x32_avx2:
  465|  4.31k|                                 const uint8_t *above, const uint8_t *left) {
  466|  4.31k|  const __m256i sum_above = dc_sum_64(above);
  467|  4.31k|  __m256i sum_left = dc_sum_32(left);
  468|  4.31k|  sum_left = _mm256_add_epi16(sum_left, sum_above);
  469|  4.31k|  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  470|  4.31k|  sum += 48;
  471|  4.31k|  sum /= 96;
  472|  4.31k|  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  473|  4.31k|  row_store_64xh(&row, 32, dst, stride);
  474|  4.31k|}
aom_dc_predictor_64x16_avx2:
  478|    741|                                 const uint8_t *above, const uint8_t *left) {
  479|    741|  const __m256i sum_above = dc_sum_64(above);
  480|    741|  __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
  481|    741|  sum_left = _mm256_add_epi16(sum_left, sum_above);
  482|    741|  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  483|    741|  sum += 40;
  484|    741|  sum /= 80;
  485|    741|  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  486|    741|  row_store_64xh(&row, 16, dst, stride);
  487|    741|}
aom_dc_top_predictor_32x16_avx2:
  492|  9.48k|                                     const uint8_t *left) {
  493|  9.48k|  __m256i sum = dc_sum_32(above);
  494|  9.48k|  (void)left;
  495|       |
  496|  9.48k|  const __m256i sixteen = _mm256_set1_epi16(16);
  497|  9.48k|  sum = _mm256_add_epi16(sum, sixteen);
  498|  9.48k|  sum = _mm256_srai_epi16(sum, 5);
  499|  9.48k|  const __m256i zero = _mm256_setzero_si256();
  500|  9.48k|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  501|  9.48k|  row_store_32xh(&row, 16, dst, stride);
  502|  9.48k|}
aom_dc_top_predictor_32x64_avx2:
  506|     66|                                     const uint8_t *left) {
  507|     66|  __m256i sum = dc_sum_32(above);
  508|     66|  (void)left;
  509|       |
  510|     66|  const __m256i sixteen = _mm256_set1_epi16(16);
  511|     66|  sum = _mm256_add_epi16(sum, sixteen);
  512|     66|  sum = _mm256_srai_epi16(sum, 5);
  513|     66|  const __m256i zero = _mm256_setzero_si256();
  514|     66|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  515|     66|  row_store_32xh(&row, 64, dst, stride);
  516|     66|}
aom_dc_top_predictor_64x64_avx2:
  520|  5.01k|                                     const uint8_t *left) {
  521|  5.01k|  __m256i sum = dc_sum_64(above);
  522|  5.01k|  (void)left;
  523|       |
  524|  5.01k|  const __m256i thirtytwo = _mm256_set1_epi16(32);
  525|  5.01k|  sum = _mm256_add_epi16(sum, thirtytwo);
  526|  5.01k|  sum = _mm256_srai_epi16(sum, 6);
  527|  5.01k|  const __m256i zero = _mm256_setzero_si256();
  528|  5.01k|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  529|  5.01k|  row_store_64xh(&row, 64, dst, stride);
  530|  5.01k|}
aom_dc_top_predictor_64x32_avx2:
  534|  4.12k|                                     const uint8_t *left) {
  535|  4.12k|  __m256i sum = dc_sum_64(above);
  536|  4.12k|  (void)left;
  537|       |
  538|  4.12k|  const __m256i thirtytwo = _mm256_set1_epi16(32);
  539|  4.12k|  sum = _mm256_add_epi16(sum, thirtytwo);
  540|  4.12k|  sum = _mm256_srai_epi16(sum, 6);
  541|  4.12k|  const __m256i zero = _mm256_setzero_si256();
  542|  4.12k|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  543|  4.12k|  row_store_64xh(&row, 32, dst, stride);
  544|  4.12k|}
aom_dc_top_predictor_64x16_avx2:
  549|    247|                                     const uint8_t *left) {
  550|    247|  __m256i sum = dc_sum_64(above);
  551|    247|  (void)left;
  552|       |
  553|    247|  const __m256i thirtytwo = _mm256_set1_epi16(32);
  554|    247|  sum = _mm256_add_epi16(sum, thirtytwo);
  555|    247|  sum = _mm256_srai_epi16(sum, 6);
  556|    247|  const __m256i zero = _mm256_setzero_si256();
  557|    247|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  558|    247|  row_store_64xh(&row, 16, dst, stride);
  559|    247|}
aom_dc_left_predictor_32x16_avx2:
  564|  1.08k|                                      const uint8_t *left) {
  565|  1.08k|  __m128i sum = dc_sum_16_sse2(left);
  566|  1.08k|  (void)above;
  567|       |
  568|  1.08k|  const __m128i eight = _mm_set1_epi16(8);
  569|  1.08k|  sum = _mm_add_epi16(sum, eight);
  570|  1.08k|  sum = _mm_srai_epi16(sum, 4);
  571|  1.08k|  const __m128i zero = _mm_setzero_si128();
  572|  1.08k|  const __m128i r = _mm_shuffle_epi8(sum, zero);
  573|       |  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
  574|  1.08k|  row_store_32xh(&row, 16, dst, stride);
  575|  1.08k|}
aom_dc_left_predictor_32x64_avx2:
  579|  2.11k|                                      const uint8_t *left) {
  580|  2.11k|  __m256i sum = dc_sum_64(left);
  581|  2.11k|  (void)above;
  582|       |
  583|  2.11k|  const __m256i thirtytwo = _mm256_set1_epi16(32);
  584|  2.11k|  sum = _mm256_add_epi16(sum, thirtytwo);
  585|  2.11k|  sum = _mm256_srai_epi16(sum, 6);
  586|  2.11k|  const __m256i zero = _mm256_setzero_si256();
  587|  2.11k|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  588|  2.11k|  row_store_32xh(&row, 64, dst, stride);
  589|  2.11k|}
aom_dc_left_predictor_64x64_avx2:
  593|  5.73k|                                      const uint8_t *left) {
  594|  5.73k|  __m256i sum = dc_sum_64(left);
  595|  5.73k|  (void)above;
  596|       |
  597|  5.73k|  const __m256i thirtytwo = _mm256_set1_epi16(32);
  598|  5.73k|  sum = _mm256_add_epi16(sum, thirtytwo);
  599|  5.73k|  sum = _mm256_srai_epi16(sum, 6);
  600|  5.73k|  const __m256i zero = _mm256_setzero_si256();
  601|  5.73k|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  602|  5.73k|  row_store_64xh(&row, 64, dst, stride);
  603|  5.73k|}
aom_dc_left_predictor_64x32_avx2:
  607|    107|                                      const uint8_t *left) {
  608|    107|  __m256i sum = dc_sum_32(left);
  609|    107|  (void)above;
  610|       |
  611|    107|  const __m256i sixteen = _mm256_set1_epi16(16);
  612|    107|  sum = _mm256_add_epi16(sum, sixteen);
  613|    107|  sum = _mm256_srai_epi16(sum, 5);
  614|    107|  const __m256i zero = _mm256_setzero_si256();
  615|    107|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  616|    107|  row_store_64xh(&row, 32, dst, stride);
  617|    107|}
aom_dc_left_predictor_64x16_avx2:
  622|    140|                                      const uint8_t *left) {
  623|    140|  __m128i sum = dc_sum_16_sse2(left);
  624|    140|  (void)above;
  625|       |
  626|    140|  const __m128i eight = _mm_set1_epi16(8);
  627|    140|  sum = _mm_add_epi16(sum, eight);
  628|    140|  sum = _mm_srai_epi16(sum, 4);
  629|    140|  const __m128i zero = _mm_setzero_si128();
  630|    140|  const __m128i r = _mm_shuffle_epi8(sum, zero);
  631|       |  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
  632|    140|  row_store_64xh(&row, 16, dst, stride);
  633|    140|}
aom_dc_128_predictor_32x16_avx2:
  638|     68|                                     const uint8_t *left) {
  639|     68|  (void)above;
  640|     68|  (void)left;
  641|     68|  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  642|     68|  row_store_32xh(&row, 16, dst, stride);
  643|     68|}
aom_dc_128_predictor_32x64_avx2:
  647|     44|                                     const uint8_t *left) {
  648|     44|  (void)above;
  649|     44|  (void)left;
  650|     44|  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  651|     44|  row_store_32xh(&row, 64, dst, stride);
  652|     44|}
aom_dc_128_predictor_64x64_avx2:
  656|  4.44k|                                     const uint8_t *left) {
  657|  4.44k|  (void)above;
  658|  4.44k|  (void)left;
  659|  4.44k|  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  660|  4.44k|  row_store_64xh(&row, 64, dst, stride);
  661|  4.44k|}
aom_dc_128_predictor_64x32_avx2:
  665|     38|                                     const uint8_t *left) {
  666|     38|  (void)above;
  667|     38|  (void)left;
  668|     38|  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  669|     38|  row_store_64xh(&row, 32, dst, stride);
  670|     38|}
aom_dc_128_predictor_64x16_avx2:
  675|     10|                                     const uint8_t *left) {
  676|     10|  (void)above;
  677|     10|  (void)left;
  678|     10|  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  679|     10|  row_store_64xh(&row, 16, dst, stride);
  680|     10|}
aom_v_predictor_32x16_avx2:
  684|    686|                                const uint8_t *above, const uint8_t *left) {
  685|    686|  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
  686|    686|  (void)left;
  687|    686|  row_store_32xh(&row, 16, dst, stride);
  688|    686|}
aom_v_predictor_32x64_avx2:
  691|     68|                                const uint8_t *above, const uint8_t *left) {
  692|     68|  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
  693|     68|  (void)left;
  694|     68|  row_store_32xh(&row, 64, dst, stride);
  695|     68|}
aom_v_predictor_64x64_avx2:
  698|     67|                                const uint8_t *above, const uint8_t *left) {
  699|     67|  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
  700|     67|  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
  701|     67|  (void)left;
  702|     67|  row_store_32x2xh(&row0, &row1, 64, dst, stride);
  703|     67|}
aom_v_predictor_64x32_avx2:
  706|     49|                                const uint8_t *above, const uint8_t *left) {
  707|     49|  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
  708|     49|  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
  709|     49|  (void)left;
  710|     49|  row_store_32x2xh(&row0, &row1, 32, dst, stride);
  711|     49|}
aom_v_predictor_64x16_avx2:
  715|     58|                                const uint8_t *above, const uint8_t *left) {
  716|     58|  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
  717|     58|  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
  718|     58|  (void)left;
  719|     58|  row_store_32x2xh(&row0, &row1, 16, dst, stride);
  720|     58|}
aom_paeth_predictor_16x8_avx2:
  768|  7.03k|                                   const uint8_t *above, const uint8_t *left) {
  769|  7.03k|  __m128i x = _mm_loadl_epi64((const __m128i *)left);
  770|  7.03k|  const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
  771|  7.03k|  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
  772|  7.03k|  __m256i rep = _mm256_set1_epi16((short)0x8000);
  773|  7.03k|  const __m256i one = _mm256_set1_epi16(1);
  774|  7.03k|  const __m256i top = get_top_vector(above);
  775|       |
  776|  7.03k|  int i;
  777|  63.2k|  for (i = 0; i < 8; ++i) {
  ------------------
  |  Branch (777:15): [True: 56.2k, False: 7.03k]
  ------------------
  778|  56.2k|    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  779|  56.2k|    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
  780|       |
  781|  56.2k|    _mm_store_si128((__m128i *)dst, row);
  782|  56.2k|    dst += stride;
  783|  56.2k|    rep = _mm256_add_epi16(rep, one);
  784|  56.2k|  }
  785|  7.03k|}
aom_paeth_predictor_16x16_avx2:
  793|  7.96k|                                    const uint8_t *above, const uint8_t *left) {
  794|  7.96k|  const __m256i l = get_left_vector(left);
  795|  7.96k|  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
  796|  7.96k|  __m256i rep = _mm256_set1_epi16((short)0x8000);
  797|  7.96k|  const __m256i one = _mm256_set1_epi16(1);
  798|  7.96k|  const __m256i top = get_top_vector(above);
  799|       |
  800|  7.96k|  int i;
  801|   135k|  for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (801:15): [True: 127k, False: 7.96k]
  ------------------
  802|   127k|    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  803|   127k|    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
  804|       |
  805|   127k|    _mm_store_si128((__m128i *)dst, row);
  806|   127k|    dst += stride;
  807|   127k|    rep = _mm256_add_epi16(rep, one);
  808|   127k|  }
  809|  7.96k|}
aom_paeth_predictor_16x32_avx2:
  812|  1.56k|                                    const uint8_t *above, const uint8_t *left) {
  813|  1.56k|  __m256i l = get_left_vector(left);
  814|  1.56k|  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
  815|  1.56k|  __m256i rep = _mm256_set1_epi16((short)0x8000);
  816|  1.56k|  const __m256i one = _mm256_set1_epi16(1);
  817|  1.56k|  const __m256i top = get_top_vector(above);
  818|       |
  819|  1.56k|  int i;
  820|  26.6k|  for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (820:15): [True: 25.0k, False: 1.56k]
  ------------------
  821|  25.0k|    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  822|  25.0k|    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
  823|       |
  824|  25.0k|    _mm_store_si128((__m128i *)dst, row);
  825|  25.0k|    dst += stride;
  826|  25.0k|    rep = _mm256_add_epi16(rep, one);
  827|  25.0k|  }
  828|       |
  829|  1.56k|  l = get_left_vector(left + 16);
  830|  1.56k|  rep = _mm256_set1_epi16((short)0x8000);
  831|  26.6k|  for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (831:15): [True: 25.0k, False: 1.56k]
  ------------------
  832|  25.0k|    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  833|  25.0k|    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
  834|       |
  835|  25.0k|    _mm_store_si128((__m128i *)dst, row);
  836|  25.0k|    dst += stride;
  837|  25.0k|    rep = _mm256_add_epi16(rep, one);
  838|  25.0k|  }
  839|  1.56k|}
aom_paeth_predictor_16x64_avx2:
  843|    601|                                    const uint8_t *above, const uint8_t *left) {
  844|    601|  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
  845|    601|  const __m256i one = _mm256_set1_epi16(1);
  846|    601|  const __m256i top = get_top_vector(above);
  847|       |
  848|  3.00k|  for (int j = 0; j < 4; ++j) {
  ------------------
  |  Branch (848:19): [True: 2.40k, False: 601]
  ------------------
  849|  2.40k|    const __m256i l = get_left_vector(left + j * 16);
  850|  2.40k|    __m256i rep = _mm256_set1_epi16((short)0x8000);
  851|  40.8k|    for (int i = 0; i < 16; ++i) {
  ------------------
  |  Branch (851:21): [True: 38.4k, False: 2.40k]
  ------------------
  852|  38.4k|      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  853|  38.4k|      const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
  854|       |
  855|  38.4k|      _mm_store_si128((__m128i *)dst, row);
  856|  38.4k|      dst += stride;
  857|  38.4k|      rep = _mm256_add_epi16(rep, one);
  858|  38.4k|    }
  859|  2.40k|  }
  860|    601|}
aom_paeth_predictor_32x16_avx2:
  879|  1.45k|                                    const uint8_t *above, const uint8_t *left) {
  880|  1.45k|  const __m256i l = get_left_vector(left);
  881|  1.45k|  const __m256i t0 = get_top_vector(above);
  882|  1.45k|  const __m256i t1 = get_top_vector(above + 16);
  883|  1.45k|  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  884|  1.45k|  __m256i rep = _mm256_set1_epi16((short)0x8000);
  885|  1.45k|  const __m256i one = _mm256_set1_epi16(1);
  886|       |
  887|  1.45k|  int i;
  888|  24.7k|  for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (888:15): [True: 23.3k, False: 1.45k]
  ------------------
  889|  23.3k|    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  890|       |
  891|  23.3k|    const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
  892|       |
  893|  23.3k|    _mm256_storeu_si256((__m256i *)dst, r);
  894|       |
  895|  23.3k|    dst += stride;
  896|  23.3k|    rep = _mm256_add_epi16(rep, one);
  897|  23.3k|  }
  898|  1.45k|}
aom_paeth_predictor_32x32_avx2:
  901|  3.40k|                                    const uint8_t *above, const uint8_t *left) {
  902|  3.40k|  __m256i l = get_left_vector(left);
  903|  3.40k|  const __m256i t0 = get_top_vector(above);
  904|  3.40k|  const __m256i t1 = get_top_vector(above + 16);
  905|  3.40k|  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  906|  3.40k|  __m256i rep = _mm256_set1_epi16((short)0x8000);
  907|  3.40k|  const __m256i one = _mm256_set1_epi16(1);
  908|       |
  909|  3.40k|  int i;
  910|  57.8k|  for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (910:15): [True: 54.4k, False: 3.40k]
  ------------------
  911|  54.4k|    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  912|       |
  913|  54.4k|    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
  914|  54.4k|    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
  915|       |
  916|  54.4k|    _mm_store_si128((__m128i *)dst, r0);
  917|  54.4k|    _mm_store_si128((__m128i *)(dst + 16), r1);
  918|       |
  919|  54.4k|    dst += stride;
  920|  54.4k|    rep = _mm256_add_epi16(rep, one);
  921|  54.4k|  }
  922|       |
  923|  3.40k|  l = get_left_vector(left + 16);
  924|  3.40k|  rep = _mm256_set1_epi16((short)0x8000);
  925|  57.8k|  for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (925:15): [True: 54.4k, False: 3.40k]
  ------------------
  926|  54.4k|    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  927|       |
  928|  54.4k|    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
  929|  54.4k|    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
  930|       |
  931|  54.4k|    _mm_store_si128((__m128i *)dst, r0);
  932|  54.4k|    _mm_store_si128((__m128i *)(dst + 16), r1);
  933|       |
  934|  54.4k|    dst += stride;
  935|  54.4k|    rep = _mm256_add_epi16(rep, one);
  936|  54.4k|  }
  937|  3.40k|}
aom_paeth_predictor_32x64_avx2:
  940|     98|                                    const uint8_t *above, const uint8_t *left) {
  941|     98|  const __m256i t0 = get_top_vector(above);
  942|     98|  const __m256i t1 = get_top_vector(above + 16);
  943|     98|  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  944|     98|  const __m256i one = _mm256_set1_epi16(1);
  945|       |
  946|     98|  int i, j;
  947|    490|  for (j = 0; j < 4; ++j) {
  ------------------
  |  Branch (947:15): [True: 392, False: 98]
  ------------------
  948|    392|    const __m256i l = get_left_vector(left + j * 16);
  949|    392|    __m256i rep = _mm256_set1_epi16((short)0x8000);
  950|  6.66k|    for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (950:17): [True: 6.27k, False: 392]
  ------------------
  951|  6.27k|      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  952|       |
  953|  6.27k|      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
  954|  6.27k|      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
  955|       |
  956|  6.27k|      _mm_store_si128((__m128i *)dst, r0);
  957|  6.27k|      _mm_store_si128((__m128i *)(dst + 16), r1);
  958|       |
  959|  6.27k|      dst += stride;
  960|  6.27k|      rep = _mm256_add_epi16(rep, one);
  961|  6.27k|    }
  962|    392|  }
  963|     98|}
aom_paeth_predictor_64x32_avx2:
  966|     83|                                    const uint8_t *above, const uint8_t *left) {
  967|     83|  const __m256i t0 = get_top_vector(above);
  968|     83|  const __m256i t1 = get_top_vector(above + 16);
  969|     83|  const __m256i t2 = get_top_vector(above + 32);
  970|     83|  const __m256i t3 = get_top_vector(above + 48);
  971|     83|  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  972|     83|  const __m256i one = _mm256_set1_epi16(1);
  973|       |
  974|     83|  int i, j;
  975|    249|  for (j = 0; j < 2; ++j) {
  ------------------
  |  Branch (975:15): [True: 166, False: 83]
  ------------------
  976|    166|    const __m256i l = get_left_vector(left + j * 16);
  977|    166|    __m256i rep = _mm256_set1_epi16((short)0x8000);
  978|  2.82k|    for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (978:17): [True: 2.65k, False: 166]
  ------------------
  979|  2.65k|      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  980|       |
  981|  2.65k|      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
  982|  2.65k|      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
  983|  2.65k|      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
  984|  2.65k|      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
  985|       |
  986|  2.65k|      _mm_store_si128((__m128i *)dst, r0);
  987|  2.65k|      _mm_store_si128((__m128i *)(dst + 16), r1);
  988|  2.65k|      _mm_store_si128((__m128i *)(dst + 32), r2);
  989|  2.65k|      _mm_store_si128((__m128i *)(dst + 48), r3);
  990|       |
  991|  2.65k|      dst += stride;
  992|  2.65k|      rep = _mm256_add_epi16(rep, one);
  993|  2.65k|    }
  994|    166|  }
  995|     83|}
aom_paeth_predictor_64x64_avx2:
  998|    259|                                    const uint8_t *above, const uint8_t *left) {
  999|    259|  const __m256i t0 = get_top_vector(above);
 1000|    259|  const __m256i t1 = get_top_vector(above + 16);
 1001|    259|  const __m256i t2 = get_top_vector(above + 32);
 1002|    259|  const __m256i t3 = get_top_vector(above + 48);
 1003|    259|  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
 1004|    259|  const __m256i one = _mm256_set1_epi16(1);
 1005|       |
 1006|    259|  int i, j;
 1007|  1.29k|  for (j = 0; j < 4; ++j) {
  ------------------
  |  Branch (1007:15): [True: 1.03k, False: 259]
  ------------------
 1008|  1.03k|    const __m256i l = get_left_vector(left + j * 16);
 1009|  1.03k|    __m256i rep = _mm256_set1_epi16((short)0x8000);
 1010|  17.6k|    for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (1010:17): [True: 16.5k, False: 1.03k]
  ------------------
 1011|  16.5k|      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
 1012|       |
 1013|  16.5k|      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
 1014|  16.5k|      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
 1015|  16.5k|      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
 1016|  16.5k|      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
 1017|       |
 1018|  16.5k|      _mm_store_si128((__m128i *)dst, r0);
 1019|  16.5k|      _mm_store_si128((__m128i *)(dst + 16), r1);
 1020|  16.5k|      _mm_store_si128((__m128i *)(dst + 32), r2);
 1021|  16.5k|      _mm_store_si128((__m128i *)(dst + 48), r3);
 1022|       |
 1023|  16.5k|      dst += stride;
 1024|  16.5k|      rep = _mm256_add_epi16(rep, one);
 1025|  16.5k|    }
 1026|  1.03k|  }
 1027|    259|}
aom_paeth_predictor_64x16_avx2:
 1031|     79|                                    const uint8_t *above, const uint8_t *left) {
 1032|     79|  const __m256i t0 = get_top_vector(above);
 1033|     79|  const __m256i t1 = get_top_vector(above + 16);
 1034|     79|  const __m256i t2 = get_top_vector(above + 32);
 1035|     79|  const __m256i t3 = get_top_vector(above + 48);
 1036|     79|  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
 1037|     79|  const __m256i one = _mm256_set1_epi16(1);
 1038|       |
 1039|     79|  int i;
 1040|     79|  const __m256i l = get_left_vector(left);
 1041|     79|  __m256i rep = _mm256_set1_epi16((short)0x8000);
 1042|  1.34k|  for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (1042:15): [True: 1.26k, False: 79]
  ------------------
 1043|  1.26k|    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
 1044|       |
 1045|  1.26k|    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
 1046|  1.26k|    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
 1047|  1.26k|    const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
 1048|  1.26k|    const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
 1049|       |
 1050|  1.26k|    _mm_store_si128((__m128i *)dst, r0);
 1051|  1.26k|    _mm_store_si128((__m128i *)(dst + 16), r1);
 1052|  1.26k|    _mm_store_si128((__m128i *)(dst + 32), r2);
 1053|  1.26k|    _mm_store_si128((__m128i *)(dst + 48), r3);
 1054|       |
 1055|  1.26k|    dst += stride;
 1056|  1.26k|    rep = _mm256_add_epi16(rep, one);
 1057|  1.26k|  }
 1058|     79|}
av1_highbd_dr_prediction_z1_avx2:
 1921|   101k|                                      int dx, int dy, int bd) {
 1922|   101k|  (void)left;
 1923|   101k|  (void)dy;
 1924|       |
 1925|   101k|  switch (bw) {
 1926|  36.5k|    case 4:
  ------------------
  |  Branch (1926:5): [True: 36.5k, False: 65.0k]
  ------------------
 1927|  36.5k|      highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
 1928|  36.5k|                                       dx, bd);
 1929|  36.5k|      break;
 1930|  38.1k|    case 8:
  ------------------
  |  Branch (1930:5): [True: 38.1k, False: 63.4k]
  ------------------
 1931|  38.1k|      highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
 1932|  38.1k|                                       dx, bd);
 1933|  38.1k|      break;
 1934|  20.3k|    case 16:
  ------------------
  |  Branch (1934:5): [True: 20.3k, False: 81.2k]
  ------------------
 1935|  20.3k|      highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
 1936|  20.3k|                                        dx, bd);
 1937|  20.3k|      break;
 1938|  6.08k|    case 32:
  ------------------
  |  Branch (1938:5): [True: 6.08k, False: 95.5k]
  ------------------
 1939|  6.08k|      highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
 1940|  6.08k|                                        dx, bd);
 1941|  6.08k|      break;
 1942|    438|    case 64:
  ------------------
  |  Branch (1942:5): [True: 438, False: 101k]
  ------------------
 1943|    438|      if (bd < 12) {
  ------------------
  |  Branch (1943:11): [True: 434, False: 4]
  ------------------
 1944|    434|        highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
 1945|    434|                                          upsample_above, dx);
 1946|    434|      } else {
 1947|      4|        highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
 1948|      4|                                                upsample_above, dx);
 1949|      4|      }
 1950|    438|      break;
 1951|      0|    default: break;
  ------------------
  |  Branch (1951:5): [True: 0, False: 101k]
  ------------------
 1952|   101k|  }
 1953|   101k|  return;
 1954|   101k|}
av1_highbd_dr_prediction_z2_avx2:
 2874|   183k|                                      int bd) {
 2875|   183k|  (void)bd;
 2876|   183k|  assert(dx > 0);
 2877|   183k|  assert(dy > 0);
 2878|   183k|  switch (bw) {
 2879|  63.2k|    case 4:
  ------------------
  |  Branch (2879:5): [True: 63.2k, False: 120k]
  ------------------
 2880|  63.2k|      if (bd < 12) {
  ------------------
  |  Branch (2880:11): [True: 63.0k, False: 150]
  ------------------
 2881|  63.0k|        highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
 2882|  63.0k|                                         upsample_above, upsample_left, dx, dy);
 2883|  63.0k|      } else {
 2884|    150|        highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left,
 2885|    150|                                               upsample_above, upsample_left,
 2886|    150|                                               dx, dy);
 2887|    150|      }
 2888|  63.2k|      break;
 2889|  66.3k|    case 8:
  ------------------
  |  Branch (2889:5): [True: 66.3k, False: 117k]
  ------------------
 2890|  66.3k|      if (bd < 12) {
  ------------------
  |  Branch (2890:11): [True: 65.8k, False: 490]
  ------------------
 2891|  65.8k|        highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
 2892|  65.8k|                                         upsample_above, upsample_left, dx, dy);
 2893|  65.8k|      } else {
 2894|    490|        highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
 2895|    490|                                               upsample_above, upsample_left,
 2896|    490|                                               dx, dy);
 2897|    490|      }
 2898|  66.3k|      break;
 2899|  54.3k|    default:
  ------------------
  |  Branch (2899:5): [True: 54.3k, False: 129k]
  ------------------
 2900|  54.3k|      if (bd < 12) {
  ------------------
  |  Branch (2900:11): [True: 53.8k, False: 448]
  ------------------
 2901|  53.8k|        highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
 2902|  53.8k|                                         upsample_above, upsample_left, dx, dy);
 2903|  53.8k|      } else {
 2904|    448|        highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
 2905|    448|                                               upsample_above, upsample_left,
 2906|    448|                                               dx, dy);
 2907|    448|      }
 2908|  54.3k|      break;
 2909|   183k|  }
 2910|   183k|}
av1_highbd_dr_prediction_z3_avx2:
 3342|   124k|                                      int dx, int dy, int bd) {
 3343|   124k|  (void)above;
 3344|   124k|  (void)dx;
 3345|       |
 3346|   124k|  assert(dx == 1);
 3347|   124k|  assert(dy > 0);
 3348|   124k|  if (bw == bh) {
  ------------------
  |  Branch (3348:7): [True: 84.0k, False: 40.9k]
  ------------------
 3349|  84.0k|    switch (bw) {
  ------------------
  |  Branch (3349:13): [True: 84.0k, False: 0]
  ------------------
 3350|  35.7k|      case 4:
  ------------------
  |  Branch (3350:7): [True: 35.7k, False: 48.2k]
  ------------------
 3351|  35.7k|        highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy,
 3352|  35.7k|                                         bd);
 3353|  35.7k|        break;
 3354|  33.7k|      case 8:
  ------------------
  |  Branch (3354:7): [True: 33.7k, False: 50.2k]
  ------------------
 3355|  33.7k|        highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy,
 3356|  33.7k|                                         bd);
 3357|  33.7k|        break;
 3358|  9.83k|      case 16:
  ------------------
  |  Branch (3358:7): [True: 9.83k, False: 74.1k]
  ------------------
 3359|  9.83k|        highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy,
 3360|  9.83k|                                           bd);
 3361|  9.83k|        break;
 3362|  4.43k|      case 32:
  ------------------
  |  Branch (3362:7): [True: 4.43k, False: 79.5k]
  ------------------
 3363|  4.43k|        highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy,
 3364|  4.43k|                                           bd);
 3365|  4.43k|        break;
 3366|    220|      case 64:
  ------------------
  |  Branch (3366:7): [True: 220, False: 83.7k]
  ------------------
 3367|    220|        highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy,
 3368|    220|                                           bd);
 3369|    220|        break;
 3370|  84.0k|    }
 3371|  84.0k|  } else {
 3372|  40.9k|    if (bw < bh) {
  ------------------
  |  Branch (3372:9): [True: 14.9k, False: 25.9k]
  ------------------
 3373|  14.9k|      if (bw + bw == bh) {
  ------------------
  |  Branch (3373:11): [True: 10.3k, False: 4.59k]
  ------------------
 3374|  10.3k|        switch (bw) {
  ------------------
  |  Branch (3374:17): [True: 10.3k, False: 0]
  ------------------
 3375|  3.98k|          case 4:
  ------------------
  |  Branch (3375:11): [True: 3.98k, False: 6.36k]
  ------------------
 3376|  3.98k|            highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
 3377|  3.98k|                                             dy, bd);
 3378|  3.98k|            break;
 3379|  4.93k|          case 8:
  ------------------
  |  Branch (3379:11): [True: 4.93k, False: 5.41k]
  ------------------
 3380|  4.93k|            highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
 3381|  4.93k|                                              dy, bd);
 3382|  4.93k|            break;
 3383|  1.32k|          case 16:
  ------------------
  |  Branch (3383:11): [True: 1.32k, False: 9.03k]
  ------------------
 3384|  1.32k|            highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
 3385|  1.32k|                                               dy, bd);
 3386|  1.32k|            break;
 3387|    109|          case 32:
  ------------------
  |  Branch (3387:11): [True: 109, False: 10.2k]
  ------------------
 3388|    109|            highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
 3389|    109|                                               dy, bd);
 3390|    109|            break;
 3391|  10.3k|        }
 3392|  10.3k|      } else {
 3393|  4.59k|        switch (bw) {
  ------------------
  |  Branch (3393:17): [True: 4.59k, False: 0]
  ------------------
 3394|      0|#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 3395|  2.96k|          case 4:
  ------------------
  |  Branch (3395:11): [True: 2.96k, False: 1.63k]
  ------------------
 3396|  2.96k|            highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
 3397|  2.96k|                                              dy, bd);
 3398|  2.96k|            break;
 3399|  1.47k|          case 8:
  ------------------
  |  Branch (3399:11): [True: 1.47k, False: 3.12k]
  ------------------
 3400|  1.47k|            highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
 3401|  1.47k|                                              dy, bd);
 3402|  1.47k|            break;
 3403|    163|          case 16:
  ------------------
  |  Branch (3403:11): [True: 163, False: 4.43k]
  ------------------
 3404|    163|            highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
 3405|    163|                                               dy, bd);
 3406|    163|            break;
 3407|  4.59k|#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 3408|  4.59k|        }
 3409|  4.59k|      }
 3410|  25.9k|    } else {
 3411|  25.9k|      if (bh + bh == bw) {
  ------------------
  |  Branch (3411:11): [True: 16.0k, False: 9.95k]
  ------------------
 3412|  16.0k|        switch (bh) {
  ------------------
  |  Branch (3412:17): [True: 16.0k, False: 0]
  ------------------
 3413|  6.38k|          case 4:
  ------------------
  |  Branch (3413:11): [True: 6.38k, False: 9.65k]
  ------------------
 3414|  6.38k|            highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left,
 3415|  6.38k|                                             dy, bd);
 3416|  6.38k|            break;
 3417|  7.83k|          case 8:
  ------------------
  |  Branch (3417:11): [True: 7.83k, False: 8.20k]
  ------------------
 3418|  7.83k|            highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left,
 3419|  7.83k|                                              dy, bd);
 3420|  7.83k|            break;
 3421|  1.73k|          case 16:
  ------------------
  |  Branch (3421:11): [True: 1.73k, False: 14.3k]
  ------------------
 3422|  1.73k|            highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
 3423|  1.73k|                                               dy, bd);
 3424|  1.73k|            break;
 3425|     86|          case 32:
  ------------------
  |  Branch (3425:11): [True: 86, False: 15.9k]
  ------------------
 3426|     86|            highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
 3427|     86|                                               dy, bd);
 3428|     86|            break;
 3429|  16.0k|        }
 3430|  16.0k|      } else {
 3431|  9.95k|        switch (bh) {
  ------------------
  |  Branch (3431:17): [True: 9.95k, False: 0]
  ------------------
 3432|      0|#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 3433|  7.46k|          case 4:
  ------------------
  |  Branch (3433:11): [True: 7.46k, False: 2.49k]
  ------------------
 3434|  7.46k|            highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left,
 3435|  7.46k|                                              dy, bd);
 3436|  7.46k|            break;
 3437|  2.28k|          case 8:
  ------------------
  |  Branch (3437:11): [True: 2.28k, False: 7.67k]
  ------------------
 3438|  2.28k|            highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left,
 3439|  2.28k|                                              dy, bd);
 3440|  2.28k|            break;
 3441|    212|          case 16:
  ------------------
  |  Branch (3441:11): [True: 212, False: 9.74k]
  ------------------
 3442|    212|            highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
 3443|    212|                                               dy, bd);
 3444|    212|            break;
 3445|  9.95k|#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 3446|  9.95k|        }
 3447|  9.95k|      }
 3448|  25.9k|    }
 3449|  40.9k|  }
 3450|   124k|  return;
 3451|   124k|}
av1_dr_prediction_z1_avx2:
 3818|   126k|                               int upsample_above, int dx, int dy) {
 3819|   126k|  (void)left;
 3820|   126k|  (void)dy;
 3821|   126k|  switch (bw) {
 3822|  60.0k|    case 4:
  ------------------
  |  Branch (3822:5): [True: 60.0k, False: 66.3k]
  ------------------
 3823|  60.0k|      dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
 3824|  60.0k|      break;
 3825|  34.5k|    case 8:
  ------------------
  |  Branch (3825:5): [True: 34.5k, False: 91.9k]
  ------------------
 3826|  34.5k|      dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
 3827|  34.5k|      break;
 3828|  24.0k|    case 16:
  ------------------
  |  Branch (3828:5): [True: 24.0k, False: 102k]
  ------------------
 3829|  24.0k|      dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
 3830|  24.0k|      break;
 3831|  7.34k|    case 32:
  ------------------
  |  Branch (3831:5): [True: 7.34k, False: 119k]
  ------------------
 3832|  7.34k|      dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
 3833|  7.34k|      break;
 3834|    478|    case 64:
  ------------------
  |  Branch (3834:5): [True: 478, False: 125k]
  ------------------
 3835|    478|      dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
 3836|    478|      break;
 3837|      0|    default: break;
  ------------------
  |  Branch (3837:5): [True: 0, False: 126k]
  ------------------
 3838|   126k|  }
 3839|   126k|  return;
 3840|   126k|}
av1_dr_prediction_z2_avx2:
 4247|   205k|                               int dy) {
 4248|   205k|  assert(dx > 0);
 4249|   205k|  assert(dy > 0);
 4250|   205k|  switch (bw) {
 4251|  80.6k|    case 4:
  ------------------
  |  Branch (4251:5): [True: 80.6k, False: 125k]
  ------------------
 4252|  80.6k|      dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
 4253|  80.6k|                                upsample_left, dx, dy);
 4254|  80.6k|      break;
 4255|  64.2k|    case 8:
  ------------------
  |  Branch (4255:5): [True: 64.2k, False: 141k]
  ------------------
 4256|  64.2k|      dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
 4257|  64.2k|                                upsample_left, dx, dy);
 4258|  64.2k|      break;
 4259|  60.7k|    default:
  ------------------
  |  Branch (4259:5): [True: 60.7k, False: 144k]
  ------------------
 4260|  60.7k|      dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
 4261|  60.7k|                                upsample_above, upsample_left, dx, dy);
 4262|  60.7k|      break;
 4263|   205k|  }
 4264|   205k|  return;
 4265|   205k|}
av1_dr_prediction_z3_avx2:
 4652|   149k|                               int upsample_left, int dx, int dy) {
 4653|   149k|  (void)above;
 4654|   149k|  (void)dx;
 4655|   149k|  assert(dx == 1);
 4656|   149k|  assert(dy > 0);
 4657|       |
 4658|   149k|  if (bw == bh) {
  ------------------
  |  Branch (4658:7): [True: 102k, False: 46.6k]
  ------------------
 4659|   102k|    switch (bw) {
  ------------------
  |  Branch (4659:13): [True: 102k, False: 0]
  ------------------
 4660|  57.8k|      case 4:
  ------------------
  |  Branch (4660:7): [True: 57.8k, False: 44.7k]
  ------------------
 4661|  57.8k|        dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
 4662|  57.8k|        break;
 4663|  29.5k|      case 8:
  ------------------
  |  Branch (4663:7): [True: 29.5k, False: 73.0k]
  ------------------
 4664|  29.5k|        dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
 4665|  29.5k|        break;
 4666|  11.1k|      case 16:
  ------------------
  |  Branch (4666:7): [True: 11.1k, False: 91.4k]
  ------------------
 4667|  11.1k|        dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
 4668|  11.1k|        break;
 4669|  3.75k|      case 32:
  ------------------
  |  Branch (4669:7): [True: 3.75k, False: 98.8k]
  ------------------
 4670|  3.75k|        dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
 4671|  3.75k|        break;
 4672|    310|      case 64:
  ------------------
  |  Branch (4672:7): [True: 310, False: 102k]
  ------------------
 4673|    310|        dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
 4674|    310|        break;
 4675|   102k|    }
 4676|   102k|  } else {
 4677|  46.6k|    if (bw < bh) {
  ------------------
  |  Branch (4677:9): [True: 18.1k, False: 28.5k]
  ------------------
 4678|  18.1k|      if (bw + bw == bh) {
  ------------------
  |  Branch (4678:11): [True: 11.9k, False: 6.15k]
  ------------------
 4679|  11.9k|        switch (bw) {
  ------------------
  |  Branch (4679:17): [True: 11.9k, False: 0]
  ------------------
 4680|  4.06k|          case 4:
  ------------------
  |  Branch (4680:11): [True: 4.06k, False: 7.88k]
  ------------------
 4681|  4.06k|            dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
 4682|  4.06k|            break;
 4683|  6.02k|          case 8:
  ------------------
  |  Branch (4683:11): [True: 6.02k, False: 5.91k]
  ------------------
 4684|  6.02k|            dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
 4685|  6.02k|            break;
 4686|  1.55k|          case 16:
  ------------------
  |  Branch (4686:11): [True: 1.55k, False: 10.3k]
  ------------------
 4687|  1.55k|            dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
 4688|  1.55k|            break;
 4689|    293|          case 32:
  ------------------
  |  Branch (4689:11): [True: 293, False: 11.6k]
  ------------------
 4690|    293|            dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
 4691|    293|            break;
 4692|  11.9k|        }
 4693|  11.9k|      } else {
 4694|  6.15k|        switch (bw) {
  ------------------
  |  Branch (4694:17): [True: 6.15k, False: 0]
  ------------------
 4695|      0|#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 4696|  3.53k|          case 4:
  ------------------
  |  Branch (4696:11): [True: 3.53k, False: 2.62k]
  ------------------
 4697|  3.53k|            dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
 4698|  3.53k|            break;
 4699|  2.34k|          case 8:
  ------------------
  |  Branch (4699:11): [True: 2.34k, False: 3.81k]
  ------------------
 4700|  2.34k|            dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
 4701|  2.34k|            break;
 4702|    280|          case 16:
  ------------------
  |  Branch (4702:11): [True: 280, False: 5.87k]
  ------------------
 4703|    280|            dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
 4704|    280|            break;
 4705|  6.15k|#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 4706|  6.15k|        }
 4707|  6.15k|      }
 4708|  28.5k|    } else {
 4709|  28.5k|      if (bh + bh == bw) {
  ------------------
  |  Branch (4709:11): [True: 18.1k, False: 10.4k]
  ------------------
 4710|  18.1k|        switch (bh) {
  ------------------
  |  Branch (4710:17): [True: 18.1k, False: 0]
  ------------------
 4711|  6.05k|          case 4:
  ------------------
  |  Branch (4711:11): [True: 6.05k, False: 12.0k]
  ------------------
 4712|  6.05k|            dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
 4713|  6.05k|            break;
 4714|  9.88k|          case 8:
  ------------------
  |  Branch (4714:11): [True: 9.88k, False: 8.25k]
  ------------------
 4715|  9.88k|            dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
 4716|  9.88k|            break;
 4717|  2.10k|          case 16:
  ------------------
  |  Branch (4717:11): [True: 2.10k, False: 16.0k]
  ------------------
 4718|  2.10k|            dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
 4719|  2.10k|            break;
 4720|    100|          case 32:
  ------------------
  |  Branch (4720:11): [True: 100, False: 18.0k]
  ------------------
 4721|    100|            dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
 4722|    100|            break;
 4723|  18.1k|        }
 4724|  18.1k|      } else {
 4725|  10.4k|        switch (bh) {
  ------------------
  |  Branch (4725:17): [True: 10.4k, False: 18.4E]
  ------------------
 4726|      0|#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 4727|  7.76k|          case 4:
  ------------------
  |  Branch (4727:11): [True: 7.76k, False: 2.65k]
  ------------------
 4728|  7.76k|            dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
 4729|  7.76k|            break;
 4730|  2.44k|          case 8:
  ------------------
  |  Branch (4730:11): [True: 2.44k, False: 7.97k]
  ------------------
 4731|  2.44k|            dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
 4732|  2.44k|            break;
 4733|    211|          case 16:
  ------------------
  |  Branch (4733:11): [True: 211, False: 10.2k]
  ------------------
 4734|    211|            dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
 4735|    211|            break;
 4736|  10.4k|#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 4737|  10.4k|        }
 4738|  10.4k|      }
 4739|  28.5k|    }
 4740|  46.6k|  }
 4741|   149k|}
intrapred_avx2.c:dc_sum_32:
   32|   162k|static inline __m256i dc_sum_32(const uint8_t *ref) {
   33|   162k|  const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
   34|   162k|  const __m256i zero = _mm256_setzero_si256();
   35|   162k|  __m256i y = _mm256_sad_epu8(x, zero);
   36|       |  __m256i u = _mm256_permute2x128_si256(y, y, 1);
   37|   162k|  y = _mm256_add_epi64(u, y);
   38|   162k|  u = _mm256_unpackhi_epi64(y, y);
   39|   162k|  return _mm256_add_epi16(y, u);
   40|   162k|}
intrapred_avx2.c:row_store_32xh:
   43|   136k|                                  ptrdiff_t stride) {
   44|  4.18M|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (44:19): [True: 4.05M, False: 136k]
  ------------------
   45|  4.05M|    _mm256_storeu_si256((__m256i *)dst, *r);
   46|  4.05M|    dst += stride;
   47|  4.05M|  }
   48|   136k|}
intrapred_avx2.c:h_predictor_32x8line:
  384|  15.9k|                                        ptrdiff_t stride) {
  385|  15.9k|  __m256i t[4];
  386|  15.9k|  __m256i m = _mm256_setzero_si256();
  387|  15.9k|  const __m256i inc = _mm256_set1_epi8(4);
  388|  15.9k|  int i;
  389|       |
  390|  79.5k|  for (i = 0; i < 4; i++) {
  ------------------
  |  Branch (390:15): [True: 63.6k, False: 15.9k]
  ------------------
  391|  63.6k|    t[i] = _mm256_shuffle_epi8(*row, m);
  392|  63.6k|    __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
  393|       |    __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
  394|  63.6k|    _mm256_storeu_si256((__m256i *)dst, r0);
  395|  63.6k|    _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
  396|  63.6k|    dst += stride;
  397|  63.6k|    m = _mm256_add_epi8(m, inc);
  398|  63.6k|  }
  399|  15.9k|}
intrapred_avx2.c:dc_sum_64:
   19|  35.6k|static inline __m256i dc_sum_64(const uint8_t *ref) {
   20|  35.6k|  const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
   21|  35.6k|  const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
   22|  35.6k|  const __m256i zero = _mm256_setzero_si256();
   23|  35.6k|  __m256i y0 = _mm256_sad_epu8(x0, zero);
   24|  35.6k|  __m256i y1 = _mm256_sad_epu8(x1, zero);
   25|  35.6k|  y0 = _mm256_add_epi64(y0, y1);
   26|       |  __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
   27|  35.6k|  y0 = _mm256_add_epi64(u0, y0);
   28|  35.6k|  u0 = _mm256_unpackhi_epi64(y0, y0);
   29|  35.6k|  return _mm256_add_epi16(y0, u0);
   30|  35.6k|}
intrapred_avx2.c:row_store_64xh:
   61|  30.4k|                                  ptrdiff_t stride) {
   62|  1.65M|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (62:19): [True: 1.62M, False: 30.4k]
  ------------------
   63|  1.62M|    _mm256_storeu_si256((__m256i *)dst, *r);
   64|  1.62M|    _mm256_storeu_si256((__m256i *)(dst + 32), *r);
   65|  1.62M|    dst += stride;
   66|  1.62M|  }
   67|  30.4k|}
intrapred_avx2.c:row_store_32x2xh:
   52|    174|                                    ptrdiff_t stride) {
   53|  6.95k|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (53:19): [True: 6.78k, False: 174]
  ------------------
   54|  6.78k|    _mm256_storeu_si256((__m256i *)dst, *r0);
   55|  6.78k|    _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
   56|  6.78k|    dst += stride;
   57|  6.78k|  }
   58|    174|}
intrapred_avx2.c:get_top_vector:
  759|  28.7k|static inline __m256i get_top_vector(const uint8_t *above) {
  760|  28.7k|  const __m128i x = _mm_load_si128((const __m128i *)above);
  761|  28.7k|  const __m128i zero = _mm_setzero_si128();
  762|  28.7k|  const __m128i t0 = _mm_unpacklo_epi8(x, zero);
  763|  28.7k|  const __m128i t1 = _mm_unpackhi_epi8(x, zero);
  764|       |  return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
  765|  28.7k|}
intrapred_avx2.c:paeth_16x1_pred:
  752|   584k|                                      const __m256i *topleft) {
  753|   584k|  const __m256i p0 = paeth_pred(left, top, topleft);
  754|       |  const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
  755|   584k|  const __m256i p = _mm256_packus_epi16(p0, p1);
  756|   584k|  return _mm256_castsi256_si128(p);
  757|   584k|}
intrapred_avx2.c:paeth_pred:
  728|   631k|                                 const __m256i *topleft) {
  729|   631k|  const __m256i base =
  730|   631k|      _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
  731|       |
  732|   631k|  __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
  733|   631k|  __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
  734|   631k|  __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
  735|       |
  736|   631k|  __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
  737|   631k|  mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
  738|   631k|  __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
  739|       |
  740|   631k|  pl = _mm256_andnot_si256(mask1, *left);
  741|       |
  742|   631k|  ptl = _mm256_and_si256(mask2, *topleft);
  743|   631k|  pt = _mm256_andnot_si256(mask2, *top);
  744|   631k|  pt = _mm256_or_si256(pt, ptl);
  745|   631k|  pt = _mm256_and_si256(mask1, pt);
  746|       |
  747|   631k|  return _mm256_or_si256(pt, pl);
  748|   631k|}
intrapred_avx2.c:get_left_vector:
  787|  23.4k|static inline __m256i get_left_vector(const uint8_t *left) {
  788|  23.4k|  const __m128i x = _mm_load_si128((const __m128i *)left);
  789|       |  return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
  790|  23.4k|}
intrapred_avx2.c:paeth_32x1_pred:
  866|  23.3k|                                      const __m256i *topleft) {
  867|  23.3k|  __m256i p0 = paeth_pred(left, top0, topleft);
  868|  23.3k|  __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
  869|  23.3k|  const __m256i x0 = _mm256_packus_epi16(p0, p1);
  870|       |
  871|  23.3k|  p0 = paeth_pred(left, top1, topleft);
  872|  23.3k|  p1 = _mm256_permute4x64_epi64(p0, 0xe);
  873|  23.3k|  const __m256i x1 = _mm256_packus_epi16(p0, p1);
  874|       |
  875|       |  return _mm256_permute2x128_si256(x0, x1, 0x20);
  876|  23.3k|}
intrapred_avx2.c:highbd_dr_prediction_z1_4xN_avx2:
 1207|  36.5k|                                             int bd) {
 1208|  36.5k|  __m128i dstvec[16];
 1209|  36.5k|  if (bd < 12) {
  ------------------
  |  Branch (1209:7): [True: 36.5k, False: 68]
  ------------------
 1210|  36.5k|    highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
 1211|  36.5k|                                              dx);
 1212|  36.5k|  } else {
 1213|     68|    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above,
 1214|     68|                                                    upsample_above, dx);
 1215|     68|  }
 1216|   226k|  for (int i = 0; i < N; i++) {
  ------------------
  |  Branch (1216:19): [True: 189k, False: 36.5k]
  ------------------
 1217|   189k|    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
 1218|   189k|  }
 1219|  36.5k|}
intrapred_avx2.c:highbd_dr_prediction_z1_4xN_internal_avx2:
 1064|  86.0k|    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
 1065|  86.0k|  const int frac_bits = 6 - upsample_above;
 1066|  86.0k|  const int max_base_x = ((N + 4) - 1) << upsample_above;
 1067|       |
 1068|  86.0k|  assert(dx > 0);
 1069|       |  // pre-filter above pixels
 1070|       |  // store in temp buffers:
 1071|       |  //   above[x] * 32 + 16
 1072|       |  //   above[x+1] - above[x]
 1073|       |  // final pixels will be calculated as:
 1074|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1075|  86.0k|  __m256i a0, a1, a32, a16;
 1076|  86.0k|  __m256i diff, c3f;
 1077|  86.0k|  __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
 1078|  86.0k|  __m128i a0_128, a1_128;
 1079|  86.0k|  a16 = _mm256_set1_epi16(16);
 1080|  86.0k|  a_mbase_x = _mm_set1_epi16(above[max_base_x]);
 1081|  86.0k|  max_base_x128 = _mm_set1_epi16(max_base_x);
 1082|  86.0k|  c3f = _mm256_set1_epi16(0x3f);
 1083|       |
 1084|  86.0k|  int x = dx;
 1085|   587k|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (1085:19): [True: 501k, False: 85.7k]
  ------------------
 1086|   501k|    __m256i b, res, shift;
 1087|   501k|    __m128i res1;
 1088|       |
 1089|   501k|    int base = x >> frac_bits;
 1090|   501k|    if (base >= max_base_x) {
  ------------------
  |  Branch (1090:9): [True: 290, False: 501k]
  ------------------
 1091|    748|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1091:23): [True: 458, False: 290]
  ------------------
 1092|    458|        dst[i] = a_mbase_x;  // save 4 values
 1093|    458|      }
 1094|    290|      return;
 1095|    290|    }
 1096|       |
 1097|   501k|    a0_128 = _mm_loadu_si128((__m128i *)(above + base));
 1098|   501k|    a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
 1099|       |
 1100|   501k|    if (upsample_above) {
  ------------------
  |  Branch (1100:9): [True: 246k, False: 255k]
  ------------------
 1101|   246k|      a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]);
 1102|   246k|      a1_128 = _mm_srli_si128(a0_128, 8);
 1103|       |
 1104|   246k|      base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8,
 1105|   246k|                                   base + 10, base + 12, base + 14);
 1106|   246k|      shift = _mm256_srli_epi16(
 1107|   246k|          _mm256_and_si256(
 1108|   246k|              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above),
 1109|   246k|              _mm256_set1_epi16(0x3f)),
 1110|   246k|          1);
 1111|   255k|    } else {
 1112|   255k|      base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4,
 1113|   255k|                                   base + 5, base + 6, base + 7);
 1114|   255k|      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 1115|   255k|    }
 1116|   501k|    a0 = _mm256_castsi128_si256(a0_128);
 1117|   501k|    a1 = _mm256_castsi128_si256(a1_128);
 1118|   501k|    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
 1119|   501k|    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
 1120|   501k|    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 1121|       |
 1122|   501k|    b = _mm256_mullo_epi16(diff, shift);
 1123|   501k|    res = _mm256_add_epi16(a32, b);
 1124|   501k|    res = _mm256_srli_epi16(res, 5);
 1125|   501k|    res1 = _mm256_castsi256_si128(res);
 1126|       |
 1127|   501k|    mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128);
 1128|   501k|    dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
 1129|   501k|    x += dx;
 1130|   501k|  }
 1131|  86.0k|}
intrapred_avx2.c:highbd_dr_prediction_32bit_z1_4xN_internal_avx2:
 1134|    182|    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
 1135|    182|  const int frac_bits = 6 - upsample_above;
 1136|    182|  const int max_base_x = ((N + 4) - 1) << upsample_above;
 1137|       |
 1138|    182|  assert(dx > 0);
 1139|       |  // pre-filter above pixels
 1140|       |  // store in temp buffers:
 1141|       |  //   above[x] * 32 + 16
 1142|       |  //   above[x+1] - above[x]
 1143|       |  // final pixels will be calculated as:
 1144|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1145|    182|  __m256i a0, a1, a32, a16;
 1146|    182|  __m256i diff;
 1147|    182|  __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
 1148|       |
 1149|    182|  a16 = _mm256_set1_epi32(16);
 1150|    182|  a_mbase_x = _mm_set1_epi16(above[max_base_x]);
 1151|    182|  max_base_x128 = _mm_set1_epi32(max_base_x);
 1152|       |
 1153|    182|  int x = dx;
 1154|  1.41k|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (1154:19): [True: 1.23k, False: 182]
  ------------------
 1155|  1.23k|    __m256i b, res, shift;
 1156|  1.23k|    __m128i res1;
 1157|       |
 1158|  1.23k|    int base = x >> frac_bits;
 1159|  1.23k|    if (base >= max_base_x) {
  ------------------
  |  Branch (1159:9): [True: 0, False: 1.23k]
  ------------------
 1160|      0|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1160:23): [True: 0, False: 0]
  ------------------
 1161|      0|        dst[i] = a_mbase_x;  // save 4 values
 1162|      0|      }
 1163|      0|      return;
 1164|      0|    }
 1165|       |
 1166|  1.23k|    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
 1167|  1.23k|    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
 1168|       |
 1169|  1.23k|    if (upsample_above) {
  ------------------
  |  Branch (1169:9): [True: 600, False: 632]
  ------------------
 1170|    600|      a0 = _mm256_permutevar8x32_epi32(
 1171|    600|          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
 1172|    600|      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
 1173|    600|      base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
 1174|    600|      shift = _mm256_srli_epi32(
 1175|    600|          _mm256_and_si256(
 1176|    600|              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
 1177|    600|              _mm256_set1_epi32(0x3f)),
 1178|    600|          1);
 1179|    632|    } else {
 1180|    632|      base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
 1181|    632|      shift = _mm256_srli_epi32(
 1182|    632|          _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
 1183|    632|    }
 1184|       |
 1185|  1.23k|    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
 1186|  1.23k|    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
 1187|  1.23k|    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
 1188|       |
 1189|  1.23k|    b = _mm256_mullo_epi32(diff, shift);
 1190|  1.23k|    res = _mm256_add_epi32(a32, b);
 1191|  1.23k|    res = _mm256_srli_epi32(res, 5);
 1192|       |
 1193|  1.23k|    res1 = _mm256_castsi256_si128(res);
 1194|  1.23k|    res1 = _mm_packus_epi32(res1, res1);
 1195|       |
 1196|  1.23k|    mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
 1197|  1.23k|    mask128 = _mm_packs_epi32(mask128, mask128);  // goto 16 bit
 1198|  1.23k|    dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
 1199|  1.23k|    x += dx;
 1200|  1.23k|  }
 1201|    182|}
intrapred_avx2.c:highbd_dr_prediction_z1_8xN_avx2:
 1390|  38.1k|                                             int bd) {
 1391|  38.1k|  __m128i dstvec[32];
 1392|  38.1k|  if (bd < 12) {
  ------------------
  |  Branch (1392:7): [True: 38.0k, False: 110]
  ------------------
 1393|  38.0k|    highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
 1394|  38.0k|                                              dx);
 1395|  38.0k|  } else {
 1396|    110|    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above,
 1397|    110|                                                    upsample_above, dx);
 1398|    110|  }
 1399|   379k|  for (int i = 0; i < N; i++) {
  ------------------
  |  Branch (1399:19): [True: 341k, False: 38.1k]
  ------------------
 1400|   341k|    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
 1401|   341k|  }
 1402|  38.1k|}
intrapred_avx2.c:highbd_dr_prediction_z1_8xN_internal_avx2:
 1305|  85.5k|    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
 1306|  85.5k|  const int frac_bits = 6 - upsample_above;
 1307|  85.5k|  const int max_base_x = ((8 + N) - 1) << upsample_above;
 1308|       |
 1309|  85.5k|  assert(dx > 0);
 1310|       |  // pre-filter above pixels
 1311|       |  // store in temp buffers:
 1312|       |  //   above[x] * 32 + 16
 1313|       |  //   above[x+1] - above[x]
 1314|       |  // final pixels will be calculated as:
 1315|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1316|  85.5k|  __m256i a0, a1, a32, a16, c3f;
 1317|  85.5k|  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 1318|  85.5k|  __m128i a0_x128, a1_x128;
 1319|       |
 1320|  85.5k|  a16 = _mm256_set1_epi16(16);
 1321|  85.5k|  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
 1322|  85.5k|  max_base_x256 = _mm256_set1_epi16(max_base_x);
 1323|  85.5k|  c3f = _mm256_set1_epi16(0x3f);
 1324|       |
 1325|  85.5k|  int x = dx;
 1326|   906k|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (1326:19): [True: 820k, False: 85.3k]
  ------------------
 1327|   820k|    __m256i b, res, res1, shift;
 1328|       |
 1329|   820k|    int base = x >> frac_bits;
 1330|   820k|    if (base >= max_base_x) {
  ------------------
  |  Branch (1330:9): [True: 191, False: 820k]
  ------------------
 1331|    622|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1331:23): [True: 431, False: 191]
  ------------------
 1332|    431|        dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
 1333|    431|      }
 1334|    191|      return;
 1335|    191|    }
 1336|       |
 1337|   820k|    a0_x128 = _mm_loadu_si128((__m128i *)(above + base));
 1338|   820k|    if (upsample_above) {
  ------------------
  |  Branch (1338:9): [True: 276k, False: 544k]
  ------------------
 1339|   276k|      __m128i mask, atmp0, atmp1, atmp2, atmp3;
 1340|   276k|      a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8));
 1341|   276k|      atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
 1342|   276k|      atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
 1343|   276k|      atmp2 =
 1344|   276k|          _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
 1345|   276k|      atmp3 =
 1346|   276k|          _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
 1347|   276k|      mask =
 1348|   276k|          _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15));
 1349|   276k|      a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
 1350|   276k|      mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16),
 1351|   276k|                            _mm_set1_epi8(15));
 1352|   276k|      a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
 1353|       |
 1354|   276k|      base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6,
 1355|   276k|                                      base + 8, base + 10, base + 12, base + 14,
 1356|   276k|                                      0, 0, 0, 0, 0, 0, 0, 0);
 1357|   276k|      shift = _mm256_srli_epi16(
 1358|   276k|          _mm256_and_si256(
 1359|   276k|              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
 1360|   276k|          1);
 1361|   544k|    } else {
 1362|   544k|      a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1));
 1363|   544k|      base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
 1364|   544k|                                      base + 4, base + 5, base + 6, base + 7, 0,
 1365|   544k|                                      0, 0, 0, 0, 0, 0, 0);
 1366|   544k|      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 1367|   544k|    }
 1368|   820k|    a0 = _mm256_castsi128_si256(a0_x128);
 1369|   820k|    a1 = _mm256_castsi128_si256(a1_x128);
 1370|       |
 1371|   820k|    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
 1372|   820k|    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
 1373|   820k|    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 1374|       |
 1375|   820k|    b = _mm256_mullo_epi16(diff, shift);
 1376|   820k|    res = _mm256_add_epi16(a32, b);
 1377|   820k|    res = _mm256_srli_epi16(res, 5);
 1378|       |
 1379|   820k|    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
 1380|   820k|    res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256);
 1381|   820k|    dst[r] = _mm256_castsi256_si128(res1);
 1382|   820k|    x += dx;
 1383|   820k|  }
 1384|  85.5k|}
intrapred_avx2.c:highbd_dr_prediction_32bit_z1_8xN_internal_avx2:
 1222|    480|    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
 1223|    480|  const int frac_bits = 6 - upsample_above;
 1224|    480|  const int max_base_x = ((8 + N) - 1) << upsample_above;
 1225|       |
 1226|    480|  assert(dx > 0);
 1227|       |  // pre-filter above pixels
 1228|       |  // store in temp buffers:
 1229|       |  //   above[x] * 32 + 16
 1230|       |  //   above[x+1] - above[x]
 1231|       |  // final pixels will be calculated as:
 1232|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1233|    480|  __m256i a0, a1, a0_1, a1_1, a32, a16;
 1234|    480|  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 1235|       |
 1236|    480|  a16 = _mm256_set1_epi32(16);
 1237|    480|  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
 1238|    480|  max_base_x256 = _mm256_set1_epi32(max_base_x);
 1239|       |
 1240|    480|  int x = dx;
 1241|  5.56k|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (1241:19): [True: 5.08k, False: 480]
  ------------------
 1242|  5.08k|    __m256i b, res, res1, shift;
 1243|       |
 1244|  5.08k|    int base = x >> frac_bits;
 1245|  5.08k|    if (base >= max_base_x) {
  ------------------
  |  Branch (1245:9): [True: 0, False: 5.08k]
  ------------------
 1246|      0|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1246:23): [True: 0, False: 0]
  ------------------
 1247|      0|        dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
 1248|      0|      }
 1249|      0|      return;
 1250|      0|    }
 1251|       |
 1252|  5.08k|    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
 1253|  5.08k|    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
 1254|       |
 1255|  5.08k|    if (upsample_above) {
  ------------------
  |  Branch (1255:9): [True: 2.08k, False: 3.00k]
  ------------------
 1256|  2.08k|      a0 = _mm256_permutevar8x32_epi32(
 1257|  2.08k|          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
 1258|  2.08k|      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
 1259|       |
 1260|  2.08k|      a0_1 =
 1261|  2.08k|          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
 1262|  2.08k|      a0_1 = _mm256_permutevar8x32_epi32(
 1263|  2.08k|          a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
 1264|  2.08k|      a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
 1265|       |
 1266|  2.08k|      a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
 1267|  2.08k|      a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
 1268|  2.08k|      base_inc256 =
 1269|  2.08k|          _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
 1270|  2.08k|                            base + 10, base + 12, base + 14);
 1271|  2.08k|      shift = _mm256_srli_epi32(
 1272|  2.08k|          _mm256_and_si256(
 1273|  2.08k|              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
 1274|  2.08k|              _mm256_set1_epi32(0x3f)),
 1275|  2.08k|          1);
 1276|  3.00k|    } else {
 1277|  3.00k|      base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
 1278|  3.00k|                                      base + 4, base + 5, base + 6, base + 7);
 1279|  3.00k|      shift = _mm256_srli_epi32(
 1280|  3.00k|          _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
 1281|  3.00k|    }
 1282|       |
 1283|  5.08k|    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
 1284|  5.08k|    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
 1285|  5.08k|    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
 1286|       |
 1287|  5.08k|    b = _mm256_mullo_epi32(diff, shift);
 1288|  5.08k|    res = _mm256_add_epi32(a32, b);
 1289|  5.08k|    res = _mm256_srli_epi32(res, 5);
 1290|       |
 1291|  5.08k|    res1 = _mm256_packus_epi32(
 1292|  5.08k|        res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
 1293|       |
 1294|  5.08k|    mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
 1295|  5.08k|    mask256 = _mm256_packs_epi32(
 1296|  5.08k|        mask256, _mm256_castsi128_si256(
 1297|       |                     _mm256_extracti128_si256(mask256, 1)));  // goto 16 bit
 1298|  5.08k|    res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
 1299|  5.08k|    dst[r] = _mm256_castsi256_si128(res1);
 1300|  5.08k|    x += dx;
 1301|  5.08k|  }
 1302|    480|}
intrapred_avx2.c:highbd_dr_prediction_z1_16xN_avx2:
 1543|  20.3k|                                              int bd) {
 1544|  20.3k|  __m256i dstvec[64];
 1545|  20.3k|  if (bd < 12) {
  ------------------
  |  Branch (1545:7): [True: 20.2k, False: 116]
  ------------------
 1546|  20.2k|    highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
 1547|  20.2k|                                               dx);
 1548|  20.2k|  } else {
 1549|    116|    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above,
 1550|    116|                                                     upsample_above, dx);
 1551|    116|  }
 1552|   264k|  for (int i = 0; i < N; i++) {
  ------------------
  |  Branch (1552:19): [True: 244k, False: 20.3k]
  ------------------
 1553|   244k|    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
 1554|   244k|  }
 1555|  20.3k|}
intrapred_avx2.c:highbd_dr_prediction_z1_16xN_internal_avx2:
 1484|  39.8k|    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
 1485|       |  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
 1486|  39.8k|  (void)upsample_above;
 1487|  39.8k|  const int frac_bits = 6;
 1488|  39.8k|  const int max_base_x = ((16 + N) - 1);
 1489|       |
 1490|       |  // pre-filter above pixels
 1491|       |  // store in temp buffers:
 1492|       |  //   above[x] * 32 + 16
 1493|       |  //   above[x+1] - above[x]
 1494|       |  // final pixels will be calculated as:
 1495|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1496|  39.8k|  __m256i a0, a1, a32, a16, c3f;
 1497|  39.8k|  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 1498|       |
 1499|  39.8k|  a16 = _mm256_set1_epi16(16);
 1500|  39.8k|  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
 1501|  39.8k|  max_base_x256 = _mm256_set1_epi16(max_base_x);
 1502|  39.8k|  c3f = _mm256_set1_epi16(0x3f);
 1503|       |
 1504|  39.8k|  int x = dx;
 1505|   558k|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (1505:19): [True: 519k, False: 39.8k]
  ------------------
 1506|   519k|    __m256i b, res;
 1507|       |
 1508|   519k|    int base = x >> frac_bits;
 1509|   519k|    if (base >= max_base_x) {
  ------------------
  |  Branch (1509:9): [True: 37, False: 519k]
  ------------------
 1510|    214|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1510:23): [True: 177, False: 37]
  ------------------
 1511|    177|        dstvec[i] = a_mbase_x;  // save 16 values
 1512|    177|      }
 1513|     37|      return;
 1514|     37|    }
 1515|   519k|    __m256i shift =
 1516|   519k|        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 1517|       |
 1518|   519k|    a0 = _mm256_loadu_si256((__m256i *)(above + base));
 1519|   519k|    a1 = _mm256_loadu_si256((__m256i *)(above + base + 1));
 1520|       |
 1521|   519k|    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
 1522|   519k|    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
 1523|   519k|    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 1524|   519k|    b = _mm256_mullo_epi16(diff, shift);
 1525|       |
 1526|   519k|    res = _mm256_add_epi16(a32, b);
 1527|   519k|    res = _mm256_srli_epi16(res, 5);  // 16 16bit values
 1528|       |
 1529|   519k|    base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
 1530|   519k|                                    base + 4, base + 5, base + 6, base + 7,
 1531|   519k|                                    base + 8, base + 9, base + 10, base + 11,
 1532|   519k|                                    base + 12, base + 13, base + 14, base + 15);
 1533|   519k|    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
 1534|   519k|    dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256);
 1535|   519k|    x += dx;
 1536|   519k|  }
 1537|  39.8k|}
intrapred_avx2.c:highbd_dr_prediction_32bit_z1_16xN_internal_avx2:
 1405|    184|    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
 1406|       |  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
 1407|    184|  (void)upsample_above;
 1408|    184|  const int frac_bits = 6;
 1409|    184|  const int max_base_x = ((16 + N) - 1);
 1410|       |
 1411|       |  // pre-filter above pixels
 1412|       |  // store in temp buffers:
 1413|       |  //   above[x] * 32 + 16
 1414|       |  //   above[x+1] - above[x]
 1415|       |  // final pixels will be calculated as:
 1416|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1417|    184|  __m256i a0, a0_1, a1, a1_1, a32, a16;
 1418|    184|  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 1419|       |
 1420|    184|  a16 = _mm256_set1_epi32(16);
 1421|    184|  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
 1422|    184|  max_base_x256 = _mm256_set1_epi16(max_base_x);
 1423|       |
 1424|    184|  int x = dx;
 1425|  3.12k|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (1425:19): [True: 2.93k, False: 184]
  ------------------
 1426|  2.93k|    __m256i b, res[2], res1;
 1427|       |
 1428|  2.93k|    int base = x >> frac_bits;
 1429|  2.93k|    if (base >= max_base_x) {
  ------------------
  |  Branch (1429:9): [True: 0, False: 2.93k]
  ------------------
 1430|      0|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1430:23): [True: 0, False: 0]
  ------------------
 1431|      0|        dstvec[i] = a_mbase_x;  // save 16 values
 1432|      0|      }
 1433|      0|      return;
 1434|      0|    }
 1435|  2.93k|    __m256i shift = _mm256_srli_epi32(
 1436|  2.93k|        _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
 1437|       |
 1438|  2.93k|    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
 1439|  2.93k|    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
 1440|       |
 1441|  2.93k|    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
 1442|  2.93k|    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
 1443|  2.93k|    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
 1444|  2.93k|    b = _mm256_mullo_epi32(diff, shift);
 1445|       |
 1446|  2.93k|    res[0] = _mm256_add_epi32(a32, b);
 1447|  2.93k|    res[0] = _mm256_srli_epi32(res[0], 5);
 1448|  2.93k|    res[0] = _mm256_packus_epi32(
 1449|  2.93k|        res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
 1450|       |
 1451|  2.93k|    int mdif = max_base_x - base;
 1452|  2.93k|    if (mdif > 8) {
  ------------------
  |  Branch (1452:9): [True: 2.91k, False: 24]
  ------------------
 1453|  2.91k|      a0_1 =
 1454|  2.91k|          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
 1455|  2.91k|      a1_1 =
 1456|  2.91k|          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
 1457|       |
 1458|  2.91k|      diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
 1459|  2.91k|      a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
 1460|  2.91k|      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 1461|  2.91k|      b = _mm256_mullo_epi32(diff, shift);
 1462|       |
 1463|  2.91k|      res[1] = _mm256_add_epi32(a32, b);
 1464|  2.91k|      res[1] = _mm256_srli_epi32(res[1], 5);
 1465|  2.91k|      res[1] = _mm256_packus_epi32(
 1466|  2.91k|          res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
 1467|  2.91k|    } else {
 1468|     24|      res[1] = a_mbase_x;
 1469|     24|    }
 1470|  2.93k|    res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
 1471|  2.93k|                                   1);  // 16 16bit values
 1472|       |
 1473|  2.93k|    base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
 1474|  2.93k|                                    base + 4, base + 5, base + 6, base + 7,
 1475|  2.93k|                                    base + 8, base + 9, base + 10, base + 11,
 1476|  2.93k|                                    base + 12, base + 13, base + 14, base + 15);
 1477|  2.93k|    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
 1478|  2.93k|    dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
 1479|  2.93k|    x += dx;
 1480|  2.93k|  }
 1481|    184|}
intrapred_avx2.c:highbd_dr_prediction_z1_32xN_avx2:
 1730|  6.17k|                                              int bd) {
 1731|  6.17k|  __m256i dstvec[128];
 1732|  6.17k|  if (bd < 12) {
  ------------------
  |  Branch (1732:7): [True: 6.12k, False: 48]
  ------------------
 1733|  6.12k|    highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
 1734|  6.12k|                                               dx);
 1735|  6.12k|  } else {
 1736|     48|    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above,
 1737|     48|                                                     upsample_above, dx);
 1738|     48|  }
 1739|   133k|  for (int i = 0; i < N; i++) {
  ------------------
  |  Branch (1739:19): [True: 127k, False: 6.17k]
  ------------------
 1740|   127k|    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
 1741|   127k|    _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
 1742|   127k|  }
 1743|  6.17k|}
intrapred_avx2.c:highbd_dr_prediction_z1_32xN_internal_avx2:
 1655|  13.3k|    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
 1656|       |  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
 1657|  13.3k|  (void)upsample_above;
 1658|  13.3k|  const int frac_bits = 6;
 1659|  13.3k|  const int max_base_x = ((32 + N) - 1);
 1660|       |
 1661|       |  // pre-filter above pixels
 1662|       |  // store in temp buffers:
 1663|       |  //   above[x] * 32 + 16
 1664|       |  //   above[x+1] - above[x]
 1665|       |  // final pixels will be calculated as:
 1666|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1667|  13.3k|  __m256i a0, a1, a32, a16, c3f;
 1668|  13.3k|  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 1669|       |
 1670|  13.3k|  a16 = _mm256_set1_epi16(16);
 1671|  13.3k|  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
 1672|  13.3k|  max_base_x256 = _mm256_set1_epi16(max_base_x);
 1673|  13.3k|  c3f = _mm256_set1_epi16(0x3f);
 1674|       |
 1675|  13.3k|  int x = dx;
 1676|   313k|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (1676:19): [True: 300k, False: 13.3k]
  ------------------
 1677|   300k|    __m256i b, res;
 1678|       |
 1679|   300k|    int base = x >> frac_bits;
 1680|   300k|    if (base >= max_base_x) {
  ------------------
  |  Branch (1680:9): [True: 0, False: 300k]
  ------------------
 1681|      0|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1681:23): [True: 0, False: 0]
  ------------------
 1682|      0|        dstvec[i] = a_mbase_x;  // save 32 values
 1683|      0|        dstvec[i + N] = a_mbase_x;
 1684|      0|      }
 1685|      0|      return;
 1686|      0|    }
 1687|       |
 1688|   300k|    __m256i shift =
 1689|   300k|        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 1690|       |
 1691|   901k|    for (int j = 0; j < 32; j += 16) {
  ------------------
  |  Branch (1691:21): [True: 601k, False: 300k]
  ------------------
 1692|   601k|      int mdif = max_base_x - (base + j);
 1693|   601k|      if (mdif <= 0) {
  ------------------
  |  Branch (1693:11): [True: 102, False: 601k]
  ------------------
 1694|    102|        res = a_mbase_x;
 1695|   601k|      } else {
 1696|   601k|        a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
 1697|   601k|        a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
 1698|       |
 1699|   601k|        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
 1700|   601k|        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
 1701|   601k|        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 1702|   601k|        b = _mm256_mullo_epi16(diff, shift);
 1703|       |
 1704|   601k|        res = _mm256_add_epi16(a32, b);
 1705|   601k|        res = _mm256_srli_epi16(res, 5);
 1706|       |
 1707|   601k|        base_inc256 = _mm256_setr_epi16(
 1708|   601k|            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
 1709|   601k|            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
 1710|   601k|            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
 1711|   601k|            base + j + 13, base + j + 14, base + j + 15);
 1712|       |
 1713|   601k|        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
 1714|   601k|        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
 1715|   601k|      }
 1716|   601k|      if (!j) {
  ------------------
  |  Branch (1716:11): [True: 300k, False: 300k]
  ------------------
 1717|   300k|        dstvec[r] = res;
 1718|   300k|      } else {
 1719|   300k|        dstvec[r + N] = res;
 1720|   300k|      }
 1721|   601k|    }
 1722|   300k|    x += dx;
 1723|   300k|  }
 1724|  13.3k|}
intrapred_avx2.c:highbd_dr_prediction_32bit_z1_32xN_internal_avx2:
 1558|     68|    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
 1559|       |  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
 1560|     68|  (void)upsample_above;
 1561|     68|  const int frac_bits = 6;
 1562|     68|  const int max_base_x = ((32 + N) - 1);
 1563|       |
 1564|       |  // pre-filter above pixels
 1565|       |  // store in temp buffers:
 1566|       |  //   above[x] * 32 + 16
 1567|       |  //   above[x+1] - above[x]
 1568|       |  // final pixels will be calculated as:
 1569|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1570|     68|  __m256i a0, a0_1, a1, a1_1, a32, a16, c3f;
 1571|     68|  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 1572|       |
 1573|     68|  a16 = _mm256_set1_epi32(16);
 1574|     68|  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
 1575|     68|  max_base_x256 = _mm256_set1_epi16(max_base_x);
 1576|     68|  c3f = _mm256_set1_epi16(0x3f);
 1577|       |
 1578|     68|  int x = dx;
 1579|  1.36k|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (1579:19): [True: 1.29k, False: 68]
  ------------------
 1580|  1.29k|    __m256i b, res[2], res1;
 1581|       |
 1582|  1.29k|    int base = x >> frac_bits;
 1583|  1.29k|    if (base >= max_base_x) {
  ------------------
  |  Branch (1583:9): [True: 0, False: 1.29k]
  ------------------
 1584|      0|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1584:23): [True: 0, False: 0]
  ------------------
 1585|      0|        dstvec[i] = a_mbase_x;  // save 32 values
 1586|      0|        dstvec[i + N] = a_mbase_x;
 1587|      0|      }
 1588|      0|      return;
 1589|      0|    }
 1590|       |
 1591|  1.29k|    __m256i shift =
 1592|  1.29k|        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
 1593|       |
 1594|  3.88k|    for (int j = 0; j < 32; j += 16) {
  ------------------
  |  Branch (1594:21): [True: 2.59k, False: 1.29k]
  ------------------
 1595|  2.59k|      int mdif = max_base_x - (base + j);
 1596|  2.59k|      if (mdif <= 0) {
  ------------------
  |  Branch (1596:11): [True: 0, False: 2.59k]
  ------------------
 1597|      0|        res1 = a_mbase_x;
 1598|  2.59k|      } else {
 1599|  2.59k|        a0 = _mm256_cvtepu16_epi32(
 1600|  2.59k|            _mm_loadu_si128((__m128i *)(above + base + j)));
 1601|  2.59k|        a1 = _mm256_cvtepu16_epi32(
 1602|  2.59k|            _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
 1603|       |
 1604|  2.59k|        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
 1605|  2.59k|        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
 1606|  2.59k|        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
 1607|  2.59k|        b = _mm256_mullo_epi32(diff, shift);
 1608|       |
 1609|  2.59k|        res[0] = _mm256_add_epi32(a32, b);
 1610|  2.59k|        res[0] = _mm256_srli_epi32(res[0], 5);
 1611|  2.59k|        res[0] = _mm256_packus_epi32(
 1612|  2.59k|            res[0],
 1613|  2.59k|            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
 1614|  2.59k|        if (mdif > 8) {
  ------------------
  |  Branch (1614:13): [True: 2.56k, False: 32]
  ------------------
 1615|  2.56k|          a0_1 = _mm256_cvtepu16_epi32(
 1616|  2.56k|              _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
 1617|  2.56k|          a1_1 = _mm256_cvtepu16_epi32(
 1618|  2.56k|              _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
 1619|       |
 1620|  2.56k|          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
 1621|  2.56k|          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
 1622|  2.56k|          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 1623|  2.56k|          b = _mm256_mullo_epi32(diff, shift);
 1624|       |
 1625|  2.56k|          res[1] = _mm256_add_epi32(a32, b);
 1626|  2.56k|          res[1] = _mm256_srli_epi32(res[1], 5);
 1627|  2.56k|          res[1] = _mm256_packus_epi32(
 1628|  2.56k|              res[1],
 1629|  2.56k|              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
 1630|  2.56k|        } else {
 1631|     32|          res[1] = a_mbase_x;
 1632|     32|        }
 1633|  2.59k|        res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
 1634|  2.59k|                                       1);  // 16 16bit values
 1635|  2.59k|        base_inc256 = _mm256_setr_epi16(
 1636|  2.59k|            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
 1637|  2.59k|            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
 1638|  2.59k|            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
 1639|  2.59k|            base + j + 13, base + j + 14, base + j + 15);
 1640|       |
 1641|  2.59k|        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
 1642|  2.59k|        res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
 1643|  2.59k|      }
 1644|  2.59k|      if (!j) {
  ------------------
  |  Branch (1644:11): [True: 1.29k, False: 1.29k]
  ------------------
 1645|  1.29k|        dstvec[r] = res1;
 1646|  1.29k|      } else {
 1647|  1.29k|        dstvec[r + N] = res1;
 1648|  1.29k|      }
 1649|  2.59k|    }
 1650|  1.29k|    x += dx;
 1651|  1.29k|  }
 1652|     68|}
intrapred_avx2.c:highbd_dr_prediction_z1_64xN_avx2:
 1847|    926|                                              int upsample_above, int dx) {
 1848|       |  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
 1849|    926|  (void)upsample_above;
 1850|    926|  const int frac_bits = 6;
 1851|    926|  const int max_base_x = ((64 + N) - 1);
 1852|       |
 1853|       |  // pre-filter above pixels
 1854|       |  // store in temp buffers:
 1855|       |  //   above[x] * 32 + 16
 1856|       |  //   above[x+1] - above[x]
 1857|       |  // final pixels will be calculated as:
 1858|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1859|    926|  __m256i a0, a1, a32, a16, c3f;
 1860|    926|  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 1861|       |
 1862|    926|  a16 = _mm256_set1_epi16(16);
 1863|    926|  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
 1864|    926|  max_base_x256 = _mm256_set1_epi16(max_base_x);
 1865|    926|  c3f = _mm256_set1_epi16(0x3f);
 1866|       |
 1867|    926|  int x = dx;
 1868|  37.5k|  for (int r = 0; r < N; r++, dst += stride) {
  ------------------
  |  Branch (1868:19): [True: 36.5k, False: 926]
  ------------------
 1869|  36.5k|    __m256i b, res;
 1870|       |
 1871|  36.5k|    int base = x >> frac_bits;
 1872|  36.5k|    if (base >= max_base_x) {
  ------------------
  |  Branch (1872:9): [True: 0, False: 36.5k]
  ------------------
 1873|      0|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1873:23): [True: 0, False: 0]
  ------------------
 1874|      0|        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
 1875|      0|        _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
 1876|      0|        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
 1877|      0|        _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
 1878|      0|        dst += stride;
 1879|      0|      }
 1880|      0|      return;
 1881|      0|    }
 1882|       |
 1883|  36.5k|    __m256i shift =
 1884|  36.5k|        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 1885|       |
 1886|   182k|    for (int j = 0; j < 64; j += 16) {
  ------------------
  |  Branch (1886:21): [True: 146k, False: 36.5k]
  ------------------
 1887|   146k|      int mdif = max_base_x - (base + j);
 1888|   146k|      if (mdif <= 0) {
  ------------------
  |  Branch (1888:11): [True: 177, False: 146k]
  ------------------
 1889|    177|        _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
 1890|   146k|      } else {
 1891|   146k|        a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
 1892|   146k|        a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
 1893|       |
 1894|   146k|        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
 1895|   146k|        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
 1896|   146k|        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 1897|   146k|        b = _mm256_mullo_epi16(diff, shift);
 1898|       |
 1899|   146k|        res = _mm256_add_epi16(a32, b);
 1900|   146k|        res = _mm256_srli_epi16(res, 5);
 1901|       |
 1902|   146k|        base_inc256 = _mm256_setr_epi16(
 1903|   146k|            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
 1904|   146k|            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
 1905|   146k|            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
 1906|   146k|            base + j + 13, base + j + 14, base + j + 15);
 1907|       |
 1908|   146k|        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
 1909|   146k|        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
 1910|   146k|        _mm256_storeu_si256((__m256i *)(dst + j), res);  // 16 16bit values
 1911|   146k|      }
 1912|   146k|    }
 1913|  36.5k|    x += dx;
 1914|  36.5k|  }
 1915|    926|}
intrapred_avx2.c:highbd_dr_prediction_32bit_z1_64xN_avx2:
 1749|      4|                                                    int dx) {
 1750|       |  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
 1751|      4|  (void)upsample_above;
 1752|      4|  const int frac_bits = 6;
 1753|      4|  const int max_base_x = ((64 + N) - 1);
 1754|       |
 1755|       |  // pre-filter above pixels
 1756|       |  // store in temp buffers:
 1757|       |  //   above[x] * 32 + 16
 1758|       |  //   above[x+1] - above[x]
 1759|       |  // final pixels will be calculated as:
 1760|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1761|      4|  __m256i a0, a0_1, a1, a1_1, a32, a16;
 1762|      4|  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 1763|       |
 1764|      4|  a16 = _mm256_set1_epi32(16);
 1765|      4|  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
 1766|      4|  max_base_x256 = _mm256_set1_epi16(max_base_x);
 1767|       |
 1768|      4|  int x = dx;
 1769|    100|  for (int r = 0; r < N; r++, dst += stride) {
  ------------------
  |  Branch (1769:19): [True: 96, False: 4]
  ------------------
 1770|     96|    __m256i b, res[2], res1;
 1771|       |
 1772|     96|    int base = x >> frac_bits;
 1773|     96|    if (base >= max_base_x) {
  ------------------
  |  Branch (1773:9): [True: 0, False: 96]
  ------------------
 1774|      0|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1774:23): [True: 0, False: 0]
  ------------------
 1775|      0|        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
 1776|      0|        _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
 1777|      0|        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
 1778|      0|        _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
 1779|      0|        dst += stride;
 1780|      0|      }
 1781|      0|      return;
 1782|      0|    }
 1783|       |
 1784|     96|    __m256i shift = _mm256_srli_epi32(
 1785|     96|        _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
 1786|       |
 1787|     96|    __m128i a0_128, a0_1_128, a1_128, a1_1_128;
 1788|    480|    for (int j = 0; j < 64; j += 16) {
  ------------------
  |  Branch (1788:21): [True: 384, False: 96]
  ------------------
 1789|    384|      int mdif = max_base_x - (base + j);
 1790|    384|      if (mdif <= 0) {
  ------------------
  |  Branch (1790:11): [True: 0, False: 384]
  ------------------
 1791|      0|        _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
 1792|    384|      } else {
 1793|    384|        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
 1794|    384|        a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
 1795|    384|        a0 = _mm256_cvtepu16_epi32(a0_128);
 1796|    384|        a1 = _mm256_cvtepu16_epi32(a1_128);
 1797|       |
 1798|    384|        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
 1799|    384|        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
 1800|    384|        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
 1801|    384|        b = _mm256_mullo_epi32(diff, shift);
 1802|       |
 1803|    384|        res[0] = _mm256_add_epi32(a32, b);
 1804|    384|        res[0] = _mm256_srli_epi32(res[0], 5);
 1805|    384|        res[0] = _mm256_packus_epi32(
 1806|    384|            res[0],
 1807|    384|            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
 1808|    384|        if (mdif > 8) {
  ------------------
  |  Branch (1808:13): [True: 384, False: 0]
  ------------------
 1809|    384|          a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
 1810|    384|          a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
 1811|    384|          a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
 1812|    384|          a1_1 = _mm256_cvtepu16_epi32(a1_1_128);
 1813|       |
 1814|    384|          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
 1815|    384|          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
 1816|    384|          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 1817|    384|          b = _mm256_mullo_epi32(diff, shift);
 1818|       |
 1819|    384|          res[1] = _mm256_add_epi32(a32, b);
 1820|    384|          res[1] = _mm256_srli_epi32(res[1], 5);
 1821|    384|          res[1] = _mm256_packus_epi32(
 1822|    384|              res[1],
 1823|    384|              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
 1824|    384|        } else {
 1825|      0|          res[1] = a_mbase_x;
 1826|      0|        }
 1827|    384|        res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
 1828|    384|                                       1);  // 16 16bit values
 1829|    384|        base_inc256 = _mm256_setr_epi16(
 1830|    384|            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
 1831|    384|            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
 1832|    384|            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
 1833|    384|            base + j + 13, base + j + 14, base + j + 15);
 1834|       |
 1835|    384|        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
 1836|    384|        res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
 1837|    384|        _mm256_storeu_si256((__m256i *)(dst + j), res1);
 1838|    384|      }
 1839|    384|    }
 1840|     96|    x += dx;
 1841|     96|  }
 1842|      4|}
intrapred_avx2.c:highbd_dr_prediction_z2_Nx4_avx2:
 2107|  63.0k|    int dy) {
 2108|  63.0k|  const int min_base_x = -(1 << upsample_above);
 2109|  63.0k|  const int min_base_y = -(1 << upsample_left);
 2110|  63.0k|  const int frac_bits_x = 6 - upsample_above;
 2111|  63.0k|  const int frac_bits_y = 6 - upsample_left;
 2112|       |
 2113|  63.0k|  assert(dx > 0);
 2114|       |  // pre-filter above pixels
 2115|       |  // store in temp buffers:
 2116|       |  //   above[x] * 32 + 16
 2117|       |  //   above[x+1] - above[x]
 2118|       |  // final pixels will be calculated as:
 2119|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 2120|  63.0k|  __m256i a0_x, a1_x, a32, a16;
 2121|  63.0k|  __m256i diff;
 2122|  63.0k|  __m128i c3f, min_base_y128;
 2123|       |
 2124|  63.0k|  a16 = _mm256_set1_epi16(16);
 2125|  63.0k|  c3f = _mm_set1_epi16(0x3f);
 2126|  63.0k|  min_base_y128 = _mm_set1_epi16(min_base_y);
 2127|       |
 2128|   393k|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (2128:19): [True: 330k, False: 63.0k]
  ------------------
 2129|   330k|    __m256i b, res, shift;
 2130|   330k|    __m128i resx, resy, resxy;
 2131|   330k|    __m128i a0_x128, a1_x128;
 2132|   330k|    int y = r + 1;
 2133|   330k|    int base_x = (-y * dx) >> frac_bits_x;
 2134|   330k|    int base_shift = 0;
 2135|   330k|    if (base_x < (min_base_x - 1)) {
  ------------------
  |  Branch (2135:9): [True: 222k, False: 107k]
  ------------------
 2136|   222k|      base_shift = (min_base_x - base_x - 1) >> upsample_above;
 2137|   222k|    }
 2138|   330k|    int base_min_diff =
 2139|   330k|        (min_base_x - base_x + upsample_above) >> upsample_above;
 2140|   330k|    if (base_min_diff > 4) {
  ------------------
  |  Branch (2140:9): [True: 135k, False: 194k]
  ------------------
 2141|   135k|      base_min_diff = 4;
 2142|   194k|    } else {
 2143|   194k|      if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (2143:11): [True: 0, False: 194k]
  ------------------
 2144|   194k|    }
 2145|       |
 2146|   330k|    if (base_shift > 3) {
  ------------------
  |  Branch (2146:9): [True: 135k, False: 194k]
  ------------------
 2147|   135k|      a0_x = _mm256_setzero_si256();
 2148|   135k|      a1_x = _mm256_setzero_si256();
 2149|   135k|      shift = _mm256_setzero_si256();
 2150|   194k|    } else {
 2151|   194k|      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
 2152|   194k|      if (upsample_above) {
  ------------------
  |  Branch (2152:11): [True: 100k, False: 93.7k]
  ------------------
 2153|   100k|        a0_x128 = _mm_shuffle_epi8(a0_x128,
 2154|   100k|                                   *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
 2155|   100k|        a1_x128 = _mm_srli_si128(a0_x128, 8);
 2156|       |
 2157|   100k|        shift = _mm256_castsi128_si256(_mm_srli_epi16(
 2158|   100k|            _mm_and_si128(
 2159|   100k|                _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
 2160|   100k|                                              (2 << 6) - y * dx,
 2161|   100k|                                              (3 << 6) - y * dx, 0, 0, 0, 0),
 2162|   100k|                               upsample_above),
 2163|   100k|                c3f),
 2164|   100k|            1));
 2165|   100k|      } else {
 2166|  93.7k|        a0_x128 =
 2167|  93.7k|            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2168|  93.7k|        a1_x128 = _mm_srli_si128(a0_x128, 2);
 2169|       |
 2170|  93.7k|        shift = _mm256_castsi128_si256(_mm_srli_epi16(
 2171|  93.7k|            _mm_and_si128(
 2172|  93.7k|                _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
 2173|  93.7k|                               (3 << 6) - y * dx, 0, 0, 0, 0),
 2174|  93.7k|                c3f),
 2175|  93.7k|            1));
 2176|  93.7k|      }
 2177|   194k|      a0_x = _mm256_castsi128_si256(a0_x128);
 2178|   194k|      a1_x = _mm256_castsi128_si256(a1_x128);
 2179|   194k|    }
 2180|       |    // y calc
 2181|   330k|    __m128i a0_y, a1_y, shifty;
 2182|   330k|    if (base_x < min_base_x) {
  ------------------
  |  Branch (2182:9): [True: 253k, False: 76.0k]
  ------------------
 2183|   253k|      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
 2184|   253k|      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
  ------------------
  |  |   19|   253k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 2185|   253k|      r6 = _mm_set1_epi16(r << 6);
 2186|   253k|      dy128 = _mm_set1_epi16(dy);
 2187|   253k|      c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
 2188|   253k|      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
 2189|   253k|      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
 2190|   253k|      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
 2191|   253k|      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
 2192|   253k|      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
 2193|       |
 2194|   253k|      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
 2195|   253k|                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
 2196|   253k|      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
 2197|   253k|                            left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
 2198|   253k|                            0, 0);
 2199|       |
 2200|   253k|      if (upsample_left) {
  ------------------
  |  Branch (2200:11): [True: 101k, False: 152k]
  ------------------
 2201|   101k|        shifty = _mm_srli_epi16(
 2202|   101k|            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
 2203|   152k|      } else {
 2204|   152k|        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
 2205|   152k|      }
 2206|   253k|      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
 2207|   253k|      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
 2208|   253k|      shift = _mm256_inserti128_si256(shift, shifty, 1);
 2209|   253k|    }
 2210|       |
 2211|   330k|    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
 2212|   330k|    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
 2213|   330k|    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
 2214|       |
 2215|   330k|    b = _mm256_mullo_epi16(diff, shift);
 2216|   330k|    res = _mm256_add_epi16(a32, b);
 2217|   330k|    res = _mm256_srli_epi16(res, 5);
 2218|       |
 2219|   330k|    resx = _mm256_castsi256_si128(res);
 2220|       |    resy = _mm256_extracti128_si256(res, 1);
 2221|   330k|    resxy =
 2222|   330k|        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
 2223|   330k|    _mm_storel_epi64((__m128i *)(dst), resxy);
 2224|   330k|    dst += stride;
 2225|   330k|  }
 2226|  63.0k|}
intrapred_avx2.c:highbd_dr_prediction_32bit_z2_Nx4_avx2:
 1981|    150|    int dy) {
 1982|    150|  const int min_base_x = -(1 << upsample_above);
 1983|    150|  const int min_base_y = -(1 << upsample_left);
 1984|    150|  const int frac_bits_x = 6 - upsample_above;
 1985|    150|  const int frac_bits_y = 6 - upsample_left;
 1986|       |
 1987|    150|  assert(dx > 0);
 1988|       |  // pre-filter above pixels
 1989|       |  // store in temp buffers:
 1990|       |  //   above[x] * 32 + 16
 1991|       |  //   above[x+1] - above[x]
 1992|       |  // final pixels will be calculated as:
 1993|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1994|    150|  __m256i a0_x, a1_x, a32, a16;
 1995|    150|  __m256i diff;
 1996|    150|  __m128i c3f, min_base_y128;
 1997|       |
 1998|    150|  a16 = _mm256_set1_epi32(16);
 1999|    150|  c3f = _mm_set1_epi32(0x3f);
 2000|    150|  min_base_y128 = _mm_set1_epi32(min_base_y);
 2001|       |
 2002|  1.05k|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (2002:19): [True: 904, False: 150]
  ------------------
 2003|    904|    __m256i b, res, shift;
 2004|    904|    __m128i resx, resy, resxy;
 2005|    904|    __m128i a0_x128, a1_x128;
 2006|    904|    int y = r + 1;
 2007|    904|    int base_x = (-y * dx) >> frac_bits_x;
 2008|    904|    int base_shift = 0;
 2009|    904|    if (base_x < (min_base_x - 1)) {
  ------------------
  |  Branch (2009:9): [True: 638, False: 266]
  ------------------
 2010|    638|      base_shift = (min_base_x - base_x - 1) >> upsample_above;
 2011|    638|    }
 2012|    904|    int base_min_diff =
 2013|    904|        (min_base_x - base_x + upsample_above) >> upsample_above;
 2014|    904|    if (base_min_diff > 4) {
  ------------------
  |  Branch (2014:9): [True: 384, False: 520]
  ------------------
 2015|    384|      base_min_diff = 4;
 2016|    520|    } else {
 2017|    520|      if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (2017:11): [True: 0, False: 520]
  ------------------
 2018|    520|    }
 2019|       |
 2020|    904|    if (base_shift > 3) {
  ------------------
  |  Branch (2020:9): [True: 384, False: 520]
  ------------------
 2021|    384|      a0_x = _mm256_setzero_si256();
 2022|    384|      a1_x = _mm256_setzero_si256();
 2023|    384|      shift = _mm256_setzero_si256();
 2024|    520|    } else {
 2025|    520|      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
 2026|    520|      if (upsample_above) {
  ------------------
  |  Branch (2026:11): [True: 272, False: 248]
  ------------------
 2027|    272|        a0_x128 = _mm_shuffle_epi8(a0_x128,
 2028|    272|                                   *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
 2029|    272|        a1_x128 = _mm_srli_si128(a0_x128, 8);
 2030|       |
 2031|    272|        shift = _mm256_castsi128_si256(_mm_srli_epi32(
 2032|    272|            _mm_and_si128(
 2033|    272|                _mm_slli_epi32(
 2034|    272|                    _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
 2035|    272|                                   (2 << 6) - y * dx, (3 << 6) - y * dx),
 2036|    272|                    upsample_above),
 2037|    272|                c3f),
 2038|    272|            1));
 2039|    272|      } else {
 2040|    248|        a0_x128 =
 2041|    248|            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2042|    248|        a1_x128 = _mm_srli_si128(a0_x128, 2);
 2043|       |
 2044|    248|        shift = _mm256_castsi128_si256(_mm_srli_epi32(
 2045|    248|            _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
 2046|    248|                                         (2 << 6) - y * dx, (3 << 6) - y * dx),
 2047|    248|                          c3f),
 2048|    248|            1));
 2049|    248|      }
 2050|    520|      a0_x = _mm256_cvtepu16_epi32(a0_x128);
 2051|    520|      a1_x = _mm256_cvtepu16_epi32(a1_x128);
 2052|    520|    }
 2053|       |    // y calc
 2054|    904|    __m128i a0_y, a1_y, shifty;
 2055|    904|    if (base_x < min_base_x) {
  ------------------
  |  Branch (2055:9): [True: 760, False: 144]
  ------------------
 2056|    760|      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
 2057|    760|      DECLARE_ALIGNED(32, int, base_y_c[4]);
  ------------------
  |  |   19|    760|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 2058|    760|      r6 = _mm_set1_epi32(r << 6);
 2059|    760|      dy128 = _mm_set1_epi32(dy);
 2060|    760|      c1234 = _mm_setr_epi32(1, 2, 3, 4);
 2061|    760|      y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
 2062|    760|      base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
 2063|    760|      mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
 2064|    760|      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
 2065|    760|      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
 2066|       |
 2067|    760|      a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
 2068|    760|                            left[base_y_c[2]], left[base_y_c[3]]);
 2069|    760|      a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
 2070|    760|                            left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
 2071|       |
 2072|    760|      if (upsample_left) {
  ------------------
  |  Branch (2072:11): [True: 368, False: 392]
  ------------------
 2073|    368|        shifty = _mm_srli_epi32(
 2074|    368|            _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
 2075|    392|      } else {
 2076|    392|        shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
 2077|    392|      }
 2078|    760|      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
 2079|    760|      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
 2080|    760|      shift = _mm256_inserti128_si256(shift, shifty, 1);
 2081|    760|    }
 2082|       |
 2083|    904|    diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
 2084|    904|    a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
 2085|    904|    a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 2086|       |
 2087|    904|    b = _mm256_mullo_epi32(diff, shift);
 2088|    904|    res = _mm256_add_epi32(a32, b);
 2089|    904|    res = _mm256_srli_epi32(res, 5);
 2090|       |
 2091|    904|    resx = _mm256_castsi256_si128(res);
 2092|    904|    resx = _mm_packus_epi32(resx, resx);
 2093|       |
 2094|    904|    resy = _mm256_extracti128_si256(res, 1);
 2095|    904|    resy = _mm_packus_epi32(resy, resy);
 2096|       |
 2097|    904|    resxy =
 2098|    904|        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
 2099|    904|    _mm_storel_epi64((__m128i *)(dst), resxy);
 2100|    904|    dst += stride;
 2101|    904|  }
 2102|    150|}
intrapred_avx2.c:highbd_dr_prediction_z2_Nx8_avx2:
 2381|  65.8k|    int dy) {
 2382|  65.8k|  const int min_base_x = -(1 << upsample_above);
 2383|  65.8k|  const int min_base_y = -(1 << upsample_left);
 2384|  65.8k|  const int frac_bits_x = 6 - upsample_above;
 2385|  65.8k|  const int frac_bits_y = 6 - upsample_left;
 2386|       |
 2387|       |  // pre-filter above pixels
 2388|       |  // store in temp buffers:
 2389|       |  //   above[x] * 32 + 16
 2390|       |  //   above[x+1] - above[x]
 2391|       |  // final pixels will be calculated as:
 2392|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 2393|  65.8k|  __m128i c3f, min_base_y128;
 2394|  65.8k|  __m256i a0_x, a1_x, diff, a32, a16;
 2395|  65.8k|  __m128i a0_x128, a1_x128;
 2396|       |
 2397|  65.8k|  a16 = _mm256_set1_epi16(16);
 2398|  65.8k|  c3f = _mm_set1_epi16(0x3f);
 2399|  65.8k|  min_base_y128 = _mm_set1_epi16(min_base_y);
 2400|       |
 2401|   681k|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (2401:19): [True: 615k, False: 65.8k]
  ------------------
 2402|   615k|    __m256i b, res, shift;
 2403|   615k|    __m128i resx, resy, resxy;
 2404|   615k|    int y = r + 1;
 2405|   615k|    int base_x = (-y * dx) >> frac_bits_x;
 2406|   615k|    int base_shift = 0;
 2407|   615k|    if (base_x < (min_base_x - 1)) {
  ------------------
  |  Branch (2407:9): [True: 457k, False: 157k]
  ------------------
 2408|   457k|      base_shift = (min_base_x - base_x - 1) >> upsample_above;
 2409|   457k|    }
 2410|   615k|    int base_min_diff =
 2411|   615k|        (min_base_x - base_x + upsample_above) >> upsample_above;
 2412|   615k|    if (base_min_diff > 8) {
  ------------------
  |  Branch (2412:9): [True: 270k, False: 344k]
  ------------------
 2413|   270k|      base_min_diff = 8;
 2414|   344k|    } else {
 2415|   344k|      if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (2415:11): [True: 0, False: 344k]
  ------------------
 2416|   344k|    }
 2417|       |
 2418|   615k|    if (base_shift > 7) {
  ------------------
  |  Branch (2418:9): [True: 270k, False: 344k]
  ------------------
 2419|   270k|      a0_x = _mm256_setzero_si256();
 2420|   270k|      a1_x = _mm256_setzero_si256();
 2421|   270k|      shift = _mm256_setzero_si256();
 2422|   344k|    } else {
 2423|   344k|      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
 2424|   344k|      if (upsample_above) {
  ------------------
  |  Branch (2424:11): [True: 86.5k, False: 258k]
  ------------------
 2425|  86.5k|        __m128i mask, atmp0, atmp1, atmp2, atmp3;
 2426|  86.5k|        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
 2427|  86.5k|        atmp0 = _mm_shuffle_epi8(a0_x128,
 2428|  86.5k|                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
 2429|  86.5k|        atmp1 = _mm_shuffle_epi8(a1_x128,
 2430|  86.5k|                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
 2431|  86.5k|        atmp2 = _mm_shuffle_epi8(
 2432|  86.5k|            a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
 2433|  86.5k|        atmp3 = _mm_shuffle_epi8(
 2434|  86.5k|            a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
 2435|  86.5k|        mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
 2436|  86.5k|                              _mm_set1_epi8(15));
 2437|  86.5k|        a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
 2438|  86.5k|        mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
 2439|  86.5k|                              _mm_set1_epi8(15));
 2440|  86.5k|        a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
 2441|       |
 2442|  86.5k|        shift = _mm256_castsi128_si256(_mm_srli_epi16(
 2443|  86.5k|            _mm_and_si128(
 2444|  86.5k|                _mm_slli_epi16(
 2445|  86.5k|                    _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
 2446|  86.5k|                                   (2 << 6) - y * dx, (3 << 6) - y * dx,
 2447|  86.5k|                                   (4 << 6) - y * dx, (5 << 6) - y * dx,
 2448|  86.5k|                                   (6 << 6) - y * dx, (7 << 6) - y * dx),
 2449|  86.5k|                    upsample_above),
 2450|  86.5k|                c3f),
 2451|  86.5k|            1));
 2452|   258k|      } else {
 2453|   258k|        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
 2454|   258k|        a0_x128 =
 2455|   258k|            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2456|   258k|        a1_x128 =
 2457|   258k|            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2458|       |
 2459|   258k|        shift = _mm256_castsi128_si256(_mm_srli_epi16(
 2460|   258k|            _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
 2461|   258k|                                         (2 << 6) - y * dx, (3 << 6) - y * dx,
 2462|   258k|                                         (4 << 6) - y * dx, (5 << 6) - y * dx,
 2463|   258k|                                         (6 << 6) - y * dx, (7 << 6) - y * dx),
 2464|   258k|                          c3f),
 2465|   258k|            1));
 2466|   258k|      }
 2467|   344k|      a0_x = _mm256_castsi128_si256(a0_x128);
 2468|   344k|      a1_x = _mm256_castsi128_si256(a1_x128);
 2469|   344k|    }
 2470|       |
 2471|       |    // y calc
 2472|   615k|    __m128i a0_y, a1_y, shifty;
 2473|   615k|    if (base_x < min_base_x) {
  ------------------
  |  Branch (2473:9): [True: 509k, False: 105k]
  ------------------
 2474|   509k|      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
  ------------------
  |  |   19|   509k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 2475|   509k|      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
 2476|   509k|      r6 = _mm_set1_epi16(r << 6);
 2477|   509k|      dy128 = _mm_set1_epi16(dy);
 2478|   509k|      c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
 2479|   509k|      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
 2480|   509k|      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
 2481|   509k|      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
 2482|   509k|      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
 2483|   509k|      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
 2484|       |
 2485|   509k|      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
 2486|   509k|                            left[base_y_c[2]], left[base_y_c[3]],
 2487|   509k|                            left[base_y_c[4]], left[base_y_c[5]],
 2488|   509k|                            left[base_y_c[6]], left[base_y_c[7]]);
 2489|   509k|      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
 2490|   509k|                            left[base_y_c[2] + 1], left[base_y_c[3] + 1],
 2491|   509k|                            left[base_y_c[4] + 1], left[base_y_c[5] + 1],
 2492|   509k|                            left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
 2493|       |
 2494|   509k|      if (upsample_left) {
  ------------------
  |  Branch (2494:11): [True: 135k, False: 373k]
  ------------------
 2495|   135k|        shifty = _mm_srli_epi16(
 2496|   135k|            _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
 2497|   373k|      } else {
 2498|   373k|        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
 2499|   373k|      }
 2500|   509k|      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
 2501|   509k|      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
 2502|   509k|      shift = _mm256_inserti128_si256(shift, shifty, 1);
 2503|   509k|    }
 2504|       |
 2505|   615k|    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
 2506|   615k|    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
 2507|   615k|    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
 2508|       |
 2509|   615k|    b = _mm256_mullo_epi16(diff, shift);
 2510|   615k|    res = _mm256_add_epi16(a32, b);
 2511|   615k|    res = _mm256_srli_epi16(res, 5);
 2512|       |
 2513|   615k|    resx = _mm256_castsi256_si128(res);
 2514|   615k|    resy = _mm256_extracti128_si256(res, 1);
 2515|       |
 2516|   615k|    resxy =
 2517|   615k|        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
 2518|   615k|    _mm_storeu_si128((__m128i *)(dst), resxy);
 2519|   615k|    dst += stride;
 2520|   615k|  }
 2521|  65.8k|}
intrapred_avx2.c:highbd_dr_prediction_32bit_z2_Nx8_avx2:
 2231|    490|    int dy) {
 2232|    490|  const int min_base_x = -(1 << upsample_above);
 2233|    490|  const int min_base_y = -(1 << upsample_left);
 2234|    490|  const int frac_bits_x = 6 - upsample_above;
 2235|    490|  const int frac_bits_y = 6 - upsample_left;
 2236|       |
 2237|       |  // pre-filter above pixels
 2238|       |  // store in temp buffers:
 2239|       |  //   above[x] * 32 + 16
 2240|       |  //   above[x+1] - above[x]
 2241|       |  // final pixels will be calculated as:
 2242|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 2243|    490|  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
 2244|    490|  __m256i diff;
 2245|    490|  __m128i a0_x128, a1_x128;
 2246|       |
 2247|    490|  a16 = _mm256_set1_epi32(16);
 2248|    490|  c3f = _mm256_set1_epi32(0x3f);
 2249|    490|  min_base_y256 = _mm256_set1_epi32(min_base_y);
 2250|       |
 2251|  4.49k|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (2251:19): [True: 4.00k, False: 490]
  ------------------
 2252|  4.00k|    __m256i b, res, shift;
 2253|  4.00k|    __m128i resx, resy, resxy;
 2254|  4.00k|    int y = r + 1;
 2255|  4.00k|    int base_x = (-y * dx) >> frac_bits_x;
 2256|  4.00k|    int base_shift = 0;
 2257|  4.00k|    if (base_x < (min_base_x - 1)) {
  ------------------
  |  Branch (2257:9): [True: 2.91k, False: 1.08k]
  ------------------
 2258|  2.91k|      base_shift = (min_base_x - base_x - 1) >> upsample_above;
 2259|  2.91k|    }
 2260|  4.00k|    int base_min_diff =
 2261|  4.00k|        (min_base_x - base_x + upsample_above) >> upsample_above;
 2262|  4.00k|    if (base_min_diff > 8) {
  ------------------
  |  Branch (2262:9): [True: 958, False: 3.04k]
  ------------------
 2263|    958|      base_min_diff = 8;
 2264|  3.04k|    } else {
 2265|  3.04k|      if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (2265:11): [True: 0, False: 3.04k]
  ------------------
 2266|  3.04k|    }
 2267|       |
 2268|  4.00k|    if (base_shift > 7) {
  ------------------
  |  Branch (2268:9): [True: 958, False: 3.04k]
  ------------------
 2269|    958|      resx = _mm_setzero_si128();
 2270|  3.04k|    } else {
 2271|  3.04k|      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
 2272|  3.04k|      if (upsample_above) {
  ------------------
  |  Branch (2272:11): [True: 2.16k, False: 882]
  ------------------
 2273|  2.16k|        __m128i mask, atmp0, atmp1, atmp2, atmp3;
 2274|  2.16k|        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
 2275|  2.16k|        atmp0 = _mm_shuffle_epi8(a0_x128,
 2276|  2.16k|                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
 2277|  2.16k|        atmp1 = _mm_shuffle_epi8(a1_x128,
 2278|  2.16k|                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
 2279|  2.16k|        atmp2 = _mm_shuffle_epi8(
 2280|  2.16k|            a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
 2281|  2.16k|        atmp3 = _mm_shuffle_epi8(
 2282|  2.16k|            a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
 2283|  2.16k|        mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
 2284|  2.16k|                              _mm_set1_epi8(15));
 2285|  2.16k|        a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
 2286|  2.16k|        mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
 2287|  2.16k|                              _mm_set1_epi8(15));
 2288|  2.16k|        a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
 2289|  2.16k|        shift = _mm256_srli_epi32(
 2290|  2.16k|            _mm256_and_si256(
 2291|  2.16k|                _mm256_slli_epi32(
 2292|  2.16k|                    _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
 2293|  2.16k|                                      (2 << 6) - y * dx, (3 << 6) - y * dx,
 2294|  2.16k|                                      (4 << 6) - y * dx, (5 << 6) - y * dx,
 2295|  2.16k|                                      (6 << 6) - y * dx, (7 << 6) - y * dx),
 2296|  2.16k|                    upsample_above),
 2297|  2.16k|                c3f),
 2298|  2.16k|            1);
 2299|  2.16k|      } else {
 2300|    882|        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
 2301|    882|        a0_x128 =
 2302|    882|            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2303|    882|        a1_x128 =
 2304|    882|            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2305|       |
 2306|    882|        shift = _mm256_srli_epi32(
 2307|    882|            _mm256_and_si256(
 2308|    882|                _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
 2309|    882|                                  (3 << 6) - y * dx, (4 << 6) - y * dx,
 2310|    882|                                  (5 << 6) - y * dx, (6 << 6) - y * dx,
 2311|    882|                                  (7 << 6) - y * dx),
 2312|    882|                c3f),
 2313|    882|            1);
 2314|    882|      }
 2315|  3.04k|      a0_x = _mm256_cvtepu16_epi32(a0_x128);
 2316|  3.04k|      a1_x = _mm256_cvtepu16_epi32(a1_x128);
 2317|       |
 2318|  3.04k|      diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
 2319|  3.04k|      a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
 2320|  3.04k|      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 2321|       |
 2322|  3.04k|      b = _mm256_mullo_epi32(diff, shift);
 2323|  3.04k|      res = _mm256_add_epi32(a32, b);
 2324|  3.04k|      res = _mm256_srli_epi32(res, 5);
 2325|       |
 2326|  3.04k|      resx = _mm256_castsi256_si128(_mm256_packus_epi32(
 2327|  3.04k|          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
 2328|  3.04k|    }
 2329|       |    // y calc
 2330|  4.00k|    if (base_x < min_base_x) {
  ------------------
  |  Branch (2330:9): [True: 3.29k, False: 702]
  ------------------
 2331|  3.29k|      DECLARE_ALIGNED(32, int, base_y_c[8]);
  ------------------
  |  |   19|  3.29k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 2332|  3.29k|      __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
 2333|  3.29k|      r6 = _mm256_set1_epi32(r << 6);
 2334|  3.29k|      dy256 = _mm256_set1_epi32(dy);
 2335|  3.29k|      c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
 2336|  3.29k|      y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
 2337|  3.29k|      base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
 2338|  3.29k|      mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
 2339|  3.29k|      base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
 2340|  3.29k|      _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
 2341|       |
 2342|  3.29k|      a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
 2343|  3.29k|          left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
 2344|  3.29k|          left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
 2345|  3.29k|          left[base_y_c[6]], left[base_y_c[7]]));
 2346|  3.29k|      a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
 2347|  3.29k|          left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
 2348|  3.29k|          left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
 2349|  3.29k|          left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
 2350|       |
 2351|  3.29k|      if (upsample_left) {
  ------------------
  |  Branch (2351:11): [True: 792, False: 2.50k]
  ------------------
 2352|    792|        shift = _mm256_srli_epi32(
 2353|    792|            _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
 2354|    792|            1);
 2355|  2.50k|      } else {
 2356|  2.50k|        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
 2357|  2.50k|      }
 2358|  3.29k|      diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
 2359|  3.29k|      a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
 2360|  3.29k|      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 2361|       |
 2362|  3.29k|      b = _mm256_mullo_epi32(diff, shift);
 2363|  3.29k|      res = _mm256_add_epi32(a32, b);
 2364|  3.29k|      res = _mm256_srli_epi32(res, 5);
 2365|       |
 2366|  3.29k|      resy = _mm256_castsi256_si128(_mm256_packus_epi32(
 2367|  3.29k|          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
 2368|  3.29k|    } else {
 2369|    702|      resy = resx;
 2370|    702|    }
 2371|  4.00k|    resxy =
 2372|  4.00k|        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
 2373|  4.00k|    _mm_storeu_si128((__m128i *)(dst), resxy);
 2374|  4.00k|    dst += stride;
 2375|  4.00k|  }
 2376|    490|}
intrapred_avx2.c:highbd_dr_prediction_z2_HxW_avx2:
 2722|  53.8k|    int dy) {
 2723|       |  // here upsample_above and upsample_left are 0 by design of
 2724|       |  // av1_use_intra_edge_upsample
 2725|  53.8k|  const int min_base_x = -1;
 2726|  53.8k|  const int min_base_y = -1;
 2727|  53.8k|  (void)upsample_above;
 2728|  53.8k|  (void)upsample_left;
 2729|  53.8k|  const int frac_bits_x = 6;
 2730|  53.8k|  const int frac_bits_y = 6;
 2731|       |
 2732|       |  // pre-filter above pixels
 2733|       |  // store in temp buffers:
 2734|       |  //   above[x] * 32 + 16
 2735|       |  //   above[x+1] - above[x]
 2736|       |  // final pixels will be calculated as:
 2737|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 2738|  53.8k|  __m256i a0_x, a1_x, a32, a16, c3f, c1;
 2739|  53.8k|  __m256i diff, min_base_y256, dy256, c1234, c0123;
 2740|  53.8k|  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
  ------------------
  |  |   19|  53.8k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 2741|       |
 2742|  53.8k|  a16 = _mm256_set1_epi16(16);
 2743|  53.8k|  c1 = _mm256_srli_epi16(a16, 4);
 2744|  53.8k|  min_base_y256 = _mm256_set1_epi16(min_base_y);
 2745|  53.8k|  c3f = _mm256_set1_epi16(0x3f);
 2746|  53.8k|  dy256 = _mm256_set1_epi16(dy);
 2747|  53.8k|  c0123 =
 2748|  53.8k|      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 2749|  53.8k|  c1234 = _mm256_add_epi16(c0123, c1);
 2750|       |
 2751|   843k|  for (int r = 0; r < H; r++) {
  ------------------
  |  Branch (2751:19): [True: 790k, False: 53.8k]
  ------------------
 2752|   790k|    __m256i b, res, shift;
 2753|   790k|    __m256i resx, resy, ydx;
 2754|   790k|    __m256i resxy, j256, r6;
 2755|   790k|    __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
 2756|   790k|    int y = r + 1;
 2757|   790k|    ydx = _mm256_set1_epi16((short)(y * dx));
 2758|       |
 2759|  1.94M|    for (int j = 0; j < W; j += 16) {
  ------------------
  |  Branch (2759:21): [True: 1.15M, False: 790k]
  ------------------
 2760|  1.15M|      j256 = _mm256_set1_epi16(j);
 2761|  1.15M|      int base_x = ((j << 6) - y * dx) >> frac_bits_x;
 2762|  1.15M|      int base_shift = 0;
 2763|  1.15M|      if ((base_x) < (min_base_x - 1)) {
  ------------------
  |  Branch (2763:11): [True: 775k, False: 384k]
  ------------------
 2764|   775k|        base_shift = (min_base_x - (base_x)-1);
 2765|   775k|      }
 2766|  1.15M|      int base_min_diff = (min_base_x - base_x);
 2767|  1.15M|      if (base_min_diff > 16) {
  ------------------
  |  Branch (2767:11): [True: 448k, False: 711k]
  ------------------
 2768|   448k|        base_min_diff = 16;
 2769|   711k|      } else {
 2770|   711k|        if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (2770:13): [True: 195k, False: 516k]
  ------------------
 2771|   711k|      }
 2772|       |
 2773|  1.15M|      if (base_shift < 8) {
  ------------------
  |  Branch (2773:11): [True: 596k, False: 563k]
  ------------------
 2774|   596k|        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
 2775|   596k|        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
 2776|   596k|        a0_x128 =
 2777|   596k|            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2778|   596k|        a1_x128 =
 2779|   596k|            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2780|       |
 2781|   596k|        a0_x = _mm256_castsi128_si256(a0_x128);
 2782|   596k|        a1_x = _mm256_castsi128_si256(a1_x128);
 2783|   596k|      } else {
 2784|   563k|        a0_x = _mm256_setzero_si256();
 2785|   563k|        a1_x = _mm256_setzero_si256();
 2786|   563k|      }
 2787|       |
 2788|  1.15M|      int base_shift1 = 0;
 2789|  1.15M|      if (base_shift > 8) {
  ------------------
  |  Branch (2789:11): [True: 543k, False: 615k]
  ------------------
 2790|   543k|        base_shift1 = base_shift - 8;
 2791|   543k|      }
 2792|  1.15M|      if (base_shift1 < 8) {
  ------------------
  |  Branch (2792:11): [True: 711k, False: 448k]
  ------------------
 2793|   711k|        a0_1_x128 =
 2794|   711k|            _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8));
 2795|   711k|        a1_1_x128 =
 2796|   711k|            _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9));
 2797|   711k|        a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
 2798|   711k|                                     *(__m128i *)HighbdLoadMaskx[base_shift1]);
 2799|   711k|        a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
 2800|   711k|                                     *(__m128i *)HighbdLoadMaskx[base_shift1]);
 2801|       |
 2802|   711k|        a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
 2803|   711k|        a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
 2804|   711k|      }
 2805|  1.15M|      r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
 2806|  1.15M|      shift = _mm256_srli_epi16(
 2807|  1.15M|          _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
 2808|       |
 2809|  1.15M|      diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
 2810|  1.15M|      a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
 2811|  1.15M|      a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
 2812|       |
 2813|  1.15M|      b = _mm256_mullo_epi16(diff, shift);
 2814|  1.15M|      res = _mm256_add_epi16(a32, b);
 2815|  1.15M|      resx = _mm256_srli_epi16(res, 5);  // 16 16-bit values
 2816|       |
 2817|       |      // y calc
 2818|  1.15M|      resy = _mm256_setzero_si256();
 2819|  1.15M|      __m256i a0_y, a1_y, shifty;
 2820|  1.15M|      if ((base_x < min_base_x)) {
  ------------------
  |  Branch (2820:11): [True: 846k, False: 313k]
  ------------------
 2821|   846k|        __m256i c256, y_c256, base_y_c256, mask256, mul16;
 2822|   846k|        r6 = _mm256_set1_epi16(r << 6);
 2823|   846k|        c256 = _mm256_add_epi16(j256, c1234);
 2824|   846k|        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
 2825|   846k|                                 _mm256_srli_epi16(min_base_y256, 1));
 2826|   846k|        y_c256 = _mm256_sub_epi16(r6, mul16);
 2827|   846k|        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
 2828|   846k|        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
 2829|   846k|        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
 2830|   846k|        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
 2831|       |
 2832|   846k|        a0_y = _mm256_setr_epi16(
 2833|   846k|            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
 2834|   846k|            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
 2835|   846k|            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
 2836|   846k|            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
 2837|   846k|            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
 2838|   846k|            left[base_y_c[15]]);
 2839|   846k|        base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
 2840|   846k|        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
 2841|       |
 2842|   846k|        a1_y = _mm256_setr_epi16(
 2843|   846k|            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
 2844|   846k|            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
 2845|   846k|            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
 2846|   846k|            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
 2847|   846k|            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
 2848|   846k|            left[base_y_c[15]]);
 2849|       |
 2850|   846k|        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
 2851|       |
 2852|   846k|        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
 2853|   846k|        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
 2854|   846k|        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
 2855|       |
 2856|   846k|        b = _mm256_mullo_epi16(diff, shifty);
 2857|   846k|        res = _mm256_add_epi16(a32, b);
 2858|   846k|        resy = _mm256_srli_epi16(res, 5);
 2859|   846k|      }
 2860|       |
 2861|  1.15M|      resxy = _mm256_blendv_epi8(resx, resy,
 2862|  1.15M|                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
 2863|  1.15M|      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
 2864|  1.15M|    }  // for j
 2865|   790k|    dst += stride;
 2866|   790k|  }
 2867|  53.8k|}
intrapred_avx2.c:highbd_dr_prediction_32bit_z2_HxW_avx2:
 2526|    448|    int dy) {
 2527|       |  // here upsample_above and upsample_left are 0 by design of
 2528|       |  // av1_use_intra_edge_upsample
 2529|    448|  const int min_base_x = -1;
 2530|    448|  const int min_base_y = -1;
 2531|    448|  (void)upsample_above;
 2532|    448|  (void)upsample_left;
 2533|    448|  const int frac_bits_x = 6;
 2534|    448|  const int frac_bits_y = 6;
 2535|       |
 2536|       |  // pre-filter above pixels
 2537|       |  // store in temp buffers:
 2538|       |  //   above[x] * 32 + 16
 2539|       |  //   above[x+1] - above[x]
 2540|       |  // final pixels will be calculated as:
 2541|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 2542|    448|  __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1;
 2543|    448|  __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8;
 2544|    448|  __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
 2545|    448|  DECLARE_ALIGNED(32, int, base_y_c[16]);
  ------------------
  |  |   19|    448|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 2546|       |
 2547|    448|  a16 = _mm256_set1_epi32(16);
 2548|    448|  c1 = _mm256_srli_epi32(a16, 4);
 2549|    448|  c8 = _mm256_srli_epi32(a16, 1);
 2550|    448|  min_base_y256 = _mm256_set1_epi32(min_base_y);
 2551|    448|  c3f = _mm256_set1_epi32(0x3f);
 2552|    448|  dy256 = _mm256_set1_epi32(dy);
 2553|    448|  c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
 2554|    448|  c1234 = _mm256_add_epi32(c0123, c1);
 2555|       |
 2556|  8.86k|  for (int r = 0; r < H; r++) {
  ------------------
  |  Branch (2556:19): [True: 8.41k, False: 448]
  ------------------
 2557|  8.41k|    __m256i b, res, shift, ydx;
 2558|  8.41k|    __m256i resx[2], resy[2];
 2559|  8.41k|    __m256i resxy, j256, r6;
 2560|  21.3k|    for (int j = 0; j < W; j += 16) {
  ------------------
  |  Branch (2560:21): [True: 12.9k, False: 8.41k]
  ------------------
 2561|  12.9k|      j256 = _mm256_set1_epi32(j);
 2562|  12.9k|      int y = r + 1;
 2563|  12.9k|      ydx = _mm256_set1_epi32(y * dx);
 2564|       |
 2565|  12.9k|      int base_x = ((j << 6) - y * dx) >> frac_bits_x;
 2566|  12.9k|      int base_shift = 0;
 2567|  12.9k|      if ((base_x) < (min_base_x - 1)) {
  ------------------
  |  Branch (2567:11): [True: 9.05k, False: 3.92k]
  ------------------
 2568|  9.05k|        base_shift = (min_base_x - base_x - 1);
 2569|  9.05k|      }
 2570|  12.9k|      int base_min_diff = (min_base_x - base_x);
 2571|  12.9k|      if (base_min_diff > 16) {
  ------------------
  |  Branch (2571:11): [True: 4.75k, False: 8.22k]
  ------------------
 2572|  4.75k|        base_min_diff = 16;
 2573|  8.22k|      } else {
 2574|  8.22k|        if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (2574:13): [True: 2.57k, False: 5.65k]
  ------------------
 2575|  8.22k|      }
 2576|       |
 2577|  12.9k|      if (base_shift > 7) {
  ------------------
  |  Branch (2577:11): [True: 6.37k, False: 6.59k]
  ------------------
 2578|  6.37k|        resx[0] = _mm256_setzero_si256();
 2579|  6.59k|      } else {
 2580|  6.59k|        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
 2581|  6.59k|        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
 2582|  6.59k|        a0_x128 =
 2583|  6.59k|            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2584|  6.59k|        a1_x128 =
 2585|  6.59k|            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2586|       |
 2587|  6.59k|        a0_x = _mm256_cvtepu16_epi32(a0_x128);
 2588|  6.59k|        a1_x = _mm256_cvtepu16_epi32(a1_x128);
 2589|       |
 2590|  6.59k|        r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6);
 2591|  6.59k|        shift = _mm256_srli_epi32(
 2592|  6.59k|            _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
 2593|       |
 2594|  6.59k|        diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
 2595|  6.59k|        a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
 2596|  6.59k|        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 2597|       |
 2598|  6.59k|        b = _mm256_mullo_epi32(diff, shift);
 2599|  6.59k|        res = _mm256_add_epi32(a32, b);
 2600|  6.59k|        res = _mm256_srli_epi32(res, 5);
 2601|       |
 2602|  6.59k|        resx[0] = _mm256_packus_epi32(
 2603|  6.59k|            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
 2604|  6.59k|      }
 2605|  12.9k|      int base_shift8 = 0;
 2606|  12.9k|      if ((base_x + 8) < (min_base_x - 1)) {
  ------------------
  |  Branch (2606:11): [True: 6.05k, False: 6.92k]
  ------------------
 2607|  6.05k|        base_shift8 = (min_base_x - (base_x + 8) - 1);
 2608|  6.05k|      }
 2609|  12.9k|      if (base_shift8 > 7) {
  ------------------
  |  Branch (2609:11): [True: 4.75k, False: 8.22k]
  ------------------
 2610|  4.75k|        resx[1] = _mm256_setzero_si256();
 2611|  8.22k|      } else {
 2612|  8.22k|        a0_1_x128 =
 2613|  8.22k|            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8));
 2614|  8.22k|        a1_1_x128 =
 2615|  8.22k|            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9));
 2616|  8.22k|        a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
 2617|  8.22k|                                     *(__m128i *)HighbdLoadMaskx[base_shift8]);
 2618|  8.22k|        a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
 2619|  8.22k|                                     *(__m128i *)HighbdLoadMaskx[base_shift8]);
 2620|       |
 2621|  8.22k|        a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
 2622|  8.22k|        a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
 2623|       |
 2624|  8.22k|        r6 = _mm256_slli_epi32(
 2625|  8.22k|            _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6);
 2626|  8.22k|        shift = _mm256_srli_epi32(
 2627|  8.22k|            _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
 2628|       |
 2629|  8.22k|        diff = _mm256_sub_epi32(a1_1_x, a0_1_x);  // a[x+1] - a[x]
 2630|  8.22k|        a32 = _mm256_slli_epi32(a0_1_x, 5);       // a[x] * 32
 2631|  8.22k|        a32 = _mm256_add_epi32(a32, a16);         // a[x] * 32 + 16
 2632|  8.22k|        b = _mm256_mullo_epi32(diff, shift);
 2633|       |
 2634|  8.22k|        resx[1] = _mm256_add_epi32(a32, b);
 2635|  8.22k|        resx[1] = _mm256_srli_epi32(resx[1], 5);
 2636|  8.22k|        resx[1] = _mm256_packus_epi32(
 2637|  8.22k|            resx[1],
 2638|  8.22k|            _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
 2639|  8.22k|      }
 2640|  12.9k|      resx[0] =
 2641|  12.9k|          _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
 2642|  12.9k|                                  1);  // 16 16bit values
 2643|       |
 2644|       |      // y calc
 2645|  12.9k|      resy[0] = _mm256_setzero_si256();
 2646|  12.9k|      if ((base_x < min_base_x)) {
  ------------------
  |  Branch (2646:11): [True: 9.69k, False: 3.28k]
  ------------------
 2647|  9.69k|        __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256;
 2648|  9.69k|        r6 = _mm256_set1_epi32(r << 6);
 2649|  9.69k|        c256 = _mm256_add_epi32(j256, c1234);
 2650|  9.69k|        y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
 2651|  9.69k|        base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
 2652|  9.69k|        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
 2653|  9.69k|        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
 2654|  9.69k|        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
 2655|  9.69k|        c256 = _mm256_add_epi32(c256, c8);
 2656|  9.69k|        y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
 2657|  9.69k|        base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
 2658|  9.69k|        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
 2659|  9.69k|        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
 2660|  9.69k|        _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
 2661|       |
 2662|  9.69k|        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
 2663|  9.69k|            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
 2664|  9.69k|            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
 2665|  9.69k|            left[base_y_c[6]], left[base_y_c[7]]));
 2666|  9.69k|        a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
 2667|  9.69k|            left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
 2668|  9.69k|            left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
 2669|  9.69k|            left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
 2670|       |
 2671|  9.69k|        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
 2672|       |
 2673|  9.69k|        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
 2674|  9.69k|        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
 2675|  9.69k|        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 2676|       |
 2677|  9.69k|        b = _mm256_mullo_epi32(diff, shift);
 2678|  9.69k|        res = _mm256_add_epi32(a32, b);
 2679|  9.69k|        res = _mm256_srli_epi32(res, 5);
 2680|       |
 2681|  9.69k|        resy[0] = _mm256_packus_epi32(
 2682|  9.69k|            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
 2683|       |
 2684|  9.69k|        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
 2685|  9.69k|            left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
 2686|  9.69k|            left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
 2687|  9.69k|            left[base_y_c[14]], left[base_y_c[15]]));
 2688|  9.69k|        a1_y = _mm256_cvtepu16_epi32(
 2689|  9.69k|            _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
 2690|  9.69k|                           left[base_y_c[10] + 1], left[base_y_c[11] + 1],
 2691|  9.69k|                           left[base_y_c[12] + 1], left[base_y_c[13] + 1],
 2692|  9.69k|                           left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
 2693|  9.69k|        shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
 2694|       |
 2695|  9.69k|        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
 2696|  9.69k|        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
 2697|  9.69k|        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 2698|       |
 2699|  9.69k|        b = _mm256_mullo_epi32(diff, shift);
 2700|  9.69k|        res = _mm256_add_epi32(a32, b);
 2701|  9.69k|        res = _mm256_srli_epi32(res, 5);
 2702|       |
 2703|  9.69k|        resy[1] = _mm256_packus_epi32(
 2704|  9.69k|            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
 2705|       |
 2706|  9.69k|        resy[0] =
 2707|  9.69k|            _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
 2708|  9.69k|                                    1);  // 16 16bit values
 2709|  9.69k|      }
 2710|       |
 2711|  12.9k|      resxy = _mm256_blendv_epi8(resx[0], resy[0],
 2712|  12.9k|                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
 2713|  12.9k|      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
 2714|  12.9k|    }  // for j
 2715|  8.41k|    dst += stride;
 2716|  8.41k|  }
 2717|    448|}
intrapred_avx2.c:highbd_dr_prediction_z3_4x4_avx2:
 2916|  35.7k|                                             int bd) {
 2917|  35.7k|  __m128i dstvec[4], d[4];
 2918|  35.7k|  if (bd < 12) {
  ------------------
  |  Branch (2918:7): [True: 35.7k, False: 52]
  ------------------
 2919|  35.7k|    highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left,
 2920|  35.7k|                                              dy);
 2921|  35.7k|  } else {
 2922|     52|    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left,
 2923|     52|                                                    upsample_left, dy);
 2924|     52|  }
 2925|  35.7k|  highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
 2926|  35.7k|                                   &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
 2927|  35.7k|  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
 2928|  35.7k|  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
 2929|  35.7k|  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
 2930|  35.7k|  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
 2931|  35.7k|  return;
 2932|  35.7k|}
intrapred_avx2.c:highbd_dr_prediction_z3_8x8_avx2:
 2937|  33.7k|                                             int bd) {
 2938|  33.7k|  __m128i dstvec[8], d[8];
 2939|  33.7k|  if (bd < 12) {
  ------------------
  |  Branch (2939:7): [True: 33.5k, False: 202]
  ------------------
 2940|  33.5k|    highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left,
 2941|  33.5k|                                              dy);
 2942|  33.5k|  } else {
 2943|    202|    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left,
 2944|    202|                                                    upsample_left, dy);
 2945|    202|  }
 2946|  33.7k|  highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
 2947|  33.7k|                           &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
 2948|  33.7k|                           &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
 2949|  33.7k|                           &d[7]);
 2950|   303k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (2950:19): [True: 270k, False: 33.7k]
  ------------------
 2951|   270k|    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
 2952|   270k|  }
 2953|  33.7k|}
intrapred_avx2.c:highbd_dr_prediction_z3_16x16_avx2:
 3164|  9.83k|                                               int bd) {
 3165|  9.83k|  __m256i dstvec[16], d[16];
 3166|  9.83k|  if (bd < 12) {
  ------------------
  |  Branch (3166:7): [True: 9.81k, False: 22]
  ------------------
 3167|  9.81k|    highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
 3168|  9.81k|                                               dy);
 3169|  9.81k|  } else {
 3170|     22|    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left,
 3171|     22|                                                     upsample_left, dy);
 3172|     22|  }
 3173|       |
 3174|  9.83k|  highbd_transpose16x16_avx2(dstvec, d);
 3175|       |
 3176|   167k|  for (int i = 0; i < 16; i++) {
  ------------------
  |  Branch (3176:19): [True: 157k, False: 9.83k]
  ------------------
 3177|   157k|    _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]);
 3178|   157k|  }
 3179|  9.83k|}
intrapred_avx2.c:highbd_transpose16x16_avx2:
  243|  37.6k|static inline void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) {
  244|  37.6k|  __m256i w0, w1, w2, w3, ww0, ww1;
  245|  37.6k|  __m256i dd[16];
  246|  37.6k|  w0 = _mm256_unpacklo_epi16(x[0], x[1]);
  247|  37.6k|  w1 = _mm256_unpacklo_epi16(x[2], x[3]);
  248|  37.6k|  w2 = _mm256_unpacklo_epi16(x[4], x[5]);
  249|  37.6k|  w3 = _mm256_unpacklo_epi16(x[6], x[7]);
  250|       |
  251|  37.6k|  ww0 = _mm256_unpacklo_epi32(w0, w1);  //
  252|  37.6k|  ww1 = _mm256_unpacklo_epi32(w2, w3);  //
  253|       |
  254|  37.6k|  dd[0] = _mm256_unpacklo_epi64(ww0, ww1);
  255|  37.6k|  dd[1] = _mm256_unpackhi_epi64(ww0, ww1);
  256|       |
  257|  37.6k|  ww0 = _mm256_unpackhi_epi32(w0, w1);  //
  258|  37.6k|  ww1 = _mm256_unpackhi_epi32(w2, w3);  //
  259|       |
  260|  37.6k|  dd[2] = _mm256_unpacklo_epi64(ww0, ww1);
  261|  37.6k|  dd[3] = _mm256_unpackhi_epi64(ww0, ww1);
  262|       |
  263|  37.6k|  w0 = _mm256_unpackhi_epi16(x[0], x[1]);
  264|  37.6k|  w1 = _mm256_unpackhi_epi16(x[2], x[3]);
  265|  37.6k|  w2 = _mm256_unpackhi_epi16(x[4], x[5]);
  266|  37.6k|  w3 = _mm256_unpackhi_epi16(x[6], x[7]);
  267|       |
  268|  37.6k|  ww0 = _mm256_unpacklo_epi32(w0, w1);  //
  269|  37.6k|  ww1 = _mm256_unpacklo_epi32(w2, w3);  //
  270|       |
  271|  37.6k|  dd[4] = _mm256_unpacklo_epi64(ww0, ww1);
  272|  37.6k|  dd[5] = _mm256_unpackhi_epi64(ww0, ww1);
  273|       |
  274|  37.6k|  ww0 = _mm256_unpackhi_epi32(w0, w1);  //
  275|  37.6k|  ww1 = _mm256_unpackhi_epi32(w2, w3);  //
  276|       |
  277|  37.6k|  dd[6] = _mm256_unpacklo_epi64(ww0, ww1);
  278|  37.6k|  dd[7] = _mm256_unpackhi_epi64(ww0, ww1);
  279|       |
  280|  37.6k|  w0 = _mm256_unpacklo_epi16(x[8], x[9]);
  281|  37.6k|  w1 = _mm256_unpacklo_epi16(x[10], x[11]);
  282|  37.6k|  w2 = _mm256_unpacklo_epi16(x[12], x[13]);
  283|  37.6k|  w3 = _mm256_unpacklo_epi16(x[14], x[15]);
  284|       |
  285|  37.6k|  ww0 = _mm256_unpacklo_epi32(w0, w1);
  286|  37.6k|  ww1 = _mm256_unpacklo_epi32(w2, w3);
  287|       |
  288|  37.6k|  dd[8] = _mm256_unpacklo_epi64(ww0, ww1);
  289|  37.6k|  dd[9] = _mm256_unpackhi_epi64(ww0, ww1);
  290|       |
  291|  37.6k|  ww0 = _mm256_unpackhi_epi32(w0, w1);
  292|  37.6k|  ww1 = _mm256_unpackhi_epi32(w2, w3);
  293|       |
  294|  37.6k|  dd[10] = _mm256_unpacklo_epi64(ww0, ww1);
  295|  37.6k|  dd[11] = _mm256_unpackhi_epi64(ww0, ww1);
  296|       |
  297|  37.6k|  w0 = _mm256_unpackhi_epi16(x[8], x[9]);
  298|  37.6k|  w1 = _mm256_unpackhi_epi16(x[10], x[11]);
  299|  37.6k|  w2 = _mm256_unpackhi_epi16(x[12], x[13]);
  300|  37.6k|  w3 = _mm256_unpackhi_epi16(x[14], x[15]);
  301|       |
  302|  37.6k|  ww0 = _mm256_unpacklo_epi32(w0, w1);
  303|  37.6k|  ww1 = _mm256_unpacklo_epi32(w2, w3);
  304|       |
  305|  37.6k|  dd[12] = _mm256_unpacklo_epi64(ww0, ww1);
  306|  37.6k|  dd[13] = _mm256_unpackhi_epi64(ww0, ww1);
  307|       |
  308|  37.6k|  ww0 = _mm256_unpackhi_epi32(w0, w1);
  309|  37.6k|  ww1 = _mm256_unpackhi_epi32(w2, w3);
  310|       |
  311|  37.6k|  dd[14] = _mm256_unpacklo_epi64(ww0, ww1);
  312|  37.6k|  dd[15] = _mm256_unpackhi_epi64(ww0, ww1);
  313|       |
  314|   338k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (314:19): [True: 300k, False: 37.6k]
  ------------------
  315|   300k|    d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1);
  316|       |    d[i + 8] = _mm256_insertf128_si256(dd[i + 8],
  317|   300k|                                       _mm256_extracti128_si256(dd[i], 1), 0);
  318|   300k|  }
  319|  37.6k|}
intrapred_avx2.c:highbd_dr_prediction_z3_32x32_avx2:
 3184|  4.43k|                                               int bd) {
 3185|  4.43k|  __m256i dstvec[64], d[16];
 3186|  4.43k|  if (bd < 12) {
  ------------------
  |  Branch (3186:7): [True: 4.42k, False: 8]
  ------------------
 3187|  4.42k|    highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
 3188|  4.42k|                                               dy);
 3189|  4.42k|  } else {
 3190|      8|    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left,
 3191|      8|                                                     upsample_left, dy);
 3192|      8|  }
 3193|  4.43k|  highbd_transpose16x16_avx2(dstvec, d);
 3194|  75.3k|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (3194:19): [True: 70.8k, False: 4.43k]
  ------------------
 3195|  70.8k|    _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
 3196|  70.8k|  }
 3197|  4.43k|  highbd_transpose16x16_avx2(dstvec + 16, d);
 3198|  75.3k|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (3198:19): [True: 70.8k, False: 4.43k]
  ------------------
 3199|  70.8k|    _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]);
 3200|  70.8k|  }
 3201|  4.43k|  highbd_transpose16x16_avx2(dstvec + 32, d);
 3202|  75.3k|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (3202:19): [True: 70.8k, False: 4.43k]
  ------------------
 3203|  70.8k|    _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]);
 3204|  70.8k|  }
 3205|  4.43k|  highbd_transpose16x16_avx2(dstvec + 48, d);
 3206|  75.3k|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (3206:19): [True: 70.8k, False: 4.43k]
  ------------------
 3207|  70.8k|    _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]);
 3208|  70.8k|  }
 3209|  4.43k|}
intrapred_avx2.c:highbd_dr_prediction_z3_64x64_avx2:
 3214|    220|                                               int bd) {
 3215|    220|  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
  ------------------
  |  |   19|    220|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 3216|    220|  if (bd < 12) {
  ------------------
  |  Branch (3216:7): [True: 220, False: 0]
  ------------------
 3217|    220|    highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
 3218|    220|  } else {
 3219|      0|    highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left,
 3220|      0|                                            dy);
 3221|      0|  }
 3222|    220|  highbd_transpose(dstT, 64, dst, stride, 64, 64);
 3223|    220|}
intrapred_avx2.c:highbd_transpose:
 1971|    578|                             int height) {
 1972|  2.71k|  for (int j = 0; j < height; j += 16)
  ------------------
  |  Branch (1972:19): [True: 2.14k, False: 578]
  ------------------
 1973|  7.87k|    for (int i = 0; i < width; i += 16)
  ------------------
  |  Branch (1973:21): [True: 5.73k, False: 2.14k]
  ------------------
 1974|  5.73k|      highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
 1975|  5.73k|                                dst + j * pitchDst + i, pitchDst);
 1976|    578|}
intrapred_avx2.c:highbd_transpose_TX_16X16:
 1957|  5.73k|                                      uint16_t *dst, ptrdiff_t pitchDst) {
 1958|  5.73k|  __m256i r[16];
 1959|  5.73k|  __m256i d[16];
 1960|  97.4k|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (1960:19): [True: 91.7k, False: 5.73k]
  ------------------
 1961|  91.7k|    r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
 1962|  91.7k|  }
 1963|  5.73k|  highbd_transpose16x16_avx2(r, d);
 1964|  97.4k|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (1964:19): [True: 91.7k, False: 5.73k]
  ------------------
 1965|  91.7k|    _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
 1966|  91.7k|  }
 1967|  5.73k|}
intrapred_avx2.c:highbd_dr_prediction_z3_4x8_avx2:
 2958|  3.98k|                                             int bd) {
 2959|  3.98k|  __m128i dstvec[4], d[8];
 2960|  3.98k|  if (bd < 12) {
  ------------------
  |  Branch (2960:7): [True: 3.91k, False: 68]
  ------------------
 2961|  3.91k|    highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left,
 2962|  3.91k|                                              dy);
 2963|  3.91k|  } else {
 2964|     68|    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left,
 2965|     68|                                                    upsample_left, dy);
 2966|     68|  }
 2967|       |
 2968|  3.98k|  highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
 2969|  3.98k|                               &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
 2970|  3.98k|                               &d[7]);
 2971|  35.8k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (2971:19): [True: 31.8k, False: 3.98k]
  ------------------
 2972|  31.8k|    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
 2973|  31.8k|  }
 2974|  3.98k|}
intrapred_avx2.c:highbd_dr_prediction_z3_8x16_avx2:
 3001|  4.93k|                                              int bd) {
 3002|  4.93k|  __m256i dstvec[8], d[8];
 3003|  4.93k|  if (bd < 12) {
  ------------------
  |  Branch (3003:7): [True: 4.92k, False: 16]
  ------------------
 3004|  4.92k|    highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
 3005|  4.92k|                                               dy);
 3006|  4.92k|  } else {
 3007|     16|    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left,
 3008|     16|                                                     upsample_left, dy);
 3009|     16|  }
 3010|  4.93k|  highbd_transpose8x16_16x8_avx2(dstvec, d);
 3011|  44.4k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (3011:19): [True: 39.4k, False: 4.93k]
  ------------------
 3012|  39.4k|    _mm_storeu_si128((__m128i *)(dst + i * stride),
 3013|  39.4k|                     _mm256_castsi256_si128(d[i]));
 3014|  39.4k|  }
 3015|  44.4k|  for (int i = 8; i < 16; i++) {
  ------------------
  |  Branch (3015:19): [True: 39.4k, False: 4.93k]
  ------------------
 3016|  39.4k|    _mm_storeu_si128((__m128i *)(dst + i * stride),
 3017|       |                     _mm256_extracti128_si256(d[i - 8], 1));
 3018|  39.4k|  }
 3019|  4.93k|}
intrapred_avx2.c:highbd_transpose8x16_16x8_avx2:
  205|  13.1k|static inline void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
  206|  13.1k|  __m256i w0, w1, w2, w3, ww0, ww1;
  207|       |
  208|  13.1k|  w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
  209|  13.1k|  w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
  210|  13.1k|  w2 = _mm256_unpacklo_epi16(x[4], x[5]);  // 40 50 41 51 42 52 43 53
  211|  13.1k|  w3 = _mm256_unpacklo_epi16(x[6], x[7]);  // 60 70 61 71 62 72 63 73
  212|       |
  213|  13.1k|  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  214|  13.1k|  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
  215|       |
  216|  13.1k|  d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
  217|  13.1k|  d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
  218|       |
  219|  13.1k|  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  220|  13.1k|  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
  221|       |
  222|  13.1k|  d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
  223|  13.1k|  d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
  224|       |
  225|  13.1k|  w0 = _mm256_unpackhi_epi16(x[0], x[1]);  // 04 14 05 15 06 16 07 17
  226|  13.1k|  w1 = _mm256_unpackhi_epi16(x[2], x[3]);  // 24 34 25 35 26 36 27 37
  227|  13.1k|  w2 = _mm256_unpackhi_epi16(x[4], x[5]);  // 44 54 45 55 46 56 47 57
  228|  13.1k|  w3 = _mm256_unpackhi_epi16(x[6], x[7]);  // 64 74 65 75 66 76 67 77
  229|       |
  230|  13.1k|  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
  231|  13.1k|  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
  232|       |
  233|  13.1k|  d[4] = _mm256_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
  234|  13.1k|  d[5] = _mm256_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
  235|       |
  236|  13.1k|  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
  237|  13.1k|  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
  238|       |
  239|  13.1k|  d[6] = _mm256_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
  240|  13.1k|  d[7] = _mm256_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
  241|  13.1k|}
intrapred_avx2.c:highbd_dr_prediction_z3_16x32_avx2:
 3228|  1.32k|                                               int bd) {
 3229|  1.32k|  __m256i dstvec[32], d[32];
 3230|  1.32k|  if (bd < 12) {
  ------------------
  |  Branch (3230:7): [True: 1.30k, False: 12]
  ------------------
 3231|  1.30k|    highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
 3232|  1.30k|                                               dy);
 3233|  1.30k|  } else {
 3234|     12|    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left,
 3235|     12|                                                     upsample_left, dy);
 3236|     12|  }
 3237|  6.60k|  for (int i = 0; i < 32; i += 8) {
  ------------------
  |  Branch (3237:19): [True: 5.28k, False: 1.32k]
  ------------------
 3238|  5.28k|    highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
 3239|  5.28k|  }
 3240|       |  // store
 3241|  3.96k|  for (int j = 0; j < 32; j += 16) {
  ------------------
  |  Branch (3241:19): [True: 2.64k, False: 1.32k]
  ------------------
 3242|  23.7k|    for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (3242:21): [True: 21.1k, False: 2.64k]
  ------------------
 3243|  21.1k|      _mm_storeu_si128((__m128i *)(dst + (i + j) * stride),
 3244|  21.1k|                       _mm256_castsi256_si128(d[(i + j)]));
 3245|  21.1k|    }
 3246|  23.7k|    for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (3246:21): [True: 21.1k, False: 2.64k]
  ------------------
 3247|  21.1k|      _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8),
 3248|  21.1k|                       _mm256_castsi256_si128(d[(i + j) + 8]));
 3249|  21.1k|    }
 3250|  23.7k|    for (int i = 8; i < 16; i++) {
  ------------------
  |  Branch (3250:21): [True: 21.1k, False: 2.64k]
  ------------------
 3251|  21.1k|      _mm256_storeu_si256(
 3252|  21.1k|          (__m256i *)(dst + (i + j) * stride),
 3253|       |          _mm256_inserti128_si256(
 3254|  21.1k|              d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0));
 3255|  21.1k|    }
 3256|  2.64k|  }
 3257|  1.32k|}
intrapred_avx2.c:highbd_dr_prediction_z3_32x64_avx2:
 3282|    109|                                               int bd) {
 3283|    109|  uint16_t dstT[64 * 32];
 3284|    109|  if (bd < 12) {
  ------------------
  |  Branch (3284:7): [True: 109, False: 0]
  ------------------
 3285|    109|    highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
 3286|    109|  } else {
 3287|      0|    highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left,
 3288|      0|                                            dy);
 3289|      0|  }
 3290|    109|  highbd_transpose(dstT, 64, dst, stride, 32, 64);
 3291|    109|}
intrapred_avx2.c:highbd_dr_prediction_z3_4x16_avx2:
 3050|  2.96k|                                              int bd) {
 3051|  2.96k|  __m256i dstvec[4], d[4], d1;
 3052|  2.96k|  if (bd < 12) {
  ------------------
  |  Branch (3052:7): [True: 2.95k, False: 2]
  ------------------
 3053|  2.95k|    highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
 3054|  2.95k|                                               dy);
 3055|  2.95k|  } else {
 3056|      2|    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left,
 3057|      2|                                                     upsample_left, dy);
 3058|      2|  }
 3059|  2.96k|  highbd_transpose4x16_avx2(dstvec, d);
 3060|  14.8k|  for (int i = 0; i < 4; i++) {
  ------------------
  |  Branch (3060:19): [True: 11.8k, False: 2.96k]
  ------------------
 3061|  11.8k|    _mm_storel_epi64((__m128i *)(dst + i * stride),
 3062|  11.8k|                     _mm256_castsi256_si128(d[i]));
 3063|  11.8k|    d1 = _mm256_bsrli_epi128(d[i], 8);
 3064|  11.8k|    _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride),
 3065|  11.8k|                     _mm256_castsi256_si128(d1));
 3066|  11.8k|    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
 3067|  11.8k|                     _mm256_extracti128_si256(d[i], 1));
 3068|  11.8k|    _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride),
 3069|       |                     _mm256_extracti128_si256(d1, 1));
 3070|  11.8k|  }
 3071|  2.96k|}
intrapred_avx2.c:highbd_transpose4x16_avx2:
  183|  2.96k|static inline void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
  184|  2.96k|  __m256i w0, w1, w2, w3, ww0, ww1;
  185|       |
  186|  2.96k|  w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
  187|  2.96k|  w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
  188|  2.96k|  w2 = _mm256_unpackhi_epi16(x[0], x[1]);  // 40 50 41 51 42 52 43 53
  189|  2.96k|  w3 = _mm256_unpackhi_epi16(x[2], x[3]);  // 60 70 61 71 62 72 63 73
  190|       |
  191|  2.96k|  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  192|  2.96k|  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
  193|       |
  194|  2.96k|  d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
  195|  2.96k|  d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
  196|       |
  197|  2.96k|  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  198|  2.96k|  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
  199|       |
  200|  2.96k|  d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
  201|  2.96k|  d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
  202|  2.96k|}
intrapred_avx2.c:highbd_dr_prediction_z3_8x32_avx2:
 3100|  1.47k|                                              int bd) {
 3101|  1.47k|  __m256i dstvec[16], d[16];
 3102|  1.47k|  if (bd < 12) {
  ------------------
  |  Branch (3102:7): [True: 1.47k, False: 0]
  ------------------
 3103|  1.47k|    highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
 3104|  1.47k|                                               dy);
 3105|  1.47k|  } else {
 3106|      0|    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left,
 3107|      0|                                                     upsample_left, dy);
 3108|      0|  }
 3109|       |
 3110|  4.42k|  for (int i = 0; i < 16; i += 8) {
  ------------------
  |  Branch (3110:19): [True: 2.94k, False: 1.47k]
  ------------------
 3111|  2.94k|    highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
 3112|  2.94k|  }
 3113|       |
 3114|  13.2k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (3114:19): [True: 11.7k, False: 1.47k]
  ------------------
 3115|  11.7k|    _mm_storeu_si128((__m128i *)(dst + i * stride),
 3116|  11.7k|                     _mm256_castsi256_si128(d[i]));
 3117|  11.7k|  }
 3118|  13.2k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (3118:19): [True: 11.7k, False: 1.47k]
  ------------------
 3119|  11.7k|    _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
 3120|  11.7k|                     _mm256_extracti128_si256(d[i], 1));
 3121|  11.7k|  }
 3122|  13.2k|  for (int i = 8; i < 16; i++) {
  ------------------
  |  Branch (3122:19): [True: 11.7k, False: 1.47k]
  ------------------
 3123|  11.7k|    _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
 3124|  11.7k|                     _mm256_castsi256_si128(d[i]));
 3125|  11.7k|  }
 3126|  13.2k|  for (int i = 8; i < 16; i++) {
  ------------------
  |  Branch (3126:19): [True: 11.7k, False: 1.47k]
  ------------------
 3127|  11.7k|    _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride),
 3128|       |                     _mm256_extracti128_si256(d[i], 1));
 3129|  11.7k|  }
 3130|  1.47k|}
intrapred_avx2.c:highbd_dr_prediction_z3_16x64_avx2:
 3307|    163|                                               int bd) {
 3308|    163|  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
  ------------------
  |  |   19|    163|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 3309|    163|  if (bd < 12) {
  ------------------
  |  Branch (3309:7): [True: 163, False: 0]
  ------------------
 3310|    163|    highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
 3311|    163|  } else {
 3312|      0|    highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left,
 3313|      0|                                            dy);
 3314|      0|  }
 3315|    163|  highbd_transpose(dstT, 64, dst, stride, 16, 64);
 3316|    163|}
intrapred_avx2.c:highbd_dr_prediction_z3_8x4_avx2:
 2979|  6.38k|                                             int bd) {
 2980|  6.38k|  __m128i dstvec[8], d[4];
 2981|  6.38k|  if (bd < 12) {
  ------------------
  |  Branch (2981:7): [True: 6.34k, False: 42]
  ------------------
 2982|  6.34k|    highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left,
 2983|  6.34k|                                              dy);
 2984|  6.34k|  } else {
 2985|     42|    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left,
 2986|     42|                                                    upsample_left, dy);
 2987|     42|  }
 2988|       |
 2989|  6.38k|  highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
 2990|  6.38k|                               &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
 2991|  6.38k|                               &d[0], &d[1], &d[2], &d[3]);
 2992|  6.38k|  _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
 2993|  6.38k|  _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]);
 2994|  6.38k|  _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]);
 2995|  6.38k|  _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]);
 2996|  6.38k|}
intrapred_avx2.c:highbd_dr_prediction_z3_16x8_avx2:
 3024|  7.83k|                                              int bd) {
 3025|  7.83k|  __m128i dstvec[16], d[16];
 3026|  7.83k|  if (bd < 12) {
  ------------------
  |  Branch (3026:7): [True: 7.78k, False: 50]
  ------------------
 3027|  7.78k|    highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
 3028|  7.78k|                                              dy);
 3029|  7.78k|  } else {
 3030|     50|    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left,
 3031|     50|                                                    upsample_left, dy);
 3032|     50|  }
 3033|  23.5k|  for (int i = 0; i < 16; i += 8) {
  ------------------
  |  Branch (3033:19): [True: 15.6k, False: 7.83k]
  ------------------
 3034|  15.6k|    highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
 3035|  15.6k|                             &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
 3036|  15.6k|                             &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
 3037|  15.6k|                             &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
 3038|  15.6k|                             &d[5 + i], &d[6 + i], &d[7 + i]);
 3039|  15.6k|  }
 3040|  70.5k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (3040:19): [True: 62.7k, False: 7.83k]
  ------------------
 3041|  62.7k|    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
 3042|  62.7k|    _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
 3043|  62.7k|  }
 3044|  7.83k|}
intrapred_avx2.c:highbd_dr_prediction_z3_32x16_avx2:
 3262|  1.73k|                                               int bd) {
 3263|  1.73k|  __m256i dstvec[32], d[16];
 3264|  1.73k|  if (bd < 12) {
  ------------------
  |  Branch (3264:7): [True: 1.70k, False: 28]
  ------------------
 3265|  1.70k|    highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
 3266|  1.70k|                                               dy);
 3267|  1.70k|  } else {
 3268|     28|    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left,
 3269|     28|                                                     upsample_left, dy);
 3270|     28|  }
 3271|  5.20k|  for (int i = 0; i < 32; i += 16) {
  ------------------
  |  Branch (3271:19): [True: 3.46k, False: 1.73k]
  ------------------
 3272|  3.46k|    highbd_transpose16x16_avx2((dstvec + i), d);
 3273|  58.9k|    for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (3273:21): [True: 55.4k, False: 3.46k]
  ------------------
 3274|  55.4k|      _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
 3275|  55.4k|    }
 3276|  3.46k|  }
 3277|  1.73k|}
intrapred_avx2.c:highbd_dr_prediction_z3_64x32_avx2:
 3296|     86|                                               int bd) {
 3297|     86|  DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
  ------------------
  |  |   19|     86|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 3298|     86|  highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd);
 3299|     86|  highbd_transpose(dstT, 32, dst, stride, 64, 32);
 3300|     86|  return;
 3301|     86|}
intrapred_avx2.c:highbd_dr_prediction_z3_16x4_avx2:
 3076|  7.46k|                                              int bd) {
 3077|  7.46k|  __m128i dstvec[16], d[8];
 3078|  7.46k|  if (bd < 12) {
  ------------------
  |  Branch (3078:7): [True: 7.44k, False: 20]
  ------------------
 3079|  7.44k|    highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
 3080|  7.44k|                                              dy);
 3081|  7.44k|  } else {
 3082|     20|    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left,
 3083|     20|                                                    upsample_left, dy);
 3084|     20|  }
 3085|  7.46k|  highbd_transpose16x4_8x8_sse2(dstvec, d);
 3086|       |
 3087|  7.46k|  _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
 3088|  7.46k|  _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]);
 3089|  7.46k|  _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]);
 3090|  7.46k|  _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]);
 3091|  7.46k|  _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]);
 3092|  7.46k|  _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]);
 3093|  7.46k|  _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]);
 3094|  7.46k|  _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]);
 3095|  7.46k|}
intrapred_avx2.c:highbd_transpose16x4_8x8_sse2:
  139|  7.46k|static inline void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
  140|  7.46k|  __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
  141|       |
  142|  7.46k|  r0 = _mm_unpacklo_epi16(x[0], x[1]);
  143|  7.46k|  r1 = _mm_unpacklo_epi16(x[2], x[3]);
  144|  7.46k|  r2 = _mm_unpacklo_epi16(x[4], x[5]);
  145|  7.46k|  r3 = _mm_unpacklo_epi16(x[6], x[7]);
  146|       |
  147|  7.46k|  r4 = _mm_unpacklo_epi16(x[8], x[9]);
  148|  7.46k|  r5 = _mm_unpacklo_epi16(x[10], x[11]);
  149|  7.46k|  r6 = _mm_unpacklo_epi16(x[12], x[13]);
  150|  7.46k|  r7 = _mm_unpacklo_epi16(x[14], x[15]);
  151|       |
  152|  7.46k|  r8 = _mm_unpacklo_epi32(r0, r1);
  153|  7.46k|  r9 = _mm_unpackhi_epi32(r0, r1);
  154|  7.46k|  r10 = _mm_unpacklo_epi32(r2, r3);
  155|  7.46k|  r11 = _mm_unpackhi_epi32(r2, r3);
  156|       |
  157|  7.46k|  r12 = _mm_unpacklo_epi32(r4, r5);
  158|  7.46k|  r13 = _mm_unpackhi_epi32(r4, r5);
  159|  7.46k|  r14 = _mm_unpacklo_epi32(r6, r7);
  160|  7.46k|  r15 = _mm_unpackhi_epi32(r6, r7);
  161|       |
  162|  7.46k|  r0 = _mm_unpacklo_epi64(r8, r9);
  163|  7.46k|  r1 = _mm_unpackhi_epi64(r8, r9);
  164|  7.46k|  r2 = _mm_unpacklo_epi64(r10, r11);
  165|  7.46k|  r3 = _mm_unpackhi_epi64(r10, r11);
  166|       |
  167|  7.46k|  r4 = _mm_unpacklo_epi64(r12, r13);
  168|  7.46k|  r5 = _mm_unpackhi_epi64(r12, r13);
  169|  7.46k|  r6 = _mm_unpacklo_epi64(r14, r15);
  170|  7.46k|  r7 = _mm_unpackhi_epi64(r14, r15);
  171|       |
  172|  7.46k|  d[0] = _mm_unpacklo_epi64(r0, r2);
  173|  7.46k|  d[1] = _mm_unpacklo_epi64(r4, r6);
  174|  7.46k|  d[2] = _mm_unpacklo_epi64(r1, r3);
  175|  7.46k|  d[3] = _mm_unpacklo_epi64(r5, r7);
  176|       |
  177|  7.46k|  d[4] = _mm_unpackhi_epi64(r0, r2);
  178|  7.46k|  d[5] = _mm_unpackhi_epi64(r4, r6);
  179|  7.46k|  d[6] = _mm_unpackhi_epi64(r1, r3);
  180|  7.46k|  d[7] = _mm_unpackhi_epi64(r5, r7);
  181|  7.46k|}
intrapred_avx2.c:highbd_dr_prediction_z3_32x8_avx2:
 3135|  2.28k|                                              int bd) {
 3136|  2.28k|  __m128i dstvec[32], d[32];
 3137|  2.28k|  if (bd < 12) {
  ------------------
  |  Branch (3137:7): [True: 2.23k, False: 50]
  ------------------
 3138|  2.23k|    highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
 3139|  2.23k|                                              dy);
 3140|  2.23k|  } else {
 3141|     50|    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left,
 3142|     50|                                                    upsample_left, dy);
 3143|     50|  }
 3144|       |
 3145|  11.4k|  for (int i = 0; i < 32; i += 8) {
  ------------------
  |  Branch (3145:19): [True: 9.12k, False: 2.28k]
  ------------------
 3146|  9.12k|    highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
 3147|  9.12k|                             &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
 3148|  9.12k|                             &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
 3149|  9.12k|                             &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
 3150|  9.12k|                             &d[5 + i], &d[6 + i], &d[7 + i]);
 3151|  9.12k|  }
 3152|  20.5k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (3152:19): [True: 18.2k, False: 2.28k]
  ------------------
 3153|  18.2k|    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
 3154|  18.2k|    _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
 3155|  18.2k|    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]);
 3156|  18.2k|    _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]);
 3157|  18.2k|  }
 3158|  2.28k|}
intrapred_avx2.c:highbd_dr_prediction_z3_64x16_avx2:
 3321|    212|                                               int bd) {
 3322|    212|  __m256i dstvec[64], d[16];
 3323|    212|  if (bd < 12) {
  ------------------
  |  Branch (3323:7): [True: 212, False: 0]
  ------------------
 3324|    212|    highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
 3325|    212|                                               dy);
 3326|    212|  } else {
 3327|      0|    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left,
 3328|      0|                                                     upsample_left, dy);
 3329|      0|  }
 3330|  1.06k|  for (int i = 0; i < 64; i += 16) {
  ------------------
  |  Branch (3330:19): [True: 848, False: 212]
  ------------------
 3331|    848|    highbd_transpose16x16_avx2((dstvec + i), d);
 3332|  14.4k|    for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (3332:21): [True: 13.5k, False: 848]
  ------------------
 3333|  13.5k|      _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
 3334|  13.5k|    }
 3335|    848|  }
 3336|    212|}
intrapred_avx2.c:dr_prediction_z1_4xN_avx2:
 3621|  60.0k|                                      int dx) {
 3622|  60.0k|  __m128i dstvec[16];
 3623|       |
 3624|  60.0k|  dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
 3625|   344k|  for (int i = 0; i < N; i++) {
  ------------------
  |  Branch (3625:19): [True: 284k, False: 60.0k]
  ------------------
 3626|   284k|    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
 3627|   284k|  }
 3628|  60.0k|}
intrapred_avx2.c:dr_prediction_z1_HxW_internal_avx2:
 3550|   259k|    int dx) {
 3551|   259k|  const int frac_bits = 6 - upsample_above;
 3552|   259k|  const int max_base_x = ((W + H) - 1) << upsample_above;
 3553|       |
 3554|   259k|  assert(dx > 0);
 3555|       |  // pre-filter above pixels
 3556|       |  // store in temp buffers:
 3557|       |  //   above[x] * 32 + 16
 3558|       |  //   above[x+1] - above[x]
 3559|       |  // final pixels will be calculated as:
 3560|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 3561|   259k|  __m256i a0, a1, a32, a16;
 3562|   259k|  __m256i diff, c3f;
 3563|   259k|  __m128i a_mbase_x;
 3564|       |
 3565|   259k|  a16 = _mm256_set1_epi16(16);
 3566|   259k|  a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]);
 3567|   259k|  c3f = _mm256_set1_epi16(0x3f);
 3568|       |
 3569|   259k|  int x = dx;
 3570|  2.36M|  for (int r = 0; r < W; r++) {
  ------------------
  |  Branch (3570:19): [True: 2.10M, False: 258k]
  ------------------
 3571|  2.10M|    __m256i b, res, shift;
 3572|  2.10M|    __m128i res1, a0_128, a1_128;
 3573|       |
 3574|  2.10M|    int base = x >> frac_bits;
 3575|  2.10M|    int base_max_diff = (max_base_x - base) >> upsample_above;
 3576|  2.10M|    if (base_max_diff <= 0) {
  ------------------
  |  Branch (3576:9): [True: 707, False: 2.10M]
  ------------------
 3577|  2.22k|      for (int i = r; i < W; ++i) {
  ------------------
  |  Branch (3577:23): [True: 1.51k, False: 707]
  ------------------
 3578|  1.51k|        dst[i] = a_mbase_x;  // save 4 values
 3579|  1.51k|      }
 3580|    707|      return;
 3581|    707|    }
 3582|  2.10M|    if (base_max_diff > H) base_max_diff = H;
  ------------------
  |  Branch (3582:9): [True: 2.02M, False: 79.2k]
  ------------------
 3583|  2.10M|    a0_128 = _mm_loadu_si128((__m128i *)(above + base));
 3584|  2.10M|    a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
 3585|       |
 3586|  2.10M|    if (upsample_above) {
  ------------------
  |  Branch (3586:9): [True: 729k, False: 1.37M]
  ------------------
 3587|   729k|      a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]);
 3588|   729k|      a1_128 = _mm_srli_si128(a0_128, 8);
 3589|       |
 3590|   729k|      shift = _mm256_srli_epi16(
 3591|   729k|          _mm256_and_si256(
 3592|   729k|              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
 3593|   729k|          1);
 3594|  1.37M|    } else {
 3595|  1.37M|      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 3596|  1.37M|    }
 3597|  2.10M|    a0 = _mm256_cvtepu8_epi16(a0_128);
 3598|  2.10M|    a1 = _mm256_cvtepu8_epi16(a1_128);
 3599|       |
 3600|  2.10M|    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
 3601|  2.10M|    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
 3602|  2.10M|    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 3603|       |
 3604|  2.10M|    b = _mm256_mullo_epi16(diff, shift);
 3605|  2.10M|    res = _mm256_add_epi16(a32, b);
 3606|  2.10M|    res = _mm256_srli_epi16(res, 5);
 3607|       |
 3608|  2.10M|    res = _mm256_packus_epi16(
 3609|  2.10M|        res, _mm256_castsi128_si256(
 3610|  2.10M|                 _mm256_extracti128_si256(res, 1)));  // goto 8 bit
 3611|  2.10M|    res1 = _mm256_castsi256_si128(res);               // 16 8bit values
 3612|       |
 3613|  2.10M|    dst[r] =
 3614|  2.10M|        _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
 3615|  2.10M|    x += dx;
 3616|  2.10M|  }
 3617|   259k|}
intrapred_avx2.c:dr_prediction_z1_8xN_avx2:
 3632|  34.5k|                                      int dx) {
 3633|  34.5k|  __m128i dstvec[32];
 3634|       |
 3635|  34.5k|  dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx);
 3636|   349k|  for (int i = 0; i < N; i++) {
  ------------------
  |  Branch (3636:19): [True: 315k, False: 34.5k]
  ------------------
 3637|   315k|    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
 3638|   315k|  }
 3639|  34.5k|}
intrapred_avx2.c:dr_prediction_z1_16xN_avx2:
 3643|  24.0k|                                       int dx) {
 3644|  24.0k|  __m128i dstvec[64];
 3645|       |
 3646|  24.0k|  dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx);
 3647|   320k|  for (int i = 0; i < N; i++) {
  ------------------
  |  Branch (3647:19): [True: 296k, False: 24.0k]
  ------------------
 3648|   296k|    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
 3649|   296k|  }
 3650|  24.0k|}
intrapred_avx2.c:dr_prediction_z1_32xN_avx2:
 3724|  7.44k|                                       int dx) {
 3725|  7.44k|  __m256i dstvec[64];
 3726|  7.44k|  dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
 3727|   177k|  for (int i = 0; i < N; i++) {
  ------------------
  |  Branch (3727:19): [True: 169k, False: 7.44k]
  ------------------
 3728|   169k|    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
 3729|   169k|  }
 3730|  7.44k|}
intrapred_avx2.c:dr_prediction_z1_32xN_internal_avx2:
 3653|  15.0k|    int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
 3654|       |  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
 3655|  15.0k|  (void)upsample_above;
 3656|  15.0k|  const int frac_bits = 6;
 3657|  15.0k|  const int max_base_x = ((32 + N) - 1);
 3658|       |
 3659|       |  // pre-filter above pixels
 3660|       |  // store in temp buffers:
 3661|       |  //   above[x] * 32 + 16
 3662|       |  //   above[x+1] - above[x]
 3663|       |  // final pixels will be calculated as:
 3664|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 3665|  15.0k|  __m256i a0, a1, a32, a16;
 3666|  15.0k|  __m256i a_mbase_x, diff, c3f;
 3667|       |
 3668|  15.0k|  a16 = _mm256_set1_epi16(16);
 3669|  15.0k|  a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
 3670|  15.0k|  c3f = _mm256_set1_epi16(0x3f);
 3671|       |
 3672|  15.0k|  int x = dx;
 3673|   348k|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (3673:19): [True: 333k, False: 15.0k]
  ------------------
 3674|   333k|    __m256i b, res, res16[2];
 3675|   333k|    __m128i a0_128, a1_128;
 3676|       |
 3677|   333k|    int base = x >> frac_bits;
 3678|   333k|    int base_max_diff = (max_base_x - base);
 3679|   333k|    if (base_max_diff <= 0) {
  ------------------
  |  Branch (3679:9): [True: 0, False: 333k]
  ------------------
 3680|      0|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (3680:23): [True: 0, False: 0]
  ------------------
 3681|      0|        dstvec[i] = a_mbase_x;  // save 32 values
 3682|      0|      }
 3683|      0|      return;
 3684|      0|    }
 3685|   333k|    if (base_max_diff > 32) base_max_diff = 32;
  ------------------
  |  Branch (3685:9): [True: 326k, False: 6.67k]
  ------------------
 3686|   333k|    __m256i shift =
 3687|   333k|        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 3688|       |
 3689|  1.00M|    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
  ------------------
  |  Branch (3689:29): [True: 667k, False: 333k]
  ------------------
 3690|   667k|      int mdiff = base_max_diff - j;
 3691|   667k|      if (mdiff <= 0) {
  ------------------
  |  Branch (3691:11): [True: 171, False: 667k]
  ------------------
 3692|    171|        res16[jj] = a_mbase_x;
 3693|   667k|      } else {
 3694|   667k|        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
 3695|   667k|        a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1));
 3696|   667k|        a0 = _mm256_cvtepu8_epi16(a0_128);
 3697|   667k|        a1 = _mm256_cvtepu8_epi16(a1_128);
 3698|       |
 3699|   667k|        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
 3700|   667k|        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
 3701|   667k|        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 3702|   667k|        b = _mm256_mullo_epi16(diff, shift);
 3703|       |
 3704|   667k|        res = _mm256_add_epi16(a32, b);
 3705|   667k|        res = _mm256_srli_epi16(res, 5);
 3706|   667k|        res16[jj] = _mm256_packus_epi16(
 3707|   667k|            res, _mm256_castsi128_si256(
 3708|   667k|                     _mm256_extracti128_si256(res, 1)));  // 16 8bit values
 3709|   667k|      }
 3710|   667k|    }
 3711|   333k|    res16[1] =
 3712|   333k|        _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
 3713|   333k|                                1);  // 32 8bit values
 3714|       |
 3715|   333k|    dstvec[r] = _mm256_blendv_epi8(
 3716|   333k|        a_mbase_x, res16[1],
 3717|   333k|        *(__m256i *)BaseMask[base_max_diff]);  // 32 8bit values
 3718|   333k|    x += dx;
 3719|   333k|  }
 3720|  15.0k|}
intrapred_avx2.c:dr_prediction_z1_64xN_avx2:
 3734|  1.36k|                                       int dx) {
 3735|       |  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
 3736|  1.36k|  (void)upsample_above;
 3737|  1.36k|  const int frac_bits = 6;
 3738|  1.36k|  const int max_base_x = ((64 + N) - 1);
 3739|       |
 3740|       |  // pre-filter above pixels
 3741|       |  // store in temp buffers:
 3742|       |  //   above[x] * 32 + 16
 3743|       |  //   above[x+1] - above[x]
 3744|       |  // final pixels will be calculated as:
 3745|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 3746|  1.36k|  __m256i a0, a1, a32, a16;
 3747|  1.36k|  __m256i a_mbase_x, diff, c3f;
 3748|  1.36k|  __m128i max_base_x128, base_inc128, mask128;
 3749|       |
 3750|  1.36k|  a16 = _mm256_set1_epi16(16);
 3751|  1.36k|  a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
 3752|  1.36k|  max_base_x128 = _mm_set1_epi8(max_base_x);
 3753|  1.36k|  c3f = _mm256_set1_epi16(0x3f);
 3754|       |
 3755|  1.36k|  int x = dx;
 3756|  54.0k|  for (int r = 0; r < N; r++, dst += stride) {
  ------------------
  |  Branch (3756:19): [True: 52.6k, False: 1.36k]
  ------------------
 3757|  52.6k|    __m256i b, res;
 3758|  52.6k|    int base = x >> frac_bits;
 3759|  52.6k|    if (base >= max_base_x) {
  ------------------
  |  Branch (3759:9): [True: 0, False: 52.6k]
  ------------------
 3760|      0|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (3760:23): [True: 0, False: 0]
  ------------------
 3761|      0|        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
 3762|      0|        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
 3763|      0|        dst += stride;
 3764|      0|      }
 3765|      0|      return;
 3766|      0|    }
 3767|       |
 3768|  52.6k|    __m256i shift =
 3769|  52.6k|        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 3770|       |
 3771|  52.6k|    __m128i a0_128, a1_128, res128;
 3772|   263k|    for (int j = 0; j < 64; j += 16) {
  ------------------
  |  Branch (3772:21): [True: 210k, False: 52.6k]
  ------------------
 3773|   210k|      int mdif = max_base_x - (base + j);
 3774|   210k|      if (mdif <= 0) {
  ------------------
  |  Branch (3774:11): [True: 198, False: 210k]
  ------------------
 3775|    198|        _mm_storeu_si128((__m128i *)(dst + j),
 3776|    198|                         _mm256_castsi256_si128(a_mbase_x));
 3777|   210k|      } else {
 3778|   210k|        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
 3779|   210k|        a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
 3780|   210k|        a0 = _mm256_cvtepu8_epi16(a0_128);
 3781|   210k|        a1 = _mm256_cvtepu8_epi16(a1_128);
 3782|       |
 3783|   210k|        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
 3784|   210k|        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
 3785|   210k|        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 3786|   210k|        b = _mm256_mullo_epi16(diff, shift);
 3787|       |
 3788|   210k|        res = _mm256_add_epi16(a32, b);
 3789|   210k|        res = _mm256_srli_epi16(res, 5);
 3790|   210k|        res = _mm256_packus_epi16(
 3791|   210k|            res, _mm256_castsi128_si256(
 3792|   210k|                     _mm256_extracti128_si256(res, 1)));  // 16 8bit values
 3793|       |
 3794|   210k|        base_inc128 =
 3795|   210k|            _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
 3796|   210k|                          (int8_t)(base + j + 2), (int8_t)(base + j + 3),
 3797|   210k|                          (int8_t)(base + j + 4), (int8_t)(base + j + 5),
 3798|   210k|                          (int8_t)(base + j + 6), (int8_t)(base + j + 7),
 3799|   210k|                          (int8_t)(base + j + 8), (int8_t)(base + j + 9),
 3800|   210k|                          (int8_t)(base + j + 10), (int8_t)(base + j + 11),
 3801|   210k|                          (int8_t)(base + j + 12), (int8_t)(base + j + 13),
 3802|   210k|                          (int8_t)(base + j + 14), (int8_t)(base + j + 15));
 3803|       |
 3804|   210k|        mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
 3805|   210k|                                 _mm_setzero_si128());
 3806|   210k|        res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x),
 3807|   210k|                                 _mm256_castsi256_si128(res), mask128);
 3808|   210k|        _mm_storeu_si128((__m128i *)(dst + j), res128);
 3809|   210k|      }
 3810|   210k|    }
 3811|  52.6k|    x += dx;
 3812|  52.6k|  }
 3813|  1.36k|}
intrapred_avx2.c:dr_prediction_z2_Nx4_avx2:
 3845|  80.6k|                                      int dx, int dy) {
 3846|  80.6k|  const int min_base_x = -(1 << upsample_above);
 3847|  80.6k|  const int min_base_y = -(1 << upsample_left);
 3848|  80.6k|  const int frac_bits_x = 6 - upsample_above;
 3849|  80.6k|  const int frac_bits_y = 6 - upsample_left;
 3850|       |
 3851|  80.6k|  assert(dx > 0);
 3852|       |  // pre-filter above pixels
 3853|       |  // store in temp buffers:
 3854|       |  //   above[x] * 32 + 16
 3855|       |  //   above[x+1] - above[x]
 3856|       |  // final pixels will be calculated as:
 3857|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 3858|  80.6k|  __m128i a0_x, a1_x, a32, a16, diff;
 3859|  80.6k|  __m128i c3f, min_base_y128, c1234, dy128;
 3860|       |
 3861|  80.6k|  a16 = _mm_set1_epi16(16);
 3862|  80.6k|  c3f = _mm_set1_epi16(0x3f);
 3863|  80.6k|  min_base_y128 = _mm_set1_epi16(min_base_y);
 3864|  80.6k|  c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
 3865|  80.6k|  dy128 = _mm_set1_epi16(dy);
 3866|       |
 3867|   491k|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (3867:19): [True: 410k, False: 80.6k]
  ------------------
 3868|   410k|    __m128i b, res, shift, r6, ydx;
 3869|   410k|    __m128i resx, resy, resxy;
 3870|   410k|    __m128i a0_x128, a1_x128;
 3871|   410k|    int y = r + 1;
 3872|   410k|    int base_x = (-y * dx) >> frac_bits_x;
 3873|   410k|    int base_shift = 0;
 3874|   410k|    if (base_x < (min_base_x - 1)) {
  ------------------
  |  Branch (3874:9): [True: 266k, False: 144k]
  ------------------
 3875|   266k|      base_shift = (min_base_x - base_x - 1) >> upsample_above;
 3876|   266k|    }
 3877|   410k|    int base_min_diff =
 3878|   410k|        (min_base_x - base_x + upsample_above) >> upsample_above;
 3879|   410k|    if (base_min_diff > 4) {
  ------------------
  |  Branch (3879:9): [True: 168k, False: 242k]
  ------------------
 3880|   168k|      base_min_diff = 4;
 3881|   242k|    } else {
 3882|   242k|      if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (3882:11): [True: 0, False: 242k]
  ------------------
 3883|   242k|    }
 3884|       |
 3885|   410k|    if (base_shift > 3) {
  ------------------
  |  Branch (3885:9): [True: 168k, False: 242k]
  ------------------
 3886|   168k|      a0_x = _mm_setzero_si128();
 3887|   168k|      a1_x = _mm_setzero_si128();
 3888|   168k|      shift = _mm_setzero_si128();
 3889|   242k|    } else {
 3890|   242k|      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
 3891|   242k|      ydx = _mm_set1_epi16(y * dx);
 3892|   242k|      r6 = _mm_slli_epi16(c1234, 6);
 3893|       |
 3894|   242k|      if (upsample_above) {
  ------------------
  |  Branch (3894:11): [True: 119k, False: 122k]
  ------------------
 3895|   119k|        a0_x128 =
 3896|   119k|            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
 3897|   119k|        a1_x128 = _mm_srli_si128(a0_x128, 8);
 3898|       |
 3899|   119k|        shift = _mm_srli_epi16(
 3900|   119k|            _mm_and_si128(
 3901|   119k|                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
 3902|   119k|            1);
 3903|   122k|      } else {
 3904|   122k|        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
 3905|   122k|        a1_x128 = _mm_srli_si128(a0_x128, 1);
 3906|       |
 3907|   122k|        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
 3908|   122k|      }
 3909|   242k|      a0_x = _mm_cvtepu8_epi16(a0_x128);
 3910|   242k|      a1_x = _mm_cvtepu8_epi16(a1_x128);
 3911|   242k|    }
 3912|       |    // y calc
 3913|   410k|    __m128i a0_y, a1_y, shifty;
 3914|   410k|    if (base_x < min_base_x) {
  ------------------
  |  Branch (3914:9): [True: 313k, False: 97.1k]
  ------------------
 3915|   313k|      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
  ------------------
  |  |   19|   313k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 3916|   313k|      __m128i y_c128, base_y_c128, mask128, c1234_;
 3917|   313k|      c1234_ = _mm_srli_si128(c1234, 2);
 3918|   313k|      r6 = _mm_set1_epi16(r << 6);
 3919|   313k|      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
 3920|   313k|      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
 3921|   313k|      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
 3922|   313k|      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
 3923|   313k|      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
 3924|       |
 3925|   313k|      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
 3926|   313k|                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
 3927|   313k|      base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));
 3928|   313k|      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
 3929|   313k|      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
 3930|   313k|                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
 3931|       |
 3932|   313k|      if (upsample_left) {
  ------------------
  |  Branch (3932:11): [True: 149k, False: 164k]
  ------------------
 3933|   149k|        shifty = _mm_srli_epi16(
 3934|   149k|            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
 3935|   164k|      } else {
 3936|   164k|        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
 3937|   164k|      }
 3938|   313k|      a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
 3939|   313k|      a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
 3940|   313k|      shift = _mm_unpacklo_epi64(shift, shifty);
 3941|   313k|    }
 3942|       |
 3943|   410k|    diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
 3944|   410k|    a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
 3945|   410k|    a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16
 3946|       |
 3947|   410k|    b = _mm_mullo_epi16(diff, shift);
 3948|   410k|    res = _mm_add_epi16(a32, b);
 3949|   410k|    res = _mm_srli_epi16(res, 5);
 3950|       |
 3951|   410k|    resx = _mm_packus_epi16(res, res);
 3952|   410k|    resy = _mm_srli_si128(resx, 4);
 3953|       |
 3954|   410k|    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
 3955|   410k|    *(int *)(dst) = _mm_cvtsi128_si32(resxy);
 3956|   410k|    dst += stride;
 3957|   410k|  }
 3958|  80.6k|}
intrapred_avx2.c:dr_prediction_z2_Nx8_avx2:
 3963|  64.2k|                                      int dx, int dy) {
 3964|  64.2k|  const int min_base_x = -(1 << upsample_above);
 3965|  64.2k|  const int min_base_y = -(1 << upsample_left);
 3966|  64.2k|  const int frac_bits_x = 6 - upsample_above;
 3967|  64.2k|  const int frac_bits_y = 6 - upsample_left;
 3968|       |
 3969|       |  // pre-filter above pixels
 3970|       |  // store in temp buffers:
 3971|       |  //   above[x] * 32 + 16
 3972|       |  //   above[x+1] - above[x]
 3973|       |  // final pixels will be calculated as:
 3974|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 3975|  64.2k|  __m256i diff, a32, a16;
 3976|  64.2k|  __m256i a0_x, a1_x;
 3977|  64.2k|  __m128i a0_x128, a1_x128, min_base_y128, c3f;
 3978|  64.2k|  __m128i c1234, dy128;
 3979|       |
 3980|  64.2k|  a16 = _mm256_set1_epi16(16);
 3981|  64.2k|  c3f = _mm_set1_epi16(0x3f);
 3982|  64.2k|  min_base_y128 = _mm_set1_epi16(min_base_y);
 3983|  64.2k|  dy128 = _mm_set1_epi16(dy);
 3984|  64.2k|  c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
 3985|       |
 3986|   681k|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (3986:19): [True: 616k, False: 64.2k]
  ------------------
 3987|   616k|    __m256i b, res, shift;
 3988|   616k|    __m128i resx, resy, resxy, r6, ydx;
 3989|       |
 3990|   616k|    int y = r + 1;
 3991|   616k|    int base_x = (-y * dx) >> frac_bits_x;
 3992|   616k|    int base_shift = 0;
 3993|   616k|    if (base_x < (min_base_x - 1)) {
  ------------------
  |  Branch (3993:9): [True: 462k, False: 153k]
  ------------------
 3994|   462k|      base_shift = (min_base_x - base_x - 1) >> upsample_above;
 3995|   462k|    }
 3996|   616k|    int base_min_diff =
 3997|   616k|        (min_base_x - base_x + upsample_above) >> upsample_above;
 3998|   616k|    if (base_min_diff > 8) {
  ------------------
  |  Branch (3998:9): [True: 272k, False: 344k]
  ------------------
 3999|   272k|      base_min_diff = 8;
 4000|   344k|    } else {
 4001|   344k|      if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (4001:11): [True: 0, False: 344k]
  ------------------
 4002|   344k|    }
 4003|       |
 4004|   616k|    if (base_shift > 7) {
  ------------------
  |  Branch (4004:9): [True: 272k, False: 344k]
  ------------------
 4005|   272k|      a0_x = _mm256_setzero_si256();
 4006|   272k|      a1_x = _mm256_setzero_si256();
 4007|   272k|      shift = _mm256_setzero_si256();
 4008|   344k|    } else {
 4009|   344k|      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
 4010|   344k|      ydx = _mm_set1_epi16(y * dx);
 4011|   344k|      r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
 4012|   344k|      if (upsample_above) {
  ------------------
  |  Branch (4012:11): [True: 106k, False: 237k]
  ------------------
 4013|   106k|        a0_x128 =
 4014|   106k|            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
 4015|   106k|        a1_x128 = _mm_srli_si128(a0_x128, 8);
 4016|       |
 4017|   106k|        shift = _mm256_castsi128_si256(_mm_srli_epi16(
 4018|   106k|            _mm_and_si128(
 4019|   106k|                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
 4020|   106k|            1));
 4021|   237k|      } else {
 4022|   237k|        a1_x128 = _mm_srli_si128(a0_x128, 1);
 4023|   237k|        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
 4024|   237k|        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
 4025|       |
 4026|   237k|        shift = _mm256_castsi128_si256(
 4027|   237k|            _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
 4028|   237k|      }
 4029|   344k|      a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
 4030|   344k|      a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
 4031|   344k|    }
 4032|       |
 4033|       |    // y calc
 4034|   616k|    __m128i a0_y, a1_y, shifty;
 4035|   616k|    if (base_x < min_base_x) {
  ------------------
  |  Branch (4035:9): [True: 514k, False: 102k]
  ------------------
 4036|   514k|      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
  ------------------
  |  |   19|   514k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 4037|   514k|      __m128i y_c128, base_y_c128, mask128;
 4038|   514k|      r6 = _mm_set1_epi16(r << 6);
 4039|   514k|      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
 4040|   514k|      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
 4041|   514k|      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
 4042|   514k|      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
 4043|   514k|      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
 4044|       |
 4045|   514k|      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
 4046|   514k|                            left[base_y_c[2]], left[base_y_c[3]],
 4047|   514k|                            left[base_y_c[4]], left[base_y_c[5]],
 4048|   514k|                            left[base_y_c[6]], left[base_y_c[7]]);
 4049|   514k|      base_y_c128 = _mm_add_epi16(
 4050|   514k|          base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
 4051|   514k|      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
 4052|       |
 4053|   514k|      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
 4054|   514k|                            left[base_y_c[2]], left[base_y_c[3]],
 4055|   514k|                            left[base_y_c[4]], left[base_y_c[5]],
 4056|   514k|                            left[base_y_c[6]], left[base_y_c[7]]);
 4057|       |
 4058|   514k|      if (upsample_left) {
  ------------------
  |  Branch (4058:11): [True: 142k, False: 371k]
  ------------------
 4059|   142k|        shifty = _mm_srli_epi16(
 4060|   142k|            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
 4061|   371k|      } else {
 4062|   371k|        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
 4063|   371k|      }
 4064|       |
 4065|   514k|      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
 4066|   514k|      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
 4067|   514k|      shift = _mm256_inserti128_si256(shift, shifty, 1);
 4068|   514k|    }
 4069|       |
 4070|   616k|    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
 4071|   616k|    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
 4072|   616k|    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
 4073|       |
 4074|   616k|    b = _mm256_mullo_epi16(diff, shift);
 4075|   616k|    res = _mm256_add_epi16(a32, b);
 4076|   616k|    res = _mm256_srli_epi16(res, 5);
 4077|       |
 4078|   616k|    resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
 4079|   616k|                            _mm256_castsi256_si128(res));
 4080|   616k|    resy = _mm256_extracti128_si256(res, 1);
 4081|   616k|    resy = _mm_packus_epi16(resy, resy);
 4082|       |
 4083|   616k|    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
 4084|   616k|    _mm_storel_epi64((__m128i *)(dst), resxy);
 4085|   616k|    dst += stride;
 4086|   616k|  }
 4087|  64.2k|}
intrapred_avx2.c:dr_prediction_z2_HxW_avx2:
 4092|  60.7k|                                      int upsample_left, int dx, int dy) {
 4093|       |  // here upsample_above and upsample_left are 0 by design of
 4094|       |  // av1_use_intra_edge_upsample
 4095|  60.7k|  const int min_base_x = -1;
 4096|  60.7k|  const int min_base_y = -1;
 4097|  60.7k|  (void)upsample_above;
 4098|  60.7k|  (void)upsample_left;
 4099|  60.7k|  const int frac_bits_x = 6;
 4100|  60.7k|  const int frac_bits_y = 6;
 4101|       |
 4102|  60.7k|  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
 4103|  60.7k|  __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
 4104|  60.7k|  __m128i a0_x128, a1_x128;
 4105|       |
 4106|  60.7k|  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
  ------------------
  |  |   19|  60.7k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 4107|  60.7k|  a16 = _mm256_set1_epi16(16);
 4108|  60.7k|  c1 = _mm256_srli_epi16(a16, 4);
 4109|  60.7k|  min_base_y256 = _mm256_set1_epi16(min_base_y);
 4110|  60.7k|  c3f = _mm256_set1_epi16(0x3f);
 4111|  60.7k|  dy256 = _mm256_set1_epi16(dy);
 4112|  60.7k|  c0123 =
 4113|  60.7k|      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 4114|  60.7k|  c1234 = _mm256_add_epi16(c0123, c1);
 4115|       |
 4116|   979k|  for (int r = 0; r < H; r++) {
  ------------------
  |  Branch (4116:19): [True: 918k, False: 60.7k]
  ------------------
 4117|   918k|    __m256i b, res, shift, j256, r6, ydx;
 4118|   918k|    __m128i resx, resy;
 4119|   918k|    __m128i resxy;
 4120|   918k|    int y = r + 1;
 4121|   918k|    ydx = _mm256_set1_epi16((int16_t)(y * dx));
 4122|       |
 4123|   918k|    int base_x = (-y * dx) >> frac_bits_x;
 4124|  2.30M|    for (int j = 0; j < W; j += 16) {
  ------------------
  |  Branch (4124:21): [True: 1.38M, False: 918k]
  ------------------
 4125|  1.38M|      j256 = _mm256_set1_epi16(j);
 4126|  1.38M|      int base_shift = 0;
 4127|  1.38M|      if ((base_x + j) < (min_base_x - 1)) {
  ------------------
  |  Branch (4127:11): [True: 986k, False: 401k]
  ------------------
 4128|   986k|        base_shift = (min_base_x - (base_x + j) - 1);
 4129|   986k|      }
 4130|  1.38M|      int base_min_diff = (min_base_x - base_x - j);
 4131|  1.38M|      if (base_min_diff > 16) {
  ------------------
  |  Branch (4131:11): [True: 619k, False: 769k]
  ------------------
 4132|   619k|        base_min_diff = 16;
 4133|   769k|      } else {
 4134|   769k|        if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (4134:13): [True: 211k, False: 557k]
  ------------------
 4135|   769k|      }
 4136|       |
 4137|  1.38M|      if (base_shift < 16) {
  ------------------
  |  Branch (4137:11): [True: 769k, False: 619k]
  ------------------
 4138|   769k|        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
 4139|   769k|        a1_x128 =
 4140|   769k|            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
 4141|   769k|        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
 4142|   769k|        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
 4143|       |
 4144|   769k|        a0_x = _mm256_cvtepu8_epi16(a0_x128);
 4145|   769k|        a1_x = _mm256_cvtepu8_epi16(a1_x128);
 4146|       |
 4147|   769k|        r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
 4148|   769k|        shift = _mm256_srli_epi16(
 4149|   769k|            _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
 4150|       |
 4151|   769k|        diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
 4152|   769k|        a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
 4153|   769k|        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
 4154|       |
 4155|   769k|        b = _mm256_mullo_epi16(diff, shift);
 4156|   769k|        res = _mm256_add_epi16(a32, b);
 4157|   769k|        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
 4158|   769k|        resx = _mm256_castsi256_si128(_mm256_packus_epi16(
 4159|   769k|            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
 4160|   769k|      } else {
 4161|   619k|        resx = _mm_setzero_si128();
 4162|   619k|      }
 4163|       |
 4164|       |      // y calc
 4165|  1.38M|      if (base_x < min_base_x) {
  ------------------
  |  Branch (4165:11): [True: 1.23M, False: 151k]
  ------------------
 4166|  1.23M|        __m256i c256, y_c256, base_y_c256, mask256, mul16;
 4167|  1.23M|        r6 = _mm256_set1_epi16(r << 6);
 4168|  1.23M|        c256 = _mm256_add_epi16(j256, c1234);
 4169|  1.23M|        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
 4170|  1.23M|                                 _mm256_srli_epi16(min_base_y256, 1));
 4171|  1.23M|        y_c256 = _mm256_sub_epi16(r6, mul16);
 4172|       |
 4173|  1.23M|        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
 4174|  1.23M|        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
 4175|       |
 4176|  1.23M|        base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
 4177|  1.23M|        int16_t min_y = (int16_t)_mm_extract_epi16(
 4178|  1.23M|            _mm256_extracti128_si256(base_y_c256, 1), 7);
 4179|  1.23M|        int16_t max_y =
 4180|  1.23M|            (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0);
 4181|  1.23M|        int16_t offset_diff = max_y - min_y;
 4182|       |
 4183|  1.23M|        if (offset_diff < 16) {
  ------------------
  |  Branch (4183:13): [True: 1.17M, False: 57.8k]
  ------------------
 4184|  1.17M|          __m256i min_y256 = _mm256_set1_epi16(min_y);
 4185|       |
 4186|  1.17M|          __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
 4187|  1.17M|          __m128i base_y_offset128 =
 4188|  1.17M|              _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
 4189|  1.17M|                              _mm256_extracti128_si256(base_y_offset, 1));
 4190|       |
 4191|  1.17M|          __m128i a0_y128 = _mm_maskload_epi32(
 4192|  1.17M|              (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
 4193|  1.17M|          __m128i a1_y128 =
 4194|  1.17M|              _mm_maskload_epi32((int *)(left + min_y + 1),
 4195|  1.17M|                                 *(__m128i *)LoadMaskz2[offset_diff / 4]);
 4196|  1.17M|          a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
 4197|  1.17M|          a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
 4198|  1.17M|          a0_y = _mm256_cvtepu8_epi16(a0_y128);
 4199|  1.17M|          a1_y = _mm256_cvtepu8_epi16(a1_y128);
 4200|  1.17M|        } else {
 4201|  57.8k|          base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
 4202|  57.8k|          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
 4203|       |
 4204|  57.8k|          a0_y = _mm256_setr_epi16(
 4205|  57.8k|              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
 4206|  57.8k|              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
 4207|  57.8k|              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
 4208|  57.8k|              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
 4209|  57.8k|              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
 4210|  57.8k|              left[base_y_c[15]]);
 4211|  57.8k|          base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
 4212|  57.8k|          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
 4213|       |
 4214|  57.8k|          a1_y = _mm256_setr_epi16(
 4215|  57.8k|              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
 4216|  57.8k|              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
 4217|  57.8k|              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
 4218|  57.8k|              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
 4219|  57.8k|              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
 4220|  57.8k|              left[base_y_c[15]]);
 4221|  57.8k|        }
 4222|  1.23M|        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
 4223|       |
 4224|  1.23M|        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
 4225|  1.23M|        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
 4226|  1.23M|        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
 4227|       |
 4228|  1.23M|        b = _mm256_mullo_epi16(diff, shifty);
 4229|  1.23M|        res = _mm256_add_epi16(a32, b);
 4230|  1.23M|        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
 4231|  1.23M|        resy = _mm256_castsi256_si128(_mm256_packus_epi16(
 4232|  1.23M|            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
 4233|  1.23M|      } else {
 4234|   151k|        resy = _mm_setzero_si128();
 4235|   151k|      }
 4236|  1.38M|      resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
 4237|  1.38M|      _mm_storeu_si128((__m128i *)(dst + j), resxy);
 4238|  1.38M|    }  // for j
 4239|   918k|    dst += stride;
 4240|   918k|  }
 4241|  60.7k|}
intrapred_avx2.c:dr_prediction_z3_4x4_avx2:
 4360|  57.8k|                                      int dy) {
 4361|  57.8k|  __m128i dstvec[4], d[4];
 4362|       |
 4363|  57.8k|  dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
 4364|  57.8k|  transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
 4365|  57.8k|                            &d[0], &d[1], &d[2], &d[3]);
 4366|       |
 4367|  57.8k|  *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
 4368|  57.8k|  *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
 4369|  57.8k|  *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
 4370|  57.8k|  *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
 4371|  57.8k|  return;
 4372|  57.8k|}
intrapred_avx2.c:dr_prediction_z3_8x8_avx2:
 4376|  29.5k|                                      int dy) {
 4377|  29.5k|  __m128i dstvec[8], d[8];
 4378|       |
 4379|  29.5k|  dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
 4380|  29.5k|  transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
 4381|  29.5k|                    &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
 4382|  29.5k|                    &d[3]);
 4383|       |
 4384|  29.5k|  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
 4385|  29.5k|  _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
 4386|  29.5k|  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
 4387|  29.5k|  _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
 4388|  29.5k|  _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
 4389|  29.5k|  _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
 4390|  29.5k|  _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
 4391|       |  _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
 4392|  29.5k|}
intrapred_avx2.c:dr_prediction_z3_16x16_avx2:
 4538|  11.1k|                                        int dy) {
 4539|  11.1k|  __m128i dstvec[16], d[16];
 4540|       |
 4541|  11.1k|  dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy);
 4542|  11.1k|  transpose16x16_sse2(dstvec, d);
 4543|       |
 4544|   189k|  for (int i = 0; i < 16; i++) {
  ------------------
  |  Branch (4544:19): [True: 178k, False: 11.1k]
  ------------------
 4545|   178k|    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
 4546|   178k|  }
 4547|  11.1k|}
intrapred_avx2.c:dr_prediction_z3_32x32_avx2:
 4551|  3.75k|                                        int dy) {
 4552|  3.75k|  __m256i dstvec[32], d[32];
 4553|       |
 4554|  3.75k|  dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
 4555|  3.75k|  transpose16x32_avx2(dstvec, d);
 4556|  3.75k|  transpose16x32_avx2(dstvec + 16, d + 16);
 4557|  63.7k|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (4557:19): [True: 60.0k, False: 3.75k]
  ------------------
 4558|  60.0k|    _mm_storeu_si128((__m128i *)(dst + j * stride),
 4559|  60.0k|                     _mm256_castsi256_si128(d[j]));
 4560|  60.0k|    _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
 4561|  60.0k|                     _mm256_castsi256_si128(d[j + 16]));
 4562|  60.0k|  }
 4563|  63.7k|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (4563:19): [True: 60.0k, False: 3.75k]
  ------------------
 4564|  60.0k|    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
 4565|  60.0k|                     _mm256_extracti128_si256(d[j], 1));
 4566|  60.0k|    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
 4567|       |                     _mm256_extracti128_si256(d[j + 16], 1));
 4568|  60.0k|  }
 4569|  3.75k|}
intrapred_avx2.c:transpose16x32_avx2:
 4268|  11.4k|static inline void transpose16x32_avx2(__m256i *x, __m256i *d) {
 4269|  11.4k|  __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
 4270|  11.4k|  __m256i w10, w11, w12, w13, w14, w15;
 4271|       |
 4272|  11.4k|  w0 = _mm256_unpacklo_epi8(x[0], x[1]);
 4273|  11.4k|  w1 = _mm256_unpacklo_epi8(x[2], x[3]);
 4274|  11.4k|  w2 = _mm256_unpacklo_epi8(x[4], x[5]);
 4275|  11.4k|  w3 = _mm256_unpacklo_epi8(x[6], x[7]);
 4276|       |
 4277|  11.4k|  w8 = _mm256_unpacklo_epi8(x[8], x[9]);
 4278|  11.4k|  w9 = _mm256_unpacklo_epi8(x[10], x[11]);
 4279|  11.4k|  w10 = _mm256_unpacklo_epi8(x[12], x[13]);
 4280|  11.4k|  w11 = _mm256_unpacklo_epi8(x[14], x[15]);
 4281|       |
 4282|  11.4k|  w4 = _mm256_unpacklo_epi16(w0, w1);
 4283|  11.4k|  w5 = _mm256_unpacklo_epi16(w2, w3);
 4284|  11.4k|  w12 = _mm256_unpacklo_epi16(w8, w9);
 4285|  11.4k|  w13 = _mm256_unpacklo_epi16(w10, w11);
 4286|       |
 4287|  11.4k|  w6 = _mm256_unpacklo_epi32(w4, w5);
 4288|  11.4k|  w7 = _mm256_unpackhi_epi32(w4, w5);
 4289|  11.4k|  w14 = _mm256_unpacklo_epi32(w12, w13);
 4290|  11.4k|  w15 = _mm256_unpackhi_epi32(w12, w13);
 4291|       |
 4292|       |  // Store first 4-line result
 4293|  11.4k|  d[0] = _mm256_unpacklo_epi64(w6, w14);
 4294|  11.4k|  d[1] = _mm256_unpackhi_epi64(w6, w14);
 4295|  11.4k|  d[2] = _mm256_unpacklo_epi64(w7, w15);
 4296|  11.4k|  d[3] = _mm256_unpackhi_epi64(w7, w15);
 4297|       |
 4298|  11.4k|  w4 = _mm256_unpackhi_epi16(w0, w1);
 4299|  11.4k|  w5 = _mm256_unpackhi_epi16(w2, w3);
 4300|  11.4k|  w12 = _mm256_unpackhi_epi16(w8, w9);
 4301|  11.4k|  w13 = _mm256_unpackhi_epi16(w10, w11);
 4302|       |
 4303|  11.4k|  w6 = _mm256_unpacklo_epi32(w4, w5);
 4304|  11.4k|  w7 = _mm256_unpackhi_epi32(w4, w5);
 4305|  11.4k|  w14 = _mm256_unpacklo_epi32(w12, w13);
 4306|  11.4k|  w15 = _mm256_unpackhi_epi32(w12, w13);
 4307|       |
 4308|       |  // Store second 4-line result
 4309|  11.4k|  d[4] = _mm256_unpacklo_epi64(w6, w14);
 4310|  11.4k|  d[5] = _mm256_unpackhi_epi64(w6, w14);
 4311|  11.4k|  d[6] = _mm256_unpacklo_epi64(w7, w15);
 4312|  11.4k|  d[7] = _mm256_unpackhi_epi64(w7, w15);
 4313|       |
 4314|       |  // upper half
 4315|  11.4k|  w0 = _mm256_unpackhi_epi8(x[0], x[1]);
 4316|  11.4k|  w1 = _mm256_unpackhi_epi8(x[2], x[3]);
 4317|  11.4k|  w2 = _mm256_unpackhi_epi8(x[4], x[5]);
 4318|  11.4k|  w3 = _mm256_unpackhi_epi8(x[6], x[7]);
 4319|       |
 4320|  11.4k|  w8 = _mm256_unpackhi_epi8(x[8], x[9]);
 4321|  11.4k|  w9 = _mm256_unpackhi_epi8(x[10], x[11]);
 4322|  11.4k|  w10 = _mm256_unpackhi_epi8(x[12], x[13]);
 4323|  11.4k|  w11 = _mm256_unpackhi_epi8(x[14], x[15]);
 4324|       |
 4325|  11.4k|  w4 = _mm256_unpacklo_epi16(w0, w1);
 4326|  11.4k|  w5 = _mm256_unpacklo_epi16(w2, w3);
 4327|  11.4k|  w12 = _mm256_unpacklo_epi16(w8, w9);
 4328|  11.4k|  w13 = _mm256_unpacklo_epi16(w10, w11);
 4329|       |
 4330|  11.4k|  w6 = _mm256_unpacklo_epi32(w4, w5);
 4331|  11.4k|  w7 = _mm256_unpackhi_epi32(w4, w5);
 4332|  11.4k|  w14 = _mm256_unpacklo_epi32(w12, w13);
 4333|  11.4k|  w15 = _mm256_unpackhi_epi32(w12, w13);
 4334|       |
 4335|       |  // Store first 4-line result
 4336|  11.4k|  d[8] = _mm256_unpacklo_epi64(w6, w14);
 4337|  11.4k|  d[9] = _mm256_unpackhi_epi64(w6, w14);
 4338|  11.4k|  d[10] = _mm256_unpacklo_epi64(w7, w15);
 4339|  11.4k|  d[11] = _mm256_unpackhi_epi64(w7, w15);
 4340|       |
 4341|  11.4k|  w4 = _mm256_unpackhi_epi16(w0, w1);
 4342|  11.4k|  w5 = _mm256_unpackhi_epi16(w2, w3);
 4343|  11.4k|  w12 = _mm256_unpackhi_epi16(w8, w9);
 4344|  11.4k|  w13 = _mm256_unpackhi_epi16(w10, w11);
 4345|       |
 4346|  11.4k|  w6 = _mm256_unpacklo_epi32(w4, w5);
 4347|  11.4k|  w7 = _mm256_unpackhi_epi32(w4, w5);
 4348|  11.4k|  w14 = _mm256_unpacklo_epi32(w12, w13);
 4349|  11.4k|  w15 = _mm256_unpackhi_epi32(w12, w13);
 4350|       |
 4351|       |  // Store second 4-line result
 4352|  11.4k|  d[12] = _mm256_unpacklo_epi64(w6, w14);
 4353|  11.4k|  d[13] = _mm256_unpackhi_epi64(w6, w14);
 4354|  11.4k|  d[14] = _mm256_unpacklo_epi64(w7, w15);
 4355|  11.4k|  d[15] = _mm256_unpackhi_epi64(w7, w15);
 4356|  11.4k|}
intrapred_avx2.c:dr_prediction_z3_64x64_avx2:
 4573|    310|                                        int dy) {
 4574|    310|  DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
  ------------------
  |  |   19|    310|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 4575|    310|  dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
 4576|    310|  transpose(dstT, 64, dst, stride, 64, 64);
 4577|    310|}
intrapred_avx2.c:dr_prediction_z3_4x8_avx2:
 4396|  4.06k|                                      int dy) {
 4397|  4.06k|  __m128i dstvec[4], d[8];
 4398|       |
 4399|  4.06k|  dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
 4400|  4.06k|  transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
 4401|  4.06k|                        &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
 4402|  36.5k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (4402:19): [True: 32.5k, False: 4.06k]
  ------------------
 4403|  32.5k|    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
 4404|  32.5k|  }
 4405|  4.06k|}
intrapred_avx2.c:dr_prediction_z3_8x16_avx2:
 4424|  6.02k|                                       int dy) {
 4425|  6.02k|  __m128i dstvec[8], d[8];
 4426|       |
 4427|  6.02k|  dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy);
 4428|  6.02k|  transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
 4429|  6.02k|                          dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
 4430|  6.02k|                          d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
 4431|  54.2k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (4431:19): [True: 48.2k, False: 6.02k]
  ------------------
 4432|  48.2k|    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
 4433|  48.2k|    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
 4434|       |                     _mm_srli_si128(d[i], 8));
 4435|  48.2k|  }
 4436|  6.02k|}
intrapred_avx2.c:dr_prediction_z3_16x32_avx2:
 4581|  1.55k|                                        int dy) {
 4582|  1.55k|  __m256i dstvec[16], d[16];
 4583|       |
 4584|  1.55k|  dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
 4585|  1.55k|  transpose16x32_avx2(dstvec, d);
 4586|       |  // store
 4587|  26.5k|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (4587:19): [True: 24.9k, False: 1.55k]
  ------------------
 4588|  24.9k|    _mm_storeu_si128((__m128i *)(dst + j * stride),
 4589|  24.9k|                     _mm256_castsi256_si128(d[j]));
 4590|  24.9k|    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
 4591|       |                     _mm256_extracti128_si256(d[j], 1));
 4592|  24.9k|  }
 4593|  1.55k|}
intrapred_avx2.c:dr_prediction_z3_32x64_avx2:
 4611|    293|                                        int dy) {
 4612|    293|  uint8_t dstT[64 * 32];
 4613|    293|  dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
 4614|    293|  transpose(dstT, 64, dst, stride, 32, 64);
 4615|    293|}
intrapred_avx2.c:dr_prediction_z3_4x16_avx2:
 4458|  3.53k|                                       int dy) {
 4459|  3.53k|  __m128i dstvec[4], d[16];
 4460|       |
 4461|  3.53k|  dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy);
 4462|  3.53k|  transpose4x16_sse2(dstvec, d);
 4463|  60.0k|  for (int i = 0; i < 16; i++) {
  ------------------
  |  Branch (4463:19): [True: 56.4k, False: 3.53k]
  ------------------
 4464|  56.4k|    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
 4465|  56.4k|  }
 4466|  3.53k|}
intrapred_avx2.c:dr_prediction_z3_8x32_avx2:
 4490|  2.34k|                                       int dy) {
 4491|  2.34k|  __m256i dstvec[16], d[16];
 4492|       |
 4493|  2.34k|  dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
 4494|  21.0k|  for (int i = 8; i < 16; i++) {
  ------------------
  |  Branch (4494:19): [True: 18.7k, False: 2.34k]
  ------------------
 4495|  18.7k|    dstvec[i] = _mm256_setzero_si256();
 4496|  18.7k|  }
 4497|  2.34k|  transpose16x32_avx2(dstvec, d);
 4498|       |
 4499|  39.8k|  for (int i = 0; i < 16; i++) {
  ------------------
  |  Branch (4499:19): [True: 37.4k, False: 2.34k]
  ------------------
 4500|  37.4k|    _mm_storel_epi64((__m128i *)(dst + i * stride),
 4501|  37.4k|                     _mm256_castsi256_si128(d[i]));
 4502|  37.4k|  }
 4503|  39.8k|  for (int i = 0; i < 16; i++) {
  ------------------
  |  Branch (4503:19): [True: 37.4k, False: 2.34k]
  ------------------
 4504|  37.4k|    _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
 4505|       |                     _mm256_extracti128_si256(d[i], 1));
 4506|  37.4k|  }
 4507|  2.34k|}
intrapred_avx2.c:dr_prediction_z3_16x64_avx2:
 4629|    280|                                        int dy) {
 4630|    280|  uint8_t dstT[64 * 16];
 4631|    280|  dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
 4632|    280|  transpose(dstT, 64, dst, stride, 16, 64);
 4633|    280|}
intrapred_avx2.c:dr_prediction_z3_8x4_avx2:
 4409|  6.05k|                                      int dy) {
 4410|  6.05k|  __m128i dstvec[8], d[4];
 4411|       |
 4412|  6.05k|  dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy);
 4413|  6.05k|  transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
 4414|  6.05k|                        &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
 4415|  6.05k|                        &d[1], &d[2], &d[3]);
 4416|  6.05k|  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
 4417|  6.05k|  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
 4418|  6.05k|  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
 4419|  6.05k|  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
 4420|  6.05k|}
intrapred_avx2.c:dr_prediction_z3_16x8_avx2:
 4440|  9.88k|                                       int dy) {
 4441|  9.88k|  __m128i dstvec[16], d[16];
 4442|       |
 4443|  9.88k|  dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy);
 4444|  9.88k|  transpose16x8_8x16_sse2(
 4445|  9.88k|      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
 4446|  9.88k|      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
 4447|  9.88k|      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
 4448|  9.88k|      &d[3], &d[4], &d[5], &d[6], &d[7]);
 4449|       |
 4450|  88.9k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (4450:19): [True: 79.0k, False: 9.88k]
  ------------------
 4451|  79.0k|    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
 4452|  79.0k|  }
 4453|  9.88k|}
intrapred_avx2.c:dr_prediction_z3_32x16_avx2:
 4597|  2.10k|                                        int dy) {
 4598|  2.10k|  __m128i dstvec[32], d[16];
 4599|       |
 4600|  2.10k|  dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy);
 4601|  6.31k|  for (int i = 0; i < 32; i += 16) {
  ------------------
  |  Branch (4601:19): [True: 4.21k, False: 2.10k]
  ------------------
 4602|  4.21k|    transpose16x16_sse2((dstvec + i), d);
 4603|  71.5k|    for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (4603:21): [True: 67.3k, False: 4.21k]
  ------------------
 4604|  67.3k|      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
 4605|  67.3k|    }
 4606|  4.21k|  }
 4607|  2.10k|}
intrapred_avx2.c:dr_prediction_z3_64x32_avx2:
 4619|    100|                                        int dy) {
 4620|    100|  uint8_t dstT[32 * 64];
 4621|    100|  dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
 4622|    100|  transpose(dstT, 32, dst, stride, 64, 32);
 4623|    100|  return;
 4624|    100|}
intrapred_avx2.c:dr_prediction_z3_16x4_avx2:
 4470|  7.76k|                                       int dy) {
 4471|  7.76k|  __m128i dstvec[16], d[8];
 4472|       |
 4473|  7.76k|  dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy);
 4474|  38.8k|  for (int i = 4; i < 8; i++) {
  ------------------
  |  Branch (4474:19): [True: 31.0k, False: 7.76k]
  ------------------
 4475|  31.0k|    d[i] = _mm_setzero_si128();
 4476|  31.0k|  }
 4477|  7.76k|  transpose16x8_8x16_sse2(
 4478|  7.76k|      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
 4479|  7.76k|      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
 4480|  7.76k|      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
 4481|  7.76k|      &d[3], &d[4], &d[5], &d[6], &d[7]);
 4482|       |
 4483|  38.8k|  for (int i = 0; i < 4; i++) {
  ------------------
  |  Branch (4483:19): [True: 31.0k, False: 7.76k]
  ------------------
 4484|  31.0k|    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
 4485|  31.0k|  }
 4486|  7.76k|}
intrapred_avx2.c:dr_prediction_z3_32x8_avx2:
 4511|  2.44k|                                       int dy) {
 4512|  2.44k|  __m128i dstvec[32], d[16];
 4513|       |
 4514|  2.44k|  dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy);
 4515|       |
 4516|  2.44k|  transpose16x8_8x16_sse2(
 4517|  2.44k|      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
 4518|  2.44k|      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
 4519|  2.44k|      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
 4520|  2.44k|      &d[3], &d[4], &d[5], &d[6], &d[7]);
 4521|  2.44k|  transpose16x8_8x16_sse2(
 4522|  2.44k|      &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
 4523|  2.44k|      &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
 4524|  2.44k|      &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
 4525|  2.44k|      &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
 4526|  2.44k|      &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
 4527|  2.44k|      &d[6 + 8], &d[7 + 8]);
 4528|       |
 4529|  21.9k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (4529:19): [True: 19.5k, False: 2.44k]
  ------------------
 4530|  19.5k|    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
 4531|  19.5k|    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
 4532|  19.5k|  }
 4533|  2.44k|}
intrapred_avx2.c:dr_prediction_z3_64x16_avx2:
 4637|    211|                                        int dy) {
 4638|    211|  __m128i dstvec[64], d[16];
 4639|       |
 4640|    211|  dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy);
 4641|  1.05k|  for (int i = 0; i < 64; i += 16) {
  ------------------
  |  Branch (4641:19): [True: 844, False: 211]
  ------------------
 4642|    844|    transpose16x16_sse2((dstvec + i), d);
 4643|  14.3k|    for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (4643:21): [True: 13.5k, False: 844]
  ------------------
 4644|  13.5k|      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
 4645|  13.5k|    }
 4646|    844|  }
 4647|    211|}

aom_dc_predictor_4x8_sse2:
  110|  25.6k|                               const uint8_t *above, const uint8_t *left) {
  111|  25.6k|  const __m128i sum_left = dc_sum_8(left);
  112|  25.6k|  __m128i sum_above = dc_sum_4(above);
  113|  25.6k|  sum_above = _mm_add_epi16(sum_left, sum_above);
  114|       |
  115|  25.6k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  116|  25.6k|  sum += 6;
  117|  25.6k|  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
  ------------------
  |  |   95|  25.6k|#define DC_MULTIPLIER_1X2 0x5556
  ------------------
  118|       |
  119|  25.6k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  120|  25.6k|  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
  121|  25.6k|  dc_store_4xh(pred, 8, dst, stride);
  122|  25.6k|}
aom_dc_predictor_4x16_sse2:
  126|  15.8k|                                const uint8_t *above, const uint8_t *left) {
  127|  15.8k|  const __m128i sum_left = dc_sum_16_sse2(left);
  128|  15.8k|  __m128i sum_above = dc_sum_4(above);
  129|  15.8k|  sum_above = _mm_add_epi16(sum_left, sum_above);
  130|       |
  131|  15.8k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  132|  15.8k|  sum += 10;
  133|  15.8k|  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
  ------------------
  |  |   96|  15.8k|#define DC_MULTIPLIER_1X4 0x3334
  ------------------
  134|       |
  135|  15.8k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  136|  15.8k|  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
  137|  15.8k|  dc_store_4xh(pred, 16, dst, stride);
  138|  15.8k|}
aom_dc_predictor_8x4_sse2:
  142|  35.1k|                               const uint8_t *above, const uint8_t *left) {
  143|  35.1k|  const __m128i sum_left = dc_sum_4(left);
  144|  35.1k|  __m128i sum_above = dc_sum_8(above);
  145|  35.1k|  sum_above = _mm_add_epi16(sum_above, sum_left);
  146|       |
  147|  35.1k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  148|  35.1k|  sum += 6;
  149|  35.1k|  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
  ------------------
  |  |   95|  35.1k|#define DC_MULTIPLIER_1X2 0x5556
  ------------------
  150|       |
  151|  35.1k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  152|  35.1k|  dc_store_8xh(&row, 4, dst, stride);
  153|  35.1k|}
aom_dc_predictor_8x16_sse2:
  156|  25.5k|                                const uint8_t *above, const uint8_t *left) {
  157|  25.5k|  const __m128i sum_left = dc_sum_16_sse2(left);
  158|  25.5k|  __m128i sum_above = dc_sum_8(above);
  159|  25.5k|  sum_above = _mm_add_epi16(sum_above, sum_left);
  160|       |
  161|  25.5k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  162|  25.5k|  sum += 12;
  163|  25.5k|  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
  ------------------
  |  |   95|  25.5k|#define DC_MULTIPLIER_1X2 0x5556
  ------------------
  164|  25.5k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  165|  25.5k|  dc_store_8xh(&row, 16, dst, stride);
  166|  25.5k|}
aom_dc_predictor_8x32_sse2:
  170|  8.55k|                                const uint8_t *above, const uint8_t *left) {
  171|  8.55k|  const __m128i sum_left = dc_sum_32_sse2(left);
  172|  8.55k|  __m128i sum_above = dc_sum_8(above);
  173|  8.55k|  sum_above = _mm_add_epi16(sum_above, sum_left);
  174|       |
  175|  8.55k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  176|  8.55k|  sum += 20;
  177|  8.55k|  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
  ------------------
  |  |   96|  8.55k|#define DC_MULTIPLIER_1X4 0x3334
  ------------------
  178|  8.55k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  179|  8.55k|  dc_store_8xh(&row, 32, dst, stride);
  180|  8.55k|}
aom_dc_predictor_16x4_sse2:
  183|  32.3k|                                const uint8_t *above, const uint8_t *left) {
  184|  32.3k|  const __m128i sum_left = dc_sum_4(left);
  185|  32.3k|  __m128i sum_above = dc_sum_16_sse2(above);
  186|  32.3k|  sum_above = _mm_add_epi16(sum_above, sum_left);
  187|       |
  188|  32.3k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  189|  32.3k|  sum += 10;
  190|  32.3k|  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
  ------------------
  |  |   96|  32.3k|#define DC_MULTIPLIER_1X4 0x3334
  ------------------
  191|  32.3k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  192|  32.3k|  dc_store_16xh(&row, 4, dst, stride);
  193|  32.3k|}
aom_dc_predictor_16x8_sse2:
  197|  40.4k|                                const uint8_t *above, const uint8_t *left) {
  198|  40.4k|  const __m128i sum_left = dc_sum_8(left);
  199|  40.4k|  __m128i sum_above = dc_sum_16_sse2(above);
  200|  40.4k|  sum_above = _mm_add_epi16(sum_above, sum_left);
  201|       |
  202|  40.4k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  203|  40.4k|  sum += 12;
  204|  40.4k|  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
  ------------------
  |  |   95|  40.4k|#define DC_MULTIPLIER_1X2 0x5556
  ------------------
  205|  40.4k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  206|  40.4k|  dc_store_16xh(&row, 8, dst, stride);
  207|  40.4k|}
aom_dc_predictor_16x32_sse2:
  210|  11.7k|                                 const uint8_t *above, const uint8_t *left) {
  211|  11.7k|  const __m128i sum_left = dc_sum_32_sse2(left);
  212|  11.7k|  __m128i sum_above = dc_sum_16_sse2(above);
  213|  11.7k|  sum_above = _mm_add_epi16(sum_left, sum_above);
  214|       |
  215|  11.7k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  216|  11.7k|  sum += 24;
  217|  11.7k|  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
  ------------------
  |  |   95|  11.7k|#define DC_MULTIPLIER_1X2 0x5556
  ------------------
  218|  11.7k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  219|  11.7k|  dc_store_16xh(&row, 32, dst, stride);
  220|  11.7k|}
aom_dc_predictor_16x64_sse2:
  224|    781|                                 const uint8_t *above, const uint8_t *left) {
  225|    781|  const __m128i sum_left = dc_sum_64(left);
  226|    781|  __m128i sum_above = dc_sum_16_sse2(above);
  227|    781|  sum_above = _mm_add_epi16(sum_left, sum_above);
  228|       |
  229|    781|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  230|    781|  sum += 40;
  231|    781|  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
  ------------------
  |  |   96|    781|#define DC_MULTIPLIER_1X4 0x3334
  ------------------
  232|    781|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  233|    781|  dc_store_16xh(&row, 64, dst, stride);
  234|    781|}
aom_dc_predictor_32x8_sse2:
  237|  10.7k|                                const uint8_t *above, const uint8_t *left) {
  238|  10.7k|  __m128i sum_above = dc_sum_32_sse2(above);
  239|  10.7k|  const __m128i sum_left = dc_sum_8(left);
  240|  10.7k|  sum_above = _mm_add_epi16(sum_above, sum_left);
  241|       |
  242|  10.7k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  243|  10.7k|  sum += 20;
  244|  10.7k|  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
  ------------------
  |  |   96|  10.7k|#define DC_MULTIPLIER_1X4 0x3334
  ------------------
  245|  10.7k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  246|  10.7k|  dc_store_32xh(&row, 8, dst, stride);
  247|  10.7k|}
aom_dc_top_predictor_4x8_sse2:
  321|    591|                                   const uint8_t *above, const uint8_t *left) {
  322|    591|  (void)left;
  323|    591|  __m128i sum_above = dc_sum_4(above);
  324|    591|  const __m128i two = _mm_set1_epi16(2);
  325|    591|  sum_above = _mm_add_epi16(sum_above, two);
  326|    591|  sum_above = _mm_srai_epi16(sum_above, 2);
  327|    591|  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  328|    591|  sum_above = _mm_packus_epi16(sum_above, sum_above);
  329|       |
  330|    591|  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
  331|    591|  dc_store_4xh(pred, 8, dst, stride);
  332|    591|}
aom_dc_top_predictor_4x16_sse2:
  336|    599|                                    const uint8_t *above, const uint8_t *left) {
  337|    599|  (void)left;
  338|    599|  __m128i sum_above = dc_sum_4(above);
  339|    599|  const __m128i two = _mm_set1_epi16(2);
  340|    599|  sum_above = _mm_add_epi16(sum_above, two);
  341|    599|  sum_above = _mm_srai_epi16(sum_above, 2);
  342|    599|  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  343|    599|  sum_above = _mm_packus_epi16(sum_above, sum_above);
  344|       |
  345|    599|  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
  346|    599|  dc_store_4xh(pred, 16, dst, stride);
  347|    599|}
aom_dc_top_predictor_8x4_sse2:
  351|    707|                                   const uint8_t *above, const uint8_t *left) {
  352|    707|  (void)left;
  353|    707|  __m128i sum_above = dc_sum_8(above);
  354|    707|  const __m128i four = _mm_set1_epi16(4);
  355|    707|  sum_above = _mm_add_epi16(sum_above, four);
  356|    707|  sum_above = _mm_srai_epi16(sum_above, 3);
  357|    707|  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  358|       |  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
  359|    707|  dc_store_8xh(&row, 4, dst, stride);
  360|    707|}
aom_dc_top_predictor_8x16_sse2:
  363|  1.05k|                                    const uint8_t *above, const uint8_t *left) {
  364|  1.05k|  (void)left;
  365|  1.05k|  __m128i sum_above = dc_sum_8(above);
  366|  1.05k|  const __m128i four = _mm_set1_epi16(4);
  367|  1.05k|  sum_above = _mm_add_epi16(sum_above, four);
  368|  1.05k|  sum_above = _mm_srai_epi16(sum_above, 3);
  369|  1.05k|  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  370|       |  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
  371|  1.05k|  dc_store_8xh(&row, 16, dst, stride);
  372|  1.05k|}
aom_dc_top_predictor_8x32_sse2:
  376|    824|                                    const uint8_t *above, const uint8_t *left) {
  377|    824|  (void)left;
  378|    824|  __m128i sum_above = dc_sum_8(above);
  379|    824|  const __m128i four = _mm_set1_epi16(4);
  380|    824|  sum_above = _mm_add_epi16(sum_above, four);
  381|    824|  sum_above = _mm_srai_epi16(sum_above, 3);
  382|    824|  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  383|       |  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
  384|    824|  dc_store_8xh(&row, 32, dst, stride);
  385|    824|}
aom_dc_top_predictor_16x4_sse2:
  388|    726|                                    const uint8_t *above, const uint8_t *left) {
  389|    726|  (void)left;
  390|    726|  __m128i sum_above = dc_sum_16_sse2(above);
  391|    726|  const __m128i eight = _mm_set1_epi16(8);
  392|    726|  sum_above = _mm_add_epi16(sum_above, eight);
  393|    726|  sum_above = _mm_srai_epi16(sum_above, 4);
  394|    726|  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  395|       |  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  396|    726|  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  397|    726|  dc_store_16xh(&row, 4, dst, stride);
  398|    726|}
aom_dc_top_predictor_16x8_sse2:
  402|  2.28k|                                    const uint8_t *above, const uint8_t *left) {
  403|  2.28k|  (void)left;
  404|  2.28k|  __m128i sum_above = dc_sum_16_sse2(above);
  405|  2.28k|  const __m128i eight = _mm_set1_epi16(8);
  406|  2.28k|  sum_above = _mm_add_epi16(sum_above, eight);
  407|  2.28k|  sum_above = _mm_srai_epi16(sum_above, 4);
  408|  2.28k|  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  409|       |  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  410|  2.28k|  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  411|  2.28k|  dc_store_16xh(&row, 8, dst, stride);
  412|  2.28k|}
aom_dc_top_predictor_16x32_sse2:
  416|    865|                                     const uint8_t *left) {
  417|    865|  (void)left;
  418|    865|  __m128i sum_above = dc_sum_16_sse2(above);
  419|    865|  const __m128i eight = _mm_set1_epi16(8);
  420|    865|  sum_above = _mm_add_epi16(sum_above, eight);
  421|    865|  sum_above = _mm_srai_epi16(sum_above, 4);
  422|    865|  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  423|       |  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  424|    865|  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  425|    865|  dc_store_16xh(&row, 32, dst, stride);
  426|    865|}
aom_dc_top_predictor_16x64_sse2:
  431|    107|                                     const uint8_t *left) {
  432|    107|  (void)left;
  433|    107|  __m128i sum_above = dc_sum_16_sse2(above);
  434|    107|  const __m128i eight = _mm_set1_epi16(8);
  435|    107|  sum_above = _mm_add_epi16(sum_above, eight);
  436|    107|  sum_above = _mm_srai_epi16(sum_above, 4);
  437|    107|  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  438|       |  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  439|    107|  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  440|    107|  dc_store_16xh(&row, 64, dst, stride);
  441|    107|}
aom_dc_top_predictor_32x8_sse2:
  444|    668|                                    const uint8_t *above, const uint8_t *left) {
  445|    668|  (void)left;
  446|    668|  __m128i sum_above = dc_sum_32_sse2(above);
  447|    668|  const __m128i sixteen = _mm_set1_epi16(16);
  448|    668|  sum_above = _mm_add_epi16(sum_above, sixteen);
  449|    668|  sum_above = _mm_srai_epi16(sum_above, 5);
  450|    668|  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  451|       |  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  452|    668|  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  453|    668|  dc_store_32xh(&row, 8, dst, stride);
  454|    668|}
aom_dc_left_predictor_4x8_sse2:
  533|  1.32k|                                    const uint8_t *above, const uint8_t *left) {
  534|  1.32k|  (void)above;
  535|  1.32k|  __m128i sum_left = dc_sum_8(left);
  536|  1.32k|  const __m128i four = _mm_set1_epi16(4);
  537|  1.32k|  sum_left = _mm_add_epi16(sum_left, four);
  538|  1.32k|  sum_left = _mm_srai_epi16(sum_left, 3);
  539|  1.32k|  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  540|  1.32k|  sum_left = _mm_packus_epi16(sum_left, sum_left);
  541|       |
  542|  1.32k|  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
  543|  1.32k|  dc_store_4xh(pred, 8, dst, stride);
  544|  1.32k|}
aom_dc_left_predictor_4x16_sse2:
  549|  1.70k|                                     const uint8_t *left) {
  550|  1.70k|  (void)above;
  551|  1.70k|  __m128i sum_left = dc_sum_16_sse2(left);
  552|  1.70k|  const __m128i eight = _mm_set1_epi16(8);
  553|  1.70k|  sum_left = _mm_add_epi16(sum_left, eight);
  554|  1.70k|  sum_left = _mm_srai_epi16(sum_left, 4);
  555|  1.70k|  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  556|  1.70k|  sum_left = _mm_packus_epi16(sum_left, sum_left);
  557|       |
  558|  1.70k|  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
  559|  1.70k|  dc_store_4xh(pred, 16, dst, stride);
  560|  1.70k|}
aom_dc_left_predictor_8x4_sse2:
  564|    652|                                    const uint8_t *above, const uint8_t *left) {
  565|    652|  (void)above;
  566|    652|  __m128i sum_left = dc_sum_4(left);
  567|    652|  const __m128i two = _mm_set1_epi16(2);
  568|    652|  sum_left = _mm_add_epi16(sum_left, two);
  569|    652|  sum_left = _mm_srai_epi16(sum_left, 2);
  570|    652|  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  571|       |  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
  572|    652|  dc_store_8xh(&row, 4, dst, stride);
  573|    652|}
aom_dc_left_predictor_8x16_sse2:
  577|  5.90k|                                     const uint8_t *left) {
  578|  5.90k|  (void)above;
  579|  5.90k|  __m128i sum_left = dc_sum_16_sse2(left);
  580|  5.90k|  const __m128i eight = _mm_set1_epi16(8);
  581|  5.90k|  sum_left = _mm_add_epi16(sum_left, eight);
  582|  5.90k|  sum_left = _mm_srai_epi16(sum_left, 4);
  583|  5.90k|  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  584|       |  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
  585|  5.90k|  dc_store_8xh(&row, 16, dst, stride);
  586|  5.90k|}
aom_dc_left_predictor_8x32_sse2:
  591|  1.11k|                                     const uint8_t *left) {
  592|  1.11k|  (void)above;
  593|  1.11k|  __m128i sum_left = dc_sum_32_sse2(left);
  594|  1.11k|  const __m128i sixteen = _mm_set1_epi16(16);
  595|  1.11k|  sum_left = _mm_add_epi16(sum_left, sixteen);
  596|  1.11k|  sum_left = _mm_srai_epi16(sum_left, 5);
  597|  1.11k|  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  598|       |  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
  599|  1.11k|  dc_store_8xh(&row, 32, dst, stride);
  600|  1.11k|}
aom_dc_left_predictor_16x4_sse2:
  604|    703|                                     const uint8_t *left) {
  605|    703|  (void)above;
  606|    703|  __m128i sum_left = dc_sum_4(left);
  607|    703|  const __m128i two = _mm_set1_epi16(2);
  608|    703|  sum_left = _mm_add_epi16(sum_left, two);
  609|    703|  sum_left = _mm_srai_epi16(sum_left, 2);
  610|    703|  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  611|       |  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  612|    703|  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  613|    703|  dc_store_16xh(&row, 4, dst, stride);
  614|    703|}
aom_dc_left_predictor_16x8_sse2:
  619|  2.11k|                                     const uint8_t *left) {
  620|  2.11k|  (void)above;
  621|  2.11k|  __m128i sum_left = dc_sum_8(left);
  622|  2.11k|  const __m128i four = _mm_set1_epi16(4);
  623|  2.11k|  sum_left = _mm_add_epi16(sum_left, four);
  624|  2.11k|  sum_left = _mm_srai_epi16(sum_left, 3);
  625|  2.11k|  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  626|       |  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  627|  2.11k|  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  628|  2.11k|  dc_store_16xh(&row, 8, dst, stride);
  629|  2.11k|}
aom_dc_left_predictor_16x32_sse2:
  633|  5.63k|                                      const uint8_t *left) {
  634|  5.63k|  (void)above;
  635|  5.63k|  __m128i sum_left = dc_sum_32_sse2(left);
  636|  5.63k|  const __m128i sixteen = _mm_set1_epi16(16);
  637|  5.63k|  sum_left = _mm_add_epi16(sum_left, sixteen);
  638|  5.63k|  sum_left = _mm_srai_epi16(sum_left, 5);
  639|  5.63k|  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  640|       |  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  641|  5.63k|  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  642|  5.63k|  dc_store_16xh(&row, 32, dst, stride);
  643|  5.63k|}
aom_dc_left_predictor_16x64_sse2:
  648|    366|                                      const uint8_t *left) {
  649|    366|  (void)above;
  650|    366|  __m128i sum_left = dc_sum_64(left);
  651|    366|  const __m128i thirtytwo = _mm_set1_epi16(32);
  652|    366|  sum_left = _mm_add_epi16(sum_left, thirtytwo);
  653|    366|  sum_left = _mm_srai_epi16(sum_left, 6);
  654|    366|  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  655|       |  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  656|    366|  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  657|    366|  dc_store_16xh(&row, 64, dst, stride);
  658|    366|}
aom_dc_left_predictor_32x8_sse2:
  662|    710|                                     const uint8_t *left) {
  663|    710|  (void)above;
  664|    710|  __m128i sum_left = dc_sum_8(left);
  665|    710|  const __m128i four = _mm_set1_epi16(4);
  666|    710|  sum_left = _mm_add_epi16(sum_left, four);
  667|    710|  sum_left = _mm_srai_epi16(sum_left, 3);
  668|    710|  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  669|       |  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  670|    710|  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  671|    710|  dc_store_32xh(&row, 8, dst, stride);
  672|    710|}
aom_dc_128_predictor_4x8_sse2:
  751|     40|                                   const uint8_t *above, const uint8_t *left) {
  752|     40|  (void)above;
  753|     40|  (void)left;
  754|     40|  const uint32_t pred = 0x80808080;
  755|     40|  dc_store_4xh(pred, 8, dst, stride);
  756|     40|}
aom_dc_128_predictor_4x16_sse2:
  760|     87|                                    const uint8_t *above, const uint8_t *left) {
  761|     87|  (void)above;
  762|     87|  (void)left;
  763|     87|  const uint32_t pred = 0x80808080;
  764|     87|  dc_store_4xh(pred, 16, dst, stride);
  765|     87|}
aom_dc_128_predictor_8x4_sse2:
  769|     36|                                   const uint8_t *above, const uint8_t *left) {
  770|     36|  (void)above;
  771|     36|  (void)left;
  772|     36|  const __m128i row = _mm_set1_epi8((int8_t)128);
  773|     36|  dc_store_8xh(&row, 4, dst, stride);
  774|     36|}
aom_dc_128_predictor_8x16_sse2:
  777|     60|                                    const uint8_t *above, const uint8_t *left) {
  778|     60|  (void)above;
  779|     60|  (void)left;
  780|     60|  const __m128i row = _mm_set1_epi8((int8_t)128);
  781|     60|  dc_store_8xh(&row, 16, dst, stride);
  782|     60|}
aom_dc_128_predictor_8x32_sse2:
  786|     47|                                    const uint8_t *above, const uint8_t *left) {
  787|     47|  (void)above;
  788|     47|  (void)left;
  789|     47|  const __m128i row = _mm_set1_epi8((int8_t)128);
  790|     47|  dc_store_8xh(&row, 32, dst, stride);
  791|     47|}
aom_dc_128_predictor_16x4_sse2:
  794|     42|                                    const uint8_t *above, const uint8_t *left) {
  795|     42|  (void)above;
  796|     42|  (void)left;
  797|     42|  const __m128i row = _mm_set1_epi8((int8_t)128);
  798|     42|  dc_store_16xh(&row, 4, dst, stride);
  799|     42|}
aom_dc_128_predictor_16x8_sse2:
  803|    150|                                    const uint8_t *above, const uint8_t *left) {
  804|    150|  (void)above;
  805|    150|  (void)left;
  806|    150|  const __m128i row = _mm_set1_epi8((int8_t)128);
  807|    150|  dc_store_16xh(&row, 8, dst, stride);
  808|    150|}
aom_dc_128_predictor_16x32_sse2:
  812|    284|                                     const uint8_t *left) {
  813|    284|  (void)above;
  814|    284|  (void)left;
  815|    284|  const __m128i row = _mm_set1_epi8((int8_t)128);
  816|    284|  dc_store_16xh(&row, 32, dst, stride);
  817|    284|}
aom_dc_128_predictor_16x64_sse2:
  822|      3|                                     const uint8_t *left) {
  823|      3|  (void)above;
  824|      3|  (void)left;
  825|      3|  const __m128i row = _mm_set1_epi8((int8_t)128);
  826|      3|  dc_store_16xh(&row, 64, dst, stride);
  827|      3|}
aom_dc_128_predictor_32x8_sse2:
  830|     42|                                    const uint8_t *above, const uint8_t *left) {
  831|     42|  (void)above;
  832|     42|  (void)left;
  833|     42|  const __m128i row = _mm_set1_epi8((int8_t)128);
  834|     42|  dc_store_32xh(&row, 8, dst, stride);
  835|     42|}
aom_v_predictor_4x8_sse2:
  889|  4.72k|                              const uint8_t *above, const uint8_t *left) {
  890|  4.72k|  const uint32_t pred = *(uint32_t *)above;
  891|  4.72k|  (void)left;
  892|  4.72k|  dc_store_4xh(pred, 8, dst, stride);
  893|  4.72k|}
aom_v_predictor_4x16_sse2:
  897|  1.51k|                               const uint8_t *above, const uint8_t *left) {
  898|  1.51k|  const uint32_t pred = *(uint32_t *)above;
  899|  1.51k|  (void)left;
  900|  1.51k|  dc_store_4xh(pred, 16, dst, stride);
  901|  1.51k|}
aom_v_predictor_8x4_sse2:
  905|  6.64k|                              const uint8_t *above, const uint8_t *left) {
  906|  6.64k|  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
  907|  6.64k|  (void)left;
  908|  6.64k|  dc_store_8xh(&row, 4, dst, stride);
  909|  6.64k|}
aom_v_predictor_8x16_sse2:
  912|  2.09k|                               const uint8_t *above, const uint8_t *left) {
  913|  2.09k|  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
  914|  2.09k|  (void)left;
  915|  2.09k|  dc_store_8xh(&row, 16, dst, stride);
  916|  2.09k|}
aom_v_predictor_8x32_sse2:
  920|    562|                               const uint8_t *above, const uint8_t *left) {
  921|    562|  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
  922|    562|  (void)left;
  923|    562|  dc_store_8xh(&row, 32, dst, stride);
  924|    562|}
aom_v_predictor_16x4_sse2:
  927|  2.73k|                               const uint8_t *above, const uint8_t *left) {
  928|  2.73k|  const __m128i row = _mm_load_si128((__m128i const *)above);
  929|  2.73k|  (void)left;
  930|  2.73k|  dc_store_16xh(&row, 4, dst, stride);
  931|  2.73k|}
aom_v_predictor_16x8_sse2:
  935|  10.5k|                               const uint8_t *above, const uint8_t *left) {
  936|  10.5k|  const __m128i row = _mm_load_si128((__m128i const *)above);
  937|  10.5k|  (void)left;
  938|  10.5k|  dc_store_16xh(&row, 8, dst, stride);
  939|  10.5k|}
aom_v_predictor_16x32_sse2:
  942|    583|                                const uint8_t *above, const uint8_t *left) {
  943|    583|  const __m128i row = _mm_load_si128((__m128i const *)above);
  944|    583|  (void)left;
  945|    583|  dc_store_16xh(&row, 32, dst, stride);
  946|    583|}
aom_v_predictor_16x64_sse2:
  950|     96|                                const uint8_t *above, const uint8_t *left) {
  951|     96|  const __m128i row = _mm_load_si128((__m128i const *)above);
  952|     96|  (void)left;
  953|     96|  dc_store_16xh(&row, 64, dst, stride);
  954|     96|}
aom_v_predictor_32x8_sse2:
  970|    968|                               const uint8_t *above, const uint8_t *left) {
  971|    968|  (void)left;
  972|    968|  v_predictor_32xh(dst, stride, above, 8);
  973|    968|}
aom_h_predictor_4x8_sse2:
 1027|  9.28k|                              const uint8_t *above, const uint8_t *left) {
 1028|  9.28k|  (void)above;
 1029|  9.28k|  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
 1030|  9.28k|  left_col = _mm_unpacklo_epi8(left_col, left_col);
 1031|  9.28k|  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
 1032|  9.28k|  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
 1033|  9.28k|  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
 1034|  9.28k|  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
 1035|  9.28k|  *(int *)dst = _mm_cvtsi128_si32(row0);
 1036|  9.28k|  dst += stride;
 1037|  9.28k|  *(int *)dst = _mm_cvtsi128_si32(row1);
 1038|  9.28k|  dst += stride;
 1039|  9.28k|  *(int *)dst = _mm_cvtsi128_si32(row2);
 1040|  9.28k|  dst += stride;
 1041|  9.28k|  *(int *)dst = _mm_cvtsi128_si32(row3);
 1042|  9.28k|  dst += stride;
 1043|  9.28k|  left_col = _mm_unpackhi_epi64(left_col, left_col);
 1044|  9.28k|  row0 = _mm_shufflelo_epi16(left_col, 0);
 1045|  9.28k|  row1 = _mm_shufflelo_epi16(left_col, 0x55);
 1046|  9.28k|  row2 = _mm_shufflelo_epi16(left_col, 0xaa);
 1047|       |  row3 = _mm_shufflelo_epi16(left_col, 0xff);
 1048|  9.28k|  *(int *)dst = _mm_cvtsi128_si32(row0);
 1049|  9.28k|  dst += stride;
 1050|  9.28k|  *(int *)dst = _mm_cvtsi128_si32(row1);
 1051|  9.28k|  dst += stride;
 1052|  9.28k|  *(int *)dst = _mm_cvtsi128_si32(row2);
 1053|  9.28k|  dst += stride;
 1054|  9.28k|  *(int *)dst = _mm_cvtsi128_si32(row3);
 1055|  9.28k|}
aom_h_predictor_4x16_sse2:
 1059|  2.19k|                               const uint8_t *above, const uint8_t *left) {
 1060|  2.19k|  (void)above;
 1061|  2.19k|  const __m128i left_col = _mm_load_si128((__m128i const *)left);
 1062|  2.19k|  __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
 1063|  2.19k|  __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
 1064|       |
 1065|  2.19k|  __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
 1066|  2.19k|  __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
 1067|  2.19k|  __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
 1068|  2.19k|  __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
 1069|  2.19k|  *(int *)dst = _mm_cvtsi128_si32(row0);
 1070|  2.19k|  dst += stride;
 1071|  2.19k|  *(int *)dst = _mm_cvtsi128_si32(row1);
 1072|  2.19k|  dst += stride;
 1073|  2.19k|  *(int *)dst = _mm_cvtsi128_si32(row2);
 1074|  2.19k|  dst += stride;
 1075|  2.19k|  *(int *)dst = _mm_cvtsi128_si32(row3);
 1076|  2.19k|  dst += stride;
 1077|       |
 1078|  2.19k|  left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
 1079|  2.19k|  row0 = _mm_shufflelo_epi16(left_col_low, 0);
 1080|  2.19k|  row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
 1081|  2.19k|  row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
 1082|  2.19k|  row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
 1083|  2.19k|  *(int *)dst = _mm_cvtsi128_si32(row0);
 1084|  2.19k|  dst += stride;
 1085|  2.19k|  *(int *)dst = _mm_cvtsi128_si32(row1);
 1086|  2.19k|  dst += stride;
 1087|  2.19k|  *(int *)dst = _mm_cvtsi128_si32(row2);
 1088|  2.19k|  dst += stride;
 1089|  2.19k|  *(int *)dst = _mm_cvtsi128_si32(row3);
 1090|  2.19k|  dst += stride;
 1091|       |
 1092|  2.19k|  row0 = _mm_shufflelo_epi16(left_col_high, 0);
 1093|  2.19k|  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
 1094|  2.19k|  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
 1095|  2.19k|  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
 1096|  2.19k|  *(int *)dst = _mm_cvtsi128_si32(row0);
 1097|  2.19k|  dst += stride;
 1098|  2.19k|  *(int *)dst = _mm_cvtsi128_si32(row1);
 1099|  2.19k|  dst += stride;
 1100|  2.19k|  *(int *)dst = _mm_cvtsi128_si32(row2);
 1101|  2.19k|  dst += stride;
 1102|  2.19k|  *(int *)dst = _mm_cvtsi128_si32(row3);
 1103|  2.19k|  dst += stride;
 1104|       |
 1105|  2.19k|  left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
 1106|  2.19k|  row0 = _mm_shufflelo_epi16(left_col_high, 0);
 1107|  2.19k|  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
 1108|  2.19k|  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
 1109|       |  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
 1110|  2.19k|  *(int *)dst = _mm_cvtsi128_si32(row0);
 1111|  2.19k|  dst += stride;
 1112|  2.19k|  *(int *)dst = _mm_cvtsi128_si32(row1);
 1113|  2.19k|  dst += stride;
 1114|  2.19k|  *(int *)dst = _mm_cvtsi128_si32(row2);
 1115|  2.19k|  dst += stride;
 1116|  2.19k|  *(int *)dst = _mm_cvtsi128_si32(row3);
 1117|  2.19k|}
aom_h_predictor_8x4_sse2:
 1121|  13.3k|                              const uint8_t *above, const uint8_t *left) {
 1122|  13.3k|  (void)above;
 1123|  13.3k|  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
 1124|  13.3k|  left_col = _mm_unpacklo_epi8(left_col, left_col);
 1125|  13.3k|  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
 1126|  13.3k|  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
 1127|  13.3k|  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
 1128|       |  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
 1129|  13.3k|  _mm_storel_epi64((__m128i *)dst, row0);
 1130|  13.3k|  dst += stride;
 1131|  13.3k|  _mm_storel_epi64((__m128i *)dst, row1);
 1132|  13.3k|  dst += stride;
 1133|  13.3k|  _mm_storel_epi64((__m128i *)dst, row2);
 1134|  13.3k|  dst += stride;
 1135|  13.3k|  _mm_storel_epi64((__m128i *)dst, row3);
 1136|  13.3k|}
aom_h_predictor_8x16_sse2:
 1205|  3.81k|                               const uint8_t *above, const uint8_t *left) {
 1206|  3.81k|  h_predictor_8x16xc(dst, stride, above, left, 1);
 1207|  3.81k|}
aom_h_predictor_8x32_sse2:
 1211|  1.47k|                               const uint8_t *above, const uint8_t *left) {
 1212|  1.47k|  h_predictor_8x16xc(dst, stride, above, left, 2);
 1213|  1.47k|}
aom_h_predictor_16x4_sse2:
 1269|  5.16k|                               const uint8_t *above, const uint8_t *left) {
 1270|  5.16k|  (void)above;
 1271|  5.16k|  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
 1272|  5.16k|  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
 1273|  5.16k|  h_prediction_16x8_1(&left_col_8p, dst, stride);
 1274|  5.16k|}
aom_h_predictor_16x8_sse2:
 1278|  6.96k|                               const uint8_t *above, const uint8_t *left) {
 1279|  6.96k|  (void)above;
 1280|  6.96k|  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
 1281|  6.96k|  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
 1282|  6.96k|  h_prediction_16x8_1(&left_col_8p, dst, stride);
 1283|  6.96k|  dst += stride << 2;
 1284|  6.96k|  h_prediction_16x8_2(&left_col_8p, dst, stride);
 1285|  6.96k|}
aom_h_predictor_16x32_sse2:
 1310|  1.17k|                                const uint8_t *above, const uint8_t *left) {
 1311|  1.17k|  (void)above;
 1312|  1.17k|  h_predictor_16xh(dst, stride, left, 2);
 1313|  1.17k|}
aom_h_predictor_16x64_sse2:
 1317|    238|                                const uint8_t *above, const uint8_t *left) {
 1318|    238|  (void)above;
 1319|    238|  h_predictor_16xh(dst, stride, left, 4);
 1320|    238|}
aom_h_predictor_32x8_sse2:
 1353|  1.53k|                               const uint8_t *above, const uint8_t *left) {
 1354|  1.53k|  __m128i left_col, left_col_8p;
 1355|  1.53k|  (void)above;
 1356|       |
 1357|  1.53k|  left_col = _mm_load_si128((const __m128i *)left);
 1358|       |
 1359|  1.53k|  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
 1360|  1.53k|  h_prediction_32x8_1(&left_col_8p, dst, stride);
 1361|  1.53k|  dst += stride << 2;
 1362|  1.53k|  h_prediction_32x8_2(&left_col_8p, dst, stride);
 1363|  1.53k|}
aom_h_predictor_32x16_sse2:
 1367|  1.08k|                                const uint8_t *above, const uint8_t *left) {
 1368|  1.08k|  __m128i left_col, left_col_8p;
 1369|  1.08k|  (void)above;
 1370|       |
 1371|  1.08k|  left_col = _mm_load_si128((const __m128i *)left);
 1372|       |
 1373|  1.08k|  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
 1374|  1.08k|  h_prediction_32x8_1(&left_col_8p, dst, stride);
 1375|  1.08k|  dst += stride << 2;
 1376|  1.08k|  h_prediction_32x8_2(&left_col_8p, dst, stride);
 1377|  1.08k|  dst += stride << 2;
 1378|       |
 1379|  1.08k|  left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
 1380|  1.08k|  h_prediction_32x8_1(&left_col_8p, dst, stride);
 1381|  1.08k|  dst += stride << 2;
 1382|  1.08k|  h_prediction_32x8_2(&left_col_8p, dst, stride);
 1383|  1.08k|}
aom_h_predictor_32x64_sse2:
 1410|     64|                                const uint8_t *above, const uint8_t *left) {
 1411|     64|  (void)above;
 1412|     64|  h_predictor_32xh(dst, stride, left, 64);
 1413|     64|}
aom_h_predictor_64x64_sse2:
 1448|    253|                                const uint8_t *above, const uint8_t *left) {
 1449|    253|  (void)above;
 1450|    253|  h_predictor_64xh(dst, stride, left, 64);
 1451|    253|}
aom_h_predictor_64x32_sse2:
 1454|    122|                                const uint8_t *above, const uint8_t *left) {
 1455|    122|  (void)above;
 1456|    122|  h_predictor_64xh(dst, stride, left, 32);
 1457|    122|}
aom_h_predictor_64x16_sse2:
 1461|    123|                                const uint8_t *above, const uint8_t *left) {
 1462|    123|  (void)above;
 1463|    123|  h_predictor_64xh(dst, stride, left, 16);
 1464|    123|}
intrapred_sse2.c:dc_sum_8:
   72|   152k|static inline __m128i dc_sum_8(const uint8_t *ref) {
   73|   152k|  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
   74|   152k|  const __m128i zero = _mm_setzero_si128();
   75|   152k|  return _mm_sad_epu8(x, zero);
   76|   152k|}
intrapred_sse2.c:dc_sum_4:
   65|   111k|static inline __m128i dc_sum_4(const uint8_t *ref) {
   66|   111k|  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
   67|   111k|  const __m128i zero = _mm_setzero_si128();
   68|   111k|  x = _mm_unpacklo_epi8(x, zero);
   69|   111k|  return _mm_sad_epu8(x, zero);
   70|   111k|}
intrapred_sse2.c:divide_using_multiply_shift:
  101|   206k|                                              int multiplier) {
  102|   206k|  const int interm = num >> shift1;
  103|   206k|  return interm * multiplier >> DC_SHIFT2;
  ------------------
  |  |   98|   206k|#define DC_SHIFT2 16
  ------------------
  104|   206k|}
intrapred_sse2.c:dc_store_4xh:
   17|  52.1k|                                ptrdiff_t stride) {
   18|   339k|  for (int i = 0; i < height; i += 2) {
  ------------------
  |  Branch (18:19): [True: 287k, False: 52.1k]
  ------------------
   19|   287k|    *(uint32_t *)dst = dc;
   20|   287k|    dst += stride;
   21|   287k|    *(uint32_t *)dst = dc;
   22|   287k|    dst += stride;
   23|   287k|  }
   24|  52.1k|}
intrapred_sse2.c:dc_store_8xh:
   27|  88.9k|                                ptrdiff_t stride) {
   28|  88.9k|  int i;
   29|  1.17M|  for (i = 0; i < height; ++i) {
  ------------------
  |  Branch (29:15): [True: 1.08M, False: 88.9k]
  ------------------
   30|  1.08M|    _mm_storel_epi64((__m128i *)dst, *row);
   31|  1.08M|    dst += stride;
   32|  1.08M|  }
   33|  88.9k|}
intrapred_sse2.c:dc_store_16xh:
   36|   112k|                                 ptrdiff_t stride) {
   37|   112k|  int i;
   38|  1.39M|  for (i = 0; i < height; ++i) {
  ------------------
  |  Branch (38:15): [True: 1.28M, False: 112k]
  ------------------
   39|  1.28M|    _mm_store_si128((__m128i *)dst, *row);
   40|  1.28M|    dst += stride;
   41|  1.28M|  }
   42|   112k|}
intrapred_sse2.c:dc_sum_64:
   78|  1.14k|static inline __m128i dc_sum_64(const uint8_t *ref) {
   79|  1.14k|  __m128i x0 = _mm_load_si128((__m128i const *)ref);
   80|  1.14k|  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
   81|  1.14k|  __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
   82|  1.14k|  __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
   83|  1.14k|  const __m128i zero = _mm_setzero_si128();
   84|  1.14k|  x0 = _mm_sad_epu8(x0, zero);
   85|  1.14k|  x1 = _mm_sad_epu8(x1, zero);
   86|  1.14k|  x2 = _mm_sad_epu8(x2, zero);
   87|  1.14k|  x3 = _mm_sad_epu8(x3, zero);
   88|  1.14k|  x0 = _mm_add_epi16(x0, x1);
   89|  1.14k|  x2 = _mm_add_epi16(x2, x3);
   90|  1.14k|  x0 = _mm_add_epi16(x0, x2);
   91|  1.14k|  const __m128i high = _mm_unpackhi_epi64(x0, x0);
   92|  1.14k|  return _mm_add_epi16(x0, high);
   93|  1.14k|}
intrapred_sse2.c:dc_store_32xh:
   45|  12.1k|                                 ptrdiff_t stride) {
   46|  12.1k|  int i;
   47|   109k|  for (i = 0; i < height; ++i) {
  ------------------
  |  Branch (47:15): [True: 97.5k, False: 12.1k]
  ------------------
   48|  97.5k|    _mm_store_si128((__m128i *)dst, *row);
   49|  97.5k|    _mm_store_si128((__m128i *)(dst + 16), *row);
   50|  97.5k|    dst += stride;
   51|  97.5k|  }
   52|  12.1k|}
intrapred_sse2.c:v_predictor_32xh:
  958|    968|                                    const uint8_t *above, int height) {
  959|    968|  const __m128i row0 = _mm_load_si128((__m128i const *)above);
  960|    968|  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
  961|  8.71k|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (961:19): [True: 7.74k, False: 968]
  ------------------
  962|  7.74k|    _mm_store_si128((__m128i *)dst, row0);
  963|  7.74k|    _mm_store_si128((__m128i *)(dst + 16), row1);
  964|  7.74k|    dst += stride;
  965|  7.74k|  }
  966|    968|}
intrapred_sse2.c:h_predictor_8x16xc:
 1140|  5.28k|                                      int count) {
 1141|  5.28k|  (void)above;
 1142|  12.0k|  for (int i = 0; i < count; ++i) {
  ------------------
  |  Branch (1142:19): [True: 6.75k, False: 5.28k]
  ------------------
 1143|  6.75k|    const __m128i left_col = _mm_load_si128((__m128i const *)left);
 1144|  6.75k|    __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
 1145|  6.75k|    __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
 1146|       |
 1147|  6.75k|    __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
 1148|  6.75k|    __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
 1149|  6.75k|    __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
 1150|  6.75k|    __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
 1151|  6.75k|    _mm_storel_epi64((__m128i *)dst, row0);
 1152|  6.75k|    dst += stride;
 1153|  6.75k|    _mm_storel_epi64((__m128i *)dst, row1);
 1154|  6.75k|    dst += stride;
 1155|  6.75k|    _mm_storel_epi64((__m128i *)dst, row2);
 1156|  6.75k|    dst += stride;
 1157|  6.75k|    _mm_storel_epi64((__m128i *)dst, row3);
 1158|  6.75k|    dst += stride;
 1159|       |
 1160|  6.75k|    left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
 1161|  6.75k|    row0 = _mm_shufflelo_epi16(left_col_low, 0);
 1162|  6.75k|    row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
 1163|  6.75k|    row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
 1164|  6.75k|    row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
 1165|  6.75k|    _mm_storel_epi64((__m128i *)dst, row0);
 1166|  6.75k|    dst += stride;
 1167|  6.75k|    _mm_storel_epi64((__m128i *)dst, row1);
 1168|  6.75k|    dst += stride;
 1169|  6.75k|    _mm_storel_epi64((__m128i *)dst, row2);
 1170|  6.75k|    dst += stride;
 1171|  6.75k|    _mm_storel_epi64((__m128i *)dst, row3);
 1172|  6.75k|    dst += stride;
 1173|       |
 1174|  6.75k|    row0 = _mm_shufflelo_epi16(left_col_high, 0);
 1175|  6.75k|    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
 1176|  6.75k|    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
 1177|  6.75k|    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
 1178|  6.75k|    _mm_storel_epi64((__m128i *)dst, row0);
 1179|  6.75k|    dst += stride;
 1180|  6.75k|    _mm_storel_epi64((__m128i *)dst, row1);
 1181|  6.75k|    dst += stride;
 1182|  6.75k|    _mm_storel_epi64((__m128i *)dst, row2);
 1183|  6.75k|    dst += stride;
 1184|  6.75k|    _mm_storel_epi64((__m128i *)dst, row3);
 1185|  6.75k|    dst += stride;
 1186|       |
 1187|  6.75k|    left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
 1188|  6.75k|    row0 = _mm_shufflelo_epi16(left_col_high, 0);
 1189|  6.75k|    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
 1190|  6.75k|    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
 1191|       |    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
 1192|  6.75k|    _mm_storel_epi64((__m128i *)dst, row0);
 1193|  6.75k|    dst += stride;
 1194|  6.75k|    _mm_storel_epi64((__m128i *)dst, row1);
 1195|  6.75k|    dst += stride;
 1196|  6.75k|    _mm_storel_epi64((__m128i *)dst, row2);
 1197|  6.75k|    dst += stride;
 1198|  6.75k|    _mm_storel_epi64((__m128i *)dst, row3);
 1199|  6.75k|    dst += stride;
 1200|  6.75k|    left += 16;
 1201|  6.75k|  }
 1202|  5.28k|}
intrapred_sse2.c:h_prediction_16x8_1:
 1252|  18.7k|                                       ptrdiff_t stride) {
 1253|  18.7k|  __m128i row[4];
 1254|  18.7k|  repeat_low_4pixels(left, row);
 1255|  18.7k|  h_pred_store_16xh(row, 4, dst, stride);
 1256|  18.7k|}
intrapred_sse2.c:repeat_low_4pixels:
 1225|  22.4k|static inline void repeat_low_4pixels(const __m128i *x, __m128i *row) {
 1226|  22.4k|  const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
 1227|  22.4k|  const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
 1228|  22.4k|  const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
 1229|  22.4k|  const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);
 1230|       |
 1231|  22.4k|  row[0] = _mm_unpacklo_epi64(u0, u0);
 1232|  22.4k|  row[1] = _mm_unpacklo_epi64(u1, u1);
 1233|  22.4k|  row[2] = _mm_unpacklo_epi64(u2, u2);
 1234|  22.4k|  row[3] = _mm_unpacklo_epi64(u3, u3);
 1235|  22.4k|}
intrapred_sse2.c:h_pred_store_16xh:
 1217|  32.2k|                                     ptrdiff_t stride) {
 1218|  32.2k|  int i;
 1219|   161k|  for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (1219:15): [True: 129k, False: 32.2k]
  ------------------
 1220|   129k|    _mm_store_si128((__m128i *)dst, row[i]);
 1221|   129k|    dst += stride;
 1222|   129k|  }
 1223|  32.2k|}
intrapred_sse2.c:h_prediction_16x8_2:
 1261|  13.5k|                                       ptrdiff_t stride) {
 1262|  13.5k|  __m128i row[4];
 1263|  13.5k|  repeat_high_4pixels(left, row);
 1264|  13.5k|  h_pred_store_16xh(row, 4, dst, stride);
 1265|  13.5k|}
intrapred_sse2.c:repeat_high_4pixels:
 1237|  17.2k|static inline void repeat_high_4pixels(const __m128i *x, __m128i *row) {
 1238|  17.2k|  const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
 1239|  17.2k|  const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
 1240|  17.2k|  const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
 1241|  17.2k|  const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);
 1242|       |
 1243|  17.2k|  row[0] = _mm_unpackhi_epi64(u0, u0);
 1244|  17.2k|  row[1] = _mm_unpackhi_epi64(u1, u1);
 1245|  17.2k|  row[2] = _mm_unpackhi_epi64(u2, u2);
 1246|  17.2k|  row[3] = _mm_unpackhi_epi64(u3, u3);
 1247|  17.2k|}
intrapred_sse2.c:h_predictor_16xh:
 1288|  1.40k|                                    const uint8_t *left, int count) {
 1289|  1.40k|  int i = 0;
 1290|  3.29k|  do {
 1291|  3.29k|    const __m128i left_col = _mm_load_si128((const __m128i *)left);
 1292|  3.29k|    const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
 1293|  3.29k|    h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
 1294|  3.29k|    dst += stride << 2;
 1295|  3.29k|    h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
 1296|  3.29k|    dst += stride << 2;
 1297|       |
 1298|  3.29k|    const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
 1299|  3.29k|    h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
 1300|  3.29k|    dst += stride << 2;
 1301|  3.29k|    h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
 1302|  3.29k|    dst += stride << 2;
 1303|       |
 1304|  3.29k|    left += 16;
 1305|  3.29k|    i++;
 1306|  3.29k|  } while (i < count);
  ------------------
  |  Branch (1306:12): [True: 1.88k, False: 1.40k]
  ------------------
 1307|  1.40k|}
intrapred_sse2.c:h_prediction_32x8_1:
 1336|  3.70k|                                       ptrdiff_t stride) {
 1337|  3.70k|  __m128i row[4];
 1338|  3.70k|  repeat_low_4pixels(left, row);
 1339|  3.70k|  h_pred_store_32xh(row, 4, dst, stride);
 1340|  3.70k|}
intrapred_sse2.c:h_pred_store_32xh:
 1324|  7.40k|                                     ptrdiff_t stride) {
 1325|  7.40k|  int i;
 1326|  37.0k|  for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (1326:15): [True: 29.6k, False: 7.40k]
  ------------------
 1327|  29.6k|    _mm_store_si128((__m128i *)dst, row[i]);
 1328|  29.6k|    _mm_store_si128((__m128i *)(dst + 16), row[i]);
 1329|  29.6k|    dst += stride;
 1330|  29.6k|  }
 1331|  7.40k|}
intrapred_sse2.c:h_prediction_32x8_2:
 1345|  3.70k|                                       ptrdiff_t stride) {
 1346|  3.70k|  __m128i row[4];
 1347|  3.70k|  repeat_high_4pixels(left, row);
 1348|  3.70k|  h_pred_store_32xh(row, 4, dst, stride);
 1349|  3.70k|}
intrapred_sse2.c:h_predictor_32xh:
 1386|     64|                                    const uint8_t *left, int height) {
 1387|     64|  int i = height >> 2;
 1388|  1.02k|  do {
 1389|  1.02k|    __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
 1390|  1.02k|    left4 = _mm_unpacklo_epi8(left4, left4);
 1391|  1.02k|    left4 = _mm_unpacklo_epi8(left4, left4);
 1392|  1.02k|    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
 1393|  1.02k|    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
 1394|  1.02k|    _mm_store_si128((__m128i *)dst, r0);
 1395|  1.02k|    _mm_store_si128((__m128i *)(dst + 16), r0);
 1396|  1.02k|    _mm_store_si128((__m128i *)(dst + stride), r1);
 1397|  1.02k|    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
 1398|  1.02k|    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
 1399|  1.02k|    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
 1400|  1.02k|    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
 1401|  1.02k|    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
 1402|  1.02k|    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
 1403|  1.02k|    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
 1404|  1.02k|    left += 4;
 1405|  1.02k|    dst += stride * 4;
 1406|  1.02k|  } while (--i);
  ------------------
  |  Branch (1406:12): [True: 960, False: 64]
  ------------------
 1407|     64|}
intrapred_sse2.c:h_predictor_64xh:
 1416|    498|                                    const uint8_t *left, int height) {
 1417|    498|  int i = height >> 2;
 1418|  5.51k|  do {
 1419|  5.51k|    __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
 1420|  5.51k|    left4 = _mm_unpacklo_epi8(left4, left4);
 1421|  5.51k|    left4 = _mm_unpacklo_epi8(left4, left4);
 1422|  5.51k|    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
 1423|  5.51k|    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
 1424|  5.51k|    _mm_store_si128((__m128i *)dst, r0);
 1425|  5.51k|    _mm_store_si128((__m128i *)(dst + 16), r0);
 1426|  5.51k|    _mm_store_si128((__m128i *)(dst + 32), r0);
 1427|  5.51k|    _mm_store_si128((__m128i *)(dst + 48), r0);
 1428|  5.51k|    _mm_store_si128((__m128i *)(dst + stride), r1);
 1429|  5.51k|    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
 1430|  5.51k|    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
 1431|  5.51k|    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
 1432|  5.51k|    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
 1433|  5.51k|    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
 1434|  5.51k|    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
 1435|  5.51k|    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
 1436|  5.51k|    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
 1437|  5.51k|    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
 1438|  5.51k|    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
 1439|  5.51k|    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
 1440|  5.51k|    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
 1441|  5.51k|    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
 1442|  5.51k|    left += 4;
 1443|  5.51k|    dst += stride * 4;
 1444|  5.51k|  } while (--i);
  ------------------
  |  Branch (1444:12): [True: 5.01k, False: 498]
  ------------------
 1445|    498|}

aom_paeth_predictor_4x4_ssse3:
   45|  44.3k|                                   const uint8_t *above, const uint8_t *left) {
   46|  44.3k|  __m128i l = _mm_loadl_epi64((const __m128i *)left);
   47|  44.3k|  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
   48|  44.3k|  const __m128i zero = _mm_setzero_si128();
   49|  44.3k|  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
   50|  44.3k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   51|  44.3k|  __m128i rep = _mm_set1_epi16((short)0x8000);
   52|  44.3k|  const __m128i one = _mm_set1_epi16(1);
   53|       |
   54|  44.3k|  int i;
   55|   221k|  for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (55:15): [True: 177k, False: 44.3k]
  ------------------
   56|   177k|    const __m128i l16 = _mm_shuffle_epi8(l, rep);
   57|   177k|    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
   58|       |
   59|   177k|    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
   60|   177k|    dst += stride;
   61|   177k|    rep = _mm_add_epi16(rep, one);
   62|   177k|  }
   63|  44.3k|}
aom_paeth_predictor_4x8_ssse3:
   66|  4.21k|                                   const uint8_t *above, const uint8_t *left) {
   67|  4.21k|  __m128i l = _mm_loadl_epi64((const __m128i *)left);
   68|  4.21k|  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
   69|  4.21k|  const __m128i zero = _mm_setzero_si128();
   70|  4.21k|  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
   71|  4.21k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   72|  4.21k|  __m128i rep = _mm_set1_epi16((short)0x8000);
   73|  4.21k|  const __m128i one = _mm_set1_epi16(1);
   74|       |
   75|  4.21k|  int i;
   76|  37.9k|  for (i = 0; i < 8; ++i) {
  ------------------
  |  Branch (76:15): [True: 33.7k, False: 4.21k]
  ------------------
   77|  33.7k|    const __m128i l16 = _mm_shuffle_epi8(l, rep);
   78|  33.7k|    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
   79|       |
   80|  33.7k|    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
   81|  33.7k|    dst += stride;
   82|  33.7k|    rep = _mm_add_epi16(rep, one);
   83|  33.7k|  }
   84|  4.21k|}
aom_paeth_predictor_4x16_ssse3:
   88|  2.95k|                                    const uint8_t *above, const uint8_t *left) {
   89|  2.95k|  __m128i l = _mm_load_si128((const __m128i *)left);
   90|  2.95k|  const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
   91|  2.95k|  const __m128i zero = _mm_setzero_si128();
   92|  2.95k|  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
   93|  2.95k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   94|  2.95k|  __m128i rep = _mm_set1_epi16((short)0x8000);
   95|  2.95k|  const __m128i one = _mm_set1_epi16(1);
   96|       |
   97|  50.2k|  for (int i = 0; i < 16; ++i) {
  ------------------
  |  Branch (97:19): [True: 47.2k, False: 2.95k]
  ------------------
   98|  47.2k|    const __m128i l16 = _mm_shuffle_epi8(l, rep);
   99|  47.2k|    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
  100|       |
  101|  47.2k|    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
  102|  47.2k|    dst += stride;
  103|  47.2k|    rep = _mm_add_epi16(rep, one);
  104|  47.2k|  }
  105|  2.95k|}
aom_paeth_predictor_8x4_ssse3:
  109|  6.13k|                                   const uint8_t *above, const uint8_t *left) {
  110|  6.13k|  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  111|  6.13k|  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  112|  6.13k|  const __m128i zero = _mm_setzero_si128();
  113|  6.13k|  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  114|  6.13k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  115|  6.13k|  __m128i rep = _mm_set1_epi16((short)0x8000);
  116|  6.13k|  const __m128i one = _mm_set1_epi16(1);
  117|       |
  118|  6.13k|  int i;
  119|  30.6k|  for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (119:15): [True: 24.5k, False: 6.13k]
  ------------------
  120|  24.5k|    const __m128i l16 = _mm_shuffle_epi8(l, rep);
  121|  24.5k|    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
  122|       |
  123|  24.5k|    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
  124|  24.5k|    dst += stride;
  125|  24.5k|    rep = _mm_add_epi16(rep, one);
  126|  24.5k|  }
  127|  6.13k|}
aom_paeth_predictor_8x8_ssse3:
  130|  28.6k|                                   const uint8_t *above, const uint8_t *left) {
  131|  28.6k|  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  132|  28.6k|  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  133|  28.6k|  const __m128i zero = _mm_setzero_si128();
  134|  28.6k|  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  135|  28.6k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  136|  28.6k|  __m128i rep = _mm_set1_epi16((short)0x8000);
  137|  28.6k|  const __m128i one = _mm_set1_epi16(1);
  138|       |
  139|  28.6k|  int i;
  140|   257k|  for (i = 0; i < 8; ++i) {
  ------------------
  |  Branch (140:15): [True: 229k, False: 28.6k]
  ------------------
  141|   229k|    const __m128i l16 = _mm_shuffle_epi8(l, rep);
  142|   229k|    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
  143|       |
  144|   229k|    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
  145|   229k|    dst += stride;
  146|   229k|    rep = _mm_add_epi16(rep, one);
  147|   229k|  }
  148|  28.6k|}
aom_paeth_predictor_8x16_ssse3:
  151|  4.55k|                                    const uint8_t *above, const uint8_t *left) {
  152|  4.55k|  __m128i l = _mm_load_si128((const __m128i *)left);
  153|  4.55k|  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  154|  4.55k|  const __m128i zero = _mm_setzero_si128();
  155|  4.55k|  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  156|  4.55k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  157|  4.55k|  __m128i rep = _mm_set1_epi16((short)0x8000);
  158|  4.55k|  const __m128i one = _mm_set1_epi16(1);
  159|       |
  160|  4.55k|  int i;
  161|  77.3k|  for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (161:15): [True: 72.8k, False: 4.55k]
  ------------------
  162|  72.8k|    const __m128i l16 = _mm_shuffle_epi8(l, rep);
  163|  72.8k|    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
  164|       |
  165|  72.8k|    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
  166|  72.8k|    dst += stride;
  167|  72.8k|    rep = _mm_add_epi16(rep, one);
  168|  72.8k|  }
  169|  4.55k|}
aom_paeth_predictor_8x32_ssse3:
  173|  1.52k|                                    const uint8_t *above, const uint8_t *left) {
  174|  1.52k|  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  175|  1.52k|  const __m128i zero = _mm_setzero_si128();
  176|  1.52k|  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  177|  1.52k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  178|  1.52k|  const __m128i one = _mm_set1_epi16(1);
  179|       |
  180|  4.56k|  for (int j = 0; j < 2; ++j) {
  ------------------
  |  Branch (180:19): [True: 3.04k, False: 1.52k]
  ------------------
  181|  3.04k|    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
  182|  3.04k|    __m128i rep = _mm_set1_epi16((short)0x8000);
  183|  51.7k|    for (int i = 0; i < 16; ++i) {
  ------------------
  |  Branch (183:21): [True: 48.7k, False: 3.04k]
  ------------------
  184|  48.7k|      const __m128i l16 = _mm_shuffle_epi8(l, rep);
  185|  48.7k|      const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
  186|       |
  187|  48.7k|      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
  188|  48.7k|      dst += stride;
  189|  48.7k|      rep = _mm_add_epi16(rep, one);
  190|  48.7k|    }
  191|  3.04k|  }
  192|  1.52k|}
aom_paeth_predictor_16x4_ssse3:
  206|  5.15k|                                    const uint8_t *above, const uint8_t *left) {
  207|  5.15k|  __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]);
  208|  5.15k|  const __m128i t = _mm_load_si128((const __m128i *)above);
  209|  5.15k|  const __m128i zero = _mm_setzero_si128();
  210|  5.15k|  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  211|  5.15k|  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  212|  5.15k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  213|  5.15k|  __m128i rep = _mm_set1_epi16((short)0x8000);
  214|  5.15k|  const __m128i one = _mm_set1_epi16(1);
  215|       |
  216|  25.7k|  for (int i = 0; i < 4; ++i) {
  ------------------
  |  Branch (216:19): [True: 20.6k, False: 5.15k]
  ------------------
  217|  20.6k|    const __m128i l16 = _mm_shuffle_epi8(l, rep);
  218|  20.6k|    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
  219|       |
  220|  20.6k|    _mm_store_si128((__m128i *)dst, row);
  221|  20.6k|    dst += stride;
  222|  20.6k|    rep = _mm_add_epi16(rep, one);
  223|  20.6k|  }
  224|  5.15k|}
aom_paeth_predictor_32x8_ssse3:
  332|  1.74k|                                    const uint8_t *above, const uint8_t *left) {
  333|  1.74k|  const __m128i a = _mm_load_si128((const __m128i *)above);
  334|  1.74k|  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  335|  1.74k|  const __m128i zero = _mm_setzero_si128();
  336|  1.74k|  const __m128i al = _mm_unpacklo_epi8(a, zero);
  337|  1.74k|  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  338|  1.74k|  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  339|  1.74k|  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  340|       |
  341|  1.74k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  342|  1.74k|  __m128i rep = _mm_set1_epi16((short)0x8000);
  343|  1.74k|  const __m128i one = _mm_set1_epi16(1);
  344|  1.74k|  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
  345|  1.74k|  __m128i l16;
  346|       |
  347|  15.7k|  for (int i = 0; i < 8; ++i) {
  ------------------
  |  Branch (347:19): [True: 13.9k, False: 1.74k]
  ------------------
  348|  13.9k|    l16 = _mm_shuffle_epi8(l, rep);
  349|  13.9k|    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
  350|  13.9k|    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
  351|       |
  352|  13.9k|    _mm_store_si128((__m128i *)dst, r32l);
  353|  13.9k|    _mm_store_si128((__m128i *)(dst + 16), r32h);
  354|  13.9k|    dst += stride;
  355|  13.9k|    rep = _mm_add_epi16(rep, one);
  356|  13.9k|  }
  357|  1.74k|}
aom_smooth_predictor_4x4_ssse3:
  678|  54.5k|                                    const uint8_t *above, const uint8_t *left) {
  679|  54.5k|  __m128i pixels[3];
  680|  54.5k|  load_pixel_w4(above, left, 4, pixels);
  681|       |
  682|  54.5k|  __m128i wh[4], ww[2];
  683|  54.5k|  load_weight_w4(4, wh, ww);
  684|       |
  685|  54.5k|  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
  686|  54.5k|}
aom_smooth_predictor_4x8_ssse3:
  689|  9.00k|                                    const uint8_t *above, const uint8_t *left) {
  690|  9.00k|  __m128i pixels[3];
  691|  9.00k|  load_pixel_w4(above, left, 8, pixels);
  692|       |
  693|  9.00k|  __m128i wh[4], ww[2];
  694|  9.00k|  load_weight_w4(8, wh, ww);
  695|       |
  696|  9.00k|  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
  697|  9.00k|}
aom_smooth_predictor_4x16_ssse3:
  702|  4.59k|                                     const uint8_t *left) {
  703|  4.59k|  __m128i pixels[3];
  704|  4.59k|  load_pixel_w4(above, left, 16, pixels);
  705|       |
  706|  4.59k|  __m128i wh[4], ww[2];
  707|  4.59k|  load_weight_w4(16, wh, ww);
  708|       |
  709|  4.59k|  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
  710|  4.59k|  dst += stride << 3;
  711|  4.59k|  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
  712|  4.59k|}
aom_smooth_predictor_8x4_ssse3:
  845|  11.8k|                                    const uint8_t *above, const uint8_t *left) {
  846|  11.8k|  __m128i pixels[4];
  847|  11.8k|  load_pixel_w8(above, left, 4, pixels);
  848|       |
  849|  11.8k|  __m128i wh[4], ww[2];
  850|  11.8k|  load_weight_w8(4, wh, ww);
  851|       |
  852|  11.8k|  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
  853|  11.8k|}
aom_smooth_predictor_8x8_ssse3:
  856|  41.9k|                                    const uint8_t *above, const uint8_t *left) {
  857|  41.9k|  __m128i pixels[4];
  858|  41.9k|  load_pixel_w8(above, left, 8, pixels);
  859|       |
  860|  41.9k|  __m128i wh[4], ww[2];
  861|  41.9k|  load_weight_w8(8, wh, ww);
  862|       |
  863|  41.9k|  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
  864|  41.9k|}
aom_smooth_predictor_8x16_ssse3:
  868|  8.47k|                                     const uint8_t *left) {
  869|  8.47k|  __m128i pixels[4];
  870|  8.47k|  load_pixel_w8(above, left, 16, pixels);
  871|       |
  872|  8.47k|  __m128i wh[4], ww[2];
  873|  8.47k|  load_weight_w8(16, wh, ww);
  874|       |
  875|  8.47k|  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
  876|  8.47k|  dst += stride << 3;
  877|  8.47k|  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
  878|  8.47k|}
aom_smooth_predictor_8x32_ssse3:
  883|  2.71k|                                     const uint8_t *left) {
  884|  2.71k|  __m128i pixels[8];
  885|  2.71k|  load_pixel_w8(above, left, 32, pixels);
  886|       |
  887|  2.71k|  __m128i wh[8], ww[2];
  888|  2.71k|  load_weight_w8(32, wh, ww);
  889|       |
  890|  2.71k|  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
  891|  2.71k|  dst += stride << 3;
  892|  2.71k|  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
  893|  2.71k|  dst += stride << 3;
  894|  2.71k|  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
  895|  2.71k|  dst += stride << 3;
  896|  2.71k|  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
  897|  2.71k|}
aom_smooth_predictor_16x4_ssse3:
 1018|  9.66k|                                     const uint8_t *left) {
 1019|  9.66k|  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
 1020|  9.66k|}
aom_smooth_predictor_16x8_ssse3:
 1025|  11.7k|                                     const uint8_t *left) {
 1026|  11.7k|  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
 1027|  11.7k|}
aom_smooth_predictor_16x16_ssse3:
 1031|  15.7k|                                      const uint8_t *left) {
 1032|  15.7k|  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
 1033|  15.7k|}
aom_smooth_predictor_16x32_ssse3:
 1037|  2.39k|                                      const uint8_t *left) {
 1038|  2.39k|  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
 1039|  2.39k|}
aom_smooth_predictor_16x64_ssse3:
 1044|    377|                                      const uint8_t *left) {
 1045|    377|  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
 1046|    377|}
aom_smooth_predictor_32x8_ssse3:
 1050|  3.25k|                                     const uint8_t *left) {
 1051|  3.25k|  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
 1052|  3.25k|}
aom_smooth_predictor_32x16_ssse3:
 1057|  2.47k|                                      const uint8_t *left) {
 1058|  2.47k|  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
 1059|  2.47k|}
aom_smooth_predictor_32x32_ssse3:
 1063|  6.05k|                                      const uint8_t *left) {
 1064|  6.05k|  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
 1065|  6.05k|}
aom_smooth_predictor_32x64_ssse3:
 1069|    199|                                      const uint8_t *left) {
 1070|    199|  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
 1071|    199|}
aom_smooth_predictor_64x16_ssse3:
 1076|    310|                                      const uint8_t *left) {
 1077|    310|  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
 1078|    310|}
aom_smooth_predictor_64x32_ssse3:
 1083|    203|                                      const uint8_t *left) {
 1084|    203|  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
 1085|    203|}
aom_smooth_predictor_64x64_ssse3:
 1089|    488|                                      const uint8_t *left) {
 1090|    488|  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
 1091|    488|}
aom_smooth_v_predictor_4x4_ssse3:
 1198|  17.5k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1199|  17.5k|  __m128i pixels;
 1200|  17.5k|  load_smooth_vertical_pixels4(top_row, left_column, 4, &pixels);
 1201|       |
 1202|  17.5k|  __m128i weights[2];
 1203|  17.5k|  load_smooth_vertical_weights4(smooth_weights, 4, weights);
 1204|       |
 1205|  17.5k|  write_smooth_vertical4xh(&pixels, weights, 4, dst, stride);
 1206|  17.5k|}
aom_smooth_v_predictor_4x8_ssse3:
 1211|  2.14k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1212|  2.14k|  __m128i pixels;
 1213|  2.14k|  load_smooth_vertical_pixels4(top_row, left_column, 8, &pixels);
 1214|       |
 1215|  2.14k|  __m128i weights[2];
 1216|  2.14k|  load_smooth_vertical_weights4(smooth_weights, 8, weights);
 1217|       |
 1218|  2.14k|  write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
 1219|  2.14k|}
aom_smooth_v_predictor_4x16_ssse3:
 1225|  1.31k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1226|  1.31k|  __m128i pixels;
 1227|  1.31k|  load_smooth_vertical_pixels4(top_row, left_column, 16, &pixels);
 1228|       |
 1229|  1.31k|  __m128i weights[4];
 1230|  1.31k|  load_smooth_vertical_weights4(smooth_weights, 16, weights);
 1231|       |
 1232|  1.31k|  write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
 1233|  1.31k|  dst += stride << 3;
 1234|  1.31k|  write_smooth_vertical4xh(&pixels, &weights[2], 8, dst, stride);
 1235|  1.31k|}
aom_smooth_v_predictor_8x4_ssse3:
 1241|  3.49k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1242|  3.49k|  const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
 1243|  3.49k|  const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
 1244|  3.49k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  3.49k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1245|  3.49k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 1246|  3.49k|  const __m128i scaled_bottom_left =
 1247|  3.49k|      _mm_mullo_epi16(inverted_weights, bottom_left);
 1248|  3.49k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  3.49k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1249|  3.49k|  __m128i y_select = _mm_set1_epi32(0x01000100);
 1250|  3.49k|  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
 1251|  3.49k|  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
 1252|  3.49k|  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1253|  3.49k|  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1254|  3.49k|                                &round);
 1255|  3.49k|  dst += stride;
 1256|  3.49k|  y_select = _mm_set1_epi32(0x03020302);
 1257|  3.49k|  weights_y = _mm_shuffle_epi8(weights, y_select);
 1258|  3.49k|  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1259|  3.49k|  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1260|  3.49k|                                &round);
 1261|  3.49k|  dst += stride;
 1262|  3.49k|  y_select = _mm_set1_epi32(0x05040504);
 1263|  3.49k|  weights_y = _mm_shuffle_epi8(weights, y_select);
 1264|  3.49k|  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1265|  3.49k|  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1266|  3.49k|                                &round);
 1267|  3.49k|  dst += stride;
 1268|  3.49k|  y_select = _mm_set1_epi32(0x07060706);
 1269|  3.49k|  weights_y = _mm_shuffle_epi8(weights, y_select);
 1270|  3.49k|  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1271|  3.49k|  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1272|  3.49k|                                &round);
 1273|  3.49k|}
aom_smooth_v_predictor_8x8_ssse3:
 1278|  10.1k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1279|  10.1k|  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
 1280|  10.1k|  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
 1281|  10.1k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  10.1k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1282|  10.1k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 1283|  10.1k|  const __m128i scaled_bottom_left =
 1284|  10.1k|      _mm_mullo_epi16(inverted_weights, bottom_left);
 1285|  10.1k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  10.1k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1286|  10.1k|  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
 1287|  90.9k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1287:33): [True: 80.8k, False: 10.1k]
  ------------------
 1288|  80.8k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1289|  80.8k|    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
 1290|  80.8k|    const __m128i scaled_bottom_left_y =
 1291|  80.8k|        _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1292|  80.8k|    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1293|  80.8k|                                  &round);
 1294|  80.8k|    dst += stride;
 1295|  80.8k|  }
 1296|  10.1k|}
aom_smooth_v_predictor_8x16_ssse3:
 1301|  1.98k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1302|  1.98k|  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
 1303|  1.98k|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 1304|       |
 1305|  1.98k|  const __m128i weights1 = cvtepu8_epi16(weights);
 1306|  1.98k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
 1307|  1.98k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  1.98k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1308|  1.98k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 1309|  1.98k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 1310|  1.98k|  const __m128i scaled_bottom_left1 =
 1311|  1.98k|      _mm_mullo_epi16(inverted_weights1, bottom_left);
 1312|  1.98k|  const __m128i scaled_bottom_left2 =
 1313|  1.98k|      _mm_mullo_epi16(inverted_weights2, bottom_left);
 1314|  1.98k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  1.98k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1315|  1.98k|  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
 1316|  17.8k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1316:33): [True: 15.8k, False: 1.98k]
  ------------------
 1317|  15.8k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1318|  15.8k|    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
 1319|  15.8k|    const __m128i scaled_bottom_left_y =
 1320|  15.8k|        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
 1321|  15.8k|    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1322|  15.8k|                                  &round);
 1323|  15.8k|    dst += stride;
 1324|  15.8k|  }
 1325|  17.8k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1325:33): [True: 15.8k, False: 1.98k]
  ------------------
 1326|  15.8k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1327|  15.8k|    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
 1328|  15.8k|    const __m128i scaled_bottom_left_y =
 1329|  15.8k|        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
 1330|  15.8k|    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1331|  15.8k|                                  &round);
 1332|  15.8k|    dst += stride;
 1333|  15.8k|  }
 1334|  1.98k|}
aom_smooth_v_predictor_8x32_ssse3:
 1340|    784|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1341|    784|  const __m128i zero = _mm_setzero_si128();
 1342|    784|  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
 1343|    784|  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
 1344|    784|  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
 1345|    784|  const __m128i weights1 = cvtepu8_epi16(weights_lo);
 1346|    784|  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
 1347|    784|  const __m128i weights3 = cvtepu8_epi16(weights_hi);
 1348|    784|  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
 1349|    784|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|    784|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1350|    784|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 1351|    784|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 1352|    784|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 1353|    784|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 1354|    784|  const __m128i scaled_bottom_left1 =
 1355|    784|      _mm_mullo_epi16(inverted_weights1, bottom_left);
 1356|    784|  const __m128i scaled_bottom_left2 =
 1357|    784|      _mm_mullo_epi16(inverted_weights2, bottom_left);
 1358|    784|  const __m128i scaled_bottom_left3 =
 1359|    784|      _mm_mullo_epi16(inverted_weights3, bottom_left);
 1360|    784|  const __m128i scaled_bottom_left4 =
 1361|    784|      _mm_mullo_epi16(inverted_weights4, bottom_left);
 1362|    784|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|    784|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1363|    784|  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
 1364|  7.05k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1364:33): [True: 6.27k, False: 784]
  ------------------
 1365|  6.27k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1366|  6.27k|    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
 1367|  6.27k|    const __m128i scaled_bottom_left_y =
 1368|  6.27k|        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
 1369|  6.27k|    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1370|  6.27k|                                  &round);
 1371|  6.27k|    dst += stride;
 1372|  6.27k|  }
 1373|  7.05k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1373:33): [True: 6.27k, False: 784]
  ------------------
 1374|  6.27k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1375|  6.27k|    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
 1376|  6.27k|    const __m128i scaled_bottom_left_y =
 1377|  6.27k|        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
 1378|  6.27k|    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1379|  6.27k|                                  &round);
 1380|  6.27k|    dst += stride;
 1381|  6.27k|  }
 1382|  7.05k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1382:33): [True: 6.27k, False: 784]
  ------------------
 1383|  6.27k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1384|  6.27k|    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
 1385|  6.27k|    const __m128i scaled_bottom_left_y =
 1386|  6.27k|        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
 1387|  6.27k|    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1388|  6.27k|                                  &round);
 1389|  6.27k|    dst += stride;
 1390|  6.27k|  }
 1391|  7.05k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1391:33): [True: 6.27k, False: 784]
  ------------------
 1392|  6.27k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1393|  6.27k|    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
 1394|  6.27k|    const __m128i scaled_bottom_left_y =
 1395|  6.27k|        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
 1396|  6.27k|    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1397|  6.27k|                                  &round);
 1398|  6.27k|    dst += stride;
 1399|  6.27k|  }
 1400|    784|}
aom_smooth_v_predictor_16x4_ssse3:
 1405|  2.91k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1406|  2.91k|  const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
 1407|  2.91k|  const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
 1408|  2.91k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  2.91k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1409|  2.91k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 1410|  2.91k|  const __m128i scaled_bottom_left =
 1411|  2.91k|      _mm_mullo_epi16(inverted_weights, bottom_left);
 1412|  2.91k|  const __m128i round = _mm_set1_epi16(128);
 1413|  2.91k|  const __m128i top = LoadUnaligned16(top_row);
 1414|  2.91k|  const __m128i top_lo = cvtepu8_epi16(top);
 1415|  2.91k|  const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
 1416|       |
 1417|  2.91k|  __m128i y_select = _mm_set1_epi32(0x01000100);
 1418|  2.91k|  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
 1419|  2.91k|  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1420|  2.91k|  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1421|  2.91k|                                 scaled_bottom_left_y, scaled_bottom_left_y,
 1422|  2.91k|                                 round);
 1423|  2.91k|  dst += stride;
 1424|  2.91k|  y_select = _mm_set1_epi32(0x03020302);
 1425|  2.91k|  weights_y = _mm_shuffle_epi8(weights, y_select);
 1426|  2.91k|  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1427|  2.91k|  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1428|  2.91k|                                 scaled_bottom_left_y, scaled_bottom_left_y,
 1429|  2.91k|                                 round);
 1430|  2.91k|  dst += stride;
 1431|  2.91k|  y_select = _mm_set1_epi32(0x05040504);
 1432|  2.91k|  weights_y = _mm_shuffle_epi8(weights, y_select);
 1433|  2.91k|  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1434|  2.91k|  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1435|  2.91k|                                 scaled_bottom_left_y, scaled_bottom_left_y,
 1436|  2.91k|                                 round);
 1437|  2.91k|  dst += stride;
 1438|  2.91k|  y_select = _mm_set1_epi32(0x07060706);
 1439|  2.91k|  weights_y = _mm_shuffle_epi8(weights, y_select);
 1440|  2.91k|  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1441|  2.91k|  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1442|  2.91k|                                 scaled_bottom_left_y, scaled_bottom_left_y,
 1443|  2.91k|                                 round);
 1444|  2.91k|}
aom_smooth_v_predictor_16x8_ssse3:
 1450|  3.43k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1451|  3.43k|  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
 1452|  3.43k|  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
 1453|  3.43k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  3.43k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1454|  3.43k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 1455|  3.43k|  const __m128i scaled_bottom_left =
 1456|  3.43k|      _mm_mullo_epi16(inverted_weights, bottom_left);
 1457|  3.43k|  const __m128i round = _mm_set1_epi16(128);
 1458|  3.43k|  const __m128i top = LoadUnaligned16(top_row);
 1459|  3.43k|  const __m128i top_lo = cvtepu8_epi16(top);
 1460|  3.43k|  const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
 1461|  30.8k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1461:33): [True: 27.4k, False: 3.43k]
  ------------------
 1462|  27.4k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1463|  27.4k|    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
 1464|  27.4k|    const __m128i scaled_bottom_left_y =
 1465|  27.4k|        _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1466|  27.4k|    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1467|  27.4k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1468|  27.4k|                                   round);
 1469|  27.4k|    dst += stride;
 1470|  27.4k|  }
 1471|  3.43k|}
aom_smooth_v_predictor_16x16_ssse3:
 1476|  4.01k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1477|  4.01k|  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
 1478|  4.01k|  const __m128i zero = _mm_setzero_si128();
 1479|  4.01k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  4.01k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1480|  4.01k|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 1481|  4.01k|  const __m128i weights_lo = cvtepu8_epi16(weights);
 1482|  4.01k|  const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
 1483|  4.01k|  const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
 1484|  4.01k|  const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
 1485|  4.01k|  const __m128i scaled_bottom_left_lo =
 1486|  4.01k|      _mm_mullo_epi16(inverted_weights_lo, bottom_left);
 1487|  4.01k|  const __m128i scaled_bottom_left_hi =
 1488|  4.01k|      _mm_mullo_epi16(inverted_weights_hi, bottom_left);
 1489|  4.01k|  const __m128i round = _mm_set1_epi16(128);
 1490|       |
 1491|  4.01k|  const __m128i top = LoadUnaligned16(top_row);
 1492|  4.01k|  const __m128i top_lo = cvtepu8_epi16(top);
 1493|  4.01k|  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
 1494|  36.1k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1494:33): [True: 32.1k, False: 4.01k]
  ------------------
 1495|  32.1k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1496|  32.1k|    const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
 1497|  32.1k|    const __m128i scaled_bottom_left_y =
 1498|  32.1k|        _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
 1499|  32.1k|    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1500|  32.1k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1501|  32.1k|                                   round);
 1502|  32.1k|    dst += stride;
 1503|  32.1k|  }
 1504|  36.1k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1504:33): [True: 32.1k, False: 4.01k]
  ------------------
 1505|  32.1k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1506|  32.1k|    const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
 1507|  32.1k|    const __m128i scaled_bottom_left_y =
 1508|  32.1k|        _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
 1509|  32.1k|    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1510|  32.1k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1511|  32.1k|                                   round);
 1512|  32.1k|    dst += stride;
 1513|  32.1k|  }
 1514|  4.01k|}
aom_smooth_v_predictor_16x32_ssse3:
 1519|    804|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1520|    804|  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
 1521|    804|  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
 1522|    804|  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
 1523|    804|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|    804|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1524|    804|  const __m128i zero = _mm_setzero_si128();
 1525|    804|  const __m128i weights1 = cvtepu8_epi16(weights_lo);
 1526|    804|  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
 1527|    804|  const __m128i weights3 = cvtepu8_epi16(weights_hi);
 1528|    804|  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
 1529|    804|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 1530|    804|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 1531|    804|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 1532|    804|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 1533|    804|  const __m128i scaled_bottom_left1 =
 1534|    804|      _mm_mullo_epi16(inverted_weights1, bottom_left);
 1535|    804|  const __m128i scaled_bottom_left2 =
 1536|    804|      _mm_mullo_epi16(inverted_weights2, bottom_left);
 1537|    804|  const __m128i scaled_bottom_left3 =
 1538|    804|      _mm_mullo_epi16(inverted_weights3, bottom_left);
 1539|    804|  const __m128i scaled_bottom_left4 =
 1540|    804|      _mm_mullo_epi16(inverted_weights4, bottom_left);
 1541|    804|  const __m128i round = _mm_set1_epi16(128);
 1542|       |
 1543|    804|  const __m128i top = LoadUnaligned16(top_row);
 1544|    804|  const __m128i top_lo = cvtepu8_epi16(top);
 1545|    804|  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
 1546|  7.23k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1546:33): [True: 6.43k, False: 804]
  ------------------
 1547|  6.43k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1548|  6.43k|    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
 1549|  6.43k|    const __m128i scaled_bottom_left_y =
 1550|  6.43k|        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
 1551|  6.43k|    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1552|  6.43k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1553|  6.43k|                                   round);
 1554|  6.43k|    dst += stride;
 1555|  6.43k|  }
 1556|  7.23k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1556:33): [True: 6.43k, False: 804]
  ------------------
 1557|  6.43k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1558|  6.43k|    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
 1559|  6.43k|    const __m128i scaled_bottom_left_y =
 1560|  6.43k|        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
 1561|  6.43k|    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1562|  6.43k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1563|  6.43k|                                   round);
 1564|  6.43k|    dst += stride;
 1565|  6.43k|  }
 1566|  7.23k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1566:33): [True: 6.43k, False: 804]
  ------------------
 1567|  6.43k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1568|  6.43k|    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
 1569|  6.43k|    const __m128i scaled_bottom_left_y =
 1570|  6.43k|        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
 1571|  6.43k|    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1572|  6.43k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1573|  6.43k|                                   round);
 1574|  6.43k|    dst += stride;
 1575|  6.43k|  }
 1576|  7.23k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1576:33): [True: 6.43k, False: 804]
  ------------------
 1577|  6.43k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1578|  6.43k|    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
 1579|  6.43k|    const __m128i scaled_bottom_left_y =
 1580|  6.43k|        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
 1581|  6.43k|    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1582|  6.43k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1583|  6.43k|                                   round);
 1584|  6.43k|    dst += stride;
 1585|  6.43k|  }
 1586|    804|}
aom_smooth_v_predictor_16x64_ssse3:
 1592|    137|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1593|    137|  const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
 1594|    137|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|    137|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1595|    137|  const __m128i round = _mm_set1_epi16(128);
 1596|    137|  const __m128i zero = _mm_setzero_si128();
 1597|    137|  const __m128i top = LoadUnaligned16(top_row);
 1598|    137|  const __m128i top_lo = cvtepu8_epi16(top);
 1599|    137|  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
 1600|    137|  const uint8_t *weights_base_ptr = smooth_weights + 60;
 1601|    685|  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
  ------------------
  |  Branch (1601:29): [True: 548, False: 137]
  ------------------
 1602|    548|    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
 1603|    548|    const __m128i weights_lo = cvtepu8_epi16(weights);
 1604|    548|    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
 1605|    548|    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
 1606|    548|    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
 1607|    548|    const __m128i scaled_bottom_left_lo =
 1608|    548|        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
 1609|    548|    const __m128i scaled_bottom_left_hi =
 1610|    548|        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
 1611|       |
 1612|  4.93k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1612:35): [True: 4.38k, False: 548]
  ------------------
 1613|  4.38k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 1614|  4.38k|      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
 1615|  4.38k|      const __m128i scaled_bottom_left_y =
 1616|  4.38k|          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
 1617|  4.38k|      write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1618|  4.38k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 1619|  4.38k|                                     round);
 1620|  4.38k|      dst += stride;
 1621|  4.38k|    }
 1622|  4.93k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1622:35): [True: 4.38k, False: 548]
  ------------------
 1623|  4.38k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 1624|  4.38k|      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
 1625|  4.38k|      const __m128i scaled_bottom_left_y =
 1626|  4.38k|          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
 1627|  4.38k|      write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1628|  4.38k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 1629|  4.38k|                                     round);
 1630|  4.38k|      dst += stride;
 1631|  4.38k|    }
 1632|    548|  }
 1633|    137|}
aom_smooth_v_predictor_32x8_ssse3:
 1638|  1.01k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1639|  1.01k|  const __m128i zero = _mm_setzero_si128();
 1640|  1.01k|  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
 1641|  1.01k|  const __m128i top_lo = LoadUnaligned16(top_row);
 1642|  1.01k|  const __m128i top_hi = LoadUnaligned16(top_row + 16);
 1643|  1.01k|  const __m128i top1 = cvtepu8_epi16(top_lo);
 1644|  1.01k|  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
 1645|  1.01k|  const __m128i top3 = cvtepu8_epi16(top_hi);
 1646|  1.01k|  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
 1647|  1.01k|  __m128i scale = _mm_set1_epi16(256);
 1648|  1.01k|  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
 1649|  1.01k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 1650|  1.01k|  const __m128i scaled_bottom_left =
 1651|  1.01k|      _mm_mullo_epi16(inverted_weights, bottom_left);
 1652|  1.01k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  1.01k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1653|  9.10k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1653:33): [True: 8.09k, False: 1.01k]
  ------------------
 1654|  8.09k|    __m128i y_select = _mm_set1_epi32(y_mask);
 1655|  8.09k|    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
 1656|  8.09k|    const __m128i scaled_bottom_left_y =
 1657|  8.09k|        _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1658|  8.09k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1659|  8.09k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1660|  8.09k|                                   round);
 1661|  8.09k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1662|  8.09k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1663|  8.09k|                                   round);
 1664|  8.09k|    dst += stride;
 1665|  8.09k|  }
 1666|  1.01k|}
aom_smooth_v_predictor_32x16_ssse3:
 1672|    605|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1673|    605|  const __m128i zero = _mm_setzero_si128();
 1674|    605|  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
 1675|    605|  const __m128i top_lo = LoadUnaligned16(top_row);
 1676|    605|  const __m128i top_hi = LoadUnaligned16(top_row + 16);
 1677|    605|  const __m128i top1 = cvtepu8_epi16(top_lo);
 1678|    605|  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
 1679|    605|  const __m128i top3 = cvtepu8_epi16(top_hi);
 1680|    605|  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
 1681|    605|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 1682|    605|  const __m128i weights1 = cvtepu8_epi16(weights);
 1683|    605|  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
 1684|    605|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|    605|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1685|    605|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 1686|    605|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 1687|    605|  const __m128i scaled_bottom_left1 =
 1688|    605|      _mm_mullo_epi16(inverted_weights1, bottom_left);
 1689|    605|  const __m128i scaled_bottom_left2 =
 1690|    605|      _mm_mullo_epi16(inverted_weights2, bottom_left);
 1691|    605|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|    605|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1692|  5.44k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1692:33): [True: 4.84k, False: 605]
  ------------------
 1693|  4.84k|    __m128i y_select = _mm_set1_epi32(y_mask);
 1694|  4.84k|    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
 1695|  4.84k|    const __m128i scaled_bottom_left_y =
 1696|  4.84k|        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
 1697|  4.84k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1698|  4.84k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1699|  4.84k|                                   round);
 1700|  4.84k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1701|  4.84k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1702|  4.84k|                                   round);
 1703|  4.84k|    dst += stride;
 1704|  4.84k|  }
 1705|  5.44k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1705:33): [True: 4.84k, False: 605]
  ------------------
 1706|  4.84k|    __m128i y_select = _mm_set1_epi32(y_mask);
 1707|  4.84k|    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
 1708|  4.84k|    const __m128i scaled_bottom_left_y =
 1709|  4.84k|        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
 1710|  4.84k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1711|  4.84k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1712|  4.84k|                                   round);
 1713|  4.84k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1714|  4.84k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1715|  4.84k|                                   round);
 1716|  4.84k|    dst += stride;
 1717|  4.84k|  }
 1718|    605|}
aom_smooth_v_predictor_32x32_ssse3:
 1723|  2.28k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1724|  2.28k|  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
 1725|  2.28k|  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
 1726|  2.28k|  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
 1727|  2.28k|  const __m128i zero = _mm_setzero_si128();
 1728|  2.28k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  2.28k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1729|  2.28k|  const __m128i top_lo = LoadUnaligned16(top_row);
 1730|  2.28k|  const __m128i top_hi = LoadUnaligned16(top_row + 16);
 1731|  2.28k|  const __m128i top1 = cvtepu8_epi16(top_lo);
 1732|  2.28k|  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
 1733|  2.28k|  const __m128i top3 = cvtepu8_epi16(top_hi);
 1734|  2.28k|  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
 1735|  2.28k|  const __m128i weights1 = cvtepu8_epi16(weights_lo);
 1736|  2.28k|  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
 1737|  2.28k|  const __m128i weights3 = cvtepu8_epi16(weights_hi);
 1738|  2.28k|  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
 1739|  2.28k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 1740|  2.28k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 1741|  2.28k|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 1742|  2.28k|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 1743|  2.28k|  const __m128i scaled_bottom_left1 =
 1744|  2.28k|      _mm_mullo_epi16(inverted_weights1, bottom_left);
 1745|  2.28k|  const __m128i scaled_bottom_left2 =
 1746|  2.28k|      _mm_mullo_epi16(inverted_weights2, bottom_left);
 1747|  2.28k|  const __m128i scaled_bottom_left3 =
 1748|  2.28k|      _mm_mullo_epi16(inverted_weights3, bottom_left);
 1749|  2.28k|  const __m128i scaled_bottom_left4 =
 1750|  2.28k|      _mm_mullo_epi16(inverted_weights4, bottom_left);
 1751|  2.28k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  2.28k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1752|  20.5k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1752:33): [True: 18.2k, False: 2.28k]
  ------------------
 1753|  18.2k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1754|  18.2k|    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
 1755|  18.2k|    const __m128i scaled_bottom_left_y =
 1756|  18.2k|        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
 1757|  18.2k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1758|  18.2k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1759|  18.2k|                                   round);
 1760|  18.2k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1761|  18.2k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1762|  18.2k|                                   round);
 1763|  18.2k|    dst += stride;
 1764|  18.2k|  }
 1765|  20.5k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1765:33): [True: 18.2k, False: 2.28k]
  ------------------
 1766|  18.2k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1767|  18.2k|    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
 1768|  18.2k|    const __m128i scaled_bottom_left_y =
 1769|  18.2k|        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
 1770|  18.2k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1771|  18.2k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1772|  18.2k|                                   round);
 1773|  18.2k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1774|  18.2k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1775|  18.2k|                                   round);
 1776|  18.2k|    dst += stride;
 1777|  18.2k|  }
 1778|  20.5k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1778:33): [True: 18.2k, False: 2.28k]
  ------------------
 1779|  18.2k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1780|  18.2k|    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
 1781|  18.2k|    const __m128i scaled_bottom_left_y =
 1782|  18.2k|        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
 1783|  18.2k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1784|  18.2k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1785|  18.2k|                                   round);
 1786|  18.2k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1787|  18.2k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1788|  18.2k|                                   round);
 1789|  18.2k|    dst += stride;
 1790|  18.2k|  }
 1791|  20.5k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1791:33): [True: 18.2k, False: 2.28k]
  ------------------
 1792|  18.2k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1793|  18.2k|    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
 1794|  18.2k|    const __m128i scaled_bottom_left_y =
 1795|  18.2k|        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
 1796|  18.2k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1797|  18.2k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1798|  18.2k|                                   round);
 1799|  18.2k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1800|  18.2k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1801|  18.2k|                                   round);
 1802|  18.2k|    dst += stride;
 1803|  18.2k|  }
 1804|  2.28k|}
aom_smooth_v_predictor_32x64_ssse3:
 1809|     76|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1810|     76|  const __m128i zero = _mm_setzero_si128();
 1811|     76|  const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
 1812|     76|  const __m128i top_lo = LoadUnaligned16(top_row);
 1813|     76|  const __m128i top_hi = LoadUnaligned16(top_row + 16);
 1814|     76|  const __m128i top1 = cvtepu8_epi16(top_lo);
 1815|     76|  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
 1816|     76|  const __m128i top3 = cvtepu8_epi16(top_hi);
 1817|     76|  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
 1818|     76|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|     76|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1819|     76|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|     76|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1820|     76|  const uint8_t *weights_base_ptr = smooth_weights + 60;
 1821|    380|  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
  ------------------
  |  Branch (1821:29): [True: 304, False: 76]
  ------------------
 1822|    304|    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
 1823|    304|    const __m128i weights_lo = cvtepu8_epi16(weights);
 1824|    304|    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
 1825|    304|    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
 1826|    304|    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
 1827|    304|    const __m128i scaled_bottom_left_lo =
 1828|    304|        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
 1829|    304|    const __m128i scaled_bottom_left_hi =
 1830|    304|        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
 1831|       |
 1832|  2.73k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1832:35): [True: 2.43k, False: 304]
  ------------------
 1833|  2.43k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 1834|  2.43k|      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
 1835|  2.43k|      const __m128i scaled_bottom_left_y =
 1836|  2.43k|          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
 1837|  2.43k|      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1838|  2.43k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 1839|  2.43k|                                     round);
 1840|  2.43k|      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1841|  2.43k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 1842|  2.43k|                                     round);
 1843|  2.43k|      dst += stride;
 1844|  2.43k|    }
 1845|  2.73k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1845:35): [True: 2.43k, False: 304]
  ------------------
 1846|  2.43k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 1847|  2.43k|      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
 1848|  2.43k|      const __m128i scaled_bottom_left_y =
 1849|  2.43k|          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
 1850|  2.43k|      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1851|  2.43k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 1852|  2.43k|                                     round);
 1853|  2.43k|      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1854|  2.43k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 1855|  2.43k|                                     round);
 1856|  2.43k|      dst += stride;
 1857|  2.43k|    }
 1858|    304|  }
 1859|     76|}
aom_smooth_v_predictor_64x16_ssse3:
 1865|     69|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1866|     69|  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
 1867|     69|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|     69|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1868|     69|  const __m128i zero = _mm_setzero_si128();
 1869|     69|  const __m128i top_lolo = LoadUnaligned16(top_row);
 1870|     69|  const __m128i top_lohi = LoadUnaligned16(top_row + 16);
 1871|     69|  const __m128i top1 = cvtepu8_epi16(top_lolo);
 1872|     69|  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
 1873|     69|  const __m128i top3 = cvtepu8_epi16(top_lohi);
 1874|     69|  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
 1875|       |
 1876|     69|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 1877|     69|  const __m128i weights1 = cvtepu8_epi16(weights);
 1878|     69|  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
 1879|     69|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 1880|     69|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 1881|     69|  const __m128i top_hilo = LoadUnaligned16(top_row + 32);
 1882|     69|  const __m128i top_hihi = LoadUnaligned16(top_row + 48);
 1883|     69|  const __m128i top5 = cvtepu8_epi16(top_hilo);
 1884|     69|  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
 1885|     69|  const __m128i top7 = cvtepu8_epi16(top_hihi);
 1886|     69|  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
 1887|     69|  const __m128i scaled_bottom_left1 =
 1888|     69|      _mm_mullo_epi16(inverted_weights1, bottom_left);
 1889|     69|  const __m128i scaled_bottom_left2 =
 1890|     69|      _mm_mullo_epi16(inverted_weights2, bottom_left);
 1891|     69|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|     69|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1892|    621|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1892:33): [True: 552, False: 69]
  ------------------
 1893|    552|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1894|    552|    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
 1895|    552|    const __m128i scaled_bottom_left_y =
 1896|    552|        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
 1897|    552|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1898|    552|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1899|    552|                                   round);
 1900|    552|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1901|    552|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1902|    552|                                   round);
 1903|    552|    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
 1904|    552|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1905|    552|                                   round);
 1906|    552|    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
 1907|    552|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1908|    552|                                   round);
 1909|    552|    dst += stride;
 1910|    552|  }
 1911|    621|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1911:33): [True: 552, False: 69]
  ------------------
 1912|    552|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1913|    552|    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
 1914|    552|    const __m128i scaled_bottom_left_y =
 1915|    552|        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
 1916|    552|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1917|    552|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1918|    552|                                   round);
 1919|    552|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1920|    552|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1921|    552|                                   round);
 1922|    552|    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
 1923|    552|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1924|    552|                                   round);
 1925|    552|    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
 1926|    552|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1927|    552|                                   round);
 1928|    552|    dst += stride;
 1929|    552|  }
 1930|     69|}
aom_smooth_v_predictor_64x32_ssse3:
 1936|     84|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1937|     84|  const __m128i zero = _mm_setzero_si128();
 1938|     84|  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
 1939|     84|  const __m128i top_lolo = LoadUnaligned16(top_row);
 1940|     84|  const __m128i top_lohi = LoadUnaligned16(top_row + 16);
 1941|     84|  const __m128i top1 = cvtepu8_epi16(top_lolo);
 1942|     84|  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
 1943|     84|  const __m128i top3 = cvtepu8_epi16(top_lohi);
 1944|     84|  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
 1945|     84|  const __m128i top_hilo = LoadUnaligned16(top_row + 32);
 1946|     84|  const __m128i top_hihi = LoadUnaligned16(top_row + 48);
 1947|     84|  const __m128i top5 = cvtepu8_epi16(top_hilo);
 1948|     84|  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
 1949|     84|  const __m128i top7 = cvtepu8_epi16(top_hihi);
 1950|     84|  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
 1951|     84|  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
 1952|     84|  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
 1953|     84|  const __m128i weights1 = cvtepu8_epi16(weights_lo);
 1954|     84|  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
 1955|     84|  const __m128i weights3 = cvtepu8_epi16(weights_hi);
 1956|     84|  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
 1957|     84|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|     84|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1958|     84|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 1959|     84|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 1960|     84|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 1961|     84|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 1962|     84|  const __m128i scaled_bottom_left1 =
 1963|     84|      _mm_mullo_epi16(inverted_weights1, bottom_left);
 1964|     84|  const __m128i scaled_bottom_left2 =
 1965|     84|      _mm_mullo_epi16(inverted_weights2, bottom_left);
 1966|     84|  const __m128i scaled_bottom_left3 =
 1967|     84|      _mm_mullo_epi16(inverted_weights3, bottom_left);
 1968|     84|  const __m128i scaled_bottom_left4 =
 1969|     84|      _mm_mullo_epi16(inverted_weights4, bottom_left);
 1970|     84|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|     84|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1971|       |
 1972|    756|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1972:33): [True: 672, False: 84]
  ------------------
 1973|    672|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1974|    672|    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
 1975|    672|    const __m128i scaled_bottom_left_y =
 1976|    672|        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
 1977|    672|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1978|    672|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1979|    672|                                   round);
 1980|    672|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1981|    672|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1982|    672|                                   round);
 1983|    672|    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
 1984|    672|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1985|    672|                                   round);
 1986|    672|    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
 1987|    672|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1988|    672|                                   round);
 1989|    672|    dst += stride;
 1990|    672|  }
 1991|    756|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1991:33): [True: 672, False: 84]
  ------------------
 1992|    672|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1993|    672|    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
 1994|    672|    const __m128i scaled_bottom_left_y =
 1995|    672|        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
 1996|    672|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1997|    672|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1998|    672|                                   round);
 1999|    672|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 2000|    672|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2001|    672|                                   round);
 2002|    672|    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
 2003|    672|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2004|    672|                                   round);
 2005|    672|    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
 2006|    672|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2007|    672|                                   round);
 2008|    672|    dst += stride;
 2009|    672|  }
 2010|    756|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2010:33): [True: 672, False: 84]
  ------------------
 2011|    672|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2012|    672|    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
 2013|    672|    const __m128i scaled_bottom_left_y =
 2014|    672|        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
 2015|    672|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 2016|    672|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2017|    672|                                   round);
 2018|    672|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 2019|    672|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2020|    672|                                   round);
 2021|    672|    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
 2022|    672|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2023|    672|                                   round);
 2024|    672|    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
 2025|    672|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2026|    672|                                   round);
 2027|    672|    dst += stride;
 2028|    672|  }
 2029|    756|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2029:33): [True: 672, False: 84]
  ------------------
 2030|    672|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2031|    672|    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
 2032|    672|    const __m128i scaled_bottom_left_y =
 2033|    672|        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
 2034|    672|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 2035|    672|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2036|    672|                                   round);
 2037|    672|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 2038|    672|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2039|    672|                                   round);
 2040|    672|    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
 2041|    672|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2042|    672|                                   round);
 2043|    672|    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
 2044|    672|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2045|    672|                                   round);
 2046|    672|    dst += stride;
 2047|    672|  }
 2048|     84|}
aom_smooth_v_predictor_64x64_ssse3:
 2053|    173|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2054|    173|  const __m128i zero = _mm_setzero_si128();
 2055|    173|  const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
 2056|    173|  const __m128i top_lolo = LoadUnaligned16(top_row);
 2057|    173|  const __m128i top_lohi = LoadUnaligned16(top_row + 16);
 2058|    173|  const __m128i top1 = cvtepu8_epi16(top_lolo);
 2059|    173|  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
 2060|    173|  const __m128i top3 = cvtepu8_epi16(top_lohi);
 2061|    173|  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
 2062|    173|  const __m128i top_hilo = LoadUnaligned16(top_row + 32);
 2063|    173|  const __m128i top_hihi = LoadUnaligned16(top_row + 48);
 2064|    173|  const __m128i top5 = cvtepu8_epi16(top_hilo);
 2065|    173|  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
 2066|    173|  const __m128i top7 = cvtepu8_epi16(top_hihi);
 2067|    173|  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
 2068|    173|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|    173|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2069|    173|  const __m128i round = _mm_set1_epi16(128);
 2070|    173|  const uint8_t *weights_base_ptr = smooth_weights + 60;
 2071|    865|  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
  ------------------
  |  Branch (2071:29): [True: 692, False: 173]
  ------------------
 2072|    692|    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
 2073|    692|    const __m128i weights_lo = cvtepu8_epi16(weights);
 2074|    692|    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
 2075|    692|    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
 2076|    692|    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
 2077|    692|    const __m128i scaled_bottom_left_lo =
 2078|    692|        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
 2079|    692|    const __m128i scaled_bottom_left_hi =
 2080|    692|        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
 2081|  6.22k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2081:35): [True: 5.53k, False: 692]
  ------------------
 2082|  5.53k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 2083|  5.53k|      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
 2084|  5.53k|      const __m128i scaled_bottom_left_y =
 2085|  5.53k|          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
 2086|  5.53k|      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 2087|  5.53k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 2088|  5.53k|                                     round);
 2089|  5.53k|      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 2090|  5.53k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 2091|  5.53k|                                     round);
 2092|  5.53k|      write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
 2093|  5.53k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 2094|  5.53k|                                     round);
 2095|  5.53k|      write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
 2096|  5.53k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 2097|  5.53k|                                     round);
 2098|  5.53k|      dst += stride;
 2099|  5.53k|    }
 2100|  6.22k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2100:35): [True: 5.53k, False: 692]
  ------------------
 2101|  5.53k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 2102|  5.53k|      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
 2103|  5.53k|      const __m128i scaled_bottom_left_y =
 2104|  5.53k|          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
 2105|  5.53k|      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 2106|  5.53k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 2107|  5.53k|                                     round);
 2108|  5.53k|      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 2109|  5.53k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 2110|  5.53k|                                     round);
 2111|  5.53k|      write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
 2112|  5.53k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 2113|  5.53k|                                     round);
 2114|  5.53k|      write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
 2115|  5.53k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 2116|  5.53k|                                     round);
 2117|  5.53k|      dst += stride;
 2118|  5.53k|    }
 2119|    692|  }
 2120|    173|}
aom_smooth_h_predictor_4x4_ssse3:
 2138|  22.9k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2139|  22.9k|  const __m128i top_right = _mm_set1_epi32(top_row[3]);
 2140|  22.9k|  const __m128i left = cvtepu8_epi32(Load4(left_column));
 2141|  22.9k|  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
 2142|  22.9k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  22.9k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2143|  22.9k|  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
 2144|  22.9k|  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
 2145|  22.9k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  22.9k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2146|  22.9k|  __m128i left_y = _mm_shuffle_epi32(left, 0);
 2147|  22.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2148|  22.9k|                               &round);
 2149|  22.9k|  dst += stride;
 2150|  22.9k|  left_y = _mm_shuffle_epi32(left, 0x55);
 2151|  22.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2152|  22.9k|                               &round);
 2153|  22.9k|  dst += stride;
 2154|  22.9k|  left_y = _mm_shuffle_epi32(left, 0xaa);
 2155|  22.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2156|  22.9k|                               &round);
 2157|  22.9k|  dst += stride;
 2158|       |  left_y = _mm_shuffle_epi32(left, 0xff);
 2159|  22.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2160|  22.9k|                               &round);
 2161|  22.9k|}
aom_smooth_h_predictor_4x8_ssse3:
 2166|  3.59k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2167|  3.59k|  const __m128i top_right = _mm_set1_epi32(top_row[3]);
 2168|  3.59k|  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
 2169|  3.59k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  3.59k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2170|  3.59k|  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
 2171|  3.59k|  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
 2172|  3.59k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  3.59k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2173|  3.59k|  __m128i left = cvtepu8_epi32(Load4(left_column));
 2174|  3.59k|  __m128i left_y = _mm_shuffle_epi32(left, 0);
 2175|  3.59k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2176|  3.59k|                               &round);
 2177|  3.59k|  dst += stride;
 2178|  3.59k|  left_y = _mm_shuffle_epi32(left, 0x55);
 2179|  3.59k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2180|  3.59k|                               &round);
 2181|  3.59k|  dst += stride;
 2182|  3.59k|  left_y = _mm_shuffle_epi32(left, 0xaa);
 2183|  3.59k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2184|  3.59k|                               &round);
 2185|  3.59k|  dst += stride;
 2186|  3.59k|  left_y = _mm_shuffle_epi32(left, 0xff);
 2187|  3.59k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2188|  3.59k|                               &round);
 2189|  3.59k|  dst += stride;
 2190|       |
 2191|  3.59k|  left = cvtepu8_epi32(Load4(left_column + 4));
 2192|  3.59k|  left_y = _mm_shuffle_epi32(left, 0);
 2193|  3.59k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2194|  3.59k|                               &round);
 2195|  3.59k|  dst += stride;
 2196|  3.59k|  left_y = _mm_shuffle_epi32(left, 0x55);
 2197|  3.59k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2198|  3.59k|                               &round);
 2199|  3.59k|  dst += stride;
 2200|  3.59k|  left_y = _mm_shuffle_epi32(left, 0xaa);
 2201|  3.59k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2202|  3.59k|                               &round);
 2203|  3.59k|  dst += stride;
 2204|       |  left_y = _mm_shuffle_epi32(left, 0xff);
 2205|  3.59k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2206|  3.59k|                               &round);
 2207|  3.59k|}
aom_smooth_h_predictor_4x16_ssse3:
 2213|  1.77k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2214|  1.77k|  const __m128i top_right = _mm_set1_epi32(top_row[3]);
 2215|  1.77k|  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
 2216|  1.77k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  1.77k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2217|  1.77k|  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
 2218|  1.77k|  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
 2219|  1.77k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  1.77k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2220|  1.77k|  __m128i left = cvtepu8_epi32(Load4(left_column));
 2221|  1.77k|  __m128i left_y = _mm_shuffle_epi32(left, 0);
 2222|  1.77k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2223|  1.77k|                               &round);
 2224|  1.77k|  dst += stride;
 2225|  1.77k|  left_y = _mm_shuffle_epi32(left, 0x55);
 2226|  1.77k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2227|  1.77k|                               &round);
 2228|  1.77k|  dst += stride;
 2229|  1.77k|  left_y = _mm_shuffle_epi32(left, 0xaa);
 2230|  1.77k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2231|  1.77k|                               &round);
 2232|  1.77k|  dst += stride;
 2233|  1.77k|  left_y = _mm_shuffle_epi32(left, 0xff);
 2234|  1.77k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2235|  1.77k|                               &round);
 2236|  1.77k|  dst += stride;
 2237|       |
 2238|  1.77k|  left = cvtepu8_epi32(Load4(left_column + 4));
 2239|  1.77k|  left_y = _mm_shuffle_epi32(left, 0);
 2240|  1.77k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2241|  1.77k|                               &round);
 2242|  1.77k|  dst += stride;
 2243|  1.77k|  left_y = _mm_shuffle_epi32(left, 0x55);
 2244|  1.77k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2245|  1.77k|                               &round);
 2246|  1.77k|  dst += stride;
 2247|  1.77k|  left_y = _mm_shuffle_epi32(left, 0xaa);
 2248|  1.77k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2249|  1.77k|                               &round);
 2250|  1.77k|  dst += stride;
 2251|  1.77k|  left_y = _mm_shuffle_epi32(left, 0xff);
 2252|  1.77k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2253|  1.77k|                               &round);
 2254|  1.77k|  dst += stride;
 2255|       |
 2256|  1.77k|  left = cvtepu8_epi32(Load4(left_column + 8));
 2257|  1.77k|  left_y = _mm_shuffle_epi32(left, 0);
 2258|  1.77k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2259|  1.77k|                               &round);
 2260|  1.77k|  dst += stride;
 2261|  1.77k|  left_y = _mm_shuffle_epi32(left, 0x55);
 2262|  1.77k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2263|  1.77k|                               &round);
 2264|  1.77k|  dst += stride;
 2265|  1.77k|  left_y = _mm_shuffle_epi32(left, 0xaa);
 2266|  1.77k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2267|  1.77k|                               &round);
 2268|  1.77k|  dst += stride;
 2269|  1.77k|  left_y = _mm_shuffle_epi32(left, 0xff);
 2270|  1.77k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2271|  1.77k|                               &round);
 2272|  1.77k|  dst += stride;
 2273|       |
 2274|  1.77k|  left = cvtepu8_epi32(Load4(left_column + 12));
 2275|  1.77k|  left_y = _mm_shuffle_epi32(left, 0);
 2276|  1.77k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2277|  1.77k|                               &round);
 2278|  1.77k|  dst += stride;
 2279|  1.77k|  left_y = _mm_shuffle_epi32(left, 0x55);
 2280|  1.77k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2281|  1.77k|                               &round);
 2282|  1.77k|  dst += stride;
 2283|  1.77k|  left_y = _mm_shuffle_epi32(left, 0xaa);
 2284|  1.77k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2285|  1.77k|                               &round);
 2286|  1.77k|  dst += stride;
 2287|       |  left_y = _mm_shuffle_epi32(left, 0xff);
 2288|  1.77k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2289|  1.77k|                               &round);
 2290|  1.77k|}
aom_smooth_h_predictor_8x4_ssse3:
 2299|  5.22k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2300|  5.22k|  const __m128i top_right = _mm_set1_epi16(top_row[7]);
 2301|  5.22k|  const __m128i left = cvtepu8_epi16(Load4(left_column));
 2302|  5.22k|  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
 2303|  5.22k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  5.22k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2304|  5.22k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 2305|  5.22k|  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
 2306|  5.22k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  5.22k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2307|  5.22k|  __m128i y_select = _mm_set1_epi32(0x01000100);
 2308|  5.22k|  __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2309|  5.22k|  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2310|  5.22k|                                &round);
 2311|  5.22k|  dst += stride;
 2312|  5.22k|  y_select = _mm_set1_epi32(0x03020302);
 2313|  5.22k|  left_y = _mm_shuffle_epi8(left, y_select);
 2314|  5.22k|  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2315|  5.22k|                                &round);
 2316|  5.22k|  dst += stride;
 2317|  5.22k|  y_select = _mm_set1_epi32(0x05040504);
 2318|  5.22k|  left_y = _mm_shuffle_epi8(left, y_select);
 2319|  5.22k|  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2320|  5.22k|                                &round);
 2321|  5.22k|  dst += stride;
 2322|  5.22k|  y_select = _mm_set1_epi32(0x07060706);
 2323|  5.22k|  left_y = _mm_shuffle_epi8(left, y_select);
 2324|  5.22k|  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2325|  5.22k|                                &round);
 2326|  5.22k|}
aom_smooth_h_predictor_8x8_ssse3:
 2331|  13.4k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2332|  13.4k|  const __m128i top_right = _mm_set1_epi16(top_row[7]);
 2333|  13.4k|  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
 2334|  13.4k|  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
 2335|  13.4k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  13.4k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2336|  13.4k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 2337|  13.4k|  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
 2338|  13.4k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  13.4k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2339|   121k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2339:33): [True: 107k, False: 13.4k]
  ------------------
 2340|   107k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2341|   107k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2342|   107k|    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2343|   107k|                                  &round);
 2344|   107k|    dst += stride;
 2345|   107k|  }
 2346|  13.4k|}
aom_smooth_h_predictor_8x16_ssse3:
 2351|  3.20k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2352|  3.20k|  const __m128i top_right = _mm_set1_epi16(top_row[7]);
 2353|  3.20k|  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
 2354|  3.20k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  3.20k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2355|  3.20k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 2356|  3.20k|  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
 2357|  3.20k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  3.20k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2358|  3.20k|  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
 2359|  28.8k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2359:33): [True: 25.6k, False: 3.20k]
  ------------------
 2360|  25.6k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2361|  25.6k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2362|  25.6k|    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2363|  25.6k|                                  &round);
 2364|  25.6k|    dst += stride;
 2365|  25.6k|  }
 2366|  3.20k|  left = cvtepu8_epi16(LoadLo8(left_column + 8));
 2367|  28.8k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2367:33): [True: 25.6k, False: 3.20k]
  ------------------
 2368|  25.6k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2369|  25.6k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2370|  25.6k|    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2371|  25.6k|                                  &round);
 2372|  25.6k|    dst += stride;
 2373|  25.6k|  }
 2374|  3.20k|}
aom_smooth_h_predictor_8x32_ssse3:
 2380|  1.38k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2381|  1.38k|  const __m128i top_right = _mm_set1_epi16(top_row[7]);
 2382|  1.38k|  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
 2383|  1.38k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  1.38k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2384|  1.38k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 2385|  1.38k|  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
 2386|  1.38k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  1.38k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2387|  1.38k|  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
 2388|  12.5k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2388:33): [True: 11.1k, False: 1.38k]
  ------------------
 2389|  11.1k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2390|  11.1k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2391|  11.1k|    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2392|  11.1k|                                  &round);
 2393|  11.1k|    dst += stride;
 2394|  11.1k|  }
 2395|  1.38k|  left = cvtepu8_epi16(LoadLo8(left_column + 8));
 2396|  12.5k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2396:33): [True: 11.1k, False: 1.38k]
  ------------------
 2397|  11.1k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2398|  11.1k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2399|  11.1k|    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2400|  11.1k|                                  &round);
 2401|  11.1k|    dst += stride;
 2402|  11.1k|  }
 2403|  1.38k|  left = cvtepu8_epi16(LoadLo8(left_column + 16));
 2404|  12.5k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2404:33): [True: 11.1k, False: 1.38k]
  ------------------
 2405|  11.1k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2406|  11.1k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2407|  11.1k|    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2408|  11.1k|                                  &round);
 2409|  11.1k|    dst += stride;
 2410|  11.1k|  }
 2411|  1.38k|  left = cvtepu8_epi16(LoadLo8(left_column + 24));
 2412|  12.5k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2412:33): [True: 11.1k, False: 1.38k]
  ------------------
 2413|  11.1k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2414|  11.1k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2415|  11.1k|    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2416|  11.1k|                                  &round);
 2417|  11.1k|    dst += stride;
 2418|  11.1k|  }
 2419|  1.38k|}
aom_smooth_h_predictor_16x4_ssse3:
 2424|  3.83k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2425|  3.83k|  const __m128i top_right = _mm_set1_epi16(top_row[15]);
 2426|  3.83k|  const __m128i left = cvtepu8_epi16(Load4(left_column));
 2427|  3.83k|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 2428|  3.83k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  3.83k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2429|  3.83k|  const __m128i weights1 = cvtepu8_epi16(weights);
 2430|  3.83k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
 2431|  3.83k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2432|  3.83k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2433|  3.83k|  const __m128i scaled_top_right1 =
 2434|  3.83k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2435|  3.83k|  const __m128i scaled_top_right2 =
 2436|  3.83k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2437|  3.83k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  3.83k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2438|  3.83k|  __m128i y_mask = _mm_set1_epi32(0x01000100);
 2439|  3.83k|  __m128i left_y = _mm_shuffle_epi8(left, y_mask);
 2440|  3.83k|  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2441|  3.83k|                                 scaled_top_right1, scaled_top_right2, round);
 2442|  3.83k|  dst += stride;
 2443|  3.83k|  y_mask = _mm_set1_epi32(0x03020302);
 2444|  3.83k|  left_y = _mm_shuffle_epi8(left, y_mask);
 2445|  3.83k|  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2446|  3.83k|                                 scaled_top_right1, scaled_top_right2, round);
 2447|  3.83k|  dst += stride;
 2448|  3.83k|  y_mask = _mm_set1_epi32(0x05040504);
 2449|  3.83k|  left_y = _mm_shuffle_epi8(left, y_mask);
 2450|  3.83k|  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2451|  3.83k|                                 scaled_top_right1, scaled_top_right2, round);
 2452|  3.83k|  dst += stride;
 2453|  3.83k|  y_mask = _mm_set1_epi32(0x07060706);
 2454|  3.83k|  left_y = _mm_shuffle_epi8(left, y_mask);
 2455|  3.83k|  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2456|  3.83k|                                 scaled_top_right1, scaled_top_right2, round);
 2457|  3.83k|}
aom_smooth_h_predictor_16x8_ssse3:
 2463|  5.20k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2464|  5.20k|  const __m128i top_right = _mm_set1_epi16(top_row[15]);
 2465|  5.20k|  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
 2466|  5.20k|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 2467|  5.20k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  5.20k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2468|  5.20k|  const __m128i weights1 = cvtepu8_epi16(weights);
 2469|  5.20k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
 2470|  5.20k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2471|  5.20k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2472|  5.20k|  const __m128i scaled_top_right1 =
 2473|  5.20k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2474|  5.20k|  const __m128i scaled_top_right2 =
 2475|  5.20k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2476|  5.20k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  5.20k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2477|  46.8k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2477:33): [True: 41.6k, False: 5.20k]
  ------------------
 2478|  41.6k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2479|  41.6k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2480|  41.6k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2481|  41.6k|                                   scaled_top_right1, scaled_top_right2, round);
 2482|  41.6k|    dst += stride;
 2483|  41.6k|  }
 2484|  5.20k|}
aom_smooth_h_predictor_16x16_ssse3:
 2489|  7.60k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2490|  7.60k|  const __m128i top_right = _mm_set1_epi16(top_row[15]);
 2491|  7.60k|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 2492|  7.60k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  7.60k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2493|  7.60k|  const __m128i weights1 = cvtepu8_epi16(weights);
 2494|  7.60k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
 2495|  7.60k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2496|  7.60k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2497|  7.60k|  const __m128i scaled_top_right1 =
 2498|  7.60k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2499|  7.60k|  const __m128i scaled_top_right2 =
 2500|  7.60k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2501|  7.60k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  7.60k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2502|  7.60k|  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
 2503|  68.4k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2503:33): [True: 60.8k, False: 7.60k]
  ------------------
 2504|  60.8k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2505|  60.8k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2506|  60.8k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2507|  60.8k|                                   scaled_top_right1, scaled_top_right2, round);
 2508|  60.8k|    dst += stride;
 2509|  60.8k|  }
 2510|  7.60k|  left = cvtepu8_epi16(LoadLo8(left_column + 8));
 2511|  68.4k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2511:33): [True: 60.8k, False: 7.60k]
  ------------------
 2512|  60.8k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2513|  60.8k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2514|  60.8k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2515|  60.8k|                                   scaled_top_right1, scaled_top_right2, round);
 2516|  60.8k|    dst += stride;
 2517|  60.8k|  }
 2518|  7.60k|}
aom_smooth_h_predictor_16x32_ssse3:
 2523|  1.05k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2524|  1.05k|  const __m128i top_right = _mm_set1_epi16(top_row[15]);
 2525|  1.05k|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 2526|  1.05k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  1.05k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2527|  1.05k|  const __m128i weights1 = cvtepu8_epi16(weights);
 2528|  1.05k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
 2529|  1.05k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2530|  1.05k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2531|  1.05k|  const __m128i scaled_top_right1 =
 2532|  1.05k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2533|  1.05k|  const __m128i scaled_top_right2 =
 2534|  1.05k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2535|  1.05k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  1.05k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2536|  1.05k|  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
 2537|  9.49k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2537:33): [True: 8.44k, False: 1.05k]
  ------------------
 2538|  8.44k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2539|  8.44k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2540|  8.44k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2541|  8.44k|                                   scaled_top_right1, scaled_top_right2, round);
 2542|  8.44k|    dst += stride;
 2543|  8.44k|  }
 2544|  1.05k|  left = cvtepu8_epi16(LoadLo8(left_column + 8));
 2545|  9.49k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2545:33): [True: 8.44k, False: 1.05k]
  ------------------
 2546|  8.44k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2547|  8.44k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2548|  8.44k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2549|  8.44k|                                   scaled_top_right1, scaled_top_right2, round);
 2550|  8.44k|    dst += stride;
 2551|  8.44k|  }
 2552|  1.05k|  left = cvtepu8_epi16(LoadLo8(left_column + 16));
 2553|  9.49k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2553:33): [True: 8.44k, False: 1.05k]
  ------------------
 2554|  8.44k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2555|  8.44k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2556|  8.44k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2557|  8.44k|                                   scaled_top_right1, scaled_top_right2, round);
 2558|  8.44k|    dst += stride;
 2559|  8.44k|  }
 2560|  1.05k|  left = cvtepu8_epi16(LoadLo8(left_column + 24));
 2561|  9.49k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2561:33): [True: 8.44k, False: 1.05k]
  ------------------
 2562|  8.44k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2563|  8.44k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2564|  8.44k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2565|  8.44k|                                   scaled_top_right1, scaled_top_right2, round);
 2566|  8.44k|    dst += stride;
 2567|  8.44k|  }
 2568|  1.05k|}
aom_smooth_h_predictor_16x64_ssse3:
 2574|    173|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2575|    173|  const __m128i top_right = _mm_set1_epi16(top_row[15]);
 2576|    173|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 2577|    173|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|    173|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2578|    173|  const __m128i weights1 = cvtepu8_epi16(weights);
 2579|    173|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
 2580|    173|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2581|    173|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2582|    173|  const __m128i scaled_top_right1 =
 2583|    173|      _mm_mullo_epi16(inverted_weights1, top_right);
 2584|    173|  const __m128i scaled_top_right2 =
 2585|    173|      _mm_mullo_epi16(inverted_weights2, top_right);
 2586|    173|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|    173|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2587|  1.55k|  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
  ------------------
  |  Branch (2587:29): [True: 1.38k, False: 173]
  ------------------
 2588|  1.38k|    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
 2589|  12.4k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2589:35): [True: 11.0k, False: 1.38k]
  ------------------
 2590|  11.0k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 2591|  11.0k|      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2592|  11.0k|      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2593|  11.0k|                                     scaled_top_right1, scaled_top_right2,
 2594|  11.0k|                                     round);
 2595|  11.0k|      dst += stride;
 2596|  11.0k|    }
 2597|  1.38k|  }
 2598|    173|}
aom_smooth_h_predictor_32x8_ssse3:
 2603|  1.53k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2604|  1.53k|  const __m128i top_right = _mm_set1_epi16(top_row[31]);
 2605|  1.53k|  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
 2606|  1.53k|  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
 2607|  1.53k|  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
 2608|  1.53k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  1.53k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2609|  1.53k|  const __m128i weights1 = cvtepu8_epi16(weights_lo);
 2610|  1.53k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
 2611|  1.53k|  const __m128i weights3 = cvtepu8_epi16(weights_hi);
 2612|  1.53k|  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
 2613|  1.53k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2614|  1.53k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2615|  1.53k|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 2616|  1.53k|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 2617|  1.53k|  const __m128i scaled_top_right1 =
 2618|  1.53k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2619|  1.53k|  const __m128i scaled_top_right2 =
 2620|  1.53k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2621|  1.53k|  const __m128i scaled_top_right3 =
 2622|  1.53k|      _mm_mullo_epi16(inverted_weights3, top_right);
 2623|  1.53k|  const __m128i scaled_top_right4 =
 2624|  1.53k|      _mm_mullo_epi16(inverted_weights4, top_right);
 2625|  1.53k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  1.53k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2626|  13.8k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2626:33): [True: 12.3k, False: 1.53k]
  ------------------
 2627|  12.3k|    __m128i y_select = _mm_set1_epi32(y_mask);
 2628|  12.3k|    __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2629|  12.3k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2630|  12.3k|                                   scaled_top_right1, scaled_top_right2, round);
 2631|  12.3k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2632|  12.3k|                                   scaled_top_right3, scaled_top_right4, round);
 2633|  12.3k|    dst += stride;
 2634|  12.3k|  }
 2635|  1.53k|}
aom_smooth_h_predictor_32x16_ssse3:
 2641|    874|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2642|    874|  const __m128i top_right = _mm_set1_epi16(top_row[31]);
 2643|    874|  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
 2644|    874|  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
 2645|    874|  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
 2646|    874|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|    874|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2647|    874|  const __m128i weights1 = cvtepu8_epi16(weights_lo);
 2648|    874|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
 2649|    874|  const __m128i weights3 = cvtepu8_epi16(weights_hi);
 2650|    874|  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
 2651|    874|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2652|    874|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2653|    874|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 2654|    874|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 2655|    874|  const __m128i scaled_top_right1 =
 2656|    874|      _mm_mullo_epi16(inverted_weights1, top_right);
 2657|    874|  const __m128i scaled_top_right2 =
 2658|    874|      _mm_mullo_epi16(inverted_weights2, top_right);
 2659|    874|  const __m128i scaled_top_right3 =
 2660|    874|      _mm_mullo_epi16(inverted_weights3, top_right);
 2661|    874|  const __m128i scaled_top_right4 =
 2662|    874|      _mm_mullo_epi16(inverted_weights4, top_right);
 2663|    874|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|    874|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2664|  7.86k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2664:33): [True: 6.99k, False: 874]
  ------------------
 2665|  6.99k|    __m128i y_select = _mm_set1_epi32(y_mask);
 2666|  6.99k|    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
 2667|  6.99k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2668|  6.99k|                                   scaled_top_right1, scaled_top_right2, round);
 2669|  6.99k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2670|  6.99k|                                   scaled_top_right3, scaled_top_right4, round);
 2671|  6.99k|    dst += stride;
 2672|  6.99k|  }
 2673|    874|  const __m128i left2 =
 2674|    874|      cvtepu8_epi16(LoadLo8((const uint8_t *)left_column + 8));
 2675|  7.86k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2675:33): [True: 6.99k, False: 874]
  ------------------
 2676|  6.99k|    __m128i y_select = _mm_set1_epi32(y_mask);
 2677|  6.99k|    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
 2678|  6.99k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2679|  6.99k|                                   scaled_top_right1, scaled_top_right2, round);
 2680|  6.99k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2681|  6.99k|                                   scaled_top_right3, scaled_top_right4, round);
 2682|  6.99k|    dst += stride;
 2683|  6.99k|  }
 2684|    874|}
aom_smooth_h_predictor_32x32_ssse3:
 2689|  3.62k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2690|  3.62k|  const __m128i top_right = _mm_set1_epi16(top_row[31]);
 2691|  3.62k|  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
 2692|  3.62k|  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
 2693|  3.62k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  3.62k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2694|  3.62k|  const __m128i weights1 = cvtepu8_epi16(weights_lo);
 2695|  3.62k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
 2696|  3.62k|  const __m128i weights3 = cvtepu8_epi16(weights_hi);
 2697|  3.62k|  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
 2698|  3.62k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2699|  3.62k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2700|  3.62k|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 2701|  3.62k|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 2702|  3.62k|  const __m128i scaled_top_right1 =
 2703|  3.62k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2704|  3.62k|  const __m128i scaled_top_right2 =
 2705|  3.62k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2706|  3.62k|  const __m128i scaled_top_right3 =
 2707|  3.62k|      _mm_mullo_epi16(inverted_weights3, top_right);
 2708|  3.62k|  const __m128i scaled_top_right4 =
 2709|  3.62k|      _mm_mullo_epi16(inverted_weights4, top_right);
 2710|  3.62k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  3.62k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2711|  3.62k|  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
 2712|  32.6k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2712:33): [True: 29.0k, False: 3.62k]
  ------------------
 2713|  29.0k|    __m128i y_select = _mm_set1_epi32(y_mask);
 2714|  29.0k|    __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2715|  29.0k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2716|  29.0k|                                   scaled_top_right1, scaled_top_right2, round);
 2717|  29.0k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2718|  29.0k|                                   scaled_top_right3, scaled_top_right4, round);
 2719|  29.0k|    dst += stride;
 2720|  29.0k|  }
 2721|  3.62k|  left = cvtepu8_epi16(LoadLo8(left_column + 8));
 2722|  32.6k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2722:33): [True: 29.0k, False: 3.62k]
  ------------------
 2723|  29.0k|    __m128i y_select = _mm_set1_epi32(y_mask);
 2724|  29.0k|    __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2725|  29.0k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2726|  29.0k|                                   scaled_top_right1, scaled_top_right2, round);
 2727|  29.0k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2728|  29.0k|                                   scaled_top_right3, scaled_top_right4, round);
 2729|  29.0k|    dst += stride;
 2730|  29.0k|  }
 2731|  3.62k|  left = cvtepu8_epi16(LoadLo8(left_column + 16));
 2732|  32.6k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2732:33): [True: 29.0k, False: 3.62k]
  ------------------
 2733|  29.0k|    __m128i y_select = _mm_set1_epi32(y_mask);
 2734|  29.0k|    __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2735|  29.0k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2736|  29.0k|                                   scaled_top_right1, scaled_top_right2, round);
 2737|  29.0k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2738|  29.0k|                                   scaled_top_right3, scaled_top_right4, round);
 2739|  29.0k|    dst += stride;
 2740|  29.0k|  }
 2741|  3.62k|  left = cvtepu8_epi16(LoadLo8(left_column + 24));
 2742|  32.6k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2742:33): [True: 29.0k, False: 3.62k]
  ------------------
 2743|  29.0k|    __m128i y_select = _mm_set1_epi32(y_mask);
 2744|  29.0k|    __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2745|  29.0k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2746|  29.0k|                                   scaled_top_right1, scaled_top_right2, round);
 2747|  29.0k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2748|  29.0k|                                   scaled_top_right3, scaled_top_right4, round);
 2749|  29.0k|    dst += stride;
 2750|  29.0k|  }
 2751|  3.62k|}
aom_smooth_h_predictor_32x64_ssse3:
 2756|     99|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2757|     99|  const __m128i top_right = _mm_set1_epi16(top_row[31]);
 2758|     99|  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
 2759|     99|  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
 2760|     99|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|     99|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2761|     99|  const __m128i weights1 = cvtepu8_epi16(weights_lo);
 2762|     99|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
 2763|     99|  const __m128i weights3 = cvtepu8_epi16(weights_hi);
 2764|     99|  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
 2765|     99|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2766|     99|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2767|     99|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 2768|     99|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 2769|     99|  const __m128i scaled_top_right1 =
 2770|     99|      _mm_mullo_epi16(inverted_weights1, top_right);
 2771|     99|  const __m128i scaled_top_right2 =
 2772|     99|      _mm_mullo_epi16(inverted_weights2, top_right);
 2773|     99|  const __m128i scaled_top_right3 =
 2774|     99|      _mm_mullo_epi16(inverted_weights3, top_right);
 2775|     99|  const __m128i scaled_top_right4 =
 2776|     99|      _mm_mullo_epi16(inverted_weights4, top_right);
 2777|     99|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|     99|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2778|    891|  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
  ------------------
  |  Branch (2778:29): [True: 792, False: 99]
  ------------------
 2779|    792|    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
 2780|  7.12k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2780:35): [True: 6.33k, False: 792]
  ------------------
 2781|  6.33k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 2782|  6.33k|      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2783|  6.33k|      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2784|  6.33k|                                     scaled_top_right1, scaled_top_right2,
 2785|  6.33k|                                     round);
 2786|  6.33k|      write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
 2787|  6.33k|                                     weights4, scaled_top_right3,
 2788|  6.33k|                                     scaled_top_right4, round);
 2789|  6.33k|      dst += stride;
 2790|  6.33k|    }
 2791|    792|  }
 2792|     99|}
aom_smooth_h_predictor_64x16_ssse3:
 2798|    103|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2799|    103|  const __m128i top_right = _mm_set1_epi16(top_row[63]);
 2800|    103|  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
 2801|    103|  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
 2802|    103|  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
 2803|    103|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|    103|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2804|    103|  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
 2805|    103|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
 2806|    103|  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
 2807|    103|  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
 2808|    103|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2809|    103|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2810|    103|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 2811|    103|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 2812|    103|  const __m128i scaled_top_right1 =
 2813|    103|      _mm_mullo_epi16(inverted_weights1, top_right);
 2814|    103|  const __m128i scaled_top_right2 =
 2815|    103|      _mm_mullo_epi16(inverted_weights2, top_right);
 2816|    103|  const __m128i scaled_top_right3 =
 2817|    103|      _mm_mullo_epi16(inverted_weights3, top_right);
 2818|    103|  const __m128i scaled_top_right4 =
 2819|    103|      _mm_mullo_epi16(inverted_weights4, top_right);
 2820|    103|  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
 2821|    103|  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
 2822|    103|  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
 2823|    103|  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
 2824|    103|  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
 2825|    103|  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
 2826|    103|  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
 2827|    103|  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
 2828|    103|  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
 2829|    103|  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
 2830|    103|  const __m128i scaled_top_right5 =
 2831|    103|      _mm_mullo_epi16(inverted_weights5, top_right);
 2832|    103|  const __m128i scaled_top_right6 =
 2833|    103|      _mm_mullo_epi16(inverted_weights6, top_right);
 2834|    103|  const __m128i scaled_top_right7 =
 2835|    103|      _mm_mullo_epi16(inverted_weights7, top_right);
 2836|    103|  const __m128i scaled_top_right8 =
 2837|    103|      _mm_mullo_epi16(inverted_weights8, top_right);
 2838|    103|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|    103|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2839|    927|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2839:33): [True: 824, False: 103]
  ------------------
 2840|    824|    __m128i y_select = _mm_set1_epi32(y_mask);
 2841|    824|    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
 2842|    824|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2843|    824|                                   scaled_top_right1, scaled_top_right2, round);
 2844|    824|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2845|    824|                                   scaled_top_right3, scaled_top_right4, round);
 2846|    824|    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
 2847|    824|                                   scaled_top_right5, scaled_top_right6, round);
 2848|    824|    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
 2849|    824|                                   scaled_top_right7, scaled_top_right8, round);
 2850|    824|    dst += stride;
 2851|    824|  }
 2852|    103|  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
 2853|    927|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2853:33): [True: 824, False: 103]
  ------------------
 2854|    824|    __m128i y_select = _mm_set1_epi32(y_mask);
 2855|    824|    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
 2856|    824|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2857|    824|                                   scaled_top_right1, scaled_top_right2, round);
 2858|    824|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2859|    824|                                   scaled_top_right3, scaled_top_right4, round);
 2860|    824|    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
 2861|    824|                                   scaled_top_right5, scaled_top_right6, round);
 2862|    824|    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
 2863|    824|                                   scaled_top_right7, scaled_top_right8, round);
 2864|    824|    dst += stride;
 2865|    824|  }
 2866|    103|}
aom_smooth_h_predictor_64x32_ssse3:
 2872|     67|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2873|     67|  const __m128i top_right = _mm_set1_epi16(top_row[63]);
 2874|     67|  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
 2875|     67|  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
 2876|     67|  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
 2877|     67|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|     67|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2878|     67|  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
 2879|     67|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
 2880|     67|  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
 2881|     67|  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
 2882|     67|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2883|     67|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2884|     67|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 2885|     67|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 2886|     67|  const __m128i scaled_top_right1 =
 2887|     67|      _mm_mullo_epi16(inverted_weights1, top_right);
 2888|     67|  const __m128i scaled_top_right2 =
 2889|     67|      _mm_mullo_epi16(inverted_weights2, top_right);
 2890|     67|  const __m128i scaled_top_right3 =
 2891|     67|      _mm_mullo_epi16(inverted_weights3, top_right);
 2892|     67|  const __m128i scaled_top_right4 =
 2893|     67|      _mm_mullo_epi16(inverted_weights4, top_right);
 2894|     67|  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
 2895|     67|  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
 2896|     67|  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
 2897|     67|  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
 2898|     67|  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
 2899|     67|  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
 2900|     67|  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
 2901|     67|  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
 2902|     67|  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
 2903|     67|  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
 2904|     67|  const __m128i scaled_top_right5 =
 2905|     67|      _mm_mullo_epi16(inverted_weights5, top_right);
 2906|     67|  const __m128i scaled_top_right6 =
 2907|     67|      _mm_mullo_epi16(inverted_weights6, top_right);
 2908|     67|  const __m128i scaled_top_right7 =
 2909|     67|      _mm_mullo_epi16(inverted_weights7, top_right);
 2910|     67|  const __m128i scaled_top_right8 =
 2911|     67|      _mm_mullo_epi16(inverted_weights8, top_right);
 2912|     67|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|     67|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2913|    603|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2913:33): [True: 536, False: 67]
  ------------------
 2914|    536|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2915|    536|    const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
 2916|    536|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2917|    536|                                   scaled_top_right1, scaled_top_right2, round);
 2918|    536|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2919|    536|                                   scaled_top_right3, scaled_top_right4, round);
 2920|    536|    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
 2921|    536|                                   scaled_top_right5, scaled_top_right6, round);
 2922|    536|    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
 2923|    536|                                   scaled_top_right7, scaled_top_right8, round);
 2924|    536|    dst += stride;
 2925|    536|  }
 2926|     67|  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
 2927|    603|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2927:33): [True: 536, False: 67]
  ------------------
 2928|    536|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2929|    536|    const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
 2930|    536|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2931|    536|                                   scaled_top_right1, scaled_top_right2, round);
 2932|    536|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2933|    536|                                   scaled_top_right3, scaled_top_right4, round);
 2934|    536|    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
 2935|    536|                                   scaled_top_right5, scaled_top_right6, round);
 2936|    536|    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
 2937|    536|                                   scaled_top_right7, scaled_top_right8, round);
 2938|    536|    dst += stride;
 2939|    536|  }
 2940|     67|  const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16));
 2941|    603|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2941:33): [True: 536, False: 67]
  ------------------
 2942|    536|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2943|    536|    const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
 2944|    536|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2945|    536|                                   scaled_top_right1, scaled_top_right2, round);
 2946|    536|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2947|    536|                                   scaled_top_right3, scaled_top_right4, round);
 2948|    536|    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
 2949|    536|                                   scaled_top_right5, scaled_top_right6, round);
 2950|    536|    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
 2951|    536|                                   scaled_top_right7, scaled_top_right8, round);
 2952|    536|    dst += stride;
 2953|    536|  }
 2954|     67|  const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24));
 2955|    603|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2955:33): [True: 536, False: 67]
  ------------------
 2956|    536|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2957|    536|    const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
 2958|    536|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2959|    536|                                   scaled_top_right1, scaled_top_right2, round);
 2960|    536|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2961|    536|                                   scaled_top_right3, scaled_top_right4, round);
 2962|    536|    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
 2963|    536|                                   scaled_top_right5, scaled_top_right6, round);
 2964|    536|    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
 2965|    536|                                   scaled_top_right7, scaled_top_right8, round);
 2966|    536|    dst += stride;
 2967|    536|  }
 2968|     67|}
aom_smooth_h_predictor_64x64_ssse3:
 2973|    263|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2974|    263|  const __m128i top_right = _mm_set1_epi16(top_row[63]);
 2975|    263|  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
 2976|    263|  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
 2977|    263|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|    263|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2978|    263|  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
 2979|    263|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
 2980|    263|  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
 2981|    263|  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
 2982|    263|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2983|    263|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2984|    263|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 2985|    263|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 2986|    263|  const __m128i scaled_top_right1 =
 2987|    263|      _mm_mullo_epi16(inverted_weights1, top_right);
 2988|    263|  const __m128i scaled_top_right2 =
 2989|    263|      _mm_mullo_epi16(inverted_weights2, top_right);
 2990|    263|  const __m128i scaled_top_right3 =
 2991|    263|      _mm_mullo_epi16(inverted_weights3, top_right);
 2992|    263|  const __m128i scaled_top_right4 =
 2993|    263|      _mm_mullo_epi16(inverted_weights4, top_right);
 2994|    263|  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
 2995|    263|  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
 2996|    263|  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
 2997|    263|  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
 2998|    263|  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
 2999|    263|  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
 3000|    263|  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
 3001|    263|  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
 3002|    263|  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
 3003|    263|  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
 3004|    263|  const __m128i scaled_top_right5 =
 3005|    263|      _mm_mullo_epi16(inverted_weights5, top_right);
 3006|    263|  const __m128i scaled_top_right6 =
 3007|    263|      _mm_mullo_epi16(inverted_weights6, top_right);
 3008|    263|  const __m128i scaled_top_right7 =
 3009|    263|      _mm_mullo_epi16(inverted_weights7, top_right);
 3010|    263|  const __m128i scaled_top_right8 =
 3011|    263|      _mm_mullo_epi16(inverted_weights8, top_right);
 3012|    263|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|    263|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 3013|  2.36k|  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
  ------------------
  |  Branch (3013:29): [True: 2.10k, False: 263]
  ------------------
 3014|  2.10k|    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
 3015|  18.9k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (3015:35): [True: 16.8k, False: 2.10k]
  ------------------
 3016|  16.8k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 3017|  16.8k|      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 3018|  16.8k|      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 3019|  16.8k|                                     scaled_top_right1, scaled_top_right2,
 3020|  16.8k|                                     round);
 3021|  16.8k|      write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
 3022|  16.8k|                                     weights4, scaled_top_right3,
 3023|  16.8k|                                     scaled_top_right4, round);
 3024|  16.8k|      write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5,
 3025|  16.8k|                                     weights6, scaled_top_right5,
 3026|  16.8k|                                     scaled_top_right6, round);
 3027|  16.8k|      write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7,
 3028|  16.8k|                                     weights8, scaled_top_right7,
 3029|  16.8k|                                     scaled_top_right8, round);
 3030|  16.8k|      dst += stride;
 3031|  16.8k|    }
 3032|  2.10k|  }
 3033|    263|}
intrapred_ssse3.c:paeth_8x1_pred:
   23|   730k|                                     const __m128i *topleft) {
   24|   730k|  const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
   25|       |
   26|   730k|  __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
   27|   730k|  __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
   28|   730k|  __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
   29|       |
   30|   730k|  __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
   31|   730k|  mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
   32|   730k|  __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
   33|       |
   34|   730k|  pl = _mm_andnot_si128(mask1, *left);
   35|       |
   36|   730k|  ptl = _mm_and_si128(mask2, *topleft);
   37|   730k|  pt = _mm_andnot_si128(mask2, *top);
   38|   730k|  pt = _mm_or_si128(pt, ptl);
   39|   730k|  pt = _mm_and_si128(mask1, pt);
   40|       |
   41|   730k|  return _mm_or_si128(pl, pt);
   42|   730k|}
intrapred_ssse3.c:paeth_16x1_pred:
  198|  48.5k|                                      const __m128i *topleft) {
  199|  48.5k|  const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
  200|  48.5k|  const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
  201|  48.5k|  return _mm_packus_epi16(p0, p1);
  202|  48.5k|}
intrapred_ssse3.c:load_pixel_w4:
  598|  68.1k|                                 int height, __m128i *pixels) {
  599|  68.1k|  __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
  600|  68.1k|  if (height == 4)
  ------------------
  |  Branch (600:7): [True: 54.5k, False: 13.5k]
  ------------------
  601|  54.5k|    pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]);
  602|  13.5k|  else if (height == 8)
  ------------------
  |  Branch (602:12): [True: 9.00k, False: 4.59k]
  ------------------
  603|  9.00k|    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
  604|  4.59k|  else
  605|  4.59k|    pixels[1] = _mm_loadu_si128(((const __m128i *)left));
  606|       |
  607|  68.1k|  pixels[2] = _mm_set1_epi16((int16_t)above[3]);
  608|       |
  609|  68.1k|  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
  610|  68.1k|  const __m128i zero = _mm_setzero_si128();
  611|  68.1k|  d = _mm_unpacklo_epi8(d, zero);
  612|  68.1k|  pixels[0] = _mm_unpacklo_epi16(d, bp);
  613|  68.1k|}
intrapred_ssse3.c:load_weight_w4:
  621|  68.1k|                                  __m128i *weight_w) {
  622|  68.1k|  const __m128i zero = _mm_setzero_si128();
  623|  68.1k|  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
  ------------------
  |  |   19|  68.1k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  624|  68.1k|  const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
  625|  68.1k|  weight_h[0] = _mm_unpacklo_epi8(t, zero);
  626|  68.1k|  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  627|  68.1k|  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
  628|       |
  629|  68.1k|  if (height == 8) {
  ------------------
  |  Branch (629:7): [True: 9.00k, False: 59.1k]
  ------------------
  630|  9.00k|    const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]);
  631|  9.00k|    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
  632|  9.00k|    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  633|  59.1k|  } else if (height == 16) {
  ------------------
  |  Branch (633:14): [True: 4.59k, False: 54.5k]
  ------------------
  634|  4.59k|    const __m128i weight =
  635|  4.59k|        _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
  636|  4.59k|    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
  637|  4.59k|    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  638|  4.59k|    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
  639|  4.59k|    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  640|  4.59k|  }
  641|  68.1k|}
intrapred_ssse3.c:smooth_pred_4xh:
  645|  72.7k|                                   ptrdiff_t stride, int second_half) {
  646|  72.7k|  const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
  ------------------
  |  |   19|  72.7k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  647|  72.7k|  const __m128i one = _mm_set1_epi16(1);
  648|  72.7k|  const __m128i inc = _mm_set1_epi16(0x202);
  649|  72.7k|  const __m128i gat = _mm_set1_epi32(0xc080400);
  650|  72.7k|  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
  ------------------
  |  Branch (650:17): [True: 4.59k, False: 68.1k]
  ------------------
  651|  72.7k|                            : _mm_set1_epi16((short)0x8000);
  652|  72.7k|  __m128i d = _mm_set1_epi16(0x100);
  653|       |
  654|   436k|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (654:19): [True: 363k, False: 72.7k]
  ------------------
  655|   363k|    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
  656|   363k|    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
  657|   363k|    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
  658|   363k|    __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
  659|       |
  660|   363k|    __m128i b = _mm_shuffle_epi8(pixel[1], rep);
  661|   363k|    b = _mm_unpacklo_epi16(b, pixel[2]);
  662|   363k|    __m128i sum = _mm_madd_epi16(b, ww[0]);
  663|       |
  664|   363k|    sum = _mm_add_epi32(s, sum);
  665|   363k|    sum = _mm_add_epi32(sum, round);
  666|   363k|    sum = _mm_srai_epi32(sum, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|   363k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  667|       |
  668|   363k|    sum = _mm_shuffle_epi8(sum, gat);
  669|   363k|    *(int *)dst = _mm_cvtsi128_si32(sum);
  670|   363k|    dst += stride;
  671|       |
  672|   363k|    rep = _mm_add_epi16(rep, one);
  673|   363k|    d = _mm_add_epi16(d, inc);
  674|   363k|  }
  675|  72.7k|}
intrapred_ssse3.c:load_pixel_w8:
  724|  65.0k|                                 int height, __m128i *pixels) {
  725|  65.0k|  const __m128i zero = _mm_setzero_si128();
  726|  65.0k|  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
  727|  65.0k|  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  728|  65.0k|  d = _mm_unpacklo_epi8(d, zero);
  729|  65.0k|  pixels[0] = _mm_unpacklo_epi16(d, bp);
  730|  65.0k|  pixels[1] = _mm_unpackhi_epi16(d, bp);
  731|       |
  732|  65.0k|  pixels[3] = _mm_set1_epi16((int16_t)above[7]);
  733|       |
  734|  65.0k|  if (height == 4) {
  ------------------
  |  Branch (734:7): [True: 11.8k, False: 53.1k]
  ------------------
  735|  11.8k|    pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]);
  736|  53.1k|  } else if (height == 8) {
  ------------------
  |  Branch (736:14): [True: 41.9k, False: 11.1k]
  ------------------
  737|  41.9k|    pixels[2] = _mm_loadl_epi64((const __m128i *)left);
  738|  41.9k|  } else if (height == 16) {
  ------------------
  |  Branch (738:14): [True: 8.47k, False: 2.71k]
  ------------------
  739|  8.47k|    pixels[2] = _mm_load_si128((const __m128i *)left);
  740|  8.47k|  } else {
  741|  2.71k|    pixels[2] = _mm_load_si128((const __m128i *)left);
  742|  2.71k|    pixels[4] = pixels[0];
  743|  2.71k|    pixels[5] = pixels[1];
  744|  2.71k|    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
  745|  2.71k|    pixels[7] = pixels[3];
  746|  2.71k|  }
  747|  65.0k|}
intrapred_ssse3.c:load_weight_w8:
  760|  65.0k|                                  __m128i *weight_w) {
  761|  65.0k|  const __m128i zero = _mm_setzero_si128();
  762|  65.0k|  const int we_offset = height < 8 ? 0 : 4;
  ------------------
  |  Branch (762:25): [True: 11.8k, False: 53.1k]
  ------------------
  763|  65.0k|  __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]);
  764|  65.0k|  weight_h[0] = _mm_unpacklo_epi8(we, zero);
  765|  65.0k|  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
  ------------------
  |  |   19|  65.0k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  766|  65.0k|  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  767|       |
  768|  65.0k|  if (height == 4) {
  ------------------
  |  Branch (768:7): [True: 11.8k, False: 53.1k]
  ------------------
  769|  11.8k|    we = _mm_srli_si128(we, 4);
  770|  11.8k|    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
  771|  11.8k|    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
  772|  11.8k|    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
  773|  11.8k|    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
  774|  53.1k|  } else {
  775|  53.1k|    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
  776|  53.1k|    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
  777|  53.1k|  }
  778|       |
  779|  65.0k|  if (height == 16) {
  ------------------
  |  Branch (779:7): [True: 8.47k, False: 56.5k]
  ------------------
  780|  8.47k|    we = _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
  781|  8.47k|    weight_h[0] = _mm_unpacklo_epi8(we, zero);
  782|  8.47k|    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  783|  8.47k|    weight_h[2] = _mm_unpackhi_epi8(we, zero);
  784|  8.47k|    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  785|  56.5k|  } else if (height == 32) {
  ------------------
  |  Branch (785:14): [True: 2.71k, False: 53.8k]
  ------------------
  786|  2.71k|    const __m128i weight_lo =
  787|  2.71k|        _mm_loadu_si128((const __m128i *)&smooth_weights[28]);
  788|  2.71k|    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
  789|  2.71k|    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  790|  2.71k|    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
  791|  2.71k|    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  792|  2.71k|    const __m128i weight_hi =
  793|  2.71k|        _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]);
  794|  2.71k|    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
  795|  2.71k|    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
  796|  2.71k|    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
  797|  2.71k|    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  798|  2.71k|  }
  799|  65.0k|}
intrapred_ssse3.c:smooth_pred_8xh:
  803|  81.6k|                                   ptrdiff_t stride, int second_half) {
  804|  81.6k|  const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
  ------------------
  |  |   19|  81.6k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  805|  81.6k|  const __m128i one = _mm_set1_epi16(1);
  806|  81.6k|  const __m128i inc = _mm_set1_epi16(0x202);
  807|  81.6k|  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  808|       |
  809|  81.6k|  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
  ------------------
  |  Branch (809:17): [True: 13.9k, False: 67.7k]
  ------------------
  810|  81.6k|                            : _mm_set1_epi16((short)0x8000);
  811|  81.6k|  __m128i d = _mm_set1_epi16(0x100);
  812|       |
  813|  81.6k|  int i;
  814|   687k|  for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (814:15): [True: 605k, False: 81.6k]
  ------------------
  815|   605k|    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
  816|   605k|    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
  817|   605k|    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
  818|   605k|    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
  819|   605k|    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
  820|       |
  821|   605k|    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
  822|   605k|    b = _mm_unpacklo_epi16(b, pixels[3]);
  823|   605k|    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
  824|   605k|    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
  825|       |
  826|   605k|    s0 = _mm_add_epi32(s0, sum0);
  827|   605k|    s0 = _mm_add_epi32(s0, round);
  828|   605k|    s0 = _mm_srai_epi32(s0, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|   605k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  829|       |
  830|   605k|    s1 = _mm_add_epi32(s1, sum1);
  831|   605k|    s1 = _mm_add_epi32(s1, round);
  832|   605k|    s1 = _mm_srai_epi32(s1, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|   605k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  833|       |
  834|   605k|    sum0 = _mm_packus_epi16(s0, s1);
  835|   605k|    sum0 = _mm_shuffle_epi8(sum0, gat);
  836|   605k|    _mm_storel_epi64((__m128i *)dst, sum0);
  837|   605k|    dst += stride;
  838|       |
  839|   605k|    rep = _mm_add_epi16(rep, one);
  840|   605k|    d = _mm_add_epi16(d, inc);
  841|   605k|  }
  842|  81.6k|}
intrapred_ssse3.c:smooth_predictor_wxh:
  960|  52.8k|                                 int width, int height) {
  961|  52.8k|  const uint8_t *const sm_weights_h = smooth_weights + height - 4;
  962|  52.8k|  const uint8_t *const sm_weights_w = smooth_weights + width - 4;
  963|  52.8k|  const __m128i zero = _mm_setzero_si128();
  964|  52.8k|  const __m128i scale_value = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  52.8k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  965|  52.8k|  const __m128i bottom_left = _mm_cvtsi32_si128(left_column[height - 1]);
  966|  52.8k|  const __m128i top_right = _mm_set1_epi16(top_row[width - 1]);
  967|  52.8k|  const __m128i round = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  52.8k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  968|   851k|  for (int y = 0; y < height; ++y) {
  ------------------
  |  Branch (968:19): [True: 799k, False: 52.8k]
  ------------------
  969|   799k|    const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
  970|   799k|    const __m128i left_y = _mm_cvtsi32_si128(left_column[y]);
  971|   799k|    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
  972|   799k|    __m128i scaled_bottom_left =
  973|   799k|        _mm_mullo_epi16(scale_m_weights_y, bottom_left);
  974|   799k|    const __m128i weight_left_y =
  975|   799k|        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
  976|   799k|    scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
  977|   799k|    scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
  978|  3.19M|    for (int x = 0; x < width; x += 8) {
  ------------------
  |  Branch (978:21): [True: 2.39M, False: 799k]
  ------------------
  979|  2.39M|      const __m128i top_x = LoadLo8(top_row + x);
  980|  2.39M|      const __m128i weights_x = LoadLo8(sm_weights_w + x);
  981|  2.39M|      const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
  982|  2.39M|      const __m128i top_weights_x_lo = cvtepu8_epi16(top_weights_x);
  983|  2.39M|      const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
  984|       |
  985|       |      // Here opposite weights and pixels are multiplied, where the order of
  986|       |      // interleaving is indicated in the names.
  987|  2.39M|      __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
  988|  2.39M|      __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
  989|       |
  990|       |      // |scaled_bottom_left| is always scaled by the same weight each row, so
  991|       |      // we only derive |scaled_top_right| values here.
  992|  2.39M|      const __m128i inverted_weights_x =
  993|  2.39M|          _mm_sub_epi16(scale_value, cvtepu8_epi16(weights_x));
  994|  2.39M|      const __m128i scaled_top_right =
  995|  2.39M|          _mm_mullo_epi16(inverted_weights_x, top_right);
  996|  2.39M|      const __m128i scaled_top_right_lo = cvtepu16_epi32(scaled_top_right);
  997|  2.39M|      const __m128i scaled_top_right_hi =
  998|  2.39M|          _mm_unpackhi_epi16(scaled_top_right, zero);
  999|  2.39M|      pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
 1000|  2.39M|      pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
 1001|  2.39M|      pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
 1002|  2.39M|      pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
 1003|       |
 1004|       |      // The round value for RightShiftWithRounding was added with
 1005|       |      // |scaled_bottom_left|.
 1006|  2.39M|      pred_lo = _mm_srli_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
  ------------------
  |  |   19|  2.39M|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1007|  2.39M|      pred_hi = _mm_srli_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
  ------------------
  |  |   19|  2.39M|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1008|  2.39M|      const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
 1009|  2.39M|      StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
 1010|  2.39M|    }
 1011|   799k|    dst += stride;
 1012|   799k|  }
 1013|  52.8k|}
intrapred_ssse3.c:cvtepu16_epi32:
  953|  2.39M|static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
  954|  2.39M|  return _mm_unpacklo_epi16((x), _mm_setzero_si128());
  955|  2.39M|}
intrapred_ssse3.c:StoreLo8:
  936|  2.77M|static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) {
  937|  2.77M|  _mm_storel_epi64((__m128i *)(a), v);
  938|  2.77M|}
intrapred_ssse3.c:load_smooth_vertical_pixels4:
 1135|  20.9k|    const int height, __m128i *pixels) {
 1136|  20.9k|  __m128i top = Load4(above);
 1137|  20.9k|  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
 1138|  20.9k|  top = cvtepu8_epi16(top);
 1139|  20.9k|  pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
 1140|  20.9k|}
intrapred_ssse3.c:load_smooth_vertical_weights4:
 1148|  20.9k|    __m128i *weights) {
 1149|  20.9k|  const __m128i inverter = _mm_set1_epi16(256);
 1150|       |
 1151|  20.9k|  if (height == 4) {
  ------------------
  |  Branch (1151:7): [True: 17.5k, False: 3.45k]
  ------------------
 1152|  17.5k|    const __m128i weight = Load4(weight_array);
 1153|  17.5k|    weights[0] = cvtepu8_epi16(weight);
 1154|  17.5k|    weights[1] = _mm_sub_epi16(inverter, weights[0]);
 1155|  17.5k|  } else if (height == 8) {
  ------------------
  |  Branch (1155:14): [True: 2.14k, False: 1.31k]
  ------------------
 1156|  2.14k|    const __m128i weight = LoadLo8(weight_array + 4);
 1157|  2.14k|    weights[0] = cvtepu8_epi16(weight);
 1158|  2.14k|    weights[1] = _mm_sub_epi16(inverter, weights[0]);
 1159|  2.14k|  } else {
 1160|  1.31k|    const __m128i weight = LoadUnaligned16(weight_array + 12);
 1161|  1.31k|    const __m128i zero = _mm_setzero_si128();
 1162|  1.31k|    weights[0] = cvtepu8_epi16(weight);
 1163|  1.31k|    weights[1] = _mm_sub_epi16(inverter, weights[0]);
 1164|  1.31k|    weights[2] = _mm_unpackhi_epi8(weight, zero);
 1165|  1.31k|    weights[3] = _mm_sub_epi16(inverter, weights[2]);
 1166|  1.31k|  }
 1167|  20.9k|}
intrapred_ssse3.c:write_smooth_vertical4xh:
 1171|  22.3k|    uint8_t *LIBAOM_RESTRICT dst, const ptrdiff_t stride) {
 1172|  22.3k|  const __m128i pred_round = _mm_set1_epi32(128);
 1173|  22.3k|  const __m128i mask_increment = _mm_set1_epi16(0x0202);
 1174|  22.3k|  const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
 1175|  22.3k|  __m128i y_select = _mm_set1_epi16(0x0100);
 1176|       |
 1177|   130k|  for (int y = 0; y < height; ++y) {
  ------------------
  |  Branch (1177:19): [True: 108k, False: 22.3k]
  ------------------
 1178|   108k|    const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
 1179|   108k|    const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
 1180|   108k|    const __m128i alternate_weights =
 1181|   108k|        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
 1182|       |    // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
 1183|       |    // The madd instruction yields four results of the form:
 1184|       |    // (top_row[x] * weight[y] + corner * inverted_weight[y])
 1185|   108k|    __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
 1186|   108k|    sum = _mm_add_epi32(sum, pred_round);
 1187|   108k|    sum = _mm_srai_epi32(sum, 8);
 1188|   108k|    sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
 1189|   108k|    Store4(dst, sum);
 1190|   108k|    dst += stride;
 1191|   108k|    y_select = _mm_add_epi16(y_select, mask_increment);
 1192|   108k|  }
 1193|  22.3k|}
intrapred_ssse3.c:Store4:
  931|   257k|static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) {
  932|   257k|  const int val = _mm_cvtsi128_si32(x);
  933|   257k|  memcpy(dst, &val, sizeof(val));
  934|   257k|}
intrapred_ssse3.c:cvtepu8_epi16:
  944|  5.08M|static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) {
  945|  5.08M|  return _mm_unpacklo_epi8((x), _mm_setzero_si128());
  946|  5.08M|}
intrapred_ssse3.c:Load4:
  911|   119k|static AOM_FORCE_INLINE __m128i Load4(const void *src) {
  912|       |  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
  913|       |  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
  914|       |  // movss instruction.
  915|       |  //
  916|       |  // Until compiler support of _mm_loadu_si32 is widespread, use of
  917|       |  // _mm_loadu_si32 is banned.
  918|   119k|  int val;
  919|   119k|  memcpy(&val, src, sizeof(val));
  920|   119k|  return _mm_cvtsi32_si128(val);
  921|   119k|}
intrapred_ssse3.c:LoadLo8:
  923|  4.92M|static AOM_FORCE_INLINE __m128i LoadLo8(const void *a) {
  924|  4.92M|  return _mm_loadl_epi64((const __m128i *)(a));
  925|  4.92M|}
intrapred_ssse3.c:write_smooth_directional_sum8:
 1122|   376k|    const __m128i *scaled_corner, const __m128i *round) {
 1123|   376k|  const __m128i pred_sum =
 1124|   376k|      smooth_directional_sum8(*pixels, *weights, *scaled_corner);
 1125|       |  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
 1126|   376k|  const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, *round), 8);
 1127|   376k|  StoreLo8(dst, _mm_packus_epi16(pred, pred));
 1128|   376k|}
intrapred_ssse3.c:smooth_directional_sum8:
 1115|   376k|    const __m128i pixels, const __m128i weights, const __m128i scaled_corner) {
 1116|   376k|  const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
 1117|   376k|  return _mm_add_epi16(scaled_corner, weighted_px);
 1118|   376k|}
intrapred_ssse3.c:LoadUnaligned16:
  927|  69.8k|static AOM_FORCE_INLINE __m128i LoadUnaligned16(const void *a) {
  928|  69.8k|  return _mm_loadu_si128((const __m128i *)(a));
  929|  69.8k|}
intrapred_ssse3.c:write_smooth_directional_sum16:
 1103|   991k|    const __m128i round) {
 1104|   991k|  const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
 1105|   991k|  const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
 1106|   991k|  const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
 1107|   991k|  const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
 1108|       |  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
 1109|   991k|  const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
 1110|   991k|  const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
 1111|   991k|  StoreUnaligned16(dst, _mm_packus_epi16(pred1, pred2));
 1112|   991k|}
intrapred_ssse3.c:StoreUnaligned16:
  940|   991k|static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) {
  941|   991k|  _mm_storeu_si128((__m128i *)(a), v);
  942|   991k|}
intrapred_ssse3.c:cvtepu8_epi32:
  948|  65.5k|static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) {
  949|  65.5k|  const __m128i tmp = _mm_unpacklo_epi8((x), _mm_setzero_si128());
  950|  65.5k|  return _mm_unpacklo_epi16(tmp, _mm_setzero_si128());
  951|  65.5k|}
intrapred_ssse3.c:write_smooth_horizontal_sum4:
 2126|   148k|    const __m128i *scaled_top_right, const __m128i *round) {
 2127|   148k|  const __m128i weighted_left_y = _mm_mullo_epi16(*left_y, *weights);
 2128|   148k|  const __m128i pred_sum = _mm_add_epi32(*scaled_top_right, weighted_left_y);
 2129|       |  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
 2130|   148k|  const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, *round), 8);
 2131|   148k|  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
 2132|   148k|  Store4(dst, _mm_shuffle_epi8(pred, cvtepi32_epi8));
 2133|   148k|}

intrapred_avx2.c:transpose16x16_sse2:
   94|  25.4k|static inline void transpose16x16_sse2(__m128i *x, __m128i *d) {
   95|  25.4k|  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
   96|  25.4k|  __m128i w10, w11, w12, w13, w14, w15;
   97|       |
   98|  25.4k|  w0 = _mm_unpacklo_epi8(x[0], x[1]);
   99|  25.4k|  w1 = _mm_unpacklo_epi8(x[2], x[3]);
  100|  25.4k|  w2 = _mm_unpacklo_epi8(x[4], x[5]);
  101|  25.4k|  w3 = _mm_unpacklo_epi8(x[6], x[7]);
  102|       |
  103|  25.4k|  w8 = _mm_unpacklo_epi8(x[8], x[9]);
  104|  25.4k|  w9 = _mm_unpacklo_epi8(x[10], x[11]);
  105|  25.4k|  w10 = _mm_unpacklo_epi8(x[12], x[13]);
  106|  25.4k|  w11 = _mm_unpacklo_epi8(x[14], x[15]);
  107|       |
  108|  25.4k|  w4 = _mm_unpacklo_epi16(w0, w1);
  109|  25.4k|  w5 = _mm_unpacklo_epi16(w2, w3);
  110|  25.4k|  w12 = _mm_unpacklo_epi16(w8, w9);
  111|  25.4k|  w13 = _mm_unpacklo_epi16(w10, w11);
  112|       |
  113|  25.4k|  w6 = _mm_unpacklo_epi32(w4, w5);
  114|  25.4k|  w7 = _mm_unpackhi_epi32(w4, w5);
  115|  25.4k|  w14 = _mm_unpacklo_epi32(w12, w13);
  116|  25.4k|  w15 = _mm_unpackhi_epi32(w12, w13);
  117|       |
  118|       |  // Store first 4-line result
  119|  25.4k|  d[0] = _mm_unpacklo_epi64(w6, w14);
  120|  25.4k|  d[1] = _mm_unpackhi_epi64(w6, w14);
  121|  25.4k|  d[2] = _mm_unpacklo_epi64(w7, w15);
  122|  25.4k|  d[3] = _mm_unpackhi_epi64(w7, w15);
  123|       |
  124|  25.4k|  w4 = _mm_unpackhi_epi16(w0, w1);
  125|  25.4k|  w5 = _mm_unpackhi_epi16(w2, w3);
  126|  25.4k|  w12 = _mm_unpackhi_epi16(w8, w9);
  127|  25.4k|  w13 = _mm_unpackhi_epi16(w10, w11);
  128|       |
  129|  25.4k|  w6 = _mm_unpacklo_epi32(w4, w5);
  130|  25.4k|  w7 = _mm_unpackhi_epi32(w4, w5);
  131|  25.4k|  w14 = _mm_unpacklo_epi32(w12, w13);
  132|  25.4k|  w15 = _mm_unpackhi_epi32(w12, w13);
  133|       |
  134|       |  // Store second 4-line result
  135|  25.4k|  d[4] = _mm_unpacklo_epi64(w6, w14);
  136|  25.4k|  d[5] = _mm_unpackhi_epi64(w6, w14);
  137|  25.4k|  d[6] = _mm_unpacklo_epi64(w7, w15);
  138|  25.4k|  d[7] = _mm_unpackhi_epi64(w7, w15);
  139|       |
  140|       |  // upper half
  141|  25.4k|  w0 = _mm_unpackhi_epi8(x[0], x[1]);
  142|  25.4k|  w1 = _mm_unpackhi_epi8(x[2], x[3]);
  143|  25.4k|  w2 = _mm_unpackhi_epi8(x[4], x[5]);
  144|  25.4k|  w3 = _mm_unpackhi_epi8(x[6], x[7]);
  145|       |
  146|  25.4k|  w8 = _mm_unpackhi_epi8(x[8], x[9]);
  147|  25.4k|  w9 = _mm_unpackhi_epi8(x[10], x[11]);
  148|  25.4k|  w10 = _mm_unpackhi_epi8(x[12], x[13]);
  149|  25.4k|  w11 = _mm_unpackhi_epi8(x[14], x[15]);
  150|       |
  151|  25.4k|  w4 = _mm_unpacklo_epi16(w0, w1);
  152|  25.4k|  w5 = _mm_unpacklo_epi16(w2, w3);
  153|  25.4k|  w12 = _mm_unpacklo_epi16(w8, w9);
  154|  25.4k|  w13 = _mm_unpacklo_epi16(w10, w11);
  155|       |
  156|  25.4k|  w6 = _mm_unpacklo_epi32(w4, w5);
  157|  25.4k|  w7 = _mm_unpackhi_epi32(w4, w5);
  158|  25.4k|  w14 = _mm_unpacklo_epi32(w12, w13);
  159|  25.4k|  w15 = _mm_unpackhi_epi32(w12, w13);
  160|       |
  161|       |  // Store first 4-line result
  162|  25.4k|  d[8] = _mm_unpacklo_epi64(w6, w14);
  163|  25.4k|  d[9] = _mm_unpackhi_epi64(w6, w14);
  164|  25.4k|  d[10] = _mm_unpacklo_epi64(w7, w15);
  165|  25.4k|  d[11] = _mm_unpackhi_epi64(w7, w15);
  166|       |
  167|  25.4k|  w4 = _mm_unpackhi_epi16(w0, w1);
  168|  25.4k|  w5 = _mm_unpackhi_epi16(w2, w3);
  169|  25.4k|  w12 = _mm_unpackhi_epi16(w8, w9);
  170|  25.4k|  w13 = _mm_unpackhi_epi16(w10, w11);
  171|       |
  172|  25.4k|  w6 = _mm_unpacklo_epi32(w4, w5);
  173|  25.4k|  w7 = _mm_unpackhi_epi32(w4, w5);
  174|  25.4k|  w14 = _mm_unpacklo_epi32(w12, w13);
  175|  25.4k|  w15 = _mm_unpackhi_epi32(w12, w13);
  176|       |
  177|       |  // Store second 4-line result
  178|  25.4k|  d[12] = _mm_unpacklo_epi64(w6, w14);
  179|  25.4k|  d[13] = _mm_unpackhi_epi64(w6, w14);
  180|  25.4k|  d[14] = _mm_unpacklo_epi64(w7, w15);
  181|  25.4k|  d[15] = _mm_unpackhi_epi64(w7, w15);
  182|  25.4k|}
intrapred_avx2.c:transpose:
  198|    983|                      ptrdiff_t pitchDst, int width, int height) {
  199|  4.71k|  for (int j = 0; j < height; j += 16)
  ------------------
  |  Branch (199:19): [True: 3.73k, False: 983]
  ------------------
  200|  12.9k|    for (int i = 0; i < width; i += 16)
  ------------------
  |  Branch (200:21): [True: 9.22k, False: 3.73k]
  ------------------
  201|  9.22k|      transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
  202|  9.22k|                         dst + j * pitchDst + i, pitchDst);
  203|    983|}
intrapred_avx2.c:transpose_TX_16X16:
  185|  9.22k|                               uint8_t *dst, ptrdiff_t pitchDst) {
  186|  9.22k|  __m128i r[16];
  187|  9.22k|  __m128i d[16];
  188|   156k|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (188:19): [True: 147k, False: 9.22k]
  ------------------
  189|   147k|    r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc));
  190|   147k|  }
  191|  9.22k|  transpose16x16_sse2(r, d);
  192|   156k|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (192:19): [True: 147k, False: 9.22k]
  ------------------
  193|   147k|    _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]);
  194|   147k|  }
  195|  9.22k|}
intrapred_avx2.c:transpose4x16_sse2:
   56|  3.53k|static inline void transpose4x16_sse2(__m128i *x, __m128i *d) {
   57|  3.53k|  __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3;
   58|  3.53k|  w0 = _mm_unpacklo_epi8(x[0], x[1]);
   59|  3.53k|  w1 = _mm_unpacklo_epi8(x[2], x[3]);
   60|  3.53k|  w2 = _mm_unpackhi_epi8(x[0], x[1]);
   61|  3.53k|  w3 = _mm_unpackhi_epi8(x[2], x[3]);
   62|       |
   63|  3.53k|  ww0 = _mm_unpacklo_epi16(w0, w1);
   64|  3.53k|  ww1 = _mm_unpacklo_epi16(w2, w3);
   65|  3.53k|  ww2 = _mm_unpackhi_epi16(w0, w1);
   66|  3.53k|  ww3 = _mm_unpackhi_epi16(w2, w3);
   67|       |
   68|  3.53k|  w0 = _mm_unpacklo_epi32(ww0, ww1);
   69|  3.53k|  w2 = _mm_unpacklo_epi32(ww2, ww3);
   70|  3.53k|  w1 = _mm_unpackhi_epi32(ww0, ww1);
   71|  3.53k|  w3 = _mm_unpackhi_epi32(ww2, ww3);
   72|       |
   73|  3.53k|  d[0] = _mm_unpacklo_epi64(w0, w2);
   74|  3.53k|  d[1] = _mm_unpackhi_epi64(w0, w2);
   75|  3.53k|  d[2] = _mm_unpacklo_epi64(w1, w3);
   76|  3.53k|  d[3] = _mm_unpackhi_epi64(w1, w3);
   77|       |
   78|  3.53k|  d[4] = _mm_srli_si128(d[0], 8);
   79|  3.53k|  d[5] = _mm_srli_si128(d[1], 8);
   80|  3.53k|  d[6] = _mm_srli_si128(d[2], 8);
   81|  3.53k|  d[7] = _mm_srli_si128(d[3], 8);
   82|       |
   83|  3.53k|  d[8] = _mm_srli_si128(d[0], 4);
   84|  3.53k|  d[9] = _mm_srli_si128(d[1], 4);
   85|  3.53k|  d[10] = _mm_srli_si128(d[2], 4);
   86|  3.53k|  d[11] = _mm_srli_si128(d[3], 4);
   87|       |
   88|  3.53k|  d[12] = _mm_srli_si128(d[0], 12);
   89|  3.53k|  d[13] = _mm_srli_si128(d[1], 12);
   90|  3.53k|  d[14] = _mm_srli_si128(d[2], 12);
   91|       |  d[15] = _mm_srli_si128(d[3], 12);
   92|  3.53k|}

intrapred_sse2.c:dc_sum_16_sse2:
   19|   138k|static inline __m128i dc_sum_16_sse2(const uint8_t *ref) {
   20|   138k|  __m128i x = _mm_load_si128((__m128i const *)ref);
   21|   138k|  const __m128i zero = _mm_setzero_si128();
   22|   138k|  x = _mm_sad_epu8(x, zero);
   23|   138k|  const __m128i high = _mm_unpackhi_epi64(x, x);
   24|   138k|  return _mm_add_epi16(x, high);
   25|   138k|}
intrapred_sse2.c:dc_sum_32_sse2:
   27|  38.4k|static inline __m128i dc_sum_32_sse2(const uint8_t *ref) {
   28|  38.4k|  __m128i x0 = _mm_load_si128((__m128i const *)ref);
   29|  38.4k|  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
   30|  38.4k|  const __m128i zero = _mm_setzero_si128();
   31|  38.4k|  x0 = _mm_sad_epu8(x0, zero);
   32|  38.4k|  x1 = _mm_sad_epu8(x1, zero);
   33|  38.4k|  x0 = _mm_add_epi16(x0, x1);
   34|  38.4k|  const __m128i high = _mm_unpackhi_epi64(x0, x0);
   35|  38.4k|  return _mm_add_epi16(x0, high);
   36|  38.4k|}
intrapred_avx2.c:dc_sum_32_sse2:
   27|  16.4k|static inline __m128i dc_sum_32_sse2(const uint8_t *ref) {
   28|  16.4k|  __m128i x0 = _mm_load_si128((__m128i const *)ref);
   29|  16.4k|  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
   30|  16.4k|  const __m128i zero = _mm_setzero_si128();
   31|  16.4k|  x0 = _mm_sad_epu8(x0, zero);
   32|  16.4k|  x1 = _mm_sad_epu8(x1, zero);
   33|  16.4k|  x0 = _mm_add_epi16(x0, x1);
   34|  16.4k|  const __m128i high = _mm_unpackhi_epi64(x0, x0);
   35|  16.4k|  return _mm_add_epi16(x0, high);
   36|  16.4k|}
intrapred_avx2.c:dc_sum_16_sse2:
   19|  18.4k|static inline __m128i dc_sum_16_sse2(const uint8_t *ref) {
   20|  18.4k|  __m128i x = _mm_load_si128((__m128i const *)ref);
   21|  18.4k|  const __m128i zero = _mm_setzero_si128();
   22|  18.4k|  x = _mm_sad_epu8(x, zero);
   23|  18.4k|  const __m128i high = _mm_unpackhi_epi64(x, x);
   24|  18.4k|  return _mm_add_epi16(x, high);
   25|  18.4k|}

aom_lpf_horizontal_4_sse2:
  331|   189k|                               const uint8_t *_thresh) {
  332|   189k|  const __m128i zero = _mm_setzero_si128();
  333|   189k|  __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
  334|   189k|                                     _mm_loadl_epi64((const __m128i *)_limit));
  335|   189k|  __m128i thresh =
  336|   189k|      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  337|       |
  338|   189k|  __m128i qs1qs0, ps1ps0;
  339|   189k|  __m128i p1, p0, q0, q1;
  340|       |
  341|   189k|  p1 = xx_loadl_32(s - 2 * p);
  342|   189k|  p0 = xx_loadl_32(s - 1 * p);
  343|   189k|  q0 = xx_loadl_32(s - 0 * p);
  344|   189k|  q1 = xx_loadl_32(s + 1 * p);
  345|       |
  346|   189k|  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
  347|       |
  348|   189k|  xx_storel_32(s - 1 * p, ps1ps0);
  349|   189k|  xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4));
  350|   189k|  xx_storel_32(s + 0 * p, qs1qs0);
  351|       |  xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4));
  352|   189k|}
aom_lpf_vertical_4_sse2:
  356|   158k|                             const uint8_t *_thresh) {
  357|   158k|  __m128i p1p0, q1q0;
  358|   158k|  __m128i p1, p0, q0, q1;
  359|       |
  360|   158k|  const __m128i zero = _mm_setzero_si128();
  361|   158k|  __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
  362|   158k|                                     _mm_loadl_epi64((const __m128i *)_limit));
  363|   158k|  __m128i thresh =
  364|   158k|      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  365|       |
  366|   158k|  __m128i x0, x1, x2, x3;
  367|   158k|  __m128i d0, d1, d2, d3;
  368|   158k|  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
  369|   158k|  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
  370|   158k|  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
  371|   158k|  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
  372|       |
  373|   158k|  transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1);
  374|       |
  375|   158k|  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
  376|       |
  377|       |  // Transpose 8x4 to 4x8
  378|   158k|  p1 = _mm_srli_si128(p1p0, 4);
  379|   158k|  q1 = _mm_srli_si128(q1q0, 4);
  380|       |
  381|   158k|  transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
  382|       |
  383|   158k|  xx_storel_32(s + 0 * p - 2, d0);
  384|   158k|  xx_storel_32(s + 1 * p - 2, d1);
  385|   158k|  xx_storel_32(s + 2 * p - 2, d2);
  386|   158k|  xx_storel_32(s + 3 * p - 2, d3);
  387|   158k|}
aom_lpf_horizontal_14_sse2:
  960|  61.4k|                                const unsigned char *_thresh) {
  961|  61.4k|  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
  962|  61.4k|  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  963|  61.4k|  __m128i limit = _mm_load_si128((const __m128i *)_limit);
  964|  61.4k|  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  965|       |
  966|  61.4k|  q4p4 = _mm_unpacklo_epi32(xx_loadl_32(s - 5 * p), xx_loadl_32(s + 4 * p));
  967|  61.4k|  q3p3 = _mm_unpacklo_epi32(xx_loadl_32(s - 4 * p), xx_loadl_32(s + 3 * p));
  968|  61.4k|  q2p2 = _mm_unpacklo_epi32(xx_loadl_32(s - 3 * p), xx_loadl_32(s + 2 * p));
  969|  61.4k|  q1p1 = _mm_unpacklo_epi32(xx_loadl_32(s - 2 * p), xx_loadl_32(s + 1 * p));
  970|       |
  971|  61.4k|  q0p0 = _mm_unpacklo_epi32(xx_loadl_32(s - 1 * p), xx_loadl_32(s - 0 * p));
  972|       |
  973|  61.4k|  q5p5 = _mm_unpacklo_epi32(xx_loadl_32(s - 6 * p), xx_loadl_32(s + 5 * p));
  974|       |
  975|  61.4k|  q6p6 = _mm_unpacklo_epi32(xx_loadl_32(s - 7 * p), xx_loadl_32(s + 6 * p));
  976|       |
  977|  61.4k|  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
  978|  61.4k|                       &limit, &thresh);
  979|       |
  980|  61.4k|  store_buffer_horz_8(q0p0, p, 0, s);
  981|  61.4k|  store_buffer_horz_8(q1p1, p, 1, s);
  982|  61.4k|  store_buffer_horz_8(q2p2, p, 2, s);
  983|  61.4k|  store_buffer_horz_8(q3p3, p, 3, s);
  984|  61.4k|  store_buffer_horz_8(q4p4, p, 4, s);
  985|  61.4k|  store_buffer_horz_8(q5p5, p, 5, s);
  986|  61.4k|}
aom_lpf_horizontal_6_sse2:
 1254|   292k|                               const unsigned char *_thresh) {
 1255|   292k|  __m128i p2, p1, p0, q0, q1, q2;
 1256|   292k|  __m128i p1p0, q1q0;
 1257|   292k|  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
 1258|   292k|  __m128i limit = _mm_load_si128((__m128i *)_limit);
 1259|   292k|  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
 1260|       |
 1261|   292k|  p2 = xx_loadl_32(s - 3 * p);
 1262|   292k|  p1 = xx_loadl_32(s - 2 * p);
 1263|   292k|  p0 = xx_loadl_32(s - 1 * p);
 1264|   292k|  q0 = xx_loadl_32(s - 0 * p);
 1265|   292k|  q1 = xx_loadl_32(s + 1 * p);
 1266|   292k|  q2 = xx_loadl_32(s + 2 * p);
 1267|       |
 1268|   292k|  lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
 1269|   292k|                      &limit, &thresh);
 1270|       |
 1271|   292k|  xx_storel_32(s - 1 * p, p1p0);
 1272|   292k|  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
 1273|   292k|  xx_storel_32(s + 0 * p, q1q0);
 1274|       |  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
 1275|   292k|}
aom_lpf_horizontal_8_sse2:
 1612|   127k|                               const unsigned char *_thresh) {
 1613|   127k|  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
 1614|   127k|  __m128i q1q0, p1p0;
 1615|   127k|  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
 1616|   127k|  __m128i limit = _mm_load_si128((const __m128i *)_limit);
 1617|   127k|  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
 1618|       |
 1619|   127k|  p3 = xx_loadl_32(s - 4 * p);
 1620|   127k|  p2 = xx_loadl_32(s - 3 * p);
 1621|   127k|  p1 = xx_loadl_32(s - 2 * p);
 1622|   127k|  p0 = xx_loadl_32(s - 1 * p);
 1623|   127k|  q0 = xx_loadl_32(s - 0 * p);
 1624|   127k|  q1 = xx_loadl_32(s + 1 * p);
 1625|   127k|  q2 = xx_loadl_32(s + 2 * p);
 1626|   127k|  q3 = xx_loadl_32(s + 3 * p);
 1627|       |
 1628|   127k|  lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
 1629|   127k|                      &blimit, &limit, &thresh);
 1630|       |
 1631|   127k|  xx_storel_32(s - 1 * p, p1p0);
 1632|   127k|  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
 1633|   127k|  xx_storel_32(s + 0 * p, q1q0);
 1634|       |  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
 1635|   127k|  xx_storel_32(s - 3 * p, p2);
 1636|   127k|  xx_storel_32(s + 2 * p, q2);
 1637|   127k|}
aom_lpf_vertical_6_sse2:
 1830|   296k|                             const unsigned char *_thresh) {
 1831|   296k|  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
 1832|   296k|  __m128i x2, x1, x0, x3;
 1833|   296k|  __m128i p0, q0;
 1834|   296k|  __m128i p1p0, q1q0;
 1835|   296k|  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
 1836|   296k|  __m128i limit = _mm_load_si128((__m128i *)_limit);
 1837|   296k|  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
 1838|       |
 1839|   296k|  x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
 1840|   296k|  x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
 1841|   296k|  x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
 1842|   296k|  x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
 1843|       |
 1844|   296k|  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
 1845|   296k|                        &d7);
 1846|       |
 1847|   296k|  lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
 1848|   296k|                      &limit, &thresh);
 1849|       |
 1850|   296k|  p0 = _mm_srli_si128(p1p0, 4);
 1851|   296k|  q0 = _mm_srli_si128(q1q0, 4);
 1852|       |
 1853|   296k|  transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
 1854|       |
 1855|   296k|  xx_storel_32(s + 0 * p - 2, d0);
 1856|   296k|  xx_storel_32(s + 1 * p - 2, d1);
 1857|   296k|  xx_storel_32(s + 2 * p - 2, d2);
 1858|   296k|  xx_storel_32(s + 3 * p - 2, d3);
 1859|   296k|}
aom_lpf_vertical_8_sse2:
 1919|   124k|                             const unsigned char *_thresh) {
 1920|   124k|  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
 1921|       |
 1922|   124k|  __m128i p0, q0;
 1923|   124k|  __m128i x2, x1, x0, x3;
 1924|   124k|  __m128i q1q0, p1p0;
 1925|   124k|  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
 1926|   124k|  __m128i limit = _mm_load_si128((const __m128i *)_limit);
 1927|   124k|  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
 1928|       |
 1929|   124k|  x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p));
 1930|   124k|  x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
 1931|   124k|  x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
 1932|   124k|  x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
 1933|       |
 1934|   124k|  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
 1935|   124k|                        &d7);
 1936|       |  // Loop filtering
 1937|   124k|  lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0,
 1938|   124k|                      &blimit, &limit, &thresh);
 1939|       |
 1940|   124k|  p0 = _mm_srli_si128(p1p0, 4);
 1941|   124k|  q0 = _mm_srli_si128(q1q0, 4);
 1942|       |
 1943|   124k|  transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1,
 1944|   124k|                        &d2, &d3);
 1945|       |
 1946|   124k|  _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
 1947|   124k|  _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1);
 1948|   124k|  _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2);
 1949|   124k|  _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3);
 1950|   124k|}
aom_lpf_vertical_14_sse2:
 2010|  63.3k|                              const unsigned char *_thresh) {
 2011|  63.3k|  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
 2012|  63.3k|  __m128i x6, x5, x4, x3;
 2013|  63.3k|  __m128i pq0, pq1, pq2, pq3;
 2014|  63.3k|  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
 2015|  63.3k|  __m128i limit = _mm_load_si128((__m128i *)_limit);
 2016|  63.3k|  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
 2017|       |
 2018|  63.3k|  x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
 2019|  63.3k|  x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
 2020|  63.3k|  x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
 2021|  63.3k|  x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
 2022|       |
 2023|  63.3k|  transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4,
 2024|  63.3k|                       &q5p5, &q6p6, &q7p7);
 2025|       |
 2026|  63.3k|  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
 2027|  63.3k|                       &limit, &thresh);
 2028|       |
 2029|  63.3k|  transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
 2030|  63.3k|                           &q0p0, &pq0, &pq1, &pq2, &pq3);
 2031|  63.3k|  _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0);
 2032|  63.3k|  _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1);
 2033|  63.3k|  _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2);
 2034|  63.3k|  _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3);
 2035|  63.3k|}
loopfilter_sse2.c:lpf_internal_4_sse2:
  246|   343k|    __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
  247|   343k|  __m128i q1p1, q0p0, p1p0, q1q0;
  248|   343k|  __m128i abs_p0q0, abs_p1q1;
  249|   343k|  __m128i mask, flat, hev;
  250|   343k|  const __m128i zero = _mm_setzero_si128();
  251|       |
  252|   343k|  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
  253|   343k|  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
  254|       |
  255|   343k|  p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);
  256|   343k|  q1q0 = _mm_srli_si128(p1p0, 8);
  257|       |
  258|       |  /* (abs(q1 - q0), abs(p1 - p0) */
  259|   343k|  flat = abs_diff(q1p1, q0p0);
  260|       |  /* abs(p1 - q1), abs(p0 - q0) */
  261|   343k|  __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
  262|       |
  263|       |  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
  264|   343k|  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
  265|   343k|  hev = _mm_unpacklo_epi8(flat, zero);
  266|       |
  267|   343k|  hev = _mm_cmpgt_epi16(hev, *thresh);
  268|   343k|  hev = _mm_packs_epi16(hev, hev);
  269|   343k|  hev = _mm_unpacklo_epi32(hev, hev);
  270|       |
  271|   343k|  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
  272|   343k|  abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4);           /* abs(p1 - q1) */
  273|   343k|  abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1);
  274|   343k|  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
  275|   343k|  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
  276|       |  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
  277|       |
  278|   343k|  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
  279|   343k|  mask = _mm_unpacklo_epi32(mask, flat);
  280|   343k|  mask = _mm_subs_epu8(mask, *limit);
  281|   343k|  mask = _mm_cmpeq_epi8(mask, zero);
  282|   343k|  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4));
  283|       |
  284|   343k|  filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
  285|   343k|}
loopfilter_sse2.c:filter4_sse2:
  143|  1.18M|                                          __m128i *qs1qs0, __m128i *ps1ps0) {
  144|  1.18M|  __m128i filter, filter2filter1, work;
  145|  1.18M|  __m128i ps1ps0_work, qs1qs0_work;
  146|  1.18M|  __m128i hev1;
  147|  1.18M|  const __m128i t3t4 =
  148|  1.18M|      _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4);
  149|  1.18M|  const __m128i t80 = _mm_set1_epi8((char)0x80);
  150|  1.18M|  const __m128i ff = _mm_cmpeq_epi8(t80, t80);
  151|       |
  152|  1.18M|  ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
  153|  1.18M|  qs1qs0_work = _mm_xor_si128(*q1q0, t80);
  154|       |
  155|       |  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
  156|  1.18M|  work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
  157|  1.18M|  filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev);
  158|       |  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
  159|  1.18M|  filter = _mm_subs_epi8(filter, work);
  160|  1.18M|  filter = _mm_subs_epi8(filter, work);
  161|  1.18M|  filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */
  162|  1.18M|  filter = _mm_and_si128(filter, *mask); /* & mask */
  163|  1.18M|  filter = _mm_unpacklo_epi32(filter, filter);
  164|       |
  165|       |  /* filter1 = signed_char_clamp(filter + 4) >> 3; */
  166|       |  /* filter2 = signed_char_clamp(filter + 3) >> 3; */
  167|  1.18M|  filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
  168|  1.18M|  filter2filter1 =
  169|  1.18M|      _mm_unpacklo_epi8(filter2filter1, filter2filter1);  // goto 16 bit
  170|  1.18M|  filter2filter1 = _mm_srai_epi16(filter2filter1, 11);    /* >> 3 */
  171|  1.18M|  filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1);
  172|       |
  173|       |  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
  174|  1.18M|  filter = _mm_subs_epi8(filter2filter1, ff);  /* + 1 */
  175|  1.18M|  filter = _mm_unpacklo_epi8(filter, filter);  // goto 16 bit
  176|  1.18M|  filter = _mm_srai_epi16(filter, 9);          /* round */
  177|  1.18M|  filter = _mm_packs_epi16(filter, filter);
  178|  1.18M|  filter = _mm_andnot_si128(*hev, filter);
  179|  1.18M|  filter = _mm_unpacklo_epi32(filter, filter);
  180|       |
  181|  1.18M|  filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter);
  182|  1.18M|  hev1 = _mm_srli_si128(filter2filter1, 8);
  183|       |  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
  184|  1.18M|  qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
  185|       |  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
  186|  1.18M|  ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
  187|       |
  188|  1.18M|  *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
  189|  1.18M|  *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
  190|  1.18M|}
loopfilter_sse2.c:lpf_internal_14_sse2:
  700|   123k|    __m128i *thresh) {
  701|   123k|  const __m128i zero = _mm_setzero_si128();
  702|   123k|  const __m128i one = _mm_set1_epi8(1);
  703|   123k|  __m128i mask, hev, flat, flat2;
  704|   123k|  __m128i flat2_pq[6], flat_pq[3];
  705|   123k|  __m128i qs0ps0, qs1ps1;
  706|   123k|  __m128i p1p0, q1q0, qs1qs0, ps1ps0;
  707|   123k|  __m128i abs_p1p0;
  708|       |
  709|   123k|  p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1);
  710|   123k|  q1q0 = _mm_srli_si128(p1p0, 8);
  711|       |
  712|   123k|  __m128i fe, ff, work;
  713|   123k|  {
  714|   123k|    __m128i abs_p1q1, abs_p0q0, abs_q1q0;
  715|   123k|    abs_p1p0 = abs_diff(*q1p1, *q0p0);
  716|   123k|    abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
  717|   123k|    fe = _mm_set1_epi8((char)0xfe);
  718|   123k|    ff = _mm_cmpeq_epi8(fe, fe);
  719|   123k|    abs_p0q0 = abs_diff(p1p0, q1q0);
  720|   123k|    abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
  721|       |
  722|   123k|    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
  723|       |
  724|   123k|    hev = _mm_subs_epu8(flat, *thresh);
  725|   123k|    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
  726|       |    // replicate for the further "merged variables" usage
  727|   123k|    hev = _mm_unpacklo_epi32(hev, hev);
  728|       |
  729|   123k|    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
  730|   123k|    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
  731|   123k|    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
  732|   123k|    mask = _mm_unpacklo_epi32(mask, zero);
  733|   123k|    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
  734|       |    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
  735|   123k|    mask = _mm_max_epu8(abs_p1p0, mask);
  736|       |    // mask |= (abs(p1 - p0) > limit) * -1;
  737|       |    // mask |= (abs(q1 - q0) > limit) * -1;
  738|       |
  739|   123k|    work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
  740|   123k|    mask = _mm_max_epu8(work, mask);
  741|   123k|    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
  742|   123k|    mask = _mm_subs_epu8(mask, *limit);
  743|   123k|    mask = _mm_cmpeq_epi8(mask, zero);
  744|   123k|  }
  745|       |
  746|       |  // lp filter - the same for 6, 8 and 14 versions
  747|   123k|  filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
  748|   123k|  qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0);
  749|   123k|  qs1ps1 = _mm_srli_si128(qs0ps0, 8);
  750|       |  // loopfilter done
  751|       |
  752|   123k|  flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
  753|   123k|  flat = _mm_max_epu8(abs_p1p0, flat);
  754|   123k|  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
  755|   123k|  flat = _mm_subs_epu8(flat, one);
  756|   123k|  flat = _mm_cmpeq_epi8(flat, zero);
  757|   123k|  flat = _mm_and_si128(flat, mask);
  758|   123k|  flat = _mm_unpacklo_epi32(flat, flat);
  759|   123k|  flat = _mm_unpacklo_epi64(flat, flat);
  760|       |
  761|       |  // if flat ==0 then flat2 is zero as well and we don't need any calc below
  762|       |  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
  763|   123k|  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
  ------------------
  |  Branch (763:7): [True: 67.3k, False: 56.6k]
  ------------------
  764|       |    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  765|       |    // flat and wide flat calculations
  766|  67.3k|    __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
  767|  67.3k|    __m128i pq_16[7];
  768|  67.3k|    const __m128i eight = _mm_set1_epi16(8);
  769|  67.3k|    const __m128i four = _mm_set1_epi16(4);
  770|  67.3k|    __m128i sum_p6;
  771|  67.3k|    __m128i sum_p3;
  772|       |
  773|  67.3k|    pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero);
  774|  67.3k|    pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero);
  775|  67.3k|    pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero);
  776|  67.3k|    pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero);
  777|  67.3k|    pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero);
  778|  67.3k|    pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero);
  779|  67.3k|    pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero);
  780|  67.3k|    q0_16 = _mm_srli_si128(pq_16[0], 8);
  781|  67.3k|    q1_16 = _mm_srli_si128(pq_16[1], 8);
  782|  67.3k|    q2_16 = _mm_srli_si128(pq_16[2], 8);
  783|  67.3k|    q3_16 = _mm_srli_si128(pq_16[3], 8);
  784|  67.3k|    q4_16 = _mm_srli_si128(pq_16[4], 8);
  785|  67.3k|    q5_16 = _mm_srli_si128(pq_16[5], 8);
  786|       |
  787|  67.3k|    __m128i flat_p[3], flat_q[3];
  788|  67.3k|    __m128i flat2_p[6], flat2_q[6];
  789|       |
  790|  67.3k|    __m128i work0, work0_0, work0_1, sum_p_0;
  791|  67.3k|    __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3]));
  792|  67.3k|    __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1]));
  793|  67.3k|    sum_p = _mm_add_epi16(sum_p, sum_lp);
  794|       |
  795|  67.3k|    __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
  796|  67.3k|    __m128i sum_q = _mm_srli_si128(sum_p, 8);
  797|       |
  798|  67.3k|    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
  799|  67.3k|    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
  800|       |
  801|  67.3k|    flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0]));
  802|  67.3k|    flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16));
  803|       |
  804|  67.3k|    sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]);
  805|  67.3k|    sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]);
  806|       |
  807|  67.3k|    sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]);
  808|  67.3k|    sum_p = _mm_sub_epi16(sum_p_0, q5_16);
  809|       |
  810|  67.3k|    work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]);
  811|  67.3k|    work0_1 = _mm_add_epi16(
  812|  67.3k|        sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0])));
  813|       |
  814|  67.3k|    sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]);
  815|  67.3k|    sum_lp = _mm_sub_epi16(sum_lp, q2_16);
  816|       |
  817|  67.3k|    work0 = _mm_add_epi16(sum_p3, pq_16[1]);
  818|  67.3k|    flat_p[1] = _mm_add_epi16(sum_lp, work0);
  819|  67.3k|    flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
  820|       |
  821|  67.3k|    flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
  822|  67.3k|    flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
  823|  67.3k|    flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]);
  824|  67.3k|    flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]);
  825|       |
  826|  67.3k|    sum_lp = _mm_sub_epi16(sum_lp, q1_16);
  827|  67.3k|    sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]);
  828|       |
  829|  67.3k|    sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]);
  830|  67.3k|    work0 = _mm_add_epi16(sum_p3, pq_16[2]);
  831|       |
  832|  67.3k|    flat_p[2] = _mm_add_epi16(sum_lp, work0);
  833|  67.3k|    flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
  834|  67.3k|    flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
  835|  67.3k|    flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]);
  836|       |
  837|       |    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  838|  67.3k|    flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
  839|       |
  840|  67.3k|    work = abs_diff(*q6p6, *q0p0);
  841|  67.3k|    flat2 = _mm_max_epu8(work, flat2);
  842|  67.3k|    flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4));
  843|  67.3k|    flat2 = _mm_subs_epu8(flat2, one);
  844|  67.3k|    flat2 = _mm_cmpeq_epi8(flat2, zero);
  845|  67.3k|    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  846|  67.3k|    flat2 = _mm_unpacklo_epi32(flat2, flat2);
  847|       |
  848|       |    // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  849|  67.3k|    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
  850|  67.3k|    flat_pq[0] = _mm_and_si128(flat, flat_pq[0]);
  851|  67.3k|    *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]);
  852|       |
  853|  67.3k|    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
  854|  67.3k|    flat_pq[1] = _mm_and_si128(flat, flat_pq[1]);
  855|  67.3k|    *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]);
  856|       |
  857|  67.3k|    *q2p2 = _mm_andnot_si128(flat, *q2p2);
  858|  67.3k|    flat_pq[2] = _mm_and_si128(flat, flat_pq[2]);
  859|  67.3k|    *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]);
  860|       |
  861|  67.3k|    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
  ------------------
  |  Branch (861:9): [True: 59.6k, False: 7.70k]
  ------------------
  862|  59.6k|      flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16));
  863|  59.6k|      flat2_q[0] = _mm_add_epi16(
  864|  59.6k|          sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0]));
  865|       |
  866|  59.6k|      flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
  867|  59.6k|      flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
  868|       |
  869|  59.6k|      flat2_pq[0] =
  870|  59.6k|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
  871|  59.6k|      flat2_pq[1] =
  872|  59.6k|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
  873|  59.6k|      flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]);
  874|  59.6k|      flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]);
  875|       |
  876|  59.6k|      sum_p = _mm_sub_epi16(sum_p, q4_16);
  877|  59.6k|      sum_q = _mm_sub_epi16(sum_q, pq_16[4]);
  878|       |
  879|  59.6k|      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
  880|  59.6k|      work0 = _mm_add_epi16(
  881|  59.6k|          sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1])));
  882|  59.6k|      flat2_p[2] = _mm_add_epi16(sum_p, work0);
  883|  59.6k|      flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
  884|  59.6k|      flat2_pq[2] =
  885|  59.6k|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
  886|  59.6k|      flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]);
  887|       |
  888|  59.6k|      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
  889|  59.6k|      sum_p = _mm_sub_epi16(sum_p, q3_16);
  890|  59.6k|      sum_q = _mm_sub_epi16(sum_q, pq_16[3]);
  891|       |
  892|  59.6k|      work0 = _mm_add_epi16(
  893|  59.6k|          sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2])));
  894|  59.6k|      flat2_p[3] = _mm_add_epi16(sum_p, work0);
  895|  59.6k|      flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
  896|  59.6k|      flat2_pq[3] =
  897|  59.6k|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
  898|  59.6k|      flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]);
  899|       |
  900|  59.6k|      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
  901|  59.6k|      sum_p = _mm_sub_epi16(sum_p, q2_16);
  902|  59.6k|      sum_q = _mm_sub_epi16(sum_q, pq_16[2]);
  903|       |
  904|  59.6k|      work0 = _mm_add_epi16(
  905|  59.6k|          sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3])));
  906|  59.6k|      flat2_p[4] = _mm_add_epi16(sum_p, work0);
  907|  59.6k|      flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
  908|  59.6k|      flat2_pq[4] =
  909|  59.6k|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
  910|  59.6k|      flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]);
  911|       |
  912|  59.6k|      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
  913|  59.6k|      sum_p = _mm_sub_epi16(sum_p, q1_16);
  914|  59.6k|      sum_q = _mm_sub_epi16(sum_q, pq_16[1]);
  915|       |
  916|  59.6k|      work0 = _mm_add_epi16(
  917|  59.6k|          sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4])));
  918|  59.6k|      flat2_p[5] = _mm_add_epi16(sum_p, work0);
  919|  59.6k|      flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
  920|  59.6k|      flat2_pq[5] =
  921|  59.6k|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
  922|  59.6k|      flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]);
  923|       |
  924|       |      // wide flat
  925|       |      // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  926|       |
  927|  59.6k|      *q0p0 = _mm_andnot_si128(flat2, *q0p0);
  928|  59.6k|      flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]);
  929|  59.6k|      *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]);
  930|       |
  931|  59.6k|      *q1p1 = _mm_andnot_si128(flat2, *q1p1);
  932|  59.6k|      flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]);
  933|  59.6k|      *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]);
  934|       |
  935|  59.6k|      *q2p2 = _mm_andnot_si128(flat2, *q2p2);
  936|  59.6k|      flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]);
  937|  59.6k|      *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]);
  938|       |
  939|  59.6k|      *q3p3 = _mm_andnot_si128(flat2, *q3p3);
  940|  59.6k|      flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]);
  941|  59.6k|      *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]);
  942|       |
  943|  59.6k|      *q4p4 = _mm_andnot_si128(flat2, *q4p4);
  944|  59.6k|      flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]);
  945|  59.6k|      *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]);
  946|       |
  947|  59.6k|      *q5p5 = _mm_andnot_si128(flat2, *q5p5);
  948|  59.6k|      flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]);
  949|  59.6k|      *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]);
  950|  59.6k|    }
  951|  67.3k|  } else {
  952|  56.6k|    *q0p0 = qs0ps0;
  953|  56.6k|    *q1p1 = qs1ps1;
  954|  56.6k|  }
  955|   123k|}
loopfilter_sse2.c:store_buffer_horz_8:
  389|   364k|static inline void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
  390|   364k|  xx_storel_32(s - (num + 1) * p, x);
  391|       |  xx_storel_32(s + num * p, _mm_srli_si128(x, 4));
  392|   364k|}
loopfilter_sse2.c:lpf_internal_6_sse2:
 1120|   566k|    __m128i *thresh) {
 1121|   566k|  const __m128i zero = _mm_setzero_si128();
 1122|   566k|  __m128i mask, hev, flat;
 1123|   566k|  __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
 1124|   566k|  __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16;
 1125|   566k|  __m128i ps1ps0, qs1qs0;
 1126|       |
 1127|   566k|  q2p2 = _mm_unpacklo_epi32(*p2, *q2);
 1128|   566k|  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
 1129|   566k|  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
 1130|       |
 1131|   566k|  *p1p0 = _mm_unpacklo_epi32(*p0, *p1);
 1132|   566k|  *q1q0 = _mm_unpacklo_epi32(*q0, *q1);
 1133|       |
 1134|   566k|  const __m128i one = _mm_set1_epi8(1);
 1135|   566k|  const __m128i fe = _mm_set1_epi8((char)0xfe);
 1136|   566k|  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
 1137|   566k|  {
 1138|       |    // filter_mask and hev_mask
 1139|   566k|    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
 1140|   566k|    abs_p1p0 = abs_diff(q1p1, q0p0);
 1141|   566k|    abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
 1142|       |
 1143|   566k|    abs_p0q0 = abs_diff(*p1p0, *q1q0);
 1144|   566k|    abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
 1145|       |
 1146|       |    // considering sse doesn't have unsigned elements comparison the idea is
 1147|       |    // to find at least one case when X > limit, it means the corresponding
 1148|       |    // mask bit is set.
 1149|       |    // to achieve that we find global max value of all inputs of abs(x-y) or
 1150|       |    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
 1151|       |    // otherwise - not
 1152|       |
 1153|   566k|    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
 1154|   566k|    hev = _mm_subs_epu8(flat, *thresh);
 1155|   566k|    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 1156|       |    // replicate for the further "merged variables" usage
 1157|   566k|    hev = _mm_unpacklo_epi32(hev, hev);
 1158|       |
 1159|   566k|    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
 1160|   566k|    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
 1161|   566k|    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
 1162|   566k|    mask = _mm_unpacklo_epi32(mask, zero);
 1163|   566k|    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
 1164|       |    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
 1165|   566k|    mask = _mm_max_epu8(abs_p1p0, mask);
 1166|       |    // mask |= (abs(p1 - p0) > limit) * -1;
 1167|       |    // mask |= (abs(q1 - q0) > limit) * -1;
 1168|       |
 1169|   566k|    work = abs_diff(q2p2, q1p1);
 1170|   566k|    mask = _mm_max_epu8(work, mask);
 1171|   566k|    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
 1172|   566k|    mask = _mm_subs_epu8(mask, *limit);
 1173|   566k|    mask = _mm_cmpeq_epi8(mask, zero);
 1174|       |
 1175|       |    // lp filter - the same for 6, 8 and 14 versions
 1176|   566k|    filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
 1177|       |
 1178|       |    // flat_mask
 1179|   566k|    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
 1180|   566k|    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
 1181|   566k|    flat = _mm_subs_epu8(flat, one);
 1182|   566k|    flat = _mm_cmpeq_epi8(flat, zero);
 1183|   566k|    flat = _mm_and_si128(flat, mask);
 1184|       |    // replicate for the further "merged variables" usage
 1185|   566k|    flat = _mm_unpacklo_epi32(flat, flat);
 1186|   566k|    flat = _mm_unpacklo_epi64(flat, flat);
 1187|   566k|  }
 1188|       |
 1189|       |  // 5 tap filter
 1190|       |  // need it only if flat !=0
 1191|   566k|  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
  ------------------
  |  Branch (1191:7): [True: 467k, False: 98.7k]
  ------------------
 1192|   467k|    const __m128i four = _mm_set1_epi16(4);
 1193|   467k|    __m128i workp_a, workp_b, workp_c;
 1194|   467k|    __m128i pq0x2_pq1, pq1_pq2;
 1195|   467k|    pq2_16 = _mm_unpacklo_epi8(q2p2, zero);
 1196|   467k|    pq1_16 = _mm_unpacklo_epi8(q1p1, zero);
 1197|   467k|    pq0_16 = _mm_unpacklo_epi8(q0p0, zero);
 1198|   467k|    q0_16 = _mm_srli_si128(pq0_16, 8);
 1199|   467k|    q2_16 = _mm_srli_si128(pq2_16, 8);
 1200|       |
 1201|       |    // op1
 1202|   467k|    pq0x2_pq1 =
 1203|   467k|        _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16);  // p0 *2 + p1
 1204|   467k|    pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16);                   // p1 + p2
 1205|   467k|    workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
 1206|   467k|                            pq1_pq2);  // p2 + p0 * 2 + p1 * 2 + 4
 1207|       |
 1208|   467k|    workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16);
 1209|   467k|    workp_b =
 1210|   467k|        _mm_add_epi16(workp_a, workp_b);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
 1211|       |
 1212|       |    // op0
 1213|   467k|    workp_c = _mm_srli_si128(pq0x2_pq1, 8);  // q0 * 2 + q1
 1214|   467k|    workp_a = _mm_add_epi16(workp_a,
 1215|   467k|                            workp_c);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
 1216|   467k|    workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
 1217|   467k|    workp_b = _mm_srli_epi16(workp_b, 3);
 1218|       |
 1219|   467k|    flat_p1p0 = _mm_packus_epi16(workp_b, workp_b);
 1220|       |
 1221|       |    // oq0
 1222|   467k|    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16),
 1223|   467k|                            pq1_16);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
 1224|   467k|    workp_b = _mm_srli_si128(pq1_pq2, 8);
 1225|   467k|    workp_a = _mm_add_epi16(
 1226|   467k|        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
 1227|       |    // workp_shft0 = _mm_srli_epi16(workp_a, 3);
 1228|       |
 1229|       |    // oq1
 1230|   467k|    workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16),
 1231|   467k|                            pq0_16);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
 1232|   467k|    workp_b = _mm_add_epi16(q2_16, q2_16);
 1233|   467k|    workp_b =
 1234|   467k|        _mm_add_epi16(workp_c, workp_b);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
 1235|       |
 1236|   467k|    workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
 1237|   467k|    workp_a = _mm_srli_epi16(workp_a, 3);
 1238|       |
 1239|   467k|    flat_q0q1 = _mm_packus_epi16(workp_a, workp_a);
 1240|       |
 1241|   467k|    qs1qs0 = _mm_andnot_si128(flat, *q1q0);
 1242|   467k|    *q1q0 = _mm_and_si128(flat, flat_q0q1);
 1243|   467k|    *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
 1244|       |
 1245|   467k|    ps1ps0 = _mm_andnot_si128(flat, *p1p0);
 1246|   467k|    *p1p0 = _mm_and_si128(flat, flat_p1p0);
 1247|   467k|    *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
 1248|   467k|  }
 1249|   566k|}
loopfilter_sse2.c:lpf_internal_8_sse2:
 1313|   246k|    __m128i *blimit, __m128i *limit, __m128i *thresh) {
 1314|   246k|  const __m128i zero = _mm_setzero_si128();
 1315|   246k|  __m128i mask, hev, flat;
 1316|   246k|  __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
 1317|   246k|      flat_p1p0, flat_q0q1;
 1318|   246k|  __m128i q2p2, q1p1, q0p0;
 1319|   246k|  __m128i q1q0, p1p0, ps1ps0, qs1qs0;
 1320|   246k|  __m128i work_pq, opq2, pq2;
 1321|       |
 1322|   246k|  q3p3 = _mm_unpacklo_epi32(*p3, *q3);
 1323|   246k|  q2p2 = _mm_unpacklo_epi32(*p2, *q2);
 1324|   246k|  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
 1325|   246k|  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
 1326|       |
 1327|   246k|  p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);  // p1p0 q1q0
 1328|   246k|  q1q0 = _mm_srli_si128(p1p0, 8);
 1329|       |
 1330|       |  // filter_mask and hev_mask
 1331|       |
 1332|       |  // considering sse doesn't have unsigned elements comparison the idea is to
 1333|       |  // find at least one case when X > limit, it means the corresponding  mask
 1334|       |  // bit is set.
 1335|       |  // to achieve that we find global max value of all inputs of abs(x-y) or
 1336|       |  // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
 1337|       |  // otherwise - not
 1338|       |
 1339|   246k|  const __m128i one = _mm_set1_epi8(1);
 1340|   246k|  const __m128i fe = _mm_set1_epi8((char)0xfe);
 1341|   246k|  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
 1342|   246k|  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
 1343|       |
 1344|   246k|  abs_p1p0 = abs_diff(q1p1, q0p0);
 1345|   246k|  abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
 1346|       |
 1347|   246k|  abs_p0q0 = abs_diff(p1p0, q1q0);
 1348|   246k|  abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
 1349|       |
 1350|   246k|  flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
 1351|   246k|  hev = _mm_subs_epu8(flat, *thresh);
 1352|   246k|  hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 1353|       |  // replicate for the further "merged variables" usage
 1354|   246k|  hev = _mm_unpacklo_epi32(hev, hev);
 1355|       |
 1356|   246k|  abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
 1357|   246k|  abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
 1358|   246k|  mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
 1359|   246k|  mask = _mm_unpacklo_epi32(mask, zero);
 1360|   246k|  mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
 1361|       |  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
 1362|   246k|  mask = _mm_max_epu8(abs_p1p0, mask);
 1363|       |  // mask |= (abs(p1 - p0) > limit) * -1;
 1364|       |  // mask |= (abs(q1 - q0) > limit) * -1;
 1365|       |
 1366|   246k|  work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
 1367|       |
 1368|   246k|  mask = _mm_max_epu8(work, mask);
 1369|   246k|  mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
 1370|   246k|  mask = _mm_subs_epu8(mask, *limit);
 1371|   246k|  mask = _mm_cmpeq_epi8(mask, zero);
 1372|       |
 1373|       |  // lp filter - the same for 6, 8 and 14 versions
 1374|   246k|  filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
 1375|       |
 1376|       |  // flat_mask4
 1377|   246k|  flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
 1378|   246k|  flat = _mm_max_epu8(abs_p1p0, flat);
 1379|       |
 1380|   246k|  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
 1381|   246k|  flat = _mm_subs_epu8(flat, one);
 1382|   246k|  flat = _mm_cmpeq_epi8(flat, zero);
 1383|   246k|  flat = _mm_and_si128(flat, mask);
 1384|       |  // replicate for the further "merged variables" usage
 1385|   246k|  flat = _mm_unpacklo_epi32(flat, flat);
 1386|   246k|  flat = _mm_unpacklo_epi64(flat, flat);
 1387|       |
 1388|       |  // filter8 need it only if flat !=0
 1389|   246k|  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
  ------------------
  |  Branch (1389:7): [True: 75.8k, False: 171k]
  ------------------
 1390|  75.8k|    const __m128i four = _mm_set1_epi16(4);
 1391|  75.8k|    __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2;
 1392|  75.8k|    p2_16 = _mm_unpacklo_epi8(*p2, zero);
 1393|  75.8k|    p1_16 = _mm_unpacklo_epi8(*p1, zero);
 1394|  75.8k|    p0_16 = _mm_unpacklo_epi8(*p0, zero);
 1395|  75.8k|    q0_16 = _mm_unpacklo_epi8(*q0, zero);
 1396|  75.8k|    q1_16 = _mm_unpacklo_epi8(*q1, zero);
 1397|  75.8k|    q2_16 = _mm_unpacklo_epi8(*q2, zero);
 1398|  75.8k|    p3_16 = _mm_unpacklo_epi8(*p3, zero);
 1399|  75.8k|    q3_16 = _mm_unpacklo_epi8(*q3, zero);
 1400|       |
 1401|       |    // op2
 1402|  75.8k|    workp_a =
 1403|  75.8k|        _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
 1404|  75.8k|    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
 1405|  75.8k|    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
 1406|  75.8k|    workp_shft2 = _mm_add_epi16(workp_a, workp_b);
 1407|       |
 1408|       |    // op1
 1409|  75.8k|    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
 1410|  75.8k|    workp_c = _mm_add_epi16(workp_a, workp_b);
 1411|       |    // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
 1412|       |
 1413|       |    // op0
 1414|  75.8k|    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
 1415|  75.8k|    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
 1416|  75.8k|    workp_d = _mm_add_epi16(workp_a, workp_b);
 1417|       |    // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
 1418|       |
 1419|  75.8k|    workp_c = _mm_unpacklo_epi64(workp_d, workp_c);
 1420|  75.8k|    workp_c = _mm_srli_epi16(workp_c, 3);
 1421|  75.8k|    flat_p1p0 = _mm_packus_epi16(workp_c, workp_c);
 1422|       |
 1423|       |    // oq0
 1424|  75.8k|    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
 1425|  75.8k|    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
 1426|       |    // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
 1427|  75.8k|    workp_c = _mm_add_epi16(workp_a, workp_b);
 1428|       |
 1429|       |    // oq1
 1430|  75.8k|    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
 1431|  75.8k|    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
 1432|  75.8k|    workp_d = _mm_add_epi16(workp_a, workp_b);
 1433|       |    // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
 1434|       |
 1435|  75.8k|    workp_c = _mm_unpacklo_epi64(workp_c, workp_d);
 1436|  75.8k|    workp_c = _mm_srli_epi16(workp_c, 3);
 1437|  75.8k|    flat_q0q1 = _mm_packus_epi16(workp_c, workp_c);
 1438|       |
 1439|       |    // oq2
 1440|  75.8k|    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
 1441|  75.8k|    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
 1442|  75.8k|    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
 1443|       |
 1444|  75.8k|    workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1);
 1445|  75.8k|    workp_c = _mm_srli_epi16(workp_c, 3);
 1446|       |
 1447|  75.8k|    opq2 = _mm_packus_epi16(workp_c, workp_c);
 1448|       |
 1449|  75.8k|    work_pq = _mm_andnot_si128(flat, q2p2);
 1450|  75.8k|    pq2 = _mm_and_si128(flat, opq2);
 1451|  75.8k|    *p2 = _mm_or_si128(work_pq, pq2);
 1452|  75.8k|    *q2 = _mm_srli_si128(*p2, 4);
 1453|       |
 1454|  75.8k|    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
 1455|  75.8k|    q1q0 = _mm_and_si128(flat, flat_q0q1);
 1456|  75.8k|    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
 1457|       |
 1458|  75.8k|    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
 1459|  75.8k|    p1p0 = _mm_and_si128(flat, flat_p1p0);
 1460|  75.8k|    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
 1461|  75.8k|  }
 1462|   246k|}
loopfilter_sse2.c:transpose_pq_14_sse2:
   33|  63.3k|                                        __m128i *q7p7) {
   34|  63.3k|  __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
   35|  63.3k|  w0 = _mm_unpacklo_epi8(
   36|  63.3k|      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
   37|  63.3k|  w1 = _mm_unpacklo_epi8(
   38|  63.3k|      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
   39|  63.3k|  w2 = _mm_unpackhi_epi8(
   40|  63.3k|      *x0, *x1);  // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
   41|  63.3k|  w3 = _mm_unpackhi_epi8(
   42|  63.3k|      *x2, *x3);  // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
   43|       |
   44|  63.3k|  ww0 = _mm_unpacklo_epi16(
   45|  63.3k|      w0, w1);  // 00 10 20 30 01 11 21 31        02 12 22 32 03 13 23 33
   46|  63.3k|  ww1 = _mm_unpackhi_epi16(
   47|  63.3k|      w0, w1);  // 04 14 24 34 05 15 25 35        06 16 26 36 07 17 27 37
   48|  63.3k|  ww2 = _mm_unpacklo_epi16(
   49|  63.3k|      w2, w3);  // 08 18 28 38 09 19 29 39       010 110 210 310 011 111 211 311
   50|  63.3k|  ww3 = _mm_unpackhi_epi16(
   51|  63.3k|      w2,
   52|  63.3k|      w3);  // 012 112 212 312 013 113 213 313  014 114 214 314 015 115 215 315
   53|       |
   54|  63.3k|  *q7p7 = _mm_unpacklo_epi32(
   55|  63.3k|      ww0,
   56|  63.3k|      _mm_srli_si128(
   57|  63.3k|          ww3, 12));  // 00 10 20 30  015 115 215 315  xx xx xx xx xx xx xx xx
   58|  63.3k|  *q6p6 = _mm_unpackhi_epi32(
   59|  63.3k|      _mm_slli_si128(ww0, 4),
   60|  63.3k|      ww3);  // 01 11 21 31  014 114 214 314  xx xx xx xxxx xx xx xx
   61|  63.3k|  *q5p5 = _mm_unpackhi_epi32(
   62|  63.3k|      ww0,
   63|  63.3k|      _mm_slli_si128(
   64|  63.3k|          ww3, 4));  // 02 12 22 32  013 113 213 313  xx xx xx x xx xx xx xxx
   65|  63.3k|  *q4p4 = _mm_unpacklo_epi32(
   66|  63.3k|      _mm_srli_si128(ww0, 12),
   67|  63.3k|      ww3);  // 03 13 23 33  012 112 212 312 xx xx xx xx xx xx xx xx
   68|  63.3k|  *q3p3 = _mm_unpacklo_epi32(
   69|  63.3k|      ww1,
   70|  63.3k|      _mm_srli_si128(
   71|  63.3k|          ww2, 12));  // 04 14 24 34  011 111 211 311 xx xx xx xx xx xx xx xx
   72|  63.3k|  *q2p2 = _mm_unpackhi_epi32(
   73|  63.3k|      _mm_slli_si128(ww1, 4),
   74|  63.3k|      ww2);  // 05 15 25 35   010 110 210 310 xx xx xx xx xx xx xx xx
   75|  63.3k|  *q1p1 = _mm_unpackhi_epi32(
   76|  63.3k|      ww1,
   77|  63.3k|      _mm_slli_si128(
   78|  63.3k|          ww2, 4));  // 06 16 26 36   09 19 29 39     xx xx xx xx xx xx xx xx
   79|  63.3k|  *q0p0 = _mm_unpacklo_epi32(
   80|       |      _mm_srli_si128(ww1, 12),
   81|  63.3k|      ww2);  // 07 17 27 37  08 18 28 38     xx xx xx xx xx xx xx xx
   82|  63.3k|}
loopfilter_sse2.c:transpose_pq_14_inv_sse2:
   92|  63.1k|                                            __m128i *pq2, __m128i *pq3) {
   93|  63.1k|  __m128i w10, w11, w12, w13;
   94|  63.1k|  __m128i w0, w1, w2, w3, w4, w5;
   95|  63.1k|  __m128i d0, d1, d2, d3;
   96|       |
   97|  63.1k|  w0 = _mm_unpacklo_epi8(
   98|  63.1k|      *x0, *x1);  // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
   99|  63.1k|  w1 = _mm_unpacklo_epi8(
  100|  63.1k|      *x2, *x3);  // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  101|  63.1k|  w2 = _mm_unpacklo_epi8(
  102|  63.1k|      *x4, *x5);  // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  103|  63.1k|  w3 = _mm_unpacklo_epi8(
  104|  63.1k|      *x6, *x7);  // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  105|       |
  106|  63.1k|  w4 = _mm_unpacklo_epi16(
  107|  63.1k|      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  108|  63.1k|  w5 = _mm_unpacklo_epi16(
  109|  63.1k|      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  110|       |
  111|  63.1k|  d0 = _mm_unpacklo_epi32(
  112|  63.1k|      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  113|  63.1k|  d2 = _mm_unpackhi_epi32(
  114|  63.1k|      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  115|       |
  116|  63.1k|  w10 = _mm_unpacklo_epi8(
  117|  63.1k|      *x7, *x6);  // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13
  118|  63.1k|  w11 = _mm_unpacklo_epi8(
  119|  63.1k|      *x5, *x4);  // q  xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33
  120|  63.1k|  w12 = _mm_unpacklo_epi8(
  121|  63.1k|      *x3, *x2);  // q  xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53
  122|  63.1k|  w13 = _mm_unpacklo_epi8(
  123|  63.1k|      *x1, *x0);  // q  xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73
  124|       |
  125|  63.1k|  w4 = _mm_unpackhi_epi16(
  126|  63.1k|      w10, w11);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  127|  63.1k|  w5 = _mm_unpackhi_epi16(
  128|  63.1k|      w12, w13);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  129|       |
  130|  63.1k|  d1 = _mm_unpacklo_epi32(
  131|  63.1k|      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  132|  63.1k|  d3 = _mm_unpackhi_epi32(
  133|  63.1k|      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  134|       |
  135|  63.1k|  *pq0 = _mm_unpacklo_epi64(d0, d1);  // pq
  136|  63.1k|  *pq1 = _mm_unpackhi_epi64(d0, d1);  // pq
  137|  63.1k|  *pq2 = _mm_unpacklo_epi64(d2, d3);  // pq
  138|  63.1k|  *pq3 = _mm_unpackhi_epi64(d2, d3);  // pq
  139|  63.1k|}
loopfilter_sse2.c:abs_diff:
   21|  4.86M|static inline __m128i abs_diff(__m128i a, __m128i b) {
   22|  4.86M|  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
   23|  4.86M|}

loopfilter_sse2.c:transpose4x8_8x4_low_sse2:
  222|   602k|                                             __m128i *d2, __m128i *d3) {
  223|       |  // input
  224|       |  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
  225|       |  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
  226|       |  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
  227|       |  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
  228|       |  // output
  229|       |  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
  230|       |  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  231|       |  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  232|       |  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  233|       |
  234|   602k|  __m128i w0, w1;
  235|       |
  236|   602k|  w0 = _mm_unpacklo_epi8(
  237|   602k|      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  238|   602k|  w1 = _mm_unpacklo_epi8(
  239|   602k|      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  240|       |
  241|   602k|  *d0 = _mm_unpacklo_epi16(
  242|   602k|      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  243|       |
  244|   602k|  *d1 = _mm_srli_si128(*d0,
  245|   602k|                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  246|   602k|  *d2 = _mm_srli_si128(*d0,
  247|   602k|                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  248|       |  *d3 = _mm_srli_si128(*d0,
  249|   602k|                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  250|   602k|}
loopfilter_sse2.c:transpose8x8_low_sse2:
  305|   124k|                                         __m128i *d3) {
  306|       |  // input
  307|       |  // x0 00 01 02 03 04 05 06 07
  308|       |  // x1 10 11 12 13 14 15 16 17
  309|       |  // x2 20 21 22 23 24 25 26 27
  310|       |  // x3 30 31 32 33 34 35 36 37
  311|       |  // x4 40 41 42 43 44 45 46 47
  312|       |  // x5  50 51 52 53 54 55 56 57
  313|       |  // x6  60 61 62 63 64 65 66 67
  314|       |  // x7 70 71 72 73 74 75 76 77
  315|       |  // output
  316|       |  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx
  317|       |  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
  318|       |  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
  319|       |  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
  320|       |
  321|   124k|  __m128i w0, w1, w2, w3, w4, w5;
  322|       |
  323|   124k|  w0 = _mm_unpacklo_epi8(
  324|   124k|      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  325|       |
  326|   124k|  w1 = _mm_unpacklo_epi8(
  327|   124k|      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  328|       |
  329|   124k|  w2 = _mm_unpacklo_epi8(
  330|   124k|      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  331|       |
  332|   124k|  w3 = _mm_unpacklo_epi8(
  333|   124k|      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  334|       |
  335|   124k|  w4 = _mm_unpacklo_epi16(
  336|   124k|      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  337|   124k|  w5 = _mm_unpacklo_epi16(
  338|   124k|      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  339|       |
  340|   124k|  *d0 = _mm_unpacklo_epi32(
  341|   124k|      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  342|   124k|  *d1 = _mm_srli_si128(*d0, 8);
  343|   124k|  *d2 = _mm_unpackhi_epi32(
  344|   124k|      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  345|       |  *d3 = _mm_srli_si128(*d2, 8);
  346|   124k|}
loopfilter_sse2.c:transpose4x8_8x4_sse2:
  256|   403k|                                         __m128i *d7) {
  257|       |  // input
  258|       |  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
  259|       |  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
  260|       |  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
  261|       |  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
  262|       |  // output
  263|       |  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
  264|       |  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  265|       |  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  266|       |  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  267|       |  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
  268|       |  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
  269|       |  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
  270|       |  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
  271|       |
  272|   403k|  __m128i w0, w1, ww0, ww1;
  273|       |
  274|   403k|  w0 = _mm_unpacklo_epi8(
  275|   403k|      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  276|   403k|  w1 = _mm_unpacklo_epi8(
  277|   403k|      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  278|       |
  279|   403k|  ww0 = _mm_unpacklo_epi16(
  280|   403k|      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  281|   403k|  ww1 = _mm_unpackhi_epi16(
  282|   403k|      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  283|       |
  284|   403k|  *d0 = ww0;  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
  285|   403k|  *d1 = _mm_srli_si128(ww0,
  286|   403k|                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  287|   403k|  *d2 = _mm_srli_si128(ww0,
  288|   403k|                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  289|   403k|  *d3 = _mm_srli_si128(ww0,
  290|   403k|                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  291|       |
  292|   403k|  *d4 = ww1;  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
  293|   403k|  *d5 = _mm_srli_si128(ww1,
  294|   403k|                       4);  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
  295|   403k|  *d6 = _mm_srli_si128(ww1,
  296|   403k|                       8);  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
  297|       |  *d7 = _mm_srli_si128(ww1,
  298|   403k|                       12);  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
  299|   403k|}
highbd_loopfilter_sse2.c:highbd_transpose4x8_8x4_low_sse2:
   65|   470k|                                                    __m128i *d2, __m128i *d3) {
   66|   470k|  __m128i zero = _mm_setzero_si128();
   67|   470k|  __m128i w0, w1, ww0, ww1;
   68|       |
   69|   470k|  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
   70|   470k|  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
   71|       |
   72|   470k|  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
   73|   470k|  ww1 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
   74|       |
   75|   470k|  *d0 = _mm_unpacklo_epi64(ww0, zero);  // 00 10 20 30 xx xx xx xx
   76|   470k|  *d1 = _mm_unpackhi_epi64(ww0, zero);  // 01 11 21 31 xx xx xx xx
   77|   470k|  *d2 = _mm_unpacklo_epi64(ww1, zero);  // 02 12 22 32 xx xx xx xx
   78|   470k|  *d3 = _mm_unpackhi_epi64(ww1, zero);  // 03 13 23 33 xx xx xx xx
   79|   470k|}
highbd_loopfilter_sse2.c:highbd_transpose8x8_low_sse2:
  131|   132k|                                                __m128i *d2, __m128i *d3) {
  132|   132k|  __m128i w0, w1, w2, w3, ww0, ww1;
  133|       |  // x0 00 01 02 03 04 05 06 07
  134|       |  // x1 10 11 12 13 14 15 16 17
  135|       |  // x2 20 21 22 23 24 25 26 27
  136|       |  // x3 30 31 32 33 34 35 36 37
  137|       |  // x4 40 41 42 43 44 45 46 47
  138|       |  // x5 50 51 52 53 54 55 56 57
  139|       |  // x6 60 61 62 63 64 65 66 67
  140|       |  // x7 70 71 72 73 74 75 76 77
  141|       |
  142|   132k|  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  143|   132k|  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
  144|   132k|  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
  145|   132k|  w3 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73
  146|       |
  147|   132k|  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  148|   132k|  ww1 = _mm_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
  149|       |
  150|   132k|  *d0 = _mm_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
  151|   132k|  *d1 = _mm_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
  152|       |
  153|   132k|  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  154|   132k|  ww1 = _mm_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
  155|       |
  156|   132k|  *d2 = _mm_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
  157|   132k|  *d3 = _mm_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
  158|   132k|}
highbd_loopfilter_sse2.c:highbd_transpose4x8_8x4_sse2:
  107|   255k|                                                __m128i *d6, __m128i *d7) {
  108|       |  // input
  109|       |  // x0 00 01 02 03 04 05 06 07
  110|       |  // x1 10 11 12 13 14 15 16 17
  111|       |  // x2 20 21 22 23 24 25 26 27
  112|       |  // x3 30 31 32 33 34 35 36 37
  113|       |  // output
  114|       |  // 00 10 20 30 xx xx xx xx
  115|       |  // 01 11 21 31 xx xx xx xx
  116|       |  // 02 12 22 32 xx xx xx xx
  117|       |  // 03 13 23 33 xx xx xx xx
  118|       |  // 04 14 24 34 xx xx xx xx
  119|       |  // 05 15 25 35 xx xx xx xx
  120|       |  // 06 16 26 36 xx xx xx xx
  121|       |  // 07 17 27 37 xx xx xx xx
  122|   255k|  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
  123|   255k|  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
  124|   255k|}
highbd_loopfilter_sse2.c:highbd_transpose4x8_8x4_high_sse2:
   84|   256k|                                                     __m128i *d6, __m128i *d7) {
   85|   256k|  __m128i w0, w1, ww2, ww3;
   86|   256k|  __m128i zero = _mm_setzero_si128();
   87|       |
   88|   256k|  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
   89|   256k|  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
   90|       |
   91|   256k|  ww2 = _mm_unpacklo_epi32(w0, w1);  //  04 14 24 34 05 15 25 35
   92|   256k|  ww3 = _mm_unpackhi_epi32(w0, w1);  //  06 16 26 36 07 17 27 37
   93|       |
   94|   256k|  *d4 = _mm_unpacklo_epi64(ww2, zero);  // 04 14 24 34 xx xx xx xx
   95|   256k|  *d5 = _mm_unpackhi_epi64(ww2, zero);  // 05 15 25 35 xx xx xx xx
   96|   256k|  *d6 = _mm_unpacklo_epi64(ww3, zero);  // 06 16 26 36 xx xx xx xx
   97|   256k|  *d7 = _mm_unpackhi_epi64(ww3, zero);  // 07 17 27 37 xx xx xx xx
   98|   256k|}
intrapred_avx2.c:highbd_transpose4x8_8x4_low_sse2:
   65|  39.7k|                                                    __m128i *d2, __m128i *d3) {
   66|  39.7k|  __m128i zero = _mm_setzero_si128();
   67|  39.7k|  __m128i w0, w1, ww0, ww1;
   68|       |
   69|  39.7k|  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
   70|  39.7k|  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
   71|       |
   72|  39.7k|  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
   73|  39.7k|  ww1 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
   74|       |
   75|  39.7k|  *d0 = _mm_unpacklo_epi64(ww0, zero);  // 00 10 20 30 xx xx xx xx
   76|  39.7k|  *d1 = _mm_unpackhi_epi64(ww0, zero);  // 01 11 21 31 xx xx xx xx
   77|  39.7k|  *d2 = _mm_unpacklo_epi64(ww1, zero);  // 02 12 22 32 xx xx xx xx
   78|  39.7k|  *d3 = _mm_unpackhi_epi64(ww1, zero);  // 03 13 23 33 xx xx xx xx
   79|  39.7k|}
intrapred_avx2.c:highbd_transpose8x8_sse2:
  199|  58.5k|    __m128i *d7) {
  200|  58.5k|  highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
  201|  58.5k|  highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
  202|  58.5k|}
intrapred_avx2.c:highbd_transpose8x8_low_sse2:
  131|  64.9k|                                                __m128i *d2, __m128i *d3) {
  132|  64.9k|  __m128i w0, w1, w2, w3, ww0, ww1;
  133|       |  // x0 00 01 02 03 04 05 06 07
  134|       |  // x1 10 11 12 13 14 15 16 17
  135|       |  // x2 20 21 22 23 24 25 26 27
  136|       |  // x3 30 31 32 33 34 35 36 37
  137|       |  // x4 40 41 42 43 44 45 46 47
  138|       |  // x5 50 51 52 53 54 55 56 57
  139|       |  // x6 60 61 62 63 64 65 66 67
  140|       |  // x7 70 71 72 73 74 75 76 77
  141|       |
  142|  64.9k|  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  143|  64.9k|  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
  144|  64.9k|  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
  145|  64.9k|  w3 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73
  146|       |
  147|  64.9k|  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  148|  64.9k|  ww1 = _mm_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
  149|       |
  150|  64.9k|  *d0 = _mm_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
  151|  64.9k|  *d1 = _mm_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
  152|       |
  153|  64.9k|  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  154|  64.9k|  ww1 = _mm_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
  155|       |
  156|  64.9k|  *d2 = _mm_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
  157|  64.9k|  *d3 = _mm_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
  158|  64.9k|}
intrapred_avx2.c:highbd_transpose8x8_high_sse2:
  165|  58.5k|                                                 __m128i *d6, __m128i *d7) {
  166|  58.5k|  __m128i w0, w1, w2, w3, ww0, ww1;
  167|       |  // x0 00 01 02 03 04 05 06 07
  168|       |  // x1 10 11 12 13 14 15 16 17
  169|       |  // x2 20 21 22 23 24 25 26 27
  170|       |  // x3 30 31 32 33 34 35 36 37
  171|       |  // x4 40 41 42 43 44 45 46 47
  172|       |  // x5 50 51 52 53 54 55 56 57
  173|       |  // x6 60 61 62 63 64 65 66 67
  174|       |  // x7 70 71 72 73 74 75 76 77
  175|  58.5k|  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
  176|  58.5k|  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
  177|  58.5k|  w2 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
  178|  58.5k|  w3 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77
  179|       |
  180|  58.5k|  ww0 = _mm_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
  181|  58.5k|  ww1 = _mm_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
  182|       |
  183|  58.5k|  *d4 = _mm_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
  184|  58.5k|  *d5 = _mm_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
  185|       |
  186|  58.5k|  ww0 = _mm_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
  187|  58.5k|  ww1 = _mm_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
  188|       |
  189|  58.5k|  *d6 = _mm_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
  190|  58.5k|  *d7 = _mm_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
  191|  58.5k|}
intrapred_avx2.c:highbd_transpose4x8_8x4_sse2:
  107|  3.98k|                                                __m128i *d6, __m128i *d7) {
  108|       |  // input
  109|       |  // x0 00 01 02 03 04 05 06 07
  110|       |  // x1 10 11 12 13 14 15 16 17
  111|       |  // x2 20 21 22 23 24 25 26 27
  112|       |  // x3 30 31 32 33 34 35 36 37
  113|       |  // output
  114|       |  // 00 10 20 30 xx xx xx xx
  115|       |  // 01 11 21 31 xx xx xx xx
  116|       |  // 02 12 22 32 xx xx xx xx
  117|       |  // 03 13 23 33 xx xx xx xx
  118|       |  // 04 14 24 34 xx xx xx xx
  119|       |  // 05 15 25 35 xx xx xx xx
  120|       |  // 06 16 26 36 xx xx xx xx
  121|       |  // 07 17 27 37 xx xx xx xx
  122|  3.98k|  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
  123|  3.98k|  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
  124|  3.98k|}
intrapred_avx2.c:highbd_transpose4x8_8x4_high_sse2:
   84|  3.98k|                                                     __m128i *d6, __m128i *d7) {
   85|  3.98k|  __m128i w0, w1, ww2, ww3;
   86|  3.98k|  __m128i zero = _mm_setzero_si128();
   87|       |
   88|  3.98k|  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
   89|  3.98k|  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
   90|       |
   91|  3.98k|  ww2 = _mm_unpacklo_epi32(w0, w1);  //  04 14 24 34 05 15 25 35
   92|  3.98k|  ww3 = _mm_unpackhi_epi32(w0, w1);  //  06 16 26 36 07 17 27 37
   93|       |
   94|  3.98k|  *d4 = _mm_unpacklo_epi64(ww2, zero);  // 04 14 24 34 xx xx xx xx
   95|  3.98k|  *d5 = _mm_unpackhi_epi64(ww2, zero);  // 05 15 25 35 xx xx xx xx
   96|  3.98k|  *d6 = _mm_unpacklo_epi64(ww3, zero);  // 06 16 26 36 xx xx xx xx
   97|  3.98k|  *d7 = _mm_unpackhi_epi64(ww3, zero);  // 07 17 27 37 xx xx xx xx
   98|  3.98k|}
intrapred_avx2.c:transpose4x8_8x4_low_sse2:
  222|  57.8k|                                             __m128i *d2, __m128i *d3) {
  223|       |  // input
  224|       |  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
  225|       |  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
  226|       |  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
  227|       |  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
  228|       |  // output
  229|       |  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
  230|       |  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  231|       |  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  232|       |  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  233|       |
  234|  57.8k|  __m128i w0, w1;
  235|       |
  236|  57.8k|  w0 = _mm_unpacklo_epi8(
  237|  57.8k|      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  238|  57.8k|  w1 = _mm_unpacklo_epi8(
  239|  57.8k|      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  240|       |
  241|  57.8k|  *d0 = _mm_unpacklo_epi16(
  242|  57.8k|      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  243|       |
  244|  57.8k|  *d1 = _mm_srli_si128(*d0,
  245|  57.8k|                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  246|  57.8k|  *d2 = _mm_srli_si128(*d0,
  247|  57.8k|                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  248|       |  *d3 = _mm_srli_si128(*d0,
  249|  57.8k|                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  250|  57.8k|}
intrapred_avx2.c:transpose8x8_sse2:
  352|  29.5k|                                     __m128i *d6d7) {
  353|  29.5k|  __m128i w0, w1, w2, w3, w4, w5, w6, w7;
  354|       |  // x0 00 01 02 03 04 05 06 07
  355|       |  // x1 10 11 12 13 14 15 16 17
  356|  29.5k|  w0 = _mm_unpacklo_epi8(
  357|  29.5k|      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  358|       |
  359|       |  // x2 20 21 22 23 24 25 26 27
  360|       |  // x3 30 31 32 33 34 35 36 37
  361|  29.5k|  w1 = _mm_unpacklo_epi8(
  362|  29.5k|      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  363|       |
  364|       |  // x4 40 41 42 43 44 45 46 47
  365|       |  // x5  50 51 52 53 54 55 56 57
  366|  29.5k|  w2 = _mm_unpacklo_epi8(
  367|  29.5k|      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  368|       |
  369|       |  // x6  60 61 62 63 64 65 66 67
  370|       |  // x7 70 71 72 73 74 75 76 77
  371|  29.5k|  w3 = _mm_unpacklo_epi8(
  372|  29.5k|      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  373|       |
  374|  29.5k|  w4 = _mm_unpacklo_epi16(
  375|  29.5k|      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  376|  29.5k|  w5 = _mm_unpacklo_epi16(
  377|  29.5k|      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  378|       |
  379|  29.5k|  *d0d1 = _mm_unpacklo_epi32(
  380|  29.5k|      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  381|  29.5k|  *d2d3 = _mm_unpackhi_epi32(
  382|  29.5k|      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  383|       |
  384|  29.5k|  w6 = _mm_unpackhi_epi16(
  385|  29.5k|      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  386|  29.5k|  w7 = _mm_unpackhi_epi16(
  387|  29.5k|      w2, w3);  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
  388|       |
  389|  29.5k|  *d4d5 = _mm_unpacklo_epi32(
  390|  29.5k|      w6, w7);  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  391|  29.5k|  *d6d7 = _mm_unpackhi_epi32(
  392|  29.5k|      w6, w7);  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  393|  29.5k|}
intrapred_avx2.c:transpose4x8_8x4_sse2:
  256|  4.06k|                                         __m128i *d7) {
  257|       |  // input
  258|       |  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
  259|       |  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
  260|       |  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
  261|       |  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
  262|       |  // output
  263|       |  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
  264|       |  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  265|       |  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  266|       |  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  267|       |  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
  268|       |  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
  269|       |  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
  270|       |  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
  271|       |
  272|  4.06k|  __m128i w0, w1, ww0, ww1;
  273|       |
  274|  4.06k|  w0 = _mm_unpacklo_epi8(
  275|  4.06k|      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  276|  4.06k|  w1 = _mm_unpacklo_epi8(
  277|  4.06k|      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  278|       |
  279|  4.06k|  ww0 = _mm_unpacklo_epi16(
  280|  4.06k|      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  281|  4.06k|  ww1 = _mm_unpackhi_epi16(
  282|  4.06k|      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  283|       |
  284|  4.06k|  *d0 = ww0;  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
  285|  4.06k|  *d1 = _mm_srli_si128(ww0,
  286|  4.06k|                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  287|  4.06k|  *d2 = _mm_srli_si128(ww0,
  288|  4.06k|                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  289|  4.06k|  *d3 = _mm_srli_si128(ww0,
  290|  4.06k|                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  291|       |
  292|  4.06k|  *d4 = ww1;  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
  293|  4.06k|  *d5 = _mm_srli_si128(ww1,
  294|  4.06k|                       4);  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
  295|  4.06k|  *d6 = _mm_srli_si128(ww1,
  296|  4.06k|                       8);  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
  297|       |  *d7 = _mm_srli_si128(ww1,
  298|  4.06k|                       12);  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
  299|  4.06k|}
intrapred_avx2.c:transpose8x16_16x8_sse2:
  451|  6.02k|    __m128i *d12d13, __m128i *d14d15) {
  452|  6.02k|  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
  453|  6.02k|  __m128i w10, w11, w12, w13, w14, w15;
  454|       |
  455|  6.02k|  w0 = _mm_unpacklo_epi8(*x0, *x1);
  456|  6.02k|  w1 = _mm_unpacklo_epi8(*x2, *x3);
  457|  6.02k|  w2 = _mm_unpacklo_epi8(*x4, *x5);
  458|  6.02k|  w3 = _mm_unpacklo_epi8(*x6, *x7);
  459|       |
  460|  6.02k|  w8 = _mm_unpackhi_epi8(*x0, *x1);
  461|  6.02k|  w9 = _mm_unpackhi_epi8(*x2, *x3);
  462|  6.02k|  w10 = _mm_unpackhi_epi8(*x4, *x5);
  463|  6.02k|  w11 = _mm_unpackhi_epi8(*x6, *x7);
  464|       |
  465|  6.02k|  w4 = _mm_unpacklo_epi16(w0, w1);
  466|  6.02k|  w5 = _mm_unpacklo_epi16(w2, w3);
  467|  6.02k|  w12 = _mm_unpacklo_epi16(w8, w9);
  468|  6.02k|  w13 = _mm_unpacklo_epi16(w10, w11);
  469|       |
  470|  6.02k|  w6 = _mm_unpacklo_epi32(w4, w5);
  471|  6.02k|  w7 = _mm_unpackhi_epi32(w4, w5);
  472|  6.02k|  w14 = _mm_unpacklo_epi32(w12, w13);
  473|  6.02k|  w15 = _mm_unpackhi_epi32(w12, w13);
  474|       |
  475|       |  // Store first 4-line result
  476|  6.02k|  *d0d1 = _mm_unpacklo_epi64(w6, w14);
  477|  6.02k|  *d2d3 = _mm_unpackhi_epi64(w6, w14);
  478|  6.02k|  *d4d5 = _mm_unpacklo_epi64(w7, w15);
  479|  6.02k|  *d6d7 = _mm_unpackhi_epi64(w7, w15);
  480|       |
  481|  6.02k|  w4 = _mm_unpackhi_epi16(w0, w1);
  482|  6.02k|  w5 = _mm_unpackhi_epi16(w2, w3);
  483|  6.02k|  w12 = _mm_unpackhi_epi16(w8, w9);
  484|  6.02k|  w13 = _mm_unpackhi_epi16(w10, w11);
  485|       |
  486|  6.02k|  w6 = _mm_unpacklo_epi32(w4, w5);
  487|  6.02k|  w7 = _mm_unpackhi_epi32(w4, w5);
  488|  6.02k|  w14 = _mm_unpacklo_epi32(w12, w13);
  489|  6.02k|  w15 = _mm_unpackhi_epi32(w12, w13);
  490|       |
  491|       |  // Store second 4-line result
  492|  6.02k|  *d8d9 = _mm_unpacklo_epi64(w6, w14);
  493|  6.02k|  *d10d11 = _mm_unpackhi_epi64(w6, w14);
  494|  6.02k|  *d12d13 = _mm_unpacklo_epi64(w7, w15);
  495|  6.02k|  *d14d15 = _mm_unpackhi_epi64(w7, w15);
  496|  6.02k|}
intrapred_avx2.c:transpose8x8_low_sse2:
  305|  6.05k|                                         __m128i *d3) {
  306|       |  // input
  307|       |  // x0 00 01 02 03 04 05 06 07
  308|       |  // x1 10 11 12 13 14 15 16 17
  309|       |  // x2 20 21 22 23 24 25 26 27
  310|       |  // x3 30 31 32 33 34 35 36 37
  311|       |  // x4 40 41 42 43 44 45 46 47
  312|       |  // x5  50 51 52 53 54 55 56 57
  313|       |  // x6  60 61 62 63 64 65 66 67
  314|       |  // x7 70 71 72 73 74 75 76 77
  315|       |  // output
  316|       |  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx
  317|       |  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
  318|       |  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
  319|       |  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
  320|       |
  321|  6.05k|  __m128i w0, w1, w2, w3, w4, w5;
  322|       |
  323|  6.05k|  w0 = _mm_unpacklo_epi8(
  324|  6.05k|      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  325|       |
  326|  6.05k|  w1 = _mm_unpacklo_epi8(
  327|  6.05k|      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  328|       |
  329|  6.05k|  w2 = _mm_unpacklo_epi8(
  330|  6.05k|      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  331|       |
  332|  6.05k|  w3 = _mm_unpacklo_epi8(
  333|  6.05k|      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  334|       |
  335|  6.05k|  w4 = _mm_unpacklo_epi16(
  336|  6.05k|      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  337|  6.05k|  w5 = _mm_unpacklo_epi16(
  338|  6.05k|      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  339|       |
  340|  6.05k|  *d0 = _mm_unpacklo_epi32(
  341|  6.05k|      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  342|  6.05k|  *d1 = _mm_srli_si128(*d0, 8);
  343|  6.05k|  *d2 = _mm_unpackhi_epi32(
  344|  6.05k|      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  345|       |  *d3 = _mm_srli_si128(*d2, 8);
  346|  6.05k|}
intrapred_avx2.c:transpose16x8_8x16_sse2:
  400|  22.5k|    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
  401|  22.5k|  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
  402|  22.5k|  __m128i w10, w11, w12, w13, w14, w15;
  403|       |
  404|  22.5k|  w0 = _mm_unpacklo_epi8(*x0, *x1);
  405|  22.5k|  w1 = _mm_unpacklo_epi8(*x2, *x3);
  406|  22.5k|  w2 = _mm_unpacklo_epi8(*x4, *x5);
  407|  22.5k|  w3 = _mm_unpacklo_epi8(*x6, *x7);
  408|       |
  409|  22.5k|  w8 = _mm_unpacklo_epi8(*x8, *x9);
  410|  22.5k|  w9 = _mm_unpacklo_epi8(*x10, *x11);
  411|  22.5k|  w10 = _mm_unpacklo_epi8(*x12, *x13);
  412|  22.5k|  w11 = _mm_unpacklo_epi8(*x14, *x15);
  413|       |
  414|  22.5k|  w4 = _mm_unpacklo_epi16(w0, w1);
  415|  22.5k|  w5 = _mm_unpacklo_epi16(w2, w3);
  416|  22.5k|  w12 = _mm_unpacklo_epi16(w8, w9);
  417|  22.5k|  w13 = _mm_unpacklo_epi16(w10, w11);
  418|       |
  419|  22.5k|  w6 = _mm_unpacklo_epi32(w4, w5);
  420|  22.5k|  w7 = _mm_unpackhi_epi32(w4, w5);
  421|  22.5k|  w14 = _mm_unpacklo_epi32(w12, w13);
  422|  22.5k|  w15 = _mm_unpackhi_epi32(w12, w13);
  423|       |
  424|       |  // Store first 4-line result
  425|  22.5k|  *d0 = _mm_unpacklo_epi64(w6, w14);
  426|  22.5k|  *d1 = _mm_unpackhi_epi64(w6, w14);
  427|  22.5k|  *d2 = _mm_unpacklo_epi64(w7, w15);
  428|  22.5k|  *d3 = _mm_unpackhi_epi64(w7, w15);
  429|       |
  430|  22.5k|  w4 = _mm_unpackhi_epi16(w0, w1);
  431|  22.5k|  w5 = _mm_unpackhi_epi16(w2, w3);
  432|  22.5k|  w12 = _mm_unpackhi_epi16(w8, w9);
  433|  22.5k|  w13 = _mm_unpackhi_epi16(w10, w11);
  434|       |
  435|  22.5k|  w6 = _mm_unpacklo_epi32(w4, w5);
  436|  22.5k|  w7 = _mm_unpackhi_epi32(w4, w5);
  437|  22.5k|  w14 = _mm_unpacklo_epi32(w12, w13);
  438|  22.5k|  w15 = _mm_unpackhi_epi32(w12, w13);
  439|       |
  440|       |  // Store second 4-line result
  441|  22.5k|  *d4 = _mm_unpacklo_epi64(w6, w14);
  442|  22.5k|  *d5 = _mm_unpackhi_epi64(w6, w14);
  443|  22.5k|  *d6 = _mm_unpacklo_epi64(w7, w15);
  444|  22.5k|  *d7 = _mm_unpackhi_epi64(w7, w15);
  445|  22.5k|}

convolve_2d_avx2.c:loadu_int32:
   28|  29.5k|static inline int32_t loadu_int32(const void *src) {
   29|  29.5k|  int32_t v;
   30|  29.5k|  memcpy(&v, src, sizeof(v));
   31|  29.5k|  return v;
   32|  29.5k|}
convolve_2d_avx2.c:load_8bit_8x2_to_1_reg_sse2:
   58|   119k|                                                  const int byte_stride) {
   59|   119k|  __m128i dst;
   60|   119k|  dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
   61|   119k|  dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
   62|   119k|  return dst;
   63|   119k|}
convolve_2d_avx2.c:loadh_epi64:
   44|   119k|static inline __m128i loadh_epi64(const void *const src, const __m128i s) {
   45|   119k|  return _mm_castps_si128(
   46|   119k|      _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
   47|   119k|}
convolve_avx2.c:loadu_int32:
   28|  56.6k|static inline int32_t loadu_int32(const void *src) {
   29|  56.6k|  int32_t v;
   30|  56.6k|  memcpy(&v, src, sizeof(v));
   31|  56.6k|  return v;
   32|  56.6k|}
convolve_avx2.c:_mm_storeh_epi64:
   40|  3.82k|static inline void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
   41|  3.82k|  _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
   42|  3.82k|}
convolve_avx2.c:loadu_int16:
   22|  8.61k|static inline int16_t loadu_int16(const void *src) {
   23|  8.61k|  int16_t v;
   24|  8.61k|  memcpy(&v, src, sizeof(v));
   25|  8.61k|  return v;
   26|  8.61k|}
convolve_avx2.c:load_8bit_8x2_to_1_reg_sse2:
   58|  17.8k|                                                  const int byte_stride) {
   59|  17.8k|  __m128i dst;
   60|  17.8k|  dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
   61|  17.8k|  dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
   62|  17.8k|  return dst;
   63|  17.8k|}
convolve_avx2.c:loadh_epi64:
   44|  17.8k|static inline __m128i loadh_epi64(const void *const src, const __m128i s) {
   45|  17.8k|  return _mm_castps_si128(
   46|  17.8k|      _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
   47|  17.8k|}
jnt_convolve_avx2.c:_mm_storeh_epi64:
   40|  10.4k|static inline void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
   41|  10.4k|  _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
   42|  10.4k|}
jnt_convolve_avx2.c:loadu_int32:
   28|  7.63k|static inline int32_t loadu_int32(const void *src) {
   29|  7.63k|  int32_t v;
   30|  7.63k|  memcpy(&v, src, sizeof(v));
   31|  7.63k|  return v;
   32|  7.63k|}

loopfilter_sse2.c:xx_loadl_32:
   31|  3.83M|static inline __m128i xx_loadl_32(const void *a) {
   32|  3.83M|  int val;
   33|  3.83M|  memcpy(&val, a, sizeof(val));
   34|  3.83M|  return _mm_cvtsi32_si128(val);
   35|  3.83M|}
loopfilter_sse2.c:xx_storel_32:
   57|  4.66M|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|  4.66M|  const int val = _mm_cvtsi128_si32(v);
   59|  4.66M|  memcpy(a, &val, sizeof(val));
   60|  4.66M|}
blend_a64_mask_sse4.c:xx_loadu_128:
   45|   111k|static inline __m128i xx_loadu_128(const void *a) {
   46|   111k|  return _mm_loadu_si128((const __m128i *)a);
   47|   111k|}
blend_a64_mask_sse4.c:xx_storeu_128:
   70|  47.3k|static inline void xx_storeu_128(void *const a, const __m128i v) {
   71|  47.3k|  _mm_storeu_si128((__m128i *)a, v);
   72|  47.3k|}
blend_a64_mask_sse4.c:xx_roundn_epu16:
   88|  29.1k|static inline __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
   89|  29.1k|  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
   90|  29.1k|  return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
   91|  29.1k|}
blend_a64_mask_sse4.c:xx_loadl_32:
   31|   194k|static inline __m128i xx_loadl_32(const void *a) {
   32|   194k|  int val;
   33|   194k|  memcpy(&val, a, sizeof(val));
   34|   194k|  return _mm_cvtsi32_si128(val);
   35|   194k|}
blend_a64_mask_sse4.c:xx_storel_32:
   57|  59.4k|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|  59.4k|  const int val = _mm_cvtsi128_si32(v);
   59|  59.4k|  memcpy(a, &val, sizeof(val));
   60|  59.4k|}
blend_a64_mask_sse4.c:xx_loadl_64:
   37|   171k|static inline __m128i xx_loadl_64(const void *a) {
   38|   171k|  return _mm_loadl_epi64((const __m128i *)a);
   39|   171k|}
blend_a64_mask_sse4.c:xx_storel_64:
   62|  52.4k|static inline void xx_storel_64(void *const a, const __m128i v) {
   63|  52.4k|  _mm_storel_epi64((__m128i *)a, v);
   64|  52.4k|}
blend_a64_mask_sse4.c:xx_round_epu16:
   84|  18.5k|static inline __m128i xx_round_epu16(__m128i v_val_w) {
   85|  18.5k|  return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
   86|  18.5k|}
blend_a64_vmask_sse4.c:xx_loadl_64:
   37|   154k|static inline __m128i xx_loadl_64(const void *a) {
   38|   154k|  return _mm_loadl_epi64((const __m128i *)a);
   39|   154k|}
blend_a64_vmask_sse4.c:xx_roundn_epu16:
   88|  95.8k|static inline __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
   89|  95.8k|  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
   90|  95.8k|  return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
   91|  95.8k|}
blend_a64_vmask_sse4.c:xx_storeu_128:
   70|  44.5k|static inline void xx_storeu_128(void *const a, const __m128i v) {
   71|  44.5k|  _mm_storeu_si128((__m128i *)a, v);
   72|  44.5k|}
blend_a64_vmask_sse4.c:xx_loadl_32:
   31|  9.31k|static inline __m128i xx_loadl_32(const void *a) {
   32|  9.31k|  int val;
   33|  9.31k|  memcpy(&val, a, sizeof(val));
   34|  9.31k|  return _mm_cvtsi32_si128(val);
   35|  9.31k|}
blend_a64_vmask_sse4.c:xx_storel_32:
   57|  4.65k|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|  4.65k|  const int val = _mm_cvtsi128_si32(v);
   59|  4.65k|  memcpy(a, &val, sizeof(val));
   60|  4.65k|}
blend_a64_vmask_sse4.c:xx_storel_64:
   62|  27.4k|static inline void xx_storel_64(void *const a, const __m128i v) {
   63|  27.4k|  _mm_storel_epi64((__m128i *)a, v);
   64|  27.4k|}
blend_a64_vmask_sse4.c:xx_loadu_128:
   45|  39.4k|static inline __m128i xx_loadu_128(const void *a) {
   46|  39.4k|  return _mm_loadu_si128((const __m128i *)a);
   47|  39.4k|}
blend_a64_vmask_sse4.c:xx_round_epu16:
   84|  5.75k|static inline __m128i xx_round_epu16(__m128i v_val_w) {
   85|  5.75k|  return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
   86|  5.75k|}
blend_a64_mask_avx2.c:xx_loadl_32:
   31|  19.1k|static inline __m128i xx_loadl_32(const void *a) {
   32|  19.1k|  int val;
   33|  19.1k|  memcpy(&val, a, sizeof(val));
   34|  19.1k|  return _mm_cvtsi32_si128(val);
   35|  19.1k|}
blend_a64_mask_avx2.c:xx_loadl_64:
   37|  74.4k|static inline __m128i xx_loadl_64(const void *a) {
   38|  74.4k|  return _mm_loadl_epi64((const __m128i *)a);
   39|  74.4k|}
blend_a64_mask_avx2.c:xx_storel_32:
   57|  11.1k|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|  11.1k|  const int val = _mm_cvtsi128_si32(v);
   59|  11.1k|  memcpy(a, &val, sizeof(val));
   60|  11.1k|}
blend_a64_mask_avx2.c:xx_loadu_128:
   45|  77.1k|static inline __m128i xx_loadu_128(const void *a) {
   46|  77.1k|  return _mm_loadu_si128((const __m128i *)a);
   47|  77.1k|}
blend_a64_mask_avx2.c:xx_roundn_epu16:
   88|  3.96k|static inline __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
   89|  3.96k|  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
   90|  3.96k|  return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
   91|  3.96k|}
blend_a64_mask_avx2.c:xx_storel_64:
   62|  16.9k|static inline void xx_storel_64(void *const a, const __m128i v) {
   63|  16.9k|  _mm_storel_epi64((__m128i *)a, v);
   64|  16.9k|}
blend_a64_mask_avx2.c:xx_storeu_128:
   70|  10.1k|static inline void xx_storeu_128(void *const a, const __m128i v) {
   71|  10.1k|  _mm_storeu_si128((__m128i *)a, v);
   72|  10.1k|}
highbd_convolve_avx2.c:xx_storel_32:
   57|  2.57k|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|  2.57k|  const int val = _mm_cvtsi128_si32(v);
   59|  2.57k|  memcpy(a, &val, sizeof(val));
   60|  2.57k|}
av1_convolve_horiz_rs_sse4.c:xx_loadu_128:
   45|  75.6k|static inline __m128i xx_loadu_128(const void *a) {
   46|  75.6k|  return _mm_loadu_si128((const __m128i *)a);
   47|  75.6k|}
av1_convolve_horiz_rs_sse4.c:xx_loadl_64:
   37|  2.22M|static inline __m128i xx_loadl_64(const void *a) {
   38|  2.22M|  return _mm_loadl_epi64((const __m128i *)a);
   39|  2.22M|}
av1_convolve_horiz_rs_sse4.c:xx_storel_32:
   57|   555k|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|   555k|  const int val = _mm_cvtsi128_si32(v);
   59|   555k|  memcpy(a, &val, sizeof(val));
   60|   555k|}
filterintra_sse4.c:xx_load_128:
   41|   781k|static inline __m128i xx_load_128(const void *a) {
   42|   781k|  return _mm_load_si128((const __m128i *)a);
   43|   781k|}
filterintra_sse4.c:xx_loadl_64:
   37|   137k|static inline __m128i xx_loadl_64(const void *a) {
   38|   137k|  return _mm_loadl_epi64((const __m128i *)a);
   39|   137k|}
filterintra_sse4.c:xx_loadl_32:
   31|  2.11M|static inline __m128i xx_loadl_32(const void *a) {
   32|  2.11M|  int val;
   33|  2.11M|  memcpy(&val, a, sizeof(val));
   34|  2.11M|  return _mm_cvtsi32_si128(val);
   35|  2.11M|}
filterintra_sse4.c:xx_roundn_epi16_unsigned:
   99|  3.79M|static inline __m128i xx_roundn_epi16_unsigned(__m128i v_val_d, int bits) {
  100|  3.79M|  const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
  101|  3.79M|  const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
  102|  3.79M|  return _mm_srai_epi16(v_tmp_d, bits);
  103|  3.79M|}
filterintra_sse4.c:xx_storel_32:
   57|  3.79M|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|  3.79M|  const int val = _mm_cvtsi128_si32(v);
   59|  3.79M|  memcpy(a, &val, sizeof(val));
   60|  3.79M|}
convolve_2d_avx2.c:xx_storel_32:
   57|   108k|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|   108k|  const int val = _mm_cvtsi128_si32(v);
   59|   108k|  memcpy(a, &val, sizeof(val));
   60|   108k|}
convolve_avx2.c:xx_storel_32:
   57|  33.2k|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|  33.2k|  const int val = _mm_cvtsi128_si32(v);
   59|  33.2k|  memcpy(a, &val, sizeof(val));
   60|  33.2k|}
selfguided_avx2.c:xx_loadl_64:
   37|   602k|static inline __m128i xx_loadl_64(const void *a) {
   38|   602k|  return _mm_loadl_epi64((const __m128i *)a);
   39|   602k|}
selfguided_avx2.c:xx_loadu_128:
   45|  1.52M|static inline __m128i xx_loadu_128(const void *a) {
   46|  1.52M|  return _mm_loadu_si128((const __m128i *)a);
   47|  1.52M|}
selfguided_avx2.c:xx_storeu_128:
   70|   254k|static inline void xx_storeu_128(void *const a, const __m128i v) {
   71|   254k|  _mm_storeu_si128((__m128i *)a, v);
   72|   254k|}
wiener_convolve_avx2.c:xx_loadu_128:
   45|  1.47k|static inline __m128i xx_loadu_128(const void *a) {
   46|  1.47k|  return _mm_loadu_si128((const __m128i *)a);
   47|  1.47k|}
highbd_convolve_2d_avx2.c:xx_storel_32:
   57|  3.88k|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|  3.88k|  const int val = _mm_cvtsi128_si32(v);
   59|  3.88k|  memcpy(a, &val, sizeof(val));
   60|  3.88k|}
highbd_wiener_convolve_avx2.c:xx_loadu_128:
   45|  2.13k|static inline __m128i xx_loadu_128(const void *a) {
   46|  2.13k|  return _mm_loadu_si128((const __m128i *)a);
   47|  2.13k|}

blend_a64_mask_avx2.c:yy_loadu_256:
   34|   125k|static inline __m256i yy_loadu_256(const void *a) {
   35|   125k|  return _mm256_loadu_si256((const __m256i *)a);
   36|   125k|}
blend_a64_mask_avx2.c:yy_roundn_epu16:
   90|  3.45k|static inline __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
   91|  3.45k|  const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
   92|  3.45k|  return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
   93|  3.45k|}
blend_a64_mask_avx2.c:yy_storeu_256:
   42|  6.22k|static inline void yy_storeu_256(void *const a, const __m256i v) {
   43|  6.22k|  _mm256_storeu_si256((__m256i *)a, v);
   44|  6.22k|}
blend_a64_mask_avx2.c:yy_loadu_4x64:
   67|    344|                                    const void *e1, const void *e0) {
   68|    344|  __m128d v0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e0));
   69|    344|  __m128d v01 = _mm_loadh_pd(v0, (const double *)e1);
   70|    344|  __m128d v2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e2));
   71|    344|  __m128d v23 = _mm_loadh_pd(v2, (const double *)e3);
   72|       |  // Note this can be replaced with
   73|       |  // `_mm256_castpd_si256(_mm256_set_m128d(v23, v01))` if immintrin.h contains
   74|       |  // _mm256_set_m128d() with all supported compilers. This version is used to
   75|       |  // match the behavior with yy_set_m128i().
   76|    344|  return yy_set_m128i(_mm_castpd_si128(v23), _mm_castpd_si128(v01));
   77|    344|}
blend_a64_mask_avx2.c:yy_set_m128i:
   59|  2.76k|static inline __m256i yy_set_m128i(__m128i hi, __m128i lo) {
   60|       |  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
   61|  2.76k|}
blend_a64_mask_avx2.c:yy_loadu2_128:
   79|  2.41k|static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
   80|  2.41k|  __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
   81|  2.41k|  __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
   82|  2.41k|  return yy_set_m128i(mhi, mlo);
   83|  2.41k|}
blend_a64_mask_avx2.c:yy_storeu2_128:
   85|    784|static inline void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
   86|       |  _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));
   87|    784|  _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a));
   88|    784|}
reconinter_avx2.c:yy_storeu_256:
   42|  14.3k|static inline void yy_storeu_256(void *const a, const __m256i v) {
   43|  14.3k|  _mm256_storeu_si256((__m256i *)a, v);
   44|  14.3k|}
reconinter_avx2.c:yy_loadu_256:
   34|  54.6k|static inline __m256i yy_loadu_256(const void *a) {
   35|  54.6k|  return _mm256_loadu_si256((const __m256i *)a);
   36|  54.6k|}
reconinter_avx2.c:yy_set_m128i:
   59|  2.51k|static inline __m256i yy_set_m128i(__m128i hi, __m128i lo) {
   60|       |  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
   61|  2.51k|}
reconinter_avx2.c:yy_loadu2_128:
   79|  2.51k|static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
   80|  2.51k|  __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
   81|  2.51k|  __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
   82|  2.51k|  return yy_set_m128i(mhi, mlo);
   83|  2.51k|}
selfguided_avx2.c:yy_load_256:
   30|  1.69M|static inline __m256i yy_load_256(const void *a) {
   31|  1.69M|  return _mm256_load_si256((const __m256i *)a);
   32|  1.69M|}
selfguided_avx2.c:yy_store_256:
   38|  1.67M|static inline void yy_store_256(void *const a, const __m256i v) {
   39|  1.67M|  _mm256_store_si256((__m256i *)a, v);
   40|  1.67M|}
selfguided_avx2.c:yy_loadu_256:
   34|  17.5M|static inline __m256i yy_loadu_256(const void *a) {
   35|  17.5M|  return _mm256_loadu_si256((const __m256i *)a);
   36|  17.5M|}
selfguided_avx2.c:yy_storeu_256:
   42|  2.44M|static inline void yy_storeu_256(void *const a, const __m256i v) {
   43|  2.44M|  _mm256_storeu_si256((__m256i *)a, v);
   44|  2.44M|}
highbd_wiener_convolve_avx2.c:yy_set_m128i:
   59|  8.53k|static inline __m256i yy_set_m128i(__m128i hi, __m128i lo) {
   60|       |  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
   61|  8.53k|}
highbd_wiener_convolve_avx2.c:yy_loadu_256:
   34|  1.84M|static inline __m256i yy_loadu_256(const void *a) {
   35|  1.84M|  return _mm256_loadu_si256((const __m256i *)a);
   36|  1.84M|}
highbd_wiener_convolve_avx2.c:yy_storeu_256:
   42|   230k|static inline void yy_storeu_256(void *const a, const __m256i v) {
   43|   230k|  _mm256_storeu_si256((__m256i *)a, v);
   44|   230k|}

av1_inv_txfm_ssse3.c:transpose_16bit_8x8:
  215|   242k|                                       __m128i *const out) {
  216|       |  // Unpack 16 bit elements. Goes from:
  217|       |  // in[0]: 00 01 02 03  04 05 06 07
  218|       |  // in[1]: 10 11 12 13  14 15 16 17
  219|       |  // in[2]: 20 21 22 23  24 25 26 27
  220|       |  // in[3]: 30 31 32 33  34 35 36 37
  221|       |  // in[4]: 40 41 42 43  44 45 46 47
  222|       |  // in[5]: 50 51 52 53  54 55 56 57
  223|       |  // in[6]: 60 61 62 63  64 65 66 67
  224|       |  // in[7]: 70 71 72 73  74 75 76 77
  225|       |  // to:
  226|       |  // a0:    00 10 01 11  02 12 03 13
  227|       |  // a1:    20 30 21 31  22 32 23 33
  228|       |  // a2:    40 50 41 51  42 52 43 53
  229|       |  // a3:    60 70 61 71  62 72 63 73
  230|       |  // a4:    04 14 05 15  06 16 07 17
  231|       |  // a5:    24 34 25 35  26 36 27 37
  232|       |  // a6:    44 54 45 55  46 56 47 57
  233|       |  // a7:    64 74 65 75  66 76 67 77
  234|   242k|  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  235|   242k|  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  236|   242k|  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
  237|   242k|  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
  238|   242k|  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
  239|   242k|  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
  240|   242k|  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
  241|   242k|  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
  242|       |
  243|       |  // Unpack 32 bit elements resulting in:
  244|       |  // b0: 00 10 20 30  01 11 21 31
  245|       |  // b1: 40 50 60 70  41 51 61 71
  246|       |  // b2: 04 14 24 34  05 15 25 35
  247|       |  // b3: 44 54 64 74  45 55 65 75
  248|       |  // b4: 02 12 22 32  03 13 23 33
  249|       |  // b5: 42 52 62 72  43 53 63 73
  250|       |  // b6: 06 16 26 36  07 17 27 37
  251|       |  // b7: 46 56 66 76  47 57 67 77
  252|   242k|  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  253|   242k|  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
  254|   242k|  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
  255|   242k|  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
  256|   242k|  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
  257|   242k|  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
  258|   242k|  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
  259|   242k|  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
  260|       |
  261|       |  // Unpack 64 bit elements resulting in:
  262|       |  // out[0]: 00 10 20 30  40 50 60 70
  263|       |  // out[1]: 01 11 21 31  41 51 61 71
  264|       |  // out[2]: 02 12 22 32  42 52 62 72
  265|       |  // out[3]: 03 13 23 33  43 53 63 73
  266|       |  // out[4]: 04 14 24 34  44 54 64 74
  267|       |  // out[5]: 05 15 25 35  45 55 65 75
  268|       |  // out[6]: 06 16 26 36  46 56 66 76
  269|       |  // out[7]: 07 17 27 37  47 57 67 77
  270|   242k|  out[0] = _mm_unpacklo_epi64(b0, b1);
  271|   242k|  out[1] = _mm_unpackhi_epi64(b0, b1);
  272|   242k|  out[2] = _mm_unpacklo_epi64(b4, b5);
  273|   242k|  out[3] = _mm_unpackhi_epi64(b4, b5);
  274|   242k|  out[4] = _mm_unpacklo_epi64(b2, b3);
  275|   242k|  out[5] = _mm_unpackhi_epi64(b2, b3);
  276|   242k|  out[6] = _mm_unpacklo_epi64(b6, b7);
  277|   242k|  out[7] = _mm_unpackhi_epi64(b6, b7);
  278|   242k|}
av1_inv_txfm_ssse3.c:transpose_16bit_4x4:
   97|   109k|                                       __m128i *const out) {
   98|       |  // Unpack 16 bit elements. Goes from:
   99|       |  // in[0]: 00 01 02 03  XX XX XX XX
  100|       |  // in[1]: 10 11 12 13  XX XX XX XX
  101|       |  // in[2]: 20 21 22 23  XX XX XX XX
  102|       |  // in[3]: 30 31 32 33  XX XX XX XX
  103|       |  // to:
  104|       |  // a0:    00 10 01 11  02 12 03 13
  105|       |  // a1:    20 30 21 31  22 32 23 33
  106|   109k|  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  107|   109k|  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  108|       |
  109|       |  // Unpack 32 bit elements resulting in:
  110|       |  // out[0]: 00 10 20 30  01 11 21 31
  111|       |  // out[1]: 01 11 21 31  __ __ __ __
  112|       |  // out[2]: 02 12 22 32  03 13 23 33
  113|       |  // out[3]: 03 13 23 33  __ __ __ __
  114|       |  //
  115|       |  // Note: The high 64 bits of the output registers are shown for informational
  116|       |  // purposes only. Callers should only use the low 64 bits of the output
  117|       |  // registers. "__" indicates zeros.
  118|   109k|  out[0] = _mm_unpacklo_epi32(a0, a1);
  119|   109k|  out[1] = _mm_srli_si128(out[0], 8);
  120|   109k|  out[2] = _mm_unpackhi_epi32(a0, a1);
  121|       |  out[3] = _mm_srli_si128(out[2], 8);
  122|   109k|}
av1_inv_txfm_ssse3.c:transpose_16bit_8x4:
  167|  70.2k|                                       __m128i *const out) {
  168|       |  // Unpack 16 bit elements. Goes from:
  169|       |  // in[0]: 00 01 02 03  04 05 06 07
  170|       |  // in[1]: 10 11 12 13  14 15 16 17
  171|       |  // in[2]: 20 21 22 23  24 25 26 27
  172|       |  // in[3]: 30 31 32 33  34 35 36 37
  173|       |
  174|       |  // to:
  175|       |  // a0:    00 10 01 11  02 12 03 13
  176|       |  // a1:    20 30 21 31  22 32 23 33
  177|       |  // a4:    04 14 05 15  06 16 07 17
  178|       |  // a5:    24 34 25 35  26 36 27 37
  179|  70.2k|  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  180|  70.2k|  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  181|  70.2k|  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
  182|  70.2k|  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
  183|       |
  184|       |  // Unpack 32 bit elements resulting in:
  185|       |  // b0: 00 10 20 30  01 11 21 31
  186|       |  // b2: 04 14 24 34  05 15 25 35
  187|       |  // b4: 02 12 22 32  03 13 23 33
  188|       |  // b6: 06 16 26 36  07 17 27 37
  189|  70.2k|  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  190|  70.2k|  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
  191|  70.2k|  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
  192|  70.2k|  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
  193|       |
  194|       |  // Unpack 64 bit elements resulting in:
  195|       |  // out[0]: 00 10 20 30  XX XX XX XX
  196|       |  // out[1]: 01 11 21 31  XX XX XX XX
  197|       |  // out[2]: 02 12 22 32  XX XX XX XX
  198|       |  // out[3]: 03 13 23 33  XX XX XX XX
  199|       |  // out[4]: 04 14 24 34  XX XX XX XX
  200|       |  // out[5]: 05 15 25 35  XX XX XX XX
  201|       |  // out[6]: 06 16 26 36  XX XX XX XX
  202|       |  // out[7]: 07 17 27 37  XX XX XX XX
  203|  70.2k|  const __m128i zeros = _mm_setzero_si128();
  204|  70.2k|  out[0] = _mm_unpacklo_epi64(b0, zeros);
  205|  70.2k|  out[1] = _mm_unpackhi_epi64(b0, zeros);
  206|  70.2k|  out[2] = _mm_unpacklo_epi64(b4, zeros);
  207|  70.2k|  out[3] = _mm_unpackhi_epi64(b4, zeros);
  208|  70.2k|  out[4] = _mm_unpacklo_epi64(b2, zeros);
  209|  70.2k|  out[5] = _mm_unpackhi_epi64(b2, zeros);
  210|  70.2k|  out[6] = _mm_unpacklo_epi64(b6, zeros);
  211|  70.2k|  out[7] = _mm_unpackhi_epi64(b6, zeros);
  212|  70.2k|}
av1_inv_txfm_ssse3.c:transpose_16bit_4x8:
  125|   126k|                                       __m128i *const out) {
  126|       |  // Unpack 16 bit elements. Goes from:
  127|       |  // in[0]: 00 01 02 03  XX XX XX XX
  128|       |  // in[1]: 10 11 12 13  XX XX XX XX
  129|       |  // in[2]: 20 21 22 23  XX XX XX XX
  130|       |  // in[3]: 30 31 32 33  XX XX XX XX
  131|       |  // in[4]: 40 41 42 43  XX XX XX XX
  132|       |  // in[5]: 50 51 52 53  XX XX XX XX
  133|       |  // in[6]: 60 61 62 63  XX XX XX XX
  134|       |  // in[7]: 70 71 72 73  XX XX XX XX
  135|       |  // to:
  136|       |  // a0:    00 10 01 11  02 12 03 13
  137|       |  // a1:    20 30 21 31  22 32 23 33
  138|       |  // a2:    40 50 41 51  42 52 43 53
  139|       |  // a3:    60 70 61 71  62 72 63 73
  140|   126k|  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  141|   126k|  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  142|   126k|  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
  143|   126k|  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
  144|       |
  145|       |  // Unpack 32 bit elements resulting in:
  146|       |  // b0: 00 10 20 30  01 11 21 31
  147|       |  // b1: 40 50 60 70  41 51 61 71
  148|       |  // b2: 02 12 22 32  03 13 23 33
  149|       |  // b3: 42 52 62 72  43 53 63 73
  150|   126k|  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  151|   126k|  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
  152|   126k|  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
  153|   126k|  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
  154|       |
  155|       |  // Unpack 64 bit elements resulting in:
  156|       |  // out[0]: 00 10 20 30  40 50 60 70
  157|       |  // out[1]: 01 11 21 31  41 51 61 71
  158|       |  // out[2]: 02 12 22 32  42 52 62 72
  159|       |  // out[3]: 03 13 23 33  43 53 63 73
  160|   126k|  out[0] = _mm_unpacklo_epi64(b0, b1);
  161|   126k|  out[1] = _mm_unpackhi_epi64(b0, b1);
  162|   126k|  out[2] = _mm_unpacklo_epi64(b2, b3);
  163|   126k|  out[3] = _mm_unpackhi_epi64(b2, b3);
  164|   126k|}
highbd_inv_txfm_sse4.c:transpose_32bit_4x4:
  300|   830k|                                       __m128i *const out) {
  301|       |  // Unpack 32 bit elements. Goes from:
  302|       |  // in[0]: 00 01 02 03
  303|       |  // in[1]: 10 11 12 13
  304|       |  // in[2]: 20 21 22 23
  305|       |  // in[3]: 30 31 32 33
  306|       |  // to:
  307|       |  // a0:    00 10 01 11
  308|       |  // a1:    20 30 21 31
  309|       |  // a2:    02 12 03 13
  310|       |  // a3:    22 32 23 33
  311|       |
  312|   830k|  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
  313|   830k|  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
  314|   830k|  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
  315|   830k|  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
  316|       |
  317|       |  // Unpack 64 bit elements resulting in:
  318|       |  // out[0]: 00 10 20 30
  319|       |  // out[1]: 01 11 21 31
  320|       |  // out[2]: 02 12 22 32
  321|       |  // out[3]: 03 13 23 33
  322|   830k|  out[0] = _mm_unpacklo_epi64(a0, a1);
  323|   830k|  out[1] = _mm_unpackhi_epi64(a0, a1);
  324|   830k|  out[2] = _mm_unpacklo_epi64(a2, a3);
  325|   830k|  out[3] = _mm_unpackhi_epi64(a2, a3);
  326|   830k|}

av1_inv_txfm_avx2.c:pair_set_w16_epi16:
   23|  5.84M|static inline __m256i pair_set_w16_epi16(int16_t a, int16_t b) {
   24|  5.84M|  return _mm256_set1_epi32(
   25|  5.84M|      (int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)));
   26|  5.84M|}
av1_inv_txfm_avx2.c:btf_16_adds_subs_avx2:
   52|  8.70M|static inline void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) {
   53|  8.70M|  const __m256i _in0 = *in0;
   54|  8.70M|  const __m256i _in1 = *in1;
   55|  8.70M|  *in0 = _mm256_adds_epi16(_in0, _in1);
   56|  8.70M|  *in1 = _mm256_subs_epi16(_in0, _in1);
   57|  8.70M|}
av1_inv_txfm_avx2.c:btf_16_w16_avx2:
   30|  4.80M|                                   const int32_t cos_bit) {
   31|  4.80M|  __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
   32|  4.80M|  __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
   33|  4.80M|  __m256i u0 = _mm256_madd_epi16(t0, w0);
   34|  4.80M|  __m256i u1 = _mm256_madd_epi16(t1, w0);
   35|  4.80M|  __m256i v0 = _mm256_madd_epi16(t0, w1);
   36|  4.80M|  __m256i v1 = _mm256_madd_epi16(t1, w1);
   37|       |
   38|  4.80M|  __m256i a0 = _mm256_add_epi32(u0, _r);
   39|  4.80M|  __m256i a1 = _mm256_add_epi32(u1, _r);
   40|  4.80M|  __m256i b0 = _mm256_add_epi32(v0, _r);
   41|  4.80M|  __m256i b1 = _mm256_add_epi32(v1, _r);
   42|       |
   43|  4.80M|  __m256i c0 = _mm256_srai_epi32(a0, cos_bit);
   44|  4.80M|  __m256i c1 = _mm256_srai_epi32(a1, cos_bit);
   45|  4.80M|  __m256i d0 = _mm256_srai_epi32(b0, cos_bit);
   46|  4.80M|  __m256i d1 = _mm256_srai_epi32(b1, cos_bit);
   47|       |
   48|  4.80M|  *in0 = _mm256_packs_epi32(c0, c1);
   49|  4.80M|  *in1 = _mm256_packs_epi32(d0, d1);
   50|  4.80M|}
av1_inv_txfm_avx2.c:btf_16_adds_subs_out_avx2:
   67|  3.05M|                                             __m256i in0, __m256i in1) {
   68|  3.05M|  const __m256i _in0 = in0;
   69|  3.05M|  const __m256i _in1 = in1;
   70|  3.05M|  *out0 = _mm256_adds_epi16(_in0, _in1);
   71|  3.05M|  *out1 = _mm256_subs_epi16(_in0, _in1);
   72|  3.05M|}
av1_inv_txfm_avx2.c:load_buffer_32bit_to_16bit_w16_avx2:
  111|   157k|                                                       int out_size) {
  112|  2.89M|  for (int i = 0; i < out_size; ++i) {
  ------------------
  |  Branch (112:19): [True: 2.73M, False: 157k]
  ------------------
  113|  2.73M|    out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);
  114|  2.73M|  }
  115|   157k|}
av1_inv_txfm_avx2.c:load_32bit_to_16bit_w16_avx2:
  103|  2.77M|static inline __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
  104|  2.77M|  const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
  105|  2.77M|  const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
  106|       |  return _mm256_permute4x64_epi64(b, 0xD8);
  107|  2.77M|}
av1_inv_txfm_avx2.c:flip_buf_avx2:
  228|    720|static inline void flip_buf_avx2(__m256i *in, __m256i *out, int size) {
  229|  12.2k|  for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (229:19): [True: 11.5k, False: 720]
  ------------------
  230|  11.5k|    out[size - i - 1] = in[i];
  231|  11.5k|  }
  232|    720|}
av1_inv_txfm_avx2.c:transpose_16bit_16x16_avx2:
  158|   241k|                                              __m256i *const out) {
  159|   241k|  __m256i t[16];
  160|       |
  161|   241k|#define LOADL(idx)                                                            \
  162|   241k|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  163|   241k|  t[idx] = _mm256_inserti128_si256(                                           \
  164|   241k|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  165|       |
  166|   241k|#define LOADR(idx)                                                           \
  167|   241k|  t[8 + idx] =                                                               \
  168|   241k|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  169|   241k|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  170|   241k|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  171|       |
  172|       |  // load left 8x16
  173|   241k|  LOADL(0)
  ------------------
  |  |  162|   241k|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  |  |  163|   241k|  t[idx] = _mm256_inserti128_si256(                                           \
  |  |  164|   241k|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  ------------------
  174|   241k|  LOADL(1)
  ------------------
  |  |  162|   241k|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  |  |  163|   241k|  t[idx] = _mm256_inserti128_si256(                                           \
  |  |  164|   241k|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  ------------------
  175|   241k|  LOADL(2)
  ------------------
  |  |  162|   241k|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  |  |  163|   241k|  t[idx] = _mm256_inserti128_si256(                                           \
  |  |  164|   241k|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  ------------------
  176|   241k|  LOADL(3)
  ------------------
  |  |  162|   241k|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  |  |  163|   241k|  t[idx] = _mm256_inserti128_si256(                                           \
  |  |  164|   241k|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  ------------------
  177|   241k|  LOADL(4)
  ------------------
  |  |  162|   241k|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  |  |  163|   241k|  t[idx] = _mm256_inserti128_si256(                                           \
  |  |  164|   241k|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  ------------------
  178|   241k|  LOADL(5)
  ------------------
  |  |  162|   241k|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  |  |  163|   241k|  t[idx] = _mm256_inserti128_si256(                                           \
  |  |  164|   241k|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  ------------------
  179|   241k|  LOADL(6)
  ------------------
  |  |  162|   241k|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  |  |  163|   241k|  t[idx] = _mm256_inserti128_si256(                                           \
  |  |  164|   241k|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  ------------------
  180|   241k|  LOADL(7)
  ------------------
  |  |  162|   241k|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  |  |  163|   241k|  t[idx] = _mm256_inserti128_si256(                                           \
  |  |  164|   241k|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  ------------------
  181|       |
  182|       |  // load right 8x16
  183|   241k|  LOADR(0)
  ------------------
  |  |  167|   241k|  t[8 + idx] =                                                               \
  |  |  168|   241k|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  |  |  169|   241k|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  |  |  170|   241k|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  ------------------
  184|   241k|  LOADR(1)
  ------------------
  |  |  167|   241k|  t[8 + idx] =                                                               \
  |  |  168|   241k|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  |  |  169|   241k|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  |  |  170|   241k|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  ------------------
  185|   241k|  LOADR(2)
  ------------------
  |  |  167|   241k|  t[8 + idx] =                                                               \
  |  |  168|   241k|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  |  |  169|   241k|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  |  |  170|   241k|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  ------------------
  186|   241k|  LOADR(3)
  ------------------
  |  |  167|   241k|  t[8 + idx] =                                                               \
  |  |  168|   241k|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  |  |  169|   241k|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  |  |  170|   241k|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  ------------------
  187|   241k|  LOADR(4)
  ------------------
  |  |  167|   241k|  t[8 + idx] =                                                               \
  |  |  168|   241k|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  |  |  169|   241k|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  |  |  170|   241k|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  ------------------
  188|   241k|  LOADR(5)
  ------------------
  |  |  167|   241k|  t[8 + idx] =                                                               \
  |  |  168|   241k|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  |  |  169|   241k|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  |  |  170|   241k|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  ------------------
  189|   241k|  LOADR(6)
  ------------------
  |  |  167|   241k|  t[8 + idx] =                                                               \
  |  |  168|   241k|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  |  |  169|   241k|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  |  |  170|   241k|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  ------------------
  190|   241k|  LOADR(7)
  ------------------
  |  |  167|   241k|  t[8 + idx] =                                                               \
  |  |  168|   241k|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  |  |  169|   241k|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  |  |  170|   241k|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  ------------------
  191|       |
  192|       |  // get the top 16x8 result
  193|   241k|  transpose2_8x8_avx2(t, out);
  194|       |  // get the bottom 16x8 result
  195|   241k|  transpose2_8x8_avx2(&t[8], &out[8]);
  196|   241k|}
av1_inv_txfm_avx2.c:transpose2_8x8_avx2:
  118|   482k|                                       __m256i *const out) {
  119|   482k|  __m256i t[16], u[16];
  120|       |  // (1st, 2nd) ==> (lo, hi)
  121|       |  //   (0, 1)   ==>  (0, 1)
  122|       |  //   (2, 3)   ==>  (2, 3)
  123|       |  //   (4, 5)   ==>  (4, 5)
  124|       |  //   (6, 7)   ==>  (6, 7)
  125|  2.41M|  for (int i = 0; i < 4; i++) {
  ------------------
  |  Branch (125:19): [True: 1.92M, False: 482k]
  ------------------
  126|  1.92M|    t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
  127|  1.92M|    t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
  128|  1.92M|  }
  129|       |
  130|       |  // (1st, 2nd) ==> (lo, hi)
  131|       |  //   (0, 2)   ==>  (0, 2)
  132|       |  //   (1, 3)   ==>  (1, 3)
  133|       |  //   (4, 6)   ==>  (4, 6)
  134|       |  //   (5, 7)   ==>  (5, 7)
  135|  1.44M|  for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (135:19): [True: 964k, False: 482k]
  ------------------
  136|   964k|    u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
  137|   964k|    u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);
  138|       |
  139|   964k|    u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
  140|   964k|    u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
  141|   964k|  }
  142|       |
  143|       |  // (1st, 2nd) ==> (lo, hi)
  144|       |  //   (0, 4)   ==>  (0, 1)
  145|       |  //   (1, 5)   ==>  (4, 5)
  146|       |  //   (2, 6)   ==>  (2, 3)
  147|       |  //   (3, 7)   ==>  (6, 7)
  148|  1.44M|  for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (148:19): [True: 964k, False: 482k]
  ------------------
  149|   964k|    out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
  150|   964k|    out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);
  151|       |
  152|   964k|    out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
  153|   964k|    out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
  154|   964k|  }
  155|   482k|}
av1_inv_txfm_avx2.c:round_shift_16bit_w16_avx2:
  234|    308|static inline void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
  235|    308|  if (bit < 0) {
  ------------------
  |  Branch (235:7): [True: 308, False: 0]
  ------------------
  236|    308|    bit = -bit;
  237|    308|    __m256i round = _mm256_set1_epi16(1 << (bit - 1));
  238|  5.23k|    for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (238:21): [True: 4.92k, False: 308]
  ------------------
  239|  4.92k|      in[i] = _mm256_adds_epi16(in[i], round);
  240|  4.92k|      in[i] = _mm256_srai_epi16(in[i], bit);
  241|  4.92k|    }
  242|    308|  } else if (bit > 0) {
  ------------------
  |  Branch (242:14): [True: 0, False: 0]
  ------------------
  243|      0|    for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (243:21): [True: 0, False: 0]
  ------------------
  244|      0|      in[i] = _mm256_slli_epi16(in[i], bit);
  245|      0|    }
  246|      0|  }
  247|    308|}
highbd_inv_txfm_avx2.c:round_shift_rect_array_32_avx2:
  274|   129k|                                                  const int val) {
  275|   129k|  const __m256i sqrt2 = _mm256_set1_epi32(val);
  276|   129k|  if (bit > 0) {
  ------------------
  |  Branch (276:7): [True: 0, False: 129k]
  ------------------
  277|      0|    int i;
  278|      0|    for (i = 0; i < size; i++) {
  ------------------
  |  Branch (278:17): [True: 0, False: 0]
  ------------------
  279|      0|      const __m256i r0 = round_shift_32_avx2(input[i], bit);
  280|      0|      const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0);
  281|      0|      output[i] = round_shift_32_avx2(r1, NewSqrt2Bits);
  ------------------
  |  |   41|      0|#define NewSqrt2Bits ((int32_t)12)
  ------------------
  282|      0|    }
  283|   129k|  } else {
  284|   129k|    int i;
  285|  1.49M|    for (i = 0; i < size; i++) {
  ------------------
  |  Branch (285:17): [True: 1.36M, False: 129k]
  ------------------
  286|  1.36M|      const __m256i r0 = _mm256_slli_epi32(input[i], -bit);
  287|  1.36M|      const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0);
  288|  1.36M|      output[i] = round_shift_32_avx2(r1, NewSqrt2Bits);
  ------------------
  |  |   41|  1.36M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
  289|  1.36M|    }
  290|   129k|  }
  291|   129k|}
highbd_inv_txfm_avx2.c:round_shift_32_avx2:
  249|  18.0M|static inline __m256i round_shift_32_avx2(__m256i vec, int bit) {
  250|  18.0M|  __m256i tmp, round;
  251|  18.0M|  round = _mm256_set1_epi32(1 << (bit - 1));
  252|  18.0M|  tmp = _mm256_add_epi32(vec, round);
  253|  18.0M|  return _mm256_srai_epi32(tmp, bit);
  254|  18.0M|}
highbd_inv_txfm_avx2.c:round_shift_array_32_avx2:
  257|   883k|                                             const int size, const int bit) {
  258|   883k|  if (bit > 0) {
  ------------------
  |  Branch (258:7): [True: 883k, False: 1]
  ------------------
  259|   883k|    int i;
  260|  17.5M|    for (i = 0; i < size; i++) {
  ------------------
  |  Branch (260:17): [True: 16.6M, False: 883k]
  ------------------
  261|  16.6M|      output[i] = round_shift_32_avx2(input[i], bit);
  262|  16.6M|    }
  263|   883k|  } else {
  264|      1|    int i;
  265|      1|    for (i = 0; i < size; i++) {
  ------------------
  |  Branch (265:17): [True: 0, False: 1]
  ------------------
  266|      0|      output[i] = _mm256_slli_epi32(input[i], -bit);
  267|      0|    }
  268|      1|  }
  269|   883k|}

aom_memalign:
   55|  3.56M|void *aom_memalign(size_t align, size_t size) {
   56|  3.56M|  void *x = NULL;
   57|  3.56M|  if (!check_size_argument_overflow(1, size, align)) return NULL;
  ------------------
  |  Branch (57:7): [True: 0, False: 3.56M]
  ------------------
   58|  3.56M|  const size_t aligned_size = size + GetAllocationPaddingSize(align);
   59|  3.56M|  void *const addr = malloc(aligned_size);
   60|  3.56M|  if (addr) {
  ------------------
  |  Branch (60:7): [True: 3.56M, False: 18.4E]
  ------------------
   61|  3.56M|    x = aom_align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, align);
  ------------------
  |  |   49|  3.56M|  (void *)(((uintptr_t)(addr) + ((align) - 1)) & ~(uintptr_t)((align) - 1))
  ------------------
   62|  3.56M|    SetActualMallocAddress(x, addr);
   63|  3.56M|  }
   64|  3.56M|  return x;
   65|  3.56M|}
aom_malloc:
   67|  1.20M|void *aom_malloc(size_t size) { return aom_memalign(DEFAULT_ALIGNMENT, size); }
  ------------------
  |  |   25|  1.20M|#define DEFAULT_ALIGNMENT (2 * sizeof(void *)) /* NOLINT */
  ------------------
aom_calloc:
   69|   694k|void *aom_calloc(size_t num, size_t size) {
   70|   694k|  if (!check_size_argument_overflow(num, size, DEFAULT_ALIGNMENT)) return NULL;
  ------------------
  |  |   25|   694k|#define DEFAULT_ALIGNMENT (2 * sizeof(void *)) /* NOLINT */
  ------------------
  |  Branch (70:7): [True: 0, False: 694k]
  ------------------
   71|   694k|  const size_t total_size = num * size;
   72|   694k|  void *const x = aom_malloc(total_size);
   73|   694k|  if (x) memset(x, 0, total_size);
  ------------------
  |  Branch (73:7): [True: 694k, False: 0]
  ------------------
   74|   694k|  return x;
   75|   694k|}
aom_free:
   77|  7.00M|void aom_free(void *memblk) {
   78|  7.00M|  if (memblk) {
  ------------------
  |  Branch (78:7): [True: 3.56M, False: 3.43M]
  ------------------
   79|  3.56M|    void *addr = GetActualMallocAddress(memblk);
   80|  3.56M|    free(addr);
   81|  3.56M|  }
   82|  7.00M|}
aom_mem.c:check_size_argument_overflow:
   27|  4.26M|                                        size_t align) {
   28|  4.26M|  if (nmemb == 0) return 1;
  ------------------
  |  Branch (28:7): [True: 0, False: 4.26M]
  ------------------
   29|  4.26M|  const size_t alloc_padding = GetAllocationPaddingSize(align);
   30|  4.26M|#if defined(AOM_MAX_ALLOCABLE_MEMORY)
   31|  4.26M|  assert(AOM_MAX_ALLOCABLE_MEMORY >= alloc_padding);
   32|  4.26M|  assert(AOM_MAX_ALLOCABLE_MEMORY <= SIZE_MAX);
   33|  4.26M|  if (size > (AOM_MAX_ALLOCABLE_MEMORY - alloc_padding) / nmemb) return 0;
  ------------------
  |  |   28|  4.26M|#define AOM_MAX_ALLOCABLE_MEMORY 8589934592  // 8 GB
  ------------------
  |  Branch (33:7): [True: 0, False: 4.26M]
  ------------------
   34|       |#else
   35|       |  if (size > (SIZE_MAX - alloc_padding) / nmemb) return 0;
   36|       |#endif
   37|  4.26M|  return 1;
   38|  4.26M|}
aom_mem.c:GetAllocationPaddingSize:
   19|  7.82M|static size_t GetAllocationPaddingSize(size_t align) {
   20|  7.82M|  assert(align > 0);
   21|  7.82M|  assert(align < SIZE_MAX - ADDRESS_STORAGE_SIZE);
   22|  7.82M|  return align - 1 + ADDRESS_STORAGE_SIZE;
  ------------------
  |  |   17|  7.82M|#define ADDRESS_STORAGE_SIZE sizeof(size_t)
  ------------------
   23|  7.82M|}
aom_mem.c:SetActualMallocAddress:
   45|  3.56M|                                   const void *const malloc_addr) {
   46|  3.56M|  size_t *const malloc_addr_location = GetMallocAddressLocation(mem);
   47|  3.56M|  *malloc_addr_location = (size_t)malloc_addr;
   48|  3.56M|}
aom_mem.c:GetMallocAddressLocation:
   40|  7.13M|static size_t *GetMallocAddressLocation(void *const mem) {
   41|  7.13M|  return ((size_t *)mem) - 1;
   42|  7.13M|}
aom_mem.c:GetActualMallocAddress:
   50|  3.56M|static void *GetActualMallocAddress(void *const mem) {
   51|  3.56M|  const size_t *const malloc_addr_location = GetMallocAddressLocation(mem);
   52|  3.56M|  return (void *)(*malloc_addr_location);
   53|  3.56M|}

decodeframe.c:aom_memset16:
   40|   164k|static inline void *aom_memset16(void *dest, int val, size_t length) {
   41|   164k|  size_t i;
   42|   164k|  uint16_t *dest16 = (uint16_t *)dest;
   43|  1.02M|  for (i = 0; i < length; i++) *dest16++ = val;
  ------------------
  |  Branch (43:15): [True: 863k, False: 164k]
  ------------------
   44|   164k|  return dest;
   45|   164k|}
intrapred.c:aom_memset16:
   40|  1.51M|static inline void *aom_memset16(void *dest, int val, size_t length) {
   41|  1.51M|  size_t i;
   42|  1.51M|  uint16_t *dest16 = (uint16_t *)dest;
   43|  49.8M|  for (i = 0; i < length; i++) *dest16++ = val;
  ------------------
  |  Branch (43:15): [True: 48.3M, False: 1.51M]
  ------------------
   44|  1.51M|  return dest;
   45|  1.51M|}
reconintra.c:aom_memset16:
   40|  4.20M|static inline void *aom_memset16(void *dest, int val, size_t length) {
   41|  4.20M|  size_t i;
   42|  4.20M|  uint16_t *dest16 = (uint16_t *)dest;
   43|   591M|  for (i = 0; i < length; i++) *dest16++ = val;
  ------------------
  |  Branch (43:15): [True: 587M, False: 4.20M]
  ------------------
   44|  4.20M|  return dest;
   45|  4.20M|}
restoration.c:aom_memset16:
   40|  10.4k|static inline void *aom_memset16(void *dest, int val, size_t length) {
   41|  10.4k|  size_t i;
   42|  10.4k|  uint16_t *dest16 = (uint16_t *)dest;
   43|  52.3k|  for (i = 0; i < length; i++) *dest16++ = val;
  ------------------
  |  Branch (43:15): [True: 41.8k, False: 10.4k]
  ------------------
   44|  10.4k|  return dest;
   45|  10.4k|}

aom_dsp_rtcd.c:aom_once:
   65|  17.9k|static void aom_once(void (*func)(void)) {
   66|       |  static pthread_once_t lock = PTHREAD_ONCE_INIT;
   67|  17.9k|  pthread_once(&lock, func);
   68|  17.9k|}
aom_scale_rtcd.c:aom_once:
   65|  17.9k|static void aom_once(void (*func)(void)) {
   66|       |  static pthread_once_t lock = PTHREAD_ONCE_INIT;
   67|  17.9k|  pthread_once(&lock, func);
   68|  17.9k|}
av1_rtcd.c:aom_once:
   65|  17.9k|static void aom_once(void (*func)(void)) {
   66|       |  static pthread_once_t lock = PTHREAD_ONCE_INIT;
   67|  17.9k|  pthread_once(&lock, func);
   68|  17.9k|}
reconinter.c:aom_once:
   65|  17.9k|static void aom_once(void (*func)(void)) {
   66|       |  static pthread_once_t lock = PTHREAD_ONCE_INIT;
   67|  17.9k|  pthread_once(&lock, func);
   68|  17.9k|}
reconintra.c:aom_once:
   65|  17.9k|static void aom_once(void (*func)(void)) {
   66|       |  static pthread_once_t lock = PTHREAD_ONCE_INIT;
   67|  17.9k|  pthread_once(&lock, func);
   68|  17.9k|}

decodeframe.c:get_msb:
   42|  63.7k|static inline int get_msb(unsigned int n) {
   43|       |  assert(n != 0);
   44|  63.7k|  return 31 ^ __builtin_clz(n);
   45|  63.7k|}
decodemv.c:aom_ceil_log2:
   74|   208k|static inline int aom_ceil_log2(int n) {
   75|   208k|  if (n < 2) return 0;
  ------------------
  |  Branch (75:7): [True: 95.9k, False: 112k]
  ------------------
   76|   112k|  return get_msb(n - 1) + 1;
   77|   208k|}
decodemv.c:get_msb:
   42|   112k|static inline int get_msb(unsigned int n) {
   43|       |  assert(n != 0);
   44|   112k|  return 31 ^ __builtin_clz(n);
   45|   112k|}
detokenize.c:get_msb:
   42|  62.4k|static inline int get_msb(unsigned int n) {
   43|       |  assert(n != 0);
   44|  62.4k|  return 31 ^ __builtin_clz(n);
   45|  62.4k|}
bitreader_buffer.c:get_msb:
   42|     66|static inline int get_msb(unsigned int n) {
   43|       |  assert(n != 0);
   44|     66|  return 31 ^ __builtin_clz(n);
   45|     66|}
binary_codes_reader.c:get_msb:
   42|  10.6k|static inline int get_msb(unsigned int n) {
   43|       |  assert(n != 0);
   44|  10.6k|  return 31 ^ __builtin_clz(n);
   45|  10.6k|}
entdec.c:get_msb:
   42|   136M|static inline int get_msb(unsigned int n) {
   43|       |  assert(n != 0);
   44|   136M|  return 31 ^ __builtin_clz(n);
   45|   136M|}
cdef_block.c:get_msb:
   42|   154k|static inline int get_msb(unsigned int n) {
   43|       |  assert(n != 0);
   44|   154k|  return 31 ^ __builtin_clz(n);
   45|   154k|}
warped_motion.c:get_msb:
   42|  61.8k|static inline int get_msb(unsigned int n) {
   43|       |  assert(n != 0);
   44|  61.8k|  return 31 ^ __builtin_clz(n);
   45|  61.8k|}
cdef_block_avx2.c:get_msb:
   42|  3.01M|static inline int get_msb(unsigned int n) {
   43|       |  assert(n != 0);
   44|  3.01M|  return 31 ^ __builtin_clz(n);
   45|  3.01M|}

decodeframe.c:mem_get_le16_as_int:
  102|    474|static unsigned MEM_VALUE_T mem_get_le16(const void *vmem) {
  103|    474|  unsigned MEM_VALUE_T val;
  104|    474|  const MAU_T *mem = (const MAU_T *)vmem;
  105|       |
  106|    474|  val = mem[1] << 8;
  107|    474|  val |= mem[0];
  108|    474|  return val;
  109|    474|}
decodeframe.c:mem_get_le24_as_int:
  113|     16|static unsigned MEM_VALUE_T mem_get_le24(const void *vmem) {
  114|     16|  unsigned MEM_VALUE_T val;
  115|     16|  const MAU_T *mem = (const MAU_T *)vmem;
  116|       |
  117|     16|  val = mem[2] << 16;
  118|     16|  val |= mem[1] << 8;
  119|     16|  val |= mem[0];
  120|     16|  return val;
  121|     16|}
decodeframe.c:mem_get_le32_as_int:
  125|     28|static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) {
  126|     28|  unsigned MEM_VALUE_T val;
  127|     28|  const MAU_T *mem = (const MAU_T *)vmem;
  128|       |
  129|     28|  val = ((unsigned MEM_VALUE_T)mem[3]) << 24;
  130|     28|  val |= mem[2] << 16;
  131|     28|  val |= mem[1] << 8;
  132|     28|  val |= mem[0];
  133|     28|  return val;
  134|     28|}

aom_dsp_rtcd.c:x86_simd_caps:
  198|      1|static inline int x86_simd_caps(void) {
  199|      1|  unsigned int flags = 0;
  200|      1|  unsigned int mask = ~0u;
  201|      1|  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
  202|      1|  char *env;
  203|       |
  204|       |  /* See if the CPU capabilities are being overridden by the environment */
  205|      1|  env = getenv("AOM_SIMD_CAPS");
  206|      1|  if (env && *env) return (int)strtol(env, NULL, 0);
  ------------------
  |  Branch (206:7): [True: 0, False: 1]
  |  Branch (206:14): [True: 0, False: 0]
  ------------------
  207|       |
  208|      1|  env = getenv("AOM_SIMD_CAPS_MASK");
  209|      1|  if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
  ------------------
  |  Branch (209:7): [True: 0, False: 1]
  |  Branch (209:14): [True: 0, False: 0]
  ------------------
  210|       |
  211|       |  /* Ensure that the CPUID instruction supports extended features */
  212|      1|  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  213|       |
  214|      1|  if (max_cpuid_val < 1) return 0;
  ------------------
  |  Branch (214:7): [True: 0, False: 1]
  ------------------
  215|       |
  216|       |  /* Get the standard feature flags */
  217|      1|  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  218|       |
  219|      1|  flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  176|      1|#define MMX_BITS BIT(23)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  176|      1|#define MMX_BITS BIT(23)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
  ------------------
  |  |  161|      1|#define HAS_MMX 0x01
  ------------------
  220|      1|  flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  177|      1|#define SSE_BITS BIT(25)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  177|      1|#define SSE_BITS BIT(25)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
  ------------------
  |  |  162|      1|#define HAS_SSE 0x02
  ------------------
  221|      1|  flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  178|      1|#define SSE2_BITS BIT(26)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  178|      1|#define SSE2_BITS BIT(26)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
  ------------------
  |  |  163|      1|#define HAS_SSE2 0x04
  ------------------
  222|      1|  flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  179|      1|#define SSE3_BITS BIT(0)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  179|      1|#define SSE3_BITS BIT(0)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
  ------------------
  |  |  164|      1|#define HAS_SSE3 0x08
  ------------------
  223|      1|  flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  180|      1|#define SSSE3_BITS BIT(9)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  180|      1|#define SSSE3_BITS BIT(9)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  224|      1|  flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  181|      1|#define SSE4_1_BITS BIT(19)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  181|      1|#define SSE4_1_BITS BIT(19)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  225|      1|  flags |= FEATURE_SET(reg_ecx, SSE4_2) ? HAS_SSE4_2 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  182|      1|#define SSE4_2_BITS BIT(20)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  182|      1|#define SSE4_2_BITS BIT(20)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSE4_2) ? HAS_SSE4_2 : 0;
  ------------------
  |  |  169|      1|#define HAS_SSE4_2 0x100
  ------------------
  226|       |
  227|       |  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
  228|      1|  if (FEATURE_SET(reg_ecx, AVX)) {
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  184|      1|#define AVX_BITS (BIT(27) | BIT(28))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX_BITS (BIT(27) | BIT(28))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  184|      1|#define AVX_BITS (BIT(27) | BIT(28))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX_BITS (BIT(27) | BIT(28))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
  229|       |    // Check for OS-support of YMM state. Necessary for AVX and AVX2.
  230|      1|    if ((xgetbv() & 0x6) == 0x6) {
  ------------------
  |  Branch (230:9): [True: 1, False: 0]
  ------------------
  231|      1|      flags |= HAS_AVX;
  ------------------
  |  |  167|      1|#define HAS_AVX 0x40
  ------------------
  232|      1|      if (max_cpuid_val >= 7) {
  ------------------
  |  Branch (232:11): [True: 1, False: 0]
  ------------------
  233|       |        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
  234|      1|        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  235|      1|        flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  185|      1|#define AVX2_BITS BIT(5)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  185|      1|#define AVX2_BITS BIT(5)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                      flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  236|       |        // Check for OS-support of ZMM and YMM state. Necessary for AVX512.
  237|       |        // Only set HAS_AVX512 flag if AVX512_DL feature are supported.
  238|       |        // Older AVX512 implementations (such as Skylake) have turbo curves that
  239|       |        // are currently problematic for mixed AVX512/AVX2 code
  240|      1|        if ((xgetbv() & 0xe6) == 0xe6) {
  ------------------
  |  Branch (240:13): [True: 0, False: 1]
  ------------------
  241|      0|          flags |=
  242|      0|              FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
  ------------------
  |  |  196|      0|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  188|      0|#define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  188|      0|#define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                            FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
  ------------------
  |  |  196|      0|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  193|      0|  (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  193|      0|  (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  243|      0|                  ? HAS_AVX512
  ------------------
  |  |  170|      0|#define HAS_AVX512 0x200
  ------------------
  244|      0|                  : 0;
  245|      0|        }
  246|      1|      }
  247|      1|    }
  248|      1|  }
  249|      1|  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
  250|      1|  return flags & mask;
  251|      1|}
aom_dsp_rtcd.c:xgetbv:
  121|      2|static inline uint64_t xgetbv(void) {
  122|      2|  const uint32_t ecx = 0;
  123|      2|  uint32_t eax, edx;
  124|       |  // Use the raw opcode for xgetbv for compatibility with older toolchains.
  125|      2|  __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
  126|      2|                   : "=a"(eax), "=d"(edx)
  127|      2|                   : "c"(ecx));
  128|      2|  return ((uint64_t)edx << 32) | eax;
  129|      2|}
aom_scale_rtcd.c:x86_simd_caps:
  198|      1|static inline int x86_simd_caps(void) {
  199|      1|  unsigned int flags = 0;
  200|      1|  unsigned int mask = ~0u;
  201|      1|  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
  202|      1|  char *env;
  203|       |
  204|       |  /* See if the CPU capabilities are being overridden by the environment */
  205|      1|  env = getenv("AOM_SIMD_CAPS");
  206|      1|  if (env && *env) return (int)strtol(env, NULL, 0);
  ------------------
  |  Branch (206:7): [True: 0, False: 1]
  |  Branch (206:14): [True: 0, False: 0]
  ------------------
  207|       |
  208|      1|  env = getenv("AOM_SIMD_CAPS_MASK");
  209|      1|  if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
  ------------------
  |  Branch (209:7): [True: 0, False: 1]
  |  Branch (209:14): [True: 0, False: 0]
  ------------------
  210|       |
  211|       |  /* Ensure that the CPUID instruction supports extended features */
  212|      1|  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  213|       |
  214|      1|  if (max_cpuid_val < 1) return 0;
  ------------------
  |  Branch (214:7): [True: 0, False: 1]
  ------------------
  215|       |
  216|       |  /* Get the standard feature flags */
  217|      1|  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  218|       |
  219|      1|  flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  176|      1|#define MMX_BITS BIT(23)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  176|      1|#define MMX_BITS BIT(23)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
  ------------------
  |  |  161|      1|#define HAS_MMX 0x01
  ------------------
  220|      1|  flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  177|      1|#define SSE_BITS BIT(25)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  177|      1|#define SSE_BITS BIT(25)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
  ------------------
  |  |  162|      1|#define HAS_SSE 0x02
  ------------------
  221|      1|  flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  178|      1|#define SSE2_BITS BIT(26)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  178|      1|#define SSE2_BITS BIT(26)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
  ------------------
  |  |  163|      1|#define HAS_SSE2 0x04
  ------------------
  222|      1|  flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  179|      1|#define SSE3_BITS BIT(0)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  179|      1|#define SSE3_BITS BIT(0)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
  ------------------
  |  |  164|      1|#define HAS_SSE3 0x08
  ------------------
  223|      1|  flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  180|      1|#define SSSE3_BITS BIT(9)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  180|      1|#define SSSE3_BITS BIT(9)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  224|      1|  flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  181|      1|#define SSE4_1_BITS BIT(19)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  181|      1|#define SSE4_1_BITS BIT(19)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  225|      1|  flags |= FEATURE_SET(reg_ecx, SSE4_2) ? HAS_SSE4_2 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  182|      1|#define SSE4_2_BITS BIT(20)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  182|      1|#define SSE4_2_BITS BIT(20)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSE4_2) ? HAS_SSE4_2 : 0;
  ------------------
  |  |  169|      1|#define HAS_SSE4_2 0x100
  ------------------
  226|       |
  227|       |  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
  228|      1|  if (FEATURE_SET(reg_ecx, AVX)) {
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  184|      1|#define AVX_BITS (BIT(27) | BIT(28))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX_BITS (BIT(27) | BIT(28))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  184|      1|#define AVX_BITS (BIT(27) | BIT(28))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX_BITS (BIT(27) | BIT(28))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
  229|       |    // Check for OS-support of YMM state. Necessary for AVX and AVX2.
  230|      1|    if ((xgetbv() & 0x6) == 0x6) {
  ------------------
  |  Branch (230:9): [True: 1, False: 0]
  ------------------
  231|      1|      flags |= HAS_AVX;
  ------------------
  |  |  167|      1|#define HAS_AVX 0x40
  ------------------
  232|      1|      if (max_cpuid_val >= 7) {
  ------------------
  |  Branch (232:11): [True: 1, False: 0]
  ------------------
  233|       |        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
  234|      1|        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  235|      1|        flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  185|      1|#define AVX2_BITS BIT(5)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  185|      1|#define AVX2_BITS BIT(5)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                      flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  236|       |        // Check for OS-support of ZMM and YMM state. Necessary for AVX512.
  237|       |        // Only set HAS_AVX512 flag if AVX512_DL feature are supported.
  238|       |        // Older AVX512 implementations (such as Skylake) have turbo curves that
  239|       |        // are currently problematic for mixed AVX512/AVX2 code
  240|      1|        if ((xgetbv() & 0xe6) == 0xe6) {
  ------------------
  |  Branch (240:13): [True: 0, False: 1]
  ------------------
  241|      0|          flags |=
  242|      0|              FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
  ------------------
  |  |  196|      0|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  188|      0|#define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  188|      0|#define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                            FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
  ------------------
  |  |  196|      0|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  193|      0|  (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  193|      0|  (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  243|      0|                  ? HAS_AVX512
  ------------------
  |  |  170|      0|#define HAS_AVX512 0x200
  ------------------
  244|      0|                  : 0;
  245|      0|        }
  246|      1|      }
  247|      1|    }
  248|      1|  }
  249|      1|  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
  250|      1|  return flags & mask;
  251|      1|}
aom_scale_rtcd.c:xgetbv:
  121|      2|static inline uint64_t xgetbv(void) {
  122|      2|  const uint32_t ecx = 0;
  123|      2|  uint32_t eax, edx;
  124|       |  // Use the raw opcode for xgetbv for compatibility with older toolchains.
  125|      2|  __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
  126|      2|                   : "=a"(eax), "=d"(edx)
  127|      2|                   : "c"(ecx));
  128|      2|  return ((uint64_t)edx << 32) | eax;
  129|      2|}
av1_rtcd.c:x86_simd_caps:
  198|      1|static inline int x86_simd_caps(void) {
  199|      1|  unsigned int flags = 0;
  200|      1|  unsigned int mask = ~0u;
  201|      1|  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
  202|      1|  char *env;
  203|       |
  204|       |  /* See if the CPU capabilities are being overridden by the environment */
  205|      1|  env = getenv("AOM_SIMD_CAPS");
  206|      1|  if (env && *env) return (int)strtol(env, NULL, 0);
  ------------------
  |  Branch (206:7): [True: 0, False: 1]
  |  Branch (206:14): [True: 0, False: 0]
  ------------------
  207|       |
  208|      1|  env = getenv("AOM_SIMD_CAPS_MASK");
  209|      1|  if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
  ------------------
  |  Branch (209:7): [True: 0, False: 1]
  |  Branch (209:14): [True: 0, False: 0]
  ------------------
  210|       |
  211|       |  /* Ensure that the CPUID instruction supports extended features */
  212|      1|  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  213|       |
  214|      1|  if (max_cpuid_val < 1) return 0;
  ------------------
  |  Branch (214:7): [True: 0, False: 1]
  ------------------
  215|       |
  216|       |  /* Get the standard feature flags */
  217|      1|  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  218|       |
  219|      1|  flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  176|      1|#define MMX_BITS BIT(23)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  176|      1|#define MMX_BITS BIT(23)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
  ------------------
  |  |  161|      1|#define HAS_MMX 0x01
  ------------------
  220|      1|  flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  177|      1|#define SSE_BITS BIT(25)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  177|      1|#define SSE_BITS BIT(25)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
  ------------------
  |  |  162|      1|#define HAS_SSE 0x02
  ------------------
  221|      1|  flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  178|      1|#define SSE2_BITS BIT(26)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  178|      1|#define SSE2_BITS BIT(26)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
  ------------------
  |  |  163|      1|#define HAS_SSE2 0x04
  ------------------
  222|      1|  flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  179|      1|#define SSE3_BITS BIT(0)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  179|      1|#define SSE3_BITS BIT(0)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
  ------------------
  |  |  164|      1|#define HAS_SSE3 0x08
  ------------------
  223|      1|  flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  180|      1|#define SSSE3_BITS BIT(9)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  180|      1|#define SSSE3_BITS BIT(9)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  224|      1|  flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  181|      1|#define SSE4_1_BITS BIT(19)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  181|      1|#define SSE4_1_BITS BIT(19)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  225|      1|  flags |= FEATURE_SET(reg_ecx, SSE4_2) ? HAS_SSE4_2 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  182|      1|#define SSE4_2_BITS BIT(20)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  182|      1|#define SSE4_2_BITS BIT(20)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSE4_2) ? HAS_SSE4_2 : 0;
  ------------------
  |  |  169|      1|#define HAS_SSE4_2 0x100
  ------------------
  226|       |
  227|       |  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
  228|      1|  if (FEATURE_SET(reg_ecx, AVX)) {
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  184|      1|#define AVX_BITS (BIT(27) | BIT(28))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX_BITS (BIT(27) | BIT(28))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  184|      1|#define AVX_BITS (BIT(27) | BIT(28))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX_BITS (BIT(27) | BIT(28))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
  229|       |    // Check for OS-support of YMM state. Necessary for AVX and AVX2.
  230|      1|    if ((xgetbv() & 0x6) == 0x6) {
  ------------------
  |  Branch (230:9): [True: 1, False: 0]
  ------------------
  231|      1|      flags |= HAS_AVX;
  ------------------
  |  |  167|      1|#define HAS_AVX 0x40
  ------------------
  232|      1|      if (max_cpuid_val >= 7) {
  ------------------
  |  Branch (232:11): [True: 1, False: 0]
  ------------------
  233|       |        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
  234|      1|        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  235|      1|        flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
  ------------------
  |  |  196|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  185|      1|#define AVX2_BITS BIT(5)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  185|      1|#define AVX2_BITS BIT(5)
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      1|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                      flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  236|       |        // Check for OS-support of ZMM and YMM state. Necessary for AVX512.
  237|       |        // Only set HAS_AVX512 flag if AVX512_DL feature are supported.
  238|       |        // Older AVX512 implementations (such as Skylake) have turbo curves that
  239|       |        // are currently problematic for mixed AVX512/AVX2 code
  240|      1|        if ((xgetbv() & 0xe6) == 0xe6) {
  ------------------
  |  Branch (240:13): [True: 0, False: 1]
  ------------------
  241|      0|          flags |=
  242|      0|              FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
  ------------------
  |  |  196|      0|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  188|      0|#define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  188|      0|#define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |               #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                            FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
  ------------------
  |  |  196|      0|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  193|      0|  (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  |  193|      0|  (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  |  |                 (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
  |  |  |  |  ------------------
  |  |  |  |  |  |  173|      0|#define BIT(n) (1u << (n))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (196:3): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  243|      0|                  ? HAS_AVX512
  ------------------
  |  |  170|      0|#define HAS_AVX512 0x200
  ------------------
  244|      0|                  : 0;
  245|      0|        }
  246|      1|      }
  247|      1|    }
  248|      1|  }
  249|      1|  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
  250|      1|  return flags & mask;
  251|      1|}
av1_rtcd.c:xgetbv:
  121|      2|static inline uint64_t xgetbv(void) {
  122|      2|  const uint32_t ecx = 0;
  123|      2|  uint32_t eax, edx;
  124|       |  // Use the raw opcode for xgetbv for compatibility with older toolchains.
  125|      2|  __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
  126|      2|                   : "=a"(eax), "=d"(edx)
  127|      2|                   : "c"(ecx));
  128|      2|  return ((uint64_t)edx << 32) | eax;
  129|      2|}

aom_scale_rtcd:
   18|  17.9k|void aom_scale_rtcd(void) { aom_once(setup_rtcd_internal); }

aom_free_frame_buffer:
   34|   323k|int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
   35|   323k|  if (ybf) {
  ------------------
  |  Branch (35:7): [True: 323k, False: 0]
  ------------------
   36|   323k|    if (ybf->buffer_alloc_sz > 0) {
  ------------------
  |  Branch (36:9): [True: 815, False: 322k]
  ------------------
   37|    815|      aom_free(ybf->buffer_alloc);
   38|    815|    }
   39|   323k|#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
   40|   323k|    if (ybf->y_pyramid) {
  ------------------
  |  Branch (40:9): [True: 0, False: 323k]
  ------------------
   41|      0|      aom_free_pyramid(ybf->y_pyramid);
   42|      0|    }
   43|   323k|    if (ybf->corners) {
  ------------------
  |  Branch (43:9): [True: 0, False: 323k]
  ------------------
   44|      0|      av1_free_corner_list(ybf->corners);
   45|      0|    }
   46|   323k|#endif  // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
   47|   323k|    aom_remove_metadata_from_frame_buffer(ybf);
   48|       |    /* buffer_alloc isn't accessed by most functions.  Rather y_buffer,
   49|       |      u_buffer and v_buffer point to buffer_alloc and are used.  Clear out
   50|       |      all of this so that a freed pointer isn't inadvertently used */
   51|   323k|    memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
   52|   323k|    return 0;
   53|   323k|  }
   54|       |
   55|      0|  return AOM_CODEC_MEM_ERROR;
   56|   323k|}
aom_realloc_frame_buffer:
  241|  28.6k|                             bool alloc_pyramid, int alloc_y_plane_only) {
  242|  28.6k|  if (ybf) {
  ------------------
  |  Branch (242:7): [True: 28.6k, False: 0]
  ------------------
  243|  28.6k|    int y_stride = 0;
  244|  28.6k|    int uv_stride = 0;
  245|  28.6k|    uint64_t yplane_size = 0;
  246|  28.6k|    uint64_t uvplane_size = 0;
  247|  28.6k|    const int aligned_width = (width + 7) & ~7;
  248|  28.6k|    const int aligned_height = (height + 7) & ~7;
  249|  28.6k|    const int uv_width = aligned_width >> ss_x;
  250|  28.6k|    const int uv_height = aligned_height >> ss_y;
  251|  28.6k|    const int uv_border_w = border >> ss_x;
  252|  28.6k|    const int uv_border_h = border >> ss_y;
  253|       |
  254|  28.6k|    int error = calc_stride_and_planesize(
  255|  28.6k|        ss_x, ss_y, aligned_width, aligned_height, border, byte_alignment,
  256|  28.6k|        alloc_y_plane_only, &y_stride, &uv_stride, &yplane_size, &uvplane_size,
  257|  28.6k|        uv_height);
  258|  28.6k|    if (error) return error;
  ------------------
  |  Branch (258:9): [True: 0, False: 28.6k]
  ------------------
  259|  28.6k|    return realloc_frame_buffer_aligned(
  260|  28.6k|        ybf, width, height, ss_x, ss_y, use_highbitdepth, border,
  261|  28.6k|        byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size,
  262|  28.6k|        aligned_width, aligned_height, uv_width, uv_height, uv_stride,
  263|  28.6k|        uv_border_w, uv_border_h, alloc_pyramid, alloc_y_plane_only);
  264|  28.6k|  }
  265|      0|  return AOM_CODEC_MEM_ERROR;
  266|  28.6k|}
aom_alloc_frame_buffer:
  271|     32|                           int alloc_y_plane_only) {
  272|     32|  if (ybf) {
  ------------------
  |  Branch (272:7): [True: 32, False: 0]
  ------------------
  273|     32|    aom_free_frame_buffer(ybf);
  274|     32|    return aom_realloc_frame_buffer(
  275|     32|        ybf, width, height, ss_x, ss_y, use_highbitdepth, border,
  276|     32|        byte_alignment, NULL, NULL, NULL, alloc_pyramid, alloc_y_plane_only);
  277|     32|  }
  278|      0|  return AOM_CODEC_MEM_ERROR;
  279|     32|}
aom_remove_metadata_from_frame_buffer:
  281|   323k|void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
  282|   323k|  if (ybf && ybf->metadata) {
  ------------------
  |  Branch (282:7): [True: 323k, False: 0]
  |  Branch (282:14): [True: 0, False: 323k]
  ------------------
  283|      0|    aom_img_metadata_array_free(ybf->metadata);
  284|       |    ybf->metadata = NULL;
  285|      0|  }
  286|   323k|}
yv12config.c:calc_stride_and_planesize:
  213|  28.6k|    uint64_t *yplane_size, uint64_t *uvplane_size, const int uv_height) {
  214|       |  /* Only support allocating buffers that have a border that's a multiple
  215|       |   * of 32. The border restriction is required to get 16-byte alignment of
  216|       |   * the start of the chroma rows without introducing an arbitrary gap
  217|       |   * between planes, which would break the semantics of things like
  218|       |   * aom_img_set_rect(). */
  219|  28.6k|  if (border & 0x1f) return AOM_CODEC_MEM_ERROR;
  ------------------
  |  Branch (219:7): [True: 0, False: 28.6k]
  ------------------
  220|  28.6k|  *y_stride = aom_calc_y_stride(aligned_width, border);
  221|  28.6k|  *yplane_size =
  222|  28.6k|      (aligned_height + 2 * border) * (uint64_t)(*y_stride) + byte_alignment;
  223|       |
  224|  28.6k|  if (!alloc_y_plane_only) {
  ------------------
  |  Branch (224:7): [True: 28.6k, False: 0]
  ------------------
  225|  28.6k|    *uv_stride = *y_stride >> ss_x;
  226|  28.6k|    *uvplane_size =
  227|  28.6k|        (uv_height + 2 * (border >> ss_y)) * (uint64_t)(*uv_stride) +
  228|  28.6k|        byte_alignment;
  229|  28.6k|  } else {
  230|      0|    *uv_stride = 0;
  231|      0|    *uvplane_size = 0;
  232|      0|  }
  233|  28.6k|  return 0;
  234|  28.6k|}
yv12config.c:realloc_frame_buffer_aligned:
   66|  28.6k|    bool alloc_pyramid, int alloc_y_plane_only) {
   67|  28.6k|  if (ybf) {
  ------------------
  |  Branch (67:7): [True: 28.6k, False: 0]
  ------------------
   68|  28.6k|    const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
  ------------------
  |  Branch (68:32): [True: 28.6k, False: 0]
  ------------------
   69|  28.6k|    const uint64_t frame_size =
   70|  28.6k|        (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);
   71|       |
   72|  28.6k|    uint8_t *buf = NULL;
   73|       |
   74|       |#if CONFIG_REALTIME_ONLY || !CONFIG_AV1_ENCODER
   75|       |    // We should only need an 8-bit version of the source frame if we are
   76|       |    // encoding in non-realtime mode
   77|       |    (void)alloc_pyramid;
   78|       |    assert(!alloc_pyramid);
   79|       |#endif  // CONFIG_REALTIME_ONLY || !CONFIG_AV1_ENCODER
   80|       |
   81|  28.6k|#if defined AOM_MAX_ALLOCABLE_MEMORY
   82|       |    // The size of ybf->buffer_alloc.
   83|  28.6k|    uint64_t alloc_size = frame_size;
   84|  28.6k|#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
   85|       |    // The size of ybf->y_pyramid
   86|  28.6k|    if (alloc_pyramid) {
  ------------------
  |  Branch (86:9): [True: 0, False: 28.6k]
  ------------------
   87|      0|      alloc_size += aom_get_pyramid_alloc_size(width, height, use_highbitdepth);
   88|      0|      alloc_size += av1_get_corner_list_size();
   89|      0|    }
   90|  28.6k|#endif  // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
   91|       |    // The decoder may allocate REF_FRAMES frame buffers in the frame buffer
   92|       |    // pool. Bound the total amount of allocated memory as if these REF_FRAMES
   93|       |    // frame buffers were allocated in a single allocation.
   94|  28.6k|    if (alloc_size > AOM_MAX_ALLOCABLE_MEMORY / REF_FRAMES)
  ------------------
  |  |   28|  28.6k|#define AOM_MAX_ALLOCABLE_MEMORY 8589934592  // 8 GB
  ------------------
  |  Branch (94:9): [True: 0, False: 28.6k]
  ------------------
   95|      0|      return AOM_CODEC_MEM_ERROR;
   96|  28.6k|#endif
   97|       |
   98|  28.6k|    if (cb != NULL) {
  ------------------
  |  Branch (98:9): [True: 27.7k, False: 815]
  ------------------
   99|  27.7k|      const int align_addr_extra_size = 31;
  100|  27.7k|      const uint64_t external_frame_size = frame_size + align_addr_extra_size;
  101|       |
  102|  27.7k|      assert(fb != NULL);
  103|       |
  104|  27.7k|      if (external_frame_size != (size_t)external_frame_size)
  ------------------
  |  Branch (104:11): [True: 0, False: 27.7k]
  ------------------
  105|      0|        return AOM_CODEC_MEM_ERROR;
  106|       |
  107|       |      // Allocation to hold larger frame, or first allocation.
  108|  27.7k|      if (cb(cb_priv, (size_t)external_frame_size, fb) < 0)
  ------------------
  |  Branch (108:11): [True: 0, False: 27.7k]
  ------------------
  109|      0|        return AOM_CODEC_MEM_ERROR;
  110|       |
  111|  27.7k|      if (fb->data == NULL || fb->size < external_frame_size)
  ------------------
  |  Branch (111:11): [True: 0, False: 27.7k]
  |  Branch (111:31): [True: 0, False: 27.7k]
  ------------------
  112|      0|        return AOM_CODEC_MEM_ERROR;
  113|       |
  114|  27.7k|      ybf->buffer_alloc = (uint8_t *)aom_align_addr(fb->data, 32);
  ------------------
  |  |   49|  27.7k|  (void *)(((uintptr_t)(addr) + ((align) - 1)) & ~(uintptr_t)((align) - 1))
  ------------------
  115|       |
  116|  27.7k|#if defined(__has_feature)
  117|       |#if __has_feature(memory_sanitizer)
  118|       |      // This memset is needed for fixing the issue of using uninitialized
  119|       |      // value in msan test. It will cause a perf loss, so only do this for
  120|       |      // msan test.
  121|       |      memset(ybf->buffer_alloc, 0, (size_t)frame_size);
  122|       |#endif
  123|  27.7k|#endif
  124|  27.7k|    } else if (frame_size > ybf->buffer_alloc_sz) {
  ------------------
  |  Branch (124:16): [True: 815, False: 0]
  ------------------
  125|       |      // Allocation to hold larger frame, or first allocation.
  126|    815|      aom_free(ybf->buffer_alloc);
  127|    815|      ybf->buffer_alloc = NULL;
  128|    815|      ybf->buffer_alloc_sz = 0;
  129|       |
  130|    815|      if (frame_size != (size_t)frame_size) return AOM_CODEC_MEM_ERROR;
  ------------------
  |  Branch (130:11): [True: 0, False: 815]
  ------------------
  131|       |
  132|    815|      ybf->buffer_alloc = (uint8_t *)aom_memalign(32, (size_t)frame_size);
  133|    815|      if (!ybf->buffer_alloc) return AOM_CODEC_MEM_ERROR;
  ------------------
  |  Branch (133:11): [True: 0, False: 815]
  ------------------
  134|       |
  135|    815|      ybf->buffer_alloc_sz = (size_t)frame_size;
  136|       |
  137|       |      // This memset is needed for fixing valgrind error from C loop filter
  138|       |      // due to access uninitialized memory in frame border. It could be
  139|       |      // removed if border is totally removed.
  140|    815|      memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz);
  141|    815|    }
  142|       |
  143|  28.6k|    ybf->y_crop_width = width;
  144|  28.6k|    ybf->y_crop_height = height;
  145|  28.6k|    ybf->y_width = aligned_width;
  146|  28.6k|    ybf->y_height = aligned_height;
  147|  28.6k|    ybf->y_stride = y_stride;
  148|       |
  149|  28.6k|    ybf->uv_crop_width = (width + ss_x) >> ss_x;
  150|  28.6k|    ybf->uv_crop_height = (height + ss_y) >> ss_y;
  151|  28.6k|    ybf->uv_width = uv_width;
  152|  28.6k|    ybf->uv_height = uv_height;
  153|  28.6k|    ybf->uv_stride = uv_stride;
  154|       |
  155|  28.6k|    ybf->border = border;
  156|  28.6k|    ybf->frame_size = (size_t)frame_size;
  157|  28.6k|    ybf->subsampling_x = ss_x;
  158|  28.6k|    ybf->subsampling_y = ss_y;
  159|       |
  160|  28.6k|    buf = ybf->buffer_alloc;
  161|  28.6k|    if (use_highbitdepth) {
  ------------------
  |  Branch (161:9): [True: 9.26k, False: 19.3k]
  ------------------
  162|       |      // Store uint16 addresses when using 16bit framebuffers
  163|  9.26k|      buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc);
  ------------------
  |  |   76|  9.26k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  164|  9.26k|      ybf->flags = YV12_FLAG_HIGHBITDEPTH;
  ------------------
  |  |  142|  9.26k|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  165|  19.3k|    } else {
  166|  19.3k|      ybf->flags = 0;
  167|  19.3k|    }
  168|       |
  169|  28.6k|    ybf->y_buffer = (uint8_t *)aom_align_addr(
  ------------------
  |  |   49|  28.6k|  (void *)(((uintptr_t)(addr) + ((align) - 1)) & ~(uintptr_t)((align) - 1))
  ------------------
  170|  28.6k|        buf + (border * y_stride) + border, aom_byte_align);
  171|  28.6k|    if (!alloc_y_plane_only) {
  ------------------
  |  Branch (171:9): [True: 28.6k, False: 0]
  ------------------
  172|  28.6k|      ybf->u_buffer = (uint8_t *)aom_align_addr(
  ------------------
  |  |   49|  28.6k|  (void *)(((uintptr_t)(addr) + ((align) - 1)) & ~(uintptr_t)((align) - 1))
  ------------------
  173|  28.6k|          buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w,
  174|  28.6k|          aom_byte_align);
  175|  28.6k|      ybf->v_buffer =
  176|  28.6k|          (uint8_t *)aom_align_addr(buf + yplane_size + uvplane_size +
  ------------------
  |  |   49|  28.6k|  (void *)(((uintptr_t)(addr) + ((align) - 1)) & ~(uintptr_t)((align) - 1))
  ------------------
  177|  28.6k|                                        (uv_border_h * uv_stride) + uv_border_w,
  178|  28.6k|                                    aom_byte_align);
  179|  28.6k|    } else {
  180|      0|      ybf->u_buffer = NULL;
  181|      0|      ybf->v_buffer = NULL;
  182|      0|    }
  183|       |
  184|  28.6k|    ybf->use_external_reference_buffers = 0;
  185|       |
  186|  28.6k|#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
  187|  28.6k|    if (ybf->y_pyramid) {
  ------------------
  |  Branch (187:9): [True: 0, False: 28.6k]
  ------------------
  188|      0|      aom_free_pyramid(ybf->y_pyramid);
  189|      0|      ybf->y_pyramid = NULL;
  190|      0|    }
  191|  28.6k|    if (ybf->corners) {
  ------------------
  |  Branch (191:9): [True: 0, False: 28.6k]
  ------------------
  192|      0|      av1_free_corner_list(ybf->corners);
  193|      0|      ybf->corners = NULL;
  194|      0|    }
  195|  28.6k|    if (alloc_pyramid) {
  ------------------
  |  Branch (195:9): [True: 0, False: 28.6k]
  ------------------
  196|      0|      ybf->y_pyramid = aom_alloc_pyramid(width, height, use_highbitdepth);
  197|      0|      if (!ybf->y_pyramid) return AOM_CODEC_MEM_ERROR;
  ------------------
  |  Branch (197:11): [True: 0, False: 0]
  ------------------
  198|      0|      ybf->corners = av1_alloc_corner_list();
  199|      0|      if (!ybf->corners) return AOM_CODEC_MEM_ERROR;
  ------------------
  |  Branch (199:11): [True: 0, False: 0]
  ------------------
  200|      0|    }
  201|  28.6k|#endif  // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
  202|       |
  203|  28.6k|    ybf->corrupted = 0; /* assume not corrupted by errors */
  204|  28.6k|    return 0;
  205|  28.6k|  }
  206|      0|  return AOM_CODEC_MEM_ERROR;
  207|  28.6k|}

aom_yv12_extend_frame_borders_c:
  148|     32|                                     const int num_planes) {
  149|     32|  assert(ybf->border % 2 == 0);
  150|     32|  assert(ybf->y_height - ybf->y_crop_height < 16);
  151|     32|  assert(ybf->y_width - ybf->y_crop_width < 16);
  152|     32|  assert(ybf->y_height - ybf->y_crop_height >= 0);
  153|     32|  assert(ybf->y_width - ybf->y_crop_width >= 0);
  154|       |
  155|     32|#if CONFIG_AV1_HIGHBITDEPTH
  156|     32|  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
  ------------------
  |  |  142|     32|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (156:7): [True: 0, False: 32]
  ------------------
  157|      0|    for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (157:25): [True: 0, False: 0]
  ------------------
  158|      0|      const int is_uv = plane > 0;
  159|      0|      const int plane_border = ybf->border >> is_uv;
  160|      0|      extend_plane_high(
  161|      0|          ybf->buffers[plane], ybf->strides[is_uv], ybf->crop_widths[is_uv],
  162|      0|          ybf->crop_heights[is_uv], plane_border, plane_border,
  163|      0|          plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv],
  164|      0|          plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv], 0,
  165|      0|          ybf->crop_heights[is_uv]);
  166|      0|    }
  167|      0|    return;
  168|      0|  }
  169|     32|#endif
  170|       |
  171|    128|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (171:23): [True: 96, False: 32]
  ------------------
  172|     96|    const int is_uv = plane > 0;
  173|     96|    const int plane_border = ybf->border >> is_uv;
  174|     96|    extend_plane(ybf->buffers[plane], ybf->strides[is_uv],
  175|     96|                 ybf->crop_widths[is_uv], ybf->crop_heights[is_uv],
  176|     96|                 plane_border, plane_border,
  177|     96|                 plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv],
  178|     96|                 plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv], 0,
  179|     96|                 ybf->crop_heights[is_uv]);
  180|     96|  }
  181|     32|}
aom_extend_frame_borders_c:
  221|     32|void aom_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, const int num_planes) {
  222|     32|  extend_frame(ybf, ybf->border, num_planes);
  223|     32|}
aom_yv12_copy_frame_c:
  237|     32|                           YV12_BUFFER_CONFIG *dst_bc, const int num_planes) {
  238|     32|  assert(src_bc->y_width == dst_bc->y_width);
  239|     32|  assert(src_bc->y_height == dst_bc->y_height);
  240|       |
  241|     32|#if CONFIG_AV1_HIGHBITDEPTH
  242|     32|  assert((src_bc->flags & YV12_FLAG_HIGHBITDEPTH) ==
  243|     32|         (dst_bc->flags & YV12_FLAG_HIGHBITDEPTH));
  244|       |
  245|     32|  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
  ------------------
  |  |  142|     32|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (245:7): [True: 0, False: 32]
  ------------------
  246|      0|    for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (246:25): [True: 0, False: 0]
  ------------------
  247|      0|      const uint8_t *plane_src = src_bc->buffers[plane];
  248|      0|      uint8_t *plane_dst = dst_bc->buffers[plane];
  249|      0|      const int is_uv = plane > 0;
  250|       |
  251|      0|      for (int row = 0; row < src_bc->heights[is_uv]; ++row) {
  ------------------
  |  Branch (251:25): [True: 0, False: 0]
  ------------------
  252|      0|        memcpy_short_addr(plane_dst, plane_src, src_bc->widths[is_uv]);
  253|      0|        plane_src += src_bc->strides[is_uv];
  254|      0|        plane_dst += dst_bc->strides[is_uv];
  255|      0|      }
  256|      0|    }
  257|      0|    aom_yv12_extend_frame_borders_c(dst_bc, num_planes);
  258|      0|    return;
  259|      0|  }
  260|     32|#endif
  261|    128|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (261:23): [True: 96, False: 32]
  ------------------
  262|     96|    const uint8_t *plane_src = src_bc->buffers[plane];
  263|     96|    uint8_t *plane_dst = dst_bc->buffers[plane];
  264|     96|    const int is_uv = plane > 0;
  265|       |
  266|  17.1k|    for (int row = 0; row < src_bc->heights[is_uv]; ++row) {
  ------------------
  |  Branch (266:23): [True: 17.0k, False: 96]
  ------------------
  267|  17.0k|      memcpy(plane_dst, plane_src, src_bc->widths[is_uv]);
  268|  17.0k|      plane_src += src_bc->strides[is_uv];
  269|  17.0k|      plane_dst += dst_bc->strides[is_uv];
  270|  17.0k|    }
  271|     96|  }
  272|     32|  aom_yv12_extend_frame_borders_c(dst_bc, num_planes);
  273|     32|}
aom_yv12_partial_copy_y_c:
  358|  1.37k|                               int vstart2) {
  359|  1.37k|  int row;
  360|  1.37k|  const uint8_t *src = src_ybc->y_buffer;
  361|  1.37k|  uint8_t *dst = dst_ybc->y_buffer;
  362|  1.37k|#if CONFIG_AV1_HIGHBITDEPTH
  363|  1.37k|  if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
  ------------------
  |  |  142|  1.37k|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (363:7): [True: 824, False: 551]
  ------------------
  364|    824|    const uint16_t *src16 =
  365|    824|        CONVERT_TO_SHORTPTR(src + vstart1 * src_ybc->y_stride + hstart1);
  ------------------
  |  |   75|    824|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  366|    824|    uint16_t *dst16 =
  367|    824|        CONVERT_TO_SHORTPTR(dst + vstart2 * dst_ybc->y_stride + hstart2);
  ------------------
  |  |   75|    824|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  368|       |
  369|  53.4k|    for (row = vstart1; row < vend1; ++row) {
  ------------------
  |  Branch (369:25): [True: 52.6k, False: 824]
  ------------------
  370|  52.6k|      memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t));
  371|  52.6k|      src16 += src_ybc->y_stride;
  372|  52.6k|      dst16 += dst_ybc->y_stride;
  373|  52.6k|    }
  374|    824|    return;
  375|    824|  }
  376|    551|#endif
  377|    551|  src = (src + vstart1 * src_ybc->y_stride + hstart1);
  378|    551|  dst = (dst + vstart2 * dst_ybc->y_stride + hstart2);
  379|       |
  380|  35.2k|  for (row = vstart1; row < vend1; ++row) {
  ------------------
  |  Branch (380:23): [True: 34.6k, False: 551]
  ------------------
  381|  34.6k|    memcpy(dst, src, (hend1 - hstart1));
  382|  34.6k|    src += src_ybc->y_stride;
  383|  34.6k|    dst += dst_ybc->y_stride;
  384|  34.6k|  }
  385|    551|}
aom_yv12_partial_coloc_copy_y_c:
  389|  1.37k|                                     int hend, int vstart, int vend) {
  390|  1.37k|  aom_yv12_partial_copy_y_c(src_ybc, hstart, hend, vstart, vend, dst_ybc,
  391|  1.37k|                            hstart, vstart);
  392|  1.37k|}
aom_yv12_partial_copy_u_c:
  397|  1.20k|                               int vstart2) {
  398|  1.20k|  int row;
  399|  1.20k|  const uint8_t *src = src_bc->u_buffer;
  400|  1.20k|  uint8_t *dst = dst_bc->u_buffer;
  401|  1.20k|#if CONFIG_AV1_HIGHBITDEPTH
  402|  1.20k|  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
  ------------------
  |  |  142|  1.20k|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (402:7): [True: 678, False: 526]
  ------------------
  403|    678|    const uint16_t *src16 =
  404|    678|        CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1);
  ------------------
  |  |   75|    678|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  405|    678|    uint16_t *dst16 =
  406|    678|        CONVERT_TO_SHORTPTR(dst + vstart2 * dst_bc->uv_stride + hstart2);
  ------------------
  |  |   75|    678|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  407|  42.2k|    for (row = vstart1; row < vend1; ++row) {
  ------------------
  |  Branch (407:25): [True: 41.6k, False: 678]
  ------------------
  408|  41.6k|      memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t));
  409|  41.6k|      src16 += src_bc->uv_stride;
  410|  41.6k|      dst16 += dst_bc->uv_stride;
  411|  41.6k|    }
  412|    678|    return;
  413|    678|  }
  414|    526|#endif
  415|    526|  src = (src + vstart1 * src_bc->uv_stride + hstart1);
  416|    526|  dst = (dst + vstart2 * dst_bc->uv_stride + hstart2);
  417|       |
  418|  32.6k|  for (row = vstart1; row < vend1; ++row) {
  ------------------
  |  Branch (418:23): [True: 32.1k, False: 526]
  ------------------
  419|  32.1k|    memcpy(dst, src, (hend1 - hstart1));
  420|  32.1k|    src += src_bc->uv_stride;
  421|  32.1k|    dst += dst_bc->uv_stride;
  422|  32.1k|  }
  423|    526|}
aom_yv12_partial_coloc_copy_u_c:
  427|  1.20k|                                     int hend, int vstart, int vend) {
  428|  1.20k|  aom_yv12_partial_copy_u_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart,
  429|  1.20k|                            vstart);
  430|  1.20k|}
aom_yv12_partial_copy_v_c:
  435|  1.43k|                               int vstart2) {
  436|  1.43k|  int row;
  437|  1.43k|  const uint8_t *src = src_bc->v_buffer;
  438|  1.43k|  uint8_t *dst = dst_bc->v_buffer;
  439|  1.43k|#if CONFIG_AV1_HIGHBITDEPTH
  440|  1.43k|  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
  ------------------
  |  |  142|  1.43k|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (440:7): [True: 812, False: 619]
  ------------------
  441|    812|    const uint16_t *src16 =
  442|    812|        CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1);
  ------------------
  |  |   75|    812|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  443|    812|    uint16_t *dst16 =
  444|    812|        CONVERT_TO_SHORTPTR(dst + vstart2 * dst_bc->uv_stride + hstart2);
  ------------------
  |  |   75|    812|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  445|  51.6k|    for (row = vstart1; row < vend1; ++row) {
  ------------------
  |  Branch (445:25): [True: 50.8k, False: 812]
  ------------------
  446|  50.8k|      memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t));
  447|  50.8k|      src16 += src_bc->uv_stride;
  448|  50.8k|      dst16 += dst_bc->uv_stride;
  449|  50.8k|    }
  450|    812|    return;
  451|    812|  }
  452|    619|#endif
  453|    619|  src = (src + vstart1 * src_bc->uv_stride + hstart1);
  454|    619|  dst = (dst + vstart2 * dst_bc->uv_stride + hstart2);
  455|       |
  456|  40.5k|  for (row = vstart1; row < vend1; ++row) {
  ------------------
  |  Branch (456:23): [True: 39.9k, False: 619]
  ------------------
  457|  39.9k|    memcpy(dst, src, (hend1 - hstart1));
  458|  39.9k|    src += src_bc->uv_stride;
  459|  39.9k|    dst += dst_bc->uv_stride;
  460|  39.9k|  }
  461|    619|}
aom_yv12_partial_coloc_copy_v_c:
  465|  1.43k|                                     int hend, int vstart, int vend) {
  466|  1.43k|  aom_yv12_partial_copy_v_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart,
  467|  1.43k|                            vstart);
  468|  1.43k|}
yv12extend.c:extend_plane:
   25|    192|                         int v_end) {
   26|    192|  assert(src != NULL);
   27|    192|  int i;
   28|    192|  const int linesize = extend_left + extend_right + width;
   29|    192|  assert(linesize <= src_stride);
   30|       |
   31|       |  /* copy the left and right most columns out */
   32|    192|  uint8_t *src_ptr1 = src + v_start * src_stride;
   33|    192|  uint8_t *src_ptr2 = src + v_start * src_stride + width - 1;
   34|    192|  uint8_t *dst_ptr1 = src + v_start * src_stride - extend_left;
   35|    192|  uint8_t *dst_ptr2 = src_ptr2 + 1;
   36|       |
   37|  33.2k|  for (i = v_start; i < v_end; ++i) {
  ------------------
  |  Branch (37:21): [True: 33.0k, False: 192]
  ------------------
   38|  33.0k|    memset(dst_ptr1, src_ptr1[0], extend_left);
   39|  33.0k|    memset(dst_ptr2, src_ptr2[0], extend_right);
   40|  33.0k|    src_ptr1 += src_stride;
   41|  33.0k|    src_ptr2 += src_stride;
   42|  33.0k|    dst_ptr1 += src_stride;
   43|  33.0k|    dst_ptr2 += src_stride;
   44|  33.0k|  }
   45|       |
   46|       |  /* Now copy the top and bottom lines into each line of the respective
   47|       |   * borders
   48|       |   */
   49|    192|  src_ptr1 = src - extend_left;
   50|    192|  dst_ptr1 = src_ptr1 + src_stride * -extend_top;
   51|       |
   52|  46.2k|  for (i = 0; i < extend_top; ++i) {
  ------------------
  |  Branch (52:15): [True: 46.0k, False: 192]
  ------------------
   53|  46.0k|    memcpy(dst_ptr1, src_ptr1, linesize);
   54|  46.0k|    dst_ptr1 += src_stride;
   55|  46.0k|  }
   56|       |
   57|    192|  src_ptr2 = src_ptr1 + src_stride * (height - 1);
   58|    192|  dst_ptr2 = src_ptr2;
   59|       |
   60|  47.3k|  for (i = 0; i < extend_bottom; ++i) {
  ------------------
  |  Branch (60:15): [True: 47.1k, False: 192]
  ------------------
   61|  47.1k|    dst_ptr2 += src_stride;
   62|  47.1k|    memcpy(dst_ptr2, src_ptr2, linesize);
   63|  47.1k|  }
   64|    192|}
yv12extend.c:extend_frame:
  184|     32|                         const int num_planes) {
  185|     32|  const int ss_x = ybf->subsampling_x;
  186|     32|  const int ss_y = ybf->subsampling_y;
  187|       |
  188|     32|  assert(ybf->y_height - ybf->y_crop_height < 16);
  189|     32|  assert(ybf->y_width - ybf->y_crop_width < 16);
  190|     32|  assert(ybf->y_height - ybf->y_crop_height >= 0);
  191|     32|  assert(ybf->y_width - ybf->y_crop_width >= 0);
  192|       |
  193|     32|#if CONFIG_AV1_HIGHBITDEPTH
  194|     32|  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
  ------------------
  |  |  142|     32|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (194:7): [True: 0, False: 32]
  ------------------
  195|      0|    for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (195:25): [True: 0, False: 0]
  ------------------
  196|      0|      const int is_uv = plane > 0;
  197|      0|      const int top = ext_size >> (is_uv ? ss_y : 0);
  ------------------
  |  Branch (197:36): [True: 0, False: 0]
  ------------------
  198|      0|      const int left = ext_size >> (is_uv ? ss_x : 0);
  ------------------
  |  Branch (198:37): [True: 0, False: 0]
  ------------------
  199|      0|      const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv];
  200|      0|      const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv];
  201|      0|      extend_plane_high(ybf->buffers[plane], ybf->strides[is_uv],
  202|      0|                        ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top,
  203|      0|                        left, bottom, right, 0, ybf->crop_heights[is_uv]);
  204|      0|    }
  205|      0|    return;
  206|      0|  }
  207|     32|#endif
  208|       |
  209|    128|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (209:23): [True: 96, False: 32]
  ------------------
  210|     96|    const int is_uv = plane > 0;
  211|     96|    const int top = ext_size >> (is_uv ? ss_y : 0);
  ------------------
  |  Branch (211:34): [True: 64, False: 32]
  ------------------
  212|     96|    const int left = ext_size >> (is_uv ? ss_x : 0);
  ------------------
  |  Branch (212:35): [True: 64, False: 32]
  ------------------
  213|     96|    const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv];
  214|     96|    const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv];
  215|     96|    extend_plane(ybf->buffers[plane], ybf->strides[is_uv],
  216|     96|                 ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top, left,
  217|     96|                 bottom, right, 0, ybf->crop_heights[is_uv]);
  218|     96|  }
  219|     32|}

yv12config.c:aom_calc_y_stride:
  215|  28.6k|static inline int aom_calc_y_stride(int aligned_width, int border) {
  216|  28.6k|  return ((aligned_width + 2 * border) + 31) & ~31;
  217|  28.6k|}

aom_get_worker_interface:
  242|   476k|const AVxWorkerInterface *aom_get_worker_interface(void) {
  243|   476k|  return &g_worker_interface;
  244|   476k|}
aom_thread.c:init:
  123|   325k|static void init(AVxWorker *const worker) {
  124|   325k|  memset(worker, 0, sizeof(*worker));
  125|   325k|  worker->status_ = AVX_WORKER_STATUS_NOT_OK;
  126|   325k|}
aom_thread.c:reset:
  136|   280k|static int reset(AVxWorker *const worker) {
  137|   280k|  int ok = 1;
  138|   280k|  worker->had_error = 0;
  139|   280k|  if (worker->status_ < AVX_WORKER_STATUS_OK) {
  ------------------
  |  Branch (139:7): [True: 280k, False: 0]
  ------------------
  140|   280k|#if CONFIG_MULTITHREAD
  141|   280k|    worker->impl_ = (AVxWorkerImpl *)aom_calloc(1, sizeof(*worker->impl_));
  142|   280k|    if (worker->impl_ == NULL) {
  ------------------
  |  Branch (142:9): [True: 0, False: 280k]
  ------------------
  143|      0|      return 0;
  144|      0|    }
  145|   280k|    if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) {
  ------------------
  |  Branch (145:9): [True: 0, False: 280k]
  ------------------
  146|      0|      goto Error;
  147|      0|    }
  148|   280k|    if (pthread_cond_init(&worker->impl_->condition_, NULL)) {
  ------------------
  |  Branch (148:9): [True: 0, False: 280k]
  ------------------
  149|      0|      pthread_mutex_destroy(&worker->impl_->mutex_);
  150|      0|      goto Error;
  151|      0|    }
  152|   280k|    pthread_attr_t attr;
  153|   280k|    if (pthread_attr_init(&attr)) goto Error2;
  ------------------
  |  Branch (153:9): [True: 0, False: 280k]
  ------------------
  154|       |    // Debug ASan builds require at least ~1MiB of stack; prevents
  155|       |    // failures on macOS arm64 where the default is 512KiB.
  156|       |    // See: https://crbug.com/aomedia/3379
  157|       |#if defined(AOM_ADDRESS_SANITIZER) && defined(__APPLE__) && AOM_ARCH_ARM && \
  158|       |    !defined(NDEBUG)
  159|       |    const size_t kMinStackSize = 1024 * 1024;
  160|       |#else
  161|   280k|    const size_t kMinStackSize = 256 * 1024;
  162|   280k|#endif
  163|   280k|    size_t stacksize;
  164|   280k|    if (!pthread_attr_getstacksize(&attr, &stacksize)) {
  ------------------
  |  Branch (164:9): [True: 280k, False: 0]
  ------------------
  165|   280k|      if (stacksize < kMinStackSize &&
  ------------------
  |  Branch (165:11): [True: 0, False: 280k]
  ------------------
  166|      0|          pthread_attr_setstacksize(&attr, kMinStackSize)) {
  ------------------
  |  Branch (166:11): [True: 0, False: 0]
  ------------------
  167|      0|        pthread_attr_destroy(&attr);
  168|      0|        goto Error2;
  169|      0|      }
  170|   280k|    }
  171|   280k|    pthread_mutex_lock(&worker->impl_->mutex_);
  172|   280k|    ok = !pthread_create(&worker->impl_->thread_, &attr, thread_loop, worker);
  173|   280k|    if (ok) worker->status_ = AVX_WORKER_STATUS_OK;
  ------------------
  |  Branch (173:9): [True: 280k, False: 0]
  ------------------
  174|   280k|    pthread_mutex_unlock(&worker->impl_->mutex_);
  175|   280k|    pthread_attr_destroy(&attr);
  176|   280k|    if (!ok) {
  ------------------
  |  Branch (176:9): [True: 0, False: 280k]
  ------------------
  177|      0|    Error2:
  178|      0|      pthread_mutex_destroy(&worker->impl_->mutex_);
  179|      0|      pthread_cond_destroy(&worker->impl_->condition_);
  180|      0|    Error:
  181|      0|      aom_free(worker->impl_);
  182|      0|      worker->impl_ = NULL;
  183|      0|      return 0;
  184|      0|    }
  185|       |#else
  186|       |    worker->status_ = AVX_WORKER_STATUS_OK;
  187|       |#endif
  188|   280k|  } else if (worker->status_ > AVX_WORKER_STATUS_OK) {
  ------------------
  |  Branch (188:14): [True: 0, False: 0]
  ------------------
  189|      0|    ok = sync(worker);
  190|      0|  }
  191|   280k|  assert(!ok || (worker->status_ == AVX_WORKER_STATUS_OK));
  192|   280k|  return ok;
  193|   280k|}
aom_thread.c:thread_loop:
   45|   280k|static THREADFN thread_loop(void *ptr) {
   46|   280k|  AVxWorker *const worker = (AVxWorker *)ptr;
   47|       |#ifdef __APPLE__
   48|       |  if (worker->thread_name != NULL) {
   49|       |    // Apple's version of pthread_setname_np takes one argument and operates on
   50|       |    // the current thread only. The maximum size of the thread_name buffer was
   51|       |    // noted in the Chromium source code and was confirmed by experiments. If
   52|       |    // thread_name is too long, pthread_setname_np returns -1 with errno
   53|       |    // ENAMETOOLONG (63).
   54|       |    char thread_name[64];
   55|       |    strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
   56|       |    thread_name[sizeof(thread_name) - 1] = '\0';
   57|       |    pthread_setname_np(thread_name);
   58|       |  }
   59|       |#elif (defined(__GLIBC__) && !defined(__GNU__)) || defined(__BIONIC__)
   60|   280k|  if (worker->thread_name != NULL) {
  ------------------
  |  Branch (60:7): [True: 280k, False: 2]
  ------------------
   61|       |    // Linux and Android require names (with nul) fit in 16 chars, otherwise
   62|       |    // pthread_setname_np() returns ERANGE (34).
   63|   280k|    char thread_name[16];
   64|   280k|    strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
   65|   280k|    thread_name[sizeof(thread_name) - 1] = '\0';
   66|   280k|    pthread_setname_np(pthread_self(), thread_name);
   67|   280k|  }
   68|   280k|#endif
   69|   280k|  pthread_mutex_lock(&worker->impl_->mutex_);
   70|   401k|  for (;;) {
   71|   802k|    while (worker->status_ == AVX_WORKER_STATUS_OK) {  // wait in idling mode
  ------------------
  |  Branch (71:12): [True: 401k, False: 401k]
  ------------------
   72|   401k|      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
   73|   401k|    }
   74|   401k|    if (worker->status_ == AVX_WORKER_STATUS_WORKING) {
  ------------------
  |  Branch (74:9): [True: 120k, False: 280k]
  ------------------
   75|       |      // When worker->status_ is AVX_WORKER_STATUS_WORKING, the main thread
   76|       |      // doesn't change worker->status_ and will wait until the worker changes
   77|       |      // worker->status_ to AVX_WORKER_STATUS_OK. See change_state(). So the
   78|       |      // worker can safely call execute() without holding worker->impl_->mutex_.
   79|       |      // When the worker reacquires worker->impl_->mutex_, worker->status_ must
   80|       |      // still be AVX_WORKER_STATUS_WORKING.
   81|   120k|      pthread_mutex_unlock(&worker->impl_->mutex_);
   82|   120k|      execute(worker);
   83|   120k|      pthread_mutex_lock(&worker->impl_->mutex_);
   84|   120k|      assert(worker->status_ == AVX_WORKER_STATUS_WORKING);
   85|   120k|      worker->status_ = AVX_WORKER_STATUS_OK;
   86|       |      // signal to the main thread that we're done (for sync())
   87|   120k|      pthread_cond_signal(&worker->impl_->condition_);
   88|   280k|    } else {
   89|   280k|      assert(worker->status_ == AVX_WORKER_STATUS_NOT_OK);  // finish the worker
   90|   280k|      break;
   91|   280k|    }
   92|   401k|  }
   93|   280k|  pthread_mutex_unlock(&worker->impl_->mutex_);
   94|       |  return THREAD_EXIT_SUCCESS;  // Thread is finished
  ------------------
  |  |  170|   280k|#define THREAD_EXIT_SUCCESS NULL
  ------------------
   95|   280k|}
aom_thread.c:sync:
  128|   388k|static int sync(AVxWorker *const worker) {
  129|   388k|#if CONFIG_MULTITHREAD
  130|   388k|  change_state(worker, AVX_WORKER_STATUS_OK);
  131|   388k|#endif
  132|       |  assert(worker->status_ <= AVX_WORKER_STATUS_OK);
  133|   388k|  return !worker->had_error;
  134|   388k|}
aom_thread.c:change_state:
   98|   789k|static void change_state(AVxWorker *const worker, AVxWorkerStatus new_status) {
   99|       |  // No-op when attempting to change state on a thread that didn't come up.
  100|       |  // Checking status_ without acquiring the lock first would result in a data
  101|       |  // race.
  102|   789k|  if (worker->impl_ == NULL) return;
  ------------------
  |  Branch (102:7): [True: 58.2k, False: 731k]
  ------------------
  103|       |
  104|   731k|  pthread_mutex_lock(&worker->impl_->mutex_);
  105|   731k|  if (worker->status_ >= AVX_WORKER_STATUS_OK) {
  ------------------
  |  Branch (105:7): [True: 731k, False: 0]
  ------------------
  106|       |    // wait for the worker to finish
  107|   742k|    while (worker->status_ != AVX_WORKER_STATUS_OK) {
  ------------------
  |  Branch (107:12): [True: 11.3k, False: 731k]
  ------------------
  108|  11.3k|      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
  109|  11.3k|    }
  110|       |    // assign new status and release the working thread if needed
  111|   731k|    if (new_status != AVX_WORKER_STATUS_OK) {
  ------------------
  |  Branch (111:9): [True: 401k, False: 330k]
  ------------------
  112|   401k|      worker->status_ = new_status;
  113|   401k|      pthread_cond_signal(&worker->impl_->condition_);
  114|   401k|    }
  115|   731k|  }
  116|   731k|  pthread_mutex_unlock(&worker->impl_->mutex_);
  117|   731k|}
aom_thread.c:launch:
  201|   120k|static void launch(AVxWorker *const worker) {
  202|   120k|#if CONFIG_MULTITHREAD
  203|   120k|  change_state(worker, AVX_WORKER_STATUS_WORKING);
  204|       |#else
  205|       |  execute(worker);
  206|       |#endif
  207|   120k|}
aom_thread.c:execute:
  195|   166k|static void execute(AVxWorker *const worker) {
  196|   166k|  if (worker->hook != NULL) {
  ------------------
  |  Branch (196:7): [True: 166k, False: 18.4E]
  ------------------
  197|   166k|    worker->had_error |= !worker->hook(worker->data1, worker->data2);
  198|   166k|  }
  199|   166k|}
aom_thread.c:end:
  209|   325k|static void end(AVxWorker *const worker) {
  210|   325k|#if CONFIG_MULTITHREAD
  211|   325k|  if (worker->impl_ != NULL) {
  ------------------
  |  Branch (211:7): [True: 280k, False: 45.0k]
  ------------------
  212|   280k|    change_state(worker, AVX_WORKER_STATUS_NOT_OK);
  213|   280k|    pthread_join(worker->impl_->thread_, NULL);
  214|   280k|    pthread_mutex_destroy(&worker->impl_->mutex_);
  215|   280k|    pthread_cond_destroy(&worker->impl_->condition_);
  216|   280k|    aom_free(worker->impl_);
  217|   280k|    worker->impl_ = NULL;
  218|   280k|  }
  219|       |#else
  220|       |  worker->status_ = AVX_WORKER_STATUS_NOT_OK;
  221|       |  assert(worker->impl_ == NULL);
  222|       |#endif
  223|       |  assert(worker->status_ == AVX_WORKER_STATUS_NOT_OK);
  224|   325k|}

aom_codec_av1_dx:
 1786|  23.4k|aom_codec_iface_t *aom_codec_av1_dx(void) { return &aom_codec_av1_dx_algo; }
av1_dx_iface.c:decoder_init:
   86|  17.9k|static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx) {
   87|       |  // This function only allocates space for the aom_codec_alg_priv_t
   88|       |  // structure. More memory may be required at the time the stream
   89|       |  // information becomes known.
   90|  17.9k|  if (!ctx->priv) {
  ------------------
  |  Branch (90:7): [True: 17.9k, False: 0]
  ------------------
   91|  17.9k|    aom_codec_alg_priv_t *const priv =
   92|  17.9k|        (aom_codec_alg_priv_t *)aom_calloc(1, sizeof(*priv));
   93|  17.9k|    if (priv == NULL) return AOM_CODEC_MEM_ERROR;
  ------------------
  |  Branch (93:9): [True: 0, False: 17.9k]
  ------------------
   94|       |
   95|  17.9k|    ctx->priv = (aom_codec_priv_t *)priv;
   96|  17.9k|    ctx->priv->init_flags = ctx->init_flags;
   97|  17.9k|    priv->flushed = 0;
   98|       |
   99|       |    // TODO(tdaede): this should not be exposed to the API
  100|  17.9k|    priv->cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
  ------------------
  |  |   79|  17.9k|#define FORCE_HIGHBITDEPTH_DECODING 0
  ------------------
  101|  17.9k|    if (ctx->config.dec) {
  ------------------
  |  Branch (101:9): [True: 17.9k, False: 0]
  ------------------
  102|  17.9k|      priv->cfg = *ctx->config.dec;
  103|  17.9k|      ctx->config.dec = &priv->cfg;
  104|  17.9k|    }
  105|  17.9k|    priv->num_grain_image_frame_buffers = 0;
  106|       |    // Turn row_mt on by default.
  107|  17.9k|    priv->row_mt = 1;
  108|       |
  109|       |    // Turn on normal tile coding mode by default.
  110|       |    // 0 is for normal tile coding mode, and 1 is for large scale tile coding
  111|       |    // mode(refer to lightfield example).
  112|  17.9k|    priv->tile_mode = 0;
  113|  17.9k|    priv->decode_tile_row = -1;
  114|  17.9k|    priv->decode_tile_col = -1;
  115|  17.9k|  }
  116|       |
  117|  17.9k|  return AOM_CODEC_OK;
  118|  17.9k|}
av1_dx_iface.c:decoder_destroy:
  120|  17.9k|static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
  121|  17.9k|  if (ctx->frame_worker != NULL) {
  ------------------
  |  Branch (121:7): [True: 17.9k, False: 0]
  ------------------
  122|  17.9k|    AVxWorker *const worker = ctx->frame_worker;
  123|  17.9k|    aom_get_worker_interface()->end(worker);
  124|  17.9k|    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
  125|  17.9k|    if (frame_worker_data != NULL && frame_worker_data->pbi != NULL) {
  ------------------
  |  Branch (125:9): [True: 17.9k, False: 0]
  |  Branch (125:38): [True: 17.9k, False: 0]
  ------------------
  126|  17.9k|      AV1Decoder *const pbi = frame_worker_data->pbi;
  127|  17.9k|      aom_free(pbi->common.tpl_mvs);
  128|  17.9k|      pbi->common.tpl_mvs = NULL;
  129|  17.9k|      av1_remove_common(&pbi->common);
  130|  17.9k|      av1_free_cdef_buffers(&pbi->common, &pbi->cdef_worker, &pbi->cdef_sync);
  131|  17.9k|      av1_free_cdef_sync(&pbi->cdef_sync);
  132|  17.9k|      av1_free_restoration_buffers(&pbi->common);
  133|  17.9k|      av1_decoder_remove(pbi);
  134|  17.9k|    }
  135|  17.9k|    aom_free(frame_worker_data);
  136|  17.9k|  }
  137|       |
  138|  17.9k|  if (ctx->buffer_pool) {
  ------------------
  |  Branch (138:7): [True: 17.9k, False: 0]
  ------------------
  139|  17.9k|    for (size_t i = 0; i < ctx->num_grain_image_frame_buffers; i++) {
  ------------------
  |  Branch (139:24): [True: 0, False: 17.9k]
  ------------------
  140|      0|      ctx->buffer_pool->release_fb_cb(ctx->buffer_pool->cb_priv,
  141|      0|                                      &ctx->grain_image_frame_buffers[i]);
  142|      0|    }
  143|  17.9k|    av1_free_ref_frame_buffers(ctx->buffer_pool);
  144|  17.9k|    av1_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
  145|  17.9k|#if CONFIG_MULTITHREAD
  146|  17.9k|    pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
  147|  17.9k|#endif
  148|  17.9k|  }
  149|       |
  150|  17.9k|  aom_free(ctx->frame_worker);
  151|  17.9k|  aom_free(ctx->buffer_pool);
  152|       |  assert(!ctx->img.self_allocd);
  153|  17.9k|  aom_img_free(&ctx->img);
  154|  17.9k|  aom_free(ctx);
  155|  17.9k|  return AOM_CODEC_OK;
  156|  17.9k|}
av1_dx_iface.c:ctrl_set_operating_point:
 1632|  17.9k|                                                va_list args) {
 1633|       |  ctx->operating_point = va_arg(args, int);
 1634|  17.9k|  return AOM_CODEC_OK;
 1635|  17.9k|}
av1_dx_iface.c:ctrl_set_output_all_layers:
 1638|  17.9k|                                                  va_list args) {
 1639|       |  ctx->output_all_layers = va_arg(args, int);
 1640|  17.9k|  return AOM_CODEC_OK;
 1641|  17.9k|}
av1_dx_iface.c:decoder_peek_si:
  358|  23.4k|                                       aom_codec_stream_info_t *si) {
  359|       |  return decoder_peek_si_internal(data, data_sz, si, NULL);
  360|  23.4k|}
av1_dx_iface.c:decoder_peek_si_internal:
  262|  41.4k|                                                int *is_intra_only) {
  263|  41.4k|  int intra_only_flag = 0;
  264|  41.4k|  int got_sequence_header = 0;
  265|  41.4k|  int found_keyframe = 0;
  266|       |
  267|  41.4k|  if (data + data_sz <= data || data_sz < 1) return AOM_CODEC_INVALID_PARAM;
  ------------------
  |  Branch (267:7): [True: 0, False: 41.4k]
  |  Branch (267:33): [True: 0, False: 41.4k]
  ------------------
  268|       |
  269|  41.4k|  si->w = 0;
  270|  41.4k|  si->h = 0;
  271|  41.4k|  si->is_kf = 0;  // is_kf indicates whether the current packet contains a RAP
  272|       |
  273|  41.4k|  ObuHeader obu_header;
  274|  41.4k|  memset(&obu_header, 0, sizeof(obu_header));
  275|  41.4k|  size_t payload_size = 0;
  276|  41.4k|  size_t bytes_read = 0;
  277|  41.4k|  uint8_t reduced_still_picture_hdr = 0;
  278|  41.4k|  aom_codec_err_t status = aom_read_obu_header_and_size(
  279|  41.4k|      data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
  280|  41.4k|  if (status != AOM_CODEC_OK) return status;
  ------------------
  |  Branch (280:7): [True: 185, False: 41.2k]
  ------------------
  281|       |
  282|       |  // If the first OBU is a temporal delimiter, skip over it and look at the next
  283|       |  // OBU in the bitstream
  284|  41.2k|  if (obu_header.type == OBU_TEMPORAL_DELIMITER) {
  ------------------
  |  Branch (284:7): [True: 26.2k, False: 14.9k]
  ------------------
  285|       |    // Skip any associated payload (there shouldn't be one, but just in case)
  286|  26.2k|    if (data_sz < bytes_read + payload_size) return AOM_CODEC_CORRUPT_FRAME;
  ------------------
  |  Branch (286:9): [True: 25, False: 26.2k]
  ------------------
  287|  26.2k|    data += bytes_read + payload_size;
  288|  26.2k|    data_sz -= bytes_read + payload_size;
  289|       |
  290|  26.2k|    status = aom_read_obu_header_and_size(
  291|  26.2k|        data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
  292|  26.2k|    if (status != AOM_CODEC_OK) return status;
  ------------------
  |  Branch (292:9): [True: 31, False: 26.2k]
  ------------------
  293|  26.2k|  }
  294|  85.5k|  while (1) {
  ------------------
  |  Branch (294:10): [True: 85.5k, Folded]
  ------------------
  295|  85.5k|    data += bytes_read;
  296|  85.5k|    data_sz -= bytes_read;
  297|  85.5k|    if (data_sz < payload_size) return AOM_CODEC_CORRUPT_FRAME;
  ------------------
  |  Branch (297:9): [True: 219, False: 85.3k]
  ------------------
  298|       |    // Check that the selected OBU is a sequence header
  299|  85.3k|    if (obu_header.type == OBU_SEQUENCE_HEADER) {
  ------------------
  |  Branch (299:9): [True: 37.1k, False: 48.1k]
  ------------------
  300|       |      // Sanity check on sequence header size
  301|  37.1k|      if (data_sz < 2) return AOM_CODEC_CORRUPT_FRAME;
  ------------------
  |  Branch (301:11): [True: 2, False: 37.1k]
  ------------------
  302|       |      // Read a few values from the sequence header payload
  303|  37.1k|      struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
  304|       |
  305|  37.1k|      av1_read_profile(&rb);  // profile
  306|  37.1k|      const uint8_t still_picture = aom_rb_read_bit(&rb);
  307|  37.1k|      reduced_still_picture_hdr = aom_rb_read_bit(&rb);
  308|       |
  309|  37.1k|      if (!still_picture && reduced_still_picture_hdr) {
  ------------------
  |  Branch (309:11): [True: 18.7k, False: 18.4k]
  |  Branch (309:29): [True: 3, False: 18.7k]
  ------------------
  310|      3|        return AOM_CODEC_UNSUP_BITSTREAM;
  311|      3|      }
  312|       |
  313|  37.1k|      status = parse_operating_points(&rb, reduced_still_picture_hdr, si);
  314|  37.1k|      if (status != AOM_CODEC_OK) return status;
  ------------------
  |  Branch (314:11): [True: 4, False: 37.1k]
  ------------------
  315|       |
  316|  37.1k|      int num_bits_width = aom_rb_read_literal(&rb, 4) + 1;
  317|  37.1k|      int num_bits_height = aom_rb_read_literal(&rb, 4) + 1;
  318|  37.1k|      int max_frame_width = aom_rb_read_literal(&rb, num_bits_width) + 1;
  319|  37.1k|      int max_frame_height = aom_rb_read_literal(&rb, num_bits_height) + 1;
  320|  37.1k|      si->w = max_frame_width;
  321|  37.1k|      si->h = max_frame_height;
  322|  37.1k|      got_sequence_header = 1;
  323|  48.1k|    } else if (obu_header.type == OBU_FRAME_HEADER ||
  ------------------
  |  Branch (323:16): [True: 427, False: 47.7k]
  ------------------
  324|  47.7k|               obu_header.type == OBU_FRAME) {
  ------------------
  |  Branch (324:16): [True: 43.1k, False: 4.56k]
  ------------------
  325|  43.6k|      if (got_sequence_header && reduced_still_picture_hdr) {
  ------------------
  |  Branch (325:11): [True: 36.7k, False: 6.85k]
  |  Branch (325:34): [True: 18.1k, False: 18.6k]
  ------------------
  326|  18.1k|        found_keyframe = 1;
  327|  18.1k|        break;
  328|  25.5k|      } else {
  329|       |        // make sure we have enough bits to get the frame type out
  330|  25.5k|        if (data_sz < 1) return AOM_CODEC_CORRUPT_FRAME;
  ------------------
  |  Branch (330:13): [True: 2, False: 25.5k]
  ------------------
  331|  25.5k|        struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
  332|  25.5k|        const int show_existing_frame = aom_rb_read_bit(&rb);
  333|  25.5k|        if (!show_existing_frame) {
  ------------------
  |  Branch (333:13): [True: 25.2k, False: 298]
  ------------------
  334|  25.2k|          const FRAME_TYPE frame_type = (FRAME_TYPE)aom_rb_read_literal(&rb, 2);
  335|  25.2k|          if (frame_type == KEY_FRAME) {
  ------------------
  |  Branch (335:15): [True: 18.1k, False: 7.05k]
  ------------------
  336|  18.1k|            found_keyframe = 1;
  337|  18.1k|            break;  // Stop here as no further OBUs will change the outcome.
  338|  18.1k|          } else if (frame_type == INTRA_ONLY_FRAME) {
  ------------------
  |  Branch (338:22): [True: 480, False: 6.57k]
  ------------------
  339|    480|            intra_only_flag = 1;
  340|    480|          }
  341|  25.2k|        }
  342|  25.5k|      }
  343|  43.6k|    }
  344|       |    // skip past any unread OBU header data
  345|  49.1k|    data += payload_size;
  346|  49.1k|    data_sz -= payload_size;
  347|  49.1k|    if (data_sz == 0) break;  // exit if we're out of OBUs
  ------------------
  |  Branch (347:9): [True: 4.17k, False: 44.9k]
  ------------------
  348|  44.9k|    status = aom_read_obu_header_and_size(
  349|  44.9k|        data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
  350|  44.9k|    if (status != AOM_CODEC_OK) return status;
  ------------------
  |  Branch (350:9): [True: 530, False: 44.3k]
  ------------------
  351|  44.9k|  }
  352|  40.4k|  if (got_sequence_header && found_keyframe) si->is_kf = 1;
  ------------------
  |  Branch (352:7): [True: 36.6k, False: 3.74k]
  |  Branch (352:30): [True: 36.1k, False: 525]
  ------------------
  353|  40.4k|  if (is_intra_only != NULL) *is_intra_only = intra_only_flag;
  ------------------
  |  Branch (353:7): [True: 17.9k, False: 22.4k]
  ------------------
  354|  40.4k|  return AOM_CODEC_OK;
  355|  41.1k|}
av1_dx_iface.c:parse_operating_points:
  206|  37.1k|                                              aom_codec_stream_info_t *si) {
  207|  37.1k|  int operating_point_idc0 = 0;
  208|  37.1k|  if (is_reduced_header) {
  ------------------
  |  Branch (208:7): [True: 18.1k, False: 19.0k]
  ------------------
  209|  18.1k|    aom_rb_read_literal(rb, LEVEL_BITS);  // level
  ------------------
  |  |  464|  18.1k|#define LEVEL_BITS 5
  ------------------
  210|  19.0k|  } else {
  211|  19.0k|    uint8_t decoder_model_info_present_flag = 0;
  212|  19.0k|    int buffer_delay_length_minus_1 = 0;
  213|  19.0k|    aom_codec_err_t status;
  214|  19.0k|    const uint8_t timing_info_present_flag = aom_rb_read_bit(rb);
  215|  19.0k|    if (timing_info_present_flag) {
  ------------------
  |  Branch (215:9): [True: 465, False: 18.5k]
  ------------------
  216|    465|      if ((status = parse_timing_info(rb)) != AOM_CODEC_OK) return status;
  ------------------
  |  Branch (216:11): [True: 4, False: 461]
  ------------------
  217|    461|      decoder_model_info_present_flag = aom_rb_read_bit(rb);
  218|    461|      if (decoder_model_info_present_flag) {
  ------------------
  |  Branch (218:11): [True: 271, False: 190]
  ------------------
  219|    271|        if ((status = parse_decoder_model_info(
  ------------------
  |  Branch (219:13): [True: 0, False: 271]
  ------------------
  220|    271|                 rb, &buffer_delay_length_minus_1)) != AOM_CODEC_OK)
  221|      0|          return status;
  222|    271|      }
  223|    461|    }
  224|  19.0k|    const uint8_t initial_display_delay_present_flag = aom_rb_read_bit(rb);
  225|  19.0k|    const uint8_t operating_points_cnt_minus_1 =
  226|  19.0k|        aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS);
  ------------------
  |  |   93|  19.0k|#define OP_POINTS_CNT_MINUS_1_BITS 5
  ------------------
  227|  52.3k|    for (int i = 0; i < operating_points_cnt_minus_1 + 1; i++) {
  ------------------
  |  Branch (227:21): [True: 33.3k, False: 19.0k]
  ------------------
  228|  33.3k|      int operating_point_idc;
  229|  33.3k|      operating_point_idc = aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
  ------------------
  |  |   94|  33.3k|#define OP_POINTS_IDC_BITS 12
  ------------------
  230|  33.3k|      if (i == 0) operating_point_idc0 = operating_point_idc;
  ------------------
  |  Branch (230:11): [True: 19.0k, False: 14.3k]
  ------------------
  231|  33.3k|      int seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS);  // level
  ------------------
  |  |  464|  33.3k|#define LEVEL_BITS 5
  ------------------
  232|  33.3k|      if (seq_level_idx > 7) aom_rb_read_bit(rb);               // tier
  ------------------
  |  Branch (232:11): [True: 4.04k, False: 29.2k]
  ------------------
  233|  33.3k|      if (decoder_model_info_present_flag) {
  ------------------
  |  Branch (233:11): [True: 4.10k, False: 29.2k]
  ------------------
  234|  4.10k|        const uint8_t decoder_model_present_for_this_op = aom_rb_read_bit(rb);
  235|  4.10k|        if (decoder_model_present_for_this_op) {
  ------------------
  |  Branch (235:13): [True: 1.06k, False: 3.04k]
  ------------------
  236|  1.06k|          if ((status = parse_op_parameters_info(
  ------------------
  |  Branch (236:15): [True: 0, False: 1.06k]
  ------------------
  237|  1.06k|                   rb, buffer_delay_length_minus_1)) != AOM_CODEC_OK)
  238|      0|            return status;
  239|  1.06k|        }
  240|  4.10k|      }
  241|  33.3k|      if (initial_display_delay_present_flag) {
  ------------------
  |  Branch (241:11): [True: 5.09k, False: 28.2k]
  ------------------
  242|  5.09k|        const uint8_t initial_display_delay_present_for_this_op =
  243|  5.09k|            aom_rb_read_bit(rb);
  244|  5.09k|        if (initial_display_delay_present_for_this_op)
  ------------------
  |  Branch (244:13): [True: 1.27k, False: 3.82k]
  ------------------
  245|  1.27k|          aom_rb_read_literal(rb, 4);  // initial_display_delay_minus_1
  246|  5.09k|      }
  247|  33.3k|    }
  248|  19.0k|  }
  249|       |
  250|  37.1k|  if (aom_get_num_layers_from_operating_point_idc(
  ------------------
  |  Branch (250:7): [True: 0, False: 37.1k]
  ------------------
  251|  37.1k|          operating_point_idc0, &si->number_spatial_layers,
  252|  37.1k|          &si->number_temporal_layers) != AOM_CODEC_OK) {
  253|      0|    return AOM_CODEC_ERROR;
  254|      0|  }
  255|       |
  256|  37.1k|  return AOM_CODEC_OK;
  257|  37.1k|}
av1_dx_iface.c:parse_timing_info:
  158|    465|static aom_codec_err_t parse_timing_info(struct aom_read_bit_buffer *rb) {
  159|    465|  const uint32_t num_units_in_display_tick =
  160|    465|      aom_rb_read_unsigned_literal(rb, 32);
  161|    465|  const uint32_t time_scale = aom_rb_read_unsigned_literal(rb, 32);
  162|    465|  if (num_units_in_display_tick == 0 || time_scale == 0)
  ------------------
  |  Branch (162:7): [True: 2, False: 463]
  |  Branch (162:41): [True: 2, False: 461]
  ------------------
  163|      4|    return AOM_CODEC_UNSUP_BITSTREAM;
  164|    461|  const uint8_t equal_picture_interval = aom_rb_read_bit(rb);
  165|    461|  if (equal_picture_interval) {
  ------------------
  |  Branch (165:7): [True: 261, False: 200]
  ------------------
  166|    261|    const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb);
  167|    261|    if (num_ticks_per_picture_minus_1 == UINT32_MAX) {
  ------------------
  |  Branch (167:9): [True: 0, False: 261]
  ------------------
  168|       |      // num_ticks_per_picture_minus_1 cannot be (1 << 32) - 1.
  169|      0|      return AOM_CODEC_UNSUP_BITSTREAM;
  170|      0|    }
  171|    261|  }
  172|    461|  return AOM_CODEC_OK;
  173|    461|}
av1_dx_iface.c:parse_decoder_model_info:
  176|    271|    struct aom_read_bit_buffer *rb, int *buffer_delay_length_minus_1) {
  177|    271|  *buffer_delay_length_minus_1 = aom_rb_read_literal(rb, 5);
  178|    271|  const uint32_t num_units_in_decoding_tick =
  179|    271|      aom_rb_read_unsigned_literal(rb, 32);
  180|    271|  const uint8_t buffer_removal_time_length_minus_1 = aom_rb_read_literal(rb, 5);
  181|    271|  const uint8_t frame_presentation_time_length_minus_1 =
  182|    271|      aom_rb_read_literal(rb, 5);
  183|    271|  (void)num_units_in_decoding_tick;
  184|    271|  (void)buffer_removal_time_length_minus_1;
  185|    271|  (void)frame_presentation_time_length_minus_1;
  186|    271|  return AOM_CODEC_OK;
  187|    271|}
av1_dx_iface.c:parse_op_parameters_info:
  190|  1.06k|    struct aom_read_bit_buffer *rb, int buffer_delay_length_minus_1) {
  191|  1.06k|  const int n = buffer_delay_length_minus_1 + 1;
  192|  1.06k|  const uint32_t decoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n);
  193|  1.06k|  const uint32_t encoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n);
  194|  1.06k|  const uint8_t low_delay_mode_flag = aom_rb_read_bit(rb);
  195|  1.06k|  (void)decoder_buffer_delay;
  196|  1.06k|  (void)encoder_buffer_delay;
  197|  1.06k|  (void)low_delay_mode_flag;
  198|  1.06k|  return AOM_CODEC_OK;
  199|  1.06k|}
av1_dx_iface.c:decoder_decode:
  677|  22.4k|                                      void *user_priv) {
  678|  22.4k|  aom_codec_err_t res = AOM_CODEC_OK;
  679|       |
  680|       |#if CONFIG_INSPECTION
  681|       |  if (user_priv != 0) {
  682|       |    return decoder_inspect(ctx, data, data_sz, user_priv);
  683|       |  }
  684|       |#endif
  685|       |
  686|  22.4k|  release_pending_output_frames(ctx);
  687|       |
  688|       |  /* Sanity checks */
  689|       |  /* NULL data ptr allowed if data_sz is 0 too */
  690|  22.4k|  if (data == NULL && data_sz == 0) {
  ------------------
  |  Branch (690:7): [True: 0, False: 22.4k]
  |  Branch (690:23): [True: 0, False: 0]
  ------------------
  691|      0|    ctx->flushed = 1;
  692|      0|    return AOM_CODEC_OK;
  693|      0|  }
  694|  22.4k|  if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM;
  ------------------
  |  Branch (694:7): [True: 0, False: 22.4k]
  |  Branch (694:23): [True: 0, False: 22.4k]
  ------------------
  695|       |
  696|       |  // Reset flushed when receiving a valid frame.
  697|  22.4k|  ctx->flushed = 0;
  698|       |
  699|       |  // Initialize the decoder worker on the first frame.
  700|  22.4k|  if (ctx->frame_worker == NULL) {
  ------------------
  |  Branch (700:7): [True: 17.9k, False: 4.49k]
  ------------------
  701|  17.9k|    res = init_decoder(ctx);
  702|  17.9k|    if (res != AOM_CODEC_OK) return res;
  ------------------
  |  Branch (702:9): [True: 0, False: 17.9k]
  ------------------
  703|  17.9k|  }
  704|       |
  705|  22.4k|  const uint8_t *data_start = data;
  706|  22.4k|  const uint8_t *data_end = data + data_sz;
  707|       |
  708|  22.4k|  if (ctx->is_annexb) {
  ------------------
  |  Branch (708:7): [True: 0, False: 22.4k]
  ------------------
  709|       |    // read the size of this temporal unit
  710|      0|    size_t length_of_size;
  711|      0|    uint64_t temporal_unit_size;
  712|      0|    if (aom_uleb_decode(data_start, data_sz, &temporal_unit_size,
  ------------------
  |  Branch (712:9): [True: 0, False: 0]
  ------------------
  713|      0|                        &length_of_size) != 0) {
  714|      0|      return AOM_CODEC_CORRUPT_FRAME;
  715|      0|    }
  716|      0|    data_start += length_of_size;
  717|      0|    if (temporal_unit_size > (size_t)(data_end - data_start))
  ------------------
  |  Branch (717:9): [True: 0, False: 0]
  ------------------
  718|      0|      return AOM_CODEC_CORRUPT_FRAME;
  719|      0|    data_end = data_start + temporal_unit_size;
  720|      0|  }
  721|       |
  722|       |  // Decode in serial mode.
  723|  36.1k|  while (data_start < data_end) {
  ------------------
  |  Branch (723:10): [True: 29.0k, False: 7.15k]
  ------------------
  724|  29.0k|    uint64_t frame_size;
  725|  29.0k|    if (ctx->is_annexb) {
  ------------------
  |  Branch (725:9): [True: 0, False: 29.0k]
  ------------------
  726|       |      // read the size of this frame unit
  727|      0|      size_t length_of_size;
  728|      0|      if (aom_uleb_decode(data_start, (size_t)(data_end - data_start),
  ------------------
  |  Branch (728:11): [True: 0, False: 0]
  ------------------
  729|      0|                          &frame_size, &length_of_size) != 0) {
  730|      0|        return AOM_CODEC_CORRUPT_FRAME;
  731|      0|      }
  732|      0|      data_start += length_of_size;
  733|      0|      if (frame_size > (size_t)(data_end - data_start))
  ------------------
  |  Branch (733:11): [True: 0, False: 0]
  ------------------
  734|      0|        return AOM_CODEC_CORRUPT_FRAME;
  735|  29.0k|    } else {
  736|  29.0k|      frame_size = (uint64_t)(data_end - data_start);
  737|  29.0k|    }
  738|       |
  739|  29.0k|    res = decode_one(ctx, &data_start, (size_t)frame_size, user_priv);
  740|  29.0k|    if (res != AOM_CODEC_OK) return res;
  ------------------
  |  Branch (740:9): [True: 15.2k, False: 13.7k]
  ------------------
  741|       |
  742|       |    // Allow extra zero bytes after the frame end
  743|  14.1k|    while (data_start < data_end) {
  ------------------
  |  Branch (743:12): [True: 6.97k, False: 7.15k]
  ------------------
  744|  6.97k|      const uint8_t marker = data_start[0];
  745|  6.97k|      if (marker) break;
  ------------------
  |  Branch (745:11): [True: 6.58k, False: 392]
  ------------------
  746|    392|      ++data_start;
  747|    392|    }
  748|  13.7k|  }
  749|       |
  750|  7.15k|  return res;
  751|  22.4k|}
av1_dx_iface.c:release_pending_output_frames:
  565|  22.4k|static void release_pending_output_frames(aom_codec_alg_priv_t *ctx) {
  566|       |  // Release any pending output frames from the previous decoder_decode or
  567|       |  // decoder_inspect call. We need to do this even if the decoder is being
  568|       |  // flushed or the input arguments are invalid.
  569|  22.4k|  if (ctx->frame_worker) {
  ------------------
  |  Branch (569:7): [True: 4.49k, False: 17.9k]
  ------------------
  570|  4.49k|    BufferPool *const pool = ctx->buffer_pool;
  571|  4.49k|    lock_buffer_pool(pool);
  572|  4.49k|    AVxWorker *const worker = ctx->frame_worker;
  573|  4.49k|    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
  574|  4.49k|    struct AV1Decoder *pbi = frame_worker_data->pbi;
  575|  8.98k|    for (size_t j = 0; j < pbi->num_output_frames; j++) {
  ------------------
  |  Branch (575:24): [True: 4.48k, False: 4.49k]
  ------------------
  576|  4.48k|      decrease_ref_count(pbi->output_frames[j], pool);
  577|  4.48k|    }
  578|  4.49k|    pbi->num_output_frames = 0;
  579|  4.49k|    unlock_buffer_pool(pool);
  580|  4.49k|    for (size_t j = 0; j < ctx->num_grain_image_frame_buffers; j++) {
  ------------------
  |  Branch (580:24): [True: 0, False: 4.49k]
  ------------------
  581|      0|      pool->release_fb_cb(pool->cb_priv, &ctx->grain_image_frame_buffers[j]);
  582|      0|      ctx->grain_image_frame_buffers[j].data = NULL;
  583|      0|      ctx->grain_image_frame_buffers[j].size = 0;
  584|       |      ctx->grain_image_frame_buffers[j].priv = NULL;
  585|      0|    }
  586|  4.49k|    ctx->num_grain_image_frame_buffers = 0;
  587|  4.49k|  }
  588|  22.4k|}
av1_dx_iface.c:init_decoder:
  426|  17.9k|static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
  427|  17.9k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  428|       |
  429|  17.9k|  ctx->last_show_frame = NULL;
  430|  17.9k|  ctx->need_resync = 1;
  431|  17.9k|  ctx->flushed = 0;
  432|       |
  433|  17.9k|  ctx->buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
  434|  17.9k|  if (ctx->buffer_pool == NULL) return AOM_CODEC_MEM_ERROR;
  ------------------
  |  Branch (434:7): [True: 0, False: 17.9k]
  ------------------
  435|  17.9k|  ctx->buffer_pool->num_frame_bufs = FRAME_BUFFERS;
  ------------------
  |  |  561|  17.9k|#define FRAME_BUFFERS (REF_FRAMES + 1 + INTER_REFS_PER_FRAME)
  ------------------
  436|  17.9k|  ctx->buffer_pool->frame_bufs = (RefCntBuffer *)aom_calloc(
  437|  17.9k|      ctx->buffer_pool->num_frame_bufs, sizeof(*ctx->buffer_pool->frame_bufs));
  438|  17.9k|  if (ctx->buffer_pool->frame_bufs == NULL) {
  ------------------
  |  Branch (438:7): [True: 0, False: 17.9k]
  ------------------
  439|      0|    ctx->buffer_pool->num_frame_bufs = 0;
  440|      0|    aom_free(ctx->buffer_pool);
  441|      0|    ctx->buffer_pool = NULL;
  442|      0|    return AOM_CODEC_MEM_ERROR;
  443|      0|  }
  444|       |
  445|  17.9k|#if CONFIG_MULTITHREAD
  446|  17.9k|  if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
  ------------------
  |  Branch (446:7): [True: 0, False: 17.9k]
  ------------------
  447|      0|    aom_free(ctx->buffer_pool->frame_bufs);
  448|      0|    ctx->buffer_pool->frame_bufs = NULL;
  449|      0|    ctx->buffer_pool->num_frame_bufs = 0;
  450|      0|    aom_free(ctx->buffer_pool);
  451|      0|    ctx->buffer_pool = NULL;
  452|      0|    set_error_detail(ctx, "Failed to allocate buffer pool mutex");
  453|      0|    return AOM_CODEC_MEM_ERROR;
  454|      0|  }
  455|  17.9k|#endif
  456|       |
  457|  17.9k|  ctx->frame_worker = (AVxWorker *)aom_malloc(sizeof(*ctx->frame_worker));
  458|  17.9k|  if (ctx->frame_worker == NULL) {
  ------------------
  |  Branch (458:7): [True: 0, False: 17.9k]
  ------------------
  459|      0|    set_error_detail(ctx, "Failed to allocate frame_worker");
  460|      0|    return AOM_CODEC_MEM_ERROR;
  461|      0|  }
  462|       |
  463|  17.9k|  AVxWorker *const worker = ctx->frame_worker;
  464|  17.9k|  winterface->init(worker);
  465|  17.9k|  worker->thread_name = "aom frameworker";
  466|  17.9k|  worker->data1 = aom_memalign(32, sizeof(FrameWorkerData));
  467|  17.9k|  if (worker->data1 == NULL) {
  ------------------
  |  Branch (467:7): [True: 0, False: 17.9k]
  ------------------
  468|      0|    winterface->end(worker);
  469|      0|    aom_free(worker);
  470|      0|    ctx->frame_worker = NULL;
  471|      0|    set_error_detail(ctx, "Failed to allocate frame_worker_data");
  472|      0|    return AOM_CODEC_MEM_ERROR;
  473|      0|  }
  474|  17.9k|  FrameWorkerData *frame_worker_data = (FrameWorkerData *)worker->data1;
  475|  17.9k|  frame_worker_data->pbi = av1_decoder_create(ctx->buffer_pool);
  476|  17.9k|  if (frame_worker_data->pbi == NULL) {
  ------------------
  |  Branch (476:7): [True: 0, False: 17.9k]
  ------------------
  477|      0|    winterface->end(worker);
  478|      0|    aom_free(frame_worker_data);
  479|      0|    aom_free(worker);
  480|      0|    ctx->frame_worker = NULL;
  481|      0|    set_error_detail(ctx, "Failed to allocate frame_worker_data->pbi");
  482|      0|    return AOM_CODEC_MEM_ERROR;
  483|      0|  }
  484|  17.9k|  frame_worker_data->frame_context_ready = 0;
  485|  17.9k|  frame_worker_data->received_frame = 0;
  486|  17.9k|  frame_worker_data->pbi->allow_lowbitdepth = ctx->cfg.allow_lowbitdepth;
  487|       |
  488|       |  // If decoding in serial mode, FrameWorker thread could create tile worker
  489|       |  // thread or loopfilter thread.
  490|  17.9k|  frame_worker_data->pbi->max_threads = ctx->cfg.threads;
  491|  17.9k|  frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
  492|  17.9k|  frame_worker_data->pbi->common.tiles.large_scale = ctx->tile_mode;
  493|  17.9k|  frame_worker_data->pbi->is_annexb = ctx->is_annexb;
  494|  17.9k|  frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
  495|  17.9k|  frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
  496|  17.9k|  frame_worker_data->pbi->operating_point = ctx->operating_point;
  497|  17.9k|  frame_worker_data->pbi->output_all_layers = ctx->output_all_layers;
  498|  17.9k|  frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
  499|  17.9k|  frame_worker_data->pbi->row_mt = ctx->row_mt;
  500|  17.9k|  frame_worker_data->pbi->is_fwd_kf_present = 0;
  501|  17.9k|  frame_worker_data->pbi->is_arf_frame_present = 0;
  502|  17.9k|  worker->hook = frame_worker_hook;
  503|       |
  504|  17.9k|  init_buffer_callbacks(ctx);
  505|       |
  506|  17.9k|  return AOM_CODEC_OK;
  507|  17.9k|}
av1_dx_iface.c:set_error_detail:
  370|  15.2k|                             const char *const error) {
  371|  15.2k|  ctx->base.err_detail = error;
  372|  15.2k|}
av1_dx_iface.c:frame_worker_hook:
  410|  29.0k|static int frame_worker_hook(void *arg1, void *arg2) {
  411|  29.0k|  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1;
  412|  29.0k|  const uint8_t *data = frame_worker_data->data;
  413|  29.0k|  (void)arg2;
  414|       |
  415|  29.0k|  int result = av1_receive_compressed_data(frame_worker_data->pbi,
  416|  29.0k|                                           frame_worker_data->data_size, &data);
  417|  29.0k|  frame_worker_data->data_end = data;
  418|       |
  419|  29.0k|  if (result != 0) {
  ------------------
  |  Branch (419:7): [True: 15.2k, False: 13.7k]
  ------------------
  420|       |    // Check decode result in serial decode.
  421|  15.2k|    frame_worker_data->pbi->need_resync = 1;
  422|  15.2k|  }
  423|  29.0k|  return !result;
  424|  29.0k|}
av1_dx_iface.c:init_buffer_callbacks:
  382|  17.9k|static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) {
  383|  17.9k|  AVxWorker *const worker = ctx->frame_worker;
  384|  17.9k|  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
  385|  17.9k|  AV1Decoder *const pbi = frame_worker_data->pbi;
  386|  17.9k|  AV1_COMMON *const cm = &pbi->common;
  387|  17.9k|  BufferPool *const pool = cm->buffer_pool;
  388|       |
  389|  17.9k|  cm->cur_frame = NULL;
  390|  17.9k|  cm->features.byte_alignment = ctx->byte_alignment;
  391|  17.9k|  pbi->skip_loop_filter = ctx->skip_loop_filter;
  392|  17.9k|  pbi->skip_film_grain = ctx->skip_film_grain;
  393|       |
  394|  17.9k|  if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
  ------------------
  |  Branch (394:7): [True: 0, False: 17.9k]
  |  Branch (394:37): [True: 0, False: 0]
  ------------------
  395|      0|    pool->get_fb_cb = ctx->get_ext_fb_cb;
  396|      0|    pool->release_fb_cb = ctx->release_ext_fb_cb;
  397|      0|    pool->cb_priv = ctx->ext_priv;
  398|  17.9k|  } else {
  399|  17.9k|    pool->get_fb_cb = av1_get_frame_buffer;
  400|  17.9k|    pool->release_fb_cb = av1_release_frame_buffer;
  401|       |
  402|  17.9k|    if (av1_alloc_internal_frame_buffers(&pool->int_frame_buffers))
  ------------------
  |  Branch (402:9): [True: 0, False: 17.9k]
  ------------------
  403|      0|      aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
  404|      0|                         "Failed to initialize internal frame buffers");
  405|       |
  406|  17.9k|    pool->cb_priv = &pool->int_frame_buffers;
  407|  17.9k|  }
  408|  17.9k|}
av1_dx_iface.c:decode_one:
  519|  29.0k|                                  void *user_priv) {
  520|  29.0k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  521|       |
  522|       |  // Determine the stream parameters. Note that we rely on peek_si to
  523|       |  // validate that we have a buffer that does not wrap around the top
  524|       |  // of the heap.
  525|  29.0k|  if (!ctx->si.h) {
  ------------------
  |  Branch (525:7): [True: 17.9k, False: 11.0k]
  ------------------
  526|  17.9k|    int is_intra_only = 0;
  527|  17.9k|    ctx->si.is_annexb = ctx->is_annexb;
  528|  17.9k|    const aom_codec_err_t res =
  529|  17.9k|        decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only);
  530|  17.9k|    if (res != AOM_CODEC_OK) return res;
  ------------------
  |  Branch (530:9): [True: 0, False: 17.9k]
  ------------------
  531|       |
  532|  17.9k|    if (!ctx->si.is_kf && !is_intra_only) return AOM_CODEC_ERROR;
  ------------------
  |  Branch (532:9): [True: 252, False: 17.6k]
  |  Branch (532:27): [True: 19, False: 233]
  ------------------
  533|  17.9k|  }
  534|       |
  535|  29.0k|  AVxWorker *const worker = ctx->frame_worker;
  536|  29.0k|  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
  537|  29.0k|  frame_worker_data->data = *data;
  538|  29.0k|  frame_worker_data->data_size = data_sz;
  539|  29.0k|  frame_worker_data->user_priv = user_priv;
  540|  29.0k|  frame_worker_data->received_frame = 1;
  541|       |
  542|  29.0k|  frame_worker_data->pbi->common.tiles.large_scale = ctx->tile_mode;
  543|  29.0k|  frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
  544|  29.0k|  frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
  545|  29.0k|  frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
  546|  29.0k|  frame_worker_data->pbi->row_mt = ctx->row_mt;
  547|  29.0k|  frame_worker_data->pbi->ext_refs = ctx->ext_refs;
  548|       |
  549|  29.0k|  frame_worker_data->pbi->is_annexb = ctx->is_annexb;
  550|       |
  551|  29.0k|  worker->had_error = 0;
  552|  29.0k|  winterface->execute(worker);
  553|       |
  554|       |  // Update data pointer after decode.
  555|  29.0k|  *data = frame_worker_data->data_end;
  556|       |
  557|  29.0k|  if (worker->had_error)
  ------------------
  |  Branch (557:7): [True: 15.2k, False: 13.7k]
  ------------------
  558|  15.2k|    return update_error_state(ctx, &frame_worker_data->pbi->error);
  559|       |
  560|  13.7k|  check_resync(ctx, frame_worker_data->pbi);
  561|       |
  562|  13.7k|  return AOM_CODEC_OK;
  563|  29.0k|}
av1_dx_iface.c:update_error_state:
  375|  15.2k|    aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) {
  376|  15.2k|  if (error->error_code)
  ------------------
  |  Branch (376:7): [True: 15.2k, False: 0]
  ------------------
  377|  15.2k|    set_error_detail(ctx, error->has_detail ? error->detail : NULL);
  ------------------
  |  Branch (377:27): [True: 13.9k, False: 1.28k]
  ------------------
  378|       |
  379|  15.2k|  return error->error_code;
  380|  15.2k|}
av1_dx_iface.c:check_resync:
  510|  20.8k|                                const AV1Decoder *const pbi) {
  511|       |  // Clear resync flag if worker got a key frame or intra only frame.
  512|  20.8k|  if (ctx->need_resync == 1 && pbi->need_resync == 0 &&
  ------------------
  |  Branch (512:7): [True: 7.65k, False: 13.2k]
  |  Branch (512:32): [True: 7.65k, False: 0]
  ------------------
  513|  7.65k|      frame_is_intra_only(&pbi->common))
  ------------------
  |  Branch (513:7): [True: 7.65k, False: 0]
  ------------------
  514|  7.65k|    ctx->need_resync = 0;
  515|  20.8k|}
av1_dx_iface.c:decoder_get_frame:
  809|  29.6k|                                      aom_codec_iter_t *iter) {
  810|  29.6k|  aom_image_t *img = NULL;
  811|       |
  812|  29.6k|  if (!iter) {
  ------------------
  |  Branch (812:7): [True: 0, False: 29.6k]
  ------------------
  813|      0|    return NULL;
  814|      0|  }
  815|       |
  816|       |  // To avoid having to allocate any extra storage, treat 'iter' as
  817|       |  // simply a pointer to an integer index
  818|  29.6k|  uintptr_t *index = (uintptr_t *)iter;
  819|       |
  820|  29.6k|  if (ctx->frame_worker == NULL) {
  ------------------
  |  Branch (820:7): [True: 17.9k, False: 11.6k]
  ------------------
  821|  17.9k|    return NULL;
  822|  17.9k|  }
  823|  11.6k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  824|  11.6k|  AVxWorker *const worker = ctx->frame_worker;
  825|  11.6k|  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
  826|  11.6k|  AV1Decoder *const pbi = frame_worker_data->pbi;
  827|  11.6k|  pbi->error.error_code = AOM_CODEC_OK;
  828|  11.6k|  pbi->error.has_detail = 0;
  829|  11.6k|  AV1_COMMON *const cm = &pbi->common;
  830|  11.6k|  CommonTileParams *const tiles = &cm->tiles;
  831|       |  // Wait for the frame from worker thread.
  832|  11.6k|  if (!winterface->sync(worker)) {
  ------------------
  |  Branch (832:7): [True: 0, False: 11.6k]
  ------------------
  833|       |    // Decoding failed. Release the worker thread.
  834|      0|    frame_worker_data->received_frame = 0;
  835|      0|    ctx->need_resync = 1;
  836|       |    // TODO(aomedia:3519): Set an error code. Check if a different error code
  837|       |    // should be used if ctx->flushed != 1.
  838|      0|    return NULL;
  839|      0|  }
  840|       |  // Check if worker has received any frames.
  841|  11.6k|  if (frame_worker_data->received_frame == 1) {
  ------------------
  |  Branch (841:7): [True: 7.15k, False: 4.50k]
  ------------------
  842|  7.15k|    frame_worker_data->received_frame = 0;
  843|  7.15k|    check_resync(ctx, frame_worker_data->pbi);
  844|  7.15k|  }
  845|  11.6k|  YV12_BUFFER_CONFIG *sd;
  846|  11.6k|  aom_film_grain_t *grain_params;
  847|  11.6k|  if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd, &grain_params) !=
  ------------------
  |  Branch (847:7): [True: 4.61k, False: 7.04k]
  ------------------
  848|  11.6k|      0) {
  849|  4.61k|    return NULL;
  850|  4.61k|  }
  851|  7.04k|  RefCntBuffer *const output_frame_buf = pbi->output_frames[*index];
  852|  7.04k|  ctx->last_show_frame = output_frame_buf;
  853|  7.04k|  if (ctx->need_resync) return NULL;
  ------------------
  |  Branch (853:7): [True: 0, False: 7.04k]
  ------------------
  854|  7.04k|  aom_img_remove_metadata(&ctx->img);
  855|  7.04k|  yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv);
  856|  7.04k|  move_decoder_metadata_to_img(pbi, &ctx->img);
  857|       |
  858|  7.04k|  if (!pbi->ext_tile_debug && tiles->large_scale) {
  ------------------
  |  Branch (858:7): [True: 7.04k, False: 0]
  |  Branch (858:31): [True: 0, False: 7.04k]
  ------------------
  859|      0|    *index += 1;  // Advance the iterator to point to the next image
  860|      0|    aom_img_remove_metadata(&ctx->img);
  861|      0|    yuvconfig2image(&ctx->img, &pbi->tile_list_outbuf, NULL);
  862|      0|    move_decoder_metadata_to_img(pbi, &ctx->img);
  863|      0|    img = &ctx->img;
  864|      0|    return img;
  865|      0|  }
  866|       |
  867|  7.04k|  const int num_planes = av1_num_planes(cm);
  868|  7.04k|  if (pbi->ext_tile_debug && tiles->single_tile_decoding &&
  ------------------
  |  Branch (868:7): [True: 0, False: 7.04k]
  |  Branch (868:30): [True: 0, False: 0]
  ------------------
  869|      0|      pbi->dec_tile_row >= 0) {
  ------------------
  |  Branch (869:7): [True: 0, False: 0]
  ------------------
  870|      0|    int tile_width, tile_height;
  871|      0|    if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) {
  ------------------
  |  Branch (871:9): [True: 0, False: 0]
  ------------------
  872|      0|      return NULL;
  873|      0|    }
  874|      0|    const int tile_row = AOMMIN(pbi->dec_tile_row, tiles->rows - 1);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  875|      0|    const int mi_row = tile_row * tile_height;
  876|      0|    const int ssy = ctx->img.y_chroma_shift;
  877|      0|    int plane;
  878|      0|    ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0];
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  879|      0|    if (num_planes > 1) {
  ------------------
  |  Branch (879:9): [True: 0, False: 0]
  ------------------
  880|      0|      for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
  ------------------
  |  |   36|      0|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (880:23): [True: 0, False: 0]
  ------------------
  881|      0|        ctx->img.planes[plane] +=
  882|      0|            mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane];
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  883|      0|      }
  884|      0|    }
  885|      0|    ctx->img.d_h =
  886|      0|        AOMMIN(tile_height, cm->mi_params.mi_rows - mi_row) * MI_SIZE;
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                      AOMMIN(tile_height, cm->mi_params.mi_rows - mi_row) * MI_SIZE;
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  887|      0|  }
  888|       |
  889|  7.04k|  if (pbi->ext_tile_debug && tiles->single_tile_decoding &&
  ------------------
  |  Branch (889:7): [True: 0, False: 7.04k]
  |  Branch (889:30): [True: 0, False: 0]
  ------------------
  890|      0|      pbi->dec_tile_col >= 0) {
  ------------------
  |  Branch (890:7): [True: 0, False: 0]
  ------------------
  891|      0|    int tile_width, tile_height;
  892|      0|    if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) {
  ------------------
  |  Branch (892:9): [True: 0, False: 0]
  ------------------
  893|      0|      return NULL;
  894|      0|    }
  895|      0|    const int tile_col = AOMMIN(pbi->dec_tile_col, tiles->cols - 1);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  896|      0|    const int mi_col = tile_col * tile_width;
  897|      0|    const int ssx = ctx->img.x_chroma_shift;
  898|      0|    const int is_hbd = (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
  ------------------
  |  |   38|      0|#define AOM_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */
  ------------------
  |  Branch (898:24): [True: 0, False: 0]
  ------------------
  899|      0|    int plane;
  900|      0|    ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd);
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  901|      0|    if (num_planes > 1) {
  ------------------
  |  Branch (901:9): [True: 0, False: 0]
  ------------------
  902|      0|      for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
  ------------------
  |  |   36|      0|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (902:23): [True: 0, False: 0]
  ------------------
  903|      0|        ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx) * (1 + is_hbd);
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  904|      0|      }
  905|      0|    }
  906|      0|    ctx->img.d_w = AOMMIN(tile_width, cm->mi_params.mi_cols - mi_col) * MI_SIZE;
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                  ctx->img.d_w = AOMMIN(tile_width, cm->mi_params.mi_cols - mi_col) * MI_SIZE;
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  907|      0|  }
  908|       |
  909|  7.04k|  ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv;
  910|  7.04k|  img = &ctx->img;
  911|  7.04k|  img->temporal_id = output_frame_buf->temporal_id;
  912|  7.04k|  img->spatial_id = output_frame_buf->spatial_id;
  913|  7.04k|  if (pbi->skip_film_grain) grain_params->apply_grain = 0;
  ------------------
  |  Branch (913:7): [True: 0, False: 7.04k]
  ------------------
  914|  7.04k|  aom_image_t *res =
  915|  7.04k|      add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params);
  916|  7.04k|  if (!res) {
  ------------------
  |  Branch (916:7): [True: 0, False: 7.04k]
  ------------------
  917|      0|    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  918|      0|    pbi->error.has_detail = 1;
  919|      0|    snprintf(pbi->error.detail, sizeof(pbi->error.detail),
  920|      0|             "Grain synthesis failed\n");
  921|      0|    return res;
  922|      0|  }
  923|  7.04k|  *index += 1;  // Advance the iterator to point to the next image
  924|  7.04k|  return res;
  925|  7.04k|}
av1_dx_iface.c:move_decoder_metadata_to_img:
  800|  7.04k|static void move_decoder_metadata_to_img(AV1Decoder *pbi, aom_image_t *img) {
  801|  7.04k|  if (pbi->metadata && img) {
  ------------------
  |  Branch (801:7): [True: 0, False: 7.04k]
  |  Branch (801:24): [True: 0, False: 0]
  ------------------
  802|      0|    assert(!img->metadata);
  803|      0|    img->metadata = pbi->metadata;
  804|       |    pbi->metadata = NULL;
  805|      0|  }
  806|  7.04k|}
av1_dx_iface.c:add_grain_if_needed:
  771|  7.04k|                                        aom_film_grain_t *grain_params) {
  772|  7.04k|  if (!grain_params->apply_grain) return img;
  ------------------
  |  Branch (772:7): [True: 7.04k, False: 0]
  ------------------
  773|       |
  774|      0|  const int w_even = ALIGN_POWER_OF_TWO_UNSIGNED(img->d_w, 1);
  ------------------
  |  |   71|      0|  (((value) + ((1u << (n)) - 1)) & ~((1u << (n)) - 1))
  ------------------
  775|      0|  const int h_even = ALIGN_POWER_OF_TWO_UNSIGNED(img->d_h, 1);
  ------------------
  |  |   71|      0|  (((value) + ((1u << (n)) - 1)) & ~((1u << (n)) - 1))
  ------------------
  776|       |
  777|      0|  BufferPool *const pool = ctx->buffer_pool;
  778|      0|  aom_codec_frame_buffer_t *fb =
  779|      0|      &ctx->grain_image_frame_buffers[ctx->num_grain_image_frame_buffers];
  780|      0|  AllocCbParam param;
  781|      0|  param.pool = pool;
  782|      0|  param.fb = fb;
  783|      0|  if (!aom_img_alloc_with_cb(grain_img, img->fmt, w_even, h_even, 16,
  ------------------
  |  Branch (783:7): [True: 0, False: 0]
  ------------------
  784|      0|                             AllocWithGetFrameBufferCb, &param)) {
  785|      0|    return NULL;
  786|      0|  }
  787|       |
  788|      0|  grain_img->user_priv = img->user_priv;
  789|      0|  grain_img->fb_priv = fb->priv;
  790|      0|  if (av1_add_film_grain(grain_params, img, grain_img)) {
  ------------------
  |  Branch (790:7): [True: 0, False: 0]
  ------------------
  791|      0|    pool->release_fb_cb(pool->cb_priv, fb);
  792|      0|    return NULL;
  793|      0|  }
  794|       |
  795|      0|  ctx->num_grain_image_frame_buffers++;
  796|      0|  return grain_img;
  797|      0|}

av1_dx_iface.c:yuvconfig2image:
   23|  7.04k|                                   void *user_priv) {
   24|       |  /* aom_img_wrap() doesn't allow specifying independent strides for
   25|       |   * the Y, U, and V planes, nor other alignment adjustments that
   26|       |   * might be representable by a YV12_BUFFER_CONFIG, so we just
   27|       |   * initialize all the fields.
   28|       |   */
   29|  7.04k|  int bps;
   30|  7.04k|  if (!yv12->subsampling_y) {
  ------------------
  |  Branch (30:7): [True: 2.52k, False: 4.52k]
  ------------------
   31|  2.52k|    if (!yv12->subsampling_x) {
  ------------------
  |  Branch (31:9): [True: 1.55k, False: 968]
  ------------------
   32|  1.55k|      img->fmt = AOM_IMG_FMT_I444;
   33|  1.55k|      bps = 24;
   34|  1.55k|    } else {
   35|    968|      img->fmt = AOM_IMG_FMT_I422;
   36|    968|      bps = 16;
   37|    968|    }
   38|  4.52k|  } else {
   39|  4.52k|    img->fmt = AOM_IMG_FMT_I420;
   40|  4.52k|    bps = 12;
   41|  4.52k|  }
   42|  7.04k|  img->cp = yv12->color_primaries;
   43|  7.04k|  img->tc = yv12->transfer_characteristics;
   44|  7.04k|  img->mc = yv12->matrix_coefficients;
   45|  7.04k|  img->monochrome = yv12->monochrome;
   46|  7.04k|  img->csp = yv12->chroma_sample_position;
   47|  7.04k|  img->range = yv12->color_range;
   48|  7.04k|  img->bit_depth = 8;
   49|  7.04k|  img->w = yv12->y_width;
   50|  7.04k|  img->h = yv12->y_height;
   51|  7.04k|  img->d_w = yv12->y_crop_width;
   52|  7.04k|  img->d_h = yv12->y_crop_height;
   53|  7.04k|  img->r_w = yv12->render_width;
   54|  7.04k|  img->r_h = yv12->render_height;
   55|  7.04k|  img->x_chroma_shift = yv12->subsampling_x;
   56|  7.04k|  img->y_chroma_shift = yv12->subsampling_y;
   57|  7.04k|  img->planes[AOM_PLANE_Y] = yv12->y_buffer;
  ------------------
  |  |  210|  7.04k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
   58|  7.04k|  img->planes[AOM_PLANE_U] = yv12->u_buffer;
  ------------------
  |  |  211|  7.04k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
   59|  7.04k|  img->planes[AOM_PLANE_V] = yv12->v_buffer;
  ------------------
  |  |  212|  7.04k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
   60|  7.04k|  img->stride[AOM_PLANE_Y] = yv12->y_stride;
  ------------------
  |  |  210|  7.04k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
   61|  7.04k|  img->stride[AOM_PLANE_U] = yv12->uv_stride;
  ------------------
  |  |  211|  7.04k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
   62|  7.04k|  img->stride[AOM_PLANE_V] = yv12->uv_stride;
  ------------------
  |  |  212|  7.04k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
   63|  7.04k|  if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
  ------------------
  |  |  142|  7.04k|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (63:7): [True: 2.09k, False: 4.94k]
  ------------------
   64|  2.09k|    bps *= 2;
   65|       |    // aom_image_t uses byte strides and a pointer to the first byte
   66|       |    // of the image.
   67|  2.09k|    img->fmt = (aom_img_fmt_t)(img->fmt | AOM_IMG_FMT_HIGHBITDEPTH);
  ------------------
  |  |   38|  2.09k|#define AOM_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */
  ------------------
   68|  2.09k|    img->bit_depth = yv12->bit_depth;
   69|  2.09k|    img->planes[AOM_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer);
  ------------------
  |  |  210|  2.09k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
                  img->planes[AOM_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer);
  ------------------
  |  |   75|  2.09k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   70|  2.09k|    img->planes[AOM_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer);
  ------------------
  |  |  211|  2.09k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
                  img->planes[AOM_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer);
  ------------------
  |  |   75|  2.09k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   71|  2.09k|    img->planes[AOM_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer);
  ------------------
  |  |  212|  2.09k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
                  img->planes[AOM_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer);
  ------------------
  |  |   75|  2.09k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   72|  2.09k|    img->stride[AOM_PLANE_Y] = 2 * yv12->y_stride;
  ------------------
  |  |  210|  2.09k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
   73|  2.09k|    img->stride[AOM_PLANE_U] = 2 * yv12->uv_stride;
  ------------------
  |  |  211|  2.09k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
   74|  2.09k|    img->stride[AOM_PLANE_V] = 2 * yv12->uv_stride;
  ------------------
  |  |  212|  2.09k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
   75|  2.09k|  }
   76|  7.04k|  img->bps = bps;
   77|  7.04k|  img->user_priv = user_priv;
   78|  7.04k|  img->img_data = yv12->buffer_alloc;
   79|  7.04k|  img->img_data_owner = 0;
   80|  7.04k|  img->self_allocd = 0;
   81|  7.04k|  img->sz = yv12->frame_size;
   82|  7.04k|  assert(!yv12->metadata);
   83|       |  img->metadata = NULL;
   84|  7.04k|}

av1_free_ref_frame_buffers:
   40|  17.9k|void av1_free_ref_frame_buffers(BufferPool *pool) {
   41|  17.9k|  int i;
   42|       |
   43|   305k|  for (i = 0; i < pool->num_frame_bufs; ++i) {
  ------------------
  |  Branch (43:15): [True: 287k, False: 17.9k]
  ------------------
   44|   287k|    if (pool->frame_bufs[i].ref_count > 0 &&
  ------------------
  |  Branch (44:9): [True: 12.3k, False: 274k]
  ------------------
   45|  12.3k|        pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
  ------------------
  |  Branch (45:9): [True: 12.3k, False: 0]
  ------------------
   46|  12.3k|      pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
   47|  12.3k|      pool->frame_bufs[i].raw_frame_buffer.data = NULL;
   48|  12.3k|      pool->frame_bufs[i].raw_frame_buffer.size = 0;
   49|  12.3k|      pool->frame_bufs[i].raw_frame_buffer.priv = NULL;
   50|  12.3k|      pool->frame_bufs[i].ref_count = 0;
   51|  12.3k|    }
   52|   287k|    aom_free(pool->frame_bufs[i].mvs);
   53|   287k|    pool->frame_bufs[i].mvs = NULL;
   54|   287k|    aom_free(pool->frame_bufs[i].seg_map);
   55|   287k|    pool->frame_bufs[i].seg_map = NULL;
   56|   287k|    aom_free_frame_buffer(&pool->frame_bufs[i].buf);
   57|   287k|  }
   58|  17.9k|  aom_free(pool->frame_bufs);
   59|       |  pool->frame_bufs = NULL;
   60|  17.9k|  pool->num_frame_bufs = 0;
   61|  17.9k|}
av1_free_cdef_buffers:
  124|  17.9k|                           AV1CdefSync *cdef_sync) {
  125|  17.9k|  CdefInfo *cdef_info = &cm->cdef_info;
  126|  17.9k|  const int num_mi_rows = cdef_info->allocated_mi_rows;
  127|       |
  128|  71.7k|  for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
  ------------------
  |  |   36|  71.7k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (128:23): [True: 53.8k, False: 17.9k]
  ------------------
  129|  53.8k|    aom_free(cdef_info->linebuf[plane]);
  130|  53.8k|    cdef_info->linebuf[plane] = NULL;
  131|  53.8k|  }
  132|       |  // De-allocation of column buffer & source buffer (worker_0).
  133|  17.9k|  free_cdef_bufs(cdef_info->colbuf, &cdef_info->srcbuf);
  134|       |
  135|  17.9k|  free_cdef_row_sync(&cdef_sync->cdef_row_mt, num_mi_rows);
  136|       |
  137|  17.9k|  if (cdef_info->allocated_num_workers < 2) return;
  ------------------
  |  Branch (137:7): [True: 13.3k, False: 4.55k]
  ------------------
  138|  4.55k|  if (*cdef_worker != NULL) {
  ------------------
  |  Branch (138:7): [True: 3.84k, False: 705]
  ------------------
  139|   119k|    for (int idx = cdef_info->allocated_num_workers - 1; idx >= 1; idx--) {
  ------------------
  |  Branch (139:58): [True: 115k, False: 3.84k]
  ------------------
  140|       |      // De-allocation of column buffer & source buffer for remaining workers.
  141|   115k|      free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf);
  142|   115k|    }
  143|  3.84k|    aom_free(*cdef_worker);
  144|       |    *cdef_worker = NULL;
  145|  3.84k|  }
  146|  4.55k|}
av1_alloc_cdef_buffers:
  195|  12.5k|                            int init_worker) {
  196|  12.5k|  const int num_planes = av1_num_planes(cm);
  197|  12.5k|  size_t new_linebuf_size[MAX_MB_PLANE] = { 0 };
  198|  12.5k|  size_t new_colbuf_size[MAX_MB_PLANE] = { 0 };
  199|  12.5k|  size_t new_srcbuf_size = 0;
  200|  12.5k|  CdefInfo *const cdef_info = &cm->cdef_info;
  201|       |  // Check for configuration change
  202|  12.5k|  const int num_mi_rows =
  203|  12.5k|      (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  12.5k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  12.5k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                    (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  12.5k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  12.5k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  204|  12.5k|  const int is_num_workers_changed =
  205|  12.5k|      cdef_info->allocated_num_workers != num_workers;
  206|  12.5k|  const int is_cdef_enabled =
  207|  12.5k|      cm->seq_params->enable_cdef && !cm->tiles.single_tile_decoding;
  ------------------
  |  Branch (207:7): [True: 11.6k, False: 896]
  |  Branch (207:38): [True: 11.6k, False: 0]
  ------------------
  208|       |
  209|       |  // num-bufs=3 represents ping-pong buffers for top linebuf,
  210|       |  // followed by bottom linebuf.
  211|       |  // ping-pong is to avoid top linebuf over-write by consecutive row.
  212|  12.5k|  int num_bufs = 3;
  213|  12.5k|  if (num_workers > 1)
  ------------------
  |  Branch (213:7): [True: 6.98k, False: 5.52k]
  ------------------
  214|  6.98k|    num_bufs = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  6.98k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  6.98k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                  num_bufs = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  6.98k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  6.98k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  215|       |
  216|  12.5k|  if (is_cdef_enabled) {
  ------------------
  |  Branch (216:7): [True: 11.6k, False: 896]
  ------------------
  217|       |    // Calculate src buffer size
  218|  11.6k|    new_srcbuf_size = sizeof(*cdef_info->srcbuf) * CDEF_INBUF_SIZE;
  ------------------
  |  |   32|  11.6k|  (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER))
  |  |  ------------------
  |  |  |  |   28|  11.6k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  |  |  ------------------
  |  |  |  |  |  |   69|  11.6k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER))
  |  |  ------------------
  |  |  |  |   31|  11.6k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |                 (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER))
  |  |  ------------------
  |  |  |  |   23|  11.6k|#define CDEF_VBORDER (2)
  |  |  ------------------
  ------------------
  219|  42.1k|    for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (219:25): [True: 30.4k, False: 11.6k]
  ------------------
  220|  30.4k|      const int shift =
  221|  30.4k|          plane == AOM_PLANE_Y ? 0 : cm->seq_params->subsampling_x;
  ------------------
  |  |  210|  30.4k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (221:11): [True: 11.6k, False: 18.8k]
  ------------------
  222|       |      // Calculate top and bottom line buffer size
  223|  30.4k|      const int luma_stride =
  224|  30.4k|          ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4);
  ------------------
  |  |   69|  30.4k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  225|  30.4k|      new_linebuf_size[plane] = sizeof(*cdef_info->linebuf) * num_bufs *
  226|  30.4k|                                (CDEF_VBORDER << 1) * (luma_stride >> shift);
  ------------------
  |  |   23|  30.4k|#define CDEF_VBORDER (2)
  ------------------
  227|       |      // Calculate column buffer size
  228|  30.4k|      const int block_height =
  229|  30.4k|          (CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - shift)) * 2 * CDEF_VBORDER;
  ------------------
  |  |   17|  30.4k|#define CDEF_BLOCKSIZE 64
  ------------------
                        (CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - shift)) * 2 * CDEF_VBORDER;
  ------------------
  |  |   39|  30.4k|#define MI_SIZE_LOG2 2
  ------------------
                        (CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - shift)) * 2 * CDEF_VBORDER;
  ------------------
  |  |   23|  30.4k|#define CDEF_VBORDER (2)
  ------------------
  230|  30.4k|      new_colbuf_size[plane] =
  231|  30.4k|          sizeof(*cdef_info->colbuf[plane]) * block_height * CDEF_HBORDER;
  ------------------
  |  |   26|  30.4k|#define CDEF_HBORDER (8)
  ------------------
  232|  30.4k|    }
  233|  11.6k|  }
  234|       |
  235|       |  // Free src, line and column buffers for worker 0 in case of reallocation
  236|  12.5k|  free_cdef_linebuf_conditional(cm, new_linebuf_size);
  237|  12.5k|  free_cdef_bufs_conditional(cm, cdef_info->colbuf, &cdef_info->srcbuf,
  238|  12.5k|                             new_colbuf_size, new_srcbuf_size);
  239|       |
  240|       |  // The flag init_worker indicates if cdef_worker has to be allocated for the
  241|       |  // frame. This is passed as 1 always from decoder. At encoder side, it is 0
  242|       |  // when called for parallel frames during FPMT (where cdef_worker is shared
  243|       |  // across parallel frames) and 1 otherwise.
  244|  12.5k|  if (*cdef_worker != NULL && init_worker) {
  ------------------
  |  Branch (244:7): [True: 2.34k, False: 10.1k]
  |  Branch (244:31): [True: 2.34k, False: 0]
  ------------------
  245|  2.34k|    if (is_num_workers_changed) {
  ------------------
  |  Branch (245:9): [True: 0, False: 2.34k]
  ------------------
  246|       |      // Free src and column buffers for remaining workers in case of change in
  247|       |      // num_workers
  248|      0|      for (int idx = cdef_info->allocated_num_workers - 1; idx >= 1; idx--)
  ------------------
  |  Branch (248:60): [True: 0, False: 0]
  ------------------
  249|      0|        free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf);
  250|       |
  251|      0|      aom_free(*cdef_worker);
  252|      0|      *cdef_worker = NULL;
  253|  2.34k|    } else if (num_workers > 1) {
  ------------------
  |  Branch (253:16): [True: 2.34k, False: 0]
  ------------------
  254|       |      // Free src and column buffers for remaining workers in case of
  255|       |      // reallocation
  256|  64.5k|      for (int idx = num_workers - 1; idx >= 1; idx--)
  ------------------
  |  Branch (256:39): [True: 62.2k, False: 2.34k]
  ------------------
  257|  62.2k|        free_cdef_bufs_conditional(cm, (*cdef_worker)[idx].colbuf,
  258|  62.2k|                                   &(*cdef_worker)[idx].srcbuf, new_colbuf_size,
  259|  62.2k|                                   new_srcbuf_size);
  260|  2.34k|    }
  261|  2.34k|  }
  262|       |
  263|  12.5k|  if (cdef_info->allocated_mi_rows != num_mi_rows)
  ------------------
  |  Branch (263:7): [True: 7.66k, False: 4.84k]
  ------------------
  264|  7.66k|    free_cdef_row_sync(&cdef_sync->cdef_row_mt, cdef_info->allocated_mi_rows);
  265|       |
  266|       |  // Store allocated sizes for reallocation
  267|  12.5k|  cdef_info->allocated_srcbuf_size = new_srcbuf_size;
  268|  12.5k|  av1_copy(cdef_info->allocated_colbuf_size, new_colbuf_size);
  ------------------
  |  |   31|  12.5k|  do {                                   \
  |  |   32|  12.5k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  12.5k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  12.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 12.5k]
  |  |  ------------------
  ------------------
  269|  12.5k|  av1_copy(cdef_info->allocated_linebuf_size, new_linebuf_size);
  ------------------
  |  |   31|  12.5k|  do {                                   \
  |  |   32|  12.5k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  12.5k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  12.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 12.5k]
  |  |  ------------------
  ------------------
  270|       |  // Store configuration to check change in configuration
  271|  12.5k|  cdef_info->allocated_mi_rows = num_mi_rows;
  272|  12.5k|  cdef_info->allocated_num_workers = num_workers;
  273|       |
  274|  12.5k|  if (!is_cdef_enabled) return;
  ------------------
  |  Branch (274:7): [True: 896, False: 11.6k]
  ------------------
  275|       |
  276|       |  // Memory allocation of column buffer & source buffer (worker_0).
  277|  11.6k|  alloc_cdef_bufs(cm, cdef_info->colbuf, &cdef_info->srcbuf, num_planes);
  278|  11.6k|  alloc_cdef_linebuf(cm, cdef_info->linebuf, num_planes);
  279|       |
  280|  11.6k|  if (num_workers < 2) return;
  ------------------
  |  Branch (280:7): [True: 5.42k, False: 6.19k]
  ------------------
  281|       |
  282|  6.19k|  if (init_worker) {
  ------------------
  |  Branch (282:7): [True: 6.19k, False: 0]
  ------------------
  283|  6.19k|    if (*cdef_worker == NULL)
  ------------------
  |  Branch (283:9): [True: 3.84k, False: 2.34k]
  ------------------
  284|  6.19k|      CHECK_MEM_ERROR(cm, *cdef_worker,
  ------------------
  |  |   51|  3.84k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  3.84k|  do {                                                    \
  |  |  |  |   69|  3.84k|    lval = (expr);                                        \
  |  |  |  |   70|  3.84k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 3.84k]
  |  |  |  |  ------------------
  |  |  |  |   71|  3.84k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  3.84k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 3.84k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  285|  6.19k|                      aom_calloc(num_workers, sizeof(**cdef_worker)));
  286|       |
  287|       |    // Memory allocation of column buffer & source buffer for remaining workers.
  288|   183k|    for (int idx = num_workers - 1; idx >= 1; idx--)
  ------------------
  |  Branch (288:37): [True: 177k, False: 6.19k]
  ------------------
  289|   177k|      alloc_cdef_bufs(cm, (*cdef_worker)[idx].colbuf,
  290|   177k|                      &(*cdef_worker)[idx].srcbuf, num_planes);
  291|  6.19k|  }
  292|       |
  293|  6.19k|  alloc_cdef_row_sync(cm, &cdef_sync->cdef_row_mt,
  294|  6.19k|                      cdef_info->allocated_mi_rows);
  295|  6.19k|}
av1_alloc_restoration_buffers:
  299|  3.58k|void av1_alloc_restoration_buffers(AV1_COMMON *cm, bool is_sgr_enabled) {
  300|  3.58k|  const int num_planes = av1_num_planes(cm);
  301|       |
  302|  3.58k|  if (cm->rst_tmpbuf == NULL && is_sgr_enabled) {
  ------------------
  |  Branch (302:7): [True: 3.57k, False: 4]
  |  Branch (302:33): [True: 3.57k, False: 0]
  ------------------
  303|  3.57k|    CHECK_MEM_ERROR(cm, cm->rst_tmpbuf,
  ------------------
  |  |   51|  3.57k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  3.57k|  do {                                                    \
  |  |  |  |   69|  3.57k|    lval = (expr);                                        \
  |  |  |  |   70|  3.57k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 3.57k]
  |  |  |  |  ------------------
  |  |  |  |   71|  3.57k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  3.57k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 3.57k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  304|  3.57k|                    (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
  305|  3.57k|  }
  306|       |
  307|  3.58k|  if (cm->rlbs == NULL) {
  ------------------
  |  Branch (307:7): [True: 3.57k, False: 4]
  ------------------
  308|  3.57k|    CHECK_MEM_ERROR(cm, cm->rlbs, aom_malloc(sizeof(RestorationLineBuffers)));
  ------------------
  |  |   51|  3.57k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  3.57k|  do {                                                    \
  |  |  |  |   69|  3.57k|    lval = (expr);                                        \
  |  |  |  |   70|  3.57k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 3.57k]
  |  |  |  |  ------------------
  |  |  |  |   71|  3.57k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  3.57k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 3.57k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  309|  3.57k|  }
  310|       |
  311|       |  // For striped loop restoration, we divide each plane into "stripes",
  312|       |  // of height 64 luma pixels but with an offset by RESTORATION_UNIT_OFFSET
  313|       |  // luma pixels to match the output from CDEF. We will need to store 2 *
  314|       |  // RESTORATION_CTX_VERT lines of data for each stripe.
  315|  3.58k|  int mi_h = cm->mi_params.mi_rows;
  316|  3.58k|  const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2);
  ------------------
  |  |   37|  3.58k|#define RESTORATION_UNIT_OFFSET 8
  ------------------
                const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2);
  ------------------
  |  |   39|  3.58k|#define MI_SIZE_LOG2 2
  ------------------
  317|  3.58k|  const int num_stripes = (ext_h + 63) / 64;
  318|       |
  319|       |  // Now we need to allocate enough space to store the line buffers for the
  320|       |  // stripes
  321|  3.58k|  const int frame_w = cm->superres_upscaled_width;
  322|  3.58k|  const int use_highbd = cm->seq_params->use_highbitdepth;
  323|       |
  324|  12.4k|  for (int p = 0; p < num_planes; ++p) {
  ------------------
  |  Branch (324:19): [True: 8.90k, False: 3.58k]
  ------------------
  325|  8.90k|    const int is_uv = p > 0;
  326|  8.90k|    const int ss_x = is_uv && cm->seq_params->subsampling_x;
  ------------------
  |  Branch (326:22): [True: 5.32k, False: 3.58k]
  |  Branch (326:31): [True: 1.28k, False: 4.03k]
  ------------------
  327|  8.90k|    const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ;
  ------------------
  |  |   70|  8.90k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
  328|  8.90k|    const int stride = ALIGN_POWER_OF_TWO(plane_w, 5);
  ------------------
  |  |   69|  8.90k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  329|  8.90k|    const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT
  ------------------
  |  |   66|  8.90k|#define RESTORATION_CTX_VERT 2
  ------------------
  330|  8.90k|                         << use_highbd;
  331|  8.90k|    RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries;
  332|       |
  333|  8.90k|    if (buf_size != boundaries->stripe_boundary_size ||
  ------------------
  |  Branch (333:9): [True: 8.89k, False: 12]
  ------------------
  334|     12|        boundaries->stripe_boundary_above == NULL ||
  ------------------
  |  Branch (334:9): [True: 0, False: 12]
  ------------------
  335|  8.89k|        boundaries->stripe_boundary_below == NULL) {
  ------------------
  |  Branch (335:9): [True: 0, False: 12]
  ------------------
  336|  8.89k|      aom_free(boundaries->stripe_boundary_above);
  337|  8.89k|      aom_free(boundaries->stripe_boundary_below);
  338|       |
  339|  8.89k|      CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_above,
  ------------------
  |  |   51|  8.89k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  8.89k|  do {                                                    \
  |  |  |  |   69|  8.89k|    lval = (expr);                                        \
  |  |  |  |   70|  8.89k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 8.89k]
  |  |  |  |  ------------------
  |  |  |  |   71|  8.89k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  8.89k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 8.89k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  340|  8.89k|                      (uint8_t *)aom_memalign(32, buf_size));
  341|  8.89k|      CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_below,
  ------------------
  |  |   51|  8.89k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  8.89k|  do {                                                    \
  |  |  |  |   69|  8.89k|    lval = (expr);                                        \
  |  |  |  |   70|  8.89k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 8.89k]
  |  |  |  |  ------------------
  |  |  |  |   71|  8.89k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  8.89k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 8.89k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  342|  8.89k|                      (uint8_t *)aom_memalign(32, buf_size));
  343|       |
  344|  8.89k|      boundaries->stripe_boundary_size = buf_size;
  345|  8.89k|    }
  346|  8.90k|    boundaries->stripe_boundary_stride = stride;
  347|  8.90k|  }
  348|  3.58k|}
av1_free_restoration_buffers:
  350|  17.9k|void av1_free_restoration_buffers(AV1_COMMON *cm) {
  351|  17.9k|  int p;
  352|  71.7k|  for (p = 0; p < MAX_MB_PLANE; ++p)
  ------------------
  |  |   36|  71.7k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (352:15): [True: 53.8k, False: 17.9k]
  ------------------
  353|  53.8k|    av1_free_restoration_struct(&cm->rst_info[p]);
  354|  17.9k|  aom_free(cm->rst_tmpbuf);
  355|  17.9k|  cm->rst_tmpbuf = NULL;
  356|  17.9k|  aom_free(cm->rlbs);
  357|  17.9k|  cm->rlbs = NULL;
  358|  71.7k|  for (p = 0; p < MAX_MB_PLANE; ++p) {
  ------------------
  |  |   36|  71.7k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (358:15): [True: 53.8k, False: 17.9k]
  ------------------
  359|  53.8k|    RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries;
  360|  53.8k|    aom_free(boundaries->stripe_boundary_above);
  361|  53.8k|    aom_free(boundaries->stripe_boundary_below);
  362|  53.8k|    boundaries->stripe_boundary_above = NULL;
  363|  53.8k|    boundaries->stripe_boundary_below = NULL;
  364|  53.8k|  }
  365|       |
  366|  17.9k|  aom_free_frame_buffer(&cm->rst_frame);
  367|  17.9k|}
av1_free_above_context_buffers:
  370|  54.2k|void av1_free_above_context_buffers(CommonContexts *above_contexts) {
  371|  54.2k|  int i;
  372|  54.2k|  const int num_planes = above_contexts->num_planes;
  373|       |
  374|  73.9k|  for (int tile_row = 0; tile_row < above_contexts->num_tile_rows; tile_row++) {
  ------------------
  |  Branch (374:26): [True: 19.6k, False: 54.2k]
  ------------------
  375|  70.7k|    for (i = 0; i < num_planes; i++) {
  ------------------
  |  Branch (375:17): [True: 51.0k, False: 19.6k]
  ------------------
  376|  51.0k|      if (above_contexts->entropy[i] == NULL) break;
  ------------------
  |  Branch (376:11): [True: 0, False: 51.0k]
  ------------------
  377|  51.0k|      aom_free(above_contexts->entropy[i][tile_row]);
  378|  51.0k|      above_contexts->entropy[i][tile_row] = NULL;
  379|  51.0k|    }
  380|  19.6k|    if (above_contexts->partition != NULL) {
  ------------------
  |  Branch (380:9): [True: 19.6k, False: 0]
  ------------------
  381|  19.6k|      aom_free(above_contexts->partition[tile_row]);
  382|  19.6k|      above_contexts->partition[tile_row] = NULL;
  383|  19.6k|    }
  384|       |
  385|  19.6k|    if (above_contexts->txfm != NULL) {
  ------------------
  |  Branch (385:9): [True: 19.6k, False: 0]
  ------------------
  386|  19.6k|      aom_free(above_contexts->txfm[tile_row]);
  387|  19.6k|      above_contexts->txfm[tile_row] = NULL;
  388|  19.6k|    }
  389|  19.6k|  }
  390|   101k|  for (i = 0; i < num_planes; i++) {
  ------------------
  |  Branch (390:15): [True: 47.5k, False: 54.2k]
  ------------------
  391|  47.5k|    aom_free(above_contexts->entropy[i]);
  392|  47.5k|    above_contexts->entropy[i] = NULL;
  393|  47.5k|  }
  394|  54.2k|  aom_free(above_contexts->partition);
  395|  54.2k|  above_contexts->partition = NULL;
  396|       |
  397|  54.2k|  aom_free(above_contexts->txfm);
  398|  54.2k|  above_contexts->txfm = NULL;
  399|       |
  400|  54.2k|  above_contexts->num_tile_rows = 0;
  401|  54.2k|  above_contexts->num_mi_cols = 0;
  402|  54.2k|  above_contexts->num_planes = 0;
  403|  54.2k|}
av1_free_context_buffers:
  405|  35.8k|void av1_free_context_buffers(AV1_COMMON *cm) {
  406|  35.8k|  if (cm->mi_params.free_mi != NULL) cm->mi_params.free_mi(&cm->mi_params);
  ------------------
  |  Branch (406:7): [True: 35.8k, False: 0]
  ------------------
  407|       |
  408|  35.8k|  av1_free_above_context_buffers(&cm->above_contexts);
  409|  35.8k|}
av1_alloc_above_context_buffers:
  413|  18.4k|                                    int num_planes) {
  414|  18.4k|  const int aligned_mi_cols =
  415|  18.4k|      ALIGN_POWER_OF_TWO(num_mi_cols, MAX_MIB_SIZE_LOG2);
  ------------------
  |  |   69|  18.4k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  416|       |
  417|       |  // Allocate above context buffers
  418|  18.4k|  above_contexts->num_tile_rows = num_tile_rows;
  419|  18.4k|  above_contexts->num_mi_cols = aligned_mi_cols;
  420|  18.4k|  above_contexts->num_planes = num_planes;
  421|  65.9k|  for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) {
  ------------------
  |  Branch (421:27): [True: 47.5k, False: 18.4k]
  ------------------
  422|  47.5k|    above_contexts->entropy[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc(
  423|  47.5k|        num_tile_rows, sizeof(above_contexts->entropy[0]));
  424|  47.5k|    if (!above_contexts->entropy[plane_idx]) return 1;
  ------------------
  |  Branch (424:9): [True: 0, False: 47.5k]
  ------------------
  425|  47.5k|  }
  426|       |
  427|  18.4k|  above_contexts->partition = (PARTITION_CONTEXT **)aom_calloc(
  428|  18.4k|      num_tile_rows, sizeof(above_contexts->partition));
  429|  18.4k|  if (!above_contexts->partition) return 1;
  ------------------
  |  Branch (429:7): [True: 0, False: 18.4k]
  ------------------
  430|       |
  431|  18.4k|  above_contexts->txfm =
  432|  18.4k|      (TXFM_CONTEXT **)aom_calloc(num_tile_rows, sizeof(above_contexts->txfm));
  433|  18.4k|  if (!above_contexts->txfm) return 1;
  ------------------
  |  Branch (433:7): [True: 0, False: 18.4k]
  ------------------
  434|       |
  435|  38.0k|  for (int tile_row = 0; tile_row < num_tile_rows; tile_row++) {
  ------------------
  |  Branch (435:26): [True: 19.6k, False: 18.4k]
  ------------------
  436|  70.7k|    for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) {
  ------------------
  |  Branch (436:29): [True: 51.0k, False: 19.6k]
  ------------------
  437|  51.0k|      above_contexts->entropy[plane_idx][tile_row] =
  438|  51.0k|          (ENTROPY_CONTEXT *)aom_calloc(
  439|  51.0k|              aligned_mi_cols, sizeof(*above_contexts->entropy[0][tile_row]));
  440|  51.0k|      if (!above_contexts->entropy[plane_idx][tile_row]) return 1;
  ------------------
  |  Branch (440:11): [True: 0, False: 51.0k]
  ------------------
  441|  51.0k|    }
  442|       |
  443|  19.6k|    above_contexts->partition[tile_row] = (PARTITION_CONTEXT *)aom_calloc(
  444|  19.6k|        aligned_mi_cols, sizeof(*above_contexts->partition[tile_row]));
  445|  19.6k|    if (!above_contexts->partition[tile_row]) return 1;
  ------------------
  |  Branch (445:9): [True: 0, False: 19.6k]
  ------------------
  446|       |
  447|  19.6k|    above_contexts->txfm[tile_row] = (TXFM_CONTEXT *)aom_calloc(
  448|  19.6k|        aligned_mi_cols, sizeof(*above_contexts->txfm[tile_row]));
  449|  19.6k|    if (!above_contexts->txfm[tile_row]) return 1;
  ------------------
  |  Branch (449:9): [True: 0, False: 19.6k]
  ------------------
  450|  19.6k|  }
  451|       |
  452|  18.4k|  return 0;
  453|  18.4k|}
av1_alloc_context_buffers:
  488|  17.6k|                              BLOCK_SIZE min_partition_size) {
  489|  17.6k|  CommonModeInfoParams *const mi_params = &cm->mi_params;
  490|  17.6k|  mi_params->set_mb_mi(mi_params, width, height, min_partition_size);
  491|  17.6k|  if (alloc_mi(mi_params)) goto fail;
  ------------------
  |  Branch (491:7): [True: 0, False: 17.6k]
  ------------------
  492|  17.6k|  return 0;
  493|       |
  494|      0|fail:
  495|       |  // clear the mi_* values to force a realloc on resync
  496|      0|  mi_params->set_mb_mi(mi_params, 0, 0, BLOCK_4X4);
  497|      0|  av1_free_context_buffers(cm);
  498|      0|  return 1;
  499|  17.6k|}
av1_remove_common:
  501|  35.8k|void av1_remove_common(AV1_COMMON *cm) {
  502|  35.8k|  av1_free_context_buffers(cm);
  503|       |
  504|  35.8k|  aom_free(cm->fc);
  505|  35.8k|  cm->fc = NULL;
  506|  35.8k|  aom_free(cm->default_frame_context);
  507|       |  cm->default_frame_context = NULL;
  508|  35.8k|}
av1_init_mi_buffers:
  510|  17.8k|void av1_init_mi_buffers(CommonModeInfoParams *mi_params) {
  511|  17.8k|  mi_params->setup_mi(mi_params);
  512|  17.8k|}
alloccommon.c:free_cdef_bufs:
   92|   133k|static inline void free_cdef_bufs(uint16_t **colbuf, uint16_t **srcbuf) {
   93|   133k|  aom_free(*srcbuf);
   94|   133k|  *srcbuf = NULL;
   95|   533k|  for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
  ------------------
  |  |   36|   533k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (95:23): [True: 399k, False: 133k]
  ------------------
   96|   399k|    aom_free(colbuf[plane]);
   97|       |    colbuf[plane] = NULL;
   98|   399k|  }
   99|   133k|}
alloccommon.c:free_cdef_row_sync:
  102|  25.6k|                                      const int num_mi_rows) {
  103|  25.6k|  if (*cdef_row_mt == NULL) return;
  ------------------
  |  Branch (103:7): [True: 21.7k, False: 3.85k]
  ------------------
  104|  3.85k|#if CONFIG_MULTITHREAD
  105|  11.5k|  for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) {
  ------------------
  |  Branch (105:25): [True: 7.65k, False: 3.85k]
  ------------------
  106|  7.65k|    if ((*cdef_row_mt)[row_idx].row_mutex_ != NULL) {
  ------------------
  |  Branch (106:9): [True: 7.65k, False: 0]
  ------------------
  107|  7.65k|      pthread_mutex_destroy((*cdef_row_mt)[row_idx].row_mutex_);
  108|  7.65k|      aom_free((*cdef_row_mt)[row_idx].row_mutex_);
  109|  7.65k|    }
  110|  7.65k|    if ((*cdef_row_mt)[row_idx].row_cond_ != NULL) {
  ------------------
  |  Branch (110:9): [True: 7.65k, False: 0]
  ------------------
  111|  7.65k|      pthread_cond_destroy((*cdef_row_mt)[row_idx].row_cond_);
  112|  7.65k|      aom_free((*cdef_row_mt)[row_idx].row_cond_);
  113|  7.65k|    }
  114|  7.65k|  }
  115|       |#else
  116|       |  (void)num_mi_rows;
  117|       |#endif  // CONFIG_MULTITHREAD
  118|  3.85k|  aom_free(*cdef_row_mt);
  119|       |  *cdef_row_mt = NULL;
  120|  3.85k|}
alloccommon.c:free_cdef_linebuf_conditional:
   64|  12.5k|    AV1_COMMON *const cm, const size_t *new_linebuf_size) {
   65|  12.5k|  CdefInfo *cdef_info = &cm->cdef_info;
   66|  50.0k|  for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
  ------------------
  |  |   36|  50.0k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (66:23): [True: 37.5k, False: 12.5k]
  ------------------
   67|  37.5k|    if (new_linebuf_size[plane] != cdef_info->allocated_linebuf_size[plane]) {
  ------------------
  |  Branch (67:9): [True: 18.2k, False: 19.2k]
  ------------------
   68|  18.2k|      aom_free(cdef_info->linebuf[plane]);
   69|       |      cdef_info->linebuf[plane] = NULL;
   70|  18.2k|    }
   71|  37.5k|  }
   72|  12.5k|}
alloccommon.c:free_cdef_bufs_conditional:
   78|  74.7k|                                              const size_t new_srcbuf_size) {
   79|  74.7k|  CdefInfo *cdef_info = &cm->cdef_info;
   80|  74.7k|  if (new_srcbuf_size != cdef_info->allocated_srcbuf_size) {
  ------------------
  |  Branch (80:7): [True: 6.88k, False: 67.8k]
  ------------------
   81|  6.88k|    aom_free(*srcbuf);
   82|  6.88k|    *srcbuf = NULL;
   83|  6.88k|  }
   84|   299k|  for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
  ------------------
  |  |   36|   299k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (84:23): [True: 224k, False: 74.7k]
  ------------------
   85|   224k|    if (new_colbuf_size[plane] != cdef_info->allocated_colbuf_size[plane]) {
  ------------------
  |  Branch (85:9): [True: 20.5k, False: 203k]
  ------------------
   86|  20.5k|      aom_free(colbuf[plane]);
   87|       |      colbuf[plane] = NULL;
   88|  20.5k|    }
   89|   224k|  }
   90|  74.7k|}
alloccommon.c:alloc_cdef_bufs:
  159|   189k|                                   uint16_t **srcbuf, const int num_planes) {
  160|   189k|  CdefInfo *cdef_info = &cm->cdef_info;
  161|   189k|  if (*srcbuf == NULL)
  ------------------
  |  Branch (161:7): [True: 122k, False: 66.9k]
  ------------------
  162|   189k|    CHECK_MEM_ERROR(cm, *srcbuf,
  ------------------
  |  |   51|   122k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|   122k|  do {                                                    \
  |  |  |  |   69|   122k|    lval = (expr);                                        \
  |  |  |  |   70|   122k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 122k]
  |  |  |  |  ------------------
  |  |  |  |   71|   122k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|   122k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 122k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  163|   189k|                    aom_memalign(16, cdef_info->allocated_srcbuf_size));
  164|       |
  165|   678k|  for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (165:23): [True: 489k, False: 189k]
  ------------------
  166|   489k|    if (colbuf[plane] == NULL)
  ------------------
  |  Branch (166:9): [True: 323k, False: 166k]
  ------------------
  167|   489k|      CHECK_MEM_ERROR(cm, colbuf[plane],
  ------------------
  |  |   51|   323k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|   323k|  do {                                                    \
  |  |  |  |   69|   323k|    lval = (expr);                                        \
  |  |  |  |   70|   323k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 323k]
  |  |  |  |  ------------------
  |  |  |  |   71|   323k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|   323k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 323k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  168|   489k|                      aom_malloc(cdef_info->allocated_colbuf_size[plane]));
  169|   489k|  }
  170|   189k|}
alloccommon.c:alloc_cdef_linebuf:
  149|  11.6k|                                      const int num_planes) {
  150|  11.6k|  CdefInfo *cdef_info = &cm->cdef_info;
  151|  42.1k|  for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (151:23): [True: 30.4k, False: 11.6k]
  ------------------
  152|  30.4k|    if (linebuf[plane] == NULL)
  ------------------
  |  Branch (152:9): [True: 18.2k, False: 12.2k]
  ------------------
  153|  30.4k|      CHECK_MEM_ERROR(cm, linebuf[plane],
  ------------------
  |  |   51|  18.2k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  18.2k|  do {                                                    \
  |  |  |  |   69|  18.2k|    lval = (expr);                                        \
  |  |  |  |   70|  18.2k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 18.2k]
  |  |  |  |  ------------------
  |  |  |  |   71|  18.2k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  18.2k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 18.2k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  154|  30.4k|                      aom_malloc(cdef_info->allocated_linebuf_size[plane]));
  155|  30.4k|  }
  156|  11.6k|}
alloccommon.c:alloc_cdef_row_sync:
  174|  6.19k|                                       const int num_mi_rows) {
  175|  6.19k|  if (*cdef_row_mt != NULL) return;
  ------------------
  |  Branch (175:7): [True: 2.34k, False: 3.85k]
  ------------------
  176|       |
  177|  3.85k|  CHECK_MEM_ERROR(cm, *cdef_row_mt,
  ------------------
  |  |   51|  3.85k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  3.85k|  do {                                                    \
  |  |  |  |   69|  3.85k|    lval = (expr);                                        \
  |  |  |  |   70|  3.85k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 3.85k]
  |  |  |  |  ------------------
  |  |  |  |   71|  3.85k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  3.85k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 3.85k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  178|  3.85k|                  aom_calloc(num_mi_rows, sizeof(**cdef_row_mt)));
  179|  3.85k|#if CONFIG_MULTITHREAD
  180|  11.5k|  for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) {
  ------------------
  |  Branch (180:25): [True: 7.65k, False: 3.85k]
  ------------------
  181|  7.65k|    CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_mutex_,
  ------------------
  |  |   51|  7.65k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  7.65k|  do {                                                    \
  |  |  |  |   69|  7.65k|    lval = (expr);                                        \
  |  |  |  |   70|  7.65k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 7.65k]
  |  |  |  |  ------------------
  |  |  |  |   71|  7.65k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  7.65k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 7.65k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  182|  7.65k|                    aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_mutex_)));
  183|  7.65k|    pthread_mutex_init((*cdef_row_mt)[row_idx].row_mutex_, NULL);
  184|       |
  185|  7.65k|    CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_cond_,
  ------------------
  |  |   51|  7.65k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  7.65k|  do {                                                    \
  |  |  |  |   69|  7.65k|    lval = (expr);                                        \
  |  |  |  |   70|  7.65k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 7.65k]
  |  |  |  |  ------------------
  |  |  |  |   71|  7.65k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  7.65k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 7.65k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  186|  7.65k|                    aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_cond_)));
  187|       |    pthread_cond_init((*cdef_row_mt)[row_idx].row_cond_, NULL);
  188|  7.65k|  }
  189|  3.85k|#endif  // CONFIG_MULTITHREAD
  190|  3.85k|}
alloccommon.c:alloc_mi:
  458|  17.6k|static int alloc_mi(CommonModeInfoParams *mi_params) {
  459|  17.6k|  const int aligned_mi_rows = calc_mi_size(mi_params->mi_rows);
  460|  17.6k|  const int mi_grid_size = mi_params->mi_stride * aligned_mi_rows;
  461|  17.6k|  const int alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
  462|  17.6k|  const int alloc_mi_size =
  463|  17.6k|      mi_params->mi_alloc_stride * (aligned_mi_rows / alloc_size_1d);
  464|       |
  465|  17.6k|  if (mi_params->mi_alloc_size < alloc_mi_size ||
  ------------------
  |  Branch (465:7): [True: 17.6k, False: 58]
  ------------------
  466|  17.6k|      mi_params->mi_grid_size < mi_grid_size) {
  ------------------
  |  Branch (466:7): [True: 0, False: 58]
  ------------------
  467|  17.6k|    mi_params->free_mi(mi_params);
  468|       |
  469|  17.6k|    mi_params->mi_alloc =
  470|  17.6k|        aom_calloc(alloc_mi_size, sizeof(*mi_params->mi_alloc));
  471|  17.6k|    if (!mi_params->mi_alloc) return 1;
  ------------------
  |  Branch (471:9): [True: 0, False: 17.6k]
  ------------------
  472|  17.6k|    mi_params->mi_alloc_size = alloc_mi_size;
  473|       |
  474|  17.6k|    mi_params->mi_grid_base = (MB_MODE_INFO **)aom_calloc(
  475|  17.6k|        mi_grid_size, sizeof(*mi_params->mi_grid_base));
  476|  17.6k|    if (!mi_params->mi_grid_base) return 1;
  ------------------
  |  Branch (476:9): [True: 0, False: 17.6k]
  ------------------
  477|       |
  478|  17.6k|    mi_params->tx_type_map =
  479|  17.6k|        aom_calloc(mi_grid_size, sizeof(*mi_params->tx_type_map));
  480|  17.6k|    if (!mi_params->tx_type_map) return 1;
  ------------------
  |  Branch (480:9): [True: 0, False: 17.6k]
  ------------------
  481|  17.6k|    mi_params->mi_grid_size = mi_grid_size;
  482|  17.6k|  }
  483|       |
  484|  17.6k|  return 0;
  485|  17.6k|}

av1_dx_iface.c:lock_buffer_pool:
 1082|  4.49k|static void lock_buffer_pool(BufferPool *const pool) {
 1083|  4.49k|#if CONFIG_MULTITHREAD
 1084|  4.49k|  pthread_mutex_lock(&pool->pool_mutex);
 1085|       |#else
 1086|       |  (void)pool;
 1087|       |#endif
 1088|  4.49k|}
av1_dx_iface.c:unlock_buffer_pool:
 1090|  4.49k|static void unlock_buffer_pool(BufferPool *const pool) {
 1091|  4.49k|#if CONFIG_MULTITHREAD
 1092|  4.49k|  pthread_mutex_unlock(&pool->pool_mutex);
 1093|       |#else
 1094|       |  (void)pool;
 1095|       |#endif
 1096|  4.49k|}
av1_dx_iface.c:frame_is_intra_only:
 1174|  7.65k|static inline int frame_is_intra_only(const AV1_COMMON *const cm) {
 1175|  7.65k|  return cm->current_frame.frame_type == KEY_FRAME ||
  ------------------
  |  Branch (1175:10): [True: 7.63k, False: 20]
  ------------------
 1176|     20|         cm->current_frame.frame_type == INTRA_ONLY_FRAME;
  ------------------
  |  Branch (1176:10): [True: 20, False: 0]
  ------------------
 1177|  7.65k|}
av1_dx_iface.c:av1_num_planes:
 1271|  7.04k|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|  7.04k|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|  5.49k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 1.55k, False: 5.49k]
  ------------------
 1273|  7.04k|}
decodeframe.c:set_sb_size:
 1851|  18.5k|                               BLOCK_SIZE sb_size) {
 1852|  18.5k|  seq_params->sb_size = sb_size;
 1853|  18.5k|  seq_params->mib_size = mi_size_wide[seq_params->sb_size];
 1854|  18.5k|  seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size];
 1855|  18.5k|}
decodeframe.c:av1_num_planes:
 1271|  8.64M|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|  8.64M|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|  5.33M|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 3.30M, False: 5.33M]
  ------------------
 1273|  8.64M|}
decodeframe.c:lock_buffer_pool:
 1082|  29.9k|static void lock_buffer_pool(BufferPool *const pool) {
 1083|  29.9k|#if CONFIG_MULTITHREAD
 1084|  29.9k|  pthread_mutex_lock(&pool->pool_mutex);
 1085|       |#else
 1086|       |  (void)pool;
 1087|       |#endif
 1088|  29.9k|}
decodeframe.c:assign_frame_buffer_p:
 1161|    258|                                         RefCntBuffer *rhs_ptr) {
 1162|    258|  RefCntBuffer *const old_ptr = *lhs_ptr;
 1163|    258|  if (old_ptr != NULL) {
  ------------------
  |  Branch (1163:7): [True: 258, False: 0]
  ------------------
 1164|    258|    assert(old_ptr->ref_count > 0);
 1165|       |    // One less reference to the buffer at 'old_ptr', so decrease ref count.
 1166|    258|    --old_ptr->ref_count;
 1167|    258|  }
 1168|       |
 1169|    258|  *lhs_ptr = rhs_ptr;
 1170|       |  // One more reference to the buffer at 'rhs_ptr', so increase ref count.
 1171|    258|  ++rhs_ptr->ref_count;
 1172|    258|}
decodeframe.c:unlock_buffer_pool:
 1090|  29.9k|static void unlock_buffer_pool(BufferPool *const pool) {
 1091|  29.9k|#if CONFIG_MULTITHREAD
 1092|  29.9k|  pthread_mutex_unlock(&pool->pool_mutex);
 1093|       |#else
 1094|       |  (void)pool;
 1095|       |#endif
 1096|  29.9k|}
decodeframe.c:frame_is_sframe:
 1179|  43.1k|static inline int frame_is_sframe(const AV1_COMMON *cm) {
 1180|  43.1k|  return cm->current_frame.frame_type == S_FRAME;
 1181|  43.1k|}
decodeframe.c:frame_is_intra_only:
 1174|   136k|static inline int frame_is_intra_only(const AV1_COMMON *const cm) {
 1175|   136k|  return cm->current_frame.frame_type == KEY_FRAME ||
  ------------------
  |  Branch (1175:10): [True: 73.0k, False: 63.7k]
  ------------------
 1176|  63.7k|         cm->current_frame.frame_type == INTRA_ONLY_FRAME;
  ------------------
  |  Branch (1176:10): [True: 1.17k, False: 62.5k]
  ------------------
 1177|   136k|}
decodeframe.c:get_free_fb:
 1104|  1.24k|static inline int get_free_fb(AV1_COMMON *cm) {
 1105|  1.24k|  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
 1106|  1.24k|  int i;
 1107|       |
 1108|  1.24k|  lock_buffer_pool(cm->buffer_pool);
 1109|  1.24k|  const int num_frame_bufs = cm->buffer_pool->num_frame_bufs;
 1110|  6.69k|  for (i = 0; i < num_frame_bufs; ++i)
  ------------------
  |  Branch (1110:15): [True: 6.69k, False: 0]
  ------------------
 1111|  6.69k|    if (frame_bufs[i].ref_count == 0) break;
  ------------------
  |  Branch (1111:9): [True: 1.24k, False: 5.45k]
  ------------------
 1112|       |
 1113|  1.24k|  if (i != num_frame_bufs) {
  ------------------
  |  Branch (1113:7): [True: 1.24k, False: 0]
  ------------------
 1114|  1.24k|    if (frame_bufs[i].buf.use_external_reference_buffers) {
  ------------------
  |  Branch (1114:9): [True: 0, False: 1.24k]
  ------------------
 1115|       |      // If this frame buffer's y_buffer, u_buffer, and v_buffer point to the
 1116|       |      // external reference buffers. Restore the buffer pointers to point to the
 1117|       |      // internally allocated memory.
 1118|      0|      YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf;
 1119|      0|      ybf->y_buffer = ybf->store_buf_adr[0];
 1120|      0|      ybf->u_buffer = ybf->store_buf_adr[1];
 1121|      0|      ybf->v_buffer = ybf->store_buf_adr[2];
 1122|      0|      ybf->use_external_reference_buffers = 0;
 1123|      0|    }
 1124|       |
 1125|  1.24k|    frame_bufs[i].ref_count = 1;
 1126|  1.24k|  } else {
 1127|       |    // We should never run out of free buffers. If this assertion fails, there
 1128|       |    // is a reference leak.
 1129|      0|    assert(0 && "Ran out of free frame buffers. Likely a reference leak.");
 1130|       |    // Reset i to be INVALID_IDX to indicate no free buffer found.
 1131|      0|    i = INVALID_IDX;
  ------------------
  |  |   15|      0|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1132|      0|  }
 1133|       |
 1134|  1.24k|  unlock_buffer_pool(cm->buffer_pool);
 1135|  1.24k|  return i;
 1136|  1.24k|}
decodeframe.c:ensure_mv_buffer:
 1235|  26.5k|static inline void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) {
 1236|  26.5k|  const int buf_rows = buf->mi_rows;
 1237|  26.5k|  const int buf_cols = buf->mi_cols;
 1238|  26.5k|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
 1239|       |
 1240|  26.5k|  if (buf->mvs == NULL || buf_rows != mi_params->mi_rows ||
  ------------------
  |  Branch (1240:7): [True: 25.7k, False: 815]
  |  Branch (1240:27): [True: 90, False: 725]
  ------------------
 1241|  25.8k|      buf_cols != mi_params->mi_cols) {
  ------------------
  |  Branch (1241:7): [True: 11, False: 714]
  ------------------
 1242|  25.8k|    aom_free(buf->mvs);
 1243|  25.8k|    buf->mi_rows = mi_params->mi_rows;
 1244|  25.8k|    buf->mi_cols = mi_params->mi_cols;
 1245|  25.8k|    CHECK_MEM_ERROR(cm, buf->mvs,
  ------------------
  |  |   51|  25.8k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  25.8k|  do {                                                    \
  |  |  |  |   69|  25.8k|    lval = (expr);                                        \
  |  |  |  |   70|  25.8k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 25.8k]
  |  |  |  |  ------------------
  |  |  |  |   71|  25.8k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  25.8k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 25.8k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1246|  25.8k|                    (MV_REF *)aom_calloc(((mi_params->mi_rows + 1) >> 1) *
 1247|  25.8k|                                             ((mi_params->mi_cols + 1) >> 1),
 1248|  25.8k|                                         sizeof(*buf->mvs)));
 1249|  25.8k|    aom_free(buf->seg_map);
 1250|  25.8k|    CHECK_MEM_ERROR(
  ------------------
  |  |   51|  25.8k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  25.8k|  do {                                                    \
  |  |  |  |   69|  25.8k|    lval = (expr);                                        \
  |  |  |  |   70|  25.8k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 25.8k]
  |  |  |  |  ------------------
  |  |  |  |   71|  25.8k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  25.8k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 25.8k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1251|  25.8k|        cm, buf->seg_map,
 1252|  25.8k|        (uint8_t *)aom_calloc(mi_params->mi_rows * mi_params->mi_cols,
 1253|  25.8k|                              sizeof(*buf->seg_map)));
 1254|  25.8k|  }
 1255|       |
 1256|  26.5k|  const int mem_size =
 1257|  26.5k|      ((mi_params->mi_rows + MAX_MIB_SIZE) >> 1) * (mi_params->mi_stride >> 1);
  ------------------
  |  |   44|  26.5k|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   43|  26.5k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  26.5k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   39|  26.5k|#define MI_SIZE_LOG2 2
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1258|       |
 1259|  26.5k|  if (cm->tpl_mvs == NULL || cm->tpl_mvs_mem_size < mem_size) {
  ------------------
  |  Branch (1259:7): [True: 17.6k, False: 8.93k]
  |  Branch (1259:30): [True: 45, False: 8.89k]
  ------------------
 1260|  17.6k|    aom_free(cm->tpl_mvs);
 1261|  17.6k|    CHECK_MEM_ERROR(cm, cm->tpl_mvs,
  ------------------
  |  |   51|  17.6k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  17.6k|  do {                                                    \
  |  |  |  |   69|  17.6k|    lval = (expr);                                        \
  |  |  |  |   70|  17.6k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 17.6k]
  |  |  |  |  ------------------
  |  |  |  |   71|  17.6k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  17.6k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 17.6k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1262|  17.6k|                    (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs)));
 1263|  17.6k|    cm->tpl_mvs_mem_size = mem_size;
 1264|  17.6k|  }
 1265|  26.5k|}
decodeframe.c:frame_might_allow_ref_frame_mvs:
 1222|  14.8k|static inline int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) {
 1223|  14.8k|  return !cm->features.error_resilient_mode &&
  ------------------
  |  Branch (1223:10): [True: 14.8k, False: 24]
  ------------------
 1224|  14.8k|         cm->seq_params->order_hint_info.enable_ref_frame_mvs &&
  ------------------
  |  Branch (1224:10): [True: 14.7k, False: 160]
  ------------------
 1225|  14.7k|         cm->seq_params->order_hint_info.enable_order_hint &&
  ------------------
  |  Branch (1225:10): [True: 14.7k, False: 0]
  ------------------
 1226|  14.7k|         !frame_is_intra_only(cm);
  ------------------
  |  Branch (1226:10): [True: 14.7k, False: 0]
  ------------------
 1227|  14.8k|}
decodeframe.c:get_ref_frame_buf:
 1194|   132k|    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1195|   132k|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1196|   132k|  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
  ------------------
  |  |   15|   132k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1196:10): [True: 132k, False: 0]
  ------------------
 1197|   132k|}
decodeframe.c:get_ref_frame_map_idx:
 1187|   272k|                                        const MV_REFERENCE_FRAME ref_frame) {
 1188|   272k|  return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME)
  ------------------
  |  Branch (1188:11): [True: 272k, False: 0]
  |  Branch (1188:38): [True: 272k, False: 0]
  ------------------
 1189|   272k|             ? cm->remapped_ref_idx[ref_frame - LAST_FRAME]
 1190|   272k|             : INVALID_IDX;
  ------------------
  |  |   15|      0|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1191|   272k|}
decodeframe.c:get_ref_scale_factors:
 1208|  55.8k|    AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1209|  55.8k|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1210|  55.8k|  return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL;
  ------------------
  |  |   15|  55.8k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1210:10): [True: 55.8k, False: 0]
  ------------------
 1211|  55.8k|}
decodeframe.c:is_coded_lossless:
 1861|  26.4k|                                    const MACROBLOCKD *xd) {
 1862|  26.4k|  int coded_lossless = 1;
 1863|  26.4k|  if (cm->seg.enabled) {
  ------------------
  |  Branch (1863:7): [True: 2.97k, False: 23.4k]
  ------------------
 1864|  3.49k|    for (int i = 0; i < MAX_SEGMENTS; ++i) {
  ------------------
  |  |   21|  3.49k|#define MAX_SEGMENTS 8
  ------------------
  |  Branch (1864:21): [True: 3.45k, False: 34]
  ------------------
 1865|  3.45k|      if (!xd->lossless[i]) {
  ------------------
  |  Branch (1865:11): [True: 2.94k, False: 514]
  ------------------
 1866|  2.94k|        coded_lossless = 0;
 1867|  2.94k|        break;
 1868|  2.94k|      }
 1869|  3.45k|    }
 1870|  23.4k|  } else {
 1871|  23.4k|    coded_lossless = xd->lossless[0];
 1872|  23.4k|  }
 1873|  26.4k|  return coded_lossless;
 1874|  26.4k|}
decodeframe.c:frame_might_allow_warped_motion:
 1230|  26.3k|static inline int frame_might_allow_warped_motion(const AV1_COMMON *cm) {
 1231|  26.3k|  return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) &&
  ------------------
  |  Branch (1231:10): [True: 8.17k, False: 18.1k]
  |  Branch (1231:48): [True: 7.90k, False: 267]
  ------------------
 1232|  7.90k|         cm->seq_params->enable_warped_motion;
  ------------------
  |  Branch (1232:10): [True: 7.00k, False: 902]
  ------------------
 1233|  26.3k|}
decodeframe.c:get_primary_ref_frame_buf:
 1214|  15.4k|    const AV1_COMMON *const cm) {
 1215|  15.4k|  const int primary_ref_frame = cm->features.primary_ref_frame;
 1216|  15.4k|  if (primary_ref_frame == PRIMARY_REF_NONE) return NULL;
  ------------------
  |  |   66|  15.4k|#define PRIMARY_REF_NONE 7
  ------------------
  |  Branch (1216:7): [True: 4.54k, False: 10.8k]
  ------------------
 1217|  10.8k|  const int map_idx = get_ref_frame_map_idx(cm, primary_ref_frame + 1);
 1218|  10.8k|  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
  ------------------
  |  |   15|  10.8k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1218:10): [True: 10.8k, False: 0]
  ------------------
 1219|  15.4k|}
decodeframe.c:get_ref_scale_factors_const:
 1202|  73.5k|    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1203|  73.5k|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1204|  73.5k|  return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL;
  ------------------
  |  |   15|  73.5k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1204:10): [True: 73.5k, False: 0]
  ------------------
 1205|  73.5k|}
decodeframe.c:av1_init_above_context:
 1277|  27.3k|                                          MACROBLOCKD *xd) {
 1278|  98.6k|  for (int i = 0; i < num_planes; ++i) {
  ------------------
  |  Branch (1278:19): [True: 71.3k, False: 27.3k]
  ------------------
 1279|  71.3k|    xd->above_entropy_context[i] = above_contexts->entropy[i][tile_row];
 1280|  71.3k|  }
 1281|  27.3k|  xd->above_partition_context = above_contexts->partition[tile_row];
 1282|  27.3k|  xd->above_txfm_context = above_contexts->txfm[tile_row];
 1283|  27.3k|}
decodeframe.c:av1_zero_above_context:
 1595|  27.3k|                                          const int tile_row) {
 1596|  27.3k|  const SequenceHeader *const seq_params = cm->seq_params;
 1597|  27.3k|  const int num_planes = av1_num_planes(cm);
 1598|  27.3k|  const int width = mi_col_end - mi_col_start;
 1599|  27.3k|  const int aligned_width =
 1600|  27.3k|      ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2);
  ------------------
  |  |   69|  27.3k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
 1601|  27.3k|  const int offset_y = mi_col_start;
 1602|  27.3k|  const int width_y = aligned_width;
 1603|  27.3k|  const int offset_uv = offset_y >> seq_params->subsampling_x;
 1604|  27.3k|  const int width_uv = width_y >> seq_params->subsampling_x;
 1605|  27.3k|  CommonContexts *const above_contexts = &cm->above_contexts;
 1606|       |
 1607|  27.3k|  av1_zero_array(above_contexts->entropy[0][tile_row] + offset_y, width_y);
  ------------------
  |  |   44|  27.3k|#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
  ------------------
 1608|  27.3k|  if (num_planes > 1) {
  ------------------
  |  Branch (1608:7): [True: 21.9k, False: 5.32k]
  ------------------
 1609|  21.9k|    if (above_contexts->entropy[1][tile_row] &&
  ------------------
  |  Branch (1609:9): [True: 21.9k, False: 0]
  ------------------
 1610|  21.9k|        above_contexts->entropy[2][tile_row]) {
  ------------------
  |  Branch (1610:9): [True: 21.9k, False: 0]
  ------------------
 1611|  21.9k|      av1_zero_array(above_contexts->entropy[1][tile_row] + offset_uv,
  ------------------
  |  |   44|  21.9k|#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
  ------------------
 1612|  21.9k|                     width_uv);
 1613|  21.9k|      av1_zero_array(above_contexts->entropy[2][tile_row] + offset_uv,
  ------------------
  |  |   44|  21.9k|#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
  ------------------
 1614|  21.9k|                     width_uv);
 1615|  21.9k|    } else {
 1616|      0|      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
 1617|      0|                         "Invalid value of planes");
 1618|      0|    }
 1619|  21.9k|  }
 1620|       |
 1621|  27.3k|  av1_zero_array(above_contexts->partition[tile_row] + mi_col_start,
  ------------------
  |  |   44|  27.3k|#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
  ------------------
 1622|  27.3k|                 aligned_width);
 1623|       |
 1624|  27.3k|  memset(above_contexts->txfm[tile_row] + mi_col_start,
 1625|  27.3k|         tx_size_wide[TX_SIZES_LARGEST], aligned_width * sizeof(TXFM_CONTEXT));
 1626|  27.3k|}
decodeframe.c:av1_zero_left_context:
 1628|  53.0k|static inline void av1_zero_left_context(MACROBLOCKD *const xd) {
 1629|  53.0k|  av1_zero(xd->left_entropy_context);
  ------------------
  |  |   43|  53.0k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 1630|  53.0k|  av1_zero(xd->left_partition_context);
  ------------------
  |  |   43|  53.0k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 1631|       |
 1632|  53.0k|  memset(xd->left_txfm_context_buffer, tx_size_high[TX_SIZES_LARGEST],
 1633|  53.0k|         sizeof(xd->left_txfm_context_buffer));
 1634|  53.0k|}
decodeframe.c:set_mi_offsets:
 1672|  1.75M|                                  int mi_col) {
 1673|       |  // 'mi_grid_base' should point to appropriate memory in 'mi'.
 1674|  1.75M|  const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
 1675|  1.75M|  const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col);
 1676|  1.75M|  mi_params->mi_grid_base[mi_grid_idx] = &mi_params->mi_alloc[mi_alloc_idx];
 1677|       |  // 'xd->mi' should point to an offset in 'mi_grid_base';
 1678|  1.75M|  xd->mi = mi_params->mi_grid_base + mi_grid_idx;
 1679|       |  // 'xd->tx_type_map' should point to an offset in 'mi_params->tx_type_map'.
 1680|  1.75M|  xd->tx_type_map = mi_params->tx_type_map + mi_grid_idx;
 1681|  1.75M|  xd->tx_type_map_stride = mi_params->mi_stride;
 1682|  1.75M|}
decodeframe.c:get_mi_grid_idx:
 1656|  1.75M|                                  int mi_row, int mi_col) {
 1657|  1.75M|  return mi_row * mi_params->mi_stride + mi_col;
 1658|  1.75M|}
decodeframe.c:get_alloc_mi_idx:
 1661|  1.75M|                                   int mi_row, int mi_col) {
 1662|  1.75M|  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
 1663|  1.75M|  const int mi_alloc_row = mi_row / mi_alloc_size_1d;
 1664|  1.75M|  const int mi_alloc_col = mi_col / mi_alloc_size_1d;
 1665|       |
 1666|  1.75M|  return mi_alloc_row * mi_params->mi_alloc_stride + mi_alloc_col;
 1667|  1.75M|}
decodeframe.c:set_plane_n4:
 1345|  2.51M|                                const int num_planes) {
 1346|  2.51M|  int i;
 1347|  7.99M|  for (i = 0; i < num_planes; i++) {
  ------------------
  |  Branch (1347:15): [True: 5.48M, False: 2.51M]
  ------------------
 1348|  5.48M|    xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x;
  ------------------
  |  |   40|  5.48M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  5.48M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1349|  5.48M|    xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y;
  ------------------
  |  |   40|  5.48M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  5.48M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1350|       |
 1351|  5.48M|    xd->plane[i].width = AOMMAX(xd->plane[i].width, 4);
  ------------------
  |  |   35|  5.48M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 4.49M, False: 990k]
  |  |  ------------------
  ------------------
 1352|  5.48M|    xd->plane[i].height = AOMMAX(xd->plane[i].height, 4);
  ------------------
  |  |   35|  5.48M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 4.11M, False: 1.37M]
  |  |  ------------------
  ------------------
 1353|  5.48M|  }
 1354|  2.51M|}
decodeframe.c:set_entropy_context:
 1318|  1.75M|                                       const int num_planes) {
 1319|  1.75M|  int i;
 1320|  1.75M|  int row_offset = mi_row;
 1321|  1.75M|  int col_offset = mi_col;
 1322|  5.63M|  for (i = 0; i < num_planes; ++i) {
  ------------------
  |  Branch (1322:15): [True: 3.87M, False: 1.75M]
  ------------------
 1323|  3.87M|    struct macroblockd_plane *const pd = &xd->plane[i];
 1324|       |    // Offset the buffer pointer
 1325|  3.87M|    const BLOCK_SIZE bsize = xd->mi[0]->bsize;
 1326|  3.87M|    if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
  ------------------
  |  Branch (1326:9): [True: 309k, False: 3.56M]
  |  Branch (1326:30): [True: 17.5k, False: 292k]
  |  Branch (1326:49): [True: 17.5k, False: 0]
  ------------------
 1327|  17.5k|      row_offset = mi_row - 1;
 1328|  3.87M|    if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
  ------------------
  |  Branch (1328:9): [True: 320k, False: 3.55M]
  |  Branch (1328:30): [True: 13.5k, False: 307k]
  |  Branch (1328:49): [True: 13.5k, False: 0]
  ------------------
 1329|  13.5k|      col_offset = mi_col - 1;
 1330|  3.87M|    int above_idx = col_offset;
 1331|  3.87M|    int left_idx = row_offset & MAX_MIB_MASK;
  ------------------
  |  |   50|  3.87M|#define MAX_MIB_MASK (MAX_MIB_SIZE - 1)
  |  |  ------------------
  |  |  |  |   44|  3.87M|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   43|  3.87M|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   31|  3.87M|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   39|  3.87M|#define MI_SIZE_LOG2 2
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1332|  3.87M|    pd->above_entropy_context =
 1333|  3.87M|        &xd->above_entropy_context[i][above_idx >> pd->subsampling_x];
 1334|  3.87M|    pd->left_entropy_context =
 1335|  3.87M|        &xd->left_entropy_context[i][left_idx >> pd->subsampling_y];
 1336|  3.87M|  }
 1337|  1.75M|}
decodeframe.c:set_mi_row_col:
 1358|  2.51M|                                  int mi_rows, int mi_cols) {
 1359|  2.51M|  xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
  ------------------
  |  |   29|  2.51M|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
 1360|  2.51M|  xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_rows - bh - mi_row) * MI_SIZE);
  ------------------
  |  |   29|  2.51M|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
 1361|  2.51M|  xd->mb_to_left_edge = -GET_MV_SUBPEL((mi_col * MI_SIZE));
  ------------------
  |  |   29|  2.51M|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
 1362|  2.51M|  xd->mb_to_right_edge = GET_MV_SUBPEL((mi_cols - bw - mi_col) * MI_SIZE);
  ------------------
  |  |   29|  2.51M|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
 1363|       |
 1364|  2.51M|  xd->mi_row = mi_row;
 1365|  2.51M|  xd->mi_col = mi_col;
 1366|       |
 1367|       |  // Are edges available for intra prediction?
 1368|  2.51M|  xd->up_available = (mi_row > tile->mi_row_start);
 1369|       |
 1370|  2.51M|  const int ss_x = xd->plane[1].subsampling_x;
 1371|  2.51M|  const int ss_y = xd->plane[1].subsampling_y;
 1372|       |
 1373|  2.51M|  xd->left_available = (mi_col > tile->mi_col_start);
 1374|  2.51M|  xd->chroma_up_available = xd->up_available;
 1375|  2.51M|  xd->chroma_left_available = xd->left_available;
 1376|  2.51M|  if (ss_x && bw < mi_size_wide[BLOCK_8X8])
  ------------------
  |  Branch (1376:7): [True: 1.22M, False: 1.28M]
  |  Branch (1376:15): [True: 189k, False: 1.03M]
  ------------------
 1377|   189k|    xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start;
 1378|  2.51M|  if (ss_y && bh < mi_size_high[BLOCK_8X8])
  ------------------
  |  Branch (1378:7): [True: 1.22M, False: 1.29M]
  |  Branch (1378:15): [True: 272k, False: 948k]
  ------------------
 1379|   272k|    xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start;
 1380|  2.51M|  if (xd->up_available) {
  ------------------
  |  Branch (1380:7): [True: 2.32M, False: 185k]
  ------------------
 1381|  2.32M|    xd->above_mbmi = xd->mi[-xd->mi_stride];
 1382|  2.32M|  } else {
 1383|   185k|    xd->above_mbmi = NULL;
 1384|   185k|  }
 1385|       |
 1386|  2.51M|  if (xd->left_available) {
  ------------------
  |  Branch (1386:7): [True: 2.36M, False: 146k]
  ------------------
 1387|  2.36M|    xd->left_mbmi = xd->mi[-1];
 1388|  2.36M|  } else {
 1389|   146k|    xd->left_mbmi = NULL;
 1390|   146k|  }
 1391|       |
 1392|  2.51M|  const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) &&
  ------------------
  |  Branch (1392:27): [True: 302k, False: 2.21M]
  |  Branch (1392:46): [True: 1.90M, False: 305k]
  |  Branch (1392:62): [True: 169k, False: 136k]
  ------------------
 1393|  2.37M|                         ((mi_col & 0x01) || !(bw & 0x01) || !ss_x);
  ------------------
  |  Branch (1393:27): [True: 194k, False: 2.18M]
  |  Branch (1393:46): [True: 1.98M, False: 195k]
  |  Branch (1393:62): [True: 120k, False: 75.0k]
  ------------------
 1394|  2.51M|  xd->is_chroma_ref = chroma_ref;
 1395|  2.51M|  if (chroma_ref) {
  ------------------
  |  Branch (1395:7): [True: 2.30M, False: 208k]
  ------------------
 1396|       |    // To help calculate the "above" and "left" chroma blocks, note that the
 1397|       |    // current block may cover multiple luma blocks (e.g., if partitioned into
 1398|       |    // 4x4 luma blocks).
 1399|       |    // First, find the top-left-most luma block covered by this chroma block
 1400|  2.30M|    MB_MODE_INFO **base_mi =
 1401|  2.30M|        &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)];
 1402|       |
 1403|       |    // Then, we consider the luma region covered by the left or above 4x4 chroma
 1404|       |    // prediction. We want to point to the chroma reference block in that
 1405|       |    // region, which is the bottom-right-most mi unit.
 1406|       |    // This leads to the following offsets:
 1407|  2.30M|    MB_MODE_INFO *chroma_above_mi =
 1408|  2.30M|        xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL;
  ------------------
  |  Branch (1408:9): [True: 2.12M, False: 184k]
  ------------------
 1409|  2.30M|    xd->chroma_above_mbmi = chroma_above_mi;
 1410|       |
 1411|  2.30M|    MB_MODE_INFO *chroma_left_mi =
 1412|  2.30M|        xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL;
  ------------------
  |  Branch (1412:9): [True: 2.15M, False: 146k]
  ------------------
 1413|  2.30M|    xd->chroma_left_mbmi = chroma_left_mi;
 1414|  2.30M|  }
 1415|       |
 1416|  2.51M|  xd->height = bh;
 1417|  2.51M|  xd->width = bw;
 1418|       |
 1419|  2.51M|  xd->is_last_vertical_rect = 0;
 1420|  2.51M|  if (xd->width < xd->height) {
  ------------------
  |  Branch (1420:7): [True: 530k, False: 1.98M]
  ------------------
 1421|   530k|    if (!((mi_col + xd->width) & (xd->height - 1))) {
  ------------------
  |  Branch (1421:9): [True: 208k, False: 322k]
  ------------------
 1422|   208k|      xd->is_last_vertical_rect = 1;
 1423|   208k|    }
 1424|   530k|  }
 1425|       |
 1426|  2.51M|  xd->is_first_horizontal_rect = 0;
 1427|  2.51M|  if (xd->width > xd->height)
  ------------------
  |  Branch (1427:7): [True: 845k, False: 1.66M]
  ------------------
 1428|   845k|    if (!(mi_row & (xd->width - 1))) xd->is_first_horizontal_rect = 1;
  ------------------
  |  Branch (1428:9): [True: 365k, False: 479k]
  ------------------
 1429|  2.51M|}
decodeframe.c:max_block_high:
 1580|  2.85M|                                 int plane) {
 1581|  2.85M|  int max_blocks_high = block_size_high[bsize];
 1582|       |
 1583|  2.85M|  if (xd->mb_to_bottom_edge < 0) {
  ------------------
  |  Branch (1583:7): [True: 44.6k, False: 2.81M]
  ------------------
 1584|  44.6k|    const struct macroblockd_plane *const pd = &xd->plane[plane];
 1585|  44.6k|    max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
 1586|  44.6k|  }
 1587|       |
 1588|       |  // Scale the height in the transform block unit.
 1589|  2.85M|  return max_blocks_high >> MI_SIZE_LOG2;
  ------------------
  |  |   39|  2.85M|#define MI_SIZE_LOG2 2
  ------------------
 1590|  2.85M|}
decodeframe.c:max_block_wide:
 1566|  2.85M|                                 int plane) {
 1567|  2.85M|  assert(bsize < BLOCK_SIZES_ALL);
 1568|  2.85M|  int max_blocks_wide = block_size_wide[bsize];
 1569|       |
 1570|  2.85M|  if (xd->mb_to_right_edge < 0) {
  ------------------
  |  Branch (1570:7): [True: 57.1k, False: 2.80M]
  ------------------
 1571|  57.1k|    const struct macroblockd_plane *const pd = &xd->plane[plane];
 1572|  57.1k|    max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
 1573|  57.1k|  }
 1574|       |
 1575|       |  // Scale the width in the transform block unit.
 1576|  2.85M|  return max_blocks_wide >> MI_SIZE_LOG2;
  ------------------
  |  |   39|  2.85M|#define MI_SIZE_LOG2 2
  ------------------
 1577|  2.85M|}
decodeframe.c:txfm_partition_update:
 1686|  47.5k|                                         TX_SIZE tx_size, TX_SIZE txb_size) {
 1687|  47.5k|  BLOCK_SIZE bsize = txsize_to_bsize[txb_size];
 1688|  47.5k|  int bh = mi_size_high[bsize];
 1689|  47.5k|  int bw = mi_size_wide[bsize];
 1690|  47.5k|  uint8_t txw = tx_size_wide[tx_size];
 1691|  47.5k|  uint8_t txh = tx_size_high[tx_size];
 1692|  47.5k|  int i;
 1693|   183k|  for (i = 0; i < bh; ++i) left_ctx[i] = txh;
  ------------------
  |  Branch (1693:15): [True: 135k, False: 47.5k]
  ------------------
 1694|   193k|  for (i = 0; i < bw; ++i) above_ctx[i] = txw;
  ------------------
  |  Branch (1694:15): [True: 145k, False: 47.5k]
  ------------------
 1695|  47.5k|}
decodeframe.c:txfm_partition_context:
 1749|  49.8k|                                         BLOCK_SIZE bsize, TX_SIZE tx_size) {
 1750|  49.8k|  const uint8_t txw = tx_size_wide[tx_size];
 1751|  49.8k|  const uint8_t txh = tx_size_high[tx_size];
 1752|  49.8k|  const int above = *above_ctx < txw;
 1753|  49.8k|  const int left = *left_ctx < txh;
 1754|  49.8k|  int category = TXFM_PARTITION_CONTEXTS;
  ------------------
  |  |  521|  49.8k|#define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 3)
  ------------------
 1755|       |
 1756|       |  // dummy return, not used by others.
 1757|  49.8k|  if (tx_size <= TX_4X4) return 0;
  ------------------
  |  Branch (1757:7): [True: 0, False: 49.8k]
  ------------------
 1758|       |
 1759|  49.8k|  TX_SIZE max_tx_size =
 1760|  49.8k|      get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize]));
  ------------------
  |  |   35|  49.8k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 18.1k, False: 31.7k]
  |  |  ------------------
  ------------------
 1761|       |
 1762|  49.8k|  if (max_tx_size >= TX_8X8) {
  ------------------
  |  Branch (1762:7): [True: 49.8k, False: 18.4E]
  ------------------
 1763|  49.8k|    category =
 1764|  49.8k|        (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) +
  ------------------
  |  Branch (1764:10): [True: 18.4k, False: 31.3k]
  |  Branch (1764:55): [True: 18.4k, False: 0]
  ------------------
 1765|  49.8k|        (TX_SIZES - 1 - max_tx_size) * 2;
 1766|  49.8k|  }
 1767|       |  assert(category != TXFM_PARTITION_CONTEXTS);
 1768|  49.8k|  return category * 3 + above + left;
 1769|  49.8k|}
decodeframe.c:get_sqr_tx_size:
 1697|  49.8k|static inline TX_SIZE get_sqr_tx_size(int tx_dim) {
 1698|  49.8k|  switch (tx_dim) {
 1699|    568|    case 128:
  ------------------
  |  Branch (1699:5): [True: 568, False: 49.2k]
  ------------------
 1700|  2.34k|    case 64: return TX_64X64; break;
  ------------------
  |  Branch (1700:5): [True: 1.78k, False: 48.0k]
  ------------------
 1701|  11.7k|    case 32: return TX_32X32; break;
  ------------------
  |  Branch (1701:5): [True: 11.7k, False: 38.1k]
  ------------------
 1702|  25.4k|    case 16: return TX_16X16; break;
  ------------------
  |  Branch (1702:5): [True: 25.4k, False: 24.3k]
  ------------------
 1703|  10.3k|    case 8: return TX_8X8; break;
  ------------------
  |  Branch (1703:5): [True: 10.3k, False: 39.4k]
  ------------------
 1704|      0|    default: return TX_4X4;
  ------------------
  |  Branch (1704:5): [True: 0, False: 49.8k]
  ------------------
 1705|  49.8k|  }
 1706|  49.8k|}
decodeframe.c:set_txfm_ctxs:
 1642|  1.72M|                                 const MACROBLOCKD *xd) {
 1643|  1.72M|  uint8_t bw = tx_size_wide[tx_size];
 1644|  1.72M|  uint8_t bh = tx_size_high[tx_size];
 1645|       |
 1646|  1.72M|  if (skip) {
  ------------------
  |  Branch (1646:7): [True: 15.2k, False: 1.70M]
  ------------------
 1647|  15.2k|    bw = n4_w * MI_SIZE;
  ------------------
  |  |   40|  15.2k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  15.2k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1648|  15.2k|    bh = n4_h * MI_SIZE;
  ------------------
  |  |   40|  15.2k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  15.2k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1649|  15.2k|  }
 1650|       |
 1651|  1.72M|  set_txfm_ctx(xd->above_txfm_context, bw, n4_w);
 1652|  1.72M|  set_txfm_ctx(xd->left_txfm_context, bh, n4_h);
 1653|  1.72M|}
decodeframe.c:set_txfm_ctx:
 1636|  3.44M|static inline void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
 1637|  3.44M|  int i;
 1638|  15.3M|  for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
  ------------------
  |  Branch (1638:15): [True: 11.9M, False: 3.44M]
  ------------------
 1639|  3.44M|}
decodeframe.c:partition_plane_context:
 1540|  1.31M|                                          int mi_col, BLOCK_SIZE bsize) {
 1541|  1.31M|  const PARTITION_CONTEXT *above_ctx = xd->above_partition_context + mi_col;
 1542|  1.31M|  const PARTITION_CONTEXT *left_ctx =
 1543|  1.31M|      xd->left_partition_context + (mi_row & MAX_MIB_MASK);
  ------------------
  |  |   50|  1.31M|#define MAX_MIB_MASK (MAX_MIB_SIZE - 1)
  |  |  ------------------
  |  |  |  |   44|  1.31M|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   43|  1.31M|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   31|  1.31M|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   39|  1.31M|#define MI_SIZE_LOG2 2
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1544|       |  // Minimum partition point is 8x8. Offset the bsl accordingly.
 1545|  1.31M|  const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8];
 1546|  1.31M|  int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
 1547|       |
 1548|  1.31M|  assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]);
 1549|  1.31M|  assert(bsl >= 0);
 1550|       |
 1551|  1.31M|  return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
  ------------------
  |  |  169|  1.31M|#define PARTITION_PLOFFSET 4  // number of probability models per block size
  ------------------
 1552|  1.31M|}
decodeframe.c:partition_cdf_length:
 1556|  1.20M|static inline int partition_cdf_length(BLOCK_SIZE bsize) {
 1557|  1.20M|  if (bsize <= BLOCK_8X8)
  ------------------
  |  Branch (1557:7): [True: 384k, False: 824k]
  ------------------
 1558|   384k|    return PARTITION_TYPES;
 1559|   824k|  else if (bsize == BLOCK_128X128)
  ------------------
  |  Branch (1559:12): [True: 3.23k, False: 820k]
  ------------------
 1560|  3.23k|    return EXT_PARTITION_TYPES - 2;
 1561|   820k|  else
 1562|   820k|    return EXT_PARTITION_TYPES;
 1563|  1.20M|}
decodeframe.c:partition_gather_vert_alike:
 1487|  42.1k|                                               BLOCK_SIZE bsize) {
 1488|  42.1k|  (void)bsize;
 1489|  42.1k|  out[0] = CDF_PROB_TOP;
  ------------------
  |  |   33|  42.1k|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  ------------------
  |  |  |  |   32|  42.1k|#define CDF_PROB_BITS 15
  |  |  ------------------
  ------------------
 1490|  42.1k|  out[0] -= cdf_element_prob(in, PARTITION_VERT);
 1491|  42.1k|  out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
 1492|  42.1k|  out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
 1493|  42.1k|  out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
 1494|  42.1k|  out[0] -= cdf_element_prob(in, PARTITION_VERT_B);
 1495|  42.1k|  if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4);
  ------------------
  |  Branch (1495:7): [True: 40.5k, False: 1.68k]
  ------------------
 1496|  42.1k|  out[0] = AOM_ICDF(out[0]);
  ------------------
  |  |   38|  42.1k|#define AOM_ICDF(x) (CDF_PROB_TOP - (x))
  |  |  ------------------
  |  |  |  |   33|  42.1k|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   32|  42.1k|#define CDF_PROB_BITS 15
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1497|  42.1k|  out[1] = AOM_ICDF(CDF_PROB_TOP);
  ------------------
  |  |   38|  42.1k|#define AOM_ICDF(x) (CDF_PROB_TOP - (x))
  |  |  ------------------
  |  |  |  |   33|  42.1k|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   32|  42.1k|#define CDF_PROB_BITS 15
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1498|  42.1k|}
decodeframe.c:cdf_element_prob:
 1465|   523k|                                            size_t element) {
 1466|   523k|  assert(cdf != NULL);
 1467|  18.4E|  return (element > 0 ? cdf[element - 1] : CDF_PROB_TOP) - cdf[element];
  ------------------
  |  |   33|  18.4E|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  ------------------
  |  |  |  |   32|  18.4E|#define CDF_PROB_BITS 15
  |  |  ------------------
  ------------------
  |  Branch (1467:11): [True: 523k, False: 18.4E]
  ------------------
 1468|   523k|}
decodeframe.c:partition_gather_horz_alike:
 1472|  45.5k|                                               BLOCK_SIZE bsize) {
 1473|  45.5k|  (void)bsize;
 1474|  45.5k|  out[0] = CDF_PROB_TOP;
  ------------------
  |  |   33|  45.5k|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  ------------------
  |  |  |  |   32|  45.5k|#define CDF_PROB_BITS 15
  |  |  ------------------
  ------------------
 1475|  45.5k|  out[0] -= cdf_element_prob(in, PARTITION_HORZ);
 1476|  45.5k|  out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
 1477|  45.5k|  out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
 1478|  45.5k|  out[0] -= cdf_element_prob(in, PARTITION_HORZ_B);
 1479|  45.5k|  out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
 1480|  45.5k|  if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4);
  ------------------
  |  Branch (1480:7): [True: 44.0k, False: 1.49k]
  ------------------
 1481|  45.5k|  out[0] = AOM_ICDF(out[0]);
  ------------------
  |  |   38|  45.5k|#define AOM_ICDF(x) (CDF_PROB_TOP - (x))
  |  |  ------------------
  |  |  |  |   33|  45.5k|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   32|  45.5k|#define CDF_PROB_BITS 15
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1482|  45.5k|  out[1] = AOM_ICDF(CDF_PROB_TOP);
  ------------------
  |  |   38|  45.5k|#define AOM_ICDF(x) (CDF_PROB_TOP - (x))
  |  |  ------------------
  |  |  |  |   33|  45.5k|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   32|  45.5k|#define CDF_PROB_BITS 15
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1483|  45.5k|}
decodeframe.c:get_partition:
 1775|   642k|                                           BLOCK_SIZE bsize) {
 1776|   642k|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
 1777|   642k|  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols)
  ------------------
  |  Branch (1777:7): [True: 17, False: 642k]
  |  Branch (1777:39): [True: 0, False: 642k]
  ------------------
 1778|      0|    return PARTITION_INVALID;
 1779|       |
 1780|   642k|  const int offset = mi_row * mi_params->mi_stride + mi_col;
 1781|   642k|  MB_MODE_INFO **mi = mi_params->mi_grid_base + offset;
 1782|   642k|  const BLOCK_SIZE subsize = mi[0]->bsize;
 1783|       |
 1784|   642k|  assert(bsize < BLOCK_SIZES_ALL);
 1785|       |
 1786|   642k|  if (subsize == bsize) return PARTITION_NONE;
  ------------------
  |  Branch (1786:7): [True: 291k, False: 351k]
  ------------------
 1787|       |
 1788|   351k|  const int bhigh = mi_size_high[bsize];
 1789|   351k|  const int bwide = mi_size_wide[bsize];
 1790|   351k|  const int sshigh = mi_size_high[subsize];
 1791|   351k|  const int sswide = mi_size_wide[subsize];
 1792|       |
 1793|   351k|  if (bsize > BLOCK_8X8 && mi_row + bwide / 2 < mi_params->mi_rows &&
  ------------------
  |  Branch (1793:7): [True: 285k, False: 65.6k]
  |  Branch (1793:28): [True: 263k, False: 22.1k]
  ------------------
 1794|   263k|      mi_col + bhigh / 2 < mi_params->mi_cols) {
  ------------------
  |  Branch (1794:7): [True: 244k, False: 18.7k]
  ------------------
 1795|       |    // In this case, the block might be using an extended partition
 1796|       |    // type.
 1797|   244k|    const MB_MODE_INFO *const mbmi_right = mi[bwide / 2];
 1798|   244k|    const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * mi_params->mi_stride];
 1799|       |
 1800|   244k|    if (sswide == bwide) {
  ------------------
  |  Branch (1800:9): [True: 69.0k, False: 175k]
  ------------------
 1801|       |      // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or
 1802|       |      // PARTITION_HORZ_B. To distinguish the latter two, check if the lower
 1803|       |      // half was split.
 1804|  69.0k|      if (sshigh * 4 == bhigh) return PARTITION_HORZ_4;
  ------------------
  |  Branch (1804:11): [True: 22.1k, False: 46.9k]
  ------------------
 1805|  69.0k|      assert(sshigh * 2 == bhigh);
 1806|       |
 1807|  46.9k|      if (mbmi_below->bsize == subsize)
  ------------------
  |  Branch (1807:11): [True: 35.5k, False: 11.3k]
  ------------------
 1808|  35.5k|        return PARTITION_HORZ;
 1809|  11.3k|      else
 1810|  11.3k|        return PARTITION_HORZ_B;
 1811|   175k|    } else if (sshigh == bhigh) {
  ------------------
  |  Branch (1811:16): [True: 41.3k, False: 134k]
  ------------------
 1812|       |      // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or
 1813|       |      // PARTITION_VERT_B. To distinguish the latter two, check if the right
 1814|       |      // half was split.
 1815|  41.3k|      if (sswide * 4 == bwide) return PARTITION_VERT_4;
  ------------------
  |  Branch (1815:11): [True: 11.7k, False: 29.6k]
  ------------------
 1816|  41.3k|      assert(sswide * 2 == bwide);
 1817|       |
 1818|  29.6k|      if (mbmi_right->bsize == subsize)
  ------------------
  |  Branch (1818:11): [True: 21.6k, False: 7.96k]
  ------------------
 1819|  21.6k|        return PARTITION_VERT;
 1820|  7.96k|      else
 1821|  7.96k|        return PARTITION_VERT_B;
 1822|   134k|    } else {
 1823|       |      // Smaller width and smaller height. Might be PARTITION_SPLIT or could be
 1824|       |      // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both
 1825|       |      // dimensions, we immediately know this is a split (which will recurse to
 1826|       |      // get to subsize). Otherwise look down and to the right. With
 1827|       |      // PARTITION_VERT_A, the right block will have height bhigh; with
 1828|       |      // PARTITION_HORZ_A, the lower block with have width bwide. Otherwise
 1829|       |      // it's PARTITION_SPLIT.
 1830|   134k|      if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT;
  ------------------
  |  Branch (1830:11): [True: 41.5k, False: 92.6k]
  |  Branch (1830:34): [True: 22.9k, False: 69.6k]
  ------------------
 1831|       |
 1832|  69.6k|      if (mi_size_wide[mbmi_below->bsize] == bwide) return PARTITION_HORZ_A;
  ------------------
  |  Branch (1832:11): [True: 9.03k, False: 60.5k]
  ------------------
 1833|  60.5k|      if (mi_size_high[mbmi_right->bsize] == bhigh) return PARTITION_VERT_A;
  ------------------
  |  Branch (1833:11): [True: 6.41k, False: 54.1k]
  ------------------
 1834|       |
 1835|  54.1k|      return PARTITION_SPLIT;
 1836|  60.5k|    }
 1837|   244k|  }
 1838|   106k|  const int vert_split = sswide < bwide;
 1839|   106k|  const int horz_split = sshigh < bhigh;
 1840|   106k|  const int split_idx = (vert_split << 1) | horz_split;
 1841|   106k|  assert(split_idx != 0);
 1842|       |
 1843|   106k|  static const PARTITION_TYPE base_partitions[4] = {
 1844|   106k|    PARTITION_INVALID, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT
 1845|   106k|  };
 1846|       |
 1847|   106k|  return base_partitions[split_idx];
 1848|   351k|}
decodeframe.c:update_ext_partition_context:
 1503|  1.44M|                                                PARTITION_TYPE partition) {
 1504|  1.44M|  if (bsize >= BLOCK_8X8) {
  ------------------
  |  Branch (1504:7): [True: 1.31M, False: 133k]
  ------------------
 1505|  1.31M|    const int hbs = mi_size_wide[bsize] / 2;
 1506|  1.31M|    BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
 1507|  1.31M|    switch (partition) {
 1508|   362k|      case PARTITION_SPLIT:
  ------------------
  |  Branch (1508:7): [True: 362k, False: 951k]
  ------------------
 1509|   362k|        if (bsize != BLOCK_8X8) break;
  ------------------
  |  Branch (1509:13): [True: 329k, False: 33.3k]
  ------------------
 1510|  33.3k|        AOM_FALLTHROUGH_INTENDED;
  ------------------
  |  |   52|  33.3k|  do {                           \
  |  |   53|  33.3k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (53:12): [Folded, False: 33.3k]
  |  |  ------------------
  ------------------
 1511|   519k|      case PARTITION_NONE:
  ------------------
  |  Branch (1511:7): [True: 485k, False: 828k]
  ------------------
 1512|   699k|      case PARTITION_HORZ:
  ------------------
  |  Branch (1512:7): [True: 180k, False: 1.13M]
  ------------------
 1513|   822k|      case PARTITION_VERT:
  ------------------
  |  Branch (1513:7): [True: 122k, False: 1.19M]
  ------------------
 1514|   874k|      case PARTITION_HORZ_4:
  ------------------
  |  Branch (1514:7): [True: 52.5k, False: 1.26M]
  ------------------
 1515|   903k|      case PARTITION_VERT_4:
  ------------------
  |  Branch (1515:7): [True: 28.8k, False: 1.28M]
  ------------------
 1516|   903k|        update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 1517|   903k|        break;
 1518|  21.6k|      case PARTITION_HORZ_A:
  ------------------
  |  Branch (1518:7): [True: 21.6k, False: 1.29M]
  ------------------
 1519|  21.6k|        update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
 1520|  21.6k|        update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize);
 1521|  21.6k|        break;
 1522|  25.5k|      case PARTITION_HORZ_B:
  ------------------
  |  Branch (1522:7): [True: 25.5k, False: 1.28M]
  ------------------
 1523|  25.5k|        update_partition_context(xd, mi_row, mi_col, subsize, subsize);
 1524|  25.5k|        update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize);
 1525|  25.5k|        break;
 1526|  15.7k|      case PARTITION_VERT_A:
  ------------------
  |  Branch (1526:7): [True: 15.7k, False: 1.29M]
  ------------------
 1527|  15.7k|        update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
 1528|  15.7k|        update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize);
 1529|  15.7k|        break;
 1530|  18.6k|      case PARTITION_VERT_B:
  ------------------
  |  Branch (1530:7): [True: 18.6k, False: 1.29M]
  ------------------
 1531|  18.6k|        update_partition_context(xd, mi_row, mi_col, subsize, subsize);
 1532|  18.6k|        update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize);
 1533|  18.6k|        break;
 1534|      0|      default: assert(0 && "Invalid partition type");
  ------------------
  |  Branch (1534:7): [True: 0, False: 1.31M]
  ------------------
 1535|  1.31M|    }
 1536|  1.31M|  }
 1537|  1.44M|}
decodeframe.c:update_partition_context:
 1443|  1.06M|                                            BLOCK_SIZE bsize) {
 1444|  1.06M|  PARTITION_CONTEXT *const above_ctx = xd->above_partition_context + mi_col;
 1445|  1.06M|  PARTITION_CONTEXT *const left_ctx =
 1446|  1.06M|      xd->left_partition_context + (mi_row & MAX_MIB_MASK);
  ------------------
  |  |   50|  1.06M|#define MAX_MIB_MASK (MAX_MIB_SIZE - 1)
  |  |  ------------------
  |  |  |  |   44|  1.06M|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   43|  1.06M|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   31|  1.06M|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   39|  1.06M|#define MI_SIZE_LOG2 2
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1447|       |
 1448|  1.06M|  const int bw = mi_size_wide[bsize];
 1449|  1.06M|  const int bh = mi_size_high[bsize];
 1450|  1.06M|  memset(above_ctx, partition_context_lookup[subsize].above, bw);
 1451|  1.06M|  memset(left_ctx, partition_context_lookup[subsize].left, bh);
 1452|  1.06M|}
decodeframe.c:av1_init_macroblockd:
 1285|  48.6k|static inline void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd) {
 1286|  48.6k|  const int num_planes = av1_num_planes(cm);
 1287|  48.6k|  const CommonQuantParams *const quant_params = &cm->quant_params;
 1288|       |
 1289|   173k|  for (int i = 0; i < num_planes; ++i) {
  ------------------
  |  Branch (1289:19): [True: 125k, False: 48.6k]
  ------------------
 1290|   125k|    if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
  ------------------
  |  Branch (1290:9): [True: 48.5k, False: 76.6k]
  ------------------
 1291|  48.5k|      memcpy(xd->plane[i].seg_dequant_QTX, quant_params->y_dequant_QTX,
 1292|  48.5k|             sizeof(quant_params->y_dequant_QTX));
 1293|  48.5k|      memcpy(xd->plane[i].seg_iqmatrix, quant_params->y_iqmatrix,
 1294|  48.5k|             sizeof(quant_params->y_iqmatrix));
 1295|       |
 1296|  76.6k|    } else {
 1297|  76.6k|      if (i == AOM_PLANE_U) {
  ------------------
  |  |  211|  76.6k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
  |  Branch (1297:11): [True: 38.4k, False: 38.1k]
  ------------------
 1298|  38.4k|        memcpy(xd->plane[i].seg_dequant_QTX, quant_params->u_dequant_QTX,
 1299|  38.4k|               sizeof(quant_params->u_dequant_QTX));
 1300|  38.4k|        memcpy(xd->plane[i].seg_iqmatrix, quant_params->u_iqmatrix,
 1301|  38.4k|               sizeof(quant_params->u_iqmatrix));
 1302|  38.4k|      } else {
 1303|  38.1k|        memcpy(xd->plane[i].seg_dequant_QTX, quant_params->v_dequant_QTX,
 1304|  38.1k|               sizeof(quant_params->v_dequant_QTX));
 1305|  38.1k|        memcpy(xd->plane[i].seg_iqmatrix, quant_params->v_iqmatrix,
 1306|  38.1k|               sizeof(quant_params->v_iqmatrix));
 1307|  38.1k|      }
 1308|  76.6k|    }
 1309|   125k|  }
 1310|  48.6k|  xd->mi_stride = cm->mi_params.mi_stride;
 1311|  48.6k|  xd->error_info = cm->error;
 1312|  48.6k|#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 1313|  48.6k|  cfl_init(&xd->cfl, cm->seq_params);
 1314|  48.6k|#endif
 1315|  48.6k|}
decodemv.c:frame_is_intra_only:
 1174|  3.37M|static inline int frame_is_intra_only(const AV1_COMMON *const cm) {
 1175|  3.37M|  return cm->current_frame.frame_type == KEY_FRAME ||
  ------------------
  |  Branch (1175:10): [True: 3.23M, False: 140k]
  ------------------
 1176|   140k|         cm->current_frame.frame_type == INTRA_ONLY_FRAME;
  ------------------
  |  Branch (1176:10): [True: 1.32k, False: 139k]
  ------------------
 1177|  3.37M|}
decodemv.c:get_mi_grid_idx:
 1656|   123k|                                  int mi_row, int mi_col) {
 1657|   123k|  return mi_row * mi_params->mi_stride + mi_col;
 1658|   123k|}
decodemv.c:av1_num_planes:
 1271|   873k|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|   873k|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|   509k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 363k, False: 509k]
  ------------------
 1273|   873k|}
decodemv.c:get_y_mode_cdf:
 1433|  1.60M|                                           const MB_MODE_INFO *left_mi) {
 1434|  1.60M|  const PREDICTION_MODE above = av1_above_block_mode(above_mi);
 1435|  1.60M|  const PREDICTION_MODE left = av1_left_block_mode(left_mi);
 1436|  1.60M|  const int above_ctx = intra_mode_context[above];
 1437|  1.60M|  const int left_ctx = intra_mode_context[left];
 1438|  1.60M|  return tile_ctx->kf_y_cdf[above_ctx][left_ctx];
 1439|  1.60M|}
decodemv.c:get_ref_scale_factors_const:
 1202|  88.6k|    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1203|  88.6k|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1204|  88.6k|  return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL;
  ------------------
  |  |   15|  88.6k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1204:10): [True: 88.6k, False: 18.4E]
  ------------------
 1205|  88.6k|}
decodemv.c:get_ref_frame_map_idx:
 1187|   101k|                                        const MV_REFERENCE_FRAME ref_frame) {
 1188|   101k|  return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME)
  ------------------
  |  Branch (1188:11): [True: 101k, False: 18.4E]
  |  Branch (1188:38): [True: 101k, False: 18.4E]
  ------------------
 1189|   101k|             ? cm->remapped_ref_idx[ref_frame - LAST_FRAME]
 1190|  18.4E|             : INVALID_IDX;
  ------------------
  |  |   15|  18.4E|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1191|   101k|}
decodemv.c:get_ref_frame_buf:
 1194|  12.4k|    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1195|  12.4k|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1196|  12.4k|  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
  ------------------
  |  |   15|  12.4k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1196:10): [True: 12.4k, False: 18.4E]
  ------------------
 1197|  12.4k|}
decoder.c:calc_mi_size:
 1339|  61.7k|static inline int calc_mi_size(int len) {
 1340|       |  // len is in mi units. Align to a multiple of SBs.
 1341|  61.7k|  return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2);
  ------------------
  |  |   69|  61.7k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
 1342|  61.7k|}
decoder.c:av1_num_planes:
 1271|  13.9M|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|  13.9M|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|  7.16M|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 6.80M, False: 7.16M]
  ------------------
 1273|  13.9M|}
decoder.c:assign_cur_frame_new_fb:
 1138|  29.0k|static inline RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) {
 1139|       |  // Release the previously-used frame-buffer
 1140|  29.0k|  if (cm->cur_frame != NULL) {
  ------------------
  |  Branch (1140:7): [True: 0, False: 29.0k]
  ------------------
 1141|      0|    --cm->cur_frame->ref_count;
 1142|      0|    cm->cur_frame = NULL;
 1143|      0|  }
 1144|       |
 1145|       |  // Assign a new framebuffer
 1146|  29.0k|  const int new_fb_idx = get_free_fb(cm);
 1147|  29.0k|  if (new_fb_idx == INVALID_IDX) return NULL;
  ------------------
  |  |   15|  29.0k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1147:7): [True: 0, False: 29.0k]
  ------------------
 1148|       |
 1149|  29.0k|  cm->cur_frame = &cm->buffer_pool->frame_bufs[new_fb_idx];
 1150|  29.0k|#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
 1151|  29.0k|  aom_invalidate_pyramid(cm->cur_frame->buf.y_pyramid);
 1152|  29.0k|  av1_invalidate_corner_list(cm->cur_frame->buf.corners);
 1153|  29.0k|#endif  // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
 1154|  29.0k|  av1_zero(cm->cur_frame->interp_filter_selected);
  ------------------
  |  |   43|  29.0k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 1155|  29.0k|  return cm->cur_frame;
 1156|  29.0k|}
decoder.c:get_free_fb:
 1104|  29.0k|static inline int get_free_fb(AV1_COMMON *cm) {
 1105|  29.0k|  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
 1106|  29.0k|  int i;
 1107|       |
 1108|  29.0k|  lock_buffer_pool(cm->buffer_pool);
 1109|  29.0k|  const int num_frame_bufs = cm->buffer_pool->num_frame_bufs;
 1110|  45.8k|  for (i = 0; i < num_frame_bufs; ++i)
  ------------------
  |  Branch (1110:15): [True: 45.8k, False: 0]
  ------------------
 1111|  45.8k|    if (frame_bufs[i].ref_count == 0) break;
  ------------------
  |  Branch (1111:9): [True: 29.0k, False: 16.8k]
  ------------------
 1112|       |
 1113|  29.0k|  if (i != num_frame_bufs) {
  ------------------
  |  Branch (1113:7): [True: 29.0k, False: 0]
  ------------------
 1114|  29.0k|    if (frame_bufs[i].buf.use_external_reference_buffers) {
  ------------------
  |  Branch (1114:9): [True: 0, False: 29.0k]
  ------------------
 1115|       |      // If this frame buffer's y_buffer, u_buffer, and v_buffer point to the
 1116|       |      // external reference buffers. Restore the buffer pointers to point to the
 1117|       |      // internally allocated memory.
 1118|      0|      YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf;
 1119|      0|      ybf->y_buffer = ybf->store_buf_adr[0];
 1120|      0|      ybf->u_buffer = ybf->store_buf_adr[1];
 1121|      0|      ybf->v_buffer = ybf->store_buf_adr[2];
 1122|      0|      ybf->use_external_reference_buffers = 0;
 1123|      0|    }
 1124|       |
 1125|  29.0k|    frame_bufs[i].ref_count = 1;
 1126|  29.0k|  } else {
 1127|       |    // We should never run out of free buffers. If this assertion fails, there
 1128|       |    // is a reference leak.
 1129|      0|    assert(0 && "Ran out of free frame buffers. Likely a reference leak.");
 1130|       |    // Reset i to be INVALID_IDX to indicate no free buffer found.
 1131|      0|    i = INVALID_IDX;
  ------------------
  |  |   15|      0|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1132|      0|  }
 1133|       |
 1134|  29.0k|  unlock_buffer_pool(cm->buffer_pool);
 1135|  29.0k|  return i;
 1136|  29.0k|}
decoder.c:lock_buffer_pool:
 1082|  58.0k|static void lock_buffer_pool(BufferPool *const pool) {
 1083|  58.0k|#if CONFIG_MULTITHREAD
 1084|  58.0k|  pthread_mutex_lock(&pool->pool_mutex);
 1085|       |#else
 1086|       |  (void)pool;
 1087|       |#endif
 1088|  58.0k|}
decoder.c:unlock_buffer_pool:
 1090|  58.0k|static void unlock_buffer_pool(BufferPool *const pool) {
 1091|  58.0k|#if CONFIG_MULTITHREAD
 1092|  58.0k|  pthread_mutex_unlock(&pool->pool_mutex);
 1093|       |#else
 1094|       |  (void)pool;
 1095|       |#endif
 1096|  58.0k|}
obu.c:is_valid_seq_level_idx:
 1876|  21.7k|static inline int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) {
 1877|  21.7k|  return seq_level_idx == SEQ_LEVEL_MAX ||
  ------------------
  |  Branch (1877:10): [True: 1.29k, False: 20.4k]
  ------------------
 1878|  20.4k|         (seq_level_idx < SEQ_LEVELS &&
  ------------------
  |  Branch (1878:11): [True: 20.4k, False: 12]
  ------------------
 1879|       |          // The following levels are currently undefined.
 1880|  20.4k|          seq_level_idx != SEQ_LEVEL_2_2 && seq_level_idx != SEQ_LEVEL_2_3 &&
  ------------------
  |  Branch (1880:11): [True: 20.4k, False: 12]
  |  Branch (1880:45): [True: 20.4k, False: 10]
  ------------------
 1881|  20.4k|          seq_level_idx != SEQ_LEVEL_3_2 && seq_level_idx != SEQ_LEVEL_3_3 &&
  ------------------
  |  Branch (1881:11): [True: 20.3k, False: 7]
  |  Branch (1881:45): [True: 20.3k, False: 10]
  ------------------
 1882|  20.3k|          seq_level_idx != SEQ_LEVEL_4_2 && seq_level_idx != SEQ_LEVEL_4_3
  ------------------
  |  Branch (1882:11): [True: 20.3k, False: 5]
  |  Branch (1882:45): [True: 20.3k, False: 6]
  ------------------
 1883|  20.3k|#if !CONFIG_CWG_C013
 1884|  20.3k|          && seq_level_idx != SEQ_LEVEL_7_0 && seq_level_idx != SEQ_LEVEL_7_1 &&
  ------------------
  |  Branch (1884:14): [True: 20.3k, False: 6]
  |  Branch (1884:48): [True: 20.3k, False: 6]
  ------------------
 1885|  20.3k|          seq_level_idx != SEQ_LEVEL_7_2 && seq_level_idx != SEQ_LEVEL_7_3 &&
  ------------------
  |  Branch (1885:11): [True: 20.3k, False: 6]
  |  Branch (1885:45): [True: 20.3k, False: 6]
  ------------------
 1886|  20.3k|          seq_level_idx != SEQ_LEVEL_8_0 && seq_level_idx != SEQ_LEVEL_8_1 &&
  ------------------
  |  Branch (1886:11): [True: 20.3k, False: 8]
  |  Branch (1886:45): [True: 20.3k, False: 7]
  ------------------
 1887|  20.3k|          seq_level_idx != SEQ_LEVEL_8_2 && seq_level_idx != SEQ_LEVEL_8_3
  ------------------
  |  Branch (1887:11): [True: 20.3k, False: 11]
  |  Branch (1887:45): [True: 20.3k, False: 5]
  ------------------
 1888|  20.4k|#endif
 1889|  20.4k|         );
 1890|  21.7k|}
alloccommon.c:av1_num_planes:
 1271|  16.0k|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|  16.0k|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|  12.8k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 3.19k, False: 12.8k]
  ------------------
 1273|  16.0k|}
alloccommon.c:calc_mi_size:
 1339|  17.6k|static inline int calc_mi_size(int len) {
 1340|       |  // len is in mi units. Align to a multiple of SBs.
 1341|  17.6k|  return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2);
  ------------------
  |  |   69|  17.6k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
 1342|  17.6k|}
blockd.c:max_block_wide:
 1566|  44.2k|                                 int plane) {
 1567|  44.2k|  assert(bsize < BLOCK_SIZES_ALL);
 1568|  44.2k|  int max_blocks_wide = block_size_wide[bsize];
 1569|       |
 1570|  44.2k|  if (xd->mb_to_right_edge < 0) {
  ------------------
  |  Branch (1570:7): [True: 44.2k, False: 0]
  ------------------
 1571|  44.2k|    const struct macroblockd_plane *const pd = &xd->plane[plane];
 1572|  44.2k|    max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
 1573|  44.2k|  }
 1574|       |
 1575|       |  // Scale the width in the transform block unit.
 1576|  44.2k|  return max_blocks_wide >> MI_SIZE_LOG2;
  ------------------
  |  |   39|  44.2k|#define MI_SIZE_LOG2 2
  ------------------
 1577|  44.2k|}
blockd.c:max_block_high:
 1580|  27.9k|                                 int plane) {
 1581|  27.9k|  int max_blocks_high = block_size_high[bsize];
 1582|       |
 1583|  27.9k|  if (xd->mb_to_bottom_edge < 0) {
  ------------------
  |  Branch (1583:7): [True: 27.9k, False: 0]
  ------------------
 1584|  27.9k|    const struct macroblockd_plane *const pd = &xd->plane[plane];
 1585|  27.9k|    max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
 1586|  27.9k|  }
 1587|       |
 1588|       |  // Scale the height in the transform block unit.
 1589|  27.9k|  return max_blocks_high >> MI_SIZE_LOG2;
  ------------------
  |  |   39|  27.9k|#define MI_SIZE_LOG2 2
  ------------------
 1590|  27.9k|}
cdef.c:av1_num_planes:
 1271|  18.2k|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|  18.2k|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|  17.1k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 1.09k, False: 17.1k]
  ------------------
 1273|  18.2k|}
cfl.c:max_block_wide:
 1566|  6.53k|                                 int plane) {
 1567|  6.53k|  assert(bsize < BLOCK_SIZES_ALL);
 1568|  6.53k|  int max_blocks_wide = block_size_wide[bsize];
 1569|       |
 1570|  6.53k|  if (xd->mb_to_right_edge < 0) {
  ------------------
  |  Branch (1570:7): [True: 0, False: 6.53k]
  ------------------
 1571|      0|    const struct macroblockd_plane *const pd = &xd->plane[plane];
 1572|      0|    max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
 1573|      0|  }
 1574|       |
 1575|       |  // Scale the width in the transform block unit.
 1576|  6.53k|  return max_blocks_wide >> MI_SIZE_LOG2;
  ------------------
  |  |   39|  6.53k|#define MI_SIZE_LOG2 2
  ------------------
 1577|  6.53k|}
cfl.c:max_block_high:
 1580|  6.53k|                                 int plane) {
 1581|  6.53k|  int max_blocks_high = block_size_high[bsize];
 1582|       |
 1583|  6.53k|  if (xd->mb_to_bottom_edge < 0) {
  ------------------
  |  Branch (1583:7): [True: 0, False: 6.53k]
  ------------------
 1584|      0|    const struct macroblockd_plane *const pd = &xd->plane[plane];
 1585|      0|    max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
 1586|      0|  }
 1587|       |
 1588|       |  // Scale the height in the transform block unit.
 1589|  6.53k|  return max_blocks_high >> MI_SIZE_LOG2;
  ------------------
  |  |   39|  6.53k|#define MI_SIZE_LOG2 2
  ------------------
 1590|  6.53k|}
cfl.c:get_tx_size:
 1708|  6.53k|static inline TX_SIZE get_tx_size(int width, int height) {
 1709|  6.53k|  if (width == height) {
  ------------------
  |  Branch (1709:7): [True: 1.94k, False: 4.58k]
  ------------------
 1710|  1.94k|    return get_sqr_tx_size(width);
 1711|  1.94k|  }
 1712|  4.58k|  if (width < height) {
  ------------------
  |  Branch (1712:7): [True: 1.82k, False: 2.76k]
  ------------------
 1713|  1.82k|    if (width + width == height) {
  ------------------
  |  Branch (1713:9): [True: 1.02k, False: 802]
  ------------------
 1714|  1.02k|      switch (width) {
  ------------------
  |  Branch (1714:15): [True: 1.02k, False: 0]
  ------------------
 1715|  1.02k|        case 4: return TX_4X8; break;
  ------------------
  |  Branch (1715:9): [True: 1.02k, False: 0]
  ------------------
 1716|      0|        case 8: return TX_8X16; break;
  ------------------
  |  Branch (1716:9): [True: 0, False: 1.02k]
  ------------------
 1717|      0|        case 16: return TX_16X32; break;
  ------------------
  |  Branch (1717:9): [True: 0, False: 1.02k]
  ------------------
 1718|      0|        case 32: return TX_32X64; break;
  ------------------
  |  Branch (1718:9): [True: 0, False: 1.02k]
  ------------------
 1719|  1.02k|      }
 1720|  1.02k|    } else {
 1721|    802|      switch (width) {
  ------------------
  |  Branch (1721:15): [True: 802, False: 0]
  ------------------
 1722|    802|        case 4: return TX_4X16; break;
  ------------------
  |  Branch (1722:9): [True: 802, False: 0]
  ------------------
 1723|      0|        case 8: return TX_8X32; break;
  ------------------
  |  Branch (1723:9): [True: 0, False: 802]
  ------------------
 1724|      0|        case 16: return TX_16X64; break;
  ------------------
  |  Branch (1724:9): [True: 0, False: 802]
  ------------------
 1725|    802|      }
 1726|    802|    }
 1727|  2.76k|  } else {
 1728|  2.76k|    if (height + height == width) {
  ------------------
  |  Branch (1728:9): [True: 1.37k, False: 1.39k]
  ------------------
 1729|  1.37k|      switch (height) {
  ------------------
  |  Branch (1729:15): [True: 1.37k, False: 0]
  ------------------
 1730|  1.37k|        case 4: return TX_8X4; break;
  ------------------
  |  Branch (1730:9): [True: 1.37k, False: 0]
  ------------------
 1731|      0|        case 8: return TX_16X8; break;
  ------------------
  |  Branch (1731:9): [True: 0, False: 1.37k]
  ------------------
 1732|      0|        case 16: return TX_32X16; break;
  ------------------
  |  Branch (1732:9): [True: 0, False: 1.37k]
  ------------------
 1733|      0|        case 32: return TX_64X32; break;
  ------------------
  |  Branch (1733:9): [True: 0, False: 1.37k]
  ------------------
 1734|  1.37k|      }
 1735|  1.39k|    } else {
 1736|  1.39k|      switch (height) {
  ------------------
  |  Branch (1736:15): [True: 1.39k, False: 0]
  ------------------
 1737|  1.39k|        case 4: return TX_16X4; break;
  ------------------
  |  Branch (1737:9): [True: 1.39k, False: 0]
  ------------------
 1738|      0|        case 8: return TX_32X8; break;
  ------------------
  |  Branch (1738:9): [True: 0, False: 1.39k]
  ------------------
 1739|      0|        case 16: return TX_64X16; break;
  ------------------
  |  Branch (1739:9): [True: 0, False: 1.39k]
  ------------------
 1740|  1.39k|      }
 1741|  1.39k|    }
 1742|  2.76k|  }
 1743|  4.58k|  assert(0);
 1744|      0|  return TX_4X4;
 1745|  4.58k|}
cfl.c:get_sqr_tx_size:
 1697|  1.94k|static inline TX_SIZE get_sqr_tx_size(int tx_dim) {
 1698|  1.94k|  switch (tx_dim) {
 1699|      0|    case 128:
  ------------------
  |  Branch (1699:5): [True: 0, False: 1.94k]
  ------------------
 1700|      0|    case 64: return TX_64X64; break;
  ------------------
  |  Branch (1700:5): [True: 0, False: 1.94k]
  ------------------
 1701|      0|    case 32: return TX_32X32; break;
  ------------------
  |  Branch (1701:5): [True: 0, False: 1.94k]
  ------------------
 1702|      0|    case 16: return TX_16X16; break;
  ------------------
  |  Branch (1702:5): [True: 0, False: 1.94k]
  ------------------
 1703|      0|    case 8: return TX_8X8; break;
  ------------------
  |  Branch (1703:5): [True: 0, False: 1.94k]
  ------------------
 1704|  1.94k|    default: return TX_4X4;
  ------------------
  |  Branch (1704:5): [True: 1.94k, False: 0]
  ------------------
 1705|  1.94k|  }
 1706|  1.94k|}
mvref_common.c:get_ref_frame_buf:
 1194|   583k|    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1195|   583k|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1196|   583k|  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
  ------------------
  |  |   15|   583k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1196:10): [True: 263k, False: 320k]
  ------------------
 1197|   583k|}
mvref_common.c:get_ref_frame_map_idx:
 1187|   583k|                                        const MV_REFERENCE_FRAME ref_frame) {
 1188|   583k|  return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME)
  ------------------
  |  Branch (1188:11): [True: 583k, False: 0]
  |  Branch (1188:38): [True: 583k, False: 0]
  ------------------
 1189|   583k|             ? cm->remapped_ref_idx[ref_frame - LAST_FRAME]
 1190|   583k|             : INVALID_IDX;
  ------------------
  |  |   15|      0|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1191|   583k|}
mvref_common.c:frame_is_intra_only:
 1174|  17.2k|static inline int frame_is_intra_only(const AV1_COMMON *const cm) {
 1175|  17.2k|  return cm->current_frame.frame_type == KEY_FRAME ||
  ------------------
  |  Branch (1175:10): [True: 9.17k, False: 8.11k]
  ------------------
 1176|  8.11k|         cm->current_frame.frame_type == INTRA_ONLY_FRAME;
  ------------------
  |  Branch (1176:10): [True: 198, False: 7.91k]
  ------------------
 1177|  17.2k|}
reconinter.c:get_ref_frame_buf:
 1194|  25.0k|    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1195|  25.0k|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1196|  25.0k|  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
  ------------------
  |  |   15|  25.0k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1196:10): [True: 25.0k, False: 0]
  ------------------
 1197|  25.0k|}
reconinter.c:get_ref_frame_map_idx:
 1187|  39.0k|                                        const MV_REFERENCE_FRAME ref_frame) {
 1188|  39.0k|  return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME)
  ------------------
  |  Branch (1188:11): [True: 39.0k, False: 0]
  |  Branch (1188:38): [True: 39.0k, False: 0]
  ------------------
 1189|  39.0k|             ? cm->remapped_ref_idx[ref_frame - LAST_FRAME]
 1190|  39.0k|             : INVALID_IDX;
  ------------------
  |  |   15|      0|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1191|  39.0k|}
reconinter.c:av1_num_planes:
 1271|  61.5k|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|  61.5k|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|  59.7k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 1.82k, False: 59.7k]
  ------------------
 1273|  61.5k|}
reconinter.c:get_ref_scale_factors_const:
 1202|  14.0k|    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1203|  14.0k|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1204|  14.0k|  return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL;
  ------------------
  |  |   15|  14.0k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1204:10): [True: 14.0k, False: 0]
  ------------------
 1205|  14.0k|}
resize.c:av1_num_planes:
 1271|     64|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|     64|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|     64|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 0, False: 64]
  ------------------
 1273|     64|}
resize.c:lock_buffer_pool:
 1082|     32|static void lock_buffer_pool(BufferPool *const pool) {
 1083|     32|#if CONFIG_MULTITHREAD
 1084|     32|  pthread_mutex_lock(&pool->pool_mutex);
 1085|       |#else
 1086|       |  (void)pool;
 1087|       |#endif
 1088|     32|}
resize.c:unlock_buffer_pool:
 1090|     32|static void unlock_buffer_pool(BufferPool *const pool) {
 1091|     32|#if CONFIG_MULTITHREAD
 1092|     32|  pthread_mutex_unlock(&pool->pool_mutex);
 1093|       |#else
 1094|       |  (void)pool;
 1095|       |#endif
 1096|     32|}
restoration.c:av1_num_planes:
 1271|  1.42k|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|  1.42k|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|  1.19k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 234, False: 1.19k]
  ------------------
 1273|  1.42k|}
thread_common.c:av1_num_planes:
 1271|  56.2k|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|  56.2k|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|  48.7k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 7.52k, False: 48.7k]
  ------------------
 1273|  56.2k|}
thread_common.c:frame_is_intra_only:
 1174|  14.6k|static inline int frame_is_intra_only(const AV1_COMMON *const cm) {
 1175|  14.6k|  return cm->current_frame.frame_type == KEY_FRAME ||
  ------------------
  |  Branch (1175:10): [True: 10.6k, False: 4.03k]
  ------------------
 1176|  4.03k|         cm->current_frame.frame_type == INTRA_ONLY_FRAME;
  ------------------
  |  Branch (1176:10): [True: 93, False: 3.93k]
  ------------------
 1177|  14.6k|}

av1_highbd_iwht4x4_1_add_c:
   82|  76.0k|                                int dest_stride, int bd) {
   83|  76.0k|  int i;
   84|  76.0k|  tran_low_t a1, e1;
   85|  76.0k|  tran_low_t tmp[4];
   86|  76.0k|  const tran_low_t *ip = in;
   87|  76.0k|  tran_low_t *op = tmp;
   88|  76.0k|  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  ------------------
  |  |   75|  76.0k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   89|  76.0k|  (void)bd;
   90|       |
   91|  76.0k|  a1 = ip[0 * 4] >> UNIT_QUANT_SHIFT;
  ------------------
  |  |   21|  76.0k|#define UNIT_QUANT_SHIFT 2
  ------------------
   92|  76.0k|  e1 = a1 >> 1;
   93|  76.0k|  a1 -= e1;
   94|  76.0k|  op[0] = a1;
   95|  76.0k|  op[1] = op[2] = op[3] = e1;
   96|       |
   97|  76.0k|  ip = tmp;
   98|   380k|  for (i = 0; i < 4; i++) {
  ------------------
  |  Branch (98:15): [True: 304k, False: 76.0k]
  ------------------
   99|   304k|    e1 = ip[0] >> 1;
  100|   304k|    a1 = ip[0] - e1;
  101|   304k|    dest[dest_stride * 0] =
  102|   304k|        highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
  103|   304k|    dest[dest_stride * 1] =
  104|   304k|        highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
  105|   304k|    dest[dest_stride * 2] =
  106|   304k|        highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
  107|   304k|    dest[dest_stride * 3] =
  108|   304k|        highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
  109|   304k|    ip++;
  110|   304k|    dest++;
  111|   304k|  }
  112|  76.0k|}

av1_loop_filter_init:
  110|  17.9k|void av1_loop_filter_init(AV1_COMMON *cm) {
  111|  17.9k|  assert(MB_MODE_COUNT == NELEMENTS(mode_lf_lut));
  112|  17.9k|  loop_filter_info_n *lfi = &cm->lf_info;
  113|  17.9k|  struct loopfilter *lf = &cm->lf;
  114|  17.9k|  int lvl;
  115|       |
  116|       |  // init limits for given sharpness
  117|  17.9k|  update_sharpness(lfi, lf->sharpness_level);
  118|       |
  119|       |  // init hev threshold const vectors
  120|  1.16M|  for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
  ------------------
  |  |   27|  1.16M|#define MAX_LOOP_FILTER 63
  ------------------
  |  Branch (120:17): [True: 1.14M, False: 17.9k]
  ------------------
  121|  1.14M|    memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
  ------------------
  |  |   30|  1.14M|#define SIMD_WIDTH 16
  ------------------
  122|  17.9k|}
av1_loop_filter_frame_init:
  128|  1.86k|                                int plane_end) {
  129|  1.86k|  int filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE];
  130|  1.86k|  int plane;
  131|  1.86k|  int seg_id;
  132|       |  // n_shift is the multiplier for lf_deltas
  133|       |  // the multiplier is 1 for when filter_lvl is between 0 and 31;
  134|       |  // 2 when filter_lvl is between 32 and 63
  135|  1.86k|  loop_filter_info_n *const lfi = &cm->lf_info;
  136|  1.86k|  struct loopfilter *const lf = &cm->lf;
  137|  1.86k|  const struct segmentation *const seg = &cm->seg;
  138|       |
  139|       |  // update sharpness limits
  140|  1.86k|  update_sharpness(lfi, lf->sharpness_level);
  141|       |
  142|  1.86k|  filt_lvl[0] = cm->lf.filter_level[0];
  143|  1.86k|  filt_lvl[1] = cm->lf.filter_level_u;
  144|  1.86k|  filt_lvl[2] = cm->lf.filter_level_v;
  145|       |
  146|  1.86k|  filt_lvl_r[0] = cm->lf.filter_level[1];
  147|  1.86k|  filt_lvl_r[1] = cm->lf.filter_level_u;
  148|  1.86k|  filt_lvl_r[2] = cm->lf.filter_level_v;
  149|       |
  150|  1.86k|  assert(plane_start >= AOM_PLANE_Y);
  151|  1.86k|  assert(plane_end <= MAX_MB_PLANE);
  152|       |
  153|  7.03k|  for (plane = plane_start; plane < plane_end; plane++) {
  ------------------
  |  Branch (153:29): [True: 5.16k, False: 1.86k]
  ------------------
  154|  5.16k|    if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0])
  ------------------
  |  Branch (154:9): [True: 1.86k, False: 3.30k]
  |  Branch (154:23): [True: 387, False: 1.47k]
  |  Branch (154:39): [True: 0, False: 387]
  ------------------
  155|      0|      break;
  156|  5.16k|    else if (plane == 1 && !filt_lvl[1])
  ------------------
  |  Branch (156:14): [True: 1.65k, False: 3.51k]
  |  Branch (156:28): [True: 30, False: 1.62k]
  ------------------
  157|     30|      continue;
  158|  5.13k|    else if (plane == 2 && !filt_lvl[2])
  ------------------
  |  Branch (158:14): [True: 1.65k, False: 3.48k]
  |  Branch (158:28): [True: 55, False: 1.59k]
  ------------------
  159|     55|      continue;
  160|       |
  161|  45.7k|    for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
  ------------------
  |  |   21|  45.7k|#define MAX_SEGMENTS 8
  ------------------
  |  Branch (161:22): [True: 40.6k, False: 5.08k]
  ------------------
  162|   121k|      for (int dir = 0; dir < 2; ++dir) {
  ------------------
  |  Branch (162:25): [True: 81.3k, False: 40.6k]
  ------------------
  163|  81.3k|        int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane];
  ------------------
  |  Branch (163:23): [True: 40.6k, False: 40.6k]
  ------------------
  164|  81.3k|        const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir];
  165|  81.3k|        if (segfeature_active(seg, seg_id, seg_lf_feature_id)) {
  ------------------
  |  Branch (165:13): [True: 7.34k, False: 73.9k]
  ------------------
  166|  7.34k|          const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id);
  167|  7.34k|          lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
  ------------------
  |  |   27|  7.34k|#define MAX_LOOP_FILTER 63
  ------------------
  168|  7.34k|        }
  169|       |
  170|  81.3k|        if (!lf->mode_ref_delta_enabled) {
  ------------------
  |  Branch (170:13): [True: 21.0k, False: 60.3k]
  ------------------
  171|       |          // we could get rid of this if we assume that deltas are set to
  172|       |          // zero when not in use; encoder always uses deltas
  173|  21.0k|          memset(lfi->lvl[plane][seg_id][dir], lvl_seg,
  174|  21.0k|                 sizeof(lfi->lvl[plane][seg_id][dir]));
  175|  60.3k|        } else {
  176|  60.3k|          int ref, mode;
  177|  60.3k|          const int scale = 1 << (lvl_seg >> 5);
  178|  60.3k|          const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
  179|  60.3k|          lfi->lvl[plane][seg_id][dir][INTRA_FRAME][0] =
  180|  60.3k|              clamp(intra_lvl, 0, MAX_LOOP_FILTER);
  ------------------
  |  |   27|  60.3k|#define MAX_LOOP_FILTER 63
  ------------------
  181|       |
  182|   482k|          for (ref = LAST_FRAME; ref < REF_FRAMES; ++ref) {
  ------------------
  |  Branch (182:34): [True: 422k, False: 60.3k]
  ------------------
  183|  1.26M|            for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
  ------------------
  |  |   74|  1.26M|#define MAX_MODE_LF_DELTAS 2
  ------------------
  |  Branch (183:28): [True: 844k, False: 422k]
  ------------------
  184|   844k|              const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale +
  185|   844k|                                    lf->mode_deltas[mode] * scale;
  186|   844k|              lfi->lvl[plane][seg_id][dir][ref][mode] =
  187|   844k|                  clamp(inter_lvl, 0, MAX_LOOP_FILTER);
  ------------------
  |  |   27|   844k|#define MAX_LOOP_FILTER 63
  ------------------
  188|   844k|            }
  189|   422k|          }
  190|  60.3k|        }
  191|  81.3k|      }
  192|  40.6k|    }
  193|  5.08k|  }
  194|  1.86k|}
av1_filter_block_plane_vert:
 1308|  9.21k|                                 const uint32_t mi_row, const uint32_t mi_col) {
 1309|  9.21k|  const uint32_t scale_horz = plane_ptr->subsampling_x;
 1310|  9.21k|  const uint32_t scale_vert = plane_ptr->subsampling_y;
 1311|  9.21k|  uint8_t *const dst_ptr = plane_ptr->dst.buf;
 1312|  9.21k|  const int dst_stride = plane_ptr->dst.stride;
 1313|  9.21k|  const int plane_mi_rows =
 1314|  9.21k|      ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert);
  ------------------
  |  |   41|  9.21k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1315|  9.21k|  const int plane_mi_cols =
 1316|  9.21k|      ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz);
  ------------------
  |  |   41|  9.21k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1317|  9.21k|  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
  ------------------
  |  |   34|  9.21k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.44k, False: 3.76k]
  |  |  ------------------
  ------------------
 1318|  9.21k|                             (MAX_MIB_SIZE >> scale_vert));
 1319|  9.21k|  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
  ------------------
  |  |   34|  9.21k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.27k, False: 3.93k]
  |  |  ------------------
  ------------------
 1320|  9.21k|                             (MAX_MIB_SIZE >> scale_horz));
 1321|       |
 1322|   207k|  for (int y = 0; y < y_range; y++) {
  ------------------
  |  Branch (1322:19): [True: 197k, False: 9.21k]
  ------------------
 1323|   197k|    uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
  ------------------
  |  |   40|   197k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   197k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1324|  1.43M|    for (int x = 0; x < x_range;) {
  ------------------
  |  Branch (1324:21): [True: 1.23M, False: 197k]
  ------------------
 1325|       |      // inner loop always filter vertical edges in a MI block. If MI size
 1326|       |      // is 8x8, it will filter the vertical edge aligned with a 8x8 block.
 1327|       |      // If 4x4 transform is used, it will then filter the internal edge
 1328|       |      //  aligned with a 4x4 block
 1329|  1.23M|      const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
  ------------------
  |  |   40|  1.23M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.23M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                    const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
  ------------------
  |  |   40|  1.23M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.23M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1330|  1.23M|      const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
  ------------------
  |  |   40|  1.23M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.23M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                    const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
  ------------------
  |  |   40|  1.23M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.23M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1331|  1.23M|      uint32_t advance_units;
 1332|  1.23M|      TX_SIZE tx_size;
 1333|  1.23M|      AV1_DEBLOCKING_PARAMETERS params;
 1334|  1.23M|      memset(&params, 0, sizeof(params));
 1335|       |
 1336|  1.23M|      tx_size =
 1337|  1.23M|          set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd,
 1338|  1.23M|                             VERT_EDGE, curr_x, curr_y, plane, plane_ptr);
 1339|  1.23M|      if (tx_size == TX_INVALID) {
  ------------------
  |  Branch (1339:11): [True: 0, False: 1.23M]
  ------------------
 1340|      0|        params.filter_length = 0;
 1341|      0|        tx_size = TX_4X4;
 1342|      0|      }
 1343|       |
 1344|  1.23M|      filter_vert(p, dst_stride, &params, cm->seq_params, USE_SINGLE);
 1345|       |
 1346|       |      // advance the destination pointer
 1347|  1.23M|      advance_units = tx_size_wide_unit[tx_size];
 1348|  1.23M|      x += advance_units;
 1349|  1.23M|      p += advance_units * MI_SIZE;
  ------------------
  |  |   40|  1.23M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.23M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1350|  1.23M|    }
 1351|   197k|  }
 1352|  9.21k|}
av1_filter_block_plane_horz:
 1910|  9.21k|                                 const uint32_t mi_row, const uint32_t mi_col) {
 1911|  9.21k|  const uint32_t scale_horz = plane_ptr->subsampling_x;
 1912|  9.21k|  const uint32_t scale_vert = plane_ptr->subsampling_y;
 1913|  9.21k|  uint8_t *const dst_ptr = plane_ptr->dst.buf;
 1914|  9.21k|  const int dst_stride = plane_ptr->dst.stride;
 1915|  9.21k|  const int plane_mi_rows =
 1916|  9.21k|      ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert);
  ------------------
  |  |   41|  9.21k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1917|  9.21k|  const int plane_mi_cols =
 1918|  9.21k|      ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz);
  ------------------
  |  |   41|  9.21k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1919|  9.21k|  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
  ------------------
  |  |   34|  9.21k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.44k, False: 3.76k]
  |  |  ------------------
  ------------------
 1920|  9.21k|                             (MAX_MIB_SIZE >> scale_vert));
 1921|  9.21k|  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
  ------------------
  |  |   34|  9.21k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.26k, False: 3.94k]
  |  |  ------------------
  ------------------
 1922|  9.21k|                             (MAX_MIB_SIZE >> scale_horz));
 1923|   189k|  for (int x = 0; x < x_range; x++) {
  ------------------
  |  Branch (1923:19): [True: 180k, False: 9.21k]
  ------------------
 1924|   180k|    uint8_t *p = dst_ptr + x * MI_SIZE;
  ------------------
  |  |   40|   180k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   180k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1925|  1.45M|    for (int y = 0; y < y_range;) {
  ------------------
  |  Branch (1925:21): [True: 1.27M, False: 180k]
  ------------------
 1926|       |      // inner loop always filter vertical edges in a MI block. If MI size
 1927|       |      // is 8x8, it will first filter the vertical edge aligned with a 8x8
 1928|       |      // block. If 4x4 transform is used, it will then filter the internal
 1929|       |      // edge aligned with a 4x4 block
 1930|  1.27M|      const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
  ------------------
  |  |   40|  1.27M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.27M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                    const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
  ------------------
  |  |   40|  1.27M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.27M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1931|  1.27M|      const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
  ------------------
  |  |   40|  1.27M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.27M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                    const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
  ------------------
  |  |   40|  1.27M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.27M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1932|  1.27M|      uint32_t advance_units;
 1933|  1.27M|      TX_SIZE tx_size;
 1934|  1.27M|      AV1_DEBLOCKING_PARAMETERS params;
 1935|  1.27M|      memset(&params, 0, sizeof(params));
 1936|       |
 1937|  1.27M|      tx_size = set_lpf_parameters(
 1938|  1.27M|          &params, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE,
 1939|  1.27M|          curr_x, curr_y, plane, plane_ptr);
 1940|  1.27M|      if (tx_size == TX_INVALID) {
  ------------------
  |  Branch (1940:11): [True: 0, False: 1.27M]
  ------------------
 1941|      0|        params.filter_length = 0;
 1942|      0|        tx_size = TX_4X4;
 1943|      0|      }
 1944|       |
 1945|  1.27M|      filter_horz(p, dst_stride, &params, cm->seq_params, USE_SINGLE);
 1946|       |
 1947|       |      // advance the destination pointer
 1948|  1.27M|      advance_units = tx_size_high_unit[tx_size];
 1949|  1.27M|      y += advance_units;
 1950|  1.27M|      p += advance_units * dst_stride * MI_SIZE;
  ------------------
  |  |   40|  1.27M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.27M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1951|  1.27M|    }
 1952|   180k|  }
 1953|  9.21k|}
av1_loopfilter.c:update_sharpness:
   47|  19.8k|static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
   48|  19.8k|  int lvl;
   49|       |
   50|       |  // For each possible value for the loop filter fill out limits
   51|  1.28M|  for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) {
  ------------------
  |  |   27|  1.28M|#define MAX_LOOP_FILTER 63
  ------------------
  |  Branch (51:17): [True: 1.26M, False: 19.8k]
  ------------------
   52|       |    // Set loop filter parameters that control sharpness.
   53|  1.26M|    int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4));
   54|       |
   55|  1.26M|    if (sharpness_lvl > 0) {
  ------------------
  |  Branch (55:9): [True: 52.2k, False: 1.21M]
  ------------------
   56|  52.2k|      if (block_inside_limit > (9 - sharpness_lvl))
  ------------------
  |  Branch (56:11): [True: 39.4k, False: 12.8k]
  ------------------
   57|  39.4k|        block_inside_limit = (9 - sharpness_lvl);
   58|  52.2k|    }
   59|       |
   60|  1.26M|    if (block_inside_limit < 1) block_inside_limit = 1;
  ------------------
  |  Branch (60:9): [True: 21.4k, False: 1.24M]
  ------------------
   61|       |
   62|  1.26M|    memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
  ------------------
  |  |   30|  1.26M|#define SIMD_WIDTH 16
  ------------------
   63|  1.26M|    memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
   64|  1.26M|           SIMD_WIDTH);
  ------------------
  |  |   30|  1.26M|#define SIMD_WIDTH 16
  ------------------
   65|  1.26M|  }
   66|  19.8k|}
av1_loopfilter.c:set_lpf_parameters:
  228|  2.43M|    const int plane, const struct macroblockd_plane *const plane_ptr) {
  229|       |  // reset to initial values
  230|  2.43M|  params->filter_length = 0;
  231|       |
  232|       |  // no deblocking is required
  233|  2.43M|  const uint32_t width = plane_ptr->dst.width;
  234|  2.43M|  const uint32_t height = plane_ptr->dst.height;
  235|  2.43M|  if ((width <= x) || (height <= y)) {
  ------------------
  |  Branch (235:7): [True: 26.6k, False: 2.40M]
  |  Branch (235:23): [True: 40.3k, False: 2.36M]
  ------------------
  236|       |    // just return the smallest transform unit size
  237|  70.4k|    return TX_4X4;
  238|  70.4k|  }
  239|       |
  240|  2.36M|  const uint32_t scale_horz = plane_ptr->subsampling_x;
  241|  2.36M|  const uint32_t scale_vert = plane_ptr->subsampling_y;
  242|       |  // for sub8x8 block, chroma prediction mode is obtained from the bottom/right
  243|       |  // mi structure of the co-located 8x8 luma block. so for chroma plane, mi_row
  244|       |  // and mi_col should map to the bottom/right mi structure, i.e, both mi_row
  245|       |  // and mi_col should be odd number for chroma plane.
  246|  2.36M|  const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2);
  ------------------
  |  |   39|  2.36M|#define MI_SIZE_LOG2 2
  ------------------
  247|  2.36M|  const int mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2);
  ------------------
  |  |   39|  2.36M|#define MI_SIZE_LOG2 2
  ------------------
  248|  2.36M|  MB_MODE_INFO **mi =
  249|  2.36M|      cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col;
  250|  2.36M|  const MB_MODE_INFO *mbmi = mi[0];
  251|       |  // If current mbmi is not correctly setup, return an invalid value to stop
  252|       |  // filtering. One example is that if this tile is not coded, then its mbmi
  253|       |  // it not set up.
  254|  2.36M|  if (mbmi == NULL) return TX_INVALID;
  ------------------
  |  Branch (254:7): [True: 0, False: 2.36M]
  ------------------
  255|       |
  256|  2.36M|  const TX_SIZE ts = get_transform_size(xd, mi[0], mi_row, mi_col, plane,
  257|  2.36M|                                        scale_horz, scale_vert);
  258|       |
  259|  2.36M|  {
  260|  2.36M|    const uint32_t coord = (VERT_EDGE == edge_dir) ? (x) : (y);
  ------------------
  |  Branch (260:28): [True: 1.15M, False: 1.20M]
  ------------------
  261|  2.36M|    const uint32_t transform_masks =
  262|  2.36M|        edge_dir == VERT_EDGE ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1;
  ------------------
  |  Branch (262:9): [True: 1.15M, False: 1.20M]
  ------------------
  263|  2.36M|    const int32_t tu_edge = (coord & transform_masks) ? (0) : (1);
  ------------------
  |  Branch (263:29): [True: 0, False: 2.36M]
  ------------------
  264|       |
  265|  2.36M|    if (!tu_edge) return ts;
  ------------------
  |  Branch (265:9): [True: 0, False: 2.36M]
  ------------------
  266|       |
  267|       |    // prepare outer edge parameters. deblock the edge if it's an edge of a TU
  268|  2.36M|    {
  269|  2.36M|      const uint32_t curr_level =
  270|  2.36M|          get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
  271|  2.36M|      const int curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi);
  ------------------
  |  Branch (271:32): [True: 773k, False: 1.58M]
  |  Branch (271:51): [True: 320, False: 773k]
  ------------------
  272|  2.36M|      uint32_t level = curr_level;
  273|  2.36M|      if (coord) {
  ------------------
  |  Branch (273:11): [True: 2.15M, False: 202k]
  ------------------
  274|  2.15M|        {
  275|  2.15M|          const MB_MODE_INFO *const mi_prev = *(mi - mode_step);
  276|  2.15M|          if (mi_prev == NULL) return TX_INVALID;
  ------------------
  |  Branch (276:15): [True: 0, False: 2.15M]
  ------------------
  277|  2.15M|          const int pv_row =
  278|  2.15M|              (VERT_EDGE == edge_dir) ? (mi_row) : (mi_row - (1 << scale_vert));
  ------------------
  |  Branch (278:15): [True: 1.04M, False: 1.11M]
  ------------------
  279|  2.15M|          const int pv_col =
  280|  2.15M|              (VERT_EDGE == edge_dir) ? (mi_col - (1 << scale_horz)) : (mi_col);
  ------------------
  |  Branch (280:15): [True: 1.04M, False: 1.11M]
  ------------------
  281|  2.15M|          const TX_SIZE pv_ts = get_transform_size(
  282|  2.15M|              xd, mi_prev, pv_row, pv_col, plane, scale_horz, scale_vert);
  283|       |
  284|  2.15M|          const uint32_t pv_lvl =
  285|  2.15M|              get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
  286|       |
  287|  2.15M|          const int pv_skip_txfm =
  288|  2.15M|              mi_prev->skip_txfm && is_inter_block(mi_prev);
  ------------------
  |  Branch (288:15): [True: 726k, False: 1.43M]
  |  Branch (288:37): [True: 0, False: 726k]
  ------------------
  289|  2.15M|          const BLOCK_SIZE bsize = get_plane_block_size(
  290|  2.15M|              mbmi->bsize, plane_ptr->subsampling_x, plane_ptr->subsampling_y);
  291|  2.15M|          assert(bsize < BLOCK_SIZES_ALL);
  292|  2.15M|          const int prediction_masks = edge_dir == VERT_EDGE
  ------------------
  |  Branch (292:40): [True: 1.07M, False: 1.08M]
  ------------------
  293|  2.15M|                                           ? block_size_wide[bsize] - 1
  294|  2.15M|                                           : block_size_high[bsize] - 1;
  295|  2.15M|          const int32_t pu_edge = !(coord & prediction_masks);
  296|       |          // if the current and the previous blocks are skipped,
  297|       |          // deblock the edge if the edge belongs to a PU's edge only.
  298|  2.15M|          if ((curr_level || pv_lvl) &&
  ------------------
  |  Branch (298:16): [True: 1.77M, False: 386k]
  |  Branch (298:30): [True: 13.6k, False: 373k]
  ------------------
  299|  1.82M|              (!pv_skip_txfm || !curr_skipped || pu_edge)) {
  ------------------
  |  Branch (299:16): [True: 1.82M, False: 0]
  |  Branch (299:33): [True: 0, False: 0]
  |  Branch (299:50): [True: 0, False: 0]
  ------------------
  300|  1.81M|            const int dim = (VERT_EDGE == edge_dir)
  ------------------
  |  Branch (300:29): [True: 881k, False: 937k]
  ------------------
  301|  1.81M|                                ? AOMMIN(tx_size_wide_unit_log2[ts],
  ------------------
  |  |   34|   881k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 124k, False: 757k]
  |  |  ------------------
  ------------------
  302|  1.81M|                                         tx_size_wide_unit_log2[pv_ts])
  303|  1.81M|                                : AOMMIN(tx_size_high_unit_log2[ts],
  ------------------
  |  |   34|   937k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 162k, False: 774k]
  |  |  ------------------
  ------------------
  304|  1.81M|                                         tx_size_high_unit_log2[pv_ts]);
  305|  1.81M|            if (plane) {
  ------------------
  |  Branch (305:17): [True: 1.07M, False: 747k]
  ------------------
  306|  1.07M|              params->filter_length = (dim == 0) ? 4 : 6;
  ------------------
  |  Branch (306:39): [True: 252k, False: 820k]
  ------------------
  307|  1.07M|            } else {
  308|   747k|              assert(dim < TX_SIZES);
  309|   747k|              assert(dim >= 0);
  310|   747k|              params->filter_length = tx_dim_to_filter_length[dim];
  311|   747k|            }
  312|       |
  313|       |            // update the level if the current block is skipped,
  314|       |            // but the previous one is not
  315|  1.81M|            level = (curr_level) ? (curr_level) : (pv_lvl);
  ------------------
  |  Branch (315:21): [True: 1.75M, False: 67.5k]
  ------------------
  316|  1.81M|          }
  317|  2.15M|        }
  318|  2.15M|      }
  319|       |      // prepare common parameters
  320|  2.36M|      if (params->filter_length) {
  ------------------
  |  Branch (320:11): [True: 1.77M, False: 587k]
  ------------------
  321|  1.77M|        const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;
  322|  1.77M|        params->lfthr = limits;
  323|  1.77M|      }
  324|  2.36M|    }
  325|  2.36M|  }
  326|       |
  327|      0|  return ts;
  328|  2.36M|}
av1_loopfilter.c:get_transform_size:
  199|  4.16M|                   const int ss_x, const int ss_y) {
  200|  4.16M|  assert(mbmi != NULL);
  201|  4.18M|  if (xd && xd->lossless[mbmi->segment_id]) return TX_4X4;
  ------------------
  |  Branch (201:7): [True: 4.18M, False: 18.4E]
  |  Branch (201:13): [True: 280k, False: 3.90M]
  ------------------
  202|       |
  203|  3.88M|  TX_SIZE tx_size = (plane == AOM_PLANE_Y)
  ------------------
  |  |  210|  3.88M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (203:21): [True: 1.82M, False: 2.06M]
  ------------------
  204|  3.88M|                        ? mbmi->tx_size
  205|  3.88M|                        : av1_get_max_uv_txsize(mbmi->bsize, ss_x, ss_y);
  206|  3.88M|  assert(tx_size < TX_SIZES_ALL);
  207|  3.88M|  if ((plane == AOM_PLANE_Y) && is_inter_block(mbmi) && !mbmi->skip_txfm) {
  ------------------
  |  |  210|  3.88M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (207:7): [True: 1.82M, False: 2.06M]
  |  Branch (207:33): [True: 448, False: 1.82M]
  |  Branch (207:57): [True: 128, False: 320]
  ------------------
  208|    128|    const BLOCK_SIZE sb_type = mbmi->bsize;
  209|    128|    const int blk_row = mi_row & (mi_size_high[sb_type] - 1);
  210|    128|    const int blk_col = mi_col & (mi_size_wide[sb_type] - 1);
  211|    128|    const TX_SIZE mb_tx_size =
  212|    128|        mbmi->inter_tx_size[av1_get_txb_size_index(sb_type, blk_row, blk_col)];
  213|    128|    assert(mb_tx_size < TX_SIZES_ALL);
  214|    128|    tx_size = mb_tx_size;
  215|    128|  }
  216|       |
  217|  3.88M|  return tx_size;
  218|  4.16M|}
av1_loopfilter.c:get_filter_level:
   71|  4.19M|                                const MB_MODE_INFO *mbmi) {
   72|  4.19M|  const int segment_id = mbmi->segment_id;
   73|  4.19M|  if (cm->delta_q_info.delta_lf_present_flag) {
  ------------------
  |  Branch (73:7): [True: 891k, False: 3.30M]
  ------------------
   74|   891k|    int8_t delta_lf;
   75|   891k|    if (cm->delta_q_info.delta_lf_multi) {
  ------------------
  |  Branch (75:9): [True: 805k, False: 85.5k]
  ------------------
   76|   805k|      const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx];
   77|   805k|      delta_lf = mbmi->delta_lf[delta_lf_idx];
   78|   805k|    } else {
   79|  85.5k|      delta_lf = mbmi->delta_lf_from_base;
   80|  85.5k|    }
   81|   891k|    int base_level;
   82|   891k|    if (plane == 0)
  ------------------
  |  Branch (82:9): [True: 434k, False: 456k]
  ------------------
   83|   434k|      base_level = cm->lf.filter_level[dir_idx];
   84|   456k|    else if (plane == 1)
  ------------------
  |  Branch (84:14): [True: 332k, False: 123k]
  ------------------
   85|   332k|      base_level = cm->lf.filter_level_u;
   86|   123k|    else
   87|   123k|      base_level = cm->lf.filter_level_v;
   88|   891k|    int lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER);
  ------------------
  |  |   27|   891k|#define MAX_LOOP_FILTER 63
  ------------------
   89|   891k|    assert(plane >= 0 && plane <= 2);
   90|   891k|    const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx];
   91|   891k|    if (segfeature_active(&cm->seg, segment_id, seg_lf_feature_id)) {
  ------------------
  |  Branch (91:9): [True: 554k, False: 336k]
  ------------------
   92|   554k|      const int data = get_segdata(&cm->seg, segment_id, seg_lf_feature_id);
   93|   554k|      lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
  ------------------
  |  |   27|   554k|#define MAX_LOOP_FILTER 63
  ------------------
   94|   554k|    }
   95|       |
   96|   891k|    if (cm->lf.mode_ref_delta_enabled) {
  ------------------
  |  Branch (96:9): [True: 768k, False: 122k]
  ------------------
   97|   768k|      const int scale = 1 << (lvl_seg >> 5);
   98|   768k|      lvl_seg += cm->lf.ref_deltas[mbmi->ref_frame[0]] * scale;
   99|   768k|      if (mbmi->ref_frame[0] > INTRA_FRAME)
  ------------------
  |  Branch (99:11): [True: 0, False: 768k]
  ------------------
  100|      0|        lvl_seg += cm->lf.mode_deltas[mode_lf_lut[mbmi->mode]] * scale;
  101|   768k|      lvl_seg = clamp(lvl_seg, 0, MAX_LOOP_FILTER);
  ------------------
  |  |   27|   768k|#define MAX_LOOP_FILTER 63
  ------------------
  102|   768k|    }
  103|   891k|    return lvl_seg;
  104|  3.30M|  } else {
  105|  3.30M|    return lfi_n->lvl[plane][segment_id][dir_idx][mbmi->ref_frame[0]]
  106|  3.30M|                     [mode_lf_lut[mbmi->mode]];
  107|  3.30M|  }
  108|  4.19M|}
av1_loopfilter.c:filter_vert:
  909|  1.23M|                               USE_FILTER_TYPE use_filter_type) {
  910|  1.23M|  const loop_filter_thresh *limits = params->lfthr;
  911|  1.23M|#if CONFIG_AV1_HIGHBITDEPTH
  912|  1.23M|  const int use_highbitdepth = seq_params->use_highbitdepth;
  913|  1.23M|  const aom_bit_depth_t bit_depth = seq_params->bit_depth;
  914|  1.23M|  if (use_highbitdepth) {
  ------------------
  |  Branch (914:7): [True: 376k, False: 861k]
  ------------------
  915|   376k|    uint16_t *dst_shortptr = CONVERT_TO_SHORTPTR(dst);
  ------------------
  |  |   75|   376k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  916|   376k|    if (use_filter_type == USE_QUAD) {
  ------------------
  |  Branch (916:9): [True: 0, False: 376k]
  ------------------
  917|      0|      switch (params->filter_length) {
  918|       |        // apply 4-tap filtering
  919|      0|        case 4:
  ------------------
  |  Branch (919:9): [True: 0, False: 0]
  ------------------
  920|      0|          aom_highbd_lpf_vertical_4_dual(
  921|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
  922|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
  923|      0|              bit_depth);
  924|      0|          aom_highbd_lpf_vertical_4_dual(
  925|      0|              dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  926|      0|              limits->mblim, limits->lim, limits->hev_thr, limits->mblim,
  927|      0|              limits->lim, limits->hev_thr, bit_depth);
  928|      0|          break;
  929|      0|        case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (929:9): [True: 0, False: 0]
  ------------------
  930|      0|          aom_highbd_lpf_vertical_6_dual(
  ------------------
  |  | 2597|      0|#define aom_highbd_lpf_vertical_6_dual aom_highbd_lpf_vertical_6_dual_sse2
  ------------------
  931|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
  932|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
  933|      0|              bit_depth);
  934|      0|          aom_highbd_lpf_vertical_6_dual(
  ------------------
  |  | 2597|      0|#define aom_highbd_lpf_vertical_6_dual aom_highbd_lpf_vertical_6_dual_sse2
  ------------------
  935|      0|              dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  936|      0|              limits->mblim, limits->lim, limits->hev_thr, limits->mblim,
  937|      0|              limits->lim, limits->hev_thr, bit_depth);
  938|      0|          break;
  939|       |        // apply 8-tap filtering
  940|      0|        case 8:
  ------------------
  |  Branch (940:9): [True: 0, False: 0]
  ------------------
  941|      0|          aom_highbd_lpf_vertical_8_dual(
  942|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
  943|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
  944|      0|              bit_depth);
  945|      0|          aom_highbd_lpf_vertical_8_dual(
  946|      0|              dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  947|      0|              limits->mblim, limits->lim, limits->hev_thr, limits->mblim,
  948|      0|              limits->lim, limits->hev_thr, bit_depth);
  949|      0|          break;
  950|       |        // apply 14-tap filtering
  951|      0|        case 14:
  ------------------
  |  Branch (951:9): [True: 0, False: 0]
  ------------------
  952|      0|          aom_highbd_lpf_vertical_14_dual(
  953|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
  954|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
  955|      0|              bit_depth);
  956|      0|          aom_highbd_lpf_vertical_14_dual(
  957|      0|              dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  958|      0|              limits->mblim, limits->lim, limits->hev_thr, limits->mblim,
  959|      0|              limits->lim, limits->hev_thr, bit_depth);
  960|      0|          break;
  961|       |        // no filtering
  962|      0|        default: break;
  ------------------
  |  Branch (962:9): [True: 0, False: 0]
  ------------------
  963|      0|      }
  964|   376k|    } else if (use_filter_type == USE_DUAL) {
  ------------------
  |  Branch (964:16): [True: 0, False: 376k]
  ------------------
  965|      0|      switch (params->filter_length) {
  966|       |        // apply 4-tap filtering
  967|      0|        case 4:
  ------------------
  |  Branch (967:9): [True: 0, False: 0]
  ------------------
  968|      0|          aom_highbd_lpf_vertical_4_dual(
  969|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
  970|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
  971|      0|              bit_depth);
  972|      0|          break;
  973|      0|        case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (973:9): [True: 0, False: 0]
  ------------------
  974|      0|          aom_highbd_lpf_vertical_6_dual(
  ------------------
  |  | 2597|      0|#define aom_highbd_lpf_vertical_6_dual aom_highbd_lpf_vertical_6_dual_sse2
  ------------------
  975|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
  976|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
  977|      0|              bit_depth);
  978|      0|          break;
  979|       |        // apply 8-tap filtering
  980|      0|        case 8:
  ------------------
  |  Branch (980:9): [True: 0, False: 0]
  ------------------
  981|      0|          aom_highbd_lpf_vertical_8_dual(
  982|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
  983|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
  984|      0|              bit_depth);
  985|      0|          break;
  986|       |        // apply 14-tap filtering
  987|      0|        case 14:
  ------------------
  |  Branch (987:9): [True: 0, False: 0]
  ------------------
  988|      0|          aom_highbd_lpf_vertical_14_dual(
  989|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
  990|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
  991|      0|              bit_depth);
  992|      0|          break;
  993|       |        // no filtering
  994|      0|        default: break;
  ------------------
  |  Branch (994:9): [True: 0, False: 0]
  ------------------
  995|      0|      }
  996|   376k|    } else {
  997|   376k|      assert(use_filter_type == USE_SINGLE);
  998|   376k|      switch (params->filter_length) {
  999|       |        // apply 4-tap filtering
 1000|  72.9k|        case 4:
  ------------------
  |  Branch (1000:9): [True: 72.9k, False: 303k]
  ------------------
 1001|  72.9k|          aom_highbd_lpf_vertical_4(dst_shortptr, dst_stride, limits->mblim,
  ------------------
  |  | 2584|  72.9k|#define aom_highbd_lpf_vertical_4 aom_highbd_lpf_vertical_4_sse2
  ------------------
 1002|  72.9k|                                    limits->lim, limits->hev_thr, bit_depth);
 1003|  72.9k|          break;
 1004|   131k|        case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1004:9): [True: 131k, False: 245k]
  ------------------
 1005|   131k|          aom_highbd_lpf_vertical_6(dst_shortptr, dst_stride, limits->mblim,
  ------------------
  |  | 2593|   131k|#define aom_highbd_lpf_vertical_6 aom_highbd_lpf_vertical_6_sse2
  ------------------
 1006|   131k|                                    limits->lim, limits->hev_thr, bit_depth);
 1007|   131k|          break;
 1008|       |        // apply 8-tap filtering
 1009|  46.8k|        case 8:
  ------------------
  |  Branch (1009:9): [True: 46.8k, False: 330k]
  ------------------
 1010|  46.8k|          aom_highbd_lpf_vertical_8(dst_shortptr, dst_stride, limits->mblim,
  ------------------
  |  | 2601|  46.8k|#define aom_highbd_lpf_vertical_8 aom_highbd_lpf_vertical_8_sse2
  ------------------
 1011|  46.8k|                                    limits->lim, limits->hev_thr, bit_depth);
 1012|  46.8k|          break;
 1013|       |        // apply 14-tap filtering
 1014|  43.7k|        case 14:
  ------------------
  |  Branch (1014:9): [True: 43.7k, False: 333k]
  ------------------
 1015|  43.7k|          aom_highbd_lpf_vertical_14(dst_shortptr, dst_stride, limits->mblim,
  ------------------
  |  | 2575|  43.7k|#define aom_highbd_lpf_vertical_14 aom_highbd_lpf_vertical_14_sse2
  ------------------
 1016|  43.7k|                                     limits->lim, limits->hev_thr, bit_depth);
 1017|  43.7k|          break;
 1018|       |        // no filtering
 1019|   104k|        default: break;
  ------------------
  |  Branch (1019:9): [True: 104k, False: 272k]
  ------------------
 1020|   376k|      }
 1021|   376k|    }
 1022|   364k|    return;
 1023|   376k|  }
 1024|   861k|#endif  // CONFIG_AV1_HIGHBITDEPTH
 1025|   861k|  if (use_filter_type == USE_QUAD) {
  ------------------
  |  Branch (1025:7): [True: 0, False: 861k]
  ------------------
 1026|       |    // Only one set of loop filter parameters (mblim, lim and hev_thr) is
 1027|       |    // passed as argument to quad loop filter because quad loop filter is
 1028|       |    // called for those cases where all the 4 set of loop filter parameters
 1029|       |    // are equal.
 1030|      0|    switch (params->filter_length) {
 1031|       |      // apply 4-tap filtering
 1032|      0|      case 4:
  ------------------
  |  Branch (1032:7): [True: 0, False: 0]
  ------------------
 1033|      0|        aom_lpf_vertical_4_quad(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3776|      0|#define aom_lpf_vertical_4_quad aom_lpf_vertical_4_quad_sse2
  ------------------
 1034|      0|                                limits->hev_thr);
 1035|      0|        break;
 1036|      0|      case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1036:7): [True: 0, False: 0]
  ------------------
 1037|      0|        aom_lpf_vertical_6_quad(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3788|      0|#define aom_lpf_vertical_6_quad aom_lpf_vertical_6_quad_sse2
  ------------------
 1038|      0|                                limits->hev_thr);
 1039|      0|        break;
 1040|       |      // apply 8-tap filtering
 1041|      0|      case 8:
  ------------------
  |  Branch (1041:7): [True: 0, False: 0]
  ------------------
 1042|      0|        aom_lpf_vertical_8_quad(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3800|      0|#define aom_lpf_vertical_8_quad aom_lpf_vertical_8_quad_sse2
  ------------------
 1043|      0|                                limits->hev_thr);
 1044|      0|        break;
 1045|       |      // apply 14-tap filtering
 1046|      0|      case 14:
  ------------------
  |  Branch (1046:7): [True: 0, False: 0]
  ------------------
 1047|      0|        aom_lpf_vertical_14_quad(dst, dst_stride, limits->mblim, limits->lim,
 1048|      0|                                 limits->hev_thr);
 1049|      0|        break;
 1050|       |      // no filtering
 1051|      0|      default: break;
  ------------------
  |  Branch (1051:7): [True: 0, False: 0]
  ------------------
 1052|      0|    }
 1053|   861k|  } else if (use_filter_type == USE_DUAL) {
  ------------------
  |  Branch (1053:14): [True: 0, False: 861k]
  ------------------
 1054|      0|    switch (params->filter_length) {
 1055|       |      // apply 4-tap filtering
 1056|      0|      case 4:
  ------------------
  |  Branch (1056:7): [True: 0, False: 0]
  ------------------
 1057|      0|        aom_lpf_vertical_4_dual(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3772|      0|#define aom_lpf_vertical_4_dual aom_lpf_vertical_4_dual_sse2
  ------------------
 1058|      0|                                limits->hev_thr, limits->mblim, limits->lim,
 1059|      0|                                limits->hev_thr);
 1060|      0|        break;
 1061|      0|      case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1061:7): [True: 0, False: 0]
  ------------------
 1062|      0|        aom_lpf_vertical_6_dual(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3784|      0|#define aom_lpf_vertical_6_dual aom_lpf_vertical_6_dual_sse2
  ------------------
 1063|      0|                                limits->hev_thr, limits->mblim, limits->lim,
 1064|      0|                                limits->hev_thr);
 1065|      0|        break;
 1066|       |      // apply 8-tap filtering
 1067|      0|      case 8:
  ------------------
  |  Branch (1067:7): [True: 0, False: 0]
  ------------------
 1068|      0|        aom_lpf_vertical_8_dual(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3796|      0|#define aom_lpf_vertical_8_dual aom_lpf_vertical_8_dual_sse2
  ------------------
 1069|      0|                                limits->hev_thr, limits->mblim, limits->lim,
 1070|      0|                                limits->hev_thr);
 1071|      0|        break;
 1072|       |      // apply 14-tap filtering
 1073|      0|      case 14:
  ------------------
  |  Branch (1073:7): [True: 0, False: 0]
  ------------------
 1074|      0|        aom_lpf_vertical_14_dual(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3759|      0|#define aom_lpf_vertical_14_dual aom_lpf_vertical_14_dual_sse2
  ------------------
 1075|      0|                                 limits->hev_thr, limits->mblim, limits->lim,
 1076|      0|                                 limits->hev_thr);
 1077|      0|        break;
 1078|       |      // no filtering
 1079|      0|      default: break;
  ------------------
  |  Branch (1079:7): [True: 0, False: 0]
  ------------------
 1080|      0|    }
 1081|   861k|  } else {
 1082|   861k|    assert(use_filter_type == USE_SINGLE);
 1083|   861k|    switch (params->filter_length) {
 1084|       |      // apply 4-tap filtering
 1085|   159k|      case 4:
  ------------------
  |  Branch (1085:7): [True: 159k, False: 702k]
  ------------------
 1086|   159k|        aom_lpf_vertical_4(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3768|   159k|#define aom_lpf_vertical_4 aom_lpf_vertical_4_sse2
  ------------------
 1087|   159k|                           limits->hev_thr);
 1088|   159k|        break;
 1089|   297k|      case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1089:7): [True: 297k, False: 564k]
  ------------------
 1090|   297k|        aom_lpf_vertical_6(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3780|   297k|#define aom_lpf_vertical_6 aom_lpf_vertical_6_sse2
  ------------------
 1091|   297k|                           limits->hev_thr);
 1092|   297k|        break;
 1093|       |      // apply 8-tap filtering
 1094|   124k|      case 8:
  ------------------
  |  Branch (1094:7): [True: 124k, False: 737k]
  ------------------
 1095|   124k|        aom_lpf_vertical_8(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3792|   124k|#define aom_lpf_vertical_8 aom_lpf_vertical_8_sse2
  ------------------
 1096|   124k|                           limits->hev_thr);
 1097|   124k|        break;
 1098|       |      // apply 14-tap filtering
 1099|  63.3k|      case 14:
  ------------------
  |  Branch (1099:7): [True: 63.3k, False: 798k]
  ------------------
 1100|  63.3k|        aom_lpf_vertical_14(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3755|  63.3k|#define aom_lpf_vertical_14 aom_lpf_vertical_14_sse2
  ------------------
 1101|  63.3k|                            limits->hev_thr);
 1102|  63.3k|        break;
 1103|       |      // no filtering
 1104|   263k|      default: break;
  ------------------
  |  Branch (1104:7): [True: 263k, False: 598k]
  ------------------
 1105|   861k|    }
 1106|   861k|  }
 1107|       |#if !CONFIG_AV1_HIGHBITDEPTH
 1108|       |  (void)seq_params;
 1109|       |#endif  // !CONFIG_AV1_HIGHBITDEPTH
 1110|   861k|}
av1_loopfilter.c:filter_horz:
 1511|  1.26M|                               USE_FILTER_TYPE use_filter_type) {
 1512|  1.26M|  const loop_filter_thresh *limits = params->lfthr;
 1513|  1.26M|#if CONFIG_AV1_HIGHBITDEPTH
 1514|  1.26M|  const int use_highbitdepth = seq_params->use_highbitdepth;
 1515|  1.26M|  const aom_bit_depth_t bit_depth = seq_params->bit_depth;
 1516|  1.26M|  if (use_highbitdepth) {
  ------------------
  |  Branch (1516:7): [True: 380k, False: 888k]
  ------------------
 1517|   380k|    uint16_t *dst_shortptr = CONVERT_TO_SHORTPTR(dst);
  ------------------
  |  |   75|   380k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1518|   380k|    if (use_filter_type == USE_QUAD) {
  ------------------
  |  Branch (1518:9): [True: 0, False: 380k]
  ------------------
 1519|      0|      switch (params->filter_length) {
 1520|       |        // apply 4-tap filtering
 1521|      0|        case 4:
  ------------------
  |  Branch (1521:9): [True: 0, False: 0]
  ------------------
 1522|      0|          aom_highbd_lpf_horizontal_4_dual(
 1523|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
 1524|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
 1525|      0|              bit_depth);
 1526|      0|          aom_highbd_lpf_horizontal_4_dual(
 1527|      0|              dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim,
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1528|      0|              limits->lim, limits->hev_thr, limits->mblim, limits->lim,
 1529|      0|              limits->hev_thr, bit_depth);
 1530|      0|          break;
 1531|      0|        case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1531:9): [True: 0, False: 0]
  ------------------
 1532|      0|          aom_highbd_lpf_horizontal_6_dual(
  ------------------
  |  | 2562|      0|#define aom_highbd_lpf_horizontal_6_dual aom_highbd_lpf_horizontal_6_dual_sse2
  ------------------
 1533|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
 1534|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
 1535|      0|              bit_depth);
 1536|      0|          aom_highbd_lpf_horizontal_6_dual(
  ------------------
  |  | 2562|      0|#define aom_highbd_lpf_horizontal_6_dual aom_highbd_lpf_horizontal_6_dual_sse2
  ------------------
 1537|      0|              dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim,
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1538|      0|              limits->lim, limits->hev_thr, limits->mblim, limits->lim,
 1539|      0|              limits->hev_thr, bit_depth);
 1540|      0|          break;
 1541|       |        // apply 8-tap filtering
 1542|      0|        case 8:
  ------------------
  |  Branch (1542:9): [True: 0, False: 0]
  ------------------
 1543|      0|          aom_highbd_lpf_horizontal_8_dual(
 1544|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
 1545|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
 1546|      0|              bit_depth);
 1547|      0|          aom_highbd_lpf_horizontal_8_dual(
 1548|      0|              dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim,
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1549|      0|              limits->lim, limits->hev_thr, limits->mblim, limits->lim,
 1550|      0|              limits->hev_thr, bit_depth);
 1551|      0|          break;
 1552|       |        // apply 14-tap filtering
 1553|      0|        case 14:
  ------------------
  |  Branch (1553:9): [True: 0, False: 0]
  ------------------
 1554|      0|          aom_highbd_lpf_horizontal_14_dual(
 1555|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
 1556|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
 1557|      0|              bit_depth);
 1558|      0|          aom_highbd_lpf_horizontal_14_dual(
 1559|      0|              dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim,
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1560|      0|              limits->lim, limits->hev_thr, limits->mblim, limits->lim,
 1561|      0|              limits->hev_thr, bit_depth);
 1562|      0|          break;
 1563|       |        // no filtering
 1564|      0|        default: break;
  ------------------
  |  Branch (1564:9): [True: 0, False: 0]
  ------------------
 1565|      0|      }
 1566|   380k|    } else if (use_filter_type == USE_DUAL) {
  ------------------
  |  Branch (1566:16): [True: 0, False: 380k]
  ------------------
 1567|      0|      switch (params->filter_length) {
 1568|       |        // apply 4-tap filtering
 1569|      0|        case 4:
  ------------------
  |  Branch (1569:9): [True: 0, False: 0]
  ------------------
 1570|      0|          aom_highbd_lpf_horizontal_4_dual(
 1571|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
 1572|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
 1573|      0|              bit_depth);
 1574|      0|          break;
 1575|      0|        case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1575:9): [True: 0, False: 0]
  ------------------
 1576|      0|          aom_highbd_lpf_horizontal_6_dual(
  ------------------
  |  | 2562|      0|#define aom_highbd_lpf_horizontal_6_dual aom_highbd_lpf_horizontal_6_dual_sse2
  ------------------
 1577|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
 1578|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
 1579|      0|              bit_depth);
 1580|      0|          break;
 1581|       |        // apply 8-tap filtering
 1582|      0|        case 8:
  ------------------
  |  Branch (1582:9): [True: 0, False: 0]
  ------------------
 1583|      0|          aom_highbd_lpf_horizontal_8_dual(
 1584|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
 1585|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
 1586|      0|              bit_depth);
 1587|      0|          break;
 1588|       |        // apply 14-tap filtering
 1589|      0|        case 14:
  ------------------
  |  Branch (1589:9): [True: 0, False: 0]
  ------------------
 1590|      0|          aom_highbd_lpf_horizontal_14_dual(
 1591|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
 1592|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
 1593|      0|              bit_depth);
 1594|      0|          break;
 1595|       |        // no filtering
 1596|      0|        default: break;
  ------------------
  |  Branch (1596:9): [True: 0, False: 0]
  ------------------
 1597|      0|      }
 1598|   380k|    } else {
 1599|   380k|      assert(use_filter_type == USE_SINGLE);
 1600|   380k|      switch (params->filter_length) {
 1601|       |        // apply 4-tap filtering
 1602|  88.4k|        case 4:
  ------------------
  |  Branch (1602:9): [True: 88.4k, False: 292k]
  ------------------
 1603|  88.4k|          aom_highbd_lpf_horizontal_4(dst_shortptr, dst_stride, limits->mblim,
  ------------------
  |  | 2549|  88.4k|#define aom_highbd_lpf_horizontal_4 aom_highbd_lpf_horizontal_4_sse2
  ------------------
 1604|  88.4k|                                      limits->lim, limits->hev_thr, bit_depth);
 1605|  88.4k|          break;
 1606|   144k|        case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1606:9): [True: 144k, False: 236k]
  ------------------
 1607|   144k|          aom_highbd_lpf_horizontal_6(dst_shortptr, dst_stride, limits->mblim,
  ------------------
  |  | 2558|   144k|#define aom_highbd_lpf_horizontal_6 aom_highbd_lpf_horizontal_6_sse2
  ------------------
 1608|   144k|                                      limits->lim, limits->hev_thr, bit_depth);
 1609|   144k|          break;
 1610|       |        // apply 8-tap filtering
 1611|  50.3k|        case 8:
  ------------------
  |  Branch (1611:9): [True: 50.3k, False: 330k]
  ------------------
 1612|  50.3k|          aom_highbd_lpf_horizontal_8(dst_shortptr, dst_stride, limits->mblim,
  ------------------
  |  | 2566|  50.3k|#define aom_highbd_lpf_horizontal_8 aom_highbd_lpf_horizontal_8_sse2
  ------------------
 1613|  50.3k|                                      limits->lim, limits->hev_thr, bit_depth);
 1614|  50.3k|          break;
 1615|       |        // apply 14-tap filtering
 1616|  49.6k|        case 14:
  ------------------
  |  Branch (1616:9): [True: 49.6k, False: 331k]
  ------------------
 1617|  49.6k|          aom_highbd_lpf_horizontal_14(dst_shortptr, dst_stride, limits->mblim,
  ------------------
  |  | 2540|  49.6k|#define aom_highbd_lpf_horizontal_14 aom_highbd_lpf_horizontal_14_sse2
  ------------------
 1618|  49.6k|                                       limits->lim, limits->hev_thr, bit_depth);
 1619|  49.6k|          break;
 1620|       |        // no filtering
 1621|  70.8k|        default: break;
  ------------------
  |  Branch (1621:9): [True: 70.8k, False: 309k]
  ------------------
 1622|   380k|      }
 1623|   380k|    }
 1624|   373k|    return;
 1625|   380k|  }
 1626|   888k|#endif  // CONFIG_AV1_HIGHBITDEPTH
 1627|   888k|  if (use_filter_type == USE_QUAD) {
  ------------------
  |  Branch (1627:7): [True: 0, False: 888k]
  ------------------
 1628|       |    // Only one set of loop filter parameters (mblim, lim and hev_thr) is
 1629|       |    // passed as argument to quad loop filter because quad loop filter is
 1630|       |    // called for those cases where all the 4 set of loop filter parameters
 1631|       |    // are equal.
 1632|      0|    switch (params->filter_length) {
 1633|       |      // apply 4-tap filtering
 1634|      0|      case 4:
  ------------------
  |  Branch (1634:7): [True: 0, False: 0]
  ------------------
 1635|      0|        aom_lpf_horizontal_4_quad(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3725|      0|#define aom_lpf_horizontal_4_quad aom_lpf_horizontal_4_quad_sse2
  ------------------
 1636|      0|                                  limits->hev_thr);
 1637|      0|        break;
 1638|      0|      case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1638:7): [True: 0, False: 0]
  ------------------
 1639|      0|        aom_lpf_horizontal_6_quad(dst, dst_stride, limits->mblim, limits->lim,
 1640|      0|                                  limits->hev_thr);
 1641|      0|        break;
 1642|       |      // apply 8-tap filtering
 1643|      0|      case 8:
  ------------------
  |  Branch (1643:7): [True: 0, False: 0]
  ------------------
 1644|      0|        aom_lpf_horizontal_8_quad(dst, dst_stride, limits->mblim, limits->lim,
 1645|      0|                                  limits->hev_thr);
 1646|      0|        break;
 1647|       |      // apply 14-tap filtering
 1648|      0|      case 14:
  ------------------
  |  Branch (1648:7): [True: 0, False: 0]
  ------------------
 1649|      0|        aom_lpf_horizontal_14_quad(dst, dst_stride, limits->mblim, limits->lim,
 1650|      0|                                   limits->hev_thr);
 1651|      0|        break;
 1652|       |      // no filtering
 1653|      0|      default: break;
  ------------------
  |  Branch (1653:7): [True: 0, False: 0]
  ------------------
 1654|      0|    }
 1655|   888k|  } else if (use_filter_type == USE_DUAL) {
  ------------------
  |  Branch (1655:14): [True: 0, False: 888k]
  ------------------
 1656|      0|    switch (params->filter_length) {
 1657|       |      // apply 4-tap filtering
 1658|      0|      case 4:
  ------------------
  |  Branch (1658:7): [True: 0, False: 0]
  ------------------
 1659|      0|        aom_lpf_horizontal_4_dual(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3721|      0|#define aom_lpf_horizontal_4_dual aom_lpf_horizontal_4_dual_sse2
  ------------------
 1660|      0|                                  limits->hev_thr, limits->mblim, limits->lim,
 1661|      0|                                  limits->hev_thr);
 1662|      0|        break;
 1663|      0|      case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1663:7): [True: 0, False: 0]
  ------------------
 1664|      0|        aom_lpf_horizontal_6_dual(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3733|      0|#define aom_lpf_horizontal_6_dual aom_lpf_horizontal_6_dual_sse2
  ------------------
 1665|      0|                                  limits->hev_thr, limits->mblim, limits->lim,
 1666|      0|                                  limits->hev_thr);
 1667|      0|        break;
 1668|       |      // apply 8-tap filtering
 1669|      0|      case 8:
  ------------------
  |  Branch (1669:7): [True: 0, False: 0]
  ------------------
 1670|      0|        aom_lpf_horizontal_8_dual(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3746|      0|#define aom_lpf_horizontal_8_dual aom_lpf_horizontal_8_dual_sse2
  ------------------
 1671|      0|                                  limits->hev_thr, limits->mblim, limits->lim,
 1672|      0|                                  limits->hev_thr);
 1673|      0|        break;
 1674|       |      // apply 14-tap filtering
 1675|      0|      case 14:
  ------------------
  |  Branch (1675:7): [True: 0, False: 0]
  ------------------
 1676|      0|        aom_lpf_horizontal_14_dual(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3708|      0|#define aom_lpf_horizontal_14_dual aom_lpf_horizontal_14_dual_sse2
  ------------------
 1677|      0|                                   limits->hev_thr, limits->mblim, limits->lim,
 1678|      0|                                   limits->hev_thr);
 1679|      0|        break;
 1680|       |      // no filtering
 1681|      0|      default: break;
  ------------------
  |  Branch (1681:7): [True: 0, False: 0]
  ------------------
 1682|      0|    }
 1683|   888k|  } else {
 1684|   888k|    assert(use_filter_type == USE_SINGLE);
 1685|   888k|    switch (params->filter_length) {
 1686|       |      // apply 4-tap filtering
 1687|   189k|      case 4:
  ------------------
  |  Branch (1687:7): [True: 189k, False: 698k]
  ------------------
 1688|   189k|        aom_lpf_horizontal_4(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3717|   189k|#define aom_lpf_horizontal_4 aom_lpf_horizontal_4_sse2
  ------------------
 1689|   189k|                             limits->hev_thr);
 1690|   189k|        break;
 1691|   291k|      case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1691:7): [True: 291k, False: 596k]
  ------------------
 1692|   291k|        aom_lpf_horizontal_6(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3729|   291k|#define aom_lpf_horizontal_6 aom_lpf_horizontal_6_sse2
  ------------------
 1693|   291k|                             limits->hev_thr);
 1694|   291k|        break;
 1695|       |      // apply 8-tap filtering
 1696|   128k|      case 8:
  ------------------
  |  Branch (1696:7): [True: 128k, False: 760k]
  ------------------
 1697|   128k|        aom_lpf_horizontal_8(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3742|   128k|#define aom_lpf_horizontal_8 aom_lpf_horizontal_8_sse2
  ------------------
 1698|   128k|                             limits->hev_thr);
 1699|   128k|        break;
 1700|       |      // apply 14-tap filtering
 1701|  61.4k|      case 14:
  ------------------
  |  Branch (1701:7): [True: 61.4k, False: 826k]
  ------------------
 1702|  61.4k|        aom_lpf_horizontal_14(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 3704|  61.4k|#define aom_lpf_horizontal_14 aom_lpf_horizontal_14_sse2
  ------------------
 1703|  61.4k|                              limits->hev_thr);
 1704|  61.4k|        break;
 1705|       |      // no filtering
 1706|   270k|      default: break;
  ------------------
  |  Branch (1706:7): [True: 270k, False: 617k]
  ------------------
 1707|   888k|    }
 1708|   888k|  }
 1709|       |#if !CONFIG_AV1_HIGHBITDEPTH
 1710|       |  (void)seq_params;
 1711|       |#endif  // !CONFIG_AV1_HIGHBITDEPTH
 1712|   888k|}

av1_rtcd:
   18|  17.9k|void av1_rtcd(void) { aom_once(setup_rtcd_internal); }

av1_inv_txfm2d.c:highbd_clip_pixel_add:
  127|  1.21M|                                             int bd) {
  128|  1.21M|  return clip_pixel_highbd(dest + (int)trans, bd);
  129|  1.21M|}
av1_inv_txfm_ssse3.c:cospi_arr:
   47|  1.08M|static inline const int32_t *cospi_arr(int n) {
   48|  1.08M|  return av1_cospi_arr_data[n - cos_bit_min];
   49|  1.08M|}
av1_inv_txfm_ssse3.c:get_txw_idx:
  242|   378k|static inline int get_txw_idx(TX_SIZE tx_size) {
  243|   378k|  return tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
  244|   378k|}
av1_inv_txfm_ssse3.c:get_txh_idx:
  245|   378k|static inline int get_txh_idx(TX_SIZE tx_size) {
  246|   378k|  return tx_size_high_log2[tx_size] - tx_size_high_log2[0];
  247|   378k|}
av1_inv_txfm_ssse3.c:get_rect_tx_log_ratio:
  215|   129k|static inline int get_rect_tx_log_ratio(int col, int row) {
  216|   129k|  if (col == row) return 0;
  ------------------
  |  Branch (216:7): [True: 21.4k, False: 107k]
  ------------------
  217|   107k|  if (col > row) {
  ------------------
  |  Branch (217:7): [True: 63.3k, False: 44.2k]
  ------------------
  218|  63.3k|    if (col == row * 2) return 1;
  ------------------
  |  Branch (218:9): [True: 48.6k, False: 14.7k]
  ------------------
  219|  14.7k|    if (col == row * 4) return 2;
  ------------------
  |  Branch (219:9): [True: 14.7k, False: 0]
  ------------------
  220|  14.7k|    assert(0 && "Unsupported transform size");
  221|  44.2k|  } else {
  222|  44.2k|    if (row == col * 2) return -1;
  ------------------
  |  Branch (222:9): [True: 33.0k, False: 11.1k]
  ------------------
  223|  11.1k|    if (row == col * 4) return -2;
  ------------------
  |  Branch (223:9): [True: 11.1k, False: 0]
  ------------------
  224|  11.1k|    assert(0 && "Unsupported transform size");
  225|      0|  }
  226|      0|  return 0;  // Invalid
  227|   107k|}
av1_inv_txfm_ssse3.c:sinpi_arr:
   51|   161k|static inline const int32_t *sinpi_arr(int n) {
   52|   161k|  return av1_sinpi_arr_data[n - cos_bit_min];
   53|   161k|}
av1_inv_txfm_ssse3.c:get_flip_cfg:
  169|   363k|static inline void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) {
  170|   363k|  switch (tx_type) {
  171|   121k|    case DCT_DCT:
  ------------------
  |  Branch (171:5): [True: 121k, False: 242k]
  ------------------
  172|   169k|    case ADST_DCT:
  ------------------
  |  Branch (172:5): [True: 48.0k, False: 315k]
  ------------------
  173|   229k|    case DCT_ADST:
  ------------------
  |  Branch (173:5): [True: 60.1k, False: 303k]
  ------------------
  174|   293k|    case ADST_ADST:
  ------------------
  |  Branch (174:5): [True: 64.5k, False: 299k]
  ------------------
  175|   293k|      *ud_flip = 0;
  176|   293k|      *lr_flip = 0;
  177|   293k|      break;
  178|  17.1k|    case IDTX:
  ------------------
  |  Branch (178:5): [True: 17.1k, False: 346k]
  ------------------
  179|  30.5k|    case V_DCT:
  ------------------
  |  Branch (179:5): [True: 13.3k, False: 350k]
  ------------------
  180|  56.8k|    case H_DCT:
  ------------------
  |  Branch (180:5): [True: 26.2k, False: 337k]
  ------------------
  181|  58.3k|    case V_ADST:
  ------------------
  |  Branch (181:5): [True: 1.49k, False: 362k]
  ------------------
  182|  60.8k|    case H_ADST:
  ------------------
  |  Branch (182:5): [True: 2.49k, False: 361k]
  ------------------
  183|  60.8k|      *ud_flip = 0;
  184|  60.8k|      *lr_flip = 0;
  185|  60.8k|      break;
  186|  1.05k|    case FLIPADST_DCT:
  ------------------
  |  Branch (186:5): [True: 1.05k, False: 362k]
  ------------------
  187|  2.48k|    case FLIPADST_ADST:
  ------------------
  |  Branch (187:5): [True: 1.42k, False: 362k]
  ------------------
  188|  3.60k|    case V_FLIPADST:
  ------------------
  |  Branch (188:5): [True: 1.11k, False: 362k]
  ------------------
  189|  3.60k|      *ud_flip = 1;
  190|  3.60k|      *lr_flip = 0;
  191|  3.60k|      break;
  192|  1.38k|    case DCT_FLIPADST:
  ------------------
  |  Branch (192:5): [True: 1.38k, False: 362k]
  ------------------
  193|  2.52k|    case ADST_FLIPADST:
  ------------------
  |  Branch (193:5): [True: 1.14k, False: 362k]
  ------------------
  194|  4.83k|    case H_FLIPADST:
  ------------------
  |  Branch (194:5): [True: 2.31k, False: 361k]
  ------------------
  195|  4.83k|      *ud_flip = 0;
  196|  4.83k|      *lr_flip = 1;
  197|  4.83k|      break;
  198|    936|    case FLIPADST_FLIPADST:
  ------------------
  |  Branch (198:5): [True: 936, False: 363k]
  ------------------
  199|    936|      *ud_flip = 1;
  200|    936|      *lr_flip = 1;
  201|    936|      break;
  202|      0|    default:
  ------------------
  |  Branch (202:5): [True: 0, False: 363k]
  ------------------
  203|      0|      *ud_flip = 0;
  204|      0|      *lr_flip = 0;
  205|       |      assert(0);
  206|   363k|  }
  207|   363k|}
highbd_inv_txfm_sse4.c:cospi_arr:
   47|   595k|static inline const int32_t *cospi_arr(int n) {
   48|   595k|  return av1_cospi_arr_data[n - cos_bit_min];
   49|   595k|}
highbd_inv_txfm_sse4.c:sinpi_arr:
   51|   305k|static inline const int32_t *sinpi_arr(int n) {
   52|   305k|  return av1_sinpi_arr_data[n - cos_bit_min];
   53|   305k|}
highbd_inv_txfm_sse4.c:get_txw_idx:
  242|   216k|static inline int get_txw_idx(TX_SIZE tx_size) {
  243|   216k|  return tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
  244|   216k|}
highbd_inv_txfm_sse4.c:get_txh_idx:
  245|   216k|static inline int get_txh_idx(TX_SIZE tx_size) {
  246|   216k|  return tx_size_high_log2[tx_size] - tx_size_high_log2[0];
  247|   216k|}
highbd_inv_txfm_sse4.c:get_rect_tx_log_ratio:
  215|  36.8k|static inline int get_rect_tx_log_ratio(int col, int row) {
  216|  36.8k|  if (col == row) return 0;
  ------------------
  |  Branch (216:7): [True: 25.9k, False: 10.9k]
  ------------------
  217|  10.9k|  if (col > row) {
  ------------------
  |  Branch (217:7): [True: 6.40k, False: 4.51k]
  ------------------
  218|  6.40k|    if (col == row * 2) return 1;
  ------------------
  |  Branch (218:9): [True: 6.34k, False: 68]
  ------------------
  219|     68|    if (col == row * 4) return 2;
  ------------------
  |  Branch (219:9): [True: 68, False: 0]
  ------------------
  220|     68|    assert(0 && "Unsupported transform size");
  221|  4.51k|  } else {
  222|  4.51k|    if (row == col * 2) return -1;
  ------------------
  |  Branch (222:9): [True: 4.48k, False: 34]
  ------------------
  223|     34|    if (row == col * 4) return -2;
  ------------------
  |  Branch (223:9): [True: 34, False: 0]
  ------------------
  224|     34|    assert(0 && "Unsupported transform size");
  225|      0|  }
  226|      0|  return 0;  // Invalid
  227|  10.9k|}
highbd_inv_txfm_sse4.c:get_flip_cfg:
  169|   196k|static inline void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) {
  170|   196k|  switch (tx_type) {
  171|  56.6k|    case DCT_DCT:
  ------------------
  |  Branch (171:5): [True: 56.6k, False: 140k]
  ------------------
  172|  83.2k|    case ADST_DCT:
  ------------------
  |  Branch (172:5): [True: 26.6k, False: 170k]
  ------------------
  173|   116k|    case DCT_ADST:
  ------------------
  |  Branch (173:5): [True: 33.0k, False: 163k]
  ------------------
  174|   153k|    case ADST_ADST:
  ------------------
  |  Branch (174:5): [True: 37.5k, False: 159k]
  ------------------
  175|   153k|      *ud_flip = 0;
  176|   153k|      *lr_flip = 0;
  177|   153k|      break;
  178|  11.2k|    case IDTX:
  ------------------
  |  Branch (178:5): [True: 11.2k, False: 185k]
  ------------------
  179|  21.2k|    case V_DCT:
  ------------------
  |  Branch (179:5): [True: 10.0k, False: 186k]
  ------------------
  180|  40.9k|    case H_DCT:
  ------------------
  |  Branch (180:5): [True: 19.6k, False: 177k]
  ------------------
  181|  41.1k|    case V_ADST:
  ------------------
  |  Branch (181:5): [True: 226, False: 196k]
  ------------------
  182|  41.5k|    case H_ADST:
  ------------------
  |  Branch (182:5): [True: 393, False: 196k]
  ------------------
  183|  41.5k|      *ud_flip = 0;
  184|  41.5k|      *lr_flip = 0;
  185|  41.5k|      break;
  186|     88|    case FLIPADST_DCT:
  ------------------
  |  Branch (186:5): [True: 88, False: 196k]
  ------------------
  187|    206|    case FLIPADST_ADST:
  ------------------
  |  Branch (187:5): [True: 118, False: 196k]
  ------------------
  188|    347|    case V_FLIPADST:
  ------------------
  |  Branch (188:5): [True: 141, False: 196k]
  ------------------
  189|    347|      *ud_flip = 1;
  190|    347|      *lr_flip = 0;
  191|    347|      break;
  192|    415|    case DCT_FLIPADST:
  ------------------
  |  Branch (192:5): [True: 415, False: 196k]
  ------------------
  193|    515|    case ADST_FLIPADST:
  ------------------
  |  Branch (193:5): [True: 100, False: 196k]
  ------------------
  194|    793|    case H_FLIPADST:
  ------------------
  |  Branch (194:5): [True: 278, False: 196k]
  ------------------
  195|    793|      *ud_flip = 0;
  196|    793|      *lr_flip = 1;
  197|    793|      break;
  198|    110|    case FLIPADST_FLIPADST:
  ------------------
  |  Branch (198:5): [True: 110, False: 196k]
  ------------------
  199|    110|      *ud_flip = 1;
  200|    110|      *lr_flip = 1;
  201|    110|      break;
  202|      0|    default:
  ------------------
  |  Branch (202:5): [True: 0, False: 196k]
  ------------------
  203|      0|      *ud_flip = 0;
  204|      0|      *lr_flip = 0;
  205|       |      assert(0);
  206|   196k|  }
  207|   196k|}
av1_inv_txfm_avx2.c:get_flip_cfg:
  169|   372k|static inline void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) {
  170|   372k|  switch (tx_type) {
  171|   239k|    case DCT_DCT:
  ------------------
  |  Branch (171:5): [True: 239k, False: 133k]
  ------------------
  172|   276k|    case ADST_DCT:
  ------------------
  |  Branch (172:5): [True: 37.0k, False: 335k]
  ------------------
  173|   325k|    case DCT_ADST:
  ------------------
  |  Branch (173:5): [True: 49.4k, False: 323k]
  ------------------
  174|   368k|    case ADST_ADST:
  ------------------
  |  Branch (174:5): [True: 42.8k, False: 330k]
  ------------------
  175|   368k|      *ud_flip = 0;
  176|   368k|      *lr_flip = 0;
  177|   368k|      break;
  178|      0|    case IDTX:
  ------------------
  |  Branch (178:5): [True: 0, False: 372k]
  ------------------
  179|    194|    case V_DCT:
  ------------------
  |  Branch (179:5): [True: 194, False: 372k]
  ------------------
  180|    502|    case H_DCT:
  ------------------
  |  Branch (180:5): [True: 308, False: 372k]
  ------------------
  181|    502|    case V_ADST:
  ------------------
  |  Branch (181:5): [True: 0, False: 372k]
  ------------------
  182|    502|    case H_ADST:
  ------------------
  |  Branch (182:5): [True: 0, False: 372k]
  ------------------
  183|    502|      *ud_flip = 0;
  184|    502|      *lr_flip = 0;
  185|    502|      break;
  186|    780|    case FLIPADST_DCT:
  ------------------
  |  Branch (186:5): [True: 780, False: 372k]
  ------------------
  187|  1.52k|    case FLIPADST_ADST:
  ------------------
  |  Branch (187:5): [True: 742, False: 372k]
  ------------------
  188|  1.52k|    case V_FLIPADST:
  ------------------
  |  Branch (188:5): [True: 0, False: 372k]
  ------------------
  189|  1.52k|      *ud_flip = 1;
  190|  1.52k|      *lr_flip = 0;
  191|  1.52k|      break;
  192|    804|    case DCT_FLIPADST:
  ------------------
  |  Branch (192:5): [True: 804, False: 372k]
  ------------------
  193|  1.48k|    case ADST_FLIPADST:
  ------------------
  |  Branch (193:5): [True: 676, False: 372k]
  ------------------
  194|  1.48k|    case H_FLIPADST:
  ------------------
  |  Branch (194:5): [True: 0, False: 372k]
  ------------------
  195|  1.48k|      *ud_flip = 0;
  196|  1.48k|      *lr_flip = 1;
  197|  1.48k|      break;
  198|    714|    case FLIPADST_FLIPADST:
  ------------------
  |  Branch (198:5): [True: 714, False: 372k]
  ------------------
  199|    714|      *ud_flip = 1;
  200|    714|      *lr_flip = 1;
  201|    714|      break;
  202|      0|    default:
  ------------------
  |  Branch (202:5): [True: 0, False: 372k]
  ------------------
  203|      0|      *ud_flip = 0;
  204|      0|      *lr_flip = 0;
  205|       |      assert(0);
  206|   372k|  }
  207|   372k|}
av1_inv_txfm_avx2.c:get_txw_idx:
  242|   152k|static inline int get_txw_idx(TX_SIZE tx_size) {
  243|   152k|  return tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
  244|   152k|}
av1_inv_txfm_avx2.c:get_txh_idx:
  245|   152k|static inline int get_txh_idx(TX_SIZE tx_size) {
  246|   152k|  return tx_size_high_log2[tx_size] - tx_size_high_log2[0];
  247|   152k|}
av1_inv_txfm_avx2.c:get_rect_tx_log_ratio:
  215|   152k|static inline int get_rect_tx_log_ratio(int col, int row) {
  216|   152k|  if (col == row) return 0;
  ------------------
  |  Branch (216:7): [True: 126k, False: 25.9k]
  ------------------
  217|  25.9k|  if (col > row) {
  ------------------
  |  Branch (217:7): [True: 13.4k, False: 12.5k]
  ------------------
  218|  13.4k|    if (col == row * 2) return 1;
  ------------------
  |  Branch (218:9): [True: 11.2k, False: 2.14k]
  ------------------
  219|  2.14k|    if (col == row * 4) return 2;
  ------------------
  |  Branch (219:9): [True: 2.14k, False: 0]
  ------------------
  220|  2.14k|    assert(0 && "Unsupported transform size");
  221|  12.5k|  } else {
  222|  12.5k|    if (row == col * 2) return -1;
  ------------------
  |  Branch (222:9): [True: 10.4k, False: 2.10k]
  ------------------
  223|  2.10k|    if (row == col * 4) return -2;
  ------------------
  |  Branch (223:9): [True: 2.10k, False: 0]
  ------------------
  224|  2.10k|    assert(0 && "Unsupported transform size");
  225|      0|  }
  226|      0|  return 0;  // Invalid
  227|  25.9k|}
av1_inv_txfm_avx2.c:cospi_arr:
   47|   381k|static inline const int32_t *cospi_arr(int n) {
   48|   381k|  return av1_cospi_arr_data[n - cos_bit_min];
   49|   381k|}
highbd_inv_txfm_avx2.c:get_txw_idx:
  242|   465k|static inline int get_txw_idx(TX_SIZE tx_size) {
  243|   465k|  return tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
  244|   465k|}
highbd_inv_txfm_avx2.c:get_txh_idx:
  245|   465k|static inline int get_txh_idx(TX_SIZE tx_size) {
  246|   465k|  return tx_size_high_log2[tx_size] - tx_size_high_log2[0];
  247|   465k|}
highbd_inv_txfm_avx2.c:get_rect_tx_log_ratio:
  215|   465k|static inline int get_rect_tx_log_ratio(int col, int row) {
  216|   465k|  if (col == row) return 0;
  ------------------
  |  Branch (216:7): [True: 322k, False: 142k]
  ------------------
  217|   142k|  if (col > row) {
  ------------------
  |  Branch (217:7): [True: 81.7k, False: 60.5k]
  ------------------
  218|  81.7k|    if (col == row * 2) return 1;
  ------------------
  |  Branch (218:9): [True: 61.7k, False: 20.0k]
  ------------------
  219|  20.0k|    if (col == row * 4) return 2;
  ------------------
  |  Branch (219:9): [True: 20.0k, False: 0]
  ------------------
  220|  20.0k|    assert(0 && "Unsupported transform size");
  221|  60.5k|  } else {
  222|  60.5k|    if (row == col * 2) return -1;
  ------------------
  |  Branch (222:9): [True: 46.8k, False: 13.6k]
  ------------------
  223|  13.6k|    if (row == col * 4) return -2;
  ------------------
  |  Branch (223:9): [True: 13.6k, False: 18.4E]
  ------------------
  224|  18.4E|    assert(0 && "Unsupported transform size");
  225|  18.4E|  }
  226|  18.4E|  return 0;  // Invalid
  227|   142k|}
highbd_inv_txfm_avx2.c:cospi_arr:
   47|  1.42M|static inline const int32_t *cospi_arr(int n) {
   48|  1.42M|  return av1_cospi_arr_data[n - cos_bit_min];
   49|  1.42M|}
highbd_inv_txfm_avx2.c:get_flip_cfg:
  169|   465k|static inline void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) {
  170|   465k|  switch (tx_type) {
  171|   259k|    case DCT_DCT:
  ------------------
  |  Branch (171:5): [True: 259k, False: 206k]
  ------------------
  172|   317k|    case ADST_DCT:
  ------------------
  |  Branch (172:5): [True: 58.6k, False: 406k]
  ------------------
  173|   395k|    case DCT_ADST:
  ------------------
  |  Branch (173:5): [True: 77.9k, False: 387k]
  ------------------
  174|   463k|    case ADST_ADST:
  ------------------
  |  Branch (174:5): [True: 67.9k, False: 397k]
  ------------------
  175|   463k|      *ud_flip = 0;
  176|   463k|      *lr_flip = 0;
  177|   463k|      break;
  178|      0|    case IDTX:
  ------------------
  |  Branch (178:5): [True: 0, False: 465k]
  ------------------
  179|      0|    case V_DCT:
  ------------------
  |  Branch (179:5): [True: 0, False: 465k]
  ------------------
  180|      0|    case H_DCT:
  ------------------
  |  Branch (180:5): [True: 0, False: 465k]
  ------------------
  181|      0|    case V_ADST:
  ------------------
  |  Branch (181:5): [True: 0, False: 465k]
  ------------------
  182|      0|    case H_ADST:
  ------------------
  |  Branch (182:5): [True: 0, False: 465k]
  ------------------
  183|      0|      *ud_flip = 0;
  184|      0|      *lr_flip = 0;
  185|      0|      break;
  186|    491|    case FLIPADST_DCT:
  ------------------
  |  Branch (186:5): [True: 491, False: 464k]
  ------------------
  187|    791|    case FLIPADST_ADST:
  ------------------
  |  Branch (187:5): [True: 300, False: 464k]
  ------------------
  188|    791|    case V_FLIPADST:
  ------------------
  |  Branch (188:5): [True: 0, False: 465k]
  ------------------
  189|    791|      *ud_flip = 1;
  190|    791|      *lr_flip = 0;
  191|    791|      break;
  192|    299|    case DCT_FLIPADST:
  ------------------
  |  Branch (192:5): [True: 299, False: 464k]
  ------------------
  193|    505|    case ADST_FLIPADST:
  ------------------
  |  Branch (193:5): [True: 206, False: 464k]
  ------------------
  194|    505|    case H_FLIPADST:
  ------------------
  |  Branch (194:5): [True: 0, False: 465k]
  ------------------
  195|    505|      *ud_flip = 0;
  196|    505|      *lr_flip = 1;
  197|    505|      break;
  198|    267|    case FLIPADST_FLIPADST:
  ------------------
  |  Branch (198:5): [True: 267, False: 464k]
  ------------------
  199|    267|      *ud_flip = 1;
  200|    267|      *lr_flip = 1;
  201|    267|      break;
  202|      0|    default:
  ------------------
  |  Branch (202:5): [True: 0, False: 465k]
  ------------------
  203|      0|      *ud_flip = 0;
  204|      0|      *lr_flip = 0;
  205|       |      assert(0);
  206|   465k|  }
  207|   465k|}

av1_left_block_mode:
   17|  1.60M|PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) {
   18|  1.60M|  if (!left_mi) return DC_PRED;
  ------------------
  |  Branch (18:7): [True: 83.7k, False: 1.52M]
  ------------------
   19|  1.60M|  assert(!is_inter_block(left_mi) || is_intrabc_block(left_mi));
   20|  1.52M|  return left_mi->mode;
   21|  1.60M|}
av1_above_block_mode:
   23|  1.60M|PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) {
   24|  1.60M|  if (!above_mi) return DC_PRED;
  ------------------
  |  Branch (24:7): [True: 99.5k, False: 1.50M]
  ------------------
   25|  1.60M|  assert(!is_inter_block(above_mi) || is_intrabc_block(above_mi));
   26|  1.50M|  return above_mi->mode;
   27|  1.60M|}
av1_set_entropy_contexts:
   32|  4.71M|                              int has_eob, int aoff, int loff) {
   33|  4.71M|  ENTROPY_CONTEXT *const a = pd->above_entropy_context + aoff;
   34|  4.71M|  ENTROPY_CONTEXT *const l = pd->left_entropy_context + loff;
   35|  4.71M|  const int txs_wide = tx_size_wide_unit[tx_size];
   36|  4.71M|  const int txs_high = tx_size_high_unit[tx_size];
   37|       |
   38|       |  // above
   39|  4.71M|  if (has_eob && xd->mb_to_right_edge < 0) {
  ------------------
  |  Branch (39:7): [True: 2.77M, False: 1.94M]
  |  Branch (39:18): [True: 44.2k, False: 2.73M]
  ------------------
   40|  44.2k|    const int blocks_wide = max_block_wide(xd, plane_bsize, plane);
   41|  44.2k|    const int above_contexts = AOMMIN(txs_wide, blocks_wide - aoff);
  ------------------
  |  |   34|  44.2k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 22.8k, False: 21.4k]
  |  |  ------------------
  ------------------
   42|  44.2k|    memset(a, has_eob, sizeof(*a) * above_contexts);
   43|  44.2k|    memset(a + above_contexts, 0, sizeof(*a) * (txs_wide - above_contexts));
   44|  4.67M|  } else {
   45|  4.67M|    memset(a, has_eob, sizeof(*a) * txs_wide);
   46|  4.67M|  }
   47|       |
   48|       |  // left
   49|  4.71M|  if (has_eob && xd->mb_to_bottom_edge < 0) {
  ------------------
  |  Branch (49:7): [True: 2.77M, False: 1.94M]
  |  Branch (49:18): [True: 27.9k, False: 2.74M]
  ------------------
   50|  27.9k|    const int blocks_high = max_block_high(xd, plane_bsize, plane);
   51|  27.9k|    const int left_contexts = AOMMIN(txs_high, blocks_high - loff);
  ------------------
  |  |   34|  27.9k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 15.1k, False: 12.7k]
  |  |  ------------------
  ------------------
   52|  27.9k|    memset(l, has_eob, sizeof(*l) * left_contexts);
   53|  27.9k|    memset(l + left_contexts, 0, sizeof(*l) * (txs_high - left_contexts));
   54|  4.68M|  } else {
   55|  4.68M|    memset(l, has_eob, sizeof(*l) * txs_high);
   56|  4.68M|  }
   57|  4.71M|}
av1_reset_entropy_context:
   59|   452k|                               const int num_planes) {
   60|   452k|  assert(bsize < BLOCK_SIZES_ALL);
   61|   452k|  const int nplanes = 1 + (num_planes - 1) * xd->is_chroma_ref;
   62|  1.43M|  for (int i = 0; i < nplanes; i++) {
  ------------------
  |  Branch (62:19): [True: 984k, False: 452k]
  ------------------
   63|   984k|    struct macroblockd_plane *const pd = &xd->plane[i];
   64|   984k|    const BLOCK_SIZE plane_bsize =
   65|   984k|        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
   66|   984k|    const int txs_wide = mi_size_wide[plane_bsize];
   67|   984k|    const int txs_high = mi_size_high[plane_bsize];
   68|   984k|    memset(pd->above_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide);
   69|   984k|    memset(pd->left_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high);
   70|   984k|  }
   71|   452k|}
av1_reset_loop_filter_delta:
   73|  29.1k|void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes) {
   74|  29.1k|  xd->delta_lf_from_base = 0;
   75|  29.1k|  const int frame_lf_count =
   76|  29.1k|      num_planes > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
  ------------------
  |  |   72|  23.5k|#define FRAME_LF_COUNT 4
  ------------------
                    num_planes > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
  ------------------
  |  |   72|  5.57k|#define FRAME_LF_COUNT 4
  ------------------
  |  Branch (76:7): [True: 23.5k, False: 5.57k]
  ------------------
   77|   134k|  for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) xd->delta_lf[lf_id] = 0;
  ------------------
  |  Branch (77:23): [True: 105k, False: 29.1k]
  ------------------
   78|  29.1k|}
av1_reset_loop_restoration:
   80|  27.3k|void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes) {
   81|  98.6k|  for (int p = 0; p < num_planes; ++p) {
  ------------------
  |  Branch (81:19): [True: 71.3k, False: 27.3k]
  ------------------
   82|  71.3k|    set_default_wiener(xd->wiener_info + p);
   83|  71.3k|    set_default_sgrproj(xd->sgrproj_info + p);
   84|  71.3k|  }
   85|  27.3k|}
av1_setup_block_planes:
   88|  26.1k|                            const int num_planes) {
   89|  26.1k|  int i;
   90|       |
   91|  94.4k|  for (i = 0; i < num_planes; i++) {
  ------------------
  |  Branch (91:15): [True: 68.3k, False: 26.1k]
  ------------------
   92|  68.3k|    xd->plane[i].plane_type = get_plane_type(i);
   93|  68.3k|    xd->plane[i].subsampling_x = i ? ss_x : 0;
  ------------------
  |  Branch (93:34): [True: 42.2k, False: 26.1k]
  ------------------
   94|  68.3k|    xd->plane[i].subsampling_y = i ? ss_y : 0;
  ------------------
  |  Branch (94:34): [True: 42.2k, False: 26.1k]
  ------------------
   95|  68.3k|  }
   96|  36.1k|  for (i = num_planes; i < MAX_MB_PLANE; i++) {
  ------------------
  |  |   36|  36.1k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (96:24): [True: 10.0k, False: 26.1k]
  ------------------
   97|  10.0k|    xd->plane[i].subsampling_x = 1;
   98|  10.0k|    xd->plane[i].subsampling_y = 1;
   99|  10.0k|  }
  100|  26.1k|}

decodeframe.c:get_plane_type:
 1592|  4.68M|static inline PLANE_TYPE get_plane_type(int plane) {
 1593|  4.68M|  return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
  ------------------
  |  Branch (1593:10): [True: 2.47M, False: 2.21M]
  ------------------
 1594|  4.68M|}
decodeframe.c:av1_get_tx_type:
 1281|  2.09M|                                      int reduced_tx_set) {
 1282|  2.09M|  const MB_MODE_INFO *const mbmi = xd->mi[0];
 1283|  2.09M|  if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) {
  ------------------
  |  Branch (1283:7): [True: 459k, False: 1.63M]
  |  Branch (1283:41): [True: 26.8k, False: 1.60M]
  ------------------
 1284|   486k|    return DCT_DCT;
 1285|   486k|  }
 1286|       |
 1287|  1.60M|  TX_TYPE tx_type;
 1288|  1.60M|  if (plane_type == PLANE_TYPE_Y) {
  ------------------
  |  Branch (1288:7): [True: 1.09M, False: 509k]
  ------------------
 1289|  1.09M|    tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
 1290|  1.09M|  } else {
 1291|   509k|    if (is_inter_block(mbmi)) {
  ------------------
  |  Branch (1291:9): [True: 61.9k, False: 447k]
  ------------------
 1292|       |      // scale back to y plane's coordinate
 1293|  61.9k|      const struct macroblockd_plane *const pd = &xd->plane[plane_type];
 1294|  61.9k|      blk_row <<= pd->subsampling_y;
 1295|  61.9k|      blk_col <<= pd->subsampling_x;
 1296|  61.9k|      tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
 1297|   447k|    } else {
 1298|       |      // In intra mode, uv planes don't share the same prediction mode as y
 1299|       |      // plane, so the tx_type should not be shared
 1300|   447k|      tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV);
 1301|   447k|    }
 1302|   509k|    const TxSetType tx_set_type =
 1303|   509k|        av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set);
 1304|   509k|    if (!av1_ext_tx_used[tx_set_type][tx_type]) tx_type = DCT_DCT;
  ------------------
  |  Branch (1304:9): [True: 28.5k, False: 480k]
  ------------------
 1305|   509k|  }
 1306|  1.60M|  assert(tx_type < TX_TYPES);
 1307|       |  assert(av1_ext_tx_used[av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi),
 1308|  1.60M|                                                 reduced_tx_set)][tx_type]);
 1309|  1.60M|  return tx_type;
 1310|  2.09M|}
decodeframe.c:is_inter_block:
  372|  8.47M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|  8.47M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 50.5k, False: 8.42M]
  |  Branch (373:36): [True: 357k, False: 8.06M]
  ------------------
  374|  8.47M|}
decodeframe.c:is_intrabc_block:
  345|  10.4M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|  10.4M|  return mbmi->use_intrabc;
  347|  10.4M|}
decodeframe.c:intra_mode_to_tx_type:
 1003|   447k|                                     PLANE_TYPE plane_type) {
 1004|   447k|  static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = {
 1005|   447k|    DCT_DCT,    // DC_PRED
 1006|   447k|    ADST_DCT,   // V_PRED
 1007|   447k|    DCT_ADST,   // H_PRED
 1008|   447k|    DCT_DCT,    // D45_PRED
 1009|   447k|    ADST_ADST,  // D135_PRED
 1010|   447k|    ADST_DCT,   // D113_PRED
 1011|   447k|    DCT_ADST,   // D157_PRED
 1012|   447k|    DCT_ADST,   // D203_PRED
 1013|   447k|    ADST_DCT,   // D67_PRED
 1014|   447k|    ADST_ADST,  // SMOOTH_PRED
 1015|   447k|    ADST_DCT,   // SMOOTH_V_PRED
 1016|   447k|    DCT_ADST,   // SMOOTH_H_PRED
 1017|   447k|    ADST_ADST,  // PAETH_PRED
 1018|   447k|  };
 1019|   447k|  const PREDICTION_MODE mode =
 1020|   447k|      (plane_type == PLANE_TYPE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
  ------------------
  |  Branch (1020:7): [True: 0, False: 447k]
  ------------------
 1021|       |  assert(mode < INTRA_MODES);
 1022|   447k|  return _intra_mode_to_tx_type[mode];
 1023|   447k|}
decodeframe.c:get_uv_mode:
  349|   447k|static inline PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) {
  350|       |  assert(mode < UV_INTRA_MODES);
  351|   447k|  static const PREDICTION_MODE uv2y[] = {
  352|   447k|    DC_PRED,        // UV_DC_PRED
  353|   447k|    V_PRED,         // UV_V_PRED
  354|   447k|    H_PRED,         // UV_H_PRED
  355|   447k|    D45_PRED,       // UV_D45_PRED
  356|   447k|    D135_PRED,      // UV_D135_PRED
  357|   447k|    D113_PRED,      // UV_D113_PRED
  358|   447k|    D157_PRED,      // UV_D157_PRED
  359|   447k|    D203_PRED,      // UV_D203_PRED
  360|   447k|    D67_PRED,       // UV_D67_PRED
  361|   447k|    SMOOTH_PRED,    // UV_SMOOTH_PRED
  362|   447k|    SMOOTH_V_PRED,  // UV_SMOOTH_V_PRED
  363|   447k|    SMOOTH_H_PRED,  // UV_SMOOTH_H_PRED
  364|   447k|    PAETH_PRED,     // UV_PAETH_PRED
  365|   447k|    DC_PRED,        // UV_CFL_PRED
  366|   447k|    INTRA_INVALID,  // UV_INTRA_MODES
  367|   447k|    INTRA_INVALID,  // UV_MODE_INVALID
  368|   447k|  };
  369|   447k|  return uv2y[mode];
  370|   447k|}
decodeframe.c:av1_get_ext_tx_set_type:
 1098|   509k|                                                int use_reduced_set) {
 1099|   509k|  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
 1100|   509k|  if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY;
  ------------------
  |  Branch (1100:7): [True: 0, False: 509k]
  ------------------
 1101|   509k|  if (tx_size_sqr_up == TX_32X32)
  ------------------
  |  Branch (1101:7): [True: 81.0k, False: 428k]
  ------------------
 1102|  81.0k|    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
  ------------------
  |  Branch (1102:12): [True: 6.66k, False: 74.3k]
  ------------------
 1103|   428k|  if (use_reduced_set)
  ------------------
  |  Branch (1103:7): [True: 45.2k, False: 382k]
  ------------------
 1104|  45.2k|    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
  ------------------
  |  Branch (1104:12): [True: 12.1k, False: 33.0k]
  ------------------
 1105|   382k|  const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
 1106|   382k|  return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16];
 1107|   428k|}
decodeframe.c:has_second_ref:
  376|   272k|static inline int has_second_ref(const MB_MODE_INFO *mbmi) {
  377|   272k|  return mbmi->ref_frame[1] > INTRA_FRAME;
  378|   272k|}
decodeframe.c:get_plane_block_size:
 1188|  5.18M|                                              int subsampling_y) {
 1189|  5.18M|  assert(bsize < BLOCK_SIZES_ALL);
 1190|  5.18M|  assert(subsampling_x >= 0 && subsampling_x < 2);
 1191|       |  assert(subsampling_y >= 0 && subsampling_y < 2);
 1192|  5.18M|  return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y];
 1193|  5.18M|}
decodeframe.c:is_cur_buf_hbd:
  932|   193k|static inline int is_cur_buf_hbd(const MACROBLOCKD *xd) {
  933|   193k|#if CONFIG_AV1_HIGHBITDEPTH
  934|   193k|  return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
  ------------------
  |  |  142|   193k|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (934:10): [True: 37.2k, False: 156k]
  ------------------
  935|       |#else
  936|       |  (void)xd;
  937|       |  return 0;
  938|       |#endif
  939|   193k|}
decodeframe.c:is_global_mv_block:
  422|   171k|                                     TransformationType type) {
  423|   171k|  const PREDICTION_MODE mode = mbmi->mode;
  424|   171k|  const BLOCK_SIZE bsize = mbmi->bsize;
  425|   171k|  const int block_size_allowed =
  426|   171k|      AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
  ------------------
  |  |   34|   171k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 35.4k, False: 136k]
  |  |  ------------------
  ------------------
  427|   171k|  return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION &&
  ------------------
  |  Branch (427:11): [True: 3.28k, False: 168k]
  |  Branch (427:31): [True: 4.25k, False: 164k]
  |  Branch (427:59): [True: 1.13k, False: 6.40k]
  ------------------
  428|  1.13k|         block_size_allowed;
  ------------------
  |  Branch (428:10): [True: 1.08k, False: 45]
  ------------------
  429|   171k|}
decodeframe.c:is_masked_compound_type:
  161|   171k|static inline int is_masked_compound_type(COMPOUND_TYPE type) {
  162|   171k|  return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD);
  ------------------
  |  Branch (162:11): [True: 2.77k, False: 168k]
  |  Branch (162:37): [True: 3.61k, False: 165k]
  ------------------
  163|   171k|}
decodeframe.c:is_interintra_pred:
 1442|   131k|static inline int is_interintra_pred(const MB_MODE_INFO *mbmi) {
 1443|   131k|  return mbmi->ref_frame[0] > INTRA_FRAME &&
  ------------------
  |  Branch (1443:10): [True: 118k, False: 13.1k]
  ------------------
 1444|   118k|         mbmi->ref_frame[1] == INTRA_FRAME && is_interintra_allowed(mbmi);
  ------------------
  |  Branch (1444:10): [True: 4.89k, False: 113k]
  |  Branch (1444:47): [True: 4.89k, False: 0]
  ------------------
 1445|   131k|}
decodeframe.c:is_interintra_allowed:
 1425|  4.89k|static inline int is_interintra_allowed(const MB_MODE_INFO *mbmi) {
 1426|  4.89k|  return is_interintra_allowed_bsize(mbmi->bsize) &&
  ------------------
  |  Branch (1426:10): [True: 4.89k, False: 0]
  ------------------
 1427|  4.89k|         is_interintra_allowed_mode(mbmi->mode) &&
  ------------------
  |  Branch (1427:10): [True: 4.89k, False: 0]
  ------------------
 1428|  4.89k|         is_interintra_allowed_ref(mbmi->ref_frame);
  ------------------
  |  Branch (1428:10): [True: 4.89k, False: 0]
  ------------------
 1429|  4.89k|}
decodeframe.c:is_interintra_allowed_bsize:
 1413|  4.89k|static inline int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
 1414|  4.89k|  return (bsize >= BLOCK_8X8) && (bsize <= BLOCK_32X32);
  ------------------
  |  Branch (1414:10): [True: 4.89k, False: 0]
  |  Branch (1414:34): [True: 4.89k, False: 0]
  ------------------
 1415|  4.89k|}
decodeframe.c:is_interintra_allowed_mode:
 1417|  4.89k|static inline int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
 1418|  4.89k|  return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END);
  ------------------
  |  Branch (1418:10): [True: 4.89k, False: 0]
  |  Branch (1418:47): [True: 4.89k, False: 0]
  ------------------
 1419|  4.89k|}
decodeframe.c:is_interintra_allowed_ref:
 1421|  4.89k|static inline int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
 1422|  4.89k|  return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME);
  ------------------
  |  Branch (1422:10): [True: 4.89k, False: 0]
  |  Branch (1422:35): [True: 4.89k, False: 0]
  ------------------
 1423|  4.89k|}
decodeframe.c:is_neighbor_overlappable:
 1494|  14.3k|static inline int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
 1495|  14.3k|  return (is_inter_block(mbmi));
 1496|  14.3k|}
decodeframe.c:get_partition_subsize:
  991|  5.69M|                                               PARTITION_TYPE partition) {
  992|  5.69M|  if (partition == PARTITION_INVALID) {
  ------------------
  |  Branch (992:7): [True: 0, False: 5.69M]
  ------------------
  993|      0|    return BLOCK_INVALID;
  994|  5.69M|  } else {
  995|  5.69M|    const int sqr_bsize_idx = get_sqr_bsize_idx(bsize);
  996|  5.69M|    return sqr_bsize_idx >= SQR_BLOCK_SIZES
  ------------------
  |  |  129|  5.69M|#define SQR_BLOCK_SIZES 6
  ------------------
  |  Branch (996:12): [True: 0, False: 5.69M]
  ------------------
  997|  5.69M|               ? BLOCK_INVALID
  998|  5.69M|               : subsize_lookup[partition][sqr_bsize_idx];
  999|  5.69M|  }
 1000|  5.69M|}
decodeframe.c:get_sqr_bsize_idx:
  971|  5.69M|static inline int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
  972|  5.69M|  switch (bsize) {
  973|   381k|    case BLOCK_4X4: return 0;
  ------------------
  |  Branch (973:5): [True: 381k, False: 5.31M]
  ------------------
  974|  1.53M|    case BLOCK_8X8: return 1;
  ------------------
  |  Branch (974:5): [True: 1.53M, False: 4.16M]
  ------------------
  975|  2.00M|    case BLOCK_16X16: return 2;
  ------------------
  |  Branch (975:5): [True: 2.00M, False: 3.69M]
  ------------------
  976|  1.12M|    case BLOCK_32X32: return 3;
  ------------------
  |  Branch (976:5): [True: 1.12M, False: 4.57M]
  ------------------
  977|   621k|    case BLOCK_64X64: return 4;
  ------------------
  |  Branch (977:5): [True: 621k, False: 5.07M]
  ------------------
  978|  42.7k|    case BLOCK_128X128: return 5;
  ------------------
  |  Branch (978:5): [True: 42.7k, False: 5.65M]
  ------------------
  979|      0|    default: return SQR_BLOCK_SIZES;
  ------------------
  |  |  129|      0|#define SQR_BLOCK_SIZES 6
  ------------------
  |  Branch (979:5): [True: 0, False: 5.69M]
  ------------------
  980|  5.69M|  }
  981|  5.69M|}
decodeframe.c:block_signals_txsize:
 1027|  2.42M|static inline int block_signals_txsize(BLOCK_SIZE bsize) {
 1028|  2.42M|  return bsize > BLOCK_4X4;
 1029|  2.42M|}
decodeframe.c:bsize_to_tx_size_cat:
 1344|   646k|static inline int bsize_to_tx_size_cat(BLOCK_SIZE bsize) {
 1345|   646k|  assert(bsize < BLOCK_SIZES_ALL);
 1346|   646k|  static const uint8_t bsize_to_tx_size_depth_table[BLOCK_SIZES_ALL] = {
 1347|   646k|    0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 2, 2, 3, 3, 4, 4,
 1348|   646k|  };
 1349|   646k|  const int depth = bsize_to_tx_size_depth_table[bsize];
 1350|       |  assert(depth <= MAX_TX_CATS);
 1351|   646k|  return depth - 1;
 1352|   646k|}
decodeframe.c:bsize_to_max_depth:
 1325|   646k|static inline int bsize_to_max_depth(BLOCK_SIZE bsize) {
 1326|   646k|  static const uint8_t bsize_to_max_depth_table[BLOCK_SIZES_ALL] = {
 1327|   646k|    0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 1328|   646k|  };
 1329|   646k|  return bsize_to_max_depth_table[bsize];
 1330|   646k|}
decodeframe.c:depth_to_tx_size:
 1354|   646k|static inline TX_SIZE depth_to_tx_size(int depth, BLOCK_SIZE bsize) {
 1355|   646k|  TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
 1356|   646k|  TX_SIZE tx_size = max_tx_size;
 1357|   989k|  for (int d = 0; d < depth; ++d) tx_size = sub_tx_size_map[tx_size];
  ------------------
  |  Branch (1357:19): [True: 343k, False: 646k]
  ------------------
 1358|   646k|  return tx_size;
 1359|   646k|}
decodeframe.c:tx_size_from_tx_mode:
 1134|   897k|static inline TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) {
 1135|   897k|  const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
 1136|   897k|  const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bsize];
 1137|   897k|  if (bsize == BLOCK_4X4)
  ------------------
  |  Branch (1137:7): [True: 0, False: 897k]
  ------------------
 1138|      0|    return AOMMIN(max_txsize_lookup[bsize], largest_tx_size);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1139|   897k|  if (txsize_sqr_map[max_rect_tx_size] <= largest_tx_size)
  ------------------
  |  Branch (1139:7): [True: 897k, False: 18.4E]
  ------------------
 1140|   897k|    return max_rect_tx_size;
 1141|  18.4E|  else
 1142|  18.4E|    return largest_tx_size;
 1143|   897k|}
decodeframe.c:av1_get_tx_size:
 1381|  5.24M|static inline TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) {
 1382|  5.24M|  const MB_MODE_INFO *mbmi = xd->mi[0];
 1383|  5.24M|  if (xd->lossless[mbmi->segment_id]) return TX_4X4;
  ------------------
  |  Branch (1383:7): [True: 148k, False: 5.09M]
  ------------------
 1384|  5.09M|  if (plane == 0) return mbmi->tx_size;
  ------------------
  |  Branch (1384:7): [True: 2.36M, False: 2.72M]
  ------------------
 1385|  2.72M|  const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
 1386|  2.72M|  return av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
 1387|  2.72M|                               pd->subsampling_y);
 1388|  5.09M|}
decodeframe.c:av1_get_max_uv_txsize:
 1373|  2.89M|                                            int subsampling_y) {
 1374|  2.89M|  const BLOCK_SIZE plane_bsize =
 1375|  2.89M|      get_plane_block_size(bsize, subsampling_x, subsampling_y);
 1376|       |  assert(plane_bsize < BLOCK_SIZES_ALL);
 1377|  2.89M|  const TX_SIZE uv_tx = max_txsize_rect_lookup[plane_bsize];
 1378|  2.89M|  return av1_get_adjusted_tx_size(uv_tx);
 1379|  2.89M|}
decodeframe.c:av1_get_adjusted_tx_size:
 1361|  2.99M|static inline TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) {
 1362|  2.99M|  switch (tx_size) {
 1363|  28.7k|    case TX_64X64:
  ------------------
  |  Branch (1363:5): [True: 28.7k, False: 2.96M]
  ------------------
 1364|  44.8k|    case TX_64X32:
  ------------------
  |  Branch (1364:5): [True: 16.0k, False: 2.98M]
  ------------------
 1365|  60.1k|    case TX_32X64: return TX_32X32;
  ------------------
  |  Branch (1365:5): [True: 15.3k, False: 2.98M]
  ------------------
 1366|  13.4k|    case TX_64X16: return TX_32X16;
  ------------------
  |  Branch (1366:5): [True: 13.4k, False: 2.98M]
  ------------------
 1367|  14.7k|    case TX_16X64: return TX_16X32;
  ------------------
  |  Branch (1367:5): [True: 14.7k, False: 2.98M]
  ------------------
 1368|  2.90M|    default: return tx_size;
  ------------------
  |  Branch (1368:5): [True: 2.90M, False: 88.0k]
  ------------------
 1369|  2.99M|  }
 1370|  2.99M|}
decodeframe.c:get_vartx_max_txsize:
 1448|   194k|                                       int plane) {
 1449|   194k|  if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;
  ------------------
  |  Branch (1449:7): [True: 17.4k, False: 176k]
  ------------------
 1450|   176k|  const TX_SIZE max_txsize = max_txsize_rect_lookup[bsize];
 1451|   176k|  if (plane == 0) return max_txsize;            // luma
  ------------------
  |  Branch (1451:7): [True: 70.1k, False: 106k]
  ------------------
 1452|   106k|  return av1_get_adjusted_tx_size(max_txsize);  // chroma
 1453|   176k|}
decodeframe.c:av1_get_txb_size_index:
 1207|   146k|                                         int blk_col) {
 1208|   146k|  static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = {
 1209|   146k|    0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 1, 1, 2, 2, 3,
 1210|   146k|  };
 1211|   146k|  static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = {
 1212|   146k|    0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 0, 2, 1, 3, 2,
 1213|   146k|  };
 1214|   146k|  static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = {
 1215|   146k|    0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 2, 0, 1, 0, 1, 0, 1,
 1216|   146k|  };
 1217|   146k|  const int index =
 1218|   146k|      ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) +
 1219|   146k|      (blk_col >> tw_w_log2_table[bsize]);
 1220|       |  assert(index < INTER_TX_SIZE_BUF_LEN);
 1221|   146k|  return index;
 1222|   146k|}
decodeframe.c:av1_get_block_dimensions:
 1516|  93.0k|                                            int *cols_within_bounds) {
 1517|  93.0k|  const int block_height = block_size_high[bsize];
 1518|  93.0k|  const int block_width = block_size_wide[bsize];
 1519|  93.0k|  const int block_rows = (xd->mb_to_bottom_edge >= 0)
  ------------------
  |  Branch (1519:26): [True: 92.8k, False: 219]
  ------------------
 1520|  93.0k|                             ? block_height
 1521|  93.0k|                             : (xd->mb_to_bottom_edge >> 3) + block_height;
 1522|  93.0k|  const int block_cols = (xd->mb_to_right_edge >= 0)
  ------------------
  |  Branch (1522:26): [True: 92.5k, False: 538]
  ------------------
 1523|  93.0k|                             ? block_width
 1524|  93.0k|                             : (xd->mb_to_right_edge >> 3) + block_width;
 1525|  93.0k|  const struct macroblockd_plane *const pd = &xd->plane[plane];
 1526|  93.0k|  assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_x == 0));
 1527|  93.0k|  assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_y == 0));
 1528|  93.0k|  assert(block_width >= block_cols);
 1529|  93.0k|  assert(block_height >= block_rows);
 1530|  93.0k|  const int plane_block_width = block_width >> pd->subsampling_x;
 1531|  93.0k|  const int plane_block_height = block_height >> pd->subsampling_y;
 1532|       |  // Special handling for chroma sub8x8.
 1533|  93.0k|  const int is_chroma_sub8_x = plane > 0 && plane_block_width < 4;
  ------------------
  |  Branch (1533:32): [True: 22.2k, False: 70.8k]
  |  Branch (1533:45): [True: 0, False: 22.2k]
  ------------------
 1534|  93.0k|  const int is_chroma_sub8_y = plane > 0 && plane_block_height < 4;
  ------------------
  |  Branch (1534:32): [True: 22.2k, False: 70.8k]
  |  Branch (1534:45): [True: 0, False: 22.2k]
  ------------------
 1535|  93.0k|  if (width) {
  ------------------
  |  Branch (1535:7): [True: 93.0k, False: 20]
  ------------------
 1536|  93.0k|    *width = plane_block_width + 2 * is_chroma_sub8_x;
 1537|  93.0k|    assert(*width >= 0);
 1538|  93.0k|  }
 1539|  93.0k|  if (height) {
  ------------------
  |  Branch (1539:7): [True: 93.0k, False: 20]
  ------------------
 1540|  93.0k|    *height = plane_block_height + 2 * is_chroma_sub8_y;
 1541|  93.0k|    assert(*height >= 0);
 1542|  93.0k|  }
 1543|  93.0k|  if (rows_within_bounds) {
  ------------------
  |  Branch (1543:7): [True: 0, False: 93.0k]
  ------------------
 1544|      0|    *rows_within_bounds =
 1545|      0|        (block_rows >> pd->subsampling_y) + 2 * is_chroma_sub8_y;
 1546|      0|    assert(*rows_within_bounds >= 0);
 1547|      0|  }
 1548|  93.0k|  if (cols_within_bounds) {
  ------------------
  |  Branch (1548:7): [True: 0, False: 93.0k]
  ------------------
 1549|      0|    *cols_within_bounds =
 1550|      0|        (block_cols >> pd->subsampling_x) + 2 * is_chroma_sub8_x;
 1551|       |    assert(*cols_within_bounds >= 0);
 1552|      0|  }
 1553|  93.0k|}
decodemv.c:is_inter_block:
  372|  2.63M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|  2.63M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 9.76k, False: 2.62M]
  |  Branch (373:36): [True: 252k, False: 2.37M]
  ------------------
  374|  2.63M|}
decodemv.c:is_intrabc_block:
  345|  2.98M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|  2.98M|  return mbmi->use_intrabc;
  347|  2.98M|}
decodemv.c:get_ext_tx_types:
 1125|  1.46M|                                   int use_reduced_set) {
 1126|  1.46M|  const int set_type =
 1127|  1.46M|      av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set);
 1128|  1.46M|  return av1_num_ext_tx_set[set_type];
 1129|  1.46M|}
decodemv.c:av1_get_ext_tx_set_type:
 1098|  4.13M|                                                int use_reduced_set) {
 1099|  4.13M|  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
 1100|  4.13M|  if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY;
  ------------------
  |  Branch (1100:7): [True: 30.4k, False: 4.10M]
  ------------------
 1101|  4.10M|  if (tx_size_sqr_up == TX_32X32)
  ------------------
  |  Branch (1101:7): [True: 125k, False: 3.98M]
  ------------------
 1102|   125k|    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
  ------------------
  |  Branch (1102:12): [True: 21.7k, False: 103k]
  ------------------
 1103|  3.98M|  if (use_reduced_set)
  ------------------
  |  Branch (1103:7): [True: 1.09M, False: 2.88M]
  ------------------
 1104|  1.09M|    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
  ------------------
  |  Branch (1104:12): [True: 43.9k, False: 1.05M]
  ------------------
 1105|  2.88M|  const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
 1106|  2.88M|  return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16];
 1107|  3.98M|}
decodemv.c:get_ext_tx_set:
 1118|  1.33M|                                 int use_reduced_set) {
 1119|  1.33M|  const TxSetType set_type =
 1120|  1.33M|      av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set);
 1121|  1.33M|  return ext_tx_set_index[is_inter][set_type];
 1122|  1.33M|}
decodemv.c:get_plane_block_size:
 1188|  27.0k|                                              int subsampling_y) {
 1189|  27.0k|  assert(bsize < BLOCK_SIZES_ALL);
 1190|  27.0k|  assert(subsampling_x >= 0 && subsampling_x < 2);
 1191|       |  assert(subsampling_y >= 0 && subsampling_y < 2);
 1192|  27.0k|  return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y];
 1193|  27.0k|}
decodemv.c:get_uv_mode:
  349|   977k|static inline PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) {
  350|       |  assert(mode < UV_INTRA_MODES);
  351|   977k|  static const PREDICTION_MODE uv2y[] = {
  352|   977k|    DC_PRED,        // UV_DC_PRED
  353|   977k|    V_PRED,         // UV_V_PRED
  354|   977k|    H_PRED,         // UV_H_PRED
  355|   977k|    D45_PRED,       // UV_D45_PRED
  356|   977k|    D135_PRED,      // UV_D135_PRED
  357|   977k|    D113_PRED,      // UV_D113_PRED
  358|   977k|    D157_PRED,      // UV_D157_PRED
  359|   977k|    D203_PRED,      // UV_D203_PRED
  360|   977k|    D67_PRED,       // UV_D67_PRED
  361|   977k|    SMOOTH_PRED,    // UV_SMOOTH_PRED
  362|   977k|    SMOOTH_V_PRED,  // UV_SMOOTH_V_PRED
  363|   977k|    SMOOTH_H_PRED,  // UV_SMOOTH_H_PRED
  364|   977k|    PAETH_PRED,     // UV_PAETH_PRED
  365|   977k|    DC_PRED,        // UV_CFL_PRED
  366|   977k|    INTRA_INVALID,  // UV_INTRA_MODES
  367|   977k|    INTRA_INVALID,  // UV_MODE_INVALID
  368|   977k|  };
  369|   977k|  return uv2y[mode];
  370|   977k|}
decodemv.c:av1_allow_palette:
 1499|  1.66M|                                    BLOCK_SIZE sb_type) {
 1500|  1.66M|  assert(sb_type < BLOCK_SIZES_ALL);
 1501|  1.66M|  return allow_screen_content_tools &&
  ------------------
  |  Branch (1501:10): [True: 782k, False: 885k]
  ------------------
 1502|   782k|         block_size_wide[sb_type] <= MAX_PALETTE_BLOCK_WIDTH &&
  ------------------
  |  |   44|  2.45M|#define MAX_PALETTE_BLOCK_WIDTH 64
  ------------------
  |  Branch (1502:10): [True: 781k, False: 862]
  ------------------
 1503|   781k|         block_size_high[sb_type] <= MAX_PALETTE_BLOCK_HEIGHT &&
  ------------------
  |  |   46|  2.45M|#define MAX_PALETTE_BLOCK_HEIGHT 64
  ------------------
  |  Branch (1503:10): [True: 781k, False: 53]
  ------------------
 1504|   781k|         sb_type >= BLOCK_8X8;
  ------------------
  |  Branch (1504:10): [True: 601k, False: 180k]
  ------------------
 1505|  1.66M|}
decodemv.c:is_comp_ref_allowed:
   65|   109k|static inline int is_comp_ref_allowed(BLOCK_SIZE bsize) {
   66|   109k|  return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
  ------------------
  |  |   34|   109k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 23.9k, False: 85.5k]
  |  |  ------------------
  ------------------
   67|   109k|}
decodemv.c:has_second_ref:
  376|   560k|static inline int has_second_ref(const MB_MODE_INFO *mbmi) {
  377|   560k|  return mbmi->ref_frame[1] > INTRA_FRAME;
  378|   560k|}
decodemv.c:comp_ref0:
  385|  18.3k|static inline MV_REFERENCE_FRAME comp_ref0(int ref_idx) {
  386|  18.3k|  static const MV_REFERENCE_FRAME lut[] = {
  387|  18.3k|    LAST_FRAME,     // LAST_LAST2_FRAMES,
  388|  18.3k|    LAST_FRAME,     // LAST_LAST3_FRAMES,
  389|  18.3k|    LAST_FRAME,     // LAST_GOLDEN_FRAMES,
  390|  18.3k|    BWDREF_FRAME,   // BWDREF_ALTREF_FRAMES,
  391|  18.3k|    LAST2_FRAME,    // LAST2_LAST3_FRAMES
  392|  18.3k|    LAST2_FRAME,    // LAST2_GOLDEN_FRAMES,
  393|  18.3k|    LAST3_FRAME,    // LAST3_GOLDEN_FRAMES,
  394|  18.3k|    BWDREF_FRAME,   // BWDREF_ALTREF2_FRAMES,
  395|  18.3k|    ALTREF2_FRAME,  // ALTREF2_ALTREF_FRAMES,
  396|  18.3k|  };
  397|       |  assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS);
  398|  18.3k|  return lut[ref_idx];
  399|  18.3k|}
decodemv.c:comp_ref1:
  401|  11.0k|static inline MV_REFERENCE_FRAME comp_ref1(int ref_idx) {
  402|  11.0k|  static const MV_REFERENCE_FRAME lut[] = {
  403|  11.0k|    LAST2_FRAME,    // LAST_LAST2_FRAMES,
  404|  11.0k|    LAST3_FRAME,    // LAST_LAST3_FRAMES,
  405|  11.0k|    GOLDEN_FRAME,   // LAST_GOLDEN_FRAMES,
  406|  11.0k|    ALTREF_FRAME,   // BWDREF_ALTREF_FRAMES,
  407|  11.0k|    LAST3_FRAME,    // LAST2_LAST3_FRAMES
  408|  11.0k|    GOLDEN_FRAME,   // LAST2_GOLDEN_FRAMES,
  409|  11.0k|    GOLDEN_FRAME,   // LAST3_GOLDEN_FRAMES,
  410|  11.0k|    ALTREF2_FRAME,  // BWDREF_ALTREF2_FRAMES,
  411|  11.0k|    ALTREF_FRAME,   // ALTREF2_ALTREF_FRAMES,
  412|  11.0k|  };
  413|       |  assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS);
  414|  11.0k|  return lut[ref_idx];
  415|  11.0k|}
decodemv.c:have_nearmv_in_inter_mode:
  151|  76.5k|static inline int have_nearmv_in_inter_mode(PREDICTION_MODE mode) {
  152|  76.5k|  return (mode == NEARMV || mode == NEAR_NEARMV || mode == NEAR_NEWMV ||
  ------------------
  |  Branch (152:11): [True: 19.1k, False: 57.4k]
  |  Branch (152:29): [True: 4.33k, False: 53.0k]
  |  Branch (152:52): [True: 880, False: 52.1k]
  ------------------
  153|  52.1k|          mode == NEW_NEARMV);
  ------------------
  |  Branch (153:11): [True: 705, False: 51.4k]
  ------------------
  154|  76.5k|}
decodemv.c:is_inter_compound_mode:
   81|  76.7k|static inline int is_inter_compound_mode(PREDICTION_MODE mode) {
   82|  76.7k|  return mode >= COMP_INTER_MODE_START && mode < COMP_INTER_MODE_END;
  ------------------
  |  Branch (82:10): [True: 12.0k, False: 64.7k]
  |  Branch (82:43): [True: 12.0k, False: 0]
  ------------------
   83|  76.7k|}
decodemv.c:compound_ref0_mode:
   85|  12.0k|static inline PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
   86|  12.0k|  static const PREDICTION_MODE lut[] = {
   87|  12.0k|    DC_PRED,        // DC_PRED
   88|  12.0k|    V_PRED,         // V_PRED
   89|  12.0k|    H_PRED,         // H_PRED
   90|  12.0k|    D45_PRED,       // D45_PRED
   91|  12.0k|    D135_PRED,      // D135_PRED
   92|  12.0k|    D113_PRED,      // D113_PRED
   93|  12.0k|    D157_PRED,      // D157_PRED
   94|  12.0k|    D203_PRED,      // D203_PRED
   95|  12.0k|    D67_PRED,       // D67_PRED
   96|  12.0k|    SMOOTH_PRED,    // SMOOTH_PRED
   97|  12.0k|    SMOOTH_V_PRED,  // SMOOTH_V_PRED
   98|  12.0k|    SMOOTH_H_PRED,  // SMOOTH_H_PRED
   99|  12.0k|    PAETH_PRED,     // PAETH_PRED
  100|  12.0k|    NEARESTMV,      // NEARESTMV
  101|  12.0k|    NEARMV,         // NEARMV
  102|  12.0k|    GLOBALMV,       // GLOBALMV
  103|  12.0k|    NEWMV,          // NEWMV
  104|  12.0k|    NEARESTMV,      // NEAREST_NEARESTMV
  105|  12.0k|    NEARMV,         // NEAR_NEARMV
  106|  12.0k|    NEARESTMV,      // NEAREST_NEWMV
  107|  12.0k|    NEWMV,          // NEW_NEARESTMV
  108|  12.0k|    NEARMV,         // NEAR_NEWMV
  109|  12.0k|    NEWMV,          // NEW_NEARMV
  110|  12.0k|    GLOBALMV,       // GLOBAL_GLOBALMV
  111|  12.0k|    NEWMV,          // NEW_NEWMV
  112|  12.0k|  };
  113|  12.0k|  assert(NELEMENTS(lut) == MB_MODE_COUNT);
  114|       |  assert(is_inter_compound_mode(mode) || is_inter_singleref_mode(mode));
  115|  12.0k|  return lut[mode];
  116|  12.0k|}
decodemv.c:compound_ref1_mode:
  118|  12.0k|static inline PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) {
  119|  12.0k|  static const PREDICTION_MODE lut[] = {
  120|  12.0k|    MB_MODE_COUNT,  // DC_PRED
  121|  12.0k|    MB_MODE_COUNT,  // V_PRED
  122|  12.0k|    MB_MODE_COUNT,  // H_PRED
  123|  12.0k|    MB_MODE_COUNT,  // D45_PRED
  124|  12.0k|    MB_MODE_COUNT,  // D135_PRED
  125|  12.0k|    MB_MODE_COUNT,  // D113_PRED
  126|  12.0k|    MB_MODE_COUNT,  // D157_PRED
  127|  12.0k|    MB_MODE_COUNT,  // D203_PRED
  128|  12.0k|    MB_MODE_COUNT,  // D67_PRED
  129|  12.0k|    MB_MODE_COUNT,  // SMOOTH_PRED
  130|  12.0k|    MB_MODE_COUNT,  // SMOOTH_V_PRED
  131|  12.0k|    MB_MODE_COUNT,  // SMOOTH_H_PRED
  132|  12.0k|    MB_MODE_COUNT,  // PAETH_PRED
  133|  12.0k|    MB_MODE_COUNT,  // NEARESTMV
  134|  12.0k|    MB_MODE_COUNT,  // NEARMV
  135|  12.0k|    MB_MODE_COUNT,  // GLOBALMV
  136|  12.0k|    MB_MODE_COUNT,  // NEWMV
  137|  12.0k|    NEARESTMV,      // NEAREST_NEARESTMV
  138|  12.0k|    NEARMV,         // NEAR_NEARMV
  139|  12.0k|    NEWMV,          // NEAREST_NEWMV
  140|  12.0k|    NEARESTMV,      // NEW_NEARESTMV
  141|  12.0k|    NEWMV,          // NEAR_NEWMV
  142|  12.0k|    NEARMV,         // NEW_NEARMV
  143|  12.0k|    GLOBALMV,       // GLOBAL_GLOBALMV
  144|  12.0k|    NEWMV,          // NEW_NEWMV
  145|  12.0k|  };
  146|  12.0k|  assert(NELEMENTS(lut) == MB_MODE_COUNT);
  147|       |  assert(is_inter_compound_mode(mode));
  148|  12.0k|  return lut[mode];
  149|  12.0k|}
decodemv.c:is_interintra_allowed:
 1425|  28.7k|static inline int is_interintra_allowed(const MB_MODE_INFO *mbmi) {
 1426|  28.7k|  return is_interintra_allowed_bsize(mbmi->bsize) &&
  ------------------
  |  Branch (1426:10): [True: 16.6k, False: 12.1k]
  ------------------
 1427|  16.6k|         is_interintra_allowed_mode(mbmi->mode) &&
  ------------------
  |  Branch (1427:10): [True: 14.2k, False: 2.36k]
  ------------------
 1428|  14.2k|         is_interintra_allowed_ref(mbmi->ref_frame);
  ------------------
  |  Branch (1428:10): [True: 14.2k, False: 0]
  ------------------
 1429|  28.7k|}
decodemv.c:is_interintra_allowed_bsize:
 1413|  28.7k|static inline int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
 1414|  28.7k|  return (bsize >= BLOCK_8X8) && (bsize <= BLOCK_32X32);
  ------------------
  |  Branch (1414:10): [True: 22.7k, False: 6.02k]
  |  Branch (1414:34): [True: 16.6k, False: 6.13k]
  ------------------
 1415|  28.7k|}
decodemv.c:is_interintra_allowed_mode:
 1417|  16.6k|static inline int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
 1418|  16.6k|  return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END);
  ------------------
  |  Branch (1418:10): [True: 16.6k, False: 0]
  |  Branch (1418:47): [True: 14.2k, False: 2.36k]
  ------------------
 1419|  16.6k|}
decodemv.c:is_interintra_allowed_ref:
 1421|  14.2k|static inline int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
 1422|  14.2k|  return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME);
  ------------------
  |  Branch (1422:10): [True: 14.2k, False: 0]
  |  Branch (1422:35): [True: 14.2k, False: 0]
  ------------------
 1423|  14.2k|}
decodemv.c:is_motion_variation_allowed_bsize:
 1455|   114k|static inline int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) {
 1456|   114k|  assert(bsize < BLOCK_SIZES_ALL);
 1457|   114k|  return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
  ------------------
  |  |   34|   114k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 24.1k, False: 90.7k]
  |  |  ------------------
  ------------------
 1458|   114k|}
decodemv.c:motion_mode_allowed:
 1474|  62.9k|    const MB_MODE_INFO *mbmi, int allow_warped_motion) {
 1475|  62.9k|  if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
  ------------------
  |  Branch (1475:7): [True: 24.4k, False: 38.4k]
  ------------------
 1476|  38.4k|  if (xd->cur_frame_force_integer_mv == 0) {
  ------------------
  |  Branch (1476:7): [True: 36.7k, False: 1.68k]
  ------------------
 1477|  36.7k|    const TransformationType gm_type = gm_params[mbmi->ref_frame[0]].wmtype;
 1478|  36.7k|    if (is_global_mv_block(mbmi, gm_type)) return SIMPLE_TRANSLATION;
  ------------------
  |  Branch (1478:9): [True: 232, False: 36.5k]
  ------------------
 1479|  36.7k|  }
 1480|  38.2k|  if (is_motion_variation_allowed_bsize(mbmi->bsize) &&
  ------------------
  |  Branch (1480:7): [True: 38.2k, False: 18.4E]
  ------------------
 1481|  38.2k|      is_inter_mode(mbmi->mode) && mbmi->ref_frame[1] != INTRA_FRAME &&
  ------------------
  |  Branch (1481:7): [True: 38.2k, False: 18.4E]
  |  Branch (1481:36): [True: 38.2k, False: 18.4E]
  ------------------
 1482|  38.2k|      is_motion_variation_allowed_compound(mbmi)) {
  ------------------
  |  Branch (1482:7): [True: 29.0k, False: 9.13k]
  ------------------
 1483|  29.0k|    assert(!has_second_ref(mbmi));
 1484|  29.0k|    if (mbmi->num_proj_ref >= 1 && allow_warped_motion &&
  ------------------
  |  Branch (1484:9): [True: 23.8k, False: 5.19k]
  |  Branch (1484:36): [True: 16.9k, False: 6.90k]
  ------------------
 1485|  16.9k|        !xd->cur_frame_force_integer_mv &&
  ------------------
  |  Branch (1485:9): [True: 15.6k, False: 1.37k]
  ------------------
 1486|  15.6k|        !av1_is_scaled(xd->block_ref_scale_factors[0])) {
  ------------------
  |  Branch (1486:9): [True: 15.6k, False: 0]
  ------------------
 1487|  15.6k|      return WARPED_CAUSAL;
 1488|  15.6k|    }
 1489|  13.4k|    return OBMC_CAUSAL;
 1490|  29.0k|  }
 1491|  9.13k|  return SIMPLE_TRANSLATION;
 1492|  38.2k|}
decodemv.c:check_num_overlappable_neighbors:
 1468|  62.9k|static inline int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) {
 1469|  62.9k|  return mbmi->overlappable_neighbors != 0;
 1470|  62.9k|}
decodemv.c:is_global_mv_block:
  422|  36.7k|                                     TransformationType type) {
  423|  36.7k|  const PREDICTION_MODE mode = mbmi->mode;
  424|  36.7k|  const BLOCK_SIZE bsize = mbmi->bsize;
  425|  36.7k|  const int block_size_allowed =
  426|  36.7k|      AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
  ------------------
  |  |   34|  36.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 7.42k, False: 29.3k]
  |  |  ------------------
  ------------------
  427|  36.7k|  return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION &&
  ------------------
  |  Branch (427:11): [True: 1.20k, False: 35.5k]
  |  Branch (427:31): [True: 1.07k, False: 34.4k]
  |  Branch (427:59): [True: 232, False: 2.04k]
  ------------------
  428|    232|         block_size_allowed;
  ------------------
  |  Branch (428:10): [True: 232, False: 0]
  ------------------
  429|  36.7k|}
decodemv.c:is_inter_mode:
   69|  38.2k|static inline int is_inter_mode(PREDICTION_MODE mode) {
   70|  38.2k|  return mode >= INTER_MODE_START && mode < INTER_MODE_END;
  ------------------
  |  Branch (70:10): [True: 38.2k, False: 18.4E]
  |  Branch (70:38): [True: 38.2k, False: 0]
  ------------------
   71|  38.2k|}
decodemv.c:is_motion_variation_allowed_compound:
 1461|  38.2k|    const MB_MODE_INFO *mbmi) {
 1462|  38.2k|  return !has_second_ref(mbmi);
 1463|  38.2k|}
decodemv.c:is_masked_compound_type:
  161|  36.3k|static inline int is_masked_compound_type(COMPOUND_TYPE type) {
  162|  36.3k|  return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD);
  ------------------
  |  Branch (162:11): [True: 11.8k, False: 24.5k]
  |  Branch (162:37): [True: 744, False: 23.7k]
  ------------------
  163|  36.3k|}
decodemv.c:is_nontrans_global_motion:
 1576|  73.1k|                                            const MB_MODE_INFO *mbmi) {
 1577|  73.1k|  int ref;
 1578|       |
 1579|       |  // First check if all modes are GLOBALMV
 1580|  73.1k|  if (mbmi->mode != GLOBALMV && mbmi->mode != GLOBAL_GLOBALMV) return 0;
  ------------------
  |  Branch (1580:7): [True: 70.8k, False: 2.29k]
  |  Branch (1580:33): [True: 69.4k, False: 1.37k]
  ------------------
 1581|       |
 1582|  3.67k|  if (AOMMIN(mi_size_wide[mbmi->bsize], mi_size_high[mbmi->bsize]) < 2)
  ------------------
  |  |   34|  3.67k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 899, False: 2.77k]
  |  |  ------------------
  ------------------
  |  Branch (1582:7): [True: 921, False: 2.75k]
  ------------------
 1583|    921|    return 0;
 1584|       |
 1585|       |  // Now check if all global motion is non translational
 1586|  6.65k|  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
  ------------------
  |  Branch (1586:17): [True: 4.08k, False: 2.57k]
  ------------------
 1587|  4.08k|    if (xd->global_motion[mbmi->ref_frame[ref]].wmtype == TRANSLATION) return 0;
  ------------------
  |  Branch (1587:9): [True: 186, False: 3.89k]
  ------------------
 1588|  4.08k|  }
 1589|  2.57k|  return 1;
 1590|  2.75k|}
decoder.c:is_inter_block:
  372|  4.27M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|  4.27M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 25.4k, False: 4.24M]
  |  Branch (373:36): [True: 155k, False: 4.08M]
  ------------------
  374|  4.27M|}
decoder.c:is_intrabc_block:
  345|  4.27M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|  4.27M|  return mbmi->use_intrabc;
  347|  4.27M|}
decodetxb.c:get_plane_block_size:
 1188|  4.71M|                                              int subsampling_y) {
 1189|  4.71M|  assert(bsize < BLOCK_SIZES_ALL);
 1190|  4.71M|  assert(subsampling_x >= 0 && subsampling_x < 2);
 1191|       |  assert(subsampling_y >= 0 && subsampling_y < 2);
 1192|  4.71M|  return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y];
 1193|  4.71M|}
decodetxb.c:av1_get_adjusted_tx_size:
 1361|  14.1M|static inline TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) {
 1362|  14.1M|  switch (tx_size) {
 1363|   104k|    case TX_64X64:
  ------------------
  |  Branch (1363:5): [True: 104k, False: 14.0M]
  ------------------
 1364|   147k|    case TX_64X32:
  ------------------
  |  Branch (1364:5): [True: 42.3k, False: 14.1M]
  ------------------
 1365|   174k|    case TX_32X64: return TX_32X32;
  ------------------
  |  Branch (1365:5): [True: 26.9k, False: 14.1M]
  ------------------
 1366|  16.9k|    case TX_64X16: return TX_32X16;
  ------------------
  |  Branch (1366:5): [True: 16.9k, False: 14.1M]
  ------------------
 1367|  16.2k|    case TX_16X64: return TX_16X32;
  ------------------
  |  Branch (1367:5): [True: 16.2k, False: 14.1M]
  ------------------
 1368|  13.9M|    default: return tx_size;
  ------------------
  |  Branch (1368:5): [True: 13.9M, False: 200k]
  ------------------
 1369|  14.1M|  }
 1370|  14.1M|}
decodetxb.c:is_inter_block:
  372|  6.12M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|  6.12M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 36.5k, False: 6.08M]
  |  Branch (373:36): [True: 511k, False: 5.57M]
  ------------------
  374|  6.12M|}
decodetxb.c:is_intrabc_block:
  345|  6.12M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|  6.12M|  return mbmi->use_intrabc;
  347|  6.12M|}
decodetxb.c:get_plane_type:
 1592|  4.99M|static inline PLANE_TYPE get_plane_type(int plane) {
 1593|  4.99M|  return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
  ------------------
  |  Branch (1593:10): [True: 2.66M, False: 2.33M]
  ------------------
 1594|  4.99M|}
decodetxb.c:av1_get_tx_type:
 1281|  3.05M|                                      int reduced_tx_set) {
 1282|  3.05M|  const MB_MODE_INFO *const mbmi = xd->mi[0];
 1283|  3.05M|  if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) {
  ------------------
  |  Branch (1283:7): [True: 761k, False: 2.29M]
  |  Branch (1283:41): [True: 35.0k, False: 2.26M]
  ------------------
 1284|   796k|    return DCT_DCT;
 1285|   796k|  }
 1286|       |
 1287|  2.26M|  TX_TYPE tx_type;
 1288|  2.26M|  if (plane_type == PLANE_TYPE_Y) {
  ------------------
  |  Branch (1288:7): [True: 1.55M, False: 703k]
  ------------------
 1289|  1.55M|    tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
 1290|  1.55M|  } else {
 1291|   703k|    if (is_inter_block(mbmi)) {
  ------------------
  |  Branch (1291:9): [True: 133k, False: 569k]
  ------------------
 1292|       |      // scale back to y plane's coordinate
 1293|   133k|      const struct macroblockd_plane *const pd = &xd->plane[plane_type];
 1294|   133k|      blk_row <<= pd->subsampling_y;
 1295|   133k|      blk_col <<= pd->subsampling_x;
 1296|   133k|      tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
 1297|   569k|    } else {
 1298|       |      // In intra mode, uv planes don't share the same prediction mode as y
 1299|       |      // plane, so the tx_type should not be shared
 1300|   569k|      tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV);
 1301|   569k|    }
 1302|   703k|    const TxSetType tx_set_type =
 1303|   703k|        av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set);
 1304|   703k|    if (!av1_ext_tx_used[tx_set_type][tx_type]) tx_type = DCT_DCT;
  ------------------
  |  Branch (1304:9): [True: 37.4k, False: 665k]
  ------------------
 1305|   703k|  }
 1306|  2.26M|  assert(tx_type < TX_TYPES);
 1307|       |  assert(av1_ext_tx_used[av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi),
 1308|  2.26M|                                                 reduced_tx_set)][tx_type]);
 1309|  2.26M|  return tx_type;
 1310|  3.05M|}
decodetxb.c:intra_mode_to_tx_type:
 1003|   569k|                                     PLANE_TYPE plane_type) {
 1004|   569k|  static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = {
 1005|   569k|    DCT_DCT,    // DC_PRED
 1006|   569k|    ADST_DCT,   // V_PRED
 1007|   569k|    DCT_ADST,   // H_PRED
 1008|   569k|    DCT_DCT,    // D45_PRED
 1009|   569k|    ADST_ADST,  // D135_PRED
 1010|   569k|    ADST_DCT,   // D113_PRED
 1011|   569k|    DCT_ADST,   // D157_PRED
 1012|   569k|    DCT_ADST,   // D203_PRED
 1013|   569k|    ADST_DCT,   // D67_PRED
 1014|   569k|    ADST_ADST,  // SMOOTH_PRED
 1015|   569k|    ADST_DCT,   // SMOOTH_V_PRED
 1016|   569k|    DCT_ADST,   // SMOOTH_H_PRED
 1017|   569k|    ADST_ADST,  // PAETH_PRED
 1018|   569k|  };
 1019|   569k|  const PREDICTION_MODE mode =
 1020|   569k|      (plane_type == PLANE_TYPE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
  ------------------
  |  Branch (1020:7): [True: 0, False: 569k]
  ------------------
 1021|       |  assert(mode < INTRA_MODES);
 1022|   569k|  return _intra_mode_to_tx_type[mode];
 1023|   569k|}
decodetxb.c:get_uv_mode:
  349|   569k|static inline PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) {
  350|       |  assert(mode < UV_INTRA_MODES);
  351|   569k|  static const PREDICTION_MODE uv2y[] = {
  352|   569k|    DC_PRED,        // UV_DC_PRED
  353|   569k|    V_PRED,         // UV_V_PRED
  354|   569k|    H_PRED,         // UV_H_PRED
  355|   569k|    D45_PRED,       // UV_D45_PRED
  356|   569k|    D135_PRED,      // UV_D135_PRED
  357|   569k|    D113_PRED,      // UV_D113_PRED
  358|   569k|    D157_PRED,      // UV_D157_PRED
  359|   569k|    D203_PRED,      // UV_D203_PRED
  360|   569k|    D67_PRED,       // UV_D67_PRED
  361|   569k|    SMOOTH_PRED,    // UV_SMOOTH_PRED
  362|   569k|    SMOOTH_V_PRED,  // UV_SMOOTH_V_PRED
  363|   569k|    SMOOTH_H_PRED,  // UV_SMOOTH_H_PRED
  364|   569k|    PAETH_PRED,     // UV_PAETH_PRED
  365|   569k|    DC_PRED,        // UV_CFL_PRED
  366|   569k|    INTRA_INVALID,  // UV_INTRA_MODES
  367|   569k|    INTRA_INVALID,  // UV_MODE_INVALID
  368|   569k|  };
  369|   569k|  return uv2y[mode];
  370|   569k|}
decodetxb.c:av1_get_ext_tx_set_type:
 1098|   703k|                                                int use_reduced_set) {
 1099|   703k|  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
 1100|   703k|  if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY;
  ------------------
  |  Branch (1100:7): [True: 0, False: 703k]
  ------------------
 1101|   703k|  if (tx_size_sqr_up == TX_32X32)
  ------------------
  |  Branch (1101:7): [True: 105k, False: 598k]
  ------------------
 1102|   105k|    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
  ------------------
  |  Branch (1102:12): [True: 11.3k, False: 93.9k]
  ------------------
 1103|   598k|  if (use_reduced_set)
  ------------------
  |  Branch (1103:7): [True: 61.8k, False: 536k]
  ------------------
 1104|  61.8k|    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
  ------------------
  |  Branch (1104:12): [True: 24.0k, False: 37.7k]
  ------------------
 1105|   536k|  const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
 1106|   536k|  return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16];
 1107|   598k|}
detokenize.c:av1_get_block_dimensions:
 1516|  62.4k|                                            int *cols_within_bounds) {
 1517|  62.4k|  const int block_height = block_size_high[bsize];
 1518|  62.4k|  const int block_width = block_size_wide[bsize];
 1519|  62.4k|  const int block_rows = (xd->mb_to_bottom_edge >= 0)
  ------------------
  |  Branch (1519:26): [True: 62.2k, False: 218]
  ------------------
 1520|  62.4k|                             ? block_height
 1521|  62.4k|                             : (xd->mb_to_bottom_edge >> 3) + block_height;
 1522|  62.4k|  const int block_cols = (xd->mb_to_right_edge >= 0)
  ------------------
  |  Branch (1522:26): [True: 62.1k, False: 365]
  ------------------
 1523|  62.4k|                             ? block_width
 1524|  62.4k|                             : (xd->mb_to_right_edge >> 3) + block_width;
 1525|  62.4k|  const struct macroblockd_plane *const pd = &xd->plane[plane];
 1526|  62.4k|  assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_x == 0));
 1527|  62.4k|  assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_y == 0));
 1528|  62.4k|  assert(block_width >= block_cols);
 1529|  62.4k|  assert(block_height >= block_rows);
 1530|  62.4k|  const int plane_block_width = block_width >> pd->subsampling_x;
 1531|  62.4k|  const int plane_block_height = block_height >> pd->subsampling_y;
 1532|       |  // Special handling for chroma sub8x8.
 1533|  62.4k|  const int is_chroma_sub8_x = plane > 0 && plane_block_width < 4;
  ------------------
  |  Branch (1533:32): [True: 14.5k, False: 47.9k]
  |  Branch (1533:45): [True: 0, False: 14.5k]
  ------------------
 1534|  62.4k|  const int is_chroma_sub8_y = plane > 0 && plane_block_height < 4;
  ------------------
  |  Branch (1534:32): [True: 14.5k, False: 47.9k]
  |  Branch (1534:45): [True: 0, False: 14.5k]
  ------------------
 1535|  62.4k|  if (width) {
  ------------------
  |  Branch (1535:7): [True: 62.4k, False: 0]
  ------------------
 1536|  62.4k|    *width = plane_block_width + 2 * is_chroma_sub8_x;
 1537|  62.4k|    assert(*width >= 0);
 1538|  62.4k|  }
 1539|  62.4k|  if (height) {
  ------------------
  |  Branch (1539:7): [True: 62.4k, False: 0]
  ------------------
 1540|  62.4k|    *height = plane_block_height + 2 * is_chroma_sub8_y;
 1541|  62.4k|    assert(*height >= 0);
 1542|  62.4k|  }
 1543|  62.4k|  if (rows_within_bounds) {
  ------------------
  |  Branch (1543:7): [True: 62.4k, False: 0]
  ------------------
 1544|  62.4k|    *rows_within_bounds =
 1545|  62.4k|        (block_rows >> pd->subsampling_y) + 2 * is_chroma_sub8_y;
 1546|  62.4k|    assert(*rows_within_bounds >= 0);
 1547|  62.4k|  }
 1548|  62.4k|  if (cols_within_bounds) {
  ------------------
  |  Branch (1548:7): [True: 62.4k, False: 0]
  ------------------
 1549|  62.4k|    *cols_within_bounds =
 1550|  62.4k|        (block_cols >> pd->subsampling_x) + 2 * is_chroma_sub8_x;
 1551|       |    assert(*cols_within_bounds >= 0);
 1552|  62.4k|  }
 1553|  62.4k|}
av1_loopfilter.c:av1_get_max_uv_txsize:
 1373|  2.55M|                                            int subsampling_y) {
 1374|  2.55M|  const BLOCK_SIZE plane_bsize =
 1375|  2.55M|      get_plane_block_size(bsize, subsampling_x, subsampling_y);
 1376|       |  assert(plane_bsize < BLOCK_SIZES_ALL);
 1377|  2.55M|  const TX_SIZE uv_tx = max_txsize_rect_lookup[plane_bsize];
 1378|  2.55M|  return av1_get_adjusted_tx_size(uv_tx);
 1379|  2.55M|}
av1_loopfilter.c:av1_get_adjusted_tx_size:
 1361|  2.58M|static inline TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) {
 1362|  2.58M|  switch (tx_size) {
 1363|   138k|    case TX_64X64:
  ------------------
  |  Branch (1363:5): [True: 138k, False: 2.44M]
  ------------------
 1364|   198k|    case TX_64X32:
  ------------------
  |  Branch (1364:5): [True: 60.4k, False: 2.52M]
  ------------------
 1365|   237k|    case TX_32X64: return TX_32X32;
  ------------------
  |  Branch (1365:5): [True: 38.7k, False: 2.54M]
  ------------------
 1366|  17.6k|    case TX_64X16: return TX_32X16;
  ------------------
  |  Branch (1366:5): [True: 17.6k, False: 2.56M]
  ------------------
 1367|  32.5k|    case TX_16X64: return TX_16X32;
  ------------------
  |  Branch (1367:5): [True: 32.5k, False: 2.55M]
  ------------------
 1368|  2.34M|    default: return tx_size;
  ------------------
  |  Branch (1368:5): [True: 2.34M, False: 241k]
  ------------------
 1369|  2.58M|  }
 1370|  2.58M|}
av1_loopfilter.c:av1_get_txb_size_index:
 1207|    128|                                         int blk_col) {
 1208|    128|  static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = {
 1209|    128|    0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 1, 1, 2, 2, 3,
 1210|    128|  };
 1211|    128|  static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = {
 1212|    128|    0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 0, 2, 1, 3, 2,
 1213|    128|  };
 1214|    128|  static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = {
 1215|    128|    0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 2, 0, 1, 0, 1, 0, 1,
 1216|    128|  };
 1217|    128|  const int index =
 1218|    128|      ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) +
 1219|    128|      (blk_col >> tw_w_log2_table[bsize]);
 1220|       |  assert(index < INTER_TX_SIZE_BUF_LEN);
 1221|    128|  return index;
 1222|    128|}
av1_loopfilter.c:is_inter_block:
  372|  3.03M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|  3.03M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 3.76k, False: 3.02M]
  |  Branch (373:36): [True: 18.4E, False: 3.02M]
  ------------------
  374|  3.03M|}
av1_loopfilter.c:is_intrabc_block:
  345|  3.03M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|  3.03M|  return mbmi->use_intrabc;
  347|  3.03M|}
av1_loopfilter.c:get_plane_block_size:
 1188|  4.30M|                                              int subsampling_y) {
 1189|  4.30M|  assert(bsize < BLOCK_SIZES_ALL);
 1190|  4.30M|  assert(subsampling_x >= 0 && subsampling_x < 2);
 1191|       |  assert(subsampling_y >= 0 && subsampling_y < 2);
 1192|  4.30M|  return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y];
 1193|  4.30M|}
blockd.c:get_plane_block_size:
 1188|   984k|                                              int subsampling_y) {
 1189|   984k|  assert(bsize < BLOCK_SIZES_ALL);
 1190|   984k|  assert(subsampling_x >= 0 && subsampling_x < 2);
 1191|       |  assert(subsampling_y >= 0 && subsampling_y < 2);
 1192|   984k|  return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y];
 1193|   984k|}
blockd.c:get_plane_type:
 1592|  68.3k|static inline PLANE_TYPE get_plane_type(int plane) {
 1593|  68.3k|  return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
  ------------------
  |  Branch (1593:10): [True: 26.1k, False: 42.2k]
  ------------------
 1594|  68.3k|}
cdef.c:get_plane_type:
 1592|  61.9k|static inline PLANE_TYPE get_plane_type(int plane) {
 1593|  61.9k|  return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
  ------------------
  |  Branch (1593:10): [True: 14.0k, False: 47.9k]
  ------------------
 1594|  61.9k|}
cfl.c:is_cur_buf_hbd:
  932|   492k|static inline int is_cur_buf_hbd(const MACROBLOCKD *xd) {
  933|   492k|#if CONFIG_AV1_HIGHBITDEPTH
  934|   492k|  return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
  ------------------
  |  |  142|   492k|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (934:10): [True: 228k, False: 263k]
  ------------------
  935|       |#else
  936|       |  (void)xd;
  937|       |  return 0;
  938|       |#endif
  939|   492k|}
idct.c:is_cur_buf_hbd:
  932|  2.01M|static inline int is_cur_buf_hbd(const MACROBLOCKD *xd) {
  933|  2.01M|#if CONFIG_AV1_HIGHBITDEPTH
  934|  2.01M|  return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
  ------------------
  |  |  142|  2.01M|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (934:10): [True: 969k, False: 1.04M]
  ------------------
  935|       |#else
  936|       |  (void)xd;
  937|       |  return 0;
  938|       |#endif
  939|  2.01M|}
idct.c:av1_get_ext_tx_set_type:
 1098|  2.01M|                                                int use_reduced_set) {
 1099|  2.01M|  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
 1100|  2.01M|  if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY;
  ------------------
  |  Branch (1100:7): [True: 26.1k, False: 1.99M]
  ------------------
 1101|  1.99M|  if (tx_size_sqr_up == TX_32X32)
  ------------------
  |  Branch (1101:7): [True: 162k, False: 1.82M]
  ------------------
 1102|   162k|    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
  ------------------
  |  Branch (1102:12): [True: 6.10k, False: 156k]
  ------------------
 1103|  1.82M|  if (use_reduced_set)
  ------------------
  |  Branch (1103:7): [True: 338k, False: 1.49M]
  ------------------
 1104|   338k|    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
  ------------------
  |  Branch (1104:12): [True: 13.3k, False: 325k]
  ------------------
 1105|  1.49M|  const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
 1106|  1.49M|  return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16];
 1107|  1.82M|}
idct.c:is_inter_block:
  372|  2.01M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|  2.01M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 7.22k, False: 2.01M]
  |  Branch (373:36): [True: 82.4k, False: 1.92M]
  ------------------
  374|  2.01M|}
idct.c:is_intrabc_block:
  345|  2.01M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|  2.01M|  return mbmi->use_intrabc;
  347|  2.01M|}
mvref_common.c:is_inter_block:
  372|   451k|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|   451k|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 13.1k, False: 438k]
  |  Branch (373:36): [True: 361k, False: 76.6k]
  ------------------
  374|   451k|}
mvref_common.c:is_intrabc_block:
  345|   451k|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|   451k|  return mbmi->use_intrabc;
  347|   451k|}
mvref_common.c:is_global_mv_block:
  422|   245k|                                     TransformationType type) {
  423|   245k|  const PREDICTION_MODE mode = mbmi->mode;
  424|   245k|  const BLOCK_SIZE bsize = mbmi->bsize;
  425|   245k|  const int block_size_allowed =
  426|   245k|      AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
  ------------------
  |  |   34|   245k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 51.4k, False: 194k]
  |  |  ------------------
  ------------------
  427|   245k|  return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION &&
  ------------------
  |  Branch (427:11): [True: 9.02k, False: 236k]
  |  Branch (427:31): [True: 5.10k, False: 231k]
  |  Branch (427:59): [True: 1.45k, False: 12.6k]
  ------------------
  428|  1.45k|         block_size_allowed;
  ------------------
  |  Branch (428:10): [True: 1.32k, False: 136]
  ------------------
  429|   245k|}
mvref_common.c:have_newmv_in_inter_mode:
  156|   231k|static inline int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
  157|   231k|  return (mode == NEWMV || mode == NEW_NEWMV || mode == NEAREST_NEWMV ||
  ------------------
  |  Branch (157:11): [True: 63.1k, False: 168k]
  |  Branch (157:28): [True: 4.49k, False: 163k]
  |  Branch (157:49): [True: 2.20k, False: 161k]
  ------------------
  158|   161k|          mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV);
  ------------------
  |  Branch (158:11): [True: 2.32k, False: 158k]
  |  Branch (158:36): [True: 1.51k, False: 157k]
  |  Branch (158:58): [True: 1.08k, False: 156k]
  ------------------
  159|   231k|}
pred_common.c:is_inter_block:
  372|   248k|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|   248k|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 0, False: 248k]
  |  Branch (373:36): [True: 149k, False: 98.4k]
  ------------------
  374|   248k|}
pred_common.c:is_intrabc_block:
  345|   248k|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|   248k|  return mbmi->use_intrabc;
  347|   248k|}
pred_common.c:has_second_ref:
  376|  74.3k|static inline int has_second_ref(const MB_MODE_INFO *mbmi) {
  377|  74.3k|  return mbmi->ref_frame[1] > INTRA_FRAME;
  378|  74.3k|}
pred_common.c:has_uni_comp_refs:
  380|  12.4k|static inline int has_uni_comp_refs(const MB_MODE_INFO *mbmi) {
  381|  12.4k|  return has_second_ref(mbmi) && (!((mbmi->ref_frame[0] >= BWDREF_FRAME) ^
  ------------------
  |  Branch (381:10): [True: 12.4k, False: 0]
  |  Branch (381:34): [True: 3.24k, False: 9.16k]
  ------------------
  382|  12.4k|                                    (mbmi->ref_frame[1] >= BWDREF_FRAME)));
  383|  12.4k|}
quant_common.c:av1_get_adjusted_tx_size:
 1361|  19.1M|static inline TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) {
 1362|  19.1M|  switch (tx_size) {
 1363|   875k|    case TX_64X64:
  ------------------
  |  Branch (1363:5): [True: 875k, False: 18.2M]
  ------------------
 1364|  1.73M|    case TX_64X32:
  ------------------
  |  Branch (1364:5): [True: 864k, False: 18.2M]
  ------------------
 1365|  2.60M|    case TX_32X64: return TX_32X32;
  ------------------
  |  Branch (1365:5): [True: 864k, False: 18.2M]
  ------------------
 1366|   866k|    case TX_64X16: return TX_32X16;
  ------------------
  |  Branch (1366:5): [True: 866k, False: 18.2M]
  ------------------
 1367|   866k|    case TX_16X64: return TX_16X32;
  ------------------
  |  Branch (1367:5): [True: 866k, False: 18.2M]
  ------------------
 1368|  14.8M|    default: return tx_size;
  ------------------
  |  Branch (1368:5): [True: 14.8M, False: 4.33M]
  ------------------
 1369|  19.1M|  }
 1370|  19.1M|}
reconinter.c:is_motion_variation_allowed_bsize:
 1455|  76.7k|static inline int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) {
 1456|  76.7k|  assert(bsize < BLOCK_SIZES_ALL);
 1457|  76.7k|  return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
  ------------------
  |  |   34|  76.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 16.7k, False: 59.9k]
  |  |  ------------------
  ------------------
 1458|  76.7k|}
reconinter.c:is_neighbor_overlappable:
 1494|  68.2k|static inline int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
 1495|  68.2k|  return (is_inter_block(mbmi));
 1496|  68.2k|}
reconinter.c:is_inter_block:
  372|  68.2k|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|  68.2k|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 18.4E, False: 68.2k]
  |  Branch (373:36): [True: 66.0k, False: 2.15k]
  ------------------
  374|  68.2k|}
reconinter.c:is_intrabc_block:
  345|  68.2k|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|  68.2k|  return mbmi->use_intrabc;
  347|  68.2k|}
reconinter.c:get_plane_block_size:
 1188|  92.9k|                                              int subsampling_y) {
 1189|  92.9k|  assert(bsize < BLOCK_SIZES_ALL);
 1190|  92.9k|  assert(subsampling_x >= 0 && subsampling_x < 2);
 1191|       |  assert(subsampling_y >= 0 && subsampling_y < 2);
 1192|  92.9k|  return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y];
 1193|  92.9k|}
reconinter.c:is_cur_buf_hbd:
  932|  49.6k|static inline int is_cur_buf_hbd(const MACROBLOCKD *xd) {
  933|  49.6k|#if CONFIG_AV1_HIGHBITDEPTH
  934|  49.6k|  return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
  ------------------
  |  |  142|  49.6k|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (934:10): [True: 9.39k, False: 40.2k]
  ------------------
  935|       |#else
  936|       |  (void)xd;
  937|       |  return 0;
  938|       |#endif
  939|  49.6k|}
reconinter.c:has_second_ref:
  376|  14.0k|static inline int has_second_ref(const MB_MODE_INFO *mbmi) {
  377|  14.0k|  return mbmi->ref_frame[1] > INTRA_FRAME;
  378|  14.0k|}
reconintra.c:is_cur_buf_hbd:
  932|  4.53M|static inline int is_cur_buf_hbd(const MACROBLOCKD *xd) {
  933|  4.53M|#if CONFIG_AV1_HIGHBITDEPTH
  934|  4.53M|  return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
  ------------------
  |  |  142|  4.53M|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (934:10): [True: 1.83M, False: 2.69M]
  ------------------
  935|       |#else
  936|       |  (void)xd;
  937|       |  return 0;
  938|       |#endif
  939|  4.53M|}
reconintra.c:is_inter_block:
  372|   796k|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|   796k|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 2.77k, False: 794k]
  |  Branch (373:36): [True: 5.28k, False: 788k]
  ------------------
  374|   796k|}
reconintra.c:is_intrabc_block:
  345|   796k|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|   796k|  return mbmi->use_intrabc;
  347|   796k|}
reconintra.c:get_uv_mode:
  349|  2.12M|static inline PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) {
  350|       |  assert(mode < UV_INTRA_MODES);
  351|  2.12M|  static const PREDICTION_MODE uv2y[] = {
  352|  2.12M|    DC_PRED,        // UV_DC_PRED
  353|  2.12M|    V_PRED,         // UV_V_PRED
  354|  2.12M|    H_PRED,         // UV_H_PRED
  355|  2.12M|    D45_PRED,       // UV_D45_PRED
  356|  2.12M|    D135_PRED,      // UV_D135_PRED
  357|  2.12M|    D113_PRED,      // UV_D113_PRED
  358|  2.12M|    D157_PRED,      // UV_D157_PRED
  359|  2.12M|    D203_PRED,      // UV_D203_PRED
  360|  2.12M|    D67_PRED,       // UV_D67_PRED
  361|  2.12M|    SMOOTH_PRED,    // UV_SMOOTH_PRED
  362|  2.12M|    SMOOTH_V_PRED,  // UV_SMOOTH_V_PRED
  363|  2.12M|    SMOOTH_H_PRED,  // UV_SMOOTH_H_PRED
  364|  2.12M|    PAETH_PRED,     // UV_PAETH_PRED
  365|  2.12M|    DC_PRED,        // UV_CFL_PRED
  366|  2.12M|    INTRA_INVALID,  // UV_INTRA_MODES
  367|  2.12M|    INTRA_INVALID,  // UV_MODE_INVALID
  368|  2.12M|  };
  369|  2.12M|  return uv2y[mode];
  370|  2.12M|}

av1_cdef_compute_sb_list:
   43|  14.7k|                             BLOCK_SIZE bs) {
   44|  14.7k|  MB_MODE_INFO **grid = mi_params->mi_grid_base;
   45|  14.7k|  int maxc = mi_params->mi_cols - mi_col;
   46|  14.7k|  int maxr = mi_params->mi_rows - mi_row;
   47|       |
   48|  14.7k|  if (bs == BLOCK_128X128 || bs == BLOCK_128X64)
  ------------------
  |  Branch (48:7): [True: 18.4E, False: 14.7k]
  |  Branch (48:30): [True: 0, False: 14.7k]
  ------------------
   49|      0|    maxc = AOMMIN(maxc, MI_SIZE_128X128);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
   50|  14.7k|  else
   51|  14.7k|    maxc = AOMMIN(maxc, MI_SIZE_64X64);
  ------------------
  |  |   34|  14.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 4.30k, False: 10.4k]
  |  |  ------------------
  ------------------
   52|  14.7k|  if (bs == BLOCK_128X128 || bs == BLOCK_64X128)
  ------------------
  |  Branch (52:7): [True: 18.4E, False: 14.7k]
  |  Branch (52:30): [True: 0, False: 14.7k]
  ------------------
   53|      0|    maxr = AOMMIN(maxr, MI_SIZE_128X128);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
   54|  14.7k|  else
   55|  14.7k|    maxr = AOMMIN(maxr, MI_SIZE_64X64);
  ------------------
  |  |   34|  14.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 4.81k, False: 9.89k]
  |  |  ------------------
  ------------------
   56|       |
   57|  14.7k|  const int r_step = 2;  // mi_size_high[BLOCK_8X8]
   58|  14.7k|  const int c_step = 2;  // mi_size_wide[BLOCK_8X8]
   59|  14.7k|  const int r_shift = 1;
   60|  14.7k|  const int c_shift = 1;
   61|  14.7k|  int count = 0;
   62|   107k|  for (int r = 0; r < maxr; r += r_step) {
  ------------------
  |  Branch (62:19): [True: 92.7k, False: 14.7k]
  ------------------
   63|   707k|    for (int c = 0; c < maxc; c += c_step) {
  ------------------
  |  Branch (63:21): [True: 614k, False: 92.7k]
  ------------------
   64|   614k|      if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c,
  ------------------
  |  Branch (64:11): [True: 552k, False: 62.5k]
  ------------------
   65|   614k|                             mi_params->mi_stride)) {
   66|   552k|        dlist[count].by = r >> r_shift;
   67|   552k|        dlist[count].bx = c >> c_shift;
   68|   552k|        count++;
   69|   552k|      }
   70|   614k|    }
   71|  92.7k|  }
   72|  14.7k|  return count;
   73|  14.7k|}
av1_cdef_copy_sb8_16_lowbd:
  100|  36.0k|                                int hsize) {
  101|  36.0k|  const uint8_t *base = &src[src_voffset * (ptrdiff_t)sstride + src_hoffset];
  102|  36.0k|  cdef_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, hsize, vsize);
  103|  36.0k|}
av1_cdef_copy_sb8_16_highbd:
  109|  18.1k|                                 int hsize) {
  110|  18.1k|  const uint16_t *base =
  111|  18.1k|      &CONVERT_TO_SHORTPTR(src)[src_voffset * (ptrdiff_t)sstride + src_hoffset];
  ------------------
  |  |   75|  18.1k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  112|  18.1k|  cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, hsize, vsize);
  113|  18.1k|}
av1_cdef_copy_sb8_16:
  118|  54.2k|                          int src_hoffset, int sstride, int vsize, int hsize) {
  119|  54.2k|#if CONFIG_AV1_HIGHBITDEPTH
  120|  54.2k|  if (cm->seq_params->use_highbitdepth) {
  ------------------
  |  Branch (120:7): [True: 18.1k, False: 36.0k]
  ------------------
  121|  18.1k|    av1_cdef_copy_sb8_16_highbd(dst, dstride, src, src_voffset, src_hoffset,
  122|  18.1k|                                sstride, vsize, hsize);
  123|  18.1k|    return;
  124|  18.1k|  }
  125|       |#else
  126|       |  (void)cm;
  127|       |#endif  // CONFIG_AV1_HIGHBITDEPTH
  128|  36.0k|  av1_cdef_copy_sb8_16_lowbd(dst, dstride, src, src_voffset, src_hoffset,
  129|  36.0k|                             sstride, vsize, hsize);
  130|  36.0k|}
av1_cdef_init_fb_row:
  373|  2.32k|                          struct AV1CdefSyncData *const cdef_sync, int fbr) {
  374|  2.32k|  (void)cdef_sync;
  375|  2.32k|  const int num_planes = av1_num_planes(cm);
  376|  2.32k|  const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  2.32k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  2.32k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  2.32k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  2.32k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  377|  2.32k|  const int luma_stride =
  378|  2.32k|      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4);
  ------------------
  |  |   69|  2.32k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  379|  2.32k|  const bool ping_pong = fbr & 1;
  380|       |  // for the current filter block, it's top left corner mi structure (mi_tl)
  381|       |  // is first accessed to check whether the top and left boundaries are
  382|       |  // frame boundaries. Then bottom-left and top-right mi structures are
  383|       |  // accessed to check whether the bottom and right boundaries
  384|       |  // (respectively) are frame boundaries.
  385|       |  //
  386|       |  // Note that we can't just check the bottom-right mi structure - eg. if
  387|       |  // we're at the right-hand edge of the frame but not the bottom, then
  388|       |  // the bottom-right mi is NULL but the bottom-left is not.
  389|  2.32k|  fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0;
  ------------------
  |  |   58|  2.32k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  2.32k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  |  Branch (389:34): [True: 1.17k, False: 1.14k]
  ------------------
  390|  2.32k|  if (fbr != nvfb - 1)
  ------------------
  |  Branch (390:7): [True: 1.14k, False: 1.17k]
  ------------------
  391|  1.14k|    fb_info->frame_boundary[BOTTOM] =
  392|  1.14k|        (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 1 : 0;
  ------------------
  |  |   58|  1.14k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.14k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  |  Branch (392:9): [True: 0, False: 1.14k]
  ------------------
  393|  1.17k|  else
  394|  1.17k|    fb_info->frame_boundary[BOTTOM] = 1;
  395|       |
  396|  2.32k|  fb_info->src = src;
  397|  2.32k|  fb_info->damping = cm->cdef_info.cdef_damping;
  398|  2.32k|  fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0);
  ------------------
  |  |   35|  2.32k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 1.20k, False: 1.12k]
  |  |  ------------------
  ------------------
  399|  2.32k|  av1_zero(fb_info->dir);
  ------------------
  |  |   43|  2.32k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
  400|  2.32k|  av1_zero(fb_info->var);
  ------------------
  |  |   43|  2.32k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
  401|       |
  402|  9.13k|  for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (402:23): [True: 6.80k, False: 2.32k]
  ------------------
  403|  6.80k|    const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
  ------------------
  |  |   39|  6.80k|#define MI_SIZE_LOG2 2
  ------------------
  404|  6.80k|    const int offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2;
  ------------------
  |  |   58|  6.80k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  6.80k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  405|  6.80k|    const int stride = luma_stride >> xd->plane[plane].subsampling_x;
  406|       |    // here ping-pong buffers are maintained for top linebuf
  407|       |    // to avoid linebuf over-write by consecutive row.
  408|  6.80k|    uint16_t *const top_linebuf =
  409|  6.80k|        &linebuf[plane][ping_pong * CDEF_VBORDER * stride];
  ------------------
  |  |   23|  6.80k|#define CDEF_VBORDER (2)
  ------------------
  410|  6.80k|    fb_info->bot_linebuf[plane] = &linebuf[plane][(CDEF_VBORDER << 1) * stride];
  ------------------
  |  |   23|  6.80k|#define CDEF_VBORDER (2)
  ------------------
  411|       |
  412|  6.80k|    if (fbr != nvfb - 1)  // top line buffer copy
  ------------------
  |  Branch (412:9): [True: 3.36k, False: 3.44k]
  ------------------
  413|  3.36k|      av1_cdef_copy_sb8_16(cm, top_linebuf, stride, xd->plane[plane].dst.buf,
  414|  3.36k|                           offset - CDEF_VBORDER, 0,
  ------------------
  |  |   23|  3.36k|#define CDEF_VBORDER (2)
  ------------------
  415|  3.36k|                           xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
  ------------------
  |  |   23|  3.36k|#define CDEF_VBORDER (2)
  ------------------
  416|  6.80k|    fb_info->top_linebuf[plane] =
  417|  6.80k|        &linebuf[plane][(!ping_pong) * CDEF_VBORDER * stride];
  ------------------
  |  |   23|  6.80k|#define CDEF_VBORDER (2)
  ------------------
  418|       |
  419|  6.80k|    if (fbr != nvfb - 1)  // bottom line buffer copy
  ------------------
  |  Branch (419:9): [True: 3.36k, False: 3.44k]
  ------------------
  420|  3.36k|      av1_cdef_copy_sb8_16(cm, fb_info->bot_linebuf[plane], stride,
  421|  3.36k|                           xd->plane[plane].dst.buf, offset, 0,
  422|  3.36k|                           xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
  ------------------
  |  |   23|  3.36k|#define CDEF_VBORDER (2)
  ------------------
  423|  6.80k|  }
  424|  2.32k|}
av1_cdef_fb_row:
  431|  6.11k|                     struct aom_internal_error_info *error_info) {
  432|       |  // TODO(aomedia:3276): Pass error_info to the low-level functions as required
  433|       |  // in future to handle error propagation.
  434|  6.11k|  (void)error_info;
  435|  6.11k|  CdefBlockInfo fb_info;
  436|  6.11k|  int cdef_left[MAX_MB_PLANE] = { 1, 1, 1 };
  437|  6.11k|  const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  6.11k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  6.11k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  6.11k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  6.11k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  438|       |
  439|  6.11k|  cdef_init_fb_row_fn(cm, xd, &fb_info, linebuf, src, cdef_sync, fbr);
  440|  6.11k|#if CONFIG_MULTITHREAD
  441|  6.11k|  if (cdef_sync && cm->cdef_info.allocated_num_workers > 1) {
  ------------------
  |  Branch (441:7): [True: 3.78k, False: 2.32k]
  |  Branch (441:20): [True: 3.78k, False: 0]
  ------------------
  442|  3.78k|    pthread_mutex_lock(cdef_sync->mutex_);
  443|  3.78k|    const bool cdef_mt_exit = cdef_sync->cdef_mt_exit;
  444|  3.78k|    pthread_mutex_unlock(cdef_sync->mutex_);
  445|       |    // Exit in case any worker has encountered an error.
  446|  3.78k|    if (cdef_mt_exit) return;
  ------------------
  |  Branch (446:9): [True: 0, False: 3.78k]
  ------------------
  447|  3.78k|  }
  448|  6.11k|#endif
  449|  20.8k|  for (int fbc = 0; fbc < nhfb; fbc++) {
  ------------------
  |  Branch (449:21): [True: 14.7k, False: 6.11k]
  ------------------
  450|  14.7k|    fb_info.frame_boundary[LEFT] = (MI_SIZE_64X64 * fbc == 0) ? 1 : 0;
  ------------------
  |  |   58|  14.7k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  14.7k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  |  Branch (450:36): [True: 6.10k, False: 8.60k]
  ------------------
  451|  14.7k|    if (fbc != nhfb - 1)
  ------------------
  |  Branch (451:9): [True: 8.60k, False: 6.10k]
  ------------------
  452|  8.60k|      fb_info.frame_boundary[RIGHT] =
  453|  8.60k|          (MI_SIZE_64X64 * (fbc + 1) == cm->mi_params.mi_cols) ? 1 : 0;
  ------------------
  |  |   58|  8.60k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  8.60k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  |  Branch (453:11): [True: 0, False: 8.60k]
  ------------------
  454|  6.10k|    else
  455|  6.10k|      fb_info.frame_boundary[RIGHT] = 1;
  456|  14.7k|    cdef_fb_col(cm, xd, &fb_info, colbuf, &cdef_left[0], fbc, fbr);
  457|  14.7k|  }
  458|  6.11k|}
av1_cdef_frame:
  468|  1.17k|                    MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn) {
  469|  1.17k|  const int num_planes = av1_num_planes(cm);
  470|  1.17k|  const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  1.17k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.17k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  1.17k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.17k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  471|       |
  472|  1.17k|  av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0,
  473|  1.17k|                       num_planes);
  474|       |
  475|  3.50k|  for (int fbr = 0; fbr < nvfb; fbr++)
  ------------------
  |  Branch (475:21): [True: 2.32k, False: 1.17k]
  ------------------
  476|  2.32k|    av1_cdef_fb_row(cm, xd, cm->cdef_info.linebuf, cm->cdef_info.colbuf,
  477|       |                    cm->cdef_info.srcbuf, fbr, cdef_init_fb_row_fn, NULL,
  478|  2.32k|                    xd->error_info);
  479|  1.17k|}
cdef.c:is_8x8_block_skip:
   30|   614k|                             int mi_stride) {
   31|   614k|  MB_MODE_INFO **mbmi = grid + mi_row * mi_stride + mi_col;
   32|   740k|  for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r, mbmi += mi_stride) {
  ------------------
  |  Branch (32:19): [True: 677k, False: 62.4k]
  ------------------
   33|   930k|    for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c) {
  ------------------
  |  Branch (33:21): [True: 804k, False: 125k]
  ------------------
   34|   804k|      if (!mbmi[c]->skip_txfm) return 0;
  ------------------
  |  Branch (34:11): [True: 552k, False: 252k]
  ------------------
   35|   804k|    }
   36|   677k|  }
   37|       |
   38|  62.4k|  return 1;
   39|   614k|}
cdef.c:cdef_fb_col:
  302|  14.7k|                        int *cdef_left, int fbc, int fbr) {
  303|  14.7k|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
  304|  14.7k|  const int mbmi_cdef_strength =
  305|  14.7k|      mi_params
  306|  14.7k|          ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
  ------------------
  |  |   58|  14.7k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  14.7k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  307|  14.7k|                         MI_SIZE_64X64 * fbc]
  ------------------
  |  |   58|  14.7k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  14.7k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  308|  14.7k|          ->cdef_strength;
  309|  14.7k|  const int num_planes = av1_num_planes(cm);
  310|  14.7k|  int is_zero_level[PLANE_TYPES] = { 1, 1 };
  311|  14.7k|  int level[PLANE_TYPES] = { 0 };
  312|  14.7k|  int sec_strength[PLANE_TYPES] = { 0 };
  313|  14.7k|  const CdefInfo *const cdef_info = &cm->cdef_info;
  314|       |
  315|  14.7k|  if (mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
  ------------------
  |  |   58|  14.7k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  14.7k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  |  Branch (315:7): [True: 4, False: 14.7k]
  ------------------
  316|  14.7k|                              MI_SIZE_64X64 * fbc] == NULL ||
  ------------------
  |  |   58|  14.7k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  14.7k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  317|  14.7k|      mbmi_cdef_strength == -1) {
  ------------------
  |  Branch (317:7): [True: 0, False: 14.7k]
  ------------------
  318|      0|    av1_zero_array(cdef_left, num_planes);
  ------------------
  |  |   44|      0|#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
  ------------------
  319|      0|    return;
  320|      0|  }
  321|       |
  322|       |  // Compute level and secondary strength for planes
  323|  14.7k|  level[PLANE_TYPE_Y] =
  324|  14.7k|      cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
  ------------------
  |  |   17|  14.7k|#define CDEF_SEC_STRENGTHS 4
  ------------------
  325|  14.7k|  sec_strength[PLANE_TYPE_Y] =
  326|  14.7k|      cdef_info->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
  ------------------
  |  |   17|  14.7k|#define CDEF_SEC_STRENGTHS 4
  ------------------
  327|  14.7k|  sec_strength[PLANE_TYPE_Y] += sec_strength[PLANE_TYPE_Y] == 3;
  328|  14.7k|  is_zero_level[PLANE_TYPE_Y] =
  329|  14.7k|      (level[PLANE_TYPE_Y] == 0) && (sec_strength[PLANE_TYPE_Y] == 0);
  ------------------
  |  Branch (329:7): [True: 4.00k, False: 10.7k]
  |  Branch (329:37): [True: 3.46k, False: 540]
  ------------------
  330|       |
  331|  14.7k|  if (num_planes > 1) {
  ------------------
  |  Branch (331:7): [True: 13.7k, False: 963]
  ------------------
  332|  13.7k|    level[PLANE_TYPE_UV] =
  333|  13.7k|        cdef_info->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
  ------------------
  |  |   17|  13.7k|#define CDEF_SEC_STRENGTHS 4
  ------------------
  334|  13.7k|    sec_strength[PLANE_TYPE_UV] =
  335|  13.7k|        cdef_info->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
  ------------------
  |  |   17|  13.7k|#define CDEF_SEC_STRENGTHS 4
  ------------------
  336|  13.7k|    sec_strength[PLANE_TYPE_UV] += sec_strength[PLANE_TYPE_UV] == 3;
  337|  13.7k|    is_zero_level[PLANE_TYPE_UV] =
  338|  13.7k|        (level[PLANE_TYPE_UV] == 0) && (sec_strength[PLANE_TYPE_UV] == 0);
  ------------------
  |  Branch (338:9): [True: 3.43k, False: 10.3k]
  |  Branch (338:40): [True: 2.36k, False: 1.07k]
  ------------------
  339|  13.7k|  }
  340|       |
  341|  14.7k|  if (is_zero_level[PLANE_TYPE_Y] && is_zero_level[PLANE_TYPE_UV]) {
  ------------------
  |  Branch (341:7): [True: 3.46k, False: 11.2k]
  |  Branch (341:38): [True: 2, False: 3.46k]
  ------------------
  342|      2|    av1_zero_array(cdef_left, num_planes);
  ------------------
  |  |   44|      2|#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
  ------------------
  343|      2|    return;
  344|      2|  }
  345|       |
  346|  14.7k|  fb_info->cdef_count = av1_cdef_compute_sb_list(mi_params, fbr * MI_SIZE_64X64,
  ------------------
  |  |   58|  14.7k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  14.7k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  347|  14.7k|                                                 fbc * MI_SIZE_64X64,
  ------------------
  |  |   58|  14.7k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  14.7k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  348|  14.7k|                                                 fb_info->dlist, BLOCK_64X64);
  349|  14.7k|  if (!fb_info->cdef_count) {
  ------------------
  |  Branch (349:7): [True: 661, False: 14.0k]
  ------------------
  350|    661|    av1_zero_array(cdef_left, num_planes);
  ------------------
  |  |   44|    661|#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
  ------------------
  351|    661|    return;
  352|    661|  }
  353|       |
  354|  54.4k|  for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (354:23): [True: 40.3k, False: 14.0k]
  ------------------
  355|       |    // Do not skip cdef filtering for luma plane as filter direction is
  356|       |    // computed based on luma.
  357|  40.3k|    if (plane && is_zero_level[get_plane_type(plane)]) {
  ------------------
  |  Branch (357:9): [True: 26.3k, False: 14.0k]
  |  Branch (357:18): [True: 4.71k, False: 21.5k]
  ------------------
  358|  4.71k|      cdef_left[plane] = 0;
  359|  4.71k|      continue;
  360|  4.71k|    }
  361|  35.6k|    cdef_init_fb_col(xd, fb_info, level, sec_strength, fbc, fbr, plane);
  362|  35.6k|    cdef_prepare_fb(cm, fb_info, colbuf, cdef_left[plane], fbc, fbr, plane);
  363|  35.6k|    cdef_filter_fb(fb_info, plane, cm->seq_params->use_highbitdepth);
  364|  35.6k|    cdef_left[plane] = 1;
  365|  35.6k|  }
  366|  14.0k|}
cdef.c:cdef_init_fb_col:
  285|  35.6k|                                    int plane) {
  286|  35.6k|  const PLANE_TYPE plane_type = get_plane_type(plane);
  287|  35.6k|  fb_info->level = level[plane_type];
  288|  35.6k|  fb_info->sec_strength = sec_strength[plane_type];
  289|  35.6k|  fb_info->dst = xd->plane[plane].dst.buf;
  290|  35.6k|  fb_info->dst_stride = xd->plane[plane].dst.stride;
  291|       |
  292|  35.6k|  fb_info->xdec = xd->plane[plane].subsampling_x;
  293|  35.6k|  fb_info->ydec = xd->plane[plane].subsampling_y;
  294|  35.6k|  fb_info->mi_wide_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_x;
  ------------------
  |  |   39|  35.6k|#define MI_SIZE_LOG2 2
  ------------------
  295|  35.6k|  fb_info->mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
  ------------------
  |  |   39|  35.6k|#define MI_SIZE_LOG2 2
  ------------------
  296|  35.6k|  fb_info->roffset = MI_SIZE_64X64 * fbr << fb_info->mi_high_l2;
  ------------------
  |  |   58|  35.6k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  35.6k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  297|  35.6k|  fb_info->coffset = MI_SIZE_64X64 * fbc << fb_info->mi_wide_l2;
  ------------------
  |  |   58|  35.6k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  35.6k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  298|  35.6k|}
cdef.c:cdef_prepare_fb:
  153|  35.6k|                            int fbc, int fbr, int plane) {
  154|  35.6k|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
  155|  35.6k|  uint16_t *src = fb_info->src;
  156|  35.6k|  const int luma_stride =
  157|  35.6k|      ALIGN_POWER_OF_TWO(mi_params->mi_cols << MI_SIZE_LOG2, 4);
  ------------------
  |  |   69|  35.6k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  158|  35.6k|  const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  35.6k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  35.6k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  35.6k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  35.6k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  159|  35.6k|  const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  35.6k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  35.6k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  35.6k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  35.6k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  160|  35.6k|  int cstart = 0;
  161|  35.6k|  if (!cdef_left) cstart = -CDEF_HBORDER;
  ------------------
  |  |   26|    433|#define CDEF_HBORDER (8)
  ------------------
  |  Branch (161:7): [True: 433, False: 35.1k]
  ------------------
  162|  35.6k|  int rend, cend;
  163|  35.6k|  const int nhb =
  164|  35.6k|      AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
  ------------------
  |  |   34|  35.6k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 20.9k, False: 14.7k]
  |  |  ------------------
  ------------------
  165|  35.6k|  const int nvb =
  166|  35.6k|      AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
  ------------------
  |  |   34|  35.6k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 22.0k, False: 13.5k]
  |  |  ------------------
  ------------------
  167|  35.6k|  const int hsize = nhb << fb_info->mi_wide_l2;
  168|  35.6k|  const int vsize = nvb << fb_info->mi_high_l2;
  169|  35.6k|  const uint16_t *top_linebuf = fb_info->top_linebuf[plane];
  170|  35.6k|  const uint16_t *bot_linebuf = fb_info->bot_linebuf[plane];
  171|  35.6k|  const int bot_offset = (vsize + CDEF_VBORDER) * CDEF_BSTRIDE;
  ------------------
  |  |   23|  35.6k|#define CDEF_VBORDER (2)
  ------------------
                const int bot_offset = (vsize + CDEF_VBORDER) * CDEF_BSTRIDE;
  ------------------
  |  |   28|  35.6k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  35.6k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  172|  35.6k|  const int stride =
  173|  35.6k|      luma_stride >> (plane == AOM_PLANE_Y ? 0 : cm->seq_params->subsampling_x);
  ------------------
  |  |  210|  35.6k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (173:23): [True: 14.0k, False: 21.5k]
  ------------------
  174|       |
  175|  35.6k|  if (fbc == nhfb - 1)
  ------------------
  |  Branch (175:7): [True: 14.7k, False: 20.9k]
  ------------------
  176|  14.7k|    cend = hsize;
  177|  20.9k|  else
  178|  20.9k|    cend = hsize + CDEF_HBORDER;
  ------------------
  |  |   26|  20.9k|#define CDEF_HBORDER (8)
  ------------------
  179|       |
  180|  35.6k|  if (fbr == nvfb - 1)
  ------------------
  |  Branch (180:7): [True: 13.6k, False: 22.0k]
  ------------------
  181|  13.6k|    rend = vsize;
  182|  22.0k|  else
  183|  22.0k|    rend = vsize + CDEF_VBORDER;
  ------------------
  |  |   23|  22.0k|#define CDEF_VBORDER (2)
  ------------------
  184|       |
  185|       |  /* Copy in the pixels we need from the current superblock for
  186|       |  deringing.*/
  187|  35.6k|  av1_cdef_copy_sb8_16(
  188|  35.6k|      cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart],
  ------------------
  |  |   23|  35.6k|#define CDEF_VBORDER (2)
  ------------------
                    cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart],
  ------------------
  |  |   28|  35.6k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  35.6k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                    cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart],
  ------------------
  |  |   26|  35.6k|#define CDEF_HBORDER (8)
  ------------------
  189|  35.6k|      CDEF_BSTRIDE, fb_info->dst, fb_info->roffset, fb_info->coffset + cstart,
  ------------------
  |  |   28|  35.6k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  35.6k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  190|  35.6k|      fb_info->dst_stride, vsize, cend - cstart);
  191|       |
  192|       |  /* Copy in the pixels we need for the current superblock from bottom buffer.*/
  193|  35.6k|  if (fbr < nvfb - 1) {
  ------------------
  |  Branch (193:7): [True: 22.0k, False: 13.5k]
  ------------------
  194|  22.0k|    copy_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   26|  22.0k|#define CDEF_HBORDER (8)
  ------------------
                  copy_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   28|  22.0k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  22.0k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  195|  22.0k|              &bot_linebuf[fb_info->coffset], stride, CDEF_VBORDER, hsize);
  ------------------
  |  |   23|  22.0k|#define CDEF_VBORDER (2)
  ------------------
  196|  22.0k|  } else {
  197|  13.5k|    fill_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
  ------------------
  |  |   26|  13.5k|#define CDEF_HBORDER (8)
  ------------------
                  fill_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
  ------------------
  |  |   28|  13.5k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  13.5k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                  fill_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
  ------------------
  |  |   23|  13.5k|#define CDEF_VBORDER (2)
  ------------------
  198|  13.5k|              hsize, CDEF_VERY_LARGE);
  ------------------
  |  |   30|  13.5k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  199|  13.5k|  }
  200|  35.6k|  if (fbr < nvfb - 1 && fbc > 0) {
  ------------------
  |  Branch (200:7): [True: 22.0k, False: 13.5k]
  |  Branch (200:25): [True: 14.1k, False: 7.92k]
  ------------------
  201|  14.1k|    copy_rect(&src[bot_offset], CDEF_BSTRIDE,
  ------------------
  |  |   28|  14.1k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  14.1k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  202|  14.1k|              &bot_linebuf[fb_info->coffset - CDEF_HBORDER], stride,
  ------------------
  |  |   26|  14.1k|#define CDEF_HBORDER (8)
  ------------------
  203|  14.1k|              CDEF_VBORDER, CDEF_HBORDER);
  ------------------
  |  |   23|  14.1k|#define CDEF_VBORDER (2)
  ------------------
                            CDEF_VBORDER, CDEF_HBORDER);
  ------------------
  |  |   26|  14.1k|#define CDEF_HBORDER (8)
  ------------------
  204|  21.5k|  } else {
  205|  21.5k|    fill_rect(&src[bot_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
  ------------------
  |  |   28|  21.5k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  21.5k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                  fill_rect(&src[bot_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
  ------------------
  |  |   23|  21.5k|#define CDEF_VBORDER (2)
  ------------------
                  fill_rect(&src[bot_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
  ------------------
  |  |   26|  21.5k|#define CDEF_HBORDER (8)
  ------------------
  206|  21.5k|              CDEF_VERY_LARGE);
  ------------------
  |  |   30|  21.5k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  207|  21.5k|  }
  208|  35.6k|  if (fbr < nvfb - 1 && fbc < nhfb - 1) {
  ------------------
  |  Branch (208:7): [True: 22.0k, False: 13.5k]
  |  Branch (208:25): [True: 14.3k, False: 7.67k]
  ------------------
  209|  14.3k|    copy_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   26|  14.3k|#define CDEF_HBORDER (8)
  ------------------
                  copy_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   28|  14.3k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  14.3k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  210|  14.3k|              &bot_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER,
  ------------------
  |  |   23|  14.3k|#define CDEF_VBORDER (2)
  ------------------
  211|  14.3k|              CDEF_HBORDER);
  ------------------
  |  |   26|  14.3k|#define CDEF_HBORDER (8)
  ------------------
  212|  21.2k|  } else {
  213|  21.2k|    fill_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   26|  21.2k|#define CDEF_HBORDER (8)
  ------------------
                  fill_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   28|  21.2k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  21.2k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  214|  21.2k|              CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   23|  21.2k|#define CDEF_VBORDER (2)
  ------------------
                            CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   26|  21.2k|#define CDEF_HBORDER (8)
  ------------------
                            CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   30|  21.2k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  215|  21.2k|  }
  216|       |
  217|       |  /* Copy in the pixels we need from the current superblock from top buffer.*/
  218|  35.6k|  if (fbr > 0) {
  ------------------
  |  Branch (218:7): [True: 21.6k, False: 13.9k]
  ------------------
  219|  21.6k|    copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &top_linebuf[fb_info->coffset],
  ------------------
  |  |   26|  21.6k|#define CDEF_HBORDER (8)
  ------------------
                  copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &top_linebuf[fb_info->coffset],
  ------------------
  |  |   28|  21.6k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  21.6k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  220|  21.6k|              stride, CDEF_VBORDER, hsize);
  ------------------
  |  |   23|  21.6k|#define CDEF_VBORDER (2)
  ------------------
  221|  21.6k|  } else {
  222|  13.9k|    fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize,
  ------------------
  |  |   26|  13.9k|#define CDEF_HBORDER (8)
  ------------------
                  fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize,
  ------------------
  |  |   28|  13.9k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  13.9k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                  fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize,
  ------------------
  |  |   23|  13.9k|#define CDEF_VBORDER (2)
  ------------------
  223|  13.9k|              CDEF_VERY_LARGE);
  ------------------
  |  |   30|  13.9k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  224|  13.9k|  }
  225|  35.6k|  if (fbr > 0 && fbc > 0) {
  ------------------
  |  Branch (225:7): [True: 21.6k, False: 13.9k]
  |  Branch (225:18): [True: 14.0k, False: 7.64k]
  ------------------
  226|  14.0k|    copy_rect(src, CDEF_BSTRIDE, &top_linebuf[fb_info->coffset - CDEF_HBORDER],
  ------------------
  |  |   28|  14.0k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  14.0k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                  copy_rect(src, CDEF_BSTRIDE, &top_linebuf[fb_info->coffset - CDEF_HBORDER],
  ------------------
  |  |   26|  14.0k|#define CDEF_HBORDER (8)
  ------------------
  227|  14.0k|              stride, CDEF_VBORDER, CDEF_HBORDER);
  ------------------
  |  |   23|  14.0k|#define CDEF_VBORDER (2)
  ------------------
                            stride, CDEF_VBORDER, CDEF_HBORDER);
  ------------------
  |  |   26|  14.0k|#define CDEF_HBORDER (8)
  ------------------
  228|  21.6k|  } else {
  229|  21.6k|    fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   28|  21.6k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  21.6k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                  fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   23|  21.6k|#define CDEF_VBORDER (2)
  ------------------
                  fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   26|  21.6k|#define CDEF_HBORDER (8)
  ------------------
                  fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   30|  21.6k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  230|  21.6k|  }
  231|  35.6k|  if (fbr > 0 && fbc < nhfb - 1) {
  ------------------
  |  Branch (231:7): [True: 21.6k, False: 13.9k]
  |  Branch (231:18): [True: 14.0k, False: 7.58k]
  ------------------
  232|  14.0k|    copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   26|  14.0k|#define CDEF_HBORDER (8)
  ------------------
                  copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   28|  14.0k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  14.0k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  233|  14.0k|              &top_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER,
  ------------------
  |  |   23|  14.0k|#define CDEF_VBORDER (2)
  ------------------
  234|  14.0k|              CDEF_HBORDER);
  ------------------
  |  |   26|  14.0k|#define CDEF_HBORDER (8)
  ------------------
  235|  21.5k|  } else {
  236|  21.5k|    fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
  ------------------
  |  |   26|  21.5k|#define CDEF_HBORDER (8)
  ------------------
                  fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
  ------------------
  |  |   28|  21.5k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  21.5k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                  fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
  ------------------
  |  |   23|  21.5k|#define CDEF_VBORDER (2)
  ------------------
  237|  21.5k|              CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   26|  21.5k|#define CDEF_HBORDER (8)
  ------------------
                            CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   30|  21.5k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  238|  21.5k|  }
  239|  35.6k|  if (cdef_left) {
  ------------------
  |  Branch (239:7): [True: 35.2k, False: 405]
  ------------------
  240|       |    /* If we deringed the superblock on the left then we need to copy in
  241|       |    saved pixels. */
  242|  35.2k|    copy_rect(src, CDEF_BSTRIDE, colbuf[plane], CDEF_HBORDER,
  ------------------
  |  |   28|  35.2k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  35.2k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                  copy_rect(src, CDEF_BSTRIDE, colbuf[plane], CDEF_HBORDER,
  ------------------
  |  |   26|  35.2k|#define CDEF_HBORDER (8)
  ------------------
  243|  35.2k|              rend + CDEF_VBORDER, CDEF_HBORDER);
  ------------------
  |  |   23|  35.2k|#define CDEF_VBORDER (2)
  ------------------
                            rend + CDEF_VBORDER, CDEF_HBORDER);
  ------------------
  |  |   26|  35.2k|#define CDEF_HBORDER (8)
  ------------------
  244|  35.2k|  }
  245|       |  /* Saving pixels in case we need to dering the superblock on the
  246|       |  right. */
  247|  35.6k|  copy_rect(colbuf[plane], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE,
  ------------------
  |  |   26|  35.6k|#define CDEF_HBORDER (8)
  ------------------
                copy_rect(colbuf[plane], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE,
  ------------------
  |  |   28|  35.6k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  35.6k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  248|  35.6k|            rend + CDEF_VBORDER, CDEF_HBORDER);
  ------------------
  |  |   23|  35.6k|#define CDEF_VBORDER (2)
  ------------------
                          rend + CDEF_VBORDER, CDEF_HBORDER);
  ------------------
  |  |   26|  35.6k|#define CDEF_HBORDER (8)
  ------------------
  249|       |
  250|  35.6k|  if (fb_info->frame_boundary[LEFT]) {
  ------------------
  |  Branch (250:7): [True: 15.0k, False: 20.6k]
  ------------------
  251|  15.0k|    fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER,
  ------------------
  |  |   28|  15.0k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  15.0k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                  fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER,
  ------------------
  |  |   23|  15.0k|#define CDEF_VBORDER (2)
  ------------------
                  fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER,
  ------------------
  |  |   26|  15.0k|#define CDEF_HBORDER (8)
  ------------------
  252|  15.0k|              CDEF_VERY_LARGE);
  ------------------
  |  |   30|  15.0k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  253|  15.0k|  }
  254|  35.6k|  if (fb_info->frame_boundary[RIGHT]) {
  ------------------
  |  Branch (254:7): [True: 14.7k, False: 20.9k]
  ------------------
  255|  14.7k|    fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   26|  14.7k|#define CDEF_HBORDER (8)
  ------------------
                  fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   28|  14.7k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  14.7k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  256|  14.7k|              vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   23|  14.7k|#define CDEF_VBORDER (2)
  ------------------
                            vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   26|  14.7k|#define CDEF_HBORDER (8)
  ------------------
                            vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   30|  14.7k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  257|  14.7k|  }
  258|  35.6k|}
cdef.c:copy_rect:
  133|   170k|                             int sstride, int v, int h) {
  134|  3.36M|  for (int i = 0; i < v; i++) {
  ------------------
  |  Branch (134:19): [True: 3.19M, False: 170k]
  ------------------
  135|  31.2M|    for (int j = 0; j < h; j++) {
  ------------------
  |  Branch (135:21): [True: 28.0M, False: 3.19M]
  ------------------
  136|  28.0M|      dst[i * dstride + j] = src[i * sstride + j];
  137|  28.0M|    }
  138|  3.19M|  }
  139|   170k|}
cdef.c:cdef_filter_fb:
  261|  35.6k|                                  uint8_t use_highbitdepth) {
  262|  35.6k|  ptrdiff_t offset =
  263|  35.6k|      (ptrdiff_t)fb_info->dst_stride * fb_info->roffset + fb_info->coffset;
  264|  35.6k|  if (use_highbitdepth) {
  ------------------
  |  Branch (264:7): [True: 12.9k, False: 22.6k]
  ------------------
  265|  12.9k|    av1_cdef_filter_fb(
  266|  12.9k|        NULL, CONVERT_TO_SHORTPTR(fb_info->dst + offset), fb_info->dst_stride,
  ------------------
  |  |   75|  12.9k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  267|  12.9k|        &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER],
  ------------------
  |  |   23|  12.9k|#define CDEF_VBORDER (2)
  ------------------
                      &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER],
  ------------------
  |  |   28|  12.9k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  12.9k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                      &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER],
  ------------------
  |  |   26|  12.9k|#define CDEF_HBORDER (8)
  ------------------
  268|  12.9k|        fb_info->xdec, fb_info->ydec, fb_info->dir, NULL, fb_info->var, plane,
  269|  12.9k|        fb_info->dlist, fb_info->cdef_count, fb_info->level,
  270|  12.9k|        fb_info->sec_strength, fb_info->damping, fb_info->coeff_shift);
  271|  22.6k|  } else {
  272|  22.6k|    av1_cdef_filter_fb(
  273|  22.6k|        fb_info->dst + offset, NULL, fb_info->dst_stride,
  274|  22.6k|        &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER],
  ------------------
  |  |   23|  22.6k|#define CDEF_VBORDER (2)
  ------------------
                      &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER],
  ------------------
  |  |   28|  22.6k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  22.6k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                      &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER],
  ------------------
  |  |   26|  22.6k|#define CDEF_HBORDER (8)
  ------------------
  275|       |        fb_info->xdec, fb_info->ydec, fb_info->dir, NULL, fb_info->var, plane,
  276|  22.6k|        fb_info->dlist, fb_info->cdef_count, fb_info->level,
  277|  22.6k|        fb_info->sec_strength, fb_info->damping, fb_info->coeff_shift);
  278|  22.6k|  }
  279|  35.6k|}

av1_cdef_filter_fb:
  328|  35.6k|                        int sec_strength, int damping, int coeff_shift) {
  329|  35.6k|  int bi;
  330|  35.6k|  int bx;
  331|  35.6k|  int by;
  332|  35.6k|  const int pri_strength = level << coeff_shift;
  333|  35.6k|  sec_strength <<= coeff_shift;
  334|  35.6k|  damping += coeff_shift - (pli != AOM_PLANE_Y);
  ------------------
  |  |  210|  35.6k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  335|  35.6k|  const int bw_log2 = 3 - xdec;
  336|  35.6k|  const int bh_log2 = 3 - ydec;
  337|  35.6k|  if (dirinit && pri_strength == 0 && sec_strength == 0) {
  ------------------
  |  Branch (337:7): [True: 0, False: 35.6k]
  |  Branch (337:18): [True: 0, False: 0]
  |  Branch (337:39): [True: 0, False: 0]
  ------------------
  338|       |    // If we're here, both primary and secondary strengths are 0, and
  339|       |    // we still haven't written anything to y[] yet, so we just copy
  340|       |    // the input to y[]. This is necessary only for av1_cdef_search()
  341|       |    // and only av1_cdef_search() sets dirinit.
  342|      0|    for (bi = 0; bi < cdef_count; bi++) {
  ------------------
  |  Branch (342:18): [True: 0, False: 0]
  ------------------
  343|      0|      by = dlist[bi].by;
  344|      0|      bx = dlist[bi].bx;
  345|       |      // TODO(stemidts/jmvalin): SIMD optimisations
  346|      0|      for (int iy = 0; iy < 1 << bh_log2; iy++) {
  ------------------
  |  Branch (346:24): [True: 0, False: 0]
  ------------------
  347|      0|        memcpy(&dst16[(bi << (bw_log2 + bh_log2)) + (iy << bw_log2)],
  348|      0|               &in[((by << bh_log2) + iy) * CDEF_BSTRIDE + (bx << bw_log2)],
  ------------------
  |  |   28|      0|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|      0|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  349|      0|               ((size_t)1 << bw_log2) * sizeof(*dst16));
  350|      0|      }
  351|      0|    }
  352|      0|    return;
  353|      0|  }
  354|       |
  355|  35.6k|  if (pli == 0) {
  ------------------
  |  Branch (355:7): [True: 14.0k, False: 21.6k]
  ------------------
  356|  14.0k|    if (!dirinit || !*dirinit) {
  ------------------
  |  Branch (356:9): [True: 14.0k, False: 0]
  |  Branch (356:21): [True: 0, False: 0]
  ------------------
  357|  14.0k|      aom_cdef_find_dir(in, dlist, var, cdef_count, coeff_shift, dir);
  358|  14.0k|      if (dirinit) *dirinit = 1;
  ------------------
  |  Branch (358:11): [True: 0, False: 14.0k]
  ------------------
  359|  14.0k|    }
  360|  14.0k|  }
  361|  35.6k|  if (pli == 1 && xdec != ydec) {
  ------------------
  |  Branch (361:7): [True: 10.8k, False: 24.8k]
  |  Branch (361:19): [True: 762, False: 10.0k]
  ------------------
  362|  46.1k|    for (bi = 0; bi < cdef_count; bi++) {
  ------------------
  |  Branch (362:18): [True: 45.3k, False: 762]
  ------------------
  363|  45.3k|      static const int conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 };
  364|  45.3k|      static const int conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 };
  365|  45.3k|      by = dlist[bi].by;
  366|  45.3k|      bx = dlist[bi].bx;
  367|  45.3k|      dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]];
  ------------------
  |  Branch (367:22): [True: 45.3k, False: 0]
  ------------------
  368|  45.3k|    }
  369|    762|  }
  370|       |
  371|  35.6k|  if (dst8) {
  ------------------
  |  Branch (371:7): [True: 22.6k, False: 12.9k]
  ------------------
  372|  22.6k|    const int block_width = 8 >> xdec;
  373|  22.6k|    const int block_height = 8 >> ydec;
  374|       |    /*
  375|       |     * strength_index == 0 : enable_primary = 1, enable_secondary = 1
  376|       |     * strength_index == 1 : enable_primary = 1, enable_secondary = 0
  377|       |     * strength_index == 2 : enable_primary = 0, enable_secondary = 1
  378|       |     * strength_index == 3 : enable_primary = 0, enable_secondary = 0
  379|       |     */
  380|  22.6k|    const cdef_filter_block_func cdef_filter_fn[4] = {
  381|  22.6k|      cdef_filter_8_0, cdef_filter_8_1, cdef_filter_8_2, cdef_filter_8_3
  382|  22.6k|    };
  383|       |
  384|   839k|    for (bi = 0; bi < cdef_count; bi++) {
  ------------------
  |  Branch (384:18): [True: 816k, False: 22.6k]
  ------------------
  385|   816k|      by = dlist[bi].by;
  386|   816k|      bx = dlist[bi].bx;
  387|   816k|      const int t =
  388|   816k|          (pli ? pri_strength : adjust_strength(pri_strength, var[by][bx]));
  ------------------
  |  Branch (388:12): [True: 470k, False: 346k]
  ------------------
  389|   816k|      const int strength_index = (sec_strength == 0) | ((t == 0) << 1);
  390|       |
  391|   816k|      cdef_filter_fn[strength_index](
  392|   816k|          &dst8[(by << bh_log2) * dstride + (bx << bw_log2)], dstride,
  393|   816k|          &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], t,
  ------------------
  |  |   28|   816k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   816k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  394|   816k|          sec_strength, pri_strength ? dir[by][bx] : 0, damping, damping,
  ------------------
  |  Branch (394:25): [True: 631k, False: 184k]
  ------------------
  395|   816k|          coeff_shift, block_width, block_height);
  396|   816k|    }
  397|  22.6k|  } else {
  398|  12.9k|    const int block_width = 8 >> xdec;
  399|  12.9k|    const int block_height = 8 >> ydec;
  400|       |    /*
  401|       |     * strength_index == 0 : enable_primary = 1, enable_secondary = 1
  402|       |     * strength_index == 1 : enable_primary = 1, enable_secondary = 0
  403|       |     * strength_index == 2 : enable_primary = 0, enable_secondary = 1
  404|       |     * strength_index == 3 : enable_primary = 0, enable_secondary = 0
  405|       |     */
  406|  12.9k|    const cdef_filter_block_func cdef_filter_fn[4] = {
  407|  12.9k|      cdef_filter_16_0, cdef_filter_16_1, cdef_filter_16_2, cdef_filter_16_3
  408|  12.9k|    };
  409|       |
  410|   549k|    for (bi = 0; bi < cdef_count; bi++) {
  ------------------
  |  Branch (410:18): [True: 536k, False: 12.9k]
  ------------------
  411|   536k|      by = dlist[bi].by;
  412|   536k|      bx = dlist[bi].bx;
  413|   536k|      const int t =
  414|   536k|          (pli ? pri_strength : adjust_strength(pri_strength, var[by][bx]));
  ------------------
  |  Branch (414:12): [True: 357k, False: 179k]
  ------------------
  415|   536k|      const int strength_index = (sec_strength == 0) | ((t == 0) << 1);
  416|       |
  417|   536k|      cdef_filter_fn[strength_index](
  418|   536k|          &dst16[dirinit ? bi << (bw_log2 + bh_log2)
  ------------------
  |  Branch (418:18): [True: 0, False: 536k]
  ------------------
  419|   536k|                         : (by << bh_log2) * dstride + (bx << bw_log2)],
  420|   536k|          dirinit ? 1 << bw_log2 : dstride,
  ------------------
  |  Branch (420:11): [True: 0, False: 536k]
  ------------------
  421|   536k|          &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], t,
  ------------------
  |  |   28|   536k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   536k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  422|   536k|          sec_strength, pri_strength ? dir[by][bx] : 0, damping, damping,
  ------------------
  |  Branch (422:25): [True: 481k, False: 55.0k]
  ------------------
  423|   536k|          coeff_shift, block_width, block_height);
  424|   536k|    }
  425|  12.9k|  }
  426|  35.6k|}
cdef_block.c:aom_cdef_find_dir:
  298|  14.0k|                                     int dir[CDEF_NBLOCKS][CDEF_NBLOCKS]) {
  299|  14.0k|  int bi;
  300|       |
  301|       |  // Find direction of two 8x8 blocks together.
  302|   288k|  for (bi = 0; bi < cdef_count - 1; bi += 2) {
  ------------------
  |  Branch (302:16): [True: 274k, False: 14.0k]
  ------------------
  303|   274k|    const int by = dlist[bi].by;
  304|   274k|    const int bx = dlist[bi].bx;
  305|   274k|    const int by2 = dlist[bi + 1].by;
  306|   274k|    const int bx2 = dlist[bi + 1].bx;
  307|   274k|    const int pos1 = 8 * by * CDEF_BSTRIDE + 8 * bx;
  ------------------
  |  |   28|   274k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   274k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  308|   274k|    const int pos2 = 8 * by2 * CDEF_BSTRIDE + 8 * bx2;
  ------------------
  |  |   28|   274k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   274k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  309|   274k|    cdef_find_dir_dual(&in[pos1], &in[pos2], CDEF_BSTRIDE, &var[by][bx],
  ------------------
  |  |   28|   274k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   274k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  310|   274k|                       &var[by2][bx2], coeff_shift, &dir[by][bx],
  311|   274k|                       &dir[by2][bx2]);
  312|   274k|  }
  313|       |
  314|       |  // Process remaining 8x8 blocks here. One 8x8 at a time.
  315|  14.0k|  if (cdef_count % 2) {
  ------------------
  |  Branch (315:7): [True: 1.60k, False: 12.4k]
  ------------------
  316|  1.60k|    const int by = dlist[bi].by;
  317|  1.60k|    const int bx = dlist[bi].bx;
  318|  1.60k|    dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx],
  ------------------
  |  |   28|  1.60k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.60k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  319|  1.60k|                                CDEF_BSTRIDE, &var[by][bx], coeff_shift);
  ------------------
  |  |   28|  1.60k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.60k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  320|  1.60k|  }
  321|  14.0k|}
cdef_block.c:adjust_strength:
  289|   545k|static inline int adjust_strength(int strength, int32_t var) {
  290|   545k|  const int i = var >> 6 ? AOMMIN(get_msb(var >> 6), 12) : 0;
  ------------------
  |  |   34|  78.1k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 76.8k, False: 1.38k]
  |  |  ------------------
  ------------------
  |  Branch (290:17): [True: 78.1k, False: 467k]
  ------------------
  291|       |  /* We use the variance of 8x8 blocks to adjust the strength. */
  292|   545k|  return var ? (strength * (4 + i) + 8) >> 4 : 0;
  ------------------
  |  Branch (292:10): [True: 95.0k, False: 450k]
  ------------------
  293|   545k|}

cdef.c:fill_rect:
   58|   143k|                             uint16_t x) {
   59|  1.71M|  for (int i = 0; i < v; i++) {
  ------------------
  |  Branch (59:19): [True: 1.57M, False: 143k]
  ------------------
   60|  15.8M|    for (int j = 0; j < h; j++) {
  ------------------
  |  Branch (60:21): [True: 14.2M, False: 1.57M]
  ------------------
   61|  14.2M|      dst[i * dstride + j] = x;
   62|  14.2M|    }
   63|  1.57M|  }
   64|   143k|}

cdef_find_dir_avx2:
  163|  1.60k|                             int coeff_shift) {
  164|  1.60k|  int i;
  165|  1.60k|  int32_t cost[8];
  166|  1.60k|  int32_t best_cost = 0;
  167|  1.60k|  int best_dir = 0;
  168|  1.60k|  v128 lines[8];
  169|  14.3k|  for (i = 0; i < 8; i++) {
  ------------------
  |  Branch (169:15): [True: 12.7k, False: 1.60k]
  ------------------
  170|  12.7k|    lines[i] = v128_load_unaligned(&img[i * stride]);
  171|  12.7k|    lines[i] =
  172|  12.7k|        v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
  173|  12.7k|  }
  174|       |
  175|       |  /* Compute "mostly vertical" directions. */
  176|  1.60k|  v128 dir47 = compute_directions(lines, cost + 4);
  177|       |
  178|  1.60k|  array_reverse_transpose_8x8(lines, lines);
  179|       |
  180|       |  /* Compute "mostly horizontal" directions. */
  181|  1.60k|  v128 dir03 = compute_directions(lines, cost);
  182|       |
  183|  1.60k|  v128 max = v128_max_s32(dir03, dir47);
  184|  1.60k|  max = v128_max_s32(max, v128_align(max, max, 8));
  ------------------
  |  |   75|  1.60k|#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
  |  |  ------------------
  |  |  |  Branch (75:30): [True: 1.59k, Folded]
  |  |  ------------------
  ------------------
  185|  1.60k|  max = v128_max_s32(max, v128_align(max, max, 4));
  ------------------
  |  |   75|  1.60k|#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
  |  |  ------------------
  |  |  |  Branch (75:30): [True: 1.59k, Folded]
  |  |  ------------------
  ------------------
  186|  1.60k|  best_cost = v128_low_u32(max);
  187|  1.60k|  v128 t =
  188|  1.60k|      v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03));
  189|  1.60k|  best_dir = v128_movemask_8(v128_pack_s16_s8(t, t));
  190|  1.60k|  best_dir = get_msb(best_dir ^ (best_dir - 1));  // Count trailing zeros
  191|       |
  192|       |  /* Difference between the optimal variance and the variance along the
  193|       |     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  194|  1.60k|  *var = best_cost - cost[(best_dir + 4) & 7];
  195|       |  /* We'd normally divide by 840, but dividing by 1024 is close enough
  196|       |     for what we're going to do with this. */
  197|  1.60k|  *var >>= 10;
  198|  1.60k|  return best_dir;
  199|  1.60k|}
cdef_filter_8_0_avx2:
  687|   124k|                                int block_height) {
  688|   124k|  if (block_width == 8) {
  ------------------
  |  Branch (688:7): [True: 111k, False: 13.5k]
  ------------------
  689|   111k|    filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
  690|   111k|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  691|   111k|                     block_height, /*enable_primary=*/1,
  692|   111k|                     /*enable_secondary=*/1);
  693|   111k|  } else {
  694|  13.5k|    filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
  695|  13.5k|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  696|  13.5k|                     block_height, /*enable_primary=*/1,
  697|  13.5k|                     /*enable_secondary=*/1);
  698|  13.5k|  }
  699|   124k|}
cdef_filter_8_1_avx2:
  705|   320k|                                int block_height) {
  706|   320k|  if (block_width == 8) {
  ------------------
  |  Branch (706:7): [True: 61.0k, False: 259k]
  ------------------
  707|  61.0k|    filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
  708|  61.0k|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  709|  61.0k|                     block_height, /*enable_primary=*/1,
  710|  61.0k|                     /*enable_secondary=*/0);
  711|   259k|  } else {
  712|   259k|    filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
  713|   259k|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  714|   259k|                     block_height, /*enable_primary=*/1,
  715|   259k|                     /*enable_secondary=*/0);
  716|   259k|  }
  717|   320k|}
cdef_filter_8_2_avx2:
  722|   163k|                                int block_height) {
  723|   163k|  if (block_width == 8) {
  ------------------
  |  Branch (723:7): [True: 98.8k, False: 64.5k]
  ------------------
  724|  98.8k|    filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
  725|  98.8k|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  726|  98.8k|                     block_height, /*enable_primary=*/0,
  727|  98.8k|                     /*enable_secondary=*/1);
  728|  98.8k|  } else {
  729|  64.5k|    filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
  730|  64.5k|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  731|  64.5k|                     block_height, /*enable_primary=*/0,
  732|  64.5k|                     /*enable_secondary=*/1);
  733|  64.5k|  }
  734|   163k|}
cdef_filter_8_3_avx2:
  740|   220k|                                int block_height) {
  741|   220k|  (void)pri_strength;
  742|   220k|  (void)sec_strength;
  743|   220k|  (void)dir;
  744|   220k|  (void)pri_damping;
  745|   220k|  (void)sec_damping;
  746|   220k|  (void)coeff_shift;
  747|   220k|  (void)block_width;
  748|       |
  749|   220k|  if (block_width == 8) {
  ------------------
  |  Branch (749:7): [True: 220k, False: 18.4E]
  ------------------
  750|   220k|    copy_block_8xh(/*is_lowbd=*/1, dest, dstride, in, block_height);
  751|  18.4E|  } else {
  752|  18.4E|    copy_block_4xh(/*is_lowbd=*/1, dest, dstride, in, block_height);
  753|  18.4E|  }
  754|   220k|}
cdef_filter_16_0_avx2:
  760|   104k|                                 int block_height) {
  761|   104k|  if (block_width == 8) {
  ------------------
  |  Branch (761:7): [True: 31.8k, False: 72.4k]
  ------------------
  762|  31.8k|    filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
  763|  31.8k|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  764|  31.8k|                     block_height, /*enable_primary=*/1,
  765|  31.8k|                     /*enable_secondary=*/1);
  766|  72.4k|  } else {
  767|  72.4k|    filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
  768|  72.4k|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  769|  72.4k|                     block_height, /*enable_primary=*/1,
  770|  72.4k|                     /*enable_secondary=*/1);
  771|  72.4k|  }
  772|   104k|}
cdef_filter_16_1_avx2:
  778|   282k|                                 int block_height) {
  779|   282k|  if (block_width == 8) {
  ------------------
  |  Branch (779:7): [True: 34.6k, False: 248k]
  ------------------
  780|  34.6k|    filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
  781|  34.6k|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  782|  34.6k|                     block_height, /*enable_primary=*/1,
  783|  34.6k|                     /*enable_secondary=*/0);
  784|   248k|  } else {
  785|   248k|    filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
  786|   248k|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  787|   248k|                     block_height, /*enable_primary=*/1,
  788|   248k|                     /*enable_secondary=*/0);
  789|   248k|  }
  790|   282k|}
cdef_filter_16_2_avx2:
  795|   117k|                                 int block_height) {
  796|   117k|  if (block_width == 8) {
  ------------------
  |  Branch (796:7): [True: 114k, False: 2.70k]
  ------------------
  797|   114k|    filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
  798|   114k|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  799|   114k|                     block_height, /*enable_primary=*/0,
  800|   114k|                     /*enable_secondary=*/1);
  801|   114k|  } else {
  802|  2.70k|    filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
  803|  2.70k|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  804|  2.70k|                     block_height, /*enable_primary=*/0,
  805|  2.70k|                     /*enable_secondary=*/1);
  806|  2.70k|  }
  807|   117k|}
cdef_filter_16_3_avx2:
  813|  46.3k|                                 int block_height) {
  814|  46.3k|  (void)pri_strength;
  815|  46.3k|  (void)sec_strength;
  816|  46.3k|  (void)dir;
  817|  46.3k|  (void)pri_damping;
  818|  46.3k|  (void)sec_damping;
  819|  46.3k|  (void)coeff_shift;
  820|  46.3k|  (void)block_width;
  821|  46.3k|  if (block_width == 8) {
  ------------------
  |  Branch (821:7): [True: 46.3k, False: 18.4E]
  ------------------
  822|  46.3k|    copy_block_8xh(/*is_lowbd=*/0, dest, dstride, in, block_height);
  823|  18.4E|  } else {
  824|  18.4E|    copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height);
  825|  18.4E|  }
  826|  46.3k|}
cdef_copy_rect8_16bit_to_16bit_avx2:
  831|  18.1k|                                               int width, int height) {
  832|  18.1k|  int i, j;
  833|   547k|  for (i = 0; i < height; i++) {
  ------------------
  |  Branch (833:15): [True: 529k, False: 18.1k]
  ------------------
  834|  3.63M|    for (j = 0; j < (width & ~0x7); j += 8) {
  ------------------
  |  Branch (834:17): [True: 3.10M, False: 529k]
  ------------------
  835|  3.10M|      v128 row = v128_load_unaligned(&src[i * sstride + j]);
  836|  3.10M|      v128_store_unaligned(&dst[i * dstride + j], row);
  837|  3.10M|    }
  838|   747k|    for (; j < width; j++) {
  ------------------
  |  Branch (838:12): [True: 218k, False: 529k]
  ------------------
  839|   218k|      dst[i * dstride + j] = src[i * sstride + j];
  840|   218k|    }
  841|   529k|  }
  842|  18.1k|}
cdef_block_avx2.c:compute_directions:
   62|  3.20k|static inline v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) {
   63|  3.20k|  v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
   64|  3.20k|  v128 partial6;
   65|  3.20k|  v128 tmp;
   66|       |  /* Partial sums for lines 0 and 1. */
   67|  3.20k|  partial4a = v128_shl_n_byte(lines[0], 14);
  ------------------
  |  |  595|  3.20k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
  ------------------
   68|  3.20k|  partial4b = v128_shr_n_byte(lines[0], 2);
  ------------------
  |  |  596|  3.20k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
  ------------------
   69|  3.20k|  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12));
  ------------------
  |  |  595|  3.20k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
  ------------------
   70|  3.20k|  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4));
  ------------------
  |  |  596|  3.20k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
  ------------------
   71|  3.20k|  tmp = v128_add_16(lines[0], lines[1]);
   72|  3.20k|  partial5a = v128_shl_n_byte(tmp, 10);
  ------------------
  |  |  595|  3.20k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
  ------------------
   73|  3.20k|  partial5b = v128_shr_n_byte(tmp, 6);
  ------------------
  |  |  596|  3.20k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
  ------------------
   74|  3.20k|  partial7a = v128_shl_n_byte(tmp, 4);
  ------------------
  |  |  595|  3.20k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
  ------------------
   75|  3.20k|  partial7b = v128_shr_n_byte(tmp, 12);
  ------------------
  |  |  596|  3.20k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
  ------------------
   76|  3.20k|  partial6 = tmp;
   77|       |
   78|       |  /* Partial sums for lines 2 and 3. */
   79|  3.20k|  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10));
  ------------------
  |  |  595|  3.20k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
  ------------------
   80|  3.20k|  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6));
  ------------------
  |  |  596|  3.20k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
  ------------------
   81|  3.20k|  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8));
  ------------------
  |  |  595|  3.20k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
  ------------------
   82|  3.20k|  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8));
  ------------------
  |  |  596|  3.20k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
  ------------------
   83|  3.20k|  tmp = v128_add_16(lines[2], lines[3]);
   84|  3.20k|  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8));
  ------------------
  |  |  595|  3.20k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
  ------------------
   85|  3.20k|  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8));
  ------------------
  |  |  596|  3.20k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
  ------------------
   86|  3.20k|  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6));
  ------------------
  |  |  595|  3.20k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
  ------------------
   87|  3.20k|  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10));
  ------------------
  |  |  596|  3.20k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
  ------------------
   88|  3.20k|  partial6 = v128_add_16(partial6, tmp);
   89|       |
   90|       |  /* Partial sums for lines 4 and 5. */
   91|  3.20k|  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6));
  ------------------
  |  |  595|  3.20k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
  ------------------
   92|  3.20k|  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10));
  ------------------
  |  |  596|  3.20k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
  ------------------
   93|  3.20k|  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4));
  ------------------
  |  |  595|  3.20k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
  ------------------
   94|  3.20k|  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12));
  ------------------
  |  |  596|  3.20k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
  ------------------
   95|  3.20k|  tmp = v128_add_16(lines[4], lines[5]);
   96|  3.20k|  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6));
  ------------------
  |  |  595|  3.20k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
  ------------------
   97|  3.20k|  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10));
  ------------------
  |  |  596|  3.20k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
  ------------------
   98|  3.20k|  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8));
  ------------------
  |  |  595|  3.20k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
  ------------------
   99|  3.20k|  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8));
  ------------------
  |  |  596|  3.20k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
  ------------------
  100|  3.20k|  partial6 = v128_add_16(partial6, tmp);
  101|       |
  102|       |  /* Partial sums for lines 6 and 7. */
  103|  3.20k|  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2));
  ------------------
  |  |  595|  3.20k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
  ------------------
  104|  3.20k|  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14));
  ------------------
  |  |  596|  3.20k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
  ------------------
  105|  3.20k|  partial4a = v128_add_16(partial4a, lines[7]);
  106|  3.20k|  tmp = v128_add_16(lines[6], lines[7]);
  107|  3.20k|  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4));
  ------------------
  |  |  595|  3.20k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
  ------------------
  108|  3.20k|  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12));
  ------------------
  |  |  596|  3.20k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
  ------------------
  109|  3.20k|  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10));
  ------------------
  |  |  595|  3.20k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c) & 127)
  ------------------
  110|  3.20k|  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6));
  ------------------
  |  |  596|  3.20k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c) & 127)
  ------------------
  111|  3.20k|  partial6 = v128_add_16(partial6, tmp);
  112|       |
  113|       |  /* Compute costs in terms of partial sums. */
  114|  3.20k|  partial4a =
  115|  3.20k|      fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840),
  116|  3.20k|                       v128_from_32(105, 120, 140, 168));
  117|  3.20k|  partial7a =
  118|  3.20k|      fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0),
  119|  3.20k|                       v128_from_32(105, 105, 105, 140));
  120|  3.20k|  partial5a =
  121|  3.20k|      fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0),
  122|  3.20k|                       v128_from_32(105, 105, 105, 140));
  123|  3.20k|  partial6 = v128_madd_s16(partial6, partial6);
  124|  3.20k|  partial6 = v128_mullo_s32(partial6, v128_dup_32(105));
  125|       |
  126|  3.20k|  partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
  127|  3.20k|  v128_store_unaligned(tmp_cost1, partial4a);
  128|  3.20k|  return partial4a;
  129|  3.20k|}
cdef_block_avx2.c:fold_mul_and_sum:
   27|  9.60k|                                    v128 const2) {
   28|  9.60k|  v128 tmp;
   29|       |  /* Reverse partial B. */
   30|  9.60k|  partialb = v128_shuffle_8(
   31|  9.60k|      partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c));
   32|       |  /* Interleave the x and y values of identical indices and pair x8 with 0. */
   33|  9.60k|  tmp = partiala;
   34|  9.60k|  partiala = v128_ziplo_16(partialb, partiala);
   35|  9.60k|  partialb = v128_ziphi_16(partialb, tmp);
   36|       |  /* Square and add the corresponding x and y values. */
   37|  9.60k|  partiala = v128_madd_s16(partiala, partiala);
   38|  9.60k|  partialb = v128_madd_s16(partialb, partialb);
   39|       |  /* Multiply by constant. */
   40|  9.60k|  partiala = v128_mullo_s32(partiala, const1);
   41|  9.60k|  partialb = v128_mullo_s32(partialb, const2);
   42|       |  /* Sum all results. */
   43|  9.60k|  partiala = v128_add_32(partiala, partialb);
   44|  9.60k|  return partiala;
   45|  9.60k|}
cdef_block_avx2.c:hsum4:
   47|  3.20k|static inline v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) {
   48|  3.20k|  v128 t0, t1, t2, t3;
   49|  3.20k|  t0 = v128_ziplo_32(x1, x0);
   50|  3.20k|  t1 = v128_ziplo_32(x3, x2);
   51|  3.20k|  t2 = v128_ziphi_32(x1, x0);
   52|  3.20k|  t3 = v128_ziphi_32(x3, x2);
   53|  3.20k|  x0 = v128_ziplo_64(t1, t0);
   54|  3.20k|  x1 = v128_ziphi_64(t1, t0);
   55|  3.20k|  x2 = v128_ziplo_64(t3, t2);
   56|  3.20k|  x3 = v128_ziphi_64(t3, t2);
   57|  3.20k|  return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3));
   58|  3.20k|}
cdef_block_avx2.c:array_reverse_transpose_8x8:
  133|  1.60k|static inline void array_reverse_transpose_8x8(v128 *in, v128 *res) {
  134|  1.60k|  const v128 tr0_0 = v128_ziplo_16(in[1], in[0]);
  135|  1.60k|  const v128 tr0_1 = v128_ziplo_16(in[3], in[2]);
  136|  1.60k|  const v128 tr0_2 = v128_ziphi_16(in[1], in[0]);
  137|  1.60k|  const v128 tr0_3 = v128_ziphi_16(in[3], in[2]);
  138|  1.60k|  const v128 tr0_4 = v128_ziplo_16(in[5], in[4]);
  139|  1.60k|  const v128 tr0_5 = v128_ziplo_16(in[7], in[6]);
  140|  1.60k|  const v128 tr0_6 = v128_ziphi_16(in[5], in[4]);
  141|  1.60k|  const v128 tr0_7 = v128_ziphi_16(in[7], in[6]);
  142|       |
  143|  1.60k|  const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0);
  144|  1.60k|  const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4);
  145|  1.60k|  const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0);
  146|  1.60k|  const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4);
  147|  1.60k|  const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2);
  148|  1.60k|  const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6);
  149|  1.60k|  const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2);
  150|  1.60k|  const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6);
  151|       |
  152|  1.60k|  res[7] = v128_ziplo_64(tr1_1, tr1_0);
  153|  1.60k|  res[6] = v128_ziphi_64(tr1_1, tr1_0);
  154|  1.60k|  res[5] = v128_ziplo_64(tr1_3, tr1_2);
  155|  1.60k|  res[4] = v128_ziphi_64(tr1_3, tr1_2);
  156|  1.60k|  res[3] = v128_ziplo_64(tr1_5, tr1_4);
  157|  1.60k|  res[2] = v128_ziphi_64(tr1_5, tr1_4);
  158|  1.60k|  res[1] = v128_ziplo_64(tr1_7, tr1_6);
  159|  1.60k|  res[0] = v128_ziphi_64(tr1_7, tr1_6);
  160|  1.60k|}
cdef_block_avx2.c:filter_block_8x8:
  476|   446k|                                  int enable_primary, int enable_secondary) {
  477|   446k|  uint8_t *dst8 = (uint8_t *)dest;
  478|   446k|  uint16_t *dst16 = (uint16_t *)dest;
  479|   446k|  const int clipping_required = enable_primary && enable_secondary;
  ------------------
  |  Branch (479:33): [True: 233k, False: 212k]
  |  Branch (479:51): [True: 143k, False: 90.7k]
  ------------------
  480|   446k|  int i;
  481|   446k|  v256 sum, p0, p1, p2, p3, row, res;
  482|   446k|  const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE);
  ------------------
  |  |   30|   446k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  483|   446k|  v256 max, min;
  484|   446k|  const int po1 = cdef_directions[dir][0];
  485|   446k|  const int po2 = cdef_directions[dir][1];
  486|   446k|  const int s1o1 = cdef_directions[dir + 2][0];
  487|   446k|  const int s1o2 = cdef_directions[dir + 2][1];
  488|   446k|  const int s2o1 = cdef_directions[dir - 2][0];
  489|   446k|  const int s2o2 = cdef_directions[dir - 2][1];
  490|   446k|  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  491|   446k|  const int *sec_taps = cdef_sec_taps;
  492|       |
  493|   446k|  if (enable_primary && pri_strength)
  ------------------
  |  Branch (493:7): [True: 235k, False: 211k]
  |  Branch (493:25): [True: 235k, False: 18.4E]
  ------------------
  494|   235k|    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  ------------------
  |  |   35|   235k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 60.9k, False: 174k]
  |  |  ------------------
  ------------------
  495|   446k|  if (enable_secondary && sec_strength)
  ------------------
  |  Branch (495:7): [True: 357k, False: 89.3k]
  |  Branch (495:27): [True: 357k, False: 18.4E]
  ------------------
  496|   357k|    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  ------------------
  |  |   35|   357k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 357k]
  |  |  ------------------
  ------------------
  497|       |
  498|  2.05M|  for (i = 0; i < height; i += 2) {
  ------------------
  |  Branch (498:15): [True: 1.61M, False: 446k]
  ------------------
  499|  1.61M|    v256 tap[8];
  500|  1.61M|    sum = v256_zero();
  501|  1.61M|    row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
  ------------------
  |  |   28|  1.61M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.61M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  502|  1.61M|                         v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
  ------------------
  |  |   28|  1.61M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.61M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  503|       |
  504|  1.61M|    min = max = row;
  505|  1.61M|    if (enable_primary) {
  ------------------
  |  Branch (505:9): [True: 826k, False: 784k]
  ------------------
  506|       |      // Primary near taps
  507|   826k|      tap[0] = v256_from_v128(
  508|   826k|          v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
  ------------------
  |  |   28|   826k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   826k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  509|   826k|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
  ------------------
  |  |   28|   826k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   826k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  510|   826k|      tap[1] = v256_from_v128(
  511|   826k|          v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
  ------------------
  |  |   28|   826k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   826k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  512|   826k|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
  ------------------
  |  |   28|   826k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   826k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  513|   826k|      p0 = constrain16(tap[0], row, pri_strength, pri_damping);
  514|   826k|      p1 = constrain16(tap[1], row, pri_strength, pri_damping);
  515|       |
  516|       |      // sum += pri_taps[0] * (p0 + p1)
  517|   826k|      sum = v256_add_16(
  518|   826k|          sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
  519|       |
  520|       |      // Primary far taps
  521|   826k|      tap[2] = v256_from_v128(
  522|   826k|          v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
  ------------------
  |  |   28|   826k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   826k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  523|   826k|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
  ------------------
  |  |   28|   826k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   826k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  524|   826k|      tap[3] = v256_from_v128(
  525|   826k|          v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
  ------------------
  |  |   28|   826k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   826k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  526|   826k|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
  ------------------
  |  |   28|   826k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   826k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  527|   826k|      p0 = constrain16(tap[2], row, pri_strength, pri_damping);
  528|   826k|      p1 = constrain16(tap[3], row, pri_strength, pri_damping);
  529|       |
  530|       |      // sum += pri_taps[1] * (p0 + p1)
  531|   826k|      sum = v256_add_16(
  532|   826k|          sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
  533|       |
  534|   826k|      if (clipping_required) {
  ------------------
  |  Branch (534:11): [True: 465k, False: 360k]
  ------------------
  535|   465k|        max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask);
  536|       |
  537|   465k|        min = v256_min_s16(min, tap[0]);
  538|   465k|        min = v256_min_s16(min, tap[1]);
  539|   465k|        min = v256_min_s16(min, tap[2]);
  540|   465k|        min = v256_min_s16(min, tap[3]);
  541|   465k|      }
  542|       |      // End primary
  543|   826k|    }
  544|       |
  545|  1.61M|    if (enable_secondary) {
  ------------------
  |  Branch (545:9): [True: 1.27M, False: 334k]
  ------------------
  546|       |      // Secondary near taps
  547|  1.27M|      tap[0] = v256_from_v128(
  548|  1.27M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
  ------------------
  |  |   28|  1.27M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.27M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  549|  1.27M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
  ------------------
  |  |   28|  1.27M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.27M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  550|  1.27M|      tap[1] = v256_from_v128(
  551|  1.27M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
  ------------------
  |  |   28|  1.27M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.27M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  552|  1.27M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
  ------------------
  |  |   28|  1.27M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.27M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  553|  1.27M|      tap[2] = v256_from_v128(
  554|  1.27M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
  ------------------
  |  |   28|  1.27M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.27M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  555|  1.27M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
  ------------------
  |  |   28|  1.27M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.27M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  556|  1.27M|      tap[3] = v256_from_v128(
  557|  1.27M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
  ------------------
  |  |   28|  1.27M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.27M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  558|  1.27M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
  ------------------
  |  |   28|  1.27M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.27M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  559|  1.27M|      p0 = constrain16(tap[0], row, sec_strength, sec_damping);
  560|  1.27M|      p1 = constrain16(tap[1], row, sec_strength, sec_damping);
  561|  1.27M|      p2 = constrain16(tap[2], row, sec_strength, sec_damping);
  562|  1.27M|      p3 = constrain16(tap[3], row, sec_strength, sec_damping);
  563|       |
  564|       |      // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
  565|  1.27M|      sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
  566|  1.27M|                                            v256_add_16(v256_add_16(p0, p1),
  567|  1.27M|                                                        v256_add_16(p2, p3))));
  568|       |
  569|       |      // Secondary far taps
  570|  1.27M|      tap[4] = v256_from_v128(
  571|  1.27M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
  ------------------
  |  |   28|  1.27M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.27M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  572|  1.27M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
  ------------------
  |  |   28|  1.27M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.27M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  573|  1.27M|      tap[5] = v256_from_v128(
  574|  1.27M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
  ------------------
  |  |   28|  1.27M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.27M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  575|  1.27M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
  ------------------
  |  |   28|  1.27M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.27M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  576|  1.27M|      tap[6] = v256_from_v128(
  577|  1.27M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
  ------------------
  |  |   28|  1.27M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.27M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  578|  1.27M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
  ------------------
  |  |   28|  1.27M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.27M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  579|  1.27M|      tap[7] = v256_from_v128(
  580|  1.27M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
  ------------------
  |  |   28|  1.27M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.27M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  581|  1.27M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
  ------------------
  |  |   28|  1.27M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.27M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  582|  1.27M|      p0 = constrain16(tap[4], row, sec_strength, sec_damping);
  583|  1.27M|      p1 = constrain16(tap[5], row, sec_strength, sec_damping);
  584|  1.27M|      p2 = constrain16(tap[6], row, sec_strength, sec_damping);
  585|  1.27M|      p3 = constrain16(tap[7], row, sec_strength, sec_damping);
  586|       |
  587|       |      // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
  588|  1.27M|      sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
  589|  1.27M|                                            v256_add_16(v256_add_16(p0, p1),
  590|  1.27M|                                                        v256_add_16(p2, p3))));
  591|       |
  592|  1.27M|      if (clipping_required) {
  ------------------
  |  Branch (592:11): [True: 460k, False: 816k]
  ------------------
  593|   460k|        max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask);
  594|       |
  595|   460k|        min = v256_min_s16(min, tap[0]);
  596|   460k|        min = v256_min_s16(min, tap[1]);
  597|   460k|        min = v256_min_s16(min, tap[2]);
  598|   460k|        min = v256_min_s16(min, tap[3]);
  599|   460k|        min = v256_min_s16(min, tap[4]);
  600|   460k|        min = v256_min_s16(min, tap[5]);
  601|   460k|        min = v256_min_s16(min, tap[6]);
  602|   460k|        min = v256_min_s16(min, tap[7]);
  603|   460k|      }
  604|       |      // End secondary
  605|  1.27M|    }
  606|       |
  607|       |    // res = row + ((sum - (sum < 0) + 8) >> 4)
  608|  1.61M|    sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
  609|  1.61M|    res = v256_add_16(sum, v256_dup_16(8));
  610|  1.61M|    res = v256_shr_n_s16(res, 4);
  ------------------
  |  |  695|  1.61M|#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
  ------------------
  611|  1.61M|    res = v256_add_16(row, res);
  612|  1.61M|    if (clipping_required) {
  ------------------
  |  Branch (612:9): [True: 460k, False: 1.15M]
  ------------------
  613|   460k|      res = v256_min_s16(v256_max_s16(res, min), max);
  614|   460k|    }
  615|       |
  616|  1.61M|    if (is_lowbd) {
  ------------------
  |  Branch (616:9): [True: 869k, False: 741k]
  ------------------
  617|   869k|      const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res));
  618|   869k|      v64_store_aligned(&dst8[i * dstride], v128_high_v64(res_128));
  619|   869k|      v64_store_aligned(&dst8[(i + 1) * dstride], v128_low_v64(res_128));
  620|   869k|    } else {
  621|   741k|      v128_store_unaligned(&dst16[i * dstride], v256_high_v128(res));
  622|   741k|      v128_store_unaligned(&dst16[(i + 1) * dstride], v256_low_v128(res));
  623|   741k|    }
  624|  1.61M|  }
  625|   446k|}
cdef_block_avx2.c:constrain16:
  211|  15.3M|                             unsigned int adjdamp) {
  212|  15.3M|  v256 diff = v256_sub_16(a, b);
  213|  15.3M|  const v256 sign = v256_shr_n_s16(diff, 15);
  ------------------
  |  |  695|  15.3M|#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
  ------------------
  214|  15.3M|  diff = v256_abs_s16(diff);
  215|  15.3M|  const v256 s =
  216|  15.3M|      v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp));
  217|  15.3M|  return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign);
  218|  15.3M|}
cdef_block_avx2.c:get_max_primary:
  221|   607k|                                 v256 cdef_large_value_mask) {
  222|   607k|  if (is_lowbd) {
  ------------------
  |  Branch (222:7): [True: 366k, False: 240k]
  ------------------
  223|   366k|    v256 max_u8;
  224|   366k|    max_u8 = tap[0];
  225|   366k|    max_u8 = v256_max_u8(max_u8, tap[1]);
  226|   366k|    max_u8 = v256_max_u8(max_u8, tap[2]);
  227|   366k|    max_u8 = v256_max_u8(max_u8, tap[3]);
  228|       |    /* The source is 16 bits, however, we only really care about the lower
  229|       |    8 bits.  The upper 8 bits contain the "large" flag.  After the final
  230|       |    primary max has been calculated, zero out the upper 8 bits.  Use this
  231|       |    to find the "16 bit" max. */
  232|   366k|    max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask));
  233|   366k|  } else {
  234|       |    /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
  235|   240k|    max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask));
  236|   240k|    max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask));
  237|   240k|    max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask));
  238|   240k|    max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask));
  239|   240k|  }
  240|   607k|  return max;
  241|   607k|}
cdef_block_avx2.c:get_max_secondary:
  244|   601k|                                   v256 cdef_large_value_mask) {
  245|   601k|  if (is_lowbd) {
  ------------------
  |  Branch (245:7): [True: 355k, False: 245k]
  ------------------
  246|   355k|    v256 max_u8;
  247|   355k|    max_u8 = tap[0];
  248|   355k|    max_u8 = v256_max_u8(max_u8, tap[1]);
  249|   355k|    max_u8 = v256_max_u8(max_u8, tap[2]);
  250|   355k|    max_u8 = v256_max_u8(max_u8, tap[3]);
  251|   355k|    max_u8 = v256_max_u8(max_u8, tap[4]);
  252|   355k|    max_u8 = v256_max_u8(max_u8, tap[5]);
  253|   355k|    max_u8 = v256_max_u8(max_u8, tap[6]);
  254|   355k|    max_u8 = v256_max_u8(max_u8, tap[7]);
  255|       |    /* The source is 16 bits, however, we only really care about the lower
  256|       |    8 bits.  The upper 8 bits contain the "large" flag.  After the final
  257|       |    primary max has been calculated, zero out the upper 8 bits.  Use this
  258|       |    to find the "16 bit" max. */
  259|   355k|    max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask));
  260|   355k|  } else {
  261|       |    /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
  262|   245k|    max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask));
  263|   245k|    max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask));
  264|   245k|    max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask));
  265|   245k|    max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask));
  266|   245k|    max = v256_max_s16(max, v256_and(tap[4], cdef_large_value_mask));
  267|   245k|    max = v256_max_s16(max, v256_and(tap[5], cdef_large_value_mask));
  268|   245k|    max = v256_max_s16(max, v256_and(tap[6], cdef_large_value_mask));
  269|   245k|    max = v256_max_s16(max, v256_and(tap[7], cdef_large_value_mask));
  270|   245k|  }
  271|   601k|  return max;
  272|   601k|}
cdef_block_avx2.c:filter_block_4x4:
  284|   658k|                                  int enable_primary, int enable_secondary) {
  285|   658k|  uint8_t *dst8 = (uint8_t *)dest;
  286|   658k|  uint16_t *dst16 = (uint16_t *)dest;
  287|   658k|  const int clipping_required = enable_primary && enable_secondary;
  ------------------
  |  Branch (287:33): [True: 591k, False: 67.3k]
  |  Branch (287:51): [True: 86.0k, False: 505k]
  ------------------
  288|   658k|  v256 p0, p1, p2, p3;
  289|   658k|  v256 sum, row, res;
  290|   658k|  v256 max, min;
  291|   658k|  const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE);
  ------------------
  |  |   30|   658k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  292|   658k|  const int po1 = cdef_directions[dir][0];
  293|   658k|  const int po2 = cdef_directions[dir][1];
  294|   658k|  const int s1o1 = cdef_directions[dir + 2][0];
  295|   658k|  const int s1o2 = cdef_directions[dir + 2][1];
  296|   658k|  const int s2o1 = cdef_directions[dir - 2][0];
  297|   658k|  const int s2o2 = cdef_directions[dir - 2][1];
  298|   658k|  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  299|   658k|  const int *sec_taps = cdef_sec_taps;
  300|   658k|  int i;
  301|       |
  302|   658k|  if (enable_primary && pri_strength)
  ------------------
  |  Branch (302:7): [True: 591k, False: 66.6k]
  |  Branch (302:25): [True: 592k, False: 18.4E]
  ------------------
  303|   592k|    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  ------------------
  |  |   35|   592k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 112k, False: 479k]
  |  |  ------------------
  ------------------
  304|   658k|  if (enable_secondary && sec_strength)
  ------------------
  |  Branch (304:7): [True: 153k, False: 505k]
  |  Branch (304:27): [True: 153k, False: 71]
  ------------------
  305|   153k|    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  ------------------
  |  |   35|   153k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 153k]
  |  |  ------------------
  ------------------
  306|       |
  307|  1.40M|  for (i = 0; i < height; i += 4) {
  ------------------
  |  Branch (307:15): [True: 747k, False: 658k]
  ------------------
  308|   747k|    sum = v256_zero();
  309|   747k|    row = v256_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]),
  ------------------
  |  |   28|   747k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   747k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  310|   747k|                        v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]),
  ------------------
  |  |   28|   747k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   747k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  311|   747k|                        v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]),
  ------------------
  |  |   28|   747k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   747k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  312|   747k|                        v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE]));
  ------------------
  |  |   28|   747k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   747k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  313|   747k|    max = min = row;
  314|       |
  315|   747k|    if (enable_primary) {
  ------------------
  |  Branch (315:9): [True: 681k, False: 66.5k]
  ------------------
  316|   681k|      v256 tap[4];
  317|       |      // Primary near taps
  318|   681k|      tap[0] =
  319|   681k|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po1]),
  ------------------
  |  |   28|   681k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   681k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  320|   681k|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]),
  ------------------
  |  |   28|   681k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   681k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  321|   681k|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]),
  ------------------
  |  |   28|   681k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   681k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  322|   681k|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1]));
  ------------------
  |  |   28|   681k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   681k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  323|   681k|      p0 = constrain16(tap[0], row, pri_strength, pri_damping);
  324|   681k|      tap[1] =
  325|   681k|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po1]),
  ------------------
  |  |   28|   681k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   681k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  326|   681k|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]),
  ------------------
  |  |   28|   681k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   681k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  327|   681k|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]),
  ------------------
  |  |   28|   681k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   681k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  328|   681k|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1]));
  ------------------
  |  |   28|   681k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   681k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  329|   681k|      p1 = constrain16(tap[1], row, pri_strength, pri_damping);
  330|       |
  331|       |      // sum += pri_taps[0] * (p0 + p1)
  332|   681k|      sum = v256_add_16(
  333|   681k|          sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
  334|       |
  335|       |      // Primary far taps
  336|   681k|      tap[2] =
  337|   681k|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po2]),
  ------------------
  |  |   28|   681k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   681k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  338|   681k|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]),
  ------------------
  |  |   28|   681k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   681k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  339|   681k|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]),
  ------------------
  |  |   28|   681k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   681k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  340|   681k|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2]));
  ------------------
  |  |   28|   681k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   681k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  341|   681k|      p0 = constrain16(tap[2], row, pri_strength, pri_damping);
  342|   681k|      tap[3] =
  343|   681k|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po2]),
  ------------------
  |  |   28|   681k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   681k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  344|   681k|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]),
  ------------------
  |  |   28|   681k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   681k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  345|   681k|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]),
  ------------------
  |  |   28|   681k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   681k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  346|   681k|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2]));
  ------------------
  |  |   28|   681k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   681k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  347|   681k|      p1 = constrain16(tap[3], row, pri_strength, pri_damping);
  348|       |
  349|       |      // sum += pri_taps[1] * (p0 + p1)
  350|   681k|      sum = v256_add_16(
  351|   681k|          sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
  352|   681k|      if (clipping_required) {
  ------------------
  |  Branch (352:11): [True: 157k, False: 524k]
  ------------------
  353|   157k|        max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask);
  354|       |
  355|   157k|        min = v256_min_s16(min, tap[0]);
  356|   157k|        min = v256_min_s16(min, tap[1]);
  357|   157k|        min = v256_min_s16(min, tap[2]);
  358|   157k|        min = v256_min_s16(min, tap[3]);
  359|   157k|      }
  360|   681k|    }
  361|       |
  362|   747k|    if (enable_secondary) {
  ------------------
  |  Branch (362:9): [True: 224k, False: 523k]
  ------------------
  363|   224k|      v256 tap[8];
  364|       |      // Secondary near taps
  365|   224k|      tap[0] =
  366|   224k|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o1]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  367|   224k|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  368|   224k|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  369|   224k|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1]));
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  370|   224k|      p0 = constrain16(tap[0], row, sec_strength, sec_damping);
  371|   224k|      tap[1] =
  372|   224k|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o1]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  373|   224k|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  374|   224k|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  375|   224k|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1]));
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  376|   224k|      p1 = constrain16(tap[1], row, sec_strength, sec_damping);
  377|   224k|      tap[2] =
  378|   224k|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o1]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  379|   224k|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  380|   224k|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  381|   224k|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1]));
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  382|   224k|      p2 = constrain16(tap[2], row, sec_strength, sec_damping);
  383|   224k|      tap[3] =
  384|   224k|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o1]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  385|   224k|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  386|   224k|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  387|   224k|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1]));
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  388|   224k|      p3 = constrain16(tap[3], row, sec_strength, sec_damping);
  389|       |
  390|       |      // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
  391|   224k|      sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
  392|   224k|                                            v256_add_16(v256_add_16(p0, p1),
  393|   224k|                                                        v256_add_16(p2, p3))));
  394|       |
  395|       |      // Secondary far taps
  396|   224k|      tap[4] =
  397|   224k|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o2]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  398|   224k|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  399|   224k|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  400|   224k|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2]));
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  401|   224k|      p0 = constrain16(tap[4], row, sec_strength, sec_damping);
  402|   224k|      tap[5] =
  403|   224k|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o2]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  404|   224k|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  405|   224k|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  406|   224k|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2]));
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  407|   224k|      p1 = constrain16(tap[5], row, sec_strength, sec_damping);
  408|   224k|      tap[6] =
  409|   224k|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o2]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  410|   224k|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  411|   224k|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  412|   224k|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2]));
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  413|   224k|      p2 = constrain16(tap[6], row, sec_strength, sec_damping);
  414|   224k|      tap[7] =
  415|   224k|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o2]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  416|   224k|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  417|   224k|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]),
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  418|   224k|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2]));
  ------------------
  |  |   28|   224k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   224k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  419|   224k|      p3 = constrain16(tap[7], row, sec_strength, sec_damping);
  420|       |
  421|       |      // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
  422|   224k|      sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
  423|   224k|                                            v256_add_16(v256_add_16(p0, p1),
  424|   224k|                                                        v256_add_16(p2, p3))));
  425|       |
  426|   224k|      if (clipping_required) {
  ------------------
  |  Branch (426:11): [True: 157k, False: 66.8k]
  ------------------
  427|   157k|        max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask);
  428|       |
  429|   157k|        min = v256_min_s16(min, tap[0]);
  430|   157k|        min = v256_min_s16(min, tap[1]);
  431|   157k|        min = v256_min_s16(min, tap[2]);
  432|   157k|        min = v256_min_s16(min, tap[3]);
  433|   157k|        min = v256_min_s16(min, tap[4]);
  434|   157k|        min = v256_min_s16(min, tap[5]);
  435|   157k|        min = v256_min_s16(min, tap[6]);
  436|   157k|        min = v256_min_s16(min, tap[7]);
  437|   157k|      }
  438|   224k|    }
  439|       |
  440|       |    // res = row + ((sum - (sum < 0) + 8) >> 4)
  441|   747k|    sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
  442|   747k|    res = v256_add_16(sum, v256_dup_16(8));
  443|   747k|    res = v256_shr_n_s16(res, 4);
  ------------------
  |  |  695|   747k|#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
  ------------------
  444|   747k|    res = v256_add_16(row, res);
  445|   747k|    if (clipping_required) {
  ------------------
  |  Branch (445:9): [True: 157k, False: 590k]
  ------------------
  446|   157k|      res = v256_min_s16(v256_max_s16(res, min), max);
  447|   157k|    }
  448|       |
  449|   747k|    if (is_lowbd) {
  ------------------
  |  Branch (449:9): [True: 317k, False: 430k]
  ------------------
  450|   317k|      const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res));
  451|   317k|      u32_store_aligned(&dst8[(i + 0) * dstride],
  452|   317k|                        v64_high_u32(v128_high_v64(res_128)));
  453|   317k|      u32_store_aligned(&dst8[(i + 1) * dstride],
  454|   317k|                        v64_low_u32(v128_high_v64(res_128)));
  455|   317k|      u32_store_aligned(&dst8[(i + 2) * dstride],
  456|   317k|                        v64_high_u32(v128_low_v64(res_128)));
  457|   317k|      u32_store_aligned(&dst8[(i + 3) * dstride],
  458|   317k|                        v64_low_u32(v128_low_v64(res_128)));
  459|   430k|    } else {
  460|   430k|      v64_store_aligned(&dst16[(i + 0) * dstride],
  461|   430k|                        v128_high_v64(v256_high_v128(res)));
  462|   430k|      v64_store_aligned(&dst16[(i + 1) * dstride],
  463|   430k|                        v128_low_v64(v256_high_v128(res)));
  464|   430k|      v64_store_aligned(&dst16[(i + 2) * dstride],
  465|   430k|                        v128_high_v64(v256_low_v128(res)));
  466|   430k|      v64_store_aligned(&dst16[(i + 3) * dstride],
  467|   430k|                        v128_low_v64(v256_low_v128(res)));
  468|   430k|    }
  469|   747k|  }
  470|   658k|}
cdef_block_avx2.c:copy_block_8xh:
  664|   266k|                                const uint16_t *in, int height) {
  665|   266k|  uint8_t *dst8 = (uint8_t *)dest;
  666|   266k|  uint16_t *dst16 = (uint16_t *)dest;
  667|   266k|  int i;
  668|  1.32M|  for (i = 0; i < height; i += 2) {
  ------------------
  |  Branch (668:15): [True: 1.05M, False: 266k]
  ------------------
  669|  1.05M|    const v128 row0 = v128_load_aligned(&in[i * CDEF_BSTRIDE]);
  ------------------
  |  |   28|  1.05M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.05M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  670|  1.05M|    const v128 row1 = v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]);
  ------------------
  |  |   28|  1.05M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.05M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  671|  1.05M|    if (is_lowbd) {
  ------------------
  |  Branch (671:9): [True: 873k, False: 185k]
  ------------------
  672|       |      /* Note: v128_pack_s16_u8(). The parameter order is swapped internally */
  673|   873k|      const v128 res_128 = v128_pack_s16_u8(row1, row0);
  674|   873k|      v64_store_aligned(&dst8[i * dstride], v128_low_v64(res_128));
  675|   873k|      v64_store_aligned(&dst8[(i + 1) * dstride], v128_high_v64(res_128));
  676|   873k|    } else {
  677|   185k|      v128_store_unaligned(&dst16[i * dstride], row0);
  678|   185k|      v128_store_unaligned(&dst16[(i + 1) * dstride], row1);
  679|   185k|    }
  680|  1.05M|  }
  681|   266k|}

cfl_init:
   18|  48.6k|void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params) {
   19|  48.6k|  assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
   20|  48.6k|  assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
   21|       |
   22|  48.6k|  memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3));
   23|  48.6k|  memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3));
   24|  48.6k|  cfl->subsampling_x = seq_params->subsampling_x;
   25|  48.6k|  cfl->subsampling_y = seq_params->subsampling_y;
   26|  48.6k|  cfl->are_parameters_computed = 0;
   27|  48.6k|  cfl->store_y = 0;
   28|       |  // The DC_PRED cache is disabled by default and is only enabled in
   29|       |  // cfl_rd_pick_alpha
   30|  48.6k|  clear_cfl_dc_pred_cache_flags(cfl);
   31|  48.6k|}
av1_cfl_predict_block:
  189|   272k|                           TX_SIZE tx_size, int plane) {
  190|   272k|  CFL_CTX *const cfl = &xd->cfl;
  191|   272k|  MB_MODE_INFO *mbmi = xd->mi[0];
  192|   272k|  assert(is_cfl_allowed(xd));
  193|       |
  194|   272k|  if (!cfl->are_parameters_computed) cfl_compute_parameters(xd, tx_size);
  ------------------
  |  Branch (194:7): [True: 136k, False: 136k]
  ------------------
  195|       |
  196|   272k|  const int alpha_q3 =
  197|   272k|      cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
  198|   272k|  assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <=
  199|   272k|         CFL_BUF_SQUARE);
  200|   272k|#if CONFIG_AV1_HIGHBITDEPTH
  201|   272k|  if (is_cur_buf_hbd(xd)) {
  ------------------
  |  Branch (201:7): [True: 126k, False: 146k]
  ------------------
  202|   126k|    uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
  ------------------
  |  |   75|   126k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  203|   126k|    cfl_get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride,
  204|   126k|                                    alpha_q3, xd->bd);
  205|   126k|    return;
  206|   126k|  }
  207|   146k|#endif
  208|   146k|  cfl_get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3);
  209|   146k|}
cfl_store_tx:
  391|   212k|                  BLOCK_SIZE bsize) {
  392|   212k|  CFL_CTX *const cfl = &xd->cfl;
  393|   212k|  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
  ------------------
  |  |  210|   212k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  394|   212k|  uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2];
  ------------------
  |  |   39|   212k|#define MI_SIZE_LOG2 2
  ------------------
  395|       |
  396|   212k|  if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
  ------------------
  |  Branch (396:7): [True: 46.4k, False: 166k]
  |  Branch (396:38): [True: 20.1k, False: 146k]
  ------------------
  397|       |    // Only dimensions of size 4 can have an odd offset.
  398|  66.6k|    assert(!((col & 1) && tx_size_wide[tx_size] != 4));
  399|       |    assert(!((row & 1) && tx_size_high[tx_size] != 4));
  400|  66.6k|    sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
  401|  66.6k|  }
  402|   212k|  cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd));
  403|   212k|}
cfl_store_block:
  421|  6.53k|void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
  422|  6.53k|  CFL_CTX *const cfl = &xd->cfl;
  423|  6.53k|  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
  ------------------
  |  |  210|  6.53k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  424|  6.53k|  int row = 0;
  425|  6.53k|  int col = 0;
  426|       |
  427|  6.53k|  if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
  ------------------
  |  Branch (427:7): [True: 4.70k, False: 1.82k]
  |  Branch (427:38): [True: 1.82k, False: 0]
  ------------------
  428|  6.53k|    sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
  429|  6.53k|  }
  430|  6.53k|  const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size);
  ------------------
  |  |  210|  6.53k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  431|  6.53k|  const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
  ------------------
  |  |  210|  6.53k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  432|  6.53k|  tx_size = get_tx_size(width, height);
  433|  6.53k|  cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size,
  434|  6.53k|            is_cur_buf_hbd(xd));
  435|  6.53k|}
cfl.c:cfl_compute_parameters:
  178|   136k|static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {
  179|   136k|  CFL_CTX *const cfl = &xd->cfl;
  180|       |  // Do not call cfl_compute_parameters multiple time on the same values.
  181|   136k|  assert(cfl->are_parameters_computed == 0);
  182|       |
  183|   136k|  cfl_pad(cfl, tx_size_wide[tx_size], tx_size_high[tx_size]);
  184|   136k|  cfl_get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3);
  185|   136k|  cfl->are_parameters_computed = 1;
  186|   136k|}
cfl.c:cfl_pad:
   83|   136k|static inline void cfl_pad(CFL_CTX *cfl, int width, int height) {
   84|   136k|  const int diff_width = width - cfl->buf_width;
   85|   136k|  const int diff_height = height - cfl->buf_height;
   86|       |
   87|   136k|  if (diff_width > 0) {
  ------------------
  |  Branch (87:7): [True: 208, False: 136k]
  ------------------
   88|    208|    const int min_height = height - diff_height;
   89|    208|    uint16_t *recon_buf_q3 = cfl->recon_buf_q3 + (width - diff_width);
   90|  5.51k|    for (int j = 0; j < min_height; j++) {
  ------------------
  |  Branch (90:21): [True: 5.30k, False: 208]
  ------------------
   91|  5.30k|      const uint16_t last_pixel = recon_buf_q3[-1];
   92|  5.30k|      assert(recon_buf_q3 + diff_width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE);
   93|  53.6k|      for (int i = 0; i < diff_width; i++) {
  ------------------
  |  Branch (93:23): [True: 48.3k, False: 5.30k]
  ------------------
   94|  48.3k|        recon_buf_q3[i] = last_pixel;
   95|  48.3k|      }
   96|  5.30k|      recon_buf_q3 += CFL_BUF_LINE;
  ------------------
  |  |  522|  5.30k|#define CFL_BUF_LINE (32)
  ------------------
   97|  5.30k|    }
   98|    208|    cfl->buf_width = width;
   99|    208|  }
  100|   136k|  if (diff_height > 0) {
  ------------------
  |  Branch (100:7): [True: 70, False: 136k]
  ------------------
  101|     70|    uint16_t *recon_buf_q3 =
  102|     70|        cfl->recon_buf_q3 + ((height - diff_height) * CFL_BUF_LINE);
  ------------------
  |  |  522|     70|#define CFL_BUF_LINE (32)
  ------------------
  103|    726|    for (int j = 0; j < diff_height; j++) {
  ------------------
  |  Branch (103:21): [True: 656, False: 70]
  ------------------
  104|    656|      const uint16_t *last_row_q3 = recon_buf_q3 - CFL_BUF_LINE;
  ------------------
  |  |  522|    656|#define CFL_BUF_LINE (32)
  ------------------
  105|    656|      assert(recon_buf_q3 + width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE);
  106|  18.1k|      for (int i = 0; i < width; i++) {
  ------------------
  |  Branch (106:23): [True: 17.4k, False: 656]
  ------------------
  107|  17.4k|        recon_buf_q3[i] = last_row_q3[i];
  108|  17.4k|      }
  109|    656|      recon_buf_q3 += CFL_BUF_LINE;
  ------------------
  |  |  522|    656|#define CFL_BUF_LINE (32)
  ------------------
  110|    656|    }
  111|     70|    cfl->buf_height = height;
  112|     70|  }
  113|   136k|}
cfl.c:cfl_idx_to_alpha:
  138|   272k|                                   CFL_PRED_TYPE pred_type) {
  139|   272k|  const int alpha_sign = (pred_type == CFL_PRED_U) ? CFL_SIGN_U(joint_sign)
  ------------------
  |  |  281|   136k|#define CFL_SIGN_U(js) (((js + 1) * 11) >> 5)
  ------------------
  |  Branch (139:26): [True: 136k, False: 136k]
  ------------------
  140|   272k|                                                   : CFL_SIGN_V(joint_sign);
  ------------------
  |  |  283|   136k|#define CFL_SIGN_V(js) ((js + 1) - CFL_SIGNS * CFL_SIGN_U(js))
  |  |  ------------------
  |  |  |  |  281|   136k|#define CFL_SIGN_U(js) (((js + 1) * 11) >> 5)
  |  |  ------------------
  ------------------
  141|   272k|  if (alpha_sign == CFL_SIGN_ZERO) return 0;
  ------------------
  |  Branch (141:7): [True: 57.9k, False: 214k]
  ------------------
  142|   214k|  const int abs_alpha_q3 =
  143|   214k|      (pred_type == CFL_PRED_U) ? CFL_IDX_U(alpha_idx) : CFL_IDX_V(alpha_idx);
  ------------------
  |  |  260|   127k|#define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |  256|   127k|#define CFL_ALPHABET_SIZE_LOG2 4
  |  |  ------------------
  ------------------
                    (pred_type == CFL_PRED_U) ? CFL_IDX_U(alpha_idx) : CFL_IDX_V(alpha_idx);
  ------------------
  |  |  261|  87.1k|#define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1))
  |  |  ------------------
  |  |  |  |  257|  87.1k|#define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  256|  87.1k|#define CFL_ALPHABET_SIZE_LOG2 4
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (143:7): [True: 127k, False: 87.1k]
  ------------------
  144|   214k|  return (alpha_sign == CFL_SIGN_POS) ? abs_alpha_q3 + 1 : -abs_alpha_q3 - 1;
  ------------------
  |  Branch (144:10): [True: 67.6k, False: 147k]
  ------------------
  145|   272k|}
cfl.c:sub8x8_adjust_offset:
  376|  73.1k|                                        int *col_out) {
  377|       |  // Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s.
  378|  73.1k|  if ((mi_row & 0x01) && cfl->subsampling_y) {
  ------------------
  |  Branch (378:7): [True: 23.5k, False: 49.5k]
  |  Branch (378:26): [True: 988, False: 22.5k]
  ------------------
  379|    988|    assert(*row_out == 0);
  380|    988|    (*row_out)++;
  381|    988|  }
  382|       |
  383|       |  // Increment col index for right: 4x8, 4x16 or both right 4x4s.
  384|  73.1k|  if ((mi_col & 0x01) && cfl->subsampling_x) {
  ------------------
  |  Branch (384:7): [True: 16.9k, False: 56.2k]
  |  Branch (384:26): [True: 972, False: 15.9k]
  ------------------
  385|       |    assert(*col_out == 0);
  386|    972|    (*col_out)++;
  387|    972|  }
  388|  73.1k|}
cfl.c:cfl_store:
  326|   219k|                      int row, int col, TX_SIZE tx_size, int use_hbd) {
  327|   219k|  const int width = tx_size_wide[tx_size];
  328|   219k|  const int height = tx_size_high[tx_size];
  329|   219k|  const int tx_off_log2 = MI_SIZE_LOG2;
  ------------------
  |  |   39|   219k|#define MI_SIZE_LOG2 2
  ------------------
  330|   219k|  const int sub_x = cfl->subsampling_x;
  331|   219k|  const int sub_y = cfl->subsampling_y;
  332|   219k|  const int store_row = row << (tx_off_log2 - sub_y);
  333|   219k|  const int store_col = col << (tx_off_log2 - sub_x);
  334|   219k|  const int store_height = height >> sub_y;
  335|   219k|  const int store_width = width >> sub_x;
  336|       |
  337|       |  // Invalidate current parameters
  338|   219k|  cfl->are_parameters_computed = 0;
  339|       |
  340|       |  // Store the surface of the pixel buffer that was written to, this way we
  341|       |  // can manage chroma overrun (e.g. when the chroma surfaces goes beyond the
  342|       |  // frame boundary)
  343|   219k|  if (col == 0 && row == 0) {
  ------------------
  |  Branch (343:7): [True: 166k, False: 52.7k]
  |  Branch (343:19): [True: 142k, False: 23.8k]
  ------------------
  344|   142k|    cfl->buf_width = store_width;
  345|   142k|    cfl->buf_height = store_height;
  346|   142k|  } else {
  347|  76.5k|    cfl->buf_width = OD_MAXI(store_col + store_width, cfl->buf_width);
  ------------------
  |  |   45|  76.5k|#define OD_MAXI AOMMAX
  |  |  ------------------
  |  |  |  |   35|  76.5k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:23): [True: 28.0k, False: 48.5k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  348|  76.5k|    cfl->buf_height = OD_MAXI(store_row + store_height, cfl->buf_height);
  ------------------
  |  |   45|  76.5k|#define OD_MAXI AOMMAX
  |  |  ------------------
  |  |  |  |   35|  76.5k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:23): [True: 23.8k, False: 52.7k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  349|  76.5k|  }
  350|       |
  351|       |  // Check that we will remain inside the pixel buffer.
  352|   219k|  assert(store_row + store_height <= CFL_BUF_LINE);
  353|   219k|  assert(store_col + store_width <= CFL_BUF_LINE);
  354|       |
  355|       |  // Store the input into the CfL pixel buffer
  356|   219k|  uint16_t *recon_buf_q3 =
  357|   219k|      cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col);
  ------------------
  |  |  522|   219k|#define CFL_BUF_LINE (32)
  ------------------
  358|   219k|#if CONFIG_AV1_HIGHBITDEPTH
  359|   219k|  if (use_hbd) {
  ------------------
  |  Branch (359:7): [True: 101k, False: 117k]
  ------------------
  360|   101k|    cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input),
  ------------------
  |  |   75|   101k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  361|   101k|                                               input_stride, recon_buf_q3);
  362|   117k|  } else {
  363|   117k|    cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride,
  364|   117k|                                               recon_buf_q3);
  365|   117k|  }
  366|       |#else
  367|       |  (void)use_hbd;
  368|       |  cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, recon_buf_q3);
  369|       |#endif
  370|   219k|}
cfl.c:cfl_subsampling_hbd:
  303|   101k|                                                       int sub_x, int sub_y) {
  304|   101k|  if (sub_x == 1) {
  ------------------
  |  Branch (304:7): [True: 2.12k, False: 99.1k]
  ------------------
  305|  2.12k|    if (sub_y == 1) {
  ------------------
  |  Branch (305:9): [True: 1.48k, False: 642]
  ------------------
  306|  1.48k|      return cfl_get_luma_subsampling_420_hbd(tx_size);
  307|  1.48k|    }
  308|    642|    return cfl_get_luma_subsampling_422_hbd(tx_size);
  309|  2.12k|  }
  310|  99.1k|  return cfl_get_luma_subsampling_444_hbd(tx_size);
  311|   101k|}
cfl.c:cfl_subsampling_lbd:
  315|   117k|                                                       int sub_x, int sub_y) {
  316|   117k|  if (sub_x == 1) {
  ------------------
  |  Branch (316:7): [True: 8.87k, False: 109k]
  ------------------
  317|  8.87k|    if (sub_y == 1) {
  ------------------
  |  Branch (317:9): [True: 8.49k, False: 376]
  ------------------
  318|  8.49k|      return cfl_get_luma_subsampling_420_lbd(tx_size);
  319|  8.49k|    }
  320|    376|    return cfl_get_luma_subsampling_422_lbd(tx_size);
  321|  8.87k|  }
  322|   109k|  return cfl_get_luma_subsampling_444_lbd(tx_size);
  323|   117k|}
cfl.c:max_intra_block_width:
  407|  6.53k|                                        TX_SIZE tx_size) {
  408|  6.53k|  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane)
  409|  6.53k|                              << MI_SIZE_LOG2;
  ------------------
  |  |   39|  6.53k|#define MI_SIZE_LOG2 2
  ------------------
  410|  6.53k|  return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]);
  ------------------
  |  |   69|  6.53k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  411|  6.53k|}
cfl.c:max_intra_block_height:
  415|  6.53k|                                         TX_SIZE tx_size) {
  416|  6.53k|  const int max_blocks_high = max_block_high(xd, plane_bsize, plane)
  417|  6.53k|                              << MI_SIZE_LOG2;
  ------------------
  |  |   39|  6.53k|#define MI_SIZE_LOG2 2
  ------------------
  418|  6.53k|  return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]);
  ------------------
  |  |   69|  6.53k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  419|  6.53k|}

decodeframe.c:store_cfl_required:
   39|  2.45M|                                                  const MACROBLOCKD *xd) {
   40|  2.45M|  const MB_MODE_INFO *mbmi = xd->mi[0];
   41|       |
   42|  2.45M|  if (cm->seq_params->monochrome) return CFL_DISALLOWED;
  ------------------
  |  Branch (42:7): [True: 939k, False: 1.51M]
  ------------------
   43|       |
   44|  1.51M|  if (!xd->is_chroma_ref) {
  ------------------
  |  Branch (44:7): [True: 8.38k, False: 1.50M]
  ------------------
   45|       |    // For non-chroma-reference blocks, we should always store the luma pixels,
   46|       |    // in case the corresponding chroma-reference block uses CfL.
   47|       |    // Note that this can only happen for block sizes which are <8 on
   48|       |    // their shortest side, as otherwise they would be chroma reference
   49|       |    // blocks.
   50|  8.38k|    return CFL_ALLOWED;
   51|  8.38k|  }
   52|       |
   53|       |  // If this block has chroma information, we know whether we're
   54|       |  // actually going to perform a CfL prediction
   55|  1.50M|  return (CFL_ALLOWED_TYPE)(!is_inter_block(mbmi) &&
  ------------------
  |  Branch (55:29): [True: 1.46M, False: 40.0k]
  ------------------
   56|  1.46M|                            mbmi->uv_mode == UV_CFL_PRED);
  ------------------
  |  Branch (56:29): [True: 210k, False: 1.25M]
  ------------------
   57|  1.51M|}
decodemv.c:is_cfl_allowed:
   19|   977k|static inline CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) {
   20|   977k|  const MB_MODE_INFO *mbmi = xd->mi[0];
   21|   977k|  const BLOCK_SIZE bsize = mbmi->bsize;
   22|   977k|  assert(bsize < BLOCK_SIZES_ALL);
   23|   977k|  if (xd->lossless[mbmi->segment_id]) {
  ------------------
  |  Branch (23:7): [True: 27.0k, False: 950k]
  ------------------
   24|       |    // In lossless, CfL is available when the partition size is equal to the
   25|       |    // transform size.
   26|  27.0k|    const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
  ------------------
  |  |  211|  27.0k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
   27|  27.0k|    const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
  ------------------
  |  |  211|  27.0k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
   28|  27.0k|    const int plane_bsize = get_plane_block_size(bsize, ssx, ssy);
   29|  27.0k|    return (CFL_ALLOWED_TYPE)(plane_bsize == BLOCK_4X4);
   30|  27.0k|  }
   31|       |  // Spec: CfL is available to luma partitions lesser than or equal to 32x32
   32|   950k|  return (CFL_ALLOWED_TYPE)(block_size_wide[bsize] <= 32 &&
  ------------------
  |  Branch (32:29): [True: 903k, False: 47.5k]
  ------------------
   33|   903k|                            block_size_high[bsize] <= 32);
  ------------------
  |  Branch (33:29): [True: 889k, False: 13.6k]
  ------------------
   34|   977k|}
decodemv.c:store_cfl_required:
   39|  1.74M|                                                  const MACROBLOCKD *xd) {
   40|  1.74M|  const MB_MODE_INFO *mbmi = xd->mi[0];
   41|       |
   42|  1.74M|  if (cm->seq_params->monochrome) return CFL_DISALLOWED;
  ------------------
  |  Branch (42:7): [True: 690k, False: 1.05M]
  ------------------
   43|       |
   44|  1.05M|  if (!xd->is_chroma_ref) {
  ------------------
  |  Branch (44:7): [True: 14.1k, False: 1.04M]
  ------------------
   45|       |    // For non-chroma-reference blocks, we should always store the luma pixels,
   46|       |    // in case the corresponding chroma-reference block uses CfL.
   47|       |    // Note that this can only happen for block sizes which are <8 on
   48|       |    // their shortest side, as otherwise they would be chroma reference
   49|       |    // blocks.
   50|  14.1k|    return CFL_ALLOWED;
   51|  14.1k|  }
   52|       |
   53|       |  // If this block has chroma information, we know whether we're
   54|       |  // actually going to perform a CfL prediction
   55|  1.04M|  return (CFL_ALLOWED_TYPE)(!is_inter_block(mbmi) &&
  ------------------
  |  Branch (55:29): [True: 977k, False: 62.4k]
  ------------------
   56|   977k|                            mbmi->uv_mode == UV_CFL_PRED);
  ------------------
  |  Branch (56:29): [True: 172k, False: 805k]
  ------------------
   57|  1.05M|}
cfl.c:clear_cfl_dc_pred_cache_flags:
   69|  48.5k|static inline void clear_cfl_dc_pred_cache_flags(CFL_CTX *cfl) {
   70|  48.5k|  cfl->use_dc_pred_cache = false;
   71|  48.5k|  cfl->dc_pred_is_cached[CFL_PRED_U] = false;
   72|       |  cfl->dc_pred_is_cached[CFL_PRED_V] = false;
   73|  48.5k|}
reconintra.c:get_cfl_pred_type:
   64|   272k|static inline CFL_PRED_TYPE get_cfl_pred_type(int plane) {
   65|       |  assert(plane > 0);
   66|   272k|  return (CFL_PRED_TYPE)(plane - 1);
   67|   272k|}
cfl_subtract_average_4x4_sse2:
  178|  13.0k|                                                        int16_t *dst) {      \
  179|  13.0k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  13.0k|                            num_pel_log2);                                   \
  181|  13.0k|  }
cfl_subtract_average_4x8_sse2:
  178|  10.1k|                                                        int16_t *dst) {      \
  179|  10.1k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  10.1k|                            num_pel_log2);                                   \
  181|  10.1k|  }
cfl_subtract_average_4x16_sse2:
  178|  6.33k|                                                        int16_t *dst) {      \
  179|  6.33k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  6.33k|                            num_pel_log2);                                   \
  181|  6.33k|  }
cfl_subtract_average_8x4_sse2:
  178|  13.3k|                                                        int16_t *dst) {      \
  179|  13.3k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  13.3k|                            num_pel_log2);                                   \
  181|  13.3k|  }
cfl_subtract_average_8x8_sse2:
  178|  26.8k|                                                        int16_t *dst) {      \
  179|  26.8k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  26.8k|                            num_pel_log2);                                   \
  181|  26.8k|  }
cfl_subtract_average_8x16_sse2:
  178|  10.4k|                                                        int16_t *dst) {      \
  179|  10.4k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  10.4k|                            num_pel_log2);                                   \
  181|  10.4k|  }
cfl_subtract_average_8x32_sse2:
  178|  3.19k|                                                        int16_t *dst) {      \
  179|  3.19k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  3.19k|                            num_pel_log2);                                   \
  181|  3.19k|  }
cfl_subsample_lbd_420_4x4_ssse3:
  101|  2.43k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  2.43k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  2.43k|                                               output_q3, width, height); \
  104|  2.43k|  }
cfl_subsample_lbd_420_8x8_ssse3:
  101|    454|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    454|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    454|                                               output_q3, width, height); \
  104|    454|  }
cfl_subsample_lbd_420_16x16_ssse3:
  101|    153|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    153|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    153|                                               output_q3, width, height); \
  104|    153|  }
cfl_subsample_lbd_420_4x8_ssse3:
  101|  1.14k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  1.14k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  1.14k|                                               output_q3, width, height); \
  104|  1.14k|  }
cfl_subsample_lbd_420_8x4_ssse3:
  101|  1.58k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  1.58k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  1.58k|                                               output_q3, width, height); \
  104|  1.58k|  }
cfl_subsample_lbd_420_8x16_ssse3:
  101|    106|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    106|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    106|                                               output_q3, width, height); \
  104|    106|  }
cfl_subsample_lbd_420_16x8_ssse3:
  101|    160|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    160|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    160|                                               output_q3, width, height); \
  104|    160|  }
cfl_subsample_lbd_420_16x32_ssse3:
  101|     16|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     16|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     16|                                               output_q3, width, height); \
  104|     16|  }
cfl_subsample_lbd_420_4x16_ssse3:
  101|    888|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    888|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    888|                                               output_q3, width, height); \
  104|    888|  }
cfl_subsample_lbd_420_16x4_ssse3:
  101|  1.43k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  1.43k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  1.43k|                                               output_q3, width, height); \
  104|  1.43k|  }
cfl_subsample_lbd_420_8x32_ssse3:
  101|     29|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     29|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     29|                                               output_q3, width, height); \
  104|     29|  }
cfl_subsample_lbd_422_4x4_ssse3:
  101|    120|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    120|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    120|                                               output_q3, width, height); \
  104|    120|  }
cfl_subsample_lbd_422_8x8_ssse3:
  101|    114|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    114|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    114|                                               output_q3, width, height); \
  104|    114|  }
cfl_subsample_lbd_422_16x16_ssse3:
  101|     24|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     24|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     24|                                               output_q3, width, height); \
  104|     24|  }
cfl_subsample_lbd_422_8x4_ssse3:
  101|     26|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     26|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     26|                                               output_q3, width, height); \
  104|     26|  }
cfl_subsample_lbd_422_16x8_ssse3:
  101|     36|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     36|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     36|                                               output_q3, width, height); \
  104|     36|  }
cfl_subsample_lbd_422_16x4_ssse3:
  101|     38|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     38|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     38|                                               output_q3, width, height); \
  104|     38|  }
cfl_subsample_lbd_444_4x4_ssse3:
  101|  29.7k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  29.7k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  29.7k|                                               output_q3, width, height); \
  104|  29.7k|  }
cfl_subsample_lbd_444_8x8_ssse3:
  101|  34.3k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  34.3k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  34.3k|                                               output_q3, width, height); \
  104|  34.3k|  }
cfl_subsample_lbd_444_16x16_ssse3:
  101|  5.33k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  5.33k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  5.33k|                                               output_q3, width, height); \
  104|  5.33k|  }
cfl_subsample_lbd_444_4x8_ssse3:
  101|  5.99k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  5.99k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  5.99k|                                               output_q3, width, height); \
  104|  5.99k|  }
cfl_subsample_lbd_444_8x4_ssse3:
  101|  8.76k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  8.76k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  8.76k|                                               output_q3, width, height); \
  104|  8.76k|  }
cfl_subsample_lbd_444_8x16_ssse3:
  101|  4.17k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  4.17k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  4.17k|                                               output_q3, width, height); \
  104|  4.17k|  }
cfl_subsample_lbd_444_16x8_ssse3:
  101|  7.02k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  7.02k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  7.02k|                                               output_q3, width, height); \
  104|  7.02k|  }
cfl_subsample_lbd_444_16x32_ssse3:
  101|    683|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    683|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    683|                                               output_q3, width, height); \
  104|    683|  }
cfl_subsample_lbd_444_4x16_ssse3:
  101|  2.72k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  2.72k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  2.72k|                                               output_q3, width, height); \
  104|  2.72k|  }
cfl_subsample_lbd_444_16x4_ssse3:
  101|  5.03k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  5.03k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  5.03k|                                               output_q3, width, height); \
  104|  5.03k|  }
cfl_subsample_lbd_444_8x32_ssse3:
  101|  1.45k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  1.45k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  1.45k|                                               output_q3, width, height); \
  104|  1.45k|  }
cfl_subsample_hbd_420_4x4_ssse3:
  101|    462|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    462|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    462|                                               output_q3, width, height); \
  104|    462|  }
cfl_subsample_hbd_420_8x8_ssse3:
  101|     98|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     98|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     98|                                               output_q3, width, height); \
  104|     98|  }
cfl_subsample_hbd_420_16x16_ssse3:
  101|     34|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     34|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     34|                                               output_q3, width, height); \
  104|     34|  }
cfl_subsample_hbd_420_4x8_ssse3:
  101|    182|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    182|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    182|                                               output_q3, width, height); \
  104|    182|  }
cfl_subsample_hbd_420_8x4_ssse3:
  101|    260|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    260|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    260|                                               output_q3, width, height); \
  104|    260|  }
cfl_subsample_hbd_420_8x16_ssse3:
  101|     20|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     20|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     20|                                               output_q3, width, height); \
  104|     20|  }
cfl_subsample_hbd_420_16x8_ssse3:
  101|     34|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     34|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     34|                                               output_q3, width, height); \
  104|     34|  }
cfl_subsample_hbd_420_16x32_ssse3:
  101|      8|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|      8|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|      8|                                               output_q3, width, height); \
  104|      8|  }
cfl_subsample_hbd_420_4x16_ssse3:
  101|    154|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    154|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    154|                                               output_q3, width, height); \
  104|    154|  }
cfl_subsample_hbd_420_16x4_ssse3:
  101|    192|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    192|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    192|                                               output_q3, width, height); \
  104|    192|  }
cfl_subsample_hbd_420_8x32_ssse3:
  101|      8|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|      8|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|      8|                                               output_q3, width, height); \
  104|      8|  }
cfl_subsample_hbd_422_4x4_ssse3:
  101|     98|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     98|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     98|                                               output_q3, width, height); \
  104|     98|  }
cfl_subsample_hbd_422_8x8_ssse3:
  101|    160|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    160|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    160|                                               output_q3, width, height); \
  104|    160|  }
cfl_subsample_hbd_422_16x16_ssse3:
  101|     54|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     54|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     54|                                               output_q3, width, height); \
  104|     54|  }
cfl_subsample_hbd_422_8x4_ssse3:
  101|     28|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     28|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     28|                                               output_q3, width, height); \
  104|     28|  }
cfl_subsample_hbd_422_16x8_ssse3:
  101|     56|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     56|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     56|                                               output_q3, width, height); \
  104|     56|  }
cfl_subsample_hbd_422_16x4_ssse3:
  101|     52|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     52|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     52|                                               output_q3, width, height); \
  104|     52|  }
cfl_subsample_hbd_444_4x4_ssse3:
  101|  27.7k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  27.7k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  27.7k|                                               output_q3, width, height); \
  104|  27.7k|  }
cfl_subsample_hbd_444_8x8_ssse3:
  101|  32.6k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  32.6k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  32.6k|                                               output_q3, width, height); \
  104|  32.6k|  }
cfl_subsample_hbd_444_16x16_ssse3:
  101|  4.30k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  4.30k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  4.30k|                                               output_q3, width, height); \
  104|  4.30k|  }
cfl_subsample_hbd_444_4x8_ssse3:
  101|  5.20k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  5.20k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  5.20k|                                               output_q3, width, height); \
  104|  5.20k|  }
cfl_subsample_hbd_444_8x4_ssse3:
  101|  7.68k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  7.68k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  7.68k|                                               output_q3, width, height); \
  104|  7.68k|  }
cfl_subsample_hbd_444_8x16_ssse3:
  101|  3.62k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  3.62k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  3.62k|                                               output_q3, width, height); \
  104|  3.62k|  }
cfl_subsample_hbd_444_16x8_ssse3:
  101|  5.03k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  5.03k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  5.03k|                                               output_q3, width, height); \
  104|  5.03k|  }
cfl_subsample_hbd_444_16x32_ssse3:
  101|  1.04k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  1.04k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  1.04k|                                               output_q3, width, height); \
  104|  1.04k|  }
cfl_subsample_hbd_444_4x16_ssse3:
  101|  1.99k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  1.99k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  1.99k|                                               output_q3, width, height); \
  104|  1.99k|  }
cfl_subsample_hbd_444_16x4_ssse3:
  101|  5.15k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  5.15k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  5.15k|                                               output_q3, width, height); \
  104|  5.15k|  }
cfl_subsample_hbd_444_8x32_ssse3:
  101|  1.10k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  1.10k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  1.10k|                                               output_q3, width, height); \
  104|  1.10k|  }
cfl_predict_lbd_4x4_ssse3:
  232|  14.8k|      int alpha_q3) {                                                          \
  233|  14.8k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  14.8k|                           height);                                            \
  235|  14.8k|  }
cfl_predict_lbd_4x8_ssse3:
  232|  11.1k|      int alpha_q3) {                                                          \
  233|  11.1k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  11.1k|                           height);                                            \
  235|  11.1k|  }
cfl_predict_lbd_4x16_ssse3:
  232|  7.04k|      int alpha_q3) {                                                          \
  233|  7.04k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  7.04k|                           height);                                            \
  235|  7.04k|  }
cfl_predict_lbd_8x4_ssse3:
  232|  13.9k|      int alpha_q3) {                                                          \
  233|  13.9k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  13.9k|                           height);                                            \
  235|  13.9k|  }
cfl_predict_lbd_8x8_ssse3:
  232|  28.5k|      int alpha_q3) {                                                          \
  233|  28.5k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  28.5k|                           height);                                            \
  235|  28.5k|  }
cfl_predict_lbd_8x16_ssse3:
  232|  10.9k|      int alpha_q3) {                                                          \
  233|  10.9k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  10.9k|                           height);                                            \
  235|  10.9k|  }
cfl_predict_lbd_8x32_ssse3:
  232|  3.48k|      int alpha_q3) {                                                          \
  233|  3.48k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  3.48k|                           height);                                            \
  235|  3.48k|  }
cfl_predict_lbd_16x4_ssse3:
  232|  13.4k|      int alpha_q3) {                                                          \
  233|  13.4k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  13.4k|                           height);                                            \
  235|  13.4k|  }
cfl_predict_lbd_16x8_ssse3:
  232|  17.3k|      int alpha_q3) {                                                          \
  233|  17.3k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  17.3k|                           height);                                            \
  235|  17.3k|  }
cfl_predict_lbd_16x16_ssse3:
  232|  13.0k|      int alpha_q3) {                                                          \
  233|  13.0k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  13.0k|                           height);                                            \
  235|  13.0k|  }
cfl_predict_lbd_16x32_ssse3:
  232|  2.00k|      int alpha_q3) {                                                          \
  233|  2.00k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  2.00k|                           height);                                            \
  235|  2.00k|  }
cfl_predict_hbd_4x4_ssse3:
  244|  11.1k|      int bd) {                                                                \
  245|  11.1k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  11.1k|                           height);                                            \
  247|  11.1k|  }
cfl_predict_hbd_4x8_ssse3:
  244|  9.18k|      int bd) {                                                                \
  245|  9.18k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  9.18k|                           height);                                            \
  247|  9.18k|  }
cfl_predict_hbd_4x16_ssse3:
  244|  5.62k|      int bd) {                                                                \
  245|  5.62k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  5.62k|                           height);                                            \
  247|  5.62k|  }
cfl_predict_hbd_8x4_ssse3:
  244|  12.7k|      int bd) {                                                                \
  245|  12.7k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  12.7k|                           height);                                            \
  247|  12.7k|  }
cfl_predict_hbd_8x8_ssse3:
  244|  25.0k|      int bd) {                                                                \
  245|  25.0k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  25.0k|                           height);                                            \
  247|  25.0k|  }
cfl_predict_hbd_8x16_ssse3:
  244|  9.94k|      int bd) {                                                                \
  245|  9.94k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  9.94k|                           height);                                            \
  247|  9.94k|  }
cfl_predict_hbd_8x32_ssse3:
  244|  2.89k|      int bd) {                                                                \
  245|  2.89k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  2.89k|                           height);                                            \
  247|  2.89k|  }
cfl_subsample_lbd_420_32x32_avx2:
  101|     22|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     22|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     22|                                               output_q3, width, height); \
  104|     22|  }
cfl_subsample_lbd_420_32x16_avx2:
  101|     34|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     34|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     34|                                               output_q3, width, height); \
  104|     34|  }
cfl_subsample_lbd_420_32x8_avx2:
  101|     38|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     38|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     38|                                               output_q3, width, height); \
  104|     38|  }
cfl_subsample_lbd_422_32x32_avx2:
  101|      6|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|      6|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|      6|                                               output_q3, width, height); \
  104|      6|  }
cfl_subsample_lbd_422_32x16_avx2:
  101|      6|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|      6|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|      6|                                               output_q3, width, height); \
  104|      6|  }
cfl_subsample_lbd_422_32x8_avx2:
  101|      6|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|      6|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|      6|                                               output_q3, width, height); \
  104|      6|  }
cfl_subsample_lbd_444_32x32_avx2:
  101|  1.40k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  1.40k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  1.40k|                                               output_q3, width, height); \
  104|  1.40k|  }
cfl_subsample_lbd_444_32x16_avx2:
  101|    868|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    868|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    868|                                               output_q3, width, height); \
  104|    868|  }
cfl_subsample_lbd_444_32x8_avx2:
  101|  1.60k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  1.60k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  1.60k|                                               output_q3, width, height); \
  104|  1.60k|  }
cfl_subsample_hbd_420_32x32_avx2:
  101|     14|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     14|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     14|                                               output_q3, width, height); \
  104|     14|  }
cfl_subsample_hbd_420_32x16_avx2:
  101|      8|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|      8|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|      8|                                               output_q3, width, height); \
  104|      8|  }
cfl_subsample_hbd_420_32x8_avx2:
  101|     12|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     12|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     12|                                               output_q3, width, height); \
  104|     12|  }
cfl_subsample_hbd_422_32x32_avx2:
  101|     98|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     98|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     98|                                               output_q3, width, height); \
  104|     98|  }
cfl_subsample_hbd_422_32x16_avx2:
  101|     44|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     44|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     44|                                               output_q3, width, height); \
  104|     44|  }
cfl_subsample_hbd_422_32x8_avx2:
  101|     52|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     52|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     52|                                               output_q3, width, height); \
  104|     52|  }
cfl_subsample_hbd_444_32x32_avx2:
  101|  1.19k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  1.19k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  1.19k|                                               output_q3, width, height); \
  104|  1.19k|  }
cfl_subsample_hbd_444_32x16_avx2:
  101|    691|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    691|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    691|                                               output_q3, width, height); \
  104|    691|  }
cfl_subsample_hbd_444_32x8_avx2:
  101|  1.70k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  1.70k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  1.70k|                                               output_q3, width, height); \
  104|  1.70k|  }
cfl_predict_lbd_32x8_avx2:
  232|  4.10k|      int alpha_q3) {                                                          \
  233|  4.10k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  4.10k|                           height);                                            \
  235|  4.10k|  }
cfl_predict_lbd_32x16_avx2:
  232|  2.40k|      int alpha_q3) {                                                          \
  233|  2.40k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  2.40k|                           height);                                            \
  235|  2.40k|  }
cfl_predict_lbd_32x32_avx2:
  232|  3.67k|      int alpha_q3) {                                                          \
  233|  3.67k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  3.67k|                           height);                                            \
  235|  3.67k|  }
cfl_predict_hbd_16x4_avx2:
  244|  13.2k|      int bd) {                                                                \
  245|  13.2k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  13.2k|                           height);                                            \
  247|  13.2k|  }
cfl_predict_hbd_16x8_avx2:
  244|  13.7k|      int bd) {                                                                \
  245|  13.7k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  13.7k|                           height);                                            \
  247|  13.7k|  }
cfl_predict_hbd_16x16_avx2:
  244|  10.7k|      int bd) {                                                                \
  245|  10.7k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  10.7k|                           height);                                            \
  247|  10.7k|  }
cfl_predict_hbd_16x32_avx2:
  244|  3.01k|      int bd) {                                                                \
  245|  3.01k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  3.01k|                           height);                                            \
  247|  3.01k|  }
cfl_predict_hbd_32x8_avx2:
  244|  4.20k|      int bd) {                                                                \
  245|  4.20k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  4.20k|                           height);                                            \
  247|  4.20k|  }
cfl_predict_hbd_32x16_avx2:
  244|  1.93k|      int bd) {                                                                \
  245|  1.93k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  1.93k|                           height);                                            \
  247|  1.93k|  }
cfl_predict_hbd_32x32_avx2:
  244|  3.24k|      int bd) {                                                                \
  245|  3.24k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  3.24k|                           height);                                            \
  247|  3.24k|  }
cfl_subtract_average_16x4_avx2:
  178|  13.3k|                                                        int16_t *dst) {      \
  179|  13.3k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  13.3k|                            num_pel_log2);                                   \
  181|  13.3k|  }
cfl_subtract_average_16x8_avx2:
  178|  15.5k|                                                        int16_t *dst) {      \
  179|  15.5k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  15.5k|                            num_pel_log2);                                   \
  181|  15.5k|  }
cfl_subtract_average_16x16_avx2:
  178|  11.8k|                                                        int16_t *dst) {      \
  179|  11.8k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  11.8k|                            num_pel_log2);                                   \
  181|  11.8k|  }
cfl_subtract_average_16x32_avx2:
  178|  2.50k|                                                        int16_t *dst) {      \
  179|  2.50k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  2.50k|                            num_pel_log2);                                   \
  181|  2.50k|  }
cfl_subtract_average_32x8_avx2:
  178|  4.15k|                                                        int16_t *dst) {      \
  179|  4.15k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  4.15k|                            num_pel_log2);                                   \
  181|  4.15k|  }
cfl_subtract_average_32x16_avx2:
  178|  2.16k|                                                        int16_t *dst) {      \
  179|  2.16k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  2.16k|                            num_pel_log2);                                   \
  181|  2.16k|  }
cfl_subtract_average_32x32_avx2:
  178|  3.46k|                                                        int16_t *dst) {      \
  179|  3.46k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  3.46k|                            num_pel_log2);                                   \
  181|  3.46k|  }

decodeframe.c:get_unsigned_bits:
   46|  83.6k|static inline int get_unsigned_bits(unsigned int num_values) {
   47|  83.6k|  return num_values > 0 ? get_msb(num_values) + 1 : 0;
  ------------------
  |  Branch (47:10): [True: 63.7k, False: 19.8k]
  ------------------
   48|  83.6k|}
detokenize.c:get_unsigned_bits:
   46|  62.4k|static inline int get_unsigned_bits(unsigned int num_values) {
   47|  62.4k|  return num_values > 0 ? get_msb(num_values) + 1 : 0;
  ------------------
  |  Branch (47:10): [True: 62.4k, False: 0]
  ------------------
   48|  62.4k|}

av1_convolve_2d_facade:
  643|   153k|                            ConvolveParams *conv_params) {
  644|   153k|  (void)x_step_q4;
  645|   153k|  (void)y_step_q4;
  646|   153k|  (void)dst;
  647|   153k|  (void)dst_stride;
  648|       |
  649|   153k|  const InterpFilterParams *filter_params_x = interp_filters[0];
  650|   153k|  const InterpFilterParams *filter_params_y = interp_filters[1];
  651|       |
  652|       |  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
  653|       |  // 2-tap filter indicates that it is for IntraBC.
  654|   153k|  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
  ------------------
  |  Branch (654:7): [True: 7.23k, False: 146k]
  |  Branch (654:37): [True: 0, False: 146k]
  ------------------
  655|  7.23k|    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
  656|  7.23k|    assert(!scaled);
  657|  7.23k|    if (subpel_x_qn && subpel_y_qn) {
  ------------------
  |  Branch (657:9): [True: 0, False: 7.23k]
  |  Branch (657:24): [True: 0, False: 0]
  ------------------
  658|      0|      av1_convolve_2d_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
  ------------------
  |  |  221|      0|#define av1_convolve_2d_sr_intrabc av1_convolve_2d_sr_intrabc_c
  ------------------
  659|      0|                                 filter_params_x, filter_params_y, subpel_x_qn,
  660|      0|                                 subpel_y_qn, conv_params);
  661|      0|      return;
  662|  7.23k|    } else if (subpel_x_qn) {
  ------------------
  |  Branch (662:16): [True: 0, False: 7.23k]
  ------------------
  663|      0|      av1_convolve_x_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
  ------------------
  |  |  233|      0|#define av1_convolve_x_sr_intrabc av1_convolve_x_sr_intrabc_c
  ------------------
  664|      0|                                filter_params_x, subpel_x_qn, conv_params);
  665|      0|      return;
  666|  7.23k|    } else if (subpel_y_qn) {
  ------------------
  |  Branch (666:16): [True: 0, False: 7.23k]
  ------------------
  667|      0|      av1_convolve_y_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
  ------------------
  |  |  241|      0|#define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c
  ------------------
  668|      0|                                filter_params_y, subpel_y_qn);
  669|      0|      return;
  670|      0|    }
  671|  7.23k|  }
  672|       |
  673|   153k|  if (scaled) {
  ------------------
  |  Branch (673:7): [True: 24, False: 153k]
  ------------------
  674|     24|    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
  675|     24|                              filter_params_x, filter_params_y, subpel_x_qn,
  676|     24|                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
  677|   153k|  } else if (conv_params->is_compound) {
  ------------------
  |  Branch (677:14): [True: 30.2k, False: 123k]
  ------------------
  678|  30.2k|    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
  679|  30.2k|                                filter_params_x, filter_params_y, subpel_x_qn,
  680|  30.2k|                                subpel_y_qn, conv_params);
  681|   123k|  } else {
  682|   123k|    convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
  683|   123k|                              filter_params_x, filter_params_y, subpel_x_qn,
  684|   123k|                              subpel_y_qn, conv_params);
  685|   123k|  }
  686|   153k|}
av1_highbd_convolve_2d_facade:
 1252|  36.2k|                                   int bd) {
 1253|  36.2k|  (void)x_step_q4;
 1254|  36.2k|  (void)y_step_q4;
 1255|  36.2k|  (void)dst_stride;
 1256|  36.2k|  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  ------------------
  |  |   75|  36.2k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1257|       |
 1258|  36.2k|  const InterpFilterParams *filter_params_x = interp_filters[0];
 1259|  36.2k|  const InterpFilterParams *filter_params_y = interp_filters[1];
 1260|       |
 1261|  36.2k|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  ------------------
  |  |   75|  36.2k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1262|       |  // 2-tap filter indicates that it is for IntraBC.
 1263|  36.2k|  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
  ------------------
  |  Branch (1263:7): [True: 5.89k, False: 30.3k]
  |  Branch (1263:37): [True: 0, False: 30.3k]
  ------------------
 1264|  5.89k|    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
 1265|  5.89k|    assert(!scaled);
 1266|  5.89k|    if (subpel_x_qn && subpel_y_qn) {
  ------------------
  |  Branch (1266:9): [True: 0, False: 5.89k]
  |  Branch (1266:24): [True: 0, False: 0]
  ------------------
 1267|      0|      av1_highbd_convolve_2d_sr_intrabc_c(
 1268|      0|          src, src_stride, dst, dst_stride, w, h, filter_params_x,
 1269|      0|          filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
 1270|      0|      return;
 1271|  5.89k|    } else if (subpel_x_qn) {
  ------------------
  |  Branch (1271:16): [True: 0, False: 5.89k]
  ------------------
 1272|      0|      av1_highbd_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
 1273|      0|                                         filter_params_x, subpel_x_qn,
 1274|      0|                                         conv_params, bd);
 1275|      0|      return;
 1276|  5.89k|    } else if (subpel_y_qn) {
  ------------------
  |  Branch (1276:16): [True: 0, False: 5.89k]
  ------------------
 1277|      0|      av1_highbd_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
 1278|      0|                                         filter_params_y, subpel_y_qn, bd);
 1279|      0|      return;
 1280|      0|    }
 1281|  5.89k|  }
 1282|       |
 1283|  36.2k|  if (scaled) {
  ------------------
  |  Branch (1283:7): [True: 0, False: 36.2k]
  ------------------
 1284|      0|    if (conv_params->is_compound) {
  ------------------
  |  Branch (1284:9): [True: 0, False: 0]
  ------------------
 1285|      0|      assert(conv_params->dst != NULL);
 1286|      0|    }
 1287|      0|    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
 1288|      0|                                 filter_params_x, filter_params_y, subpel_x_qn,
 1289|      0|                                 x_step_q4, subpel_y_qn, y_step_q4, conv_params,
 1290|      0|                                 bd);
 1291|  36.2k|  } else if (conv_params->is_compound) {
  ------------------
  |  Branch (1291:14): [True: 5.22k, False: 31.0k]
  ------------------
 1292|  5.22k|    highbd_convolve_2d_facade_compound(
 1293|  5.22k|        src, src_stride, dst, dst_stride, w, h, filter_params_x,
 1294|  5.22k|        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
 1295|  31.0k|  } else {
 1296|  31.0k|    highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
 1297|  31.0k|                                     filter_params_x, filter_params_y,
 1298|  31.0k|                                     subpel_x_qn, subpel_y_qn, conv_params, bd);
 1299|  31.0k|  }
 1300|  36.2k|}
convolve.c:convolve_2d_scale_wrapper:
  583|     24|    ConvolveParams *conv_params) {
  584|     24|  if (conv_params->is_compound) {
  ------------------
  |  Branch (584:7): [True: 0, False: 24]
  ------------------
  585|       |    assert(conv_params->dst != NULL);
  586|      0|  }
  587|     24|  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
  588|     24|                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
  589|     24|                        y_step_qn, conv_params);
  590|     24|}
convolve.c:convolve_2d_facade_compound:
  596|  30.2k|    const int subpel_y_qn, ConvolveParams *conv_params) {
  597|  30.2k|  const bool need_x = subpel_x_qn != 0;
  598|  30.2k|  const bool need_y = subpel_y_qn != 0;
  599|  30.2k|  if (!need_x && !need_y) {
  ------------------
  |  Branch (599:7): [True: 16.6k, False: 13.5k]
  |  Branch (599:18): [True: 13.6k, False: 2.95k]
  ------------------
  600|  13.6k|    av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
  601|  13.6k|                                  conv_params);
  602|  16.5k|  } else if (need_x && !need_y) {
  ------------------
  |  Branch (602:14): [True: 13.5k, False: 2.95k]
  |  Branch (602:24): [True: 4.16k, False: 9.41k]
  ------------------
  603|  4.16k|    av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
  604|  4.16k|                            filter_params_x, subpel_x_qn, conv_params);
  605|  12.3k|  } else if (!need_x && need_y) {
  ------------------
  |  Branch (605:14): [True: 2.95k, False: 9.41k]
  |  Branch (605:25): [True: 2.95k, False: 0]
  ------------------
  606|  2.95k|    av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
  607|  2.95k|                            filter_params_y, subpel_y_qn, conv_params);
  608|  9.41k|  } else {
  609|       |    assert(need_y && need_x);
  610|  9.41k|    av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
  611|  9.41k|                             filter_params_x, filter_params_y, subpel_x_qn,
  612|  9.41k|                             subpel_y_qn, conv_params);
  613|  9.41k|  }
  614|  30.2k|}
convolve.c:convolve_2d_facade_single:
  620|   123k|    const int subpel_y_qn, ConvolveParams *conv_params) {
  621|   123k|  const bool need_x = subpel_x_qn != 0;
  622|   123k|  const bool need_y = subpel_y_qn != 0;
  623|   123k|  if (!need_x && !need_y) {
  ------------------
  |  Branch (623:7): [True: 60.0k, False: 63.2k]
  |  Branch (623:18): [True: 45.5k, False: 14.4k]
  ------------------
  624|  45.5k|    aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
  625|  77.7k|  } else if (need_x && !need_y) {
  ------------------
  |  Branch (625:14): [True: 63.2k, False: 14.4k]
  |  Branch (625:24): [True: 14.2k, False: 49.0k]
  ------------------
  626|  14.2k|    av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
  627|  14.2k|                      subpel_x_qn, conv_params);
  628|  63.4k|  } else if (!need_x && need_y) {
  ------------------
  |  Branch (628:14): [True: 14.4k, False: 49.0k]
  |  Branch (628:25): [True: 14.4k, False: 0]
  ------------------
  629|  14.4k|    av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
  630|  14.4k|                      subpel_y_qn);
  631|  49.0k|  } else {
  632|       |    assert(need_x && need_y);
  633|  49.0k|    av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
  634|  49.0k|                       filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
  635|  49.0k|  }
  636|   123k|}
convolve.c:highbd_convolve_2d_facade_compound:
 1200|  5.22k|    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
 1201|  5.22k|  const bool need_x = subpel_x_qn != 0;
 1202|  5.22k|  const bool need_y = subpel_y_qn != 0;
 1203|  5.22k|  if (!need_x && !need_y) {
  ------------------
  |  Branch (1203:7): [True: 1.70k, False: 3.51k]
  |  Branch (1203:18): [True: 1.37k, False: 332]
  ------------------
 1204|  1.37k|    av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
 1205|  1.37k|                                         conv_params, bd);
 1206|  3.85k|  } else if (need_x && !need_y) {
  ------------------
  |  Branch (1206:14): [True: 3.51k, False: 332]
  |  Branch (1206:24): [True: 1.12k, False: 2.39k]
  ------------------
 1207|  1.12k|    av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
 1208|  1.12k|                                   filter_params_x, subpel_x_qn, conv_params,
 1209|  1.12k|                                   bd);
 1210|  2.72k|  } else if (!need_x && need_y) {
  ------------------
  |  Branch (1210:14): [True: 332, False: 2.39k]
  |  Branch (1210:25): [True: 332, False: 0]
  ------------------
 1211|    332|    av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
 1212|    332|                                   filter_params_y, subpel_y_qn, conv_params,
 1213|    332|                                   bd);
 1214|  2.39k|  } else {
 1215|       |    assert(need_x && need_y);
 1216|  2.39k|    av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
 1217|  2.39k|                                    filter_params_x, filter_params_y,
 1218|  2.39k|                                    subpel_x_qn, subpel_y_qn, conv_params, bd);
 1219|  2.39k|  }
 1220|  5.22k|}
convolve.c:highbd_convolve_2d_facade_single:
 1226|  31.0k|    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
 1227|  31.0k|  const bool need_x = subpel_x_qn != 0;
 1228|  31.0k|  const bool need_y = subpel_y_qn != 0;
 1229|       |
 1230|  31.0k|  if (!need_x && !need_y) {
  ------------------
  |  Branch (1230:7): [True: 17.3k, False: 13.6k]
  |  Branch (1230:18): [True: 14.3k, False: 3.03k]
  ------------------
 1231|  14.3k|    aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h);
 1232|  16.7k|  } else if (need_x && !need_y) {
  ------------------
  |  Branch (1232:14): [True: 13.6k, False: 3.03k]
  |  Branch (1232:24): [True: 4.32k, False: 9.37k]
  ------------------
 1233|  4.32k|    av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
 1234|  4.32k|                             filter_params_x, subpel_x_qn, conv_params, bd);
 1235|  12.4k|  } else if (!need_x && need_y) {
  ------------------
  |  Branch (1235:14): [True: 3.03k, False: 9.37k]
  |  Branch (1235:25): [True: 3.03k, False: 0]
  ------------------
 1236|  3.03k|    av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
 1237|  3.03k|                             filter_params_y, subpel_y_qn, bd);
 1238|  9.37k|  } else {
 1239|       |    assert(need_x && need_y);
 1240|  9.37k|    av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
 1241|  9.37k|                              filter_params_x, filter_params_y, subpel_x_qn,
 1242|  9.37k|                              subpel_y_qn, conv_params, bd);
 1243|  9.37k|  }
 1244|  31.0k|}

decodeframe.c:get_conv_params_no_round:
   71|   193k|                                                      int is_compound, int bd) {
   72|   193k|  ConvolveParams conv_params;
   73|   193k|  assert(IMPLIES(cmp_index, is_compound));
   74|       |
   75|   193k|  conv_params.is_compound = is_compound;
   76|   193k|  conv_params.use_dist_wtd_comp_avg = 0;
   77|   193k|  conv_params.round_0 = ROUND0_BITS;
  ------------------
  |  |   39|   193k|#define ROUND0_BITS 3
  ------------------
   78|   193k|  conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS
  ------------------
  |  |   40|  35.9k|#define COMPOUND_ROUND1_BITS 7
  ------------------
  |  Branch (78:25): [True: 35.9k, False: 158k]
  ------------------
   79|   193k|                                    : 2 * FILTER_BITS - conv_params.round_0;
  ------------------
  |  |   21|   158k|#define FILTER_BITS 7
  ------------------
   80|   193k|#if CONFIG_AV1_HIGHBITDEPTH
   81|   193k|  const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
  ------------------
  |  |   21|   193k|#define FILTER_BITS 7
  ------------------
   82|   193k|  assert(IMPLIES(bd < 12, intbufrange <= 16));
   83|   193k|  if (intbufrange > 16) {
  ------------------
  |  Branch (83:7): [True: 5.16k, False: 188k]
  ------------------
   84|  5.16k|    conv_params.round_0 += intbufrange - 16;
   85|  5.16k|    if (!is_compound) conv_params.round_1 -= intbufrange - 16;
  ------------------
  |  Branch (85:9): [True: 4.47k, False: 684]
  ------------------
   86|  5.16k|  }
   87|       |#else
   88|       |  (void)bd;
   89|       |#endif  // CONFIG_AV1_HIGHBITDEPTH
   90|       |  // TODO(yunqing): The following dst should only be valid while
   91|       |  // is_compound = 1;
   92|   193k|  conv_params.dst = dst;
   93|   193k|  conv_params.dst_stride = dst_stride;
   94|   193k|  conv_params.plane = plane;
   95|       |
   96|       |  // By default, set do average to 1 if this is the second single prediction
   97|       |  // in a compound mode.
   98|   193k|  conv_params.do_average = cmp_index;
   99|   193k|  return conv_params;
  100|   193k|}
restoration.c:get_conv_params_wiener:
  107|  2.77k|static inline WienerConvolveParams get_conv_params_wiener(int bd) {
  108|  2.77k|  WienerConvolveParams conv_params;
  109|  2.77k|  conv_params.round_0 = WIENER_ROUND0_BITS;
  ------------------
  |  |   41|  2.77k|#define WIENER_ROUND0_BITS 3
  ------------------
  110|  2.77k|  conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0;
  ------------------
  |  |   21|  2.77k|#define FILTER_BITS 7
  ------------------
  111|  2.77k|  const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
  ------------------
  |  |   21|  2.77k|#define FILTER_BITS 7
  ------------------
  112|  2.77k|  assert(IMPLIES(bd < 12, intbufrange <= 16));
  113|  2.77k|  if (intbufrange > 16) {
  ------------------
  |  Branch (113:7): [True: 4, False: 2.76k]
  ------------------
  114|      4|    conv_params.round_0 += intbufrange - 16;
  115|      4|    conv_params.round_1 -= intbufrange - 16;
  116|      4|  }
  117|  2.77k|  return conv_params;
  118|  2.77k|}

av1_default_coef_probs:
   31|  22.8k|void av1_default_coef_probs(AV1_COMMON *cm) {
   32|  22.8k|  const int index = get_q_ctx(cm->quant_params.base_qindex);
   33|       |#if CONFIG_ENTROPY_STATS
   34|       |  cm->coef_cdf_category = index;
   35|       |#endif
   36|       |
   37|  22.8k|  av1_copy(cm->fc->txb_skip_cdf, av1_default_txb_skip_cdfs[index]);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
   38|  22.8k|  av1_copy(cm->fc->eob_extra_cdf, av1_default_eob_extra_cdfs[index]);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
   39|  22.8k|  av1_copy(cm->fc->dc_sign_cdf, av1_default_dc_sign_cdfs[index]);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
   40|  22.8k|  av1_copy(cm->fc->coeff_br_cdf, av1_default_coeff_lps_multi_cdfs[index]);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
   41|  22.8k|  av1_copy(cm->fc->coeff_base_cdf, av1_default_coeff_base_multi_cdfs[index]);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
   42|  22.8k|  av1_copy(cm->fc->coeff_base_eob_cdf,
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
   43|  22.8k|           av1_default_coeff_base_eob_multi_cdfs[index]);
   44|  22.8k|  av1_copy(cm->fc->eob_flag_cdf16, av1_default_eob_multi16_cdfs[index]);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
   45|  22.8k|  av1_copy(cm->fc->eob_flag_cdf32, av1_default_eob_multi32_cdfs[index]);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
   46|  22.8k|  av1_copy(cm->fc->eob_flag_cdf64, av1_default_eob_multi64_cdfs[index]);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
   47|  22.8k|  av1_copy(cm->fc->eob_flag_cdf128, av1_default_eob_multi128_cdfs[index]);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
   48|  22.8k|  av1_copy(cm->fc->eob_flag_cdf256, av1_default_eob_multi256_cdfs[index]);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
   49|  22.8k|  av1_copy(cm->fc->eob_flag_cdf512, av1_default_eob_multi512_cdfs[index]);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
   50|       |  av1_copy(cm->fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[index]);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
   51|  22.8k|}
av1_reset_cdf_symbol_counters:
   85|  11.0k|void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) {
   86|  11.0k|  RESET_CDF_COUNTER(fc->txb_skip_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   87|  11.0k|  RESET_CDF_COUNTER(fc->eob_extra_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   88|  11.0k|  RESET_CDF_COUNTER(fc->dc_sign_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   89|  11.0k|  RESET_CDF_COUNTER(fc->eob_flag_cdf16, 5);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   90|  11.0k|  RESET_CDF_COUNTER(fc->eob_flag_cdf32, 6);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   91|  11.0k|  RESET_CDF_COUNTER(fc->eob_flag_cdf64, 7);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   92|  11.0k|  RESET_CDF_COUNTER(fc->eob_flag_cdf128, 8);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   93|  11.0k|  RESET_CDF_COUNTER(fc->eob_flag_cdf256, 9);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   94|  11.0k|  RESET_CDF_COUNTER(fc->eob_flag_cdf512, 10);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   95|  11.0k|  RESET_CDF_COUNTER(fc->eob_flag_cdf1024, 11);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   96|  11.0k|  RESET_CDF_COUNTER(fc->coeff_base_eob_cdf, 3);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   97|  11.0k|  RESET_CDF_COUNTER(fc->coeff_base_cdf, 4);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   98|  11.0k|  RESET_CDF_COUNTER(fc->coeff_br_cdf, BR_CDF_SIZE);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   99|  11.0k|  RESET_CDF_COUNTER(fc->newmv_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  100|  11.0k|  RESET_CDF_COUNTER(fc->zeromv_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  101|  11.0k|  RESET_CDF_COUNTER(fc->refmv_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  102|  11.0k|  RESET_CDF_COUNTER(fc->drl_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  103|  11.0k|  RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  104|  11.0k|  RESET_CDF_COUNTER(fc->compound_type_cdf, MASKED_COMPOUND_TYPES);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  105|  11.0k|  RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  106|  11.0k|  RESET_CDF_COUNTER(fc->interintra_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  107|  11.0k|  RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  108|  11.0k|  RESET_CDF_COUNTER(fc->interintra_mode_cdf, INTERINTRA_MODES);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  109|  11.0k|  RESET_CDF_COUNTER(fc->motion_mode_cdf, MOTION_MODES);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  110|  11.0k|  RESET_CDF_COUNTER(fc->obmc_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  111|  11.0k|  RESET_CDF_COUNTER(fc->palette_y_size_cdf, PALETTE_SIZES);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  112|  11.0k|  RESET_CDF_COUNTER(fc->palette_uv_size_cdf, PALETTE_SIZES);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  113|  88.0k|  for (int j = 0; j < PALETTE_SIZES; j++) {
  ------------------
  |  Branch (113:19): [True: 77.0k, False: 11.0k]
  ------------------
  114|  77.0k|    int nsymbs = j + PALETTE_MIN_SIZE;
  ------------------
  |  |   65|  77.0k|#define PALETTE_MIN_SIZE 2
  ------------------
  115|  77.0k|    RESET_CDF_COUNTER_STRIDE(fc->palette_y_color_index_cdf[j], nsymbs,
  ------------------
  |  |   64|  77.0k|  do {                                                               \
  |  |   65|  77.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  77.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  77.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  77.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  77.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded, False: 77.0k]
  |  |  ------------------
  ------------------
  116|  77.0k|                             CDF_SIZE(PALETTE_COLORS));
  117|  77.0k|    RESET_CDF_COUNTER_STRIDE(fc->palette_uv_color_index_cdf[j], nsymbs,
  ------------------
  |  |   64|  77.0k|  do {                                                               \
  |  |   65|  77.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  77.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  77.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  77.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  77.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded, False: 77.0k]
  |  |  ------------------
  ------------------
  118|  77.0k|                             CDF_SIZE(PALETTE_COLORS));
  119|  77.0k|  }
  120|  11.0k|  RESET_CDF_COUNTER(fc->palette_y_mode_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  121|  11.0k|  RESET_CDF_COUNTER(fc->palette_uv_mode_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  122|  11.0k|  RESET_CDF_COUNTER(fc->comp_inter_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  123|  11.0k|  RESET_CDF_COUNTER(fc->single_ref_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  124|  11.0k|  RESET_CDF_COUNTER(fc->comp_ref_type_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  125|  11.0k|  RESET_CDF_COUNTER(fc->uni_comp_ref_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  126|  11.0k|  RESET_CDF_COUNTER(fc->comp_ref_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  127|  11.0k|  RESET_CDF_COUNTER(fc->comp_bwdref_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  128|  11.0k|  RESET_CDF_COUNTER(fc->txfm_partition_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  129|  11.0k|  RESET_CDF_COUNTER(fc->compound_index_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  130|  11.0k|  RESET_CDF_COUNTER(fc->comp_group_idx_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  131|  11.0k|  RESET_CDF_COUNTER(fc->skip_mode_cdfs, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  132|  11.0k|  RESET_CDF_COUNTER(fc->skip_txfm_cdfs, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  133|  11.0k|  RESET_CDF_COUNTER(fc->intra_inter_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  134|  11.0k|  reset_nmv_counter(&fc->nmvc);
  135|  11.0k|  reset_nmv_counter(&fc->ndvc);
  136|  11.0k|  RESET_CDF_COUNTER(fc->intrabc_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  137|  11.0k|  RESET_CDF_COUNTER(fc->seg.pred_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  138|  11.0k|  RESET_CDF_COUNTER(fc->seg.spatial_pred_seg_cdf, MAX_SEGMENTS);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  139|  11.0k|  RESET_CDF_COUNTER(fc->filter_intra_cdfs, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  140|  11.0k|  RESET_CDF_COUNTER(fc->filter_intra_mode_cdf, FILTER_INTRA_MODES);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  141|  11.0k|  RESET_CDF_COUNTER(fc->switchable_restore_cdf, RESTORE_SWITCHABLE_TYPES);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  142|  11.0k|  RESET_CDF_COUNTER(fc->wiener_restore_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  143|  11.0k|  RESET_CDF_COUNTER(fc->sgrproj_restore_cdf, 2);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  144|  11.0k|  RESET_CDF_COUNTER(fc->y_mode_cdf, INTRA_MODES);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  145|  11.0k|  RESET_CDF_COUNTER_STRIDE(fc->uv_mode_cdf[0], UV_INTRA_MODES - 1,
  ------------------
  |  |   64|  11.0k|  do {                                                               \
  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  11.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  ------------------
  ------------------
  146|  11.0k|                           CDF_SIZE(UV_INTRA_MODES));
  147|  11.0k|  RESET_CDF_COUNTER(fc->uv_mode_cdf[1], UV_INTRA_MODES);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  148|   231k|  for (int i = 0; i < PARTITION_CONTEXTS; i++) {
  ------------------
  |  |  171|   231k|#define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET)
  |  |  ------------------
  |  |  |  |  170|   231k|#define PARTITION_BLOCK_SIZES 5
  |  |  ------------------
  |  |               #define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET)
  |  |  ------------------
  |  |  |  |  169|   231k|#define PARTITION_PLOFFSET 4  // number of probability models per block size
  |  |  ------------------
  ------------------
  |  Branch (148:19): [True: 220k, False: 11.0k]
  ------------------
  149|   220k|    if (i < 4) {
  ------------------
  |  Branch (149:9): [True: 44.0k, False: 176k]
  ------------------
  150|  44.0k|      RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 4, CDF_SIZE(10));
  ------------------
  |  |   64|  44.0k|  do {                                                               \
  |  |   65|  44.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  44.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  44.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  44.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  44.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded, False: 44.0k]
  |  |  ------------------
  ------------------
  151|   176k|    } else if (i < 16) {
  ------------------
  |  Branch (151:16): [True: 132k, False: 44.0k]
  ------------------
  152|   132k|      RESET_CDF_COUNTER(fc->partition_cdf[i], 10);
  ------------------
  |  |   61|   132k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|   132k|  do {                                                               \
  |  |  |  |   65|   132k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|   132k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|   132k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|   132k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|   132k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 132k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  153|   132k|    } else {
  154|  44.0k|      RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 8, CDF_SIZE(10));
  ------------------
  |  |   64|  44.0k|  do {                                                               \
  |  |   65|  44.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  44.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  44.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  44.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  44.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded, False: 44.0k]
  |  |  ------------------
  ------------------
  155|  44.0k|    }
  156|   220k|  }
  157|  11.0k|  RESET_CDF_COUNTER(fc->switchable_interp_cdf, SWITCHABLE_FILTERS);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  158|  11.0k|  RESET_CDF_COUNTER(fc->kf_y_cdf, INTRA_MODES);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  159|  11.0k|  RESET_CDF_COUNTER(fc->angle_delta_cdf, 2 * MAX_ANGLE_DELTA + 1);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  160|  11.0k|  RESET_CDF_COUNTER_STRIDE(fc->tx_size_cdf[0], MAX_TX_DEPTH,
  ------------------
  |  |   64|  11.0k|  do {                                                               \
  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  11.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  ------------------
  ------------------
  161|  11.0k|                           CDF_SIZE(MAX_TX_DEPTH + 1));
  162|  11.0k|  RESET_CDF_COUNTER(fc->tx_size_cdf[1], MAX_TX_DEPTH + 1);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  163|  11.0k|  RESET_CDF_COUNTER(fc->tx_size_cdf[2], MAX_TX_DEPTH + 1);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  164|  11.0k|  RESET_CDF_COUNTER(fc->tx_size_cdf[3], MAX_TX_DEPTH + 1);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  165|  11.0k|  RESET_CDF_COUNTER(fc->delta_q_cdf, DELTA_Q_PROBS + 1);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  166|  11.0k|  RESET_CDF_COUNTER(fc->delta_lf_cdf, DELTA_LF_PROBS + 1);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  167|  55.0k|  for (int i = 0; i < FRAME_LF_COUNT; i++) {
  ------------------
  |  |   72|  55.0k|#define FRAME_LF_COUNT 4
  ------------------
  |  Branch (167:19): [True: 44.0k, False: 11.0k]
  ------------------
  168|  44.0k|    RESET_CDF_COUNTER(fc->delta_lf_multi_cdf[i], DELTA_LF_PROBS + 1);
  ------------------
  |  |   61|  44.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  44.0k|  do {                                                               \
  |  |  |  |   65|  44.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  44.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  44.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  44.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  44.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 44.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  169|  44.0k|  }
  170|  11.0k|  RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[1], 7, CDF_SIZE(TX_TYPES));
  ------------------
  |  |   64|  11.0k|  do {                                                               \
  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  11.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  ------------------
  ------------------
  171|  11.0k|  RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[2], 5, CDF_SIZE(TX_TYPES));
  ------------------
  |  |   64|  11.0k|  do {                                                               \
  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  11.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  ------------------
  ------------------
  172|  11.0k|  RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[1], 16, CDF_SIZE(TX_TYPES));
  ------------------
  |  |   64|  11.0k|  do {                                                               \
  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  11.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  ------------------
  ------------------
  173|  11.0k|  RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[2], 12, CDF_SIZE(TX_TYPES));
  ------------------
  |  |   64|  11.0k|  do {                                                               \
  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  11.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  ------------------
  ------------------
  174|  11.0k|  RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[3], 2, CDF_SIZE(TX_TYPES));
  ------------------
  |  |   64|  11.0k|  do {                                                               \
  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  11.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  ------------------
  ------------------
  175|  11.0k|  RESET_CDF_COUNTER(fc->cfl_sign_cdf, CFL_JOINT_SIGNS);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  176|  11.0k|  RESET_CDF_COUNTER(fc->cfl_alpha_cdf, CFL_ALPHABET_SIZE);
  ------------------
  |  |   61|  11.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  11.0k|  do {                                                               \
  |  |  |  |   65|  11.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  11.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  11.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  11.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  11.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 11.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  177|  11.0k|}
entropy.c:get_q_ctx:
   24|  22.8k|static int get_q_ctx(int q) {
   25|  22.8k|  if (q <= 20) return 0;
  ------------------
  |  Branch (25:7): [True: 3.48k, False: 19.3k]
  ------------------
   26|  19.3k|  if (q <= 60) return 1;
  ------------------
  |  Branch (26:7): [True: 7.45k, False: 11.8k]
  ------------------
   27|  11.8k|  if (q <= 120) return 2;
  ------------------
  |  Branch (27:7): [True: 7.40k, False: 4.47k]
  ------------------
   28|  4.47k|  return 3;
   29|  11.8k|}
entropy.c:reset_cdf_symbol_counter:
   54|  1.54M|                                            int cdf_stride, int nsymbs) {
   55|  18.6M|  for (int i = 0; i < num_cdfs; i++) {
  ------------------
  |  Branch (55:19): [True: 17.0M, False: 1.54M]
  ------------------
   56|  17.0M|    cdf_ptr[i * cdf_stride + nsymbs] = 0;
   57|  17.0M|  }
   58|  1.54M|}
entropy.c:reset_nmv_counter:
   71|  22.0k|static inline void reset_nmv_counter(nmv_context *nmv) {
   72|  22.0k|  RESET_CDF_COUNTER(nmv->joints_cdf, 4);
  ------------------
  |  |   61|  22.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  22.0k|  do {                                                               \
  |  |  |  |   65|  22.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  22.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  22.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  22.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  22.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 22.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   73|  66.0k|  for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (73:19): [True: 44.0k, False: 22.0k]
  ------------------
   74|  44.0k|    RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES);
  ------------------
  |  |   61|  44.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  44.0k|  do {                                                               \
  |  |  |  |   65|  44.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  44.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  44.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  44.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  44.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 44.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   75|  44.0k|    RESET_CDF_COUNTER(nmv->comps[i].class0_fp_cdf, MV_FP_SIZE);
  ------------------
  |  |   61|  44.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  44.0k|  do {                                                               \
  |  |  |  |   65|  44.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  44.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  44.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  44.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  44.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 44.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   76|  44.0k|    RESET_CDF_COUNTER(nmv->comps[i].fp_cdf, MV_FP_SIZE);
  ------------------
  |  |   61|  44.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  44.0k|  do {                                                               \
  |  |  |  |   65|  44.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  44.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  44.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  44.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  44.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 44.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   77|  44.0k|    RESET_CDF_COUNTER(nmv->comps[i].sign_cdf, 2);
  ------------------
  |  |   61|  44.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  44.0k|  do {                                                               \
  |  |  |  |   65|  44.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  44.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  44.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  44.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  44.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 44.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   78|  44.0k|    RESET_CDF_COUNTER(nmv->comps[i].class0_hp_cdf, 2);
  ------------------
  |  |   61|  44.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  44.0k|  do {                                                               \
  |  |  |  |   65|  44.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  44.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  44.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  44.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  44.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 44.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   79|  44.0k|    RESET_CDF_COUNTER(nmv->comps[i].hp_cdf, 2);
  ------------------
  |  |   61|  44.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  44.0k|  do {                                                               \
  |  |  |  |   65|  44.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  44.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  44.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  44.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  44.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 44.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   80|  44.0k|    RESET_CDF_COUNTER(nmv->comps[i].class0_cdf, CLASS0_SIZE);
  ------------------
  |  |   61|  44.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  44.0k|  do {                                                               \
  |  |  |  |   65|  44.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  44.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  44.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  44.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  44.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 44.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   81|  44.0k|    RESET_CDF_COUNTER(nmv->comps[i].bits_cdf, 2);
  ------------------
  |  |   61|  44.0k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  44.0k|  do {                                                               \
  |  |  |  |   65|  44.0k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  44.0k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  44.0k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  44.0k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  44.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded, False: 44.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   82|  44.0k|  }
   83|  22.0k|}

decodetxb.c:get_entropy_context:
   88|  2.17M|                                      const ENTROPY_CONTEXT *l) {
   89|  2.17M|  ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
   90|       |
   91|  2.17M|  switch (tx_size) {
   92|   743k|    case TX_4X4:
  ------------------
  |  Branch (92:5): [True: 743k, False: 1.43M]
  ------------------
   93|   743k|      above_ec = a[0] != 0;
   94|   743k|      left_ec = l[0] != 0;
   95|   743k|      break;
   96|  88.9k|    case TX_4X8:
  ------------------
  |  Branch (96:5): [True: 88.9k, False: 2.09M]
  ------------------
   97|  88.9k|      above_ec = a[0] != 0;
   98|  88.9k|      left_ec = !!*(const uint16_t *)l;
   99|  88.9k|      break;
  100|   132k|    case TX_8X4:
  ------------------
  |  Branch (100:5): [True: 132k, False: 2.04M]
  ------------------
  101|   132k|      above_ec = !!*(const uint16_t *)a;
  102|   132k|      left_ec = l[0] != 0;
  103|   132k|      break;
  104|  86.2k|    case TX_8X16:
  ------------------
  |  Branch (104:5): [True: 86.2k, False: 2.09M]
  ------------------
  105|  86.2k|      above_ec = !!*(const uint16_t *)a;
  106|  86.2k|      left_ec = !!*(const uint32_t *)l;
  107|  86.2k|      break;
  108|   131k|    case TX_16X8:
  ------------------
  |  Branch (108:5): [True: 131k, False: 2.04M]
  ------------------
  109|   131k|      above_ec = !!*(const uint32_t *)a;
  110|   131k|      left_ec = !!*(const uint16_t *)l;
  111|   131k|      break;
  112|  46.0k|    case TX_16X32:
  ------------------
  |  Branch (112:5): [True: 46.0k, False: 2.13M]
  ------------------
  113|  46.0k|      above_ec = !!*(const uint32_t *)a;
  114|  46.0k|      left_ec = !!*(const uint64_t *)l;
  115|  46.0k|      break;
  116|  56.1k|    case TX_32X16:
  ------------------
  |  Branch (116:5): [True: 56.1k, False: 2.12M]
  ------------------
  117|  56.1k|      above_ec = !!*(const uint64_t *)a;
  118|  56.1k|      left_ec = !!*(const uint32_t *)l;
  119|  56.1k|      break;
  120|   371k|    case TX_8X8:
  ------------------
  |  Branch (120:5): [True: 371k, False: 1.80M]
  ------------------
  121|   371k|      above_ec = !!*(const uint16_t *)a;
  122|   371k|      left_ec = !!*(const uint16_t *)l;
  123|   371k|      break;
  124|   158k|    case TX_16X16:
  ------------------
  |  Branch (124:5): [True: 158k, False: 2.02M]
  ------------------
  125|   158k|      above_ec = !!*(const uint32_t *)a;
  126|   158k|      left_ec = !!*(const uint32_t *)l;
  127|   158k|      break;
  128|   162k|    case TX_32X32:
  ------------------
  |  Branch (128:5): [True: 162k, False: 2.01M]
  ------------------
  129|   162k|      above_ec = !!*(const uint64_t *)a;
  130|   162k|      left_ec = !!*(const uint64_t *)l;
  131|   162k|      break;
  132|      0|    case TX_64X64:
  ------------------
  |  Branch (132:5): [True: 0, False: 2.17M]
  ------------------
  133|      0|      above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
  134|      0|      left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
  135|      0|      break;
  136|      0|    case TX_32X64:
  ------------------
  |  Branch (136:5): [True: 0, False: 2.17M]
  ------------------
  137|      0|      above_ec = !!*(const uint64_t *)a;
  138|      0|      left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
  139|      0|      break;
  140|      0|    case TX_64X32:
  ------------------
  |  Branch (140:5): [True: 0, False: 2.17M]
  ------------------
  141|      0|      above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
  142|      0|      left_ec = !!*(const uint64_t *)l;
  143|      0|      break;
  144|  48.4k|    case TX_4X16:
  ------------------
  |  Branch (144:5): [True: 48.4k, False: 2.13M]
  ------------------
  145|  48.4k|      above_ec = a[0] != 0;
  146|  48.4k|      left_ec = !!*(const uint32_t *)l;
  147|  48.4k|      break;
  148|  99.0k|    case TX_16X4:
  ------------------
  |  Branch (148:5): [True: 99.0k, False: 2.08M]
  ------------------
  149|  99.0k|      above_ec = !!*(const uint32_t *)a;
  150|  99.0k|      left_ec = l[0] != 0;
  151|  99.0k|      break;
  152|  23.0k|    case TX_8X32:
  ------------------
  |  Branch (152:5): [True: 23.0k, False: 2.15M]
  ------------------
  153|  23.0k|      above_ec = !!*(const uint16_t *)a;
  154|  23.0k|      left_ec = !!*(const uint64_t *)l;
  155|  23.0k|      break;
  156|  32.7k|    case TX_32X8:
  ------------------
  |  Branch (156:5): [True: 32.7k, False: 2.14M]
  ------------------
  157|  32.7k|      above_ec = !!*(const uint64_t *)a;
  158|  32.7k|      left_ec = !!*(const uint16_t *)l;
  159|  32.7k|      break;
  160|      0|    case TX_16X64:
  ------------------
  |  Branch (160:5): [True: 0, False: 2.17M]
  ------------------
  161|      0|      above_ec = !!*(const uint32_t *)a;
  162|      0|      left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
  163|      0|      break;
  164|      0|    case TX_64X16:
  ------------------
  |  Branch (164:5): [True: 0, False: 2.17M]
  ------------------
  165|      0|      above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
  166|      0|      left_ec = !!*(const uint32_t *)l;
  167|      0|      break;
  168|      0|    default: assert(0 && "Invalid transform size."); break;
  ------------------
  |  Branch (168:5): [True: 0, False: 2.17M]
  ------------------
  169|  2.17M|  }
  170|  2.17M|  return combine_entropy_contexts(above_ec, left_ec);
  171|  2.17M|}
decodetxb.c:combine_entropy_contexts:
   83|  2.17M|                                           ENTROPY_CONTEXT b) {
   84|  2.17M|  return (a != 0) + (b != 0);
   85|  2.17M|}
decodetxb.c:get_txsize_entropy_ctx:
  173|  4.72M|static inline TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) {
  174|  4.72M|  return (TX_SIZE)((txsize_sqr_map[txsize] + txsize_sqr_up_map[txsize] + 1) >>
  175|  4.72M|                   1);
  176|  4.72M|}

av1_get_palette_color_index_context:
  895|  10.8M|                                        uint8_t *color_order, int *color_idx) {
  896|  10.8M|  assert(palette_size <= PALETTE_MAX_SIZE);
  897|  10.8M|  assert(r > 0 || c > 0);
  898|       |
  899|       |  // Get color indices of neighbors.
  900|  10.8M|  int color_neighbors[NUM_PALETTE_NEIGHBORS];
  901|  10.8M|  color_neighbors[0] = (c - 1 >= 0) ? color_map[r * stride + c - 1] : -1;
  ------------------
  |  Branch (901:24): [True: 10.1M, False: 653k]
  ------------------
  902|  10.8M|  color_neighbors[1] =
  903|  10.8M|      (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * stride + c - 1] : -1;
  ------------------
  |  Branch (903:8): [True: 10.1M, False: 653k]
  |  Branch (903:22): [True: 9.32M, False: 819k]
  ------------------
  904|  10.8M|  color_neighbors[2] = (r - 1 >= 0) ? color_map[(r - 1) * stride + c] : -1;
  ------------------
  |  Branch (904:24): [True: 9.98M, False: 818k]
  ------------------
  905|       |
  906|       |  // The +10 below should not be needed. But we get a warning "array subscript
  907|       |  // is above array bounds [-Werror=array-bounds]" without it, possibly due to
  908|       |  // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124
  909|  10.8M|  int scores[PALETTE_MAX_SIZE + 10] = { 0 };
  910|  10.8M|  int i;
  911|  10.8M|  static const int weights[NUM_PALETTE_NEIGHBORS] = { 2, 1, 2 };
  912|  43.2M|  for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
  ------------------
  |  |   60|  43.2M|#define NUM_PALETTE_NEIGHBORS 3  // left, top-left and top.
  ------------------
  |  Branch (912:15): [True: 32.4M, False: 10.8M]
  ------------------
  913|  32.4M|    if (color_neighbors[i] >= 0) {
  ------------------
  |  Branch (913:9): [True: 29.4M, False: 2.94M]
  ------------------
  914|  29.4M|      scores[color_neighbors[i]] += weights[i];
  915|  29.4M|    }
  916|  32.4M|  }
  917|       |
  918|  10.8M|  int inverse_color_order[PALETTE_MAX_SIZE];
  919|  97.1M|  for (i = 0; i < PALETTE_MAX_SIZE; ++i) {
  ------------------
  |  |   63|  97.1M|#define PALETTE_MAX_SIZE 8
  ------------------
  |  Branch (919:15): [True: 86.3M, False: 10.8M]
  ------------------
  920|  86.3M|    color_order[i] = i;
  921|  86.3M|    inverse_color_order[i] = i;
  922|  86.3M|  }
  923|       |
  924|       |  // Get the top NUM_PALETTE_NEIGHBORS scores (sorted from large to small).
  925|  43.1M|  for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
  ------------------
  |  |   60|  43.1M|#define NUM_PALETTE_NEIGHBORS 3  // left, top-left and top.
  ------------------
  |  Branch (925:15): [True: 32.3M, False: 10.8M]
  ------------------
  926|  32.3M|    int max = scores[i];
  927|  32.3M|    int max_idx = i;
  928|   103M|    for (int j = i + 1; j < palette_size; ++j) {
  ------------------
  |  Branch (928:25): [True: 71.4M, False: 32.3M]
  ------------------
  929|  71.4M|      if (scores[j] > max) {
  ------------------
  |  Branch (929:11): [True: 11.7M, False: 59.7M]
  ------------------
  930|  11.7M|        max = scores[j];
  931|  11.7M|        max_idx = j;
  932|  11.7M|      }
  933|  71.4M|    }
  934|  32.3M|    if (max_idx != i) {
  ------------------
  |  Branch (934:9): [True: 10.4M, False: 21.9M]
  ------------------
  935|       |      // Move the score at index 'max_idx' to index 'i', and shift the scores
  936|       |      // from 'i' to 'max_idx - 1' by 1.
  937|  10.4M|      const int max_score = scores[max_idx];
  938|  10.4M|      const uint8_t max_color_order = color_order[max_idx];
  939|  34.7M|      for (int k = max_idx; k > i; --k) {
  ------------------
  |  Branch (939:29): [True: 24.2M, False: 10.4M]
  ------------------
  940|  24.2M|        scores[k] = scores[k - 1];
  941|  24.2M|        color_order[k] = color_order[k - 1];
  942|  24.2M|        inverse_color_order[color_order[k]] = k;
  943|  24.2M|      }
  944|  10.4M|      scores[i] = max_score;
  945|  10.4M|      color_order[i] = max_color_order;
  946|  10.4M|      inverse_color_order[color_order[i]] = i;
  947|  10.4M|    }
  948|  32.3M|  }
  949|       |
  950|  10.8M|  if (color_idx != NULL)
  ------------------
  |  Branch (950:7): [True: 0, False: 10.8M]
  ------------------
  951|      0|    *color_idx = inverse_color_order[color_map[r * stride + c]];
  952|       |
  953|       |  // Get hash value of context.
  954|  10.8M|  int color_index_ctx_hash = 0;
  955|  10.8M|  static const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 };
  956|  43.2M|  for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
  ------------------
  |  |   60|  43.2M|#define NUM_PALETTE_NEIGHBORS 3  // left, top-left and top.
  ------------------
  |  Branch (956:15): [True: 32.4M, False: 10.8M]
  ------------------
  957|  32.4M|    color_index_ctx_hash += scores[i] * hash_multipliers[i];
  958|  32.4M|  }
  959|  10.8M|  assert(color_index_ctx_hash > 0);
  960|  10.8M|  assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH);
  961|       |
  962|       |  // Lookup context from hash.
  963|  10.8M|  const int color_index_ctx =
  964|  10.8M|      av1_palette_color_index_context_lookup[color_index_ctx_hash];
  965|  10.8M|  assert(color_index_ctx >= 0);
  966|       |  assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
  967|  10.8M|  return color_index_ctx;
  968|  10.8M|}
av1_init_mode_probs:
  970|  22.8k|void av1_init_mode_probs(FRAME_CONTEXT *fc) {
  971|  22.8k|  av1_copy(fc->palette_y_size_cdf, default_palette_y_size_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  972|  22.8k|  av1_copy(fc->palette_uv_size_cdf, default_palette_uv_size_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  973|  22.8k|  av1_copy(fc->palette_y_color_index_cdf, default_palette_y_color_index_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  974|  22.8k|  av1_copy(fc->palette_uv_color_index_cdf, default_palette_uv_color_index_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  975|  22.8k|  av1_copy(fc->kf_y_cdf, default_kf_y_mode_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  976|  22.8k|  av1_copy(fc->angle_delta_cdf, default_angle_delta_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  977|  22.8k|  av1_copy(fc->comp_inter_cdf, default_comp_inter_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  978|  22.8k|  av1_copy(fc->comp_ref_type_cdf, default_comp_ref_type_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  979|  22.8k|  av1_copy(fc->uni_comp_ref_cdf, default_uni_comp_ref_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  980|  22.8k|  av1_copy(fc->palette_y_mode_cdf, default_palette_y_mode_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  981|  22.8k|  av1_copy(fc->palette_uv_mode_cdf, default_palette_uv_mode_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  982|  22.8k|  av1_copy(fc->comp_ref_cdf, default_comp_ref_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  983|  22.8k|  av1_copy(fc->comp_bwdref_cdf, default_comp_bwdref_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  984|  22.8k|  av1_copy(fc->single_ref_cdf, default_single_ref_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  985|  22.8k|  av1_copy(fc->txfm_partition_cdf, default_txfm_partition_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  986|  22.8k|  av1_copy(fc->compound_index_cdf, default_compound_idx_cdfs);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  987|  22.8k|  av1_copy(fc->comp_group_idx_cdf, default_comp_group_idx_cdfs);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  988|  22.8k|  av1_copy(fc->newmv_cdf, default_newmv_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  989|  22.8k|  av1_copy(fc->zeromv_cdf, default_zeromv_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  990|  22.8k|  av1_copy(fc->refmv_cdf, default_refmv_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  991|  22.8k|  av1_copy(fc->drl_cdf, default_drl_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  992|  22.8k|  av1_copy(fc->motion_mode_cdf, default_motion_mode_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  993|  22.8k|  av1_copy(fc->obmc_cdf, default_obmc_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  994|  22.8k|  av1_copy(fc->inter_compound_mode_cdf, default_inter_compound_mode_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  995|  22.8k|  av1_copy(fc->compound_type_cdf, default_compound_type_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  996|  22.8k|  av1_copy(fc->wedge_idx_cdf, default_wedge_idx_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  997|  22.8k|  av1_copy(fc->interintra_cdf, default_interintra_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  998|  22.8k|  av1_copy(fc->wedge_interintra_cdf, default_wedge_interintra_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
  999|  22.8k|  av1_copy(fc->interintra_mode_cdf, default_interintra_mode_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1000|  22.8k|  av1_copy(fc->seg.pred_cdf, default_segment_pred_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1001|  22.8k|  av1_copy(fc->filter_intra_cdfs, default_filter_intra_cdfs);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1002|  22.8k|  av1_copy(fc->filter_intra_mode_cdf, default_filter_intra_mode_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1003|  22.8k|  av1_copy(fc->switchable_restore_cdf, default_switchable_restore_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1004|  22.8k|  av1_copy(fc->wiener_restore_cdf, default_wiener_restore_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1005|  22.8k|  av1_copy(fc->sgrproj_restore_cdf, default_sgrproj_restore_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1006|  22.8k|  av1_copy(fc->y_mode_cdf, default_if_y_mode_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1007|  22.8k|  av1_copy(fc->uv_mode_cdf, default_uv_mode_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1008|  22.8k|  av1_copy(fc->switchable_interp_cdf, default_switchable_interp_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1009|  22.8k|  av1_copy(fc->partition_cdf, default_partition_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1010|  22.8k|  av1_copy(fc->intra_ext_tx_cdf, default_intra_ext_tx_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1011|  22.8k|  av1_copy(fc->inter_ext_tx_cdf, default_inter_ext_tx_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1012|  22.8k|  av1_copy(fc->skip_mode_cdfs, default_skip_mode_cdfs);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1013|  22.8k|  av1_copy(fc->skip_txfm_cdfs, default_skip_txfm_cdfs);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1014|  22.8k|  av1_copy(fc->intra_inter_cdf, default_intra_inter_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1015|  91.2k|  for (int i = 0; i < SPATIAL_PREDICTION_PROBS; i++)
  ------------------
  |  |   25|  91.2k|#define SPATIAL_PREDICTION_PROBS 3
  ------------------
  |  Branch (1015:19): [True: 68.4k, False: 22.8k]
  ------------------
 1016|  68.4k|    av1_copy(fc->seg.spatial_pred_seg_cdf[i],
  ------------------
  |  |   31|  68.4k|  do {                                   \
  |  |   32|  68.4k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  68.4k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  68.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 68.4k]
  |  |  ------------------
  ------------------
 1017|  22.8k|             default_spatial_pred_seg_tree_cdf[i]);
 1018|  22.8k|  av1_copy(fc->tx_size_cdf, default_tx_size_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1019|  22.8k|  av1_copy(fc->delta_q_cdf, default_delta_q_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1020|  22.8k|  av1_copy(fc->delta_lf_cdf, default_delta_lf_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1021|  22.8k|  av1_copy(fc->delta_lf_multi_cdf, default_delta_lf_multi_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1022|  22.8k|  av1_copy(fc->cfl_sign_cdf, default_cfl_sign_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1023|  22.8k|  av1_copy(fc->cfl_alpha_cdf, default_cfl_alpha_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1024|       |  av1_copy(fc->intrabc_cdf, default_intrabc_cdf);
  ------------------
  |  |   31|  22.8k|  do {                                   \
  |  |   32|  22.8k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  22.8k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 1025|  22.8k|}
av1_set_default_ref_deltas:
 1027|  69.7k|void av1_set_default_ref_deltas(int8_t *ref_deltas) {
 1028|  69.7k|  assert(ref_deltas != NULL);
 1029|       |
 1030|  69.7k|  ref_deltas[INTRA_FRAME] = 1;
 1031|  69.7k|  ref_deltas[LAST_FRAME] = 0;
 1032|  69.7k|  ref_deltas[LAST2_FRAME] = ref_deltas[LAST_FRAME];
 1033|  69.7k|  ref_deltas[LAST3_FRAME] = ref_deltas[LAST_FRAME];
 1034|  69.7k|  ref_deltas[BWDREF_FRAME] = ref_deltas[LAST_FRAME];
 1035|  69.7k|  ref_deltas[GOLDEN_FRAME] = -1;
 1036|  69.7k|  ref_deltas[ALTREF2_FRAME] = -1;
 1037|  69.7k|  ref_deltas[ALTREF_FRAME] = -1;
 1038|  69.7k|}
av1_set_default_mode_deltas:
 1040|  69.7k|void av1_set_default_mode_deltas(int8_t *mode_deltas) {
 1041|  69.7k|  assert(mode_deltas != NULL);
 1042|       |
 1043|  69.7k|  mode_deltas[0] = 0;
 1044|  69.7k|  mode_deltas[1] = 0;
 1045|  69.7k|}
av1_setup_frame_contexts:
 1055|  22.8k|void av1_setup_frame_contexts(AV1_COMMON *cm) {
 1056|       |  // Store the frame context into a special slot (not associated with any
 1057|       |  // reference buffer), so that we can set up cm->pre_fc correctly later
 1058|       |  // This function must ONLY be called when cm->fc has been initialized with
 1059|       |  // default probs, either by av1_setup_past_independence or after manually
 1060|       |  // initializing them
 1061|  22.8k|  *cm->default_frame_context = *cm->fc;
 1062|       |  // TODO(jack.haughton@argondesign.com): don't think this should be necessary,
 1063|       |  // but could do with fuller testing
 1064|  22.8k|  if (cm->tiles.large_scale) {
  ------------------
  |  Branch (1064:7): [True: 0, False: 22.8k]
  ------------------
 1065|      0|    for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
  ------------------
  |  Branch (1065:30): [True: 0, False: 0]
  ------------------
 1066|      0|      RefCntBuffer *const buf = get_ref_frame_buf(cm, i);
 1067|      0|      if (buf != NULL) buf->frame_context = *cm->fc;
  ------------------
  |  Branch (1067:11): [True: 0, False: 0]
  ------------------
 1068|      0|    }
 1069|      0|    for (int i = 0; i < cm->buffer_pool->num_frame_bufs; ++i)
  ------------------
  |  Branch (1069:21): [True: 0, False: 0]
  ------------------
 1070|      0|      cm->buffer_pool->frame_bufs[i].frame_context = *cm->fc;
 1071|      0|  }
 1072|  22.8k|}
av1_setup_past_independence:
 1074|  22.8k|void av1_setup_past_independence(AV1_COMMON *cm) {
 1075|       |  // Reset the segment feature data to the default stats:
 1076|       |  // Features disabled, 0, with delta coding (Default state).
 1077|  22.8k|  av1_clearall_segfeatures(&cm->seg);
 1078|       |
 1079|  22.8k|  if (cm->cur_frame->seg_map) {
  ------------------
  |  Branch (1079:7): [True: 22.8k, False: 0]
  ------------------
 1080|  22.8k|    memset(cm->cur_frame->seg_map, 0,
 1081|  22.8k|           (cm->cur_frame->mi_rows * cm->cur_frame->mi_cols));
 1082|  22.8k|  }
 1083|       |
 1084|       |  // reset mode ref deltas
 1085|  22.8k|  av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
 1086|  22.8k|  av1_set_default_mode_deltas(cm->cur_frame->mode_deltas);
 1087|  22.8k|  set_default_lf_deltas(&cm->lf);
 1088|       |
 1089|  22.8k|  av1_default_coef_probs(cm);
 1090|  22.8k|  av1_init_mode_probs(cm->fc);
 1091|  22.8k|  av1_init_mv_probs(cm);
 1092|  22.8k|  cm->fc->initialized = 1;
 1093|  22.8k|  av1_setup_frame_contexts(cm);
 1094|  22.8k|}
entropymode.c:set_default_lf_deltas:
 1047|  22.8k|static void set_default_lf_deltas(struct loopfilter *lf) {
 1048|  22.8k|  lf->mode_ref_delta_enabled = 1;
 1049|  22.8k|  lf->mode_ref_delta_update = 1;
 1050|       |
 1051|  22.8k|  av1_set_default_ref_deltas(lf->ref_deltas);
 1052|  22.8k|  av1_set_default_mode_deltas(lf->mode_deltas);
 1053|  22.8k|}

av1_init_mv_probs:
   63|  22.8k|void av1_init_mv_probs(AV1_COMMON *cm) {
   64|       |  // NB: this sets CDFs too
   65|  22.8k|  cm->fc->nmvc = default_nmv_context;
   66|  22.8k|  cm->fc->ndvc = default_nmv_context;
   67|  22.8k|}

decodemv.c:mv_joint_vertical:
   40|  41.2k|static inline int mv_joint_vertical(MV_JOINT_TYPE type) {
   41|  41.2k|  return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ;
  ------------------
  |  Branch (41:10): [True: 9.98k, False: 31.2k]
  |  Branch (41:36): [True: 16.1k, False: 15.0k]
  ------------------
   42|  41.2k|}
decodemv.c:mv_joint_horizontal:
   44|  41.1k|static inline int mv_joint_horizontal(MV_JOINT_TYPE type) {
   45|  41.1k|  return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ;
  ------------------
  |  Branch (45:10): [True: 9.37k, False: 31.8k]
  |  Branch (45:36): [True: 16.1k, False: 15.6k]
  ------------------
   46|  41.1k|}

decodeframe.c:av1_get_interp_filter_params_with_block_size:
  249|   361k|                                             const int w) {
  250|   361k|  if (w <= 4 && interp_filter != MULTITAP_SHARP2)
  ------------------
  |  Branch (250:7): [True: 143k, False: 217k]
  |  Branch (250:17): [True: 143k, False: 0]
  ------------------
  251|   143k|    return &av1_interp_4tap[interp_filter];
  252|   217k|  return &av1_interp_filter_params_list[interp_filter];
  253|   361k|}
decodemv.c:av1_broadcast_interp_filter:
   86|  71.5k|    InterpFilter filter) {
   87|  71.5k|  int_interpfilters filters;
   88|  71.5k|  filters.as_filters.x_filter = filter;
   89|  71.5k|  filters.as_filters.y_filter = filter;
   90|  71.5k|  return filters;
   91|  71.5k|}
decodemv.c:av1_unswitchable_filter:
   93|  6.14k|static inline InterpFilter av1_unswitchable_filter(InterpFilter filter) {
   94|  6.14k|  return filter == SWITCHABLE ? EIGHTTAP_REGULAR : filter;
  ------------------
  |  Branch (94:10): [True: 826, False: 5.31k]
  ------------------
   95|  6.14k|}
highbd_convolve_avx2.c:av1_get_interp_filter_subpel_kernel:
  266|  7.35k|    const InterpFilterParams *const filter_params, const int subpel) {
  267|  7.35k|  return filter_params->filter_ptr + filter_params->taps * subpel;
  268|  7.35k|}
pred_common.c:av1_extract_interp_filter:
   80|  34.4k|                                                     int dir) {
   81|  34.4k|  return (InterpFilter)((dir) ? filters.as_filters.x_filter
  ------------------
  |  Branch (81:25): [True: 16.4k, False: 17.9k]
  ------------------
   82|  34.4k|                              : filters.as_filters.y_filter);
   83|  34.4k|}
av1_convolve_scale_sse4.c:av1_get_interp_filter_subpel_kernel:
  266|    688|    const InterpFilterParams *const filter_params, const int subpel) {
  267|    688|  return filter_params->filter_ptr + filter_params->taps * subpel;
  268|    688|}
convolve_2d_avx2.c:get_filter_tap:
  298|   196k|                                 int subpel_qn) {
  299|   196k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  300|   196k|      filter_params, subpel_qn & SUBPEL_MASK);
  ------------------
  |  |   24|   196k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|   196k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  301|   196k|  if (filter_params->taps == 12) {
  ------------------
  |  Branch (301:7): [True: 0, False: 196k]
  ------------------
  302|      0|    return 12;
  303|      0|  }
  304|   196k|  if (filter[0] | filter[7]) {
  ------------------
  |  Branch (304:7): [True: 6.37k, False: 189k]
  ------------------
  305|  6.37k|    return 8;
  306|  6.37k|  }
  307|   189k|  if (filter[1] | filter[6]) {
  ------------------
  |  Branch (307:7): [True: 76.1k, False: 113k]
  ------------------
  308|  76.1k|    return 6;
  309|  76.1k|  }
  310|   113k|#if CONFIG_SVT_AV1
  311|   113k|  if (filter[2] | filter[5]) {
  ------------------
  |  Branch (311:7): [True: 101k, False: 12.1k]
  ------------------
  312|   101k|    return 4;
  313|   101k|  }
  314|  12.1k|  return 2;
  315|       |#else
  316|       |  return 4;
  317|       |#endif
  318|   113k|}
convolve_2d_avx2.c:av1_get_interp_filter_subpel_kernel:
  266|   293k|    const InterpFilterParams *const filter_params, const int subpel) {
  267|   293k|  return filter_params->filter_ptr + filter_params->taps * subpel;
  268|   293k|}
convolve_avx2.c:get_filter_tap:
  298|  57.3k|                                 int subpel_qn) {
  299|  57.3k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  300|  57.3k|      filter_params, subpel_qn & SUBPEL_MASK);
  ------------------
  |  |   24|  57.3k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  57.3k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  301|  57.3k|  if (filter_params->taps == 12) {
  ------------------
  |  Branch (301:7): [True: 0, False: 57.3k]
  ------------------
  302|      0|    return 12;
  303|      0|  }
  304|  57.3k|  if (filter[0] | filter[7]) {
  ------------------
  |  Branch (304:7): [True: 2.63k, False: 54.7k]
  ------------------
  305|  2.63k|    return 8;
  306|  2.63k|  }
  307|  54.7k|  if (filter[1] | filter[6]) {
  ------------------
  |  Branch (307:7): [True: 22.7k, False: 31.9k]
  ------------------
  308|  22.7k|    return 6;
  309|  22.7k|  }
  310|  31.9k|#if CONFIG_SVT_AV1
  311|  31.9k|  if (filter[2] | filter[5]) {
  ------------------
  |  Branch (311:7): [True: 26.3k, False: 5.65k]
  ------------------
  312|  26.3k|    return 4;
  313|  26.3k|  }
  314|  5.65k|  return 2;
  315|       |#else
  316|       |  return 4;
  317|       |#endif
  318|  31.9k|}
convolve_avx2.c:av1_get_interp_filter_subpel_kernel:
  266|  86.1k|    const InterpFilterParams *const filter_params, const int subpel) {
  267|  86.1k|  return filter_params->filter_ptr + filter_params->taps * subpel;
  268|  86.1k|}
jnt_convolve_avx2.c:av1_get_interp_filter_subpel_kernel:
  266|  25.9k|    const InterpFilterParams *const filter_params, const int subpel) {
  267|  25.9k|  return filter_params->filter_ptr + filter_params->taps * subpel;
  268|  25.9k|}
highbd_convolve_2d_avx2.c:av1_get_interp_filter_subpel_kernel:
  266|  18.7k|    const InterpFilterParams *const filter_params, const int subpel) {
  267|  18.7k|  return filter_params->filter_ptr + filter_params->taps * subpel;
  268|  18.7k|}
highbd_jnt_convolve_avx2.c:av1_get_interp_filter_subpel_kernel:
  266|  6.24k|    const InterpFilterParams *const filter_params, const int subpel) {
  267|  6.24k|  return filter_params->filter_ptr + filter_params->taps * subpel;
  268|  6.24k|}

av1_alloc_internal_frame_buffers:
   17|  17.9k|int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list) {
   18|  17.9k|  assert(list != NULL);
   19|  17.9k|  av1_free_internal_frame_buffers(list);
   20|       |
   21|  17.9k|  list->num_internal_frame_buffers =
   22|  17.9k|      AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
  ------------------
  |  |   34|  17.9k|#define AOM_MAXIMUM_REF_BUFFERS 8
  ------------------
                    AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
  ------------------
  |  |   30|  17.9k|#define AOM_MAXIMUM_WORK_BUFFERS 8
  ------------------
   23|  17.9k|  list->int_fb = (InternalFrameBuffer *)aom_calloc(
   24|  17.9k|      list->num_internal_frame_buffers, sizeof(*list->int_fb));
   25|  17.9k|  if (list->int_fb == NULL) {
  ------------------
  |  Branch (25:7): [True: 0, False: 17.9k]
  ------------------
   26|      0|    list->num_internal_frame_buffers = 0;
   27|      0|    return 1;
   28|      0|  }
   29|  17.9k|  return 0;
   30|  17.9k|}
av1_free_internal_frame_buffers:
   32|  35.8k|void av1_free_internal_frame_buffers(InternalFrameBufferList *list) {
   33|  35.8k|  int i;
   34|       |
   35|  35.8k|  assert(list != NULL);
   36|       |
   37|   322k|  for (i = 0; i < list->num_internal_frame_buffers; ++i) {
  ------------------
  |  Branch (37:15): [True: 287k, False: 35.8k]
  ------------------
   38|   287k|    aom_free(list->int_fb[i].data);
   39|   287k|    list->int_fb[i].data = NULL;
   40|   287k|  }
   41|  35.8k|  aom_free(list->int_fb);
   42|       |  list->int_fb = NULL;
   43|  35.8k|  list->num_internal_frame_buffers = 0;
   44|  35.8k|}
av1_zero_unused_internal_frame_buffers:
   46|    255|void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) {
   47|    255|  int i;
   48|       |
   49|    255|  assert(list != NULL);
   50|       |
   51|  4.33k|  for (i = 0; i < list->num_internal_frame_buffers; ++i) {
  ------------------
  |  Branch (51:15): [True: 4.08k, False: 255]
  ------------------
   52|  4.08k|    if (list->int_fb[i].data && !list->int_fb[i].in_use)
  ------------------
  |  Branch (52:9): [True: 511, False: 3.56k]
  |  Branch (52:33): [True: 425, False: 86]
  ------------------
   53|    425|      memset(list->int_fb[i].data, 0, list->int_fb[i].size);
   54|  4.08k|  }
   55|    255|}
av1_get_frame_buffer:
   58|  27.7k|                         aom_codec_frame_buffer_t *fb) {
   59|  27.7k|  int i;
   60|  27.7k|  InternalFrameBufferList *const int_fb_list =
   61|  27.7k|      (InternalFrameBufferList *)cb_priv;
   62|  27.7k|  if (int_fb_list == NULL) return -1;
  ------------------
  |  Branch (62:7): [True: 0, False: 27.7k]
  ------------------
   63|       |
   64|       |  // Find a free frame buffer.
   65|  46.5k|  for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) {
  ------------------
  |  Branch (65:15): [True: 46.5k, False: 0]
  ------------------
   66|  46.5k|    if (!int_fb_list->int_fb[i].in_use) break;
  ------------------
  |  Branch (66:9): [True: 27.7k, False: 18.7k]
  ------------------
   67|  46.5k|  }
   68|       |
   69|  27.7k|  if (i == int_fb_list->num_internal_frame_buffers) return -1;
  ------------------
  |  Branch (69:7): [True: 0, False: 27.7k]
  ------------------
   70|       |
   71|  27.7k|  if (int_fb_list->int_fb[i].size < min_size) {
  ------------------
  |  Branch (71:7): [True: 26.9k, False: 814]
  ------------------
   72|  26.9k|    aom_free(int_fb_list->int_fb[i].data);
   73|       |    // The data must be zeroed to fix a valgrind error from the C loop filter
   74|       |    // due to access uninitialized memory in frame border. It could be
   75|       |    // skipped if border were totally removed.
   76|  26.9k|    int_fb_list->int_fb[i].data = (uint8_t *)aom_calloc(1, min_size);
   77|  26.9k|    if (!int_fb_list->int_fb[i].data) {
  ------------------
  |  Branch (77:9): [True: 0, False: 26.9k]
  ------------------
   78|      0|      int_fb_list->int_fb[i].size = 0;
   79|      0|      return -1;
   80|      0|    }
   81|  26.9k|    int_fb_list->int_fb[i].size = min_size;
   82|  26.9k|  }
   83|       |
   84|  27.7k|  fb->data = int_fb_list->int_fb[i].data;
   85|  27.7k|  fb->size = int_fb_list->int_fb[i].size;
   86|  27.7k|  int_fb_list->int_fb[i].in_use = 1;
   87|       |
   88|       |  // Set the frame buffer's private data to point at the internal frame buffer.
   89|  27.7k|  fb->priv = &int_fb_list->int_fb[i];
   90|  27.7k|  return 0;
   91|  27.7k|}
av1_release_frame_buffer:
   93|  27.7k|int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) {
   94|  27.7k|  InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv;
   95|  27.7k|  (void)cb_priv;
   96|  27.7k|  if (int_fb) int_fb->in_use = 0;
  ------------------
  |  Branch (96:7): [True: 27.7k, False: 0]
  ------------------
   97|  27.7k|  return 0;
   98|  27.7k|}

av1_get_tx_scale:
   24|  4.71M|int av1_get_tx_scale(const TX_SIZE tx_size) {
   25|  4.71M|  const int pels = tx_size_2d[tx_size];
   26|       |  // Largest possible pels is 4096 (64x64).
   27|  4.71M|  return (pels > 256) + (pels > 1024);
   28|  4.71M|}
av1_highbd_iwht4x4_add:
   35|   436k|                            int eob, int bd) {
   36|   436k|  if (eob > 1)
  ------------------
  |  Branch (36:7): [True: 360k, False: 76.0k]
  ------------------
   37|   360k|    av1_highbd_iwht4x4_16_add(input, dest, stride, bd);
   38|  76.0k|  else
   39|  76.0k|    av1_highbd_iwht4x4_1_add(input, dest, stride, bd);
  ------------------
  |  |  501|  76.0k|#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
  ------------------
   40|   436k|}
av1_inv_txfm_add_c:
  297|   295k|                        const TxfmParam *txfm_param) {
  298|   295k|  const TX_SIZE tx_size = txfm_param->tx_size;
  299|   295k|  DECLARE_ALIGNED(32, uint16_t, tmp[MAX_TX_SQUARE]);
  ------------------
  |  |   19|   295k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  300|   295k|  int tmp_stride = MAX_TX_SIZE;
  ------------------
  |  |  183|   295k|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |  182|   295k|#define MAX_TX_SIZE_LOG2 (6)
  |  |  ------------------
  ------------------
  301|   295k|  int w = tx_size_wide[tx_size];
  302|   295k|  int h = tx_size_high[tx_size];
  303|  1.47M|  for (int r = 0; r < h; ++r) {
  ------------------
  |  Branch (303:19): [True: 1.18M, False: 295k]
  ------------------
  304|  5.91M|    for (int c = 0; c < w; ++c) {
  ------------------
  |  Branch (304:21): [True: 4.73M, False: 1.18M]
  ------------------
  305|  4.73M|      tmp[r * tmp_stride + c] = dst[r * stride + c];
  306|  4.73M|    }
  307|  1.18M|  }
  308|       |
  309|   295k|  av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
  ------------------
  |  |   76|   295k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  310|   295k|                          txfm_param);
  311|       |
  312|  1.47M|  for (int r = 0; r < h; ++r) {
  ------------------
  |  Branch (312:19): [True: 1.18M, False: 295k]
  ------------------
  313|  5.91M|    for (int c = 0; c < w; ++c) {
  ------------------
  |  Branch (313:21): [True: 4.73M, False: 1.18M]
  ------------------
  314|  4.73M|      dst[r * stride + c] = (uint8_t)tmp[r * tmp_stride + c];
  315|  4.73M|    }
  316|  1.18M|  }
  317|   295k|}
av1_inverse_transform_block:
  322|  2.09M|                                 int stride, int eob, int reduced_tx_set) {
  323|  2.09M|  if (!eob) return;
  ------------------
  |  Branch (323:7): [True: 72.0k, False: 2.01M]
  ------------------
  324|       |
  325|  2.09M|  assert(eob <= av1_get_max_eob(tx_size));
  326|       |
  327|  2.01M|  TxfmParam txfm_param;
  328|  2.01M|  init_txfm_param(xd, plane, tx_size, tx_type, eob, reduced_tx_set,
  329|  2.01M|                  &txfm_param);
  330|  2.01M|  assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]);
  331|       |
  332|  2.01M|  if (txfm_param.is_hbd) {
  ------------------
  |  Branch (332:7): [True: 969k, False: 1.04M]
  ------------------
  333|   969k|    av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
  334|  1.04M|  } else {
  335|  1.04M|    av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
  336|  1.04M|  }
  337|  2.01M|}
idct.c:init_txfm_param:
  215|  2.01M|                            TxfmParam *txfm_param) {
  216|  2.01M|  (void)plane;
  217|  2.01M|  txfm_param->tx_type = tx_type;
  218|  2.01M|  txfm_param->tx_size = tx_size;
  219|  2.01M|  txfm_param->eob = eob;
  220|  2.01M|  txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id];
  221|  2.01M|  txfm_param->bd = xd->bd;
  222|  2.01M|  txfm_param->is_hbd = is_cur_buf_hbd(xd);
  223|  2.01M|  txfm_param->tx_set_type = av1_get_ext_tx_set_type(
  224|  2.01M|      txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
  225|  2.01M|}

highbd_inv_txfm_sse4.c:cast_to_int32:
   42|   583k|static inline const int32_t *cast_to_int32(const tran_low_t *input) {
   43|       |  assert(sizeof(int32_t) == sizeof(tran_low_t));
   44|   583k|  return (const int32_t *)input;
   45|   583k|}

decodeframe.c:clamp_mv:
  323|   193k|static inline void clamp_mv(MV *mv, const SubpelMvLimits *mv_limits) {
  324|   193k|  mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max);
  325|   193k|  mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max);
  326|   193k|}
decodemv.c:convert_fullmv_to_mv:
   91|  5.17k|static inline void convert_fullmv_to_mv(int_mv *mv) {
   92|  5.17k|  mv->as_mv = get_mv_from_fullmv(&mv->as_fullmv);
   93|  5.17k|}
decodemv.c:get_mv_from_fullmv:
   85|  5.17k|static inline MV get_mv_from_fullmv(const FULLPEL_MV *full_mv) {
   86|  5.17k|  const MV subpel_mv = { (int16_t)GET_MV_SUBPEL(full_mv->row),
  ------------------
  |  |   29|  5.17k|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
   87|  5.17k|                         (int16_t)GET_MV_SUBPEL(full_mv->col) };
  ------------------
  |  |   29|  5.17k|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
   88|  5.17k|  return subpel_mv;
   89|  5.17k|}
decodemv.c:integer_mv_precision:
  199|  2.82k|static inline void integer_mv_precision(MV *mv) {
  200|  2.82k|  int mod = (mv->row % 8);
  201|  2.82k|  if (mod != 0) {
  ------------------
  |  Branch (201:7): [True: 12, False: 2.81k]
  ------------------
  202|     12|    mv->row -= mod;
  203|     12|    if (abs(mod) > 4) {
  ------------------
  |  Branch (203:9): [True: 4, False: 8]
  ------------------
  204|      4|      if (mod > 0) {
  ------------------
  |  Branch (204:11): [True: 4, False: 0]
  ------------------
  205|      4|        mv->row += 8;
  206|      4|      } else {
  207|      0|        mv->row -= 8;
  208|      0|      }
  209|      4|    }
  210|     12|  }
  211|       |
  212|  2.82k|  mod = (mv->col % 8);
  213|  2.82k|  if (mod != 0) {
  ------------------
  |  Branch (213:7): [True: 6, False: 2.81k]
  ------------------
  214|      6|    mv->col -= mod;
  215|      6|    if (abs(mod) > 4) {
  ------------------
  |  Branch (215:9): [True: 6, False: 0]
  ------------------
  216|      6|      if (mod > 0) {
  ------------------
  |  Branch (216:11): [True: 6, False: 0]
  ------------------
  217|      6|        mv->col += 8;
  218|      6|      } else {
  219|      0|        mv->col -= 8;
  220|      0|      }
  221|      6|    }
  222|      6|  }
  223|  2.82k|}
decodemv.c:gm_get_motion_vector:
  234|  5.14k|                                          int is_integer) {
  235|  5.14k|  int_mv res;
  236|       |
  237|  5.14k|  if (gm->wmtype == IDENTITY) {
  ------------------
  |  Branch (237:7): [True: 4.38k, False: 759]
  ------------------
  238|  4.38k|    res.as_int = 0;
  239|  4.38k|    return res;
  240|  4.38k|  }
  241|       |
  242|    759|  const int32_t *mat = gm->wmmat;
  243|    759|  int x, y, tx, ty;
  244|       |
  245|    759|  if (gm->wmtype == TRANSLATION) {
  ------------------
  |  Branch (245:7): [True: 210, False: 549]
  ------------------
  246|       |    // All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16)
  247|       |    // bits of fractional precision. The offset for a translation is stored in
  248|       |    // entries 0 and 1. For translations, all but the top three (two if
  249|       |    // cm->features.allow_high_precision_mv is false) fractional bits are always
  250|       |    // zero.
  251|       |    //
  252|       |    // After the right shifts, there are 3 fractional bits of precision. If
  253|       |    // allow_hp is false, the bottom bit is always zero (so we don't need a
  254|       |    // call to convert_to_trans_prec here)
  255|       |    //
  256|       |    // Note: There is an AV1 specification bug here:
  257|       |    //
  258|       |    // gm->wmmat[0] is supposed to be the horizontal translation, and so should
  259|       |    // go into res.as_mv.col, and gm->wmmat[1] is supposed to be the vertical
  260|       |    // translation and so should go into res.as_mv.row
  261|       |    //
  262|       |    // However, in the spec, these assignments are accidentally reversed, and so
  263|       |    // we must keep this incorrect logic to match the spec.
  264|       |    //
  265|       |    // See also: https://crbug.com/aomedia/3328
  266|    210|    res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF;
  ------------------
  |  |  168|    210|#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
  |  |  ------------------
  |  |  |  |   96|    210|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  267|    210|    res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF;
  ------------------
  |  |  168|    210|#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
  |  |  ------------------
  |  |  |  |   96|    210|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  268|    210|    assert(IMPLIES(1 & (res.as_mv.row | res.as_mv.col), allow_hp));
  269|    210|    if (is_integer) {
  ------------------
  |  Branch (269:9): [True: 6, False: 204]
  ------------------
  270|      6|      integer_mv_precision(&res.as_mv);
  271|      6|    }
  272|    210|    return res;
  273|    210|  }
  274|       |
  275|    549|  x = block_center_x(mi_col, bsize);
  276|    549|  y = block_center_y(mi_row, bsize);
  277|       |
  278|    549|  if (gm->wmtype == ROTZOOM) {
  ------------------
  |  Branch (278:7): [True: 488, False: 61]
  ------------------
  279|    488|    assert(gm->wmmat[5] == gm->wmmat[2]);
  280|    488|    assert(gm->wmmat[4] == -gm->wmmat[3]);
  281|    488|  }
  282|       |
  283|    549|  const int xc =
  284|    549|      (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0];
  ------------------
  |  |   96|    549|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  285|    549|  const int yc =
  286|    549|      mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1];
  ------------------
  |  |   96|    549|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  287|    549|  tx = convert_to_trans_prec(allow_hp, xc);
  288|    549|  ty = convert_to_trans_prec(allow_hp, yc);
  289|       |
  290|    549|  res.as_mv.row = ty;
  291|    549|  res.as_mv.col = tx;
  292|       |
  293|    549|  if (is_integer) {
  ------------------
  |  Branch (293:7): [True: 8, False: 541]
  ------------------
  294|      8|    integer_mv_precision(&res.as_mv);
  295|      8|  }
  296|    549|  return res;
  297|    759|}
decodemv.c:block_center_x:
  183|    549|static inline int block_center_x(int mi_col, BLOCK_SIZE bs) {
  184|    549|  const int bw = block_size_wide[bs];
  185|    549|  return mi_col * MI_SIZE + bw / 2 - 1;
  ------------------
  |  |   40|    549|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|    549|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  186|    549|}
decodemv.c:block_center_y:
  188|    549|static inline int block_center_y(int mi_row, BLOCK_SIZE bs) {
  189|    549|  const int bh = block_size_high[bs];
  190|    549|  return mi_row * MI_SIZE + bh / 2 - 1;
  ------------------
  |  |   40|    549|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|    549|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  191|    549|}
decodemv.c:convert_to_trans_prec:
  193|  1.09k|static inline int convert_to_trans_prec(int allow_hp, int coor) {
  194|  1.09k|  if (allow_hp)
  ------------------
  |  Branch (194:7): [True: 764, False: 334]
  ------------------
  195|    764|    return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 3);
  ------------------
  |  |   45|    764|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|    348|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 348, False: 416]
  |  |  ------------------
  |  |   46|    764|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|    416|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  196|    334|  else
  197|    334|    return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2;
  ------------------
  |  |   45|    334|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|     99|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 99, False: 235]
  |  |  ------------------
  |  |   46|    334|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|    235|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  198|  1.09k|}
mvref_common.c:gm_get_motion_vector:
  234|  88.7k|                                          int is_integer) {
  235|  88.7k|  int_mv res;
  236|       |
  237|  88.7k|  if (gm->wmtype == IDENTITY) {
  ------------------
  |  Branch (237:7): [True: 79.7k, False: 8.95k]
  ------------------
  238|  79.7k|    res.as_int = 0;
  239|  79.7k|    return res;
  240|  79.7k|  }
  241|       |
  242|  8.95k|  const int32_t *mat = gm->wmmat;
  243|  8.95k|  int x, y, tx, ty;
  244|       |
  245|  8.95k|  if (gm->wmtype == TRANSLATION) {
  ------------------
  |  Branch (245:7): [True: 2.15k, False: 6.80k]
  ------------------
  246|       |    // All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16)
  247|       |    // bits of fractional precision. The offset for a translation is stored in
  248|       |    // entries 0 and 1. For translations, all but the top three (two if
  249|       |    // cm->features.allow_high_precision_mv is false) fractional bits are always
  250|       |    // zero.
  251|       |    //
  252|       |    // After the right shifts, there are 3 fractional bits of precision. If
  253|       |    // allow_hp is false, the bottom bit is always zero (so we don't need a
  254|       |    // call to convert_to_trans_prec here)
  255|       |    //
  256|       |    // Note: There is an AV1 specification bug here:
  257|       |    //
  258|       |    // gm->wmmat[0] is supposed to be the horizontal translation, and so should
  259|       |    // go into res.as_mv.col, and gm->wmmat[1] is supposed to be the vertical
  260|       |    // translation and so should go into res.as_mv.row
  261|       |    //
  262|       |    // However, in the spec, these assignments are accidentally reversed, and so
  263|       |    // we must keep this incorrect logic to match the spec.
  264|       |    //
  265|       |    // See also: https://crbug.com/aomedia/3328
  266|  2.15k|    res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF;
  ------------------
  |  |  168|  2.15k|#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
  |  |  ------------------
  |  |  |  |   96|  2.15k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  267|  2.15k|    res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF;
  ------------------
  |  |  168|  2.15k|#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
  |  |  ------------------
  |  |  |  |   96|  2.15k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  268|  2.15k|    assert(IMPLIES(1 & (res.as_mv.row | res.as_mv.col), allow_hp));
  269|  2.15k|    if (is_integer) {
  ------------------
  |  Branch (269:9): [True: 61, False: 2.09k]
  ------------------
  270|     61|      integer_mv_precision(&res.as_mv);
  271|     61|    }
  272|  2.15k|    return res;
  273|  2.15k|  }
  274|       |
  275|  6.80k|  x = block_center_x(mi_col, bsize);
  276|  6.80k|  y = block_center_y(mi_row, bsize);
  277|       |
  278|  6.80k|  if (gm->wmtype == ROTZOOM) {
  ------------------
  |  Branch (278:7): [True: 6.08k, False: 721]
  ------------------
  279|  6.08k|    assert(gm->wmmat[5] == gm->wmmat[2]);
  280|  6.08k|    assert(gm->wmmat[4] == -gm->wmmat[3]);
  281|  6.08k|  }
  282|       |
  283|  6.80k|  const int xc =
  284|  6.80k|      (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0];
  ------------------
  |  |   96|  6.80k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  285|  6.80k|  const int yc =
  286|  6.80k|      mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1];
  ------------------
  |  |   96|  6.80k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  287|  6.80k|  tx = convert_to_trans_prec(allow_hp, xc);
  288|  6.80k|  ty = convert_to_trans_prec(allow_hp, yc);
  289|       |
  290|  6.80k|  res.as_mv.row = ty;
  291|  6.80k|  res.as_mv.col = tx;
  292|       |
  293|  6.80k|  if (is_integer) {
  ------------------
  |  Branch (293:7): [True: 83, False: 6.71k]
  ------------------
  294|     83|    integer_mv_precision(&res.as_mv);
  295|     83|  }
  296|  6.80k|  return res;
  297|  8.95k|}
mvref_common.c:integer_mv_precision:
  199|  9.60k|static inline void integer_mv_precision(MV *mv) {
  200|  9.60k|  int mod = (mv->row % 8);
  201|  9.60k|  if (mod != 0) {
  ------------------
  |  Branch (201:7): [True: 59, False: 9.54k]
  ------------------
  202|     59|    mv->row -= mod;
  203|     59|    if (abs(mod) > 4) {
  ------------------
  |  Branch (203:9): [True: 18, False: 41]
  ------------------
  204|     18|      if (mod > 0) {
  ------------------
  |  Branch (204:11): [True: 14, False: 4]
  ------------------
  205|     14|        mv->row += 8;
  206|     14|      } else {
  207|      4|        mv->row -= 8;
  208|      4|      }
  209|     18|    }
  210|     59|  }
  211|       |
  212|  9.60k|  mod = (mv->col % 8);
  213|  9.60k|  if (mod != 0) {
  ------------------
  |  Branch (213:7): [True: 73, False: 9.53k]
  ------------------
  214|     73|    mv->col -= mod;
  215|     73|    if (abs(mod) > 4) {
  ------------------
  |  Branch (215:9): [True: 41, False: 32]
  ------------------
  216|     41|      if (mod > 0) {
  ------------------
  |  Branch (216:11): [True: 34, False: 7]
  ------------------
  217|     34|        mv->col += 8;
  218|     34|      } else {
  219|      7|        mv->col -= 8;
  220|      7|      }
  221|     41|    }
  222|     73|  }
  223|  9.60k|}
mvref_common.c:block_center_x:
  183|  6.80k|static inline int block_center_x(int mi_col, BLOCK_SIZE bs) {
  184|  6.80k|  const int bw = block_size_wide[bs];
  185|  6.80k|  return mi_col * MI_SIZE + bw / 2 - 1;
  ------------------
  |  |   40|  6.80k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  6.80k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  186|  6.80k|}
mvref_common.c:block_center_y:
  188|  6.80k|static inline int block_center_y(int mi_row, BLOCK_SIZE bs) {
  189|  6.80k|  const int bh = block_size_high[bs];
  190|  6.80k|  return mi_row * MI_SIZE + bh / 2 - 1;
  ------------------
  |  |   40|  6.80k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  6.80k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  191|  6.80k|}
mvref_common.c:convert_to_trans_prec:
  193|  13.6k|static inline int convert_to_trans_prec(int allow_hp, int coor) {
  194|  13.6k|  if (allow_hp)
  ------------------
  |  Branch (194:7): [True: 9.70k, False: 3.89k]
  ------------------
  195|  9.70k|    return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 3);
  ------------------
  |  |   45|  9.70k|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|  4.33k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 4.33k, False: 5.37k]
  |  |  ------------------
  |  |   46|  9.70k|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|  5.37k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  196|  3.89k|  else
  197|  3.89k|    return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2;
  ------------------
  |  |   45|  3.89k|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|  1.25k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 1.25k, False: 2.63k]
  |  |  ------------------
  |  |   46|  3.89k|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|  2.63k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  198|  13.6k|}
mvref_common.c:clamp_mv:
  323|   181k|static inline void clamp_mv(MV *mv, const SubpelMvLimits *mv_limits) {
  324|   181k|  mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max);
  325|   181k|  mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max);
  326|   181k|}

av1_get_mv_projection:
   27|  29.0k|void av1_get_mv_projection(MV *output, MV ref, int num, int den) {
   28|  29.0k|  den = AOMMIN(den, MAX_FRAME_DISTANCE);
  ------------------
  |  |   34|  29.0k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 29.0k, False: 0]
  |  |  ------------------
  ------------------
   29|  29.0k|  num = num > 0 ? AOMMIN(num, MAX_FRAME_DISTANCE)
  ------------------
  |  |   34|  26.2k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 26.2k, False: 32]
  |  |  ------------------
  ------------------
  |  Branch (29:9): [True: 26.2k, False: 2.78k]
  ------------------
   30|  29.0k|                : AOMMAX(num, -MAX_FRAME_DISTANCE);
  ------------------
  |  |   35|  31.8k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 2.30k, False: 480]
  |  |  ------------------
  ------------------
   31|  29.0k|  const int mv_row =
   32|  29.0k|      ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14);
  ------------------
  |  |   45|  29.0k|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 0, False: 29.0k]
  |  |  ------------------
  |  |   46|  29.0k|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|  29.0k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
   33|  29.0k|  const int mv_col =
   34|  29.0k|      ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14);
  ------------------
  |  |   45|  29.0k|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 0, False: 29.0k]
  |  |  ------------------
  |  |   46|  29.0k|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|  29.0k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
   35|  29.0k|  const int clamp_max = MV_UPP - 1;
  ------------------
  |  |   75|  29.0k|#define MV_UPP (1 << MV_IN_USE_BITS)
  |  |  ------------------
  |  |  |  |   74|  29.0k|#define MV_IN_USE_BITS 14
  |  |  ------------------
  ------------------
   36|  29.0k|  const int clamp_min = MV_LOW + 1;
  ------------------
  |  |   76|  29.0k|#define MV_LOW (-(1 << MV_IN_USE_BITS))
  |  |  ------------------
  |  |  |  |   74|  29.0k|#define MV_IN_USE_BITS 14
  |  |  ------------------
  ------------------
   37|  29.0k|  output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max);
   38|  29.0k|  output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max);
   39|  29.0k|}
av1_copy_frame_mvs:
   43|   138k|                        int x_mis, int y_mis) {
   44|   138k|  const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1);
  ------------------
  |  |   41|   138k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   45|   138k|  MV_REF *frame_mvs =
   46|   138k|      cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
   47|   138k|  x_mis = ROUND_POWER_OF_TWO(x_mis, 1);
  ------------------
  |  |   41|   138k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   48|   138k|  y_mis = ROUND_POWER_OF_TWO(y_mis, 1);
  ------------------
  |  |   41|   138k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   49|   138k|  int w, h;
   50|       |
   51|   513k|  for (h = 0; h < y_mis; h++) {
  ------------------
  |  Branch (51:15): [True: 374k, False: 138k]
  ------------------
   52|   374k|    MV_REF *mv = frame_mvs;
   53|  2.06M|    for (w = 0; w < x_mis; w++) {
  ------------------
  |  Branch (53:17): [True: 1.69M, False: 374k]
  ------------------
   54|  1.69M|      mv->ref_frame = NONE_FRAME;
   55|  1.69M|      mv->mv.as_int = 0;
   56|       |
   57|  5.07M|      for (int idx = 0; idx < 2; ++idx) {
  ------------------
  |  Branch (57:25): [True: 3.38M, False: 1.69M]
  ------------------
   58|  3.38M|        MV_REFERENCE_FRAME ref_frame = mi->ref_frame[idx];
   59|  3.38M|        if (ref_frame > INTRA_FRAME) {
  ------------------
  |  Branch (59:13): [True: 504k, False: 2.87M]
  ------------------
   60|   504k|          int8_t ref_idx = cm->ref_frame_side[ref_frame];
   61|   504k|          if (ref_idx) continue;
  ------------------
  |  Branch (61:15): [True: 92.9k, False: 411k]
  ------------------
   62|   411k|          if ((abs(mi->mv[idx].as_mv.row) > REFMVS_LIMIT) ||
  ------------------
  |  |   27|   411k|#define REFMVS_LIMIT ((1 << 12) - 1)
  ------------------
  |  Branch (62:15): [True: 396, False: 411k]
  ------------------
   63|   411k|              (abs(mi->mv[idx].as_mv.col) > REFMVS_LIMIT))
  ------------------
  |  |   27|   411k|#define REFMVS_LIMIT ((1 << 12) - 1)
  ------------------
  |  Branch (63:15): [True: 501, False: 410k]
  ------------------
   64|    886|            continue;
   65|   410k|          mv->ref_frame = ref_frame;
   66|   410k|          mv->mv.as_int = mi->mv[idx].as_int;
   67|   410k|        }
   68|  3.38M|      }
   69|  1.69M|      mv++;
   70|  1.69M|    }
   71|   374k|    frame_mvs += frame_mvs_stride;
   72|   374k|  }
   73|   138k|}
av1_find_mv_refs:
  794|  88.0k|                      int_mv *global_mvs, int16_t *mode_context) {
  795|  88.0k|  const int mi_row = xd->mi_row;
  796|  88.0k|  const int mi_col = xd->mi_col;
  797|  88.0k|  int_mv gm_mv[2];
  798|       |
  799|  88.0k|  if (ref_frame == INTRA_FRAME) {
  ------------------
  |  Branch (799:7): [True: 11.3k, False: 76.7k]
  ------------------
  800|  11.3k|    gm_mv[0].as_int = gm_mv[1].as_int = 0;
  801|  11.3k|    if (global_mvs != NULL) {
  ------------------
  |  Branch (801:9): [True: 0, False: 11.3k]
  ------------------
  802|      0|      global_mvs[ref_frame].as_int = INVALID_MV;
  ------------------
  |  |   26|      0|#define INVALID_MV 0x80008000
  ------------------
  803|      0|    }
  804|  76.7k|  } else {
  805|  76.7k|    const BLOCK_SIZE bsize = mi->bsize;
  806|  76.7k|    const int allow_high_precision_mv = cm->features.allow_high_precision_mv;
  807|  76.7k|    const int force_integer_mv = cm->features.cur_frame_force_integer_mv;
  808|  76.7k|    if (ref_frame < REF_FRAMES) {
  ------------------
  |  Branch (808:9): [True: 64.7k, False: 11.9k]
  ------------------
  809|  64.7k|      gm_mv[0] = gm_get_motion_vector(&cm->global_motion[ref_frame],
  810|  64.7k|                                      allow_high_precision_mv, bsize, mi_col,
  811|  64.7k|                                      mi_row, force_integer_mv);
  812|  64.7k|      gm_mv[1].as_int = 0;
  813|  64.7k|      if (global_mvs != NULL) global_mvs[ref_frame] = gm_mv[0];
  ------------------
  |  Branch (813:11): [True: 0, False: 64.7k]
  ------------------
  814|  64.7k|    } else {
  815|  11.9k|      MV_REFERENCE_FRAME rf[2];
  816|  11.9k|      av1_set_ref_frame(rf, ref_frame);
  817|  11.9k|      gm_mv[0] = gm_get_motion_vector(&cm->global_motion[rf[0]],
  818|  11.9k|                                      allow_high_precision_mv, bsize, mi_col,
  819|  11.9k|                                      mi_row, force_integer_mv);
  820|  11.9k|      gm_mv[1] = gm_get_motion_vector(&cm->global_motion[rf[1]],
  821|  11.9k|                                      allow_high_precision_mv, bsize, mi_col,
  822|  11.9k|                                      mi_row, force_integer_mv);
  823|  11.9k|    }
  824|  76.7k|  }
  825|       |
  826|  88.0k|  setup_ref_mv_list(cm, xd, ref_frame, &ref_mv_count[ref_frame],
  827|  88.0k|                    ref_mv_stack[ref_frame], ref_mv_weight[ref_frame],
  828|  88.0k|                    mv_ref_list ? mv_ref_list[ref_frame] : NULL, gm_mv, mi_row,
  ------------------
  |  Branch (828:21): [True: 88.0k, False: 18.4E]
  ------------------
  829|  88.0k|                    mi_col, mode_context);
  830|  88.0k|}
av1_find_best_ref_mvs:
  833|  73.7k|                           int_mv *near_mv, int is_integer) {
  834|  73.7k|  int i;
  835|       |  // Make sure all the candidates are properly clamped etc
  836|   221k|  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
  ------------------
  |  |  508|   221k|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (836:15): [True: 147k, False: 73.7k]
  ------------------
  837|   147k|    lower_mv_precision(&mvlist[i].as_mv, allow_hp, is_integer);
  838|   147k|  }
  839|  73.7k|  *nearest_mv = mvlist[0];
  840|  73.7k|  *near_mv = mvlist[1];
  841|  73.7k|}
av1_setup_frame_buf_refs:
  843|  26.5k|void av1_setup_frame_buf_refs(AV1_COMMON *cm) {
  844|  26.5k|  cm->cur_frame->order_hint = cm->current_frame.order_hint;
  845|  26.5k|  cm->cur_frame->display_order_hint = cm->current_frame.display_order_hint;
  846|  26.5k|  cm->cur_frame->pyramid_level = cm->current_frame.pyramid_level;
  847|  26.5k|  cm->cur_frame->filter_level[0] = -1;
  848|  26.5k|  cm->cur_frame->filter_level[1] = -1;
  849|  26.5k|  MV_REFERENCE_FRAME ref_frame;
  850|   212k|  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
  ------------------
  |  Branch (850:32): [True: 185k, False: 26.5k]
  ------------------
  851|   185k|    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
  852|   185k|    if (buf != NULL) {
  ------------------
  |  Branch (852:9): [True: 56.1k, False: 129k]
  ------------------
  853|  56.1k|      cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME] = buf->order_hint;
  854|  56.1k|      cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME] =
  855|  56.1k|          buf->display_order_hint;
  856|  56.1k|    }
  857|   185k|  }
  858|  26.5k|}
av1_setup_frame_sign_bias:
  860|  26.5k|void av1_setup_frame_sign_bias(AV1_COMMON *cm) {
  861|  26.5k|  MV_REFERENCE_FRAME ref_frame;
  862|   212k|  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
  ------------------
  |  Branch (862:32): [True: 185k, False: 26.5k]
  ------------------
  863|   185k|    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
  864|   185k|    if (cm->seq_params->order_hint_info.enable_order_hint && buf != NULL) {
  ------------------
  |  Branch (864:9): [True: 122k, False: 63.5k]
  |  Branch (864:62): [True: 56.1k, False: 65.8k]
  ------------------
  865|  56.1k|      const int ref_order_hint = buf->order_hint;
  866|  56.1k|      cm->ref_frame_sign_bias[ref_frame] =
  867|  56.1k|          (get_relative_dist(&cm->seq_params->order_hint_info, ref_order_hint,
  ------------------
  |  Branch (867:11): [True: 51.0k, False: 5.07k]
  ------------------
  868|  56.1k|                             (int)cm->current_frame.order_hint) <= 0)
  869|  56.1k|              ? 0
  870|  56.1k|              : 1;
  871|   129k|    } else {
  872|   129k|      cm->ref_frame_sign_bias[ref_frame] = 0;
  873|   129k|    }
  874|   185k|  }
  875|  26.5k|}
av1_calculate_ref_frame_side:
  993|  26.1k|void av1_calculate_ref_frame_side(AV1_COMMON *cm) {
  994|  26.1k|  const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info;
  995|       |
  996|  26.1k|  memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side));
  997|  26.1k|  if (!order_hint_info->enable_order_hint) return;
  ------------------
  |  Branch (997:7): [True: 9.01k, False: 17.1k]
  ------------------
  998|       |
  999|  17.1k|  const int cur_order_hint = cm->cur_frame->order_hint;
 1000|       |
 1001|   136k|  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
  ------------------
  |  Branch (1001:36): [True: 119k, False: 17.1k]
  ------------------
 1002|   119k|    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
 1003|   119k|    int order_hint = 0;
 1004|       |
 1005|   119k|    if (buf != NULL) order_hint = buf->order_hint;
  ------------------
  |  Branch (1005:9): [True: 54.7k, False: 65.0k]
  ------------------
 1006|       |
 1007|   119k|    if (get_relative_dist(order_hint_info, order_hint, cur_order_hint) > 0)
  ------------------
  |  Branch (1007:9): [True: 5.71k, False: 114k]
  ------------------
 1008|  5.71k|      cm->ref_frame_side[ref_frame] = 1;
 1009|   114k|    else if (order_hint == cur_order_hint)
  ------------------
  |  Branch (1009:14): [True: 57.9k, False: 56.1k]
  ------------------
 1010|  57.9k|      cm->ref_frame_side[ref_frame] = -1;
 1011|   119k|  }
 1012|  17.1k|}
av1_setup_motion_field:
 1014|  6.84k|void av1_setup_motion_field(AV1_COMMON *cm) {
 1015|  6.84k|  const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info;
 1016|       |
 1017|  6.84k|  if (!order_hint_info->enable_order_hint) return;
  ------------------
  |  Branch (1017:7): [True: 0, False: 6.84k]
  ------------------
 1018|       |
 1019|  6.84k|  TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
 1020|  6.84k|  int size = ((cm->mi_params.mi_rows + MAX_MIB_SIZE) >> 1) *
  ------------------
  |  |   44|  6.84k|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   43|  6.84k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  6.84k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   39|  6.84k|#define MI_SIZE_LOG2 2
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1021|  6.84k|             (cm->mi_params.mi_stride >> 1);
 1022|  6.66M|  for (int idx = 0; idx < size; ++idx) {
  ------------------
  |  Branch (1022:21): [True: 6.65M, False: 6.84k]
  ------------------
 1023|  6.65M|    tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV;
  ------------------
  |  |   26|  6.65M|#define INVALID_MV 0x80008000
  ------------------
 1024|  6.65M|    tpl_mvs_base[idx].ref_frame_offset = 0;
 1025|  6.65M|  }
 1026|       |
 1027|  6.84k|  const int cur_order_hint = cm->cur_frame->order_hint;
 1028|  6.84k|  const RefCntBuffer *ref_buf[INTER_REFS_PER_FRAME];
 1029|  6.84k|  int ref_order_hint[INTER_REFS_PER_FRAME];
 1030|       |
 1031|  54.7k|  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
  ------------------
  |  Branch (1031:36): [True: 47.8k, False: 6.84k]
  ------------------
 1032|  47.8k|    const int ref_idx = ref_frame - LAST_FRAME;
 1033|  47.8k|    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
 1034|  47.8k|    int order_hint = 0;
 1035|       |
 1036|  47.8k|    if (buf != NULL) order_hint = buf->order_hint;
  ------------------
  |  Branch (1036:9): [True: 47.8k, False: 0]
  ------------------
 1037|       |
 1038|  47.8k|    ref_buf[ref_idx] = buf;
 1039|  47.8k|    ref_order_hint[ref_idx] = order_hint;
 1040|  47.8k|  }
 1041|       |
 1042|  6.84k|  int ref_stamp = MFMV_STACK_SIZE - 1;
  ------------------
  |  |  105|  6.84k|#define MFMV_STACK_SIZE 3
  ------------------
 1043|       |
 1044|  6.84k|  if (ref_buf[LAST_FRAME - LAST_FRAME] != NULL) {
  ------------------
  |  Branch (1044:7): [True: 6.84k, False: 0]
  ------------------
 1045|  6.84k|    const int alt_of_lst_order_hint =
 1046|  6.84k|        ref_buf[LAST_FRAME - LAST_FRAME]
 1047|  6.84k|            ->ref_order_hints[ALTREF_FRAME - LAST_FRAME];
 1048|       |
 1049|  6.84k|    const int is_lst_overlay =
 1050|  6.84k|        (alt_of_lst_order_hint == ref_order_hint[GOLDEN_FRAME - LAST_FRAME]);
 1051|  6.84k|    if (!is_lst_overlay) motion_field_projection(cm, LAST_FRAME, 2);
  ------------------
  |  Branch (1051:9): [True: 819, False: 6.02k]
  ------------------
 1052|  6.84k|    --ref_stamp;
 1053|  6.84k|  }
 1054|       |
 1055|  6.84k|  if (get_relative_dist(order_hint_info,
  ------------------
  |  Branch (1055:7): [True: 1.13k, False: 5.70k]
  ------------------
 1056|  6.84k|                        ref_order_hint[BWDREF_FRAME - LAST_FRAME],
 1057|  6.84k|                        cur_order_hint) > 0) {
 1058|  1.13k|    if (motion_field_projection(cm, BWDREF_FRAME, 0)) --ref_stamp;
  ------------------
  |  Branch (1058:9): [True: 972, False: 162]
  ------------------
 1059|  1.13k|  }
 1060|       |
 1061|  6.84k|  if (get_relative_dist(order_hint_info,
  ------------------
  |  Branch (1061:7): [True: 216, False: 6.62k]
  ------------------
 1062|  6.84k|                        ref_order_hint[ALTREF2_FRAME - LAST_FRAME],
 1063|  6.84k|                        cur_order_hint) > 0) {
 1064|    216|    if (motion_field_projection(cm, ALTREF2_FRAME, 0)) --ref_stamp;
  ------------------
  |  Branch (1064:9): [True: 32, False: 184]
  ------------------
 1065|    216|  }
 1066|       |
 1067|  6.84k|  if (get_relative_dist(order_hint_info,
  ------------------
  |  Branch (1067:7): [True: 1.93k, False: 4.90k]
  ------------------
 1068|  6.84k|                        ref_order_hint[ALTREF_FRAME - LAST_FRAME],
 1069|  6.84k|                        cur_order_hint) > 0 &&
 1070|  1.93k|      ref_stamp >= 0)
  ------------------
  |  Branch (1070:7): [True: 1.93k, False: 6]
  ------------------
 1071|  1.93k|    if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp;
  ------------------
  |  Branch (1071:9): [True: 1.79k, False: 140]
  ------------------
 1072|       |
 1073|  6.84k|  if (ref_stamp >= 0) motion_field_projection(cm, LAST2_FRAME, 2);
  ------------------
  |  Branch (1073:7): [True: 6.17k, False: 670]
  ------------------
 1074|  6.84k|}
av1_selectSamples:
 1092|  2.13k|                          BLOCK_SIZE bsize) {
 1093|  2.13k|  const int bw = block_size_wide[bsize];
 1094|  2.13k|  const int bh = block_size_high[bsize];
 1095|  2.13k|  const int thresh = clamp(AOMMAX(bw, bh), 16, 112);
  ------------------
  |  |   35|  2.13k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 587, False: 1.54k]
  |  |  ------------------
  ------------------
 1096|  2.13k|  uint8_t ret = 0;
 1097|  2.13k|  assert(len <= LEAST_SQUARES_SAMPLES_MAX);
 1098|       |
 1099|       |  // Only keep the samples with MV differences within threshold.
 1100|  8.12k|  for (int i = 0; i < len; ++i) {
  ------------------
  |  Branch (1100:19): [True: 5.98k, False: 2.13k]
  ------------------
 1101|  5.98k|    const int diff = abs(pts_inref[2 * i] - pts[2 * i] - mv->col) +
 1102|  5.98k|                     abs(pts_inref[2 * i + 1] - pts[2 * i + 1] - mv->row);
 1103|  5.98k|    if (diff > thresh) continue;
  ------------------
  |  Branch (1103:9): [True: 1.28k, False: 4.70k]
  ------------------
 1104|  4.70k|    if (ret != i) {
  ------------------
  |  Branch (1104:9): [True: 467, False: 4.23k]
  ------------------
 1105|    467|      memcpy(pts + 2 * ret, pts + 2 * i, 2 * sizeof(pts[0]));
 1106|    467|      memcpy(pts_inref + 2 * ret, pts_inref + 2 * i, 2 * sizeof(pts_inref[0]));
 1107|    467|    }
 1108|  4.70k|    ++ret;
 1109|  4.70k|  }
 1110|       |  // Keep at least 1 sample.
 1111|  2.13k|  return AOMMAX(ret, 1);
  ------------------
  |  |   35|  2.13k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 1.65k, False: 486]
  |  |  ------------------
  ------------------
 1112|  2.13k|}
av1_findSamples:
 1118|  40.1k|                        int *pts_inref) {
 1119|  40.1k|  const MB_MODE_INFO *const mbmi0 = xd->mi[0];
 1120|  40.1k|  const int ref_frame = mbmi0->ref_frame[0];
 1121|  40.1k|  const int up_available = xd->up_available;
 1122|  40.1k|  const int left_available = xd->left_available;
 1123|  40.1k|  uint8_t np = 0;
 1124|  40.1k|  int do_tl = 1;
 1125|  40.1k|  int do_tr = 1;
 1126|  40.1k|  const int mi_stride = xd->mi_stride;
 1127|  40.1k|  const int mi_row = xd->mi_row;
 1128|  40.1k|  const int mi_col = xd->mi_col;
 1129|       |
 1130|       |  // scan the nearest above rows
 1131|  40.1k|  if (up_available) {
  ------------------
  |  Branch (1131:7): [True: 28.6k, False: 11.5k]
  ------------------
 1132|  28.6k|    const int mi_row_offset = -1;
 1133|  28.6k|    const MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * mi_stride];
 1134|  28.6k|    uint8_t superblock_width = mi_size_wide[mbmi->bsize];
 1135|       |
 1136|  28.6k|    if (xd->width <= superblock_width) {
  ------------------
  |  Branch (1136:9): [True: 24.9k, False: 3.77k]
  ------------------
 1137|       |      // Handle "current block width <= above block width" case.
 1138|  24.9k|      const int col_offset = -mi_col % superblock_width;
 1139|       |
 1140|  24.9k|      if (col_offset < 0) do_tl = 0;
  ------------------
  |  Branch (1140:11): [True: 4.00k, False: 20.8k]
  ------------------
 1141|  24.9k|      if (col_offset + superblock_width > xd->width) do_tr = 0;
  ------------------
  |  Branch (1141:11): [True: 4.27k, False: 20.6k]
  ------------------
 1142|       |
 1143|  24.9k|      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
  ------------------
  |  Branch (1143:11): [True: 17.5k, False: 7.34k]
  |  Branch (1143:46): [True: 15.9k, False: 1.64k]
  ------------------
 1144|  15.9k|        record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
 1145|  15.9k|        pts += 2;
 1146|  15.9k|        pts_inref += 2;
 1147|  15.9k|        if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|  15.9k|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|  15.9k|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
                      if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|      0|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|      0|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
  |  Branch (1147:13): [True: 0, False: 15.9k]
  ------------------
 1148|  15.9k|      }
 1149|  24.9k|    } else {
 1150|       |      // Handle "current block width > above block width" case.
 1151|  12.4k|      for (int i = 0; i < AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
  ------------------
  |  |   34|  12.4k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 12.1k, False: 340]
  |  |  ------------------
  ------------------
  |  Branch (1151:23): [True: 8.70k, False: 3.77k]
  ------------------
 1152|  8.70k|           i += superblock_width) {
 1153|  8.70k|        mbmi = xd->mi[i + mi_row_offset * mi_stride];
 1154|  8.70k|        superblock_width = mi_size_wide[mbmi->bsize];
 1155|       |
 1156|  8.70k|        if (mbmi->ref_frame[0] == ref_frame &&
  ------------------
  |  Branch (1156:13): [True: 5.09k, False: 3.60k]
  ------------------
 1157|  5.09k|            mbmi->ref_frame[1] == NONE_FRAME) {
  ------------------
  |  Branch (1157:13): [True: 4.65k, False: 440]
  ------------------
 1158|  4.65k|          record_samples(mbmi, pts, pts_inref, 0, -1, i, 1);
 1159|  4.65k|          pts += 2;
 1160|  4.65k|          pts_inref += 2;
 1161|  4.65k|          if (++np >= LEAST_SQUARES_SAMPLES_MAX)
  ------------------
  |  |   29|  4.65k|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|  4.65k|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
  |  Branch (1161:15): [True: 2, False: 4.65k]
  ------------------
 1162|      2|            return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|      2|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|      2|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
 1163|  4.65k|        }
 1164|  8.70k|      }
 1165|  3.77k|    }
 1166|  28.6k|  }
 1167|  40.1k|  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
 1168|       |
 1169|       |  // scan the nearest left columns
 1170|  40.1k|  if (left_available) {
  ------------------
  |  Branch (1170:7): [True: 31.8k, False: 8.28k]
  ------------------
 1171|  31.8k|    const int mi_col_offset = -1;
 1172|  31.8k|    const MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
 1173|  31.8k|    uint8_t superblock_height = mi_size_high[mbmi->bsize];
 1174|       |
 1175|  31.8k|    if (xd->height <= superblock_height) {
  ------------------
  |  Branch (1175:9): [True: 27.3k, False: 4.51k]
  ------------------
 1176|       |      // Handle "current block height <= above block height" case.
 1177|  27.3k|      const int row_offset = -mi_row % superblock_height;
 1178|       |
 1179|  27.3k|      if (row_offset < 0) do_tl = 0;
  ------------------
  |  Branch (1179:11): [True: 5.60k, False: 21.7k]
  ------------------
 1180|       |
 1181|  27.3k|      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
  ------------------
  |  Branch (1181:11): [True: 19.3k, False: 7.99k]
  |  Branch (1181:46): [True: 17.6k, False: 1.72k]
  ------------------
 1182|  17.6k|        record_samples(mbmi, pts, pts_inref, row_offset, 1, 0, -1);
 1183|  17.6k|        pts += 2;
 1184|  17.6k|        pts_inref += 2;
 1185|  17.6k|        np++;
 1186|  17.6k|        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|  17.6k|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|  17.6k|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
                      if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|      0|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|      0|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
  |  Branch (1186:13): [True: 0, False: 17.6k]
  ------------------
 1187|  17.6k|      }
 1188|  27.3k|    } else {
 1189|       |      // Handle "current block height > above block height" case.
 1190|  14.9k|      for (int i = 0; i < AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row);
  ------------------
  |  |   34|  14.9k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 13.8k, False: 1.13k]
  |  |  ------------------
  ------------------
  |  Branch (1190:23): [True: 10.4k, False: 4.50k]
  ------------------
 1191|  10.4k|           i += superblock_height) {
 1192|  10.4k|        mbmi = xd->mi[mi_col_offset + i * mi_stride];
 1193|  10.4k|        superblock_height = mi_size_high[mbmi->bsize];
 1194|       |
 1195|  10.4k|        if (mbmi->ref_frame[0] == ref_frame &&
  ------------------
  |  Branch (1195:13): [True: 6.49k, False: 3.99k]
  ------------------
 1196|  6.49k|            mbmi->ref_frame[1] == NONE_FRAME) {
  ------------------
  |  Branch (1196:13): [True: 6.09k, False: 398]
  ------------------
 1197|  6.09k|          record_samples(mbmi, pts, pts_inref, i, 1, 0, -1);
 1198|  6.09k|          pts += 2;
 1199|  6.09k|          pts_inref += 2;
 1200|  6.09k|          if (++np >= LEAST_SQUARES_SAMPLES_MAX)
  ------------------
  |  |   29|  6.09k|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|  6.09k|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
  |  Branch (1200:15): [True: 4, False: 6.08k]
  ------------------
 1201|      4|            return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|      4|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|      4|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
 1202|  6.09k|        }
 1203|  10.4k|      }
 1204|  4.51k|    }
 1205|  31.8k|  }
 1206|  40.1k|  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
 1207|       |
 1208|       |  // Top-left block
 1209|  40.1k|  if (do_tl && left_available && up_available) {
  ------------------
  |  Branch (1209:7): [True: 30.5k, False: 9.60k]
  |  Branch (1209:16): [True: 22.2k, False: 8.28k]
  |  Branch (1209:34): [True: 14.8k, False: 7.43k]
  ------------------
 1210|  14.8k|    const int mi_row_offset = -1;
 1211|  14.8k|    const int mi_col_offset = -1;
 1212|  14.8k|    MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * mi_stride];
 1213|       |
 1214|  14.8k|    if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
  ------------------
  |  Branch (1214:9): [True: 8.81k, False: 6.03k]
  |  Branch (1214:44): [True: 7.89k, False: 927]
  ------------------
 1215|  7.89k|      record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1);
 1216|  7.89k|      pts += 2;
 1217|  7.89k|      pts_inref += 2;
 1218|  7.89k|      if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|  7.89k|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|  7.89k|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
                    if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|     12|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|     12|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
  |  Branch (1218:11): [True: 12, False: 7.87k]
  ------------------
 1219|  7.89k|    }
 1220|  14.8k|  }
 1221|  40.1k|  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
 1222|       |
 1223|       |  // Top-right block
 1224|  40.1k|  if (do_tr &&
  ------------------
  |  Branch (1224:7): [True: 35.8k, False: 4.28k]
  ------------------
 1225|  35.8k|      has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->width, xd->height))) {
  ------------------
  |  |   35|  35.8k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 9.84k, False: 26.0k]
  |  |  ------------------
  ------------------
  |  Branch (1225:7): [True: 23.1k, False: 12.7k]
  ------------------
 1226|  23.1k|    const POSITION trb_pos = { -1, xd->width };
 1227|  23.1k|    const TileInfo *const tile = &xd->tile;
 1228|  23.1k|    if (is_inside(tile, mi_col, mi_row, &trb_pos)) {
  ------------------
  |  Branch (1228:9): [True: 10.6k, False: 12.4k]
  ------------------
 1229|  10.6k|      const int mi_row_offset = -1;
 1230|  10.6k|      const int mi_col_offset = xd->width;
 1231|  10.6k|      const MB_MODE_INFO *mbmi =
 1232|  10.6k|          xd->mi[mi_col_offset + mi_row_offset * mi_stride];
 1233|       |
 1234|  10.6k|      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
  ------------------
  |  Branch (1234:11): [True: 5.57k, False: 5.08k]
  |  Branch (1234:46): [True: 4.93k, False: 638]
  ------------------
 1235|  4.93k|        record_samples(mbmi, pts, pts_inref, 0, -1, xd->width, 1);
 1236|  4.93k|        if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|  4.93k|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|  4.93k|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
                      if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|      4|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|      4|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
  |  Branch (1236:13): [True: 4, False: 4.93k]
  ------------------
 1237|  4.93k|      }
 1238|  10.6k|    }
 1239|  23.1k|  }
 1240|  40.1k|  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
 1241|       |
 1242|  40.1k|  return np;
 1243|  40.1k|}
av1_setup_skip_mode_allowed:
 1245|  26.3k|void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
 1246|  26.3k|  const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info;
 1247|  26.3k|  SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info;
 1248|       |
 1249|  26.3k|  skip_mode_info->skip_mode_allowed = 0;
 1250|  26.3k|  skip_mode_info->ref_frame_idx_0 = INVALID_IDX;
  ------------------
  |  |   15|  26.3k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1251|  26.3k|  skip_mode_info->ref_frame_idx_1 = INVALID_IDX;
  ------------------
  |  |   15|  26.3k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1252|       |
 1253|  26.3k|  if (!order_hint_info->enable_order_hint || frame_is_intra_only(cm) ||
  ------------------
  |  Branch (1253:7): [True: 9.01k, False: 17.2k]
  |  Branch (1253:46): [True: 9.36k, False: 7.91k]
  ------------------
 1254|  7.91k|      cm->current_frame.reference_mode == SINGLE_REFERENCE)
  ------------------
  |  Branch (1254:7): [True: 5.98k, False: 1.93k]
  ------------------
 1255|  24.3k|    return;
 1256|       |
 1257|  1.93k|  const int cur_order_hint = cm->current_frame.order_hint;
 1258|  1.93k|  int ref_order_hints[2] = { -1, INT_MAX };
 1259|  1.93k|  int ref_idx[2] = { INVALID_IDX, INVALID_IDX };
  ------------------
  |  |   15|  1.93k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
                int ref_idx[2] = { INVALID_IDX, INVALID_IDX };
  ------------------
  |  |   15|  1.93k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1260|       |
 1261|       |  // Identify the nearest forward and backward references.
 1262|  15.5k|  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
  ------------------
  |  Branch (1262:19): [True: 13.5k, False: 1.93k]
  ------------------
 1263|  13.5k|    const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i);
 1264|  13.5k|    if (buf == NULL) continue;
  ------------------
  |  Branch (1264:9): [True: 0, False: 13.5k]
  ------------------
 1265|       |
 1266|  13.5k|    const int ref_order_hint = buf->order_hint;
 1267|  13.5k|    if (get_relative_dist(order_hint_info, ref_order_hint, cur_order_hint) <
  ------------------
  |  Branch (1267:9): [True: 11.9k, False: 1.67k]
  ------------------
 1268|  13.5k|        0) {
 1269|       |      // Forward reference
 1270|  11.9k|      if (ref_order_hints[0] == -1 ||
  ------------------
  |  Branch (1270:11): [True: 1.82k, False: 10.0k]
  ------------------
 1271|  10.0k|          get_relative_dist(order_hint_info, ref_order_hint,
  ------------------
  |  Branch (1271:11): [True: 56, False: 10.0k]
  ------------------
 1272|  10.0k|                            ref_order_hints[0]) > 0) {
 1273|  1.88k|        ref_order_hints[0] = ref_order_hint;
 1274|  1.88k|        ref_idx[0] = i;
 1275|  1.88k|      }
 1276|  11.9k|    } else if (get_relative_dist(order_hint_info, ref_order_hint,
  ------------------
  |  Branch (1276:16): [True: 1.57k, False: 101]
  ------------------
 1277|  1.67k|                                 cur_order_hint) > 0) {
 1278|       |      // Backward reference
 1279|  1.57k|      if (ref_order_hints[1] == INT_MAX ||
  ------------------
  |  Branch (1279:11): [True: 549, False: 1.02k]
  ------------------
 1280|  1.02k|          get_relative_dist(order_hint_info, ref_order_hint,
  ------------------
  |  Branch (1280:11): [True: 48, False: 974]
  ------------------
 1281|  1.02k|                            ref_order_hints[1]) < 0) {
 1282|    597|        ref_order_hints[1] = ref_order_hint;
 1283|    597|        ref_idx[1] = i;
 1284|    597|      }
 1285|  1.57k|    }
 1286|  13.5k|  }
 1287|       |
 1288|  1.93k|  if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) {
  ------------------
  |  |   15|  3.87k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
                if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) {
  ------------------
  |  |   15|  1.82k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1288:7): [True: 1.82k, False: 114]
  |  Branch (1288:36): [True: 442, False: 1.38k]
  ------------------
 1289|       |    // == Bi-directional prediction ==
 1290|    442|    skip_mode_info->skip_mode_allowed = 1;
 1291|    442|    skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
  ------------------
  |  |   34|    442|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 420, False: 22]
  |  |  ------------------
  ------------------
 1292|    442|    skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
  ------------------
  |  |   35|    442|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 22, False: 420]
  |  |  ------------------
  ------------------
 1293|  1.49k|  } else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) {
  ------------------
  |  |   15|  2.99k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
                } else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) {
  ------------------
  |  |   15|  1.38k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1293:14): [True: 1.38k, False: 114]
  |  Branch (1293:43): [True: 1.38k, False: 0]
  ------------------
 1294|       |    // == Forward prediction only ==
 1295|       |    // Identify the second nearest forward reference.
 1296|  1.38k|    ref_order_hints[1] = -1;
 1297|  11.0k|    for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
  ------------------
  |  Branch (1297:21): [True: 9.68k, False: 1.38k]
  ------------------
 1298|  9.68k|      const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i);
 1299|  9.68k|      if (buf == NULL) continue;
  ------------------
  |  Branch (1299:11): [True: 0, False: 9.68k]
  ------------------
 1300|       |
 1301|  9.68k|      const int ref_order_hint = buf->order_hint;
 1302|  9.68k|      if ((ref_order_hints[0] != -1 &&
  ------------------
  |  Branch (1302:12): [True: 9.68k, False: 0]
  ------------------
 1303|  9.68k|           get_relative_dist(order_hint_info, ref_order_hint,
  ------------------
  |  Branch (1303:12): [True: 342, False: 9.33k]
  ------------------
 1304|  9.68k|                             ref_order_hints[0]) < 0) &&
 1305|    342|          (ref_order_hints[1] == -1 ||
  ------------------
  |  Branch (1305:12): [True: 64, False: 278]
  ------------------
 1306|    278|           get_relative_dist(order_hint_info, ref_order_hint,
  ------------------
  |  Branch (1306:12): [True: 10, False: 268]
  ------------------
 1307|    278|                             ref_order_hints[1]) > 0)) {
 1308|       |        // Second closest forward reference
 1309|     74|        ref_order_hints[1] = ref_order_hint;
 1310|     74|        ref_idx[1] = i;
 1311|     74|      }
 1312|  9.68k|    }
 1313|  1.38k|    if (ref_order_hints[1] != -1) {
  ------------------
  |  Branch (1313:9): [True: 64, False: 1.31k]
  ------------------
 1314|     64|      skip_mode_info->skip_mode_allowed = 1;
 1315|     64|      skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
  ------------------
  |  |   34|     64|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 22, False: 42]
  |  |  ------------------
  ------------------
 1316|     64|      skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
  ------------------
  |  |   35|     64|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 42, False: 22]
  |  |  ------------------
  ------------------
 1317|     64|    }
 1318|  1.38k|  }
 1319|  1.93k|}
av1_set_frame_refs:
 1346|    221|                        int lst_map_idx, int gld_map_idx) {
 1347|    221|  int lst_frame_sort_idx = -1;
 1348|    221|  int gld_frame_sort_idx = -1;
 1349|       |
 1350|    221|  assert(cm->seq_params->order_hint_info.enable_order_hint);
 1351|    221|  assert(cm->seq_params->order_hint_info.order_hint_bits_minus_1 >= 0);
 1352|    221|  const int cur_order_hint = (int)cm->current_frame.order_hint;
 1353|    221|  const int cur_frame_sort_idx =
 1354|    221|      1 << cm->seq_params->order_hint_info.order_hint_bits_minus_1;
 1355|       |
 1356|    221|  REF_FRAME_INFO ref_frame_info[REF_FRAMES];
 1357|    221|  int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 };
 1358|       |
 1359|  1.98k|  for (int i = 0; i < REF_FRAMES; ++i) {
  ------------------
  |  Branch (1359:19): [True: 1.76k, False: 221]
  ------------------
 1360|  1.76k|    const int map_idx = i;
 1361|       |
 1362|  1.76k|    ref_frame_info[i].map_idx = map_idx;
 1363|  1.76k|    ref_frame_info[i].sort_idx = -1;
 1364|       |
 1365|  1.76k|    RefCntBuffer *const buf = cm->ref_frame_map[map_idx];
 1366|  1.76k|    ref_frame_info[i].buf = buf;
 1367|       |
 1368|  1.76k|    if (buf == NULL) continue;
  ------------------
  |  Branch (1368:9): [True: 0, False: 1.76k]
  ------------------
 1369|       |    // If this assertion fails, there is a reference leak.
 1370|  1.76k|    assert(buf->ref_count > 0);
 1371|       |
 1372|  1.76k|    const int offset = (int)buf->order_hint;
 1373|  1.76k|    ref_frame_info[i].sort_idx =
 1374|  1.76k|        (offset == -1) ? -1
  ------------------
  |  Branch (1374:9): [True: 0, False: 1.76k]
  ------------------
 1375|  1.76k|                       : cur_frame_sort_idx +
 1376|  1.76k|                             get_relative_dist(&cm->seq_params->order_hint_info,
 1377|  1.76k|                                               offset, cur_order_hint);
 1378|  1.76k|    assert(ref_frame_info[i].sort_idx >= -1);
 1379|       |
 1380|  1.76k|    if (map_idx == lst_map_idx) lst_frame_sort_idx = ref_frame_info[i].sort_idx;
  ------------------
  |  Branch (1380:9): [True: 221, False: 1.54k]
  ------------------
 1381|  1.76k|    if (map_idx == gld_map_idx) gld_frame_sort_idx = ref_frame_info[i].sort_idx;
  ------------------
  |  Branch (1381:9): [True: 221, False: 1.54k]
  ------------------
 1382|  1.76k|  }
 1383|       |
 1384|       |  // Confirm both LAST_FRAME and GOLDEN_FRAME are valid forward reference
 1385|       |  // frames.
 1386|    221|  if (lst_frame_sort_idx == -1 || lst_frame_sort_idx >= cur_frame_sort_idx) {
  ------------------
  |  Branch (1386:7): [True: 0, False: 221]
  |  Branch (1386:35): [True: 3, False: 218]
  ------------------
 1387|      3|    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 1388|      3|                       "Inter frame requests a look-ahead frame as LAST");
 1389|      3|  }
 1390|    221|  if (gld_frame_sort_idx == -1 || gld_frame_sort_idx >= cur_frame_sort_idx) {
  ------------------
  |  Branch (1390:7): [True: 3, False: 218]
  |  Branch (1390:35): [True: 3, False: 215]
  ------------------
 1391|      3|    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 1392|      3|                       "Inter frame requests a look-ahead frame as GOLDEN");
 1393|      3|  }
 1394|       |
 1395|       |  // Sort ref frames based on their frame_offset values.
 1396|    221|  qsort(ref_frame_info, REF_FRAMES, sizeof(REF_FRAME_INFO),
 1397|    221|        compare_ref_frame_info);
 1398|       |
 1399|       |  // Identify forward and backward reference frames.
 1400|       |  // Forward  reference: offset < order_hint
 1401|       |  // Backward reference: offset >= order_hint
 1402|    221|  int fwd_start_idx = 0, fwd_end_idx = REF_FRAMES - 1;
 1403|       |
 1404|  1.81k|  for (int i = 0; i < REF_FRAMES; i++) {
  ------------------
  |  Branch (1404:19): [True: 1.63k, False: 181]
  ------------------
 1405|  1.63k|    if (ref_frame_info[i].sort_idx == -1) {
  ------------------
  |  Branch (1405:9): [True: 0, False: 1.63k]
  ------------------
 1406|      0|      fwd_start_idx++;
 1407|      0|      continue;
 1408|      0|    }
 1409|       |
 1410|  1.63k|    if (ref_frame_info[i].sort_idx >= cur_frame_sort_idx) {
  ------------------
  |  Branch (1410:9): [True: 40, False: 1.59k]
  ------------------
 1411|     40|      fwd_end_idx = i - 1;
 1412|     40|      break;
 1413|     40|    }
 1414|  1.63k|  }
 1415|       |
 1416|    221|  int bwd_start_idx = fwd_end_idx + 1;
 1417|    221|  int bwd_end_idx = REF_FRAMES - 1;
 1418|       |
 1419|       |  // === Backward Reference Frames ===
 1420|       |
 1421|       |  // == ALTREF_FRAME ==
 1422|    221|  if (bwd_start_idx <= bwd_end_idx) {
  ------------------
  |  Branch (1422:7): [True: 40, False: 181]
  ------------------
 1423|     40|    set_ref_frame_info(remapped_ref_idx, ALTREF_FRAME - LAST_FRAME,
 1424|     40|                       &ref_frame_info[bwd_end_idx]);
 1425|     40|    ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1;
 1426|     40|    bwd_end_idx--;
 1427|     40|  }
 1428|       |
 1429|       |  // == BWDREF_FRAME ==
 1430|    221|  if (bwd_start_idx <= bwd_end_idx) {
  ------------------
  |  Branch (1430:7): [True: 27, False: 194]
  ------------------
 1431|     27|    set_ref_frame_info(remapped_ref_idx, BWDREF_FRAME - LAST_FRAME,
 1432|     27|                       &ref_frame_info[bwd_start_idx]);
 1433|     27|    ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1;
 1434|     27|    bwd_start_idx++;
 1435|     27|  }
 1436|       |
 1437|       |  // == ALTREF2_FRAME ==
 1438|    221|  if (bwd_start_idx <= bwd_end_idx) {
  ------------------
  |  Branch (1438:7): [True: 20, False: 201]
  ------------------
 1439|     20|    set_ref_frame_info(remapped_ref_idx, ALTREF2_FRAME - LAST_FRAME,
 1440|     20|                       &ref_frame_info[bwd_start_idx]);
 1441|     20|    ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1;
 1442|     20|  }
 1443|       |
 1444|       |  // === Forward Reference Frames ===
 1445|       |
 1446|  1.81k|  for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) {
  ------------------
  |  Branch (1446:31): [True: 1.59k, False: 221]
  ------------------
 1447|       |    // == LAST_FRAME ==
 1448|  1.59k|    if (ref_frame_info[i].map_idx == lst_map_idx) {
  ------------------
  |  Branch (1448:9): [True: 215, False: 1.37k]
  ------------------
 1449|    215|      set_ref_frame_info(remapped_ref_idx, LAST_FRAME - LAST_FRAME,
 1450|    215|                         &ref_frame_info[i]);
 1451|    215|      ref_flag_list[LAST_FRAME - LAST_FRAME] = 1;
 1452|    215|    }
 1453|       |
 1454|       |    // == GOLDEN_FRAME ==
 1455|  1.59k|    if (ref_frame_info[i].map_idx == gld_map_idx) {
  ------------------
  |  Branch (1455:9): [True: 215, False: 1.37k]
  ------------------
 1456|    215|      set_ref_frame_info(remapped_ref_idx, GOLDEN_FRAME - LAST_FRAME,
 1457|    215|                         &ref_frame_info[i]);
 1458|    215|      ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1;
 1459|    215|    }
 1460|  1.59k|  }
 1461|       |
 1462|    221|  assert(ref_flag_list[LAST_FRAME - LAST_FRAME] == 1 &&
 1463|    221|         ref_flag_list[GOLDEN_FRAME - LAST_FRAME] == 1);
 1464|       |
 1465|       |  // == LAST2_FRAME ==
 1466|       |  // == LAST3_FRAME ==
 1467|       |  // == BWDREF_FRAME ==
 1468|       |  // == ALTREF2_FRAME ==
 1469|       |  // == ALTREF_FRAME ==
 1470|       |
 1471|       |  // Set up the reference frames in the anti-chronological order.
 1472|    221|  static const MV_REFERENCE_FRAME ref_frame_list[INTER_REFS_PER_FRAME - 2] = {
 1473|    221|    LAST2_FRAME, LAST3_FRAME, BWDREF_FRAME, ALTREF2_FRAME, ALTREF_FRAME
 1474|    221|  };
 1475|       |
 1476|    221|  int ref_idx;
 1477|  1.23k|  for (ref_idx = 0; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
  ------------------
  |  Branch (1477:21): [True: 1.03k, False: 208]
  ------------------
 1478|  1.03k|    const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
 1479|       |
 1480|  1.03k|    if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
  ------------------
  |  Branch (1480:9): [True: 48, False: 982]
  ------------------
 1481|       |
 1482|  1.31k|    while (fwd_start_idx <= fwd_end_idx &&
  ------------------
  |  Branch (1482:12): [True: 1.30k, False: 13]
  ------------------
 1483|  1.30k|           (ref_frame_info[fwd_end_idx].map_idx == lst_map_idx ||
  ------------------
  |  Branch (1483:13): [True: 175, False: 1.13k]
  ------------------
 1484|  1.13k|            ref_frame_info[fwd_end_idx].map_idx == gld_map_idx)) {
  ------------------
  |  Branch (1484:13): [True: 161, False: 969]
  ------------------
 1485|    336|      fwd_end_idx--;
 1486|    336|    }
 1487|    982|    if (fwd_start_idx > fwd_end_idx) break;
  ------------------
  |  Branch (1487:9): [True: 13, False: 969]
  ------------------
 1488|       |
 1489|    969|    set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME,
 1490|    969|                       &ref_frame_info[fwd_end_idx]);
 1491|    969|    ref_flag_list[ref_frame - LAST_FRAME] = 1;
 1492|       |
 1493|    969|    fwd_end_idx--;
 1494|    969|  }
 1495|       |
 1496|       |  // Assign all the remaining frame(s), if any, to the earliest reference
 1497|       |  // frame.
 1498|    279|  for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
  ------------------
  |  Branch (1498:10): [True: 58, False: 221]
  ------------------
 1499|     58|    const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
 1500|     58|    if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
  ------------------
  |  Branch (1500:9): [True: 39, False: 19]
  ------------------
 1501|     19|    set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME,
 1502|     19|                       &ref_frame_info[fwd_start_idx]);
 1503|     19|    ref_flag_list[ref_frame - LAST_FRAME] = 1;
 1504|     19|  }
 1505|       |
 1506|  1.72k|  for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
  ------------------
  |  Branch (1506:19): [True: 1.50k, False: 221]
  ------------------
 1507|       |    assert(ref_flag_list[i] == 1);
 1508|  1.50k|  }
 1509|    221|}
mvref_common.c:setup_ref_mv_list:
  485|  88.0k|    int mi_row, int mi_col, int16_t *mode_context) {
  486|  88.0k|  const int bs = AOMMAX(xd->width, xd->height);
  ------------------
  |  |   35|  88.0k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 29.7k, False: 58.3k]
  |  |  ------------------
  ------------------
  487|  88.0k|  const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs);
  488|  88.0k|  MV_REFERENCE_FRAME rf[2];
  489|       |
  490|  88.0k|  const TileInfo *const tile = &xd->tile;
  491|  88.0k|  int max_row_offset = 0, max_col_offset = 0;
  492|  88.0k|  const int row_adj = (xd->height < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
  ------------------
  |  Branch (492:23): [True: 20.5k, False: 67.5k]
  |  Branch (492:65): [True: 10.1k, False: 10.3k]
  ------------------
  493|  88.0k|  const int col_adj = (xd->width < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
  ------------------
  |  Branch (493:23): [True: 14.7k, False: 73.3k]
  |  Branch (493:64): [True: 7.41k, False: 7.36k]
  ------------------
  494|  88.0k|  int processed_rows = 0;
  495|  88.0k|  int processed_cols = 0;
  496|       |
  497|  88.0k|  av1_set_ref_frame(rf, ref_frame);
  498|  88.0k|  mode_context[ref_frame] = 0;
  499|  88.0k|  *refmv_count = 0;
  500|       |
  501|       |  // Find valid maximum row/col offset.
  502|  88.0k|  if (xd->up_available) {
  ------------------
  |  Branch (502:7): [True: 71.4k, False: 16.6k]
  ------------------
  503|  71.4k|    max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj;
  ------------------
  |  |   21|  71.4k|#define MVREF_ROW_COLS 3
  ------------------
  504|       |
  505|  71.4k|    if (xd->height < mi_size_high[BLOCK_8X8])
  ------------------
  |  Branch (505:9): [True: 19.3k, False: 52.0k]
  ------------------
  506|  19.3k|      max_row_offset = -(2 << 1) + row_adj;
  507|       |
  508|  71.4k|    max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset);
  509|  71.4k|  }
  510|       |
  511|  88.0k|  if (xd->left_available) {
  ------------------
  |  Branch (511:7): [True: 76.7k, False: 11.3k]
  ------------------
  512|  76.7k|    max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj;
  ------------------
  |  |   21|  76.7k|#define MVREF_ROW_COLS 3
  ------------------
  513|       |
  514|  76.7k|    if (xd->width < mi_size_wide[BLOCK_8X8])
  ------------------
  |  Branch (514:9): [True: 14.3k, False: 62.3k]
  ------------------
  515|  14.3k|      max_col_offset = -(2 << 1) + col_adj;
  516|       |
  517|  76.7k|    max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset);
  518|  76.7k|  }
  519|       |
  520|  88.0k|  uint8_t col_match_count = 0;
  521|  88.0k|  uint8_t row_match_count = 0;
  522|  88.0k|  uint8_t newmv_count = 0;
  523|       |
  524|       |  // Scan the first above row mode info. row_offset = -1;
  525|  88.0k|  if (abs(max_row_offset) >= 1)
  ------------------
  |  Branch (525:7): [True: 71.4k, False: 16.6k]
  ------------------
  526|  71.4k|    scan_row_mbmi(cm, xd, mi_col, rf, -1, ref_mv_stack, ref_mv_weight,
  527|  71.4k|                  refmv_count, &row_match_count, &newmv_count, gm_mv_candidates,
  528|  71.4k|                  max_row_offset, &processed_rows);
  529|       |  // Scan the first left column mode info. col_offset = -1;
  530|  88.0k|  if (abs(max_col_offset) >= 1)
  ------------------
  |  Branch (530:7): [True: 76.7k, False: 11.3k]
  ------------------
  531|  76.7k|    scan_col_mbmi(cm, xd, mi_row, rf, -1, ref_mv_stack, ref_mv_weight,
  532|  76.7k|                  refmv_count, &col_match_count, &newmv_count, gm_mv_candidates,
  533|  76.7k|                  max_col_offset, &processed_cols);
  534|       |  // Check top-right boundary
  535|  88.0k|  if (has_tr)
  ------------------
  |  Branch (535:7): [True: 54.5k, False: 33.5k]
  ------------------
  536|  54.5k|    scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->width, ref_mv_stack,
  537|  54.5k|                  ref_mv_weight, &row_match_count, &newmv_count,
  538|  54.5k|                  gm_mv_candidates, refmv_count);
  539|       |
  540|  88.0k|  const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
  541|  88.0k|  const uint8_t nearest_refmv_count = *refmv_count;
  542|       |
  543|       |  // TODO(yunqing): for comp_search, do it for all 3 cases.
  544|   173k|  for (int idx = 0; idx < nearest_refmv_count; ++idx)
  ------------------
  |  Branch (544:21): [True: 85.6k, False: 88.0k]
  ------------------
  545|  85.6k|    ref_mv_weight[idx] += REF_CAT_LEVEL;
  ------------------
  |  |  512|  85.6k|#define REF_CAT_LEVEL 640
  ------------------
  546|       |
  547|  88.0k|  if (cm->features.allow_ref_frame_mvs) {
  ------------------
  |  Branch (547:7): [True: 66.1k, False: 21.9k]
  ------------------
  548|  66.1k|    int is_available = 0;
  549|  66.1k|    const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->height);
  ------------------
  |  |   35|  66.1k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 14.7k, False: 51.3k]
  |  |  ------------------
  ------------------
  550|  66.1k|    const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->width);
  ------------------
  |  |   35|  66.1k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 10.8k, False: 55.2k]
  |  |  ------------------
  ------------------
  551|  66.1k|    const int blk_row_end = AOMMIN(xd->height, mi_size_high[BLOCK_64X64]);
  ------------------
  |  |   34|  66.1k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 62.9k, False: 3.15k]
  |  |  ------------------
  ------------------
  552|  66.1k|    const int blk_col_end = AOMMIN(xd->width, mi_size_wide[BLOCK_64X64]);
  ------------------
  |  |   34|  66.1k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 62.7k, False: 3.38k]
  |  |  ------------------
  ------------------
  553|       |
  554|  66.1k|    const int tpl_sample_pos[3][2] = {
  555|  66.1k|      { voffset, -2 },
  556|  66.1k|      { voffset, hoffset },
  557|  66.1k|      { voffset - 2, hoffset },
  558|  66.1k|    };
  559|  66.1k|    const int allow_extension = (xd->height >= mi_size_high[BLOCK_8X8]) &&
  ------------------
  |  Branch (559:33): [True: 51.3k, False: 14.7k]
  ------------------
  560|  51.3k|                                (xd->height < mi_size_high[BLOCK_64X64]) &&
  ------------------
  |  Branch (560:33): [True: 48.2k, False: 3.15k]
  ------------------
  561|  48.2k|                                (xd->width >= mi_size_wide[BLOCK_8X8]) &&
  ------------------
  |  Branch (561:33): [True: 42.1k, False: 6.07k]
  ------------------
  562|  42.1k|                                (xd->width < mi_size_wide[BLOCK_64X64]);
  ------------------
  |  Branch (562:33): [True: 40.9k, False: 1.18k]
  ------------------
  563|       |
  564|  66.1k|    const int step_h = (xd->height >= mi_size_high[BLOCK_64X64])
  ------------------
  |  Branch (564:24): [True: 3.15k, False: 62.9k]
  ------------------
  565|  66.1k|                           ? mi_size_high[BLOCK_16X16]
  566|  66.1k|                           : mi_size_high[BLOCK_8X8];
  567|  66.1k|    const int step_w = (xd->width >= mi_size_wide[BLOCK_64X64])
  ------------------
  |  Branch (567:24): [True: 3.38k, False: 62.7k]
  ------------------
  568|  66.1k|                           ? mi_size_wide[BLOCK_16X16]
  569|  66.1k|                           : mi_size_wide[BLOCK_8X8];
  570|       |
  571|   177k|    for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) {
  ------------------
  |  Branch (571:27): [True: 111k, False: 66.1k]
  ------------------
  572|   351k|      for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) {
  ------------------
  |  Branch (572:29): [True: 240k, False: 111k]
  ------------------
  573|   240k|        int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row,
  574|   240k|                                 blk_col, gm_mv_candidates, refmv_count,
  575|   240k|                                 ref_mv_stack, ref_mv_weight, mode_context);
  576|   240k|        if (blk_row == 0 && blk_col == 0) is_available = ret;
  ------------------
  |  Branch (576:13): [True: 119k, False: 120k]
  |  Branch (576:29): [True: 66.1k, False: 53.1k]
  ------------------
  577|   240k|      }
  578|   111k|    }
  579|       |
  580|  66.1k|    if (is_available == 0) mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
  ------------------
  |  |  487|  64.3k|#define GLOBALMV_OFFSET 3
  ------------------
  |  Branch (580:9): [True: 64.3k, False: 1.83k]
  ------------------
  581|       |
  582|   188k|    for (int i = 0; i < 3 && allow_extension; ++i) {
  ------------------
  |  Branch (582:21): [True: 147k, False: 40.9k]
  |  Branch (582:30): [True: 122k, False: 25.1k]
  ------------------
  583|   122k|      const int blk_row = tpl_sample_pos[i][0];
  584|   122k|      const int blk_col = tpl_sample_pos[i][1];
  585|       |
  586|   122k|      if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue;
  ------------------
  |  Branch (586:11): [True: 40.0k, False: 82.7k]
  ------------------
  587|  82.7k|      add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col,
  588|  82.7k|                     gm_mv_candidates, refmv_count, ref_mv_stack, ref_mv_weight,
  589|  82.7k|                     mode_context);
  590|  82.7k|    }
  591|  66.1k|  }
  592|       |
  593|  88.0k|  uint8_t dummy_newmv_count = 0;
  594|       |
  595|       |  // Scan the second outer area.
  596|  88.0k|  scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack, ref_mv_weight,
  597|  88.0k|                &row_match_count, &dummy_newmv_count, gm_mv_candidates,
  598|  88.0k|                refmv_count);
  599|       |
  600|   264k|  for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) {
  ------------------
  |  |   21|   264k|#define MVREF_ROW_COLS 3
  ------------------
  |  Branch (600:21): [True: 176k, False: 88.0k]
  ------------------
  601|   176k|    const int row_offset = -(idx << 1) + 1 + row_adj;
  602|   176k|    const int col_offset = -(idx << 1) + 1 + col_adj;
  603|       |
  604|   176k|    if (abs(row_offset) <= abs(max_row_offset) &&
  ------------------
  |  Branch (604:9): [True: 107k, False: 68.5k]
  ------------------
  605|   107k|        abs(row_offset) > processed_rows)
  ------------------
  |  Branch (605:9): [True: 75.8k, False: 31.8k]
  ------------------
  606|  75.8k|      scan_row_mbmi(cm, xd, mi_col, rf, row_offset, ref_mv_stack, ref_mv_weight,
  607|  75.8k|                    refmv_count, &row_match_count, &dummy_newmv_count,
  608|  75.8k|                    gm_mv_candidates, max_row_offset, &processed_rows);
  609|       |
  610|   176k|    if (abs(col_offset) <= abs(max_col_offset) &&
  ------------------
  |  Branch (610:9): [True: 128k, False: 47.2k]
  ------------------
  611|   128k|        abs(col_offset) > processed_cols)
  ------------------
  |  Branch (611:9): [True: 86.5k, False: 42.3k]
  ------------------
  612|  86.5k|      scan_col_mbmi(cm, xd, mi_row, rf, col_offset, ref_mv_stack, ref_mv_weight,
  613|  86.5k|                    refmv_count, &col_match_count, &dummy_newmv_count,
  614|  86.5k|                    gm_mv_candidates, max_col_offset, &processed_cols);
  615|   176k|  }
  616|       |
  617|  88.0k|  const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
  618|       |
  619|  88.0k|  switch (nearest_match) {
  620|  25.8k|    case 0:
  ------------------
  |  Branch (620:5): [True: 25.8k, False: 62.2k]
  ------------------
  621|  25.8k|      if (ref_match_count >= 1) mode_context[ref_frame] |= 1;
  ------------------
  |  Branch (621:11): [True: 3.62k, False: 22.2k]
  ------------------
  622|  25.8k|      if (ref_match_count == 1)
  ------------------
  |  Branch (622:11): [True: 3.16k, False: 22.7k]
  ------------------
  623|  3.16k|        mode_context[ref_frame] |= (1 << REFMV_OFFSET);
  ------------------
  |  |  488|  3.16k|#define REFMV_OFFSET 4
  ------------------
  624|  22.7k|      else if (ref_match_count >= 2)
  ------------------
  |  Branch (624:16): [True: 459, False: 22.2k]
  ------------------
  625|    459|        mode_context[ref_frame] |= (2 << REFMV_OFFSET);
  ------------------
  |  |  488|    459|#define REFMV_OFFSET 4
  ------------------
  626|  25.8k|      break;
  627|  34.8k|    case 1:
  ------------------
  |  Branch (627:5): [True: 34.8k, False: 53.1k]
  ------------------
  628|  34.8k|      mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3;
  ------------------
  |  Branch (628:34): [True: 13.6k, False: 21.2k]
  ------------------
  629|  34.8k|      if (ref_match_count == 1)
  ------------------
  |  Branch (629:11): [True: 27.4k, False: 7.42k]
  ------------------
  630|  27.4k|        mode_context[ref_frame] |= (3 << REFMV_OFFSET);
  ------------------
  |  |  488|  27.4k|#define REFMV_OFFSET 4
  ------------------
  631|  7.42k|      else if (ref_match_count >= 2)
  ------------------
  |  Branch (631:16): [True: 7.42k, False: 18.4E]
  ------------------
  632|  7.42k|        mode_context[ref_frame] |= (4 << REFMV_OFFSET);
  ------------------
  |  |  488|  7.42k|#define REFMV_OFFSET 4
  ------------------
  633|  34.8k|      break;
  634|  27.3k|    case 2:
  ------------------
  |  Branch (634:5): [True: 27.3k, False: 60.7k]
  ------------------
  635|  27.3k|    default:
  ------------------
  |  Branch (635:5): [True: 0, False: 88.0k]
  ------------------
  636|  27.3k|      if (newmv_count >= 1)
  ------------------
  |  Branch (636:11): [True: 14.2k, False: 13.0k]
  ------------------
  637|  14.2k|        mode_context[ref_frame] |= 4;
  638|  13.0k|      else
  639|  13.0k|        mode_context[ref_frame] |= 5;
  640|       |
  641|  27.3k|      mode_context[ref_frame] |= (5 << REFMV_OFFSET);
  ------------------
  |  |  488|  27.3k|#define REFMV_OFFSET 4
  ------------------
  642|  27.3k|      break;
  643|  88.0k|  }
  644|       |
  645|       |  // Rank the likelihood and assign nearest and near mvs.
  646|  88.0k|  int len = nearest_refmv_count;
  647|   158k|  while (len > 0) {
  ------------------
  |  Branch (647:10): [True: 70.6k, False: 88.0k]
  ------------------
  648|  70.6k|    int nr_len = 0;
  649|  95.9k|    for (int idx = 1; idx < len; ++idx) {
  ------------------
  |  Branch (649:23): [True: 25.3k, False: 70.6k]
  ------------------
  650|  25.3k|      if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) {
  ------------------
  |  Branch (650:11): [True: 9.17k, False: 16.1k]
  ------------------
  651|  9.17k|        const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1];
  652|  9.17k|        const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1];
  653|  9.17k|        ref_mv_stack[idx - 1] = ref_mv_stack[idx];
  654|  9.17k|        ref_mv_stack[idx] = tmp_mv;
  655|  9.17k|        ref_mv_weight[idx - 1] = ref_mv_weight[idx];
  656|  9.17k|        ref_mv_weight[idx] = tmp_ref_mv_weight;
  657|  9.17k|        nr_len = idx;
  658|  9.17k|      }
  659|  25.3k|    }
  660|  70.6k|    len = nr_len;
  661|  70.6k|  }
  662|       |
  663|  88.0k|  len = *refmv_count;
  664|   118k|  while (len > nearest_refmv_count) {
  ------------------
  |  Branch (664:10): [True: 30.4k, False: 88.0k]
  ------------------
  665|  30.4k|    int nr_len = nearest_refmv_count;
  666|  40.8k|    for (int idx = nearest_refmv_count + 1; idx < len; ++idx) {
  ------------------
  |  Branch (666:45): [True: 10.4k, False: 30.4k]
  ------------------
  667|  10.4k|      if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) {
  ------------------
  |  Branch (667:11): [True: 2.41k, False: 8.03k]
  ------------------
  668|  2.41k|        const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1];
  669|  2.41k|        const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1];
  670|  2.41k|        ref_mv_stack[idx - 1] = ref_mv_stack[idx];
  671|  2.41k|        ref_mv_stack[idx] = tmp_mv;
  672|  2.41k|        ref_mv_weight[idx - 1] = ref_mv_weight[idx];
  673|  2.41k|        ref_mv_weight[idx] = tmp_ref_mv_weight;
  674|  2.41k|        nr_len = idx;
  675|  2.41k|      }
  676|  10.4k|    }
  677|  30.4k|    len = nr_len;
  678|  30.4k|  }
  679|       |
  680|  88.0k|  int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->width);
  ------------------
  |  |   34|  88.0k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 162, False: 87.9k]
  |  |  ------------------
  ------------------
  681|  88.0k|  mi_width = AOMMIN(mi_width, cm->mi_params.mi_cols - mi_col);
  ------------------
  |  |   34|  88.0k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 82.9k, False: 5.11k]
  |  |  ------------------
  ------------------
  682|  88.0k|  int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->height);
  ------------------
  |  |   34|  88.0k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 172, False: 87.9k]
  |  |  ------------------
  ------------------
  683|  88.0k|  mi_height = AOMMIN(mi_height, cm->mi_params.mi_rows - mi_row);
  ------------------
  |  |   34|  88.0k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 80.0k, False: 8.08k]
  |  |  ------------------
  ------------------
  684|  88.0k|  const int mi_size = AOMMIN(mi_width, mi_height);
  ------------------
  |  |   34|  88.0k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 19.4k, False: 68.6k]
  |  |  ------------------
  ------------------
  685|  88.0k|  if (rf[1] > NONE_FRAME) {
  ------------------
  |  Branch (685:7): [True: 12.0k, False: 76.0k]
  ------------------
  686|       |    // TODO(jingning, yunqing): Refactor and consolidate the compound and
  687|       |    // single reference frame modes. Reduce unnecessary redundancy.
  688|  12.0k|    if (*refmv_count < MAX_MV_REF_CANDIDATES) {
  ------------------
  |  |  508|  12.0k|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (688:9): [True: 10.1k, False: 1.82k]
  ------------------
  689|  10.1k|      int_mv ref_id[2][2], ref_diff[2][2];
  690|  10.1k|      int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 };
  691|       |
  692|  18.0k|      for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) {
  ------------------
  |  Branch (692:25): [True: 15.2k, False: 2.80k]
  |  Branch (692:53): [True: 7.91k, False: 7.37k]
  ------------------
  693|  7.91k|        const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
  694|  7.91k|        process_compound_ref_mv_candidate(
  695|  7.91k|            candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
  696|  7.91k|        idx += mi_size_wide[candidate->bsize];
  697|  7.91k|      }
  698|       |
  699|  19.4k|      for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) {
  ------------------
  |  Branch (699:25): [True: 17.7k, False: 1.69k]
  |  Branch (699:53): [True: 9.28k, False: 8.48k]
  ------------------
  700|  9.28k|        const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
  701|  9.28k|        process_compound_ref_mv_candidate(
  702|  9.28k|            candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
  703|  9.28k|        idx += mi_size_high[candidate->bsize];
  704|  9.28k|      }
  705|       |
  706|       |      // Build up the compound mv predictor
  707|  10.1k|      int_mv comp_list[MAX_MV_REF_CANDIDATES][2];
  708|       |
  709|  30.5k|      for (int idx = 0; idx < 2; ++idx) {
  ------------------
  |  Branch (709:25): [True: 20.3k, False: 10.1k]
  ------------------
  710|  20.3k|        int comp_idx = 0;
  711|  20.3k|        for (int list_idx = 0;
  712|  39.0k|             list_idx < ref_id_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES;
  ------------------
  |  |  508|  18.6k|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (712:14): [True: 18.6k, False: 20.3k]
  |  Branch (712:46): [True: 18.6k, False: 0]
  ------------------
  713|  20.3k|             ++list_idx, ++comp_idx)
  714|  18.6k|          comp_list[comp_idx][idx] = ref_id[idx][list_idx];
  715|  20.3k|        for (int list_idx = 0;
  716|  36.9k|             list_idx < ref_diff_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES;
  ------------------
  |  |  508|  25.6k|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (716:14): [True: 25.6k, False: 11.3k]
  |  Branch (716:48): [True: 16.6k, False: 9.00k]
  ------------------
  717|  20.3k|             ++list_idx, ++comp_idx)
  718|  16.6k|          comp_list[comp_idx][idx] = ref_diff[idx][list_idx];
  719|  25.7k|        for (; comp_idx < MAX_MV_REF_CANDIDATES; ++comp_idx)
  ------------------
  |  |  508|  25.7k|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (719:16): [True: 5.42k, False: 20.3k]
  ------------------
  720|  5.42k|          comp_list[comp_idx][idx] = gm_mv_candidates[idx];
  721|  20.3k|      }
  722|       |
  723|  10.1k|      if (*refmv_count) {
  ------------------
  |  Branch (723:11): [True: 4.72k, False: 5.45k]
  ------------------
  724|  4.72k|        assert(*refmv_count == 1);
  725|  4.72k|        if (comp_list[0][0].as_int == ref_mv_stack[0].this_mv.as_int &&
  ------------------
  |  Branch (725:13): [True: 4.11k, False: 611]
  ------------------
  726|  4.11k|            comp_list[0][1].as_int == ref_mv_stack[0].comp_mv.as_int) {
  ------------------
  |  Branch (726:13): [True: 3.73k, False: 381]
  ------------------
  727|  3.73k|          ref_mv_stack[*refmv_count].this_mv = comp_list[1][0];
  728|  3.73k|          ref_mv_stack[*refmv_count].comp_mv = comp_list[1][1];
  729|  3.73k|        } else {
  730|    992|          ref_mv_stack[*refmv_count].this_mv = comp_list[0][0];
  731|    992|          ref_mv_stack[*refmv_count].comp_mv = comp_list[0][1];
  732|    992|        }
  733|  4.72k|        ref_mv_weight[*refmv_count] = 2;
  734|  4.72k|        ++*refmv_count;
  735|  5.45k|      } else {
  736|  16.3k|        for (int idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) {
  ------------------
  |  |  508|  16.3k|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (736:27): [True: 10.9k, False: 5.45k]
  ------------------
  737|  10.9k|          ref_mv_stack[*refmv_count].this_mv = comp_list[idx][0];
  738|  10.9k|          ref_mv_stack[*refmv_count].comp_mv = comp_list[idx][1];
  739|  10.9k|          ref_mv_weight[*refmv_count] = 2;
  740|  10.9k|          ++*refmv_count;
  741|  10.9k|        }
  742|  5.45k|      }
  743|  10.1k|    }
  744|       |
  745|  12.0k|    assert(*refmv_count >= 2);
  746|       |
  747|  36.4k|    for (int idx = 0; idx < *refmv_count; ++idx) {
  ------------------
  |  Branch (747:23): [True: 24.4k, False: 12.0k]
  ------------------
  748|  24.4k|      clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2,
  ------------------
  |  |   39|  24.4k|#define MI_SIZE_LOG2 2
  ------------------
  749|  24.4k|                   xd->height << MI_SIZE_LOG2, xd);
  ------------------
  |  |   39|  24.4k|#define MI_SIZE_LOG2 2
  ------------------
  750|  24.4k|      clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv, xd->width << MI_SIZE_LOG2,
  ------------------
  |  |   39|  24.4k|#define MI_SIZE_LOG2 2
  ------------------
  751|  24.4k|                   xd->height << MI_SIZE_LOG2, xd);
  ------------------
  |  |   39|  24.4k|#define MI_SIZE_LOG2 2
  ------------------
  752|  24.4k|    }
  753|  76.0k|  } else {
  754|       |    // Handle single reference frame extension
  755|   109k|    for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size &&
  ------------------
  |  Branch (755:23): [True: 95.4k, False: 13.7k]
  |  Branch (755:51): [True: 63.9k, False: 31.5k]
  ------------------
  756|  63.9k|                      *refmv_count < MAX_MV_REF_CANDIDATES;) {
  ------------------
  |  |  508|  63.9k|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (756:23): [True: 33.0k, False: 30.8k]
  ------------------
  757|  33.0k|      const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
  758|  33.0k|      process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
  759|  33.0k|                                      ref_mv_stack, ref_mv_weight);
  760|  33.0k|      idx += mi_size_wide[candidate->bsize];
  761|  33.0k|    }
  762|       |
  763|   108k|    for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size &&
  ------------------
  |  Branch (763:23): [True: 99.2k, False: 9.60k]
  |  Branch (763:51): [True: 68.4k, False: 30.7k]
  ------------------
  764|  68.4k|                      *refmv_count < MAX_MV_REF_CANDIDATES;) {
  ------------------
  |  |  508|  68.4k|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (764:23): [True: 32.7k, False: 35.7k]
  ------------------
  765|  32.7k|      const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
  766|  32.7k|      process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
  767|  32.7k|                                      ref_mv_stack, ref_mv_weight);
  768|  32.7k|      idx += mi_size_high[candidate->bsize];
  769|  32.7k|    }
  770|       |
  771|   208k|    for (int idx = 0; idx < *refmv_count; ++idx) {
  ------------------
  |  Branch (771:23): [True: 132k, False: 76.0k]
  ------------------
  772|   132k|      clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2,
  ------------------
  |  |   39|   132k|#define MI_SIZE_LOG2 2
  ------------------
  773|   132k|                   xd->height << MI_SIZE_LOG2, xd);
  ------------------
  |  |   39|   132k|#define MI_SIZE_LOG2 2
  ------------------
  774|   132k|    }
  775|       |
  776|  76.0k|    if (mv_ref_list != NULL) {
  ------------------
  |  Branch (776:9): [True: 76.0k, False: 18.4E]
  ------------------
  777|   118k|      for (int idx = *refmv_count; idx < MAX_MV_REF_CANDIDATES; ++idx)
  ------------------
  |  |  508|   118k|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (777:36): [True: 42.8k, False: 76.0k]
  ------------------
  778|  42.8k|        mv_ref_list[idx].as_int = gm_mv_candidates[0].as_int;
  779|       |
  780|   185k|      for (int idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *refmv_count);
  ------------------
  |  |   34|   185k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 44.2k, False: 141k]
  |  |  ------------------
  ------------------
  |  Branch (780:25): [True: 109k, False: 76.0k]
  ------------------
  781|   109k|           ++idx) {
  782|   109k|        mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
  783|   109k|      }
  784|  76.0k|    }
  785|  76.0k|  }
  786|  88.0k|}
mvref_common.c:scan_row_mbmi:
  149|   147k|                                 int *processed_rows) {
  150|   147k|  int end_mi = AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
  ------------------
  |  |   34|   147k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 142k, False: 4.59k]
  |  |  ------------------
  ------------------
  151|   147k|  end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]);
  ------------------
  |  |   34|   147k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 144k, False: 2.42k]
  |  |  ------------------
  ------------------
  152|   147k|  const int width_8x8 = mi_size_wide[BLOCK_8X8];
  153|   147k|  const int width_16x16 = mi_size_wide[BLOCK_16X16];
  154|   147k|  int col_offset = 0;
  155|       |  // TODO(jingning): Revisit this part after cb4x4 is stable.
  156|   147k|  if (abs(row_offset) > 1) {
  ------------------
  |  Branch (156:7): [True: 75.8k, False: 71.3k]
  ------------------
  157|  75.8k|    col_offset = 1;
  158|  75.8k|    if ((mi_col & 0x01) && xd->width < width_8x8) --col_offset;
  ------------------
  |  Branch (158:9): [True: 8.85k, False: 66.9k]
  |  Branch (158:28): [True: 8.85k, False: 0]
  ------------------
  159|  75.8k|  }
  160|   147k|  const int use_step_16 = (xd->width >= 16);
  161|   147k|  MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride;
  162|       |
  163|   313k|  for (int i = 0; i < end_mi;) {
  ------------------
  |  Branch (163:19): [True: 166k, False: 147k]
  ------------------
  164|   166k|    const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i];
  165|   166k|    const int candidate_bsize = candidate->bsize;
  166|   166k|    const int n4_w = mi_size_wide[candidate_bsize];
  167|   166k|    int len = AOMMIN(xd->width, n4_w);
  ------------------
  |  |   34|   166k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 54.1k, False: 112k]
  |  |  ------------------
  ------------------
  168|   166k|    if (use_step_16)
  ------------------
  |  Branch (168:9): [True: 3.33k, False: 163k]
  ------------------
  169|  3.33k|      len = AOMMAX(width_16x16, len);
  ------------------
  |  |   35|  3.33k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 326, False: 3.01k]
  |  |  ------------------
  ------------------
  170|   163k|    else if (abs(row_offset) > 1)
  ------------------
  |  Branch (170:14): [True: 85.6k, False: 77.8k]
  ------------------
  171|  85.6k|      len = AOMMAX(len, width_8x8);
  ------------------
  |  |   35|  85.6k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 23.3k, False: 62.2k]
  |  |  ------------------
  ------------------
  172|       |
  173|   166k|    uint16_t weight = 2;
  174|   166k|    if (xd->width >= width_8x8 && xd->width <= n4_w) {
  ------------------
  |  Branch (174:9): [True: 135k, False: 31.0k]
  |  Branch (174:35): [True: 96.9k, False: 38.7k]
  ------------------
  175|  96.9k|      uint16_t inc = AOMMIN(-max_row_offset + row_offset + 1,
  ------------------
  |  |   34|  96.9k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 22.6k, False: 74.3k]
  |  |  ------------------
  ------------------
  176|  96.9k|                            mi_size_high[candidate_bsize]);
  177|       |      // Obtain range used in weight calculation.
  178|  96.9k|      weight = AOMMAX(weight, inc);
  ------------------
  |  |   35|  96.9k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 19.3k, False: 77.6k]
  |  |  ------------------
  ------------------
  179|       |      // Update processed rows.
  180|  96.9k|      *processed_rows = inc - row_offset - 1;
  181|  96.9k|    }
  182|       |
  183|   166k|    add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
  184|   166k|                         newmv_count, ref_mv_stack, ref_mv_weight,
  185|   166k|                         gm_mv_candidates, cm->global_motion, len * weight);
  186|       |
  187|   166k|    i += len;
  188|   166k|  }
  189|   147k|}
mvref_common.c:add_ref_mv_candidate:
   80|   451k|    uint16_t weight) {
   81|   451k|  if (!is_inter_block(candidate)) return;
  ------------------
  |  Branch (81:7): [True: 76.6k, False: 374k]
  ------------------
   82|   451k|  assert(weight % 2 == 0);
   83|   374k|  int index, ref;
   84|       |
   85|   374k|  if (rf[1] == NONE_FRAME) {
  ------------------
  |  Branch (85:7): [True: 321k, False: 53.9k]
  ------------------
   86|       |    // single reference frame
   87|   963k|    for (ref = 0; ref < 2; ++ref) {
  ------------------
  |  Branch (87:19): [True: 642k, False: 321k]
  ------------------
   88|   642k|      if (candidate->ref_frame[ref] == rf[0]) {
  ------------------
  |  Branch (88:11): [True: 216k, False: 425k]
  ------------------
   89|   216k|        const int is_gm_block =
   90|   216k|            is_global_mv_block(candidate, gm_params[rf[0]].wmtype);
   91|   216k|        const int_mv this_refmv =
   92|   216k|            is_gm_block ? gm_mv_candidates[0] : get_block_mv(candidate, ref);
  ------------------
  |  Branch (92:13): [True: 810, False: 215k]
  ------------------
   93|   337k|        for (index = 0; index < *refmv_count; ++index) {
  ------------------
  |  Branch (93:25): [True: 223k, False: 113k]
  ------------------
   94|   223k|          if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) {
  ------------------
  |  Branch (94:15): [True: 102k, False: 120k]
  ------------------
   95|   102k|            ref_mv_weight[index] += weight;
   96|   102k|            break;
   97|   102k|          }
   98|   223k|        }
   99|       |
  100|       |        // Add a new item to the list.
  101|   216k|        if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
  ------------------
  |  |  510|   113k|#define MAX_REF_MV_STACK_SIZE 8
  ------------------
  |  Branch (101:13): [True: 113k, False: 102k]
  |  Branch (101:38): [True: 113k, False: 46]
  ------------------
  102|   113k|          ref_mv_stack[index].this_mv = this_refmv;
  103|   113k|          ref_mv_weight[index] = weight;
  104|   113k|          ++(*refmv_count);
  105|   113k|        }
  106|   216k|        if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
  ------------------
  |  Branch (106:13): [True: 70.1k, False: 146k]
  ------------------
  107|   216k|        ++*ref_match_count;
  108|   216k|      }
  109|   642k|    }
  110|   321k|  } else {
  111|       |    // compound reference frame
  112|  53.9k|    if (candidate->ref_frame[0] == rf[0] && candidate->ref_frame[1] == rf[1]) {
  ------------------
  |  Branch (112:9): [True: 27.8k, False: 26.0k]
  |  Branch (112:45): [True: 14.5k, False: 13.3k]
  ------------------
  113|  14.5k|      int_mv this_refmv[2];
  114|       |
  115|  43.6k|      for (ref = 0; ref < 2; ++ref) {
  ------------------
  |  Branch (115:21): [True: 29.0k, False: 14.5k]
  ------------------
  116|  29.0k|        if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype))
  ------------------
  |  Branch (116:13): [True: 510, False: 28.5k]
  ------------------
  117|    510|          this_refmv[ref] = gm_mv_candidates[ref];
  118|  28.5k|        else
  119|  28.5k|          this_refmv[ref] = get_block_mv(candidate, ref);
  120|  29.0k|      }
  121|       |
  122|  18.1k|      for (index = 0; index < *refmv_count; ++index) {
  ------------------
  |  Branch (122:23): [True: 9.49k, False: 8.63k]
  ------------------
  123|  9.49k|        if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) &&
  ------------------
  |  Branch (123:13): [True: 6.87k, False: 2.62k]
  ------------------
  124|  6.87k|            (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) {
  ------------------
  |  Branch (124:13): [True: 5.90k, False: 973]
  ------------------
  125|  5.90k|          ref_mv_weight[index] += weight;
  126|  5.90k|          break;
  127|  5.90k|        }
  128|  9.49k|      }
  129|       |
  130|       |      // Add a new item to the list.
  131|  14.5k|      if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
  ------------------
  |  |  510|  8.63k|#define MAX_REF_MV_STACK_SIZE 8
  ------------------
  |  Branch (131:11): [True: 8.63k, False: 5.90k]
  |  Branch (131:36): [True: 8.63k, False: 0]
  ------------------
  132|  8.63k|        ref_mv_stack[index].this_mv = this_refmv[0];
  133|  8.63k|        ref_mv_stack[index].comp_mv = this_refmv[1];
  134|  8.63k|        ref_mv_weight[index] = weight;
  135|  8.63k|        ++(*refmv_count);
  136|  8.63k|      }
  137|  14.5k|      if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
  ------------------
  |  Branch (137:11): [True: 4.55k, False: 9.98k]
  ------------------
  138|  14.5k|      ++*ref_match_count;
  139|  14.5k|    }
  140|  53.9k|  }
  141|   374k|}
mvref_common.c:scan_col_mbmi:
  197|   163k|                                 int *processed_cols) {
  198|   163k|  int end_mi = AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row);
  ------------------
  |  |   34|   163k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 151k, False: 11.3k]
  |  |  ------------------
  ------------------
  199|   163k|  end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]);
  ------------------
  |  |   34|   163k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 160k, False: 2.59k]
  |  |  ------------------
  ------------------
  200|   163k|  const int n8_h_8 = mi_size_high[BLOCK_8X8];
  201|   163k|  const int n8_h_16 = mi_size_high[BLOCK_16X16];
  202|   163k|  int i;
  203|   163k|  int row_offset = 0;
  204|   163k|  if (abs(col_offset) > 1) {
  ------------------
  |  Branch (204:7): [True: 86.5k, False: 76.7k]
  ------------------
  205|  86.5k|    row_offset = 1;
  206|  86.5k|    if ((mi_row & 0x01) && xd->height < n8_h_8) --row_offset;
  ------------------
  |  Branch (206:9): [True: 15.0k, False: 71.4k]
  |  Branch (206:28): [True: 15.0k, False: 0]
  ------------------
  207|  86.5k|  }
  208|   163k|  const int use_step_16 = (xd->height >= 16);
  209|       |
  210|   347k|  for (i = 0; i < end_mi;) {
  ------------------
  |  Branch (210:15): [True: 183k, False: 163k]
  ------------------
  211|   183k|    const MB_MODE_INFO *const candidate =
  212|   183k|        xd->mi[(row_offset + i) * xd->mi_stride + col_offset];
  213|   183k|    const int candidate_bsize = candidate->bsize;
  214|   183k|    const int n4_h = mi_size_high[candidate_bsize];
  215|   183k|    int len = AOMMIN(xd->height, n4_h);
  ------------------
  |  |   34|   183k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 70.1k, False: 113k]
  |  |  ------------------
  ------------------
  216|   183k|    if (use_step_16)
  ------------------
  |  Branch (216:9): [True: 3.81k, False: 180k]
  ------------------
  217|  3.81k|      len = AOMMAX(n8_h_16, len);
  ------------------
  |  |   35|  3.81k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 703, False: 3.11k]
  |  |  ------------------
  ------------------
  218|   180k|    else if (abs(col_offset) > 1)
  ------------------
  |  Branch (218:14): [True: 95.8k, False: 84.2k]
  ------------------
  219|  95.8k|      len = AOMMAX(len, n8_h_8);
  ------------------
  |  |   35|  95.8k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 16.5k, False: 79.3k]
  |  |  ------------------
  ------------------
  220|       |
  221|   183k|    int weight = 2;
  222|   183k|    if (xd->height >= n8_h_8 && xd->height <= n4_h) {
  ------------------
  |  Branch (222:9): [True: 133k, False: 49.9k]
  |  Branch (222:33): [True: 91.2k, False: 42.6k]
  ------------------
  223|  91.2k|      int inc = AOMMIN(-max_col_offset + col_offset + 1,
  ------------------
  |  |   34|  91.2k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 28.7k, False: 62.5k]
  |  |  ------------------
  ------------------
  224|  91.2k|                       mi_size_wide[candidate_bsize]);
  225|       |      // Obtain range used in weight calculation.
  226|  91.2k|      weight = AOMMAX(weight, inc);
  ------------------
  |  |   35|  91.2k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 10.8k, False: 80.4k]
  |  |  ------------------
  ------------------
  227|       |      // Update processed cols.
  228|  91.2k|      *processed_cols = inc - col_offset - 1;
  229|  91.2k|    }
  230|       |
  231|   183k|    add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
  232|   183k|                         newmv_count, ref_mv_stack, ref_mv_weight,
  233|   183k|                         gm_mv_candidates, cm->global_motion, len * weight);
  234|       |
  235|   183k|    i += len;
  236|   183k|  }
  237|   163k|}
mvref_common.c:scan_blk_mbmi:
  246|   142k|                                 uint8_t *refmv_count) {
  247|   142k|  const TileInfo *const tile = &xd->tile;
  248|   142k|  POSITION mi_pos;
  249|       |
  250|   142k|  mi_pos.row = row_offset;
  251|   142k|  mi_pos.col = col_offset;
  252|       |
  253|   142k|  if (is_inside(tile, mi_col, mi_row, &mi_pos)) {
  ------------------
  |  Branch (253:7): [True: 101k, False: 41.5k]
  ------------------
  254|   101k|    const MB_MODE_INFO *const candidate =
  255|   101k|        xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
  256|   101k|    const int len = mi_size_wide[BLOCK_8X8];
  257|       |
  258|   101k|    add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
  259|   101k|                         newmv_count, ref_mv_stack, ref_mv_weight,
  260|   101k|                         gm_mv_candidates, cm->global_motion, 2 * len);
  261|   101k|  }  // Analyze a single 8x8 block motion information.
  262|   142k|}
mvref_common.c:add_tpl_ref_mv:
  335|   322k|                          int16_t *mode_context) {
  336|   322k|  POSITION mi_pos;
  337|   322k|  mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
  ------------------
  |  Branch (337:16): [True: 9.72k, False: 313k]
  ------------------
  338|   322k|  mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1;
  ------------------
  |  Branch (338:16): [True: 6.54k, False: 316k]
  ------------------
  339|       |
  340|   322k|  if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0;
  ------------------
  |  Branch (340:7): [True: 15.6k, False: 307k]
  ------------------
  341|       |
  342|   307k|  const TPL_MV_REF *prev_frame_mvs =
  343|   307k|      cm->tpl_mvs +
  344|   307k|      ((mi_row + mi_pos.row) >> 1) * (cm->mi_params.mi_stride >> 1) +
  345|   307k|      ((mi_col + mi_pos.col) >> 1);
  346|   307k|  if (prev_frame_mvs->mfmv0.as_int == INVALID_MV) return 0;
  ------------------
  |  |   26|   307k|#define INVALID_MV 0x80008000
  ------------------
  |  Branch (346:7): [True: 297k, False: 9.66k]
  ------------------
  347|       |
  348|  9.66k|  MV_REFERENCE_FRAME rf[2];
  349|  9.66k|  av1_set_ref_frame(rf, ref_frame);
  350|       |
  351|  9.66k|  const uint16_t weight_unit = 1;  // mi_size_wide[BLOCK_8X8];
  352|  9.66k|  const int cur_frame_index = cm->cur_frame->order_hint;
  353|  9.66k|  const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]);
  354|  9.66k|  const int frame0_index = buf_0->order_hint;
  355|  9.66k|  const int cur_offset_0 = get_relative_dist(&cm->seq_params->order_hint_info,
  356|  9.66k|                                             cur_frame_index, frame0_index);
  357|  9.66k|  int idx;
  358|  9.66k|  const int allow_high_precision_mv = cm->features.allow_high_precision_mv;
  359|  9.66k|  const int force_integer_mv = cm->features.cur_frame_force_integer_mv;
  360|       |
  361|  9.66k|  int_mv this_refmv;
  362|  9.66k|  av1_get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
  363|  9.66k|                        cur_offset_0, prev_frame_mvs->ref_frame_offset);
  364|  9.66k|  lower_mv_precision(&this_refmv.as_mv, allow_high_precision_mv,
  365|  9.66k|                     force_integer_mv);
  366|       |
  367|  9.66k|  if (rf[1] == NONE_FRAME) {
  ------------------
  |  Branch (367:7): [True: 7.93k, False: 1.72k]
  ------------------
  368|  7.93k|    if (blk_row == 0 && blk_col == 0) {
  ------------------
  |  Branch (368:9): [True: 3.17k, False: 4.75k]
  |  Branch (368:25): [True: 1.54k, False: 1.63k]
  ------------------
  369|  1.54k|      if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
  ------------------
  |  Branch (369:11): [True: 10, False: 1.53k]
  ------------------
  370|  1.53k|          abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16)
  ------------------
  |  Branch (370:11): [True: 4, False: 1.53k]
  ------------------
  371|     14|        mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
  ------------------
  |  |  487|     14|#define GLOBALMV_OFFSET 3
  ------------------
  372|  1.54k|    }
  373|       |
  374|  10.8k|    for (idx = 0; idx < *refmv_count; ++idx)
  ------------------
  |  Branch (374:19): [True: 9.71k, False: 1.11k]
  ------------------
  375|  9.71k|      if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break;
  ------------------
  |  Branch (375:11): [True: 6.81k, False: 2.89k]
  ------------------
  376|       |
  377|  7.93k|    if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit;
  ------------------
  |  Branch (377:9): [True: 6.81k, False: 1.11k]
  ------------------
  378|       |
  379|  7.93k|    if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
  ------------------
  |  |  510|  1.11k|#define MAX_REF_MV_STACK_SIZE 8
  ------------------
  |  Branch (379:9): [True: 1.11k, False: 6.81k]
  |  Branch (379:32): [True: 1.11k, False: 0]
  ------------------
  380|  1.11k|      ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
  381|  1.11k|      ref_mv_weight[idx] = 2 * weight_unit;
  382|  1.11k|      ++(*refmv_count);
  383|  1.11k|    }
  384|  7.93k|  } else {
  385|       |    // Process compound inter mode
  386|  1.72k|    const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]);
  387|  1.72k|    const int frame1_index = buf_1->order_hint;
  388|  1.72k|    const int cur_offset_1 = get_relative_dist(&cm->seq_params->order_hint_info,
  389|  1.72k|                                               cur_frame_index, frame1_index);
  390|  1.72k|    int_mv comp_refmv;
  391|  1.72k|    av1_get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
  392|  1.72k|                          cur_offset_1, prev_frame_mvs->ref_frame_offset);
  393|  1.72k|    lower_mv_precision(&comp_refmv.as_mv, allow_high_precision_mv,
  394|  1.72k|                       force_integer_mv);
  395|       |
  396|  1.72k|    if (blk_row == 0 && blk_col == 0) {
  ------------------
  |  Branch (396:9): [True: 606, False: 1.12k]
  |  Branch (396:25): [True: 292, False: 314]
  ------------------
  397|    292|      if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
  ------------------
  |  Branch (397:11): [True: 2, False: 290]
  ------------------
  398|    290|          abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 ||
  ------------------
  |  Branch (398:11): [True: 0, False: 290]
  ------------------
  399|    290|          abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 ||
  ------------------
  |  Branch (399:11): [True: 0, False: 290]
  ------------------
  400|    290|          abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16)
  ------------------
  |  Branch (400:11): [True: 0, False: 290]
  ------------------
  401|      2|        mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
  ------------------
  |  |  487|      2|#define GLOBALMV_OFFSET 3
  ------------------
  402|    292|    }
  403|       |
  404|  2.27k|    for (idx = 0; idx < *refmv_count; ++idx) {
  ------------------
  |  Branch (404:19): [True: 2.10k, False: 170]
  ------------------
  405|  2.10k|      if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int &&
  ------------------
  |  Branch (405:11): [True: 1.65k, False: 452]
  ------------------
  406|  1.65k|          comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int)
  ------------------
  |  Branch (406:11): [True: 1.55k, False: 92]
  ------------------
  407|  1.55k|        break;
  408|  2.10k|    }
  409|       |
  410|  1.72k|    if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit;
  ------------------
  |  Branch (410:9): [True: 1.55k, False: 170]
  ------------------
  411|       |
  412|  1.72k|    if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
  ------------------
  |  |  510|    228|#define MAX_REF_MV_STACK_SIZE 8
  ------------------
  |  Branch (412:9): [True: 228, False: 1.50k]
  |  Branch (412:32): [True: 228, False: 0]
  ------------------
  413|    228|      ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
  414|    228|      ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int;
  415|    228|      ref_mv_weight[idx] = 2 * weight_unit;
  416|    228|      ++(*refmv_count);
  417|    228|    }
  418|  1.72k|  }
  419|       |
  420|  9.66k|  return 1;
  421|   307k|}
mvref_common.c:check_sb_border:
  317|   122k|                           const int row_offset, const int col_offset) {
  318|   122k|  const int sb_mi_size = mi_size_wide[BLOCK_64X64];
  319|   122k|  const int row = mi_row & (sb_mi_size - 1);
  320|   122k|  const int col = mi_col & (sb_mi_size - 1);
  321|       |
  322|   122k|  if (row + row_offset < 0 || row + row_offset >= sb_mi_size ||
  ------------------
  |  Branch (322:7): [True: 1, False: 122k]
  |  Branch (322:31): [True: 15.7k, False: 107k]
  ------------------
  323|   107k|      col + col_offset < 0 || col + col_offset >= sb_mi_size)
  ------------------
  |  Branch (323:7): [True: 8.17k, False: 98.9k]
  |  Branch (323:31): [True: 16.1k, False: 82.7k]
  ------------------
  324|  40.0k|    return 0;
  325|       |
  326|  82.7k|  return 1;
  327|   122k|}
mvref_common.c:process_compound_ref_mv_candidate:
  426|  17.1k|    int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) {
  427|  51.5k|  for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
  ------------------
  |  Branch (427:24): [True: 34.3k, False: 17.1k]
  ------------------
  428|  34.3k|    MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
  429|       |
  430|   103k|    for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
  ------------------
  |  Branch (430:27): [True: 68.7k, False: 34.3k]
  ------------------
  431|  68.7k|      if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
  ------------------
  |  Branch (431:11): [True: 19.1k, False: 49.6k]
  |  Branch (431:36): [True: 18.6k, False: 445]
  ------------------
  432|  18.6k|        ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
  433|  18.6k|        ++ref_id_count[cmp_idx];
  434|  50.1k|      } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
  ------------------
  |  Branch (434:18): [True: 34.9k, False: 15.1k]
  |  Branch (434:42): [True: 28.6k, False: 6.27k]
  ------------------
  435|  28.6k|        int_mv this_mv = candidate->mv[rf_idx];
  436|  28.6k|        if (cm->ref_frame_sign_bias[can_rf] !=
  ------------------
  |  Branch (436:13): [True: 4.78k, False: 23.8k]
  ------------------
  437|  28.6k|            cm->ref_frame_sign_bias[rf[cmp_idx]]) {
  438|  4.78k|          this_mv.as_mv.row = -this_mv.as_mv.row;
  439|  4.78k|          this_mv.as_mv.col = -this_mv.as_mv.col;
  440|  4.78k|        }
  441|  28.6k|        ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
  442|  28.6k|        ++ref_diff_count[cmp_idx];
  443|  28.6k|      }
  444|  68.7k|    }
  445|  34.3k|  }
  446|  17.1k|}
mvref_common.c:process_single_ref_mv_candidate:
  452|  65.8k|    uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE]) {
  453|   197k|  for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
  ------------------
  |  Branch (453:24): [True: 131k, False: 65.8k]
  ------------------
  454|   131k|    if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
  ------------------
  |  Branch (454:9): [True: 48.1k, False: 83.4k]
  ------------------
  455|  48.1k|      int_mv this_mv = candidate->mv[rf_idx];
  456|  48.1k|      if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
  ------------------
  |  Branch (456:11): [True: 2.38k, False: 45.8k]
  ------------------
  457|  48.1k|          cm->ref_frame_sign_bias[ref_frame]) {
  458|  2.38k|        this_mv.as_mv.row = -this_mv.as_mv.row;
  459|  2.38k|        this_mv.as_mv.col = -this_mv.as_mv.col;
  460|  2.38k|      }
  461|  48.1k|      int stack_idx;
  462|  59.2k|      for (stack_idx = 0; stack_idx < *refmv_count; ++stack_idx) {
  ------------------
  |  Branch (462:27): [True: 41.9k, False: 17.2k]
  ------------------
  463|  41.9k|        const int_mv stack_mv = ref_mv_stack[stack_idx].this_mv;
  464|  41.9k|        if (this_mv.as_int == stack_mv.as_int) break;
  ------------------
  |  Branch (464:13): [True: 30.9k, False: 11.0k]
  ------------------
  465|  41.9k|      }
  466|       |
  467|  48.1k|      if (stack_idx == *refmv_count) {
  ------------------
  |  Branch (467:11): [True: 17.2k, False: 30.9k]
  ------------------
  468|  17.2k|        ref_mv_stack[stack_idx].this_mv = this_mv;
  469|       |
  470|       |        // TODO(jingning): Set an arbitrary small number here. The weight
  471|       |        // doesn't matter as long as it is properly initialized.
  472|  17.2k|        ref_mv_weight[stack_idx] = 2;
  473|  17.2k|        ++(*refmv_count);
  474|  17.2k|      }
  475|  48.1k|    }
  476|   131k|  }
  477|  65.8k|}
mvref_common.c:motion_field_projection:
  919|  10.2k|                                   MV_REFERENCE_FRAME start_frame, int dir) {
  920|  10.2k|  TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
  921|  10.2k|  int ref_offset[REF_FRAMES] = { 0 };
  922|       |
  923|  10.2k|  const RefCntBuffer *const start_frame_buf =
  924|  10.2k|      get_ref_frame_buf(cm, start_frame);
  925|  10.2k|  if (start_frame_buf == NULL) return 0;
  ------------------
  |  Branch (925:7): [True: 0, False: 10.2k]
  ------------------
  926|       |
  927|  10.2k|  if (start_frame_buf->frame_type == KEY_FRAME ||
  ------------------
  |  Branch (927:7): [True: 6.60k, False: 3.66k]
  ------------------
  928|  3.66k|      start_frame_buf->frame_type == INTRA_ONLY_FRAME)
  ------------------
  |  Branch (928:7): [True: 0, False: 3.66k]
  ------------------
  929|  6.60k|    return 0;
  930|       |
  931|  3.66k|  if (start_frame_buf->mi_rows != cm->mi_params.mi_rows ||
  ------------------
  |  Branch (931:7): [True: 2, False: 3.66k]
  ------------------
  932|  3.66k|      start_frame_buf->mi_cols != cm->mi_params.mi_cols)
  ------------------
  |  Branch (932:7): [True: 0, False: 3.66k]
  ------------------
  933|      2|    return 0;
  934|       |
  935|  3.66k|  const int start_frame_order_hint = start_frame_buf->order_hint;
  936|  3.66k|  const unsigned int *const ref_order_hints =
  937|  3.66k|      &start_frame_buf->ref_order_hints[0];
  938|  3.66k|  const int cur_order_hint = cm->cur_frame->order_hint;
  939|  3.66k|  int start_to_current_frame_offset = get_relative_dist(
  940|  3.66k|      &cm->seq_params->order_hint_info, start_frame_order_hint, cur_order_hint);
  941|       |
  942|  29.3k|  for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) {
  ------------------
  |  Branch (942:44): [True: 25.6k, False: 3.66k]
  ------------------
  943|  25.6k|    ref_offset[rf] = get_relative_dist(&cm->seq_params->order_hint_info,
  944|  25.6k|                                       start_frame_order_hint,
  945|  25.6k|                                       ref_order_hints[rf - LAST_FRAME]);
  946|  25.6k|  }
  947|       |
  948|  3.66k|  if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset;
  ------------------
  |  Branch (948:7): [True: 872, False: 2.79k]
  ------------------
  949|       |
  950|  3.66k|  MV_REF *mv_ref_base = start_frame_buf->mvs;
  951|  3.66k|  const int mvs_rows = (cm->mi_params.mi_rows + 1) >> 1;
  952|  3.66k|  const int mvs_cols = (cm->mi_params.mi_cols + 1) >> 1;
  953|       |
  954|  73.3k|  for (int blk_row = 0; blk_row < mvs_rows; ++blk_row) {
  ------------------
  |  Branch (954:25): [True: 69.6k, False: 3.66k]
  ------------------
  955|  1.39M|    for (int blk_col = 0; blk_col < mvs_cols; ++blk_col) {
  ------------------
  |  Branch (955:27): [True: 1.32M, False: 69.6k]
  ------------------
  956|  1.32M|      MV_REF *mv_ref = &mv_ref_base[blk_row * mvs_cols + blk_col];
  957|  1.32M|      MV fwd_mv = mv_ref->mv.as_mv;
  958|       |
  959|  1.32M|      if (mv_ref->ref_frame > INTRA_FRAME) {
  ------------------
  |  Branch (959:11): [True: 17.8k, False: 1.30M]
  ------------------
  960|  17.8k|        int_mv this_mv;
  961|  17.8k|        int mi_r, mi_c;
  962|  17.8k|        const int ref_frame_offset = ref_offset[mv_ref->ref_frame];
  963|       |
  964|  17.8k|        int pos_valid =
  965|  17.8k|            abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
  ------------------
  |  |   68|  35.6k|#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1)
  |  |  ------------------
  |  |  |  |   67|  17.8k|#define FRAME_OFFSET_BITS 5
  |  |  ------------------
  ------------------
  |  Branch (965:13): [True: 17.6k, False: 152]
  ------------------
  966|  17.6k|            ref_frame_offset > 0 &&
  ------------------
  |  Branch (966:13): [True: 17.6k, False: 0]
  ------------------
  967|  17.6k|            abs(start_to_current_frame_offset) <= MAX_FRAME_DISTANCE;
  ------------------
  |  |   68|  17.6k|#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1)
  |  |  ------------------
  |  |  |  |   67|  17.6k|#define FRAME_OFFSET_BITS 5
  |  |  ------------------
  ------------------
  |  Branch (967:13): [True: 17.5k, False: 128]
  ------------------
  968|       |
  969|  17.8k|        if (pos_valid) {
  ------------------
  |  Branch (969:13): [True: 17.5k, False: 280]
  ------------------
  970|  17.5k|          av1_get_mv_projection(&this_mv.as_mv, fwd_mv,
  971|  17.5k|                                start_to_current_frame_offset,
  972|  17.5k|                                ref_frame_offset);
  973|  17.5k|          pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
  974|  17.5k|                                         this_mv.as_mv, dir >> 1);
  975|  17.5k|        }
  976|       |
  977|  17.8k|        if (pos_valid) {
  ------------------
  |  Branch (977:13): [True: 17.5k, False: 280]
  ------------------
  978|  17.5k|          const int mi_offset = mi_r * (cm->mi_params.mi_stride >> 1) + mi_c;
  979|       |
  980|  17.5k|          tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row;
  981|  17.5k|          tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col;
  982|  17.5k|          tpl_mvs_base[mi_offset].ref_frame_offset = ref_frame_offset;
  983|  17.5k|        }
  984|  17.8k|      }
  985|  1.32M|    }
  986|  69.6k|  }
  987|       |
  988|  3.66k|  return 1;
  989|  3.66k|}
mvref_common.c:get_block_position:
  881|  17.5k|                              int blk_col, MV mv, int sign_bias) {
  882|  17.5k|  const int base_blk_row = (blk_row >> 3) << 3;
  883|  17.5k|  const int base_blk_col = (blk_col >> 3) << 3;
  884|       |
  885|  17.5k|  const int row_offset = (mv.row >= 0) ? (mv.row >> (4 + MI_SIZE_LOG2))
  ------------------
  |  |   39|  17.5k|#define MI_SIZE_LOG2 2
  ------------------
  |  Branch (885:26): [True: 17.5k, False: 0]
  ------------------
  886|  17.5k|                                       : -((-mv.row) >> (4 + MI_SIZE_LOG2));
  ------------------
  |  |   39|      0|#define MI_SIZE_LOG2 2
  ------------------
  887|       |
  888|  17.5k|  const int col_offset = (mv.col >= 0) ? (mv.col >> (4 + MI_SIZE_LOG2))
  ------------------
  |  |   39|  17.5k|#define MI_SIZE_LOG2 2
  ------------------
  |  Branch (888:26): [True: 17.5k, False: 0]
  ------------------
  889|  17.5k|                                       : -((-mv.col) >> (4 + MI_SIZE_LOG2));
  ------------------
  |  |   39|      0|#define MI_SIZE_LOG2 2
  ------------------
  890|       |
  891|  17.5k|  const int row =
  892|  17.5k|      (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset;
  ------------------
  |  Branch (892:7): [True: 17.3k, False: 152]
  ------------------
  893|  17.5k|  const int col =
  894|  17.5k|      (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
  ------------------
  |  Branch (894:7): [True: 17.3k, False: 152]
  ------------------
  895|       |
  896|  17.5k|  if (row < 0 || row >= (cm->mi_params.mi_rows >> 1) || col < 0 ||
  ------------------
  |  Branch (896:7): [True: 0, False: 17.5k]
  |  Branch (896:18): [True: 0, False: 17.5k]
  |  Branch (896:57): [True: 0, False: 17.5k]
  ------------------
  897|  17.5k|      col >= (cm->mi_params.mi_cols >> 1))
  ------------------
  |  Branch (897:7): [True: 0, False: 17.5k]
  ------------------
  898|      0|    return 0;
  899|       |
  900|  17.5k|  if (row < base_blk_row - (MAX_OFFSET_HEIGHT >> 3) ||
  ------------------
  |  |  878|  17.5k|#define MAX_OFFSET_HEIGHT 0
  ------------------
  |  Branch (900:7): [True: 0, False: 17.5k]
  ------------------
  901|  17.5k|      row >= base_blk_row + 8 + (MAX_OFFSET_HEIGHT >> 3) ||
  ------------------
  |  |  878|  17.5k|#define MAX_OFFSET_HEIGHT 0
  ------------------
  |  Branch (901:7): [True: 0, False: 17.5k]
  ------------------
  902|  17.5k|      col < base_blk_col - (MAX_OFFSET_WIDTH >> 3) ||
  ------------------
  |  |  877|  17.5k|#define MAX_OFFSET_WIDTH 64
  ------------------
  |  Branch (902:7): [True: 0, False: 17.5k]
  ------------------
  903|  17.5k|      col >= base_blk_col + 8 + (MAX_OFFSET_WIDTH >> 3))
  ------------------
  |  |  877|  17.5k|#define MAX_OFFSET_WIDTH 64
  ------------------
  |  Branch (903:7): [True: 0, False: 17.5k]
  ------------------
  904|      0|    return 0;
  905|       |
  906|  17.5k|  *mi_r = row;
  907|  17.5k|  *mi_c = col;
  908|       |
  909|  17.5k|  return 1;
  910|  17.5k|}
mvref_common.c:record_samples:
 1078|  57.1k|                                  int col_offset, int sign_c) {
 1079|  57.1k|  const int bw = block_size_wide[mbmi->bsize];
 1080|  57.1k|  const int bh = block_size_high[mbmi->bsize];
 1081|  57.1k|  const int x = col_offset * MI_SIZE + sign_c * bw / 2 - 1;
  ------------------
  |  |   40|  57.1k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  57.1k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1082|  57.1k|  const int y = row_offset * MI_SIZE + sign_r * bh / 2 - 1;
  ------------------
  |  |   40|  57.1k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  57.1k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1083|       |
 1084|  57.1k|  pts[0] = GET_MV_SUBPEL(x);
  ------------------
  |  |   29|  57.1k|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
 1085|  57.1k|  pts[1] = GET_MV_SUBPEL(y);
  ------------------
  |  |   29|  57.1k|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
 1086|  57.1k|  pts_inref[0] = pts[0] + mbmi->mv[0].as_mv.col;
 1087|  57.1k|  pts_inref[1] = pts[1] + mbmi->mv[0].as_mv.row;
 1088|  57.1k|}
mvref_common.c:has_top_right:
  265|   123k|                         int mi_row, int mi_col, int bs) {
  266|   123k|  const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
  267|   123k|  const int mask_row = mi_row & (sb_mi_size - 1);
  268|   123k|  const int mask_col = mi_col & (sb_mi_size - 1);
  269|       |
  270|   123k|  if (bs > mi_size_wide[BLOCK_64X64]) return 0;
  ------------------
  |  Branch (270:7): [True: 366, False: 123k]
  ------------------
  271|       |
  272|       |  // In a split partition all apart from the bottom right has a top right
  273|   123k|  int has_tr = !((mask_row & bs) && (mask_col & bs));
  ------------------
  |  Branch (273:18): [True: 53.2k, False: 70.3k]
  |  Branch (273:37): [True: 24.9k, False: 28.2k]
  ------------------
  274|       |
  275|       |  // bs > 0 and bs is a power of 2
  276|   123k|  assert(bs > 0 && !(bs & (bs - 1)));
  277|       |
  278|       |  // For each 4x4 group of blocks, when the bottom right is decoded the blocks
  279|       |  // to the right have not been decoded therefore the bottom right does
  280|       |  // not have a top right
  281|   179k|  while (bs < sb_mi_size) {
  ------------------
  |  Branch (281:10): [True: 159k, False: 19.3k]
  ------------------
  282|   159k|    if (mask_col & bs) {
  ------------------
  |  Branch (282:9): [True: 69.0k, False: 90.6k]
  ------------------
  283|  69.0k|      if ((mask_col & (2 * bs)) && (mask_row & (2 * bs))) {
  ------------------
  |  Branch (283:11): [True: 27.5k, False: 41.5k]
  |  Branch (283:36): [True: 13.5k, False: 13.9k]
  ------------------
  284|  13.5k|        has_tr = 0;
  285|  13.5k|        break;
  286|  13.5k|      }
  287|  90.6k|    } else {
  288|  90.6k|      break;
  289|  90.6k|    }
  290|  55.4k|    bs <<= 1;
  291|  55.4k|  }
  292|       |
  293|       |  // In a VERTICAL or VERTICAL_4 partition, all partition before the last one
  294|       |  // always have a top right (as the block above will have been decoded).
  295|   123k|  if (xd->width < xd->height) {
  ------------------
  |  Branch (295:7): [True: 24.6k, False: 98.9k]
  ------------------
  296|  24.6k|    if (!xd->is_last_vertical_rect) has_tr = 1;
  ------------------
  |  Branch (296:9): [True: 14.5k, False: 10.1k]
  ------------------
  297|  24.6k|  }
  298|       |
  299|       |  // In a HORIZONTAL or HORIZONTAL_4 partition, partitions after the first one
  300|       |  // never have a top right (as the block to the right won't have been decoded).
  301|   123k|  if (xd->width > xd->height) {
  ------------------
  |  Branch (301:7): [True: 39.4k, False: 84.1k]
  ------------------
  302|  39.4k|    if (!xd->is_first_horizontal_rect) has_tr = 0;
  ------------------
  |  Branch (302:9): [True: 22.3k, False: 17.1k]
  ------------------
  303|  39.4k|  }
  304|       |
  305|       |  // The bottom left square of a Vertical A (in the old format) does
  306|       |  // not have a top right as it is decoded before the right hand
  307|       |  // rectangle of the partition
  308|   123k|  if (xd->mi[0]->partition == PARTITION_VERT_A) {
  ------------------
  |  Branch (308:7): [True: 3.95k, False: 119k]
  ------------------
  309|  3.95k|    if (xd->width == xd->height)
  ------------------
  |  Branch (309:9): [True: 2.60k, False: 1.35k]
  ------------------
  310|  2.60k|      if (mask_row & bs) has_tr = 0;
  ------------------
  |  Branch (310:11): [True: 1.42k, False: 1.17k]
  ------------------
  311|  3.95k|  }
  312|       |
  313|   123k|  return has_tr;
  314|   123k|}
mvref_common.c:compare_ref_frame_info:
 1329|  2.77k|static int compare_ref_frame_info(const void *arg_a, const void *arg_b) {
 1330|  2.77k|  const REF_FRAME_INFO *info_a = (REF_FRAME_INFO *)arg_a;
 1331|  2.77k|  const REF_FRAME_INFO *info_b = (REF_FRAME_INFO *)arg_b;
 1332|       |
 1333|  2.77k|  const int sort_idx_diff = info_a->sort_idx - info_b->sort_idx;
 1334|  2.77k|  if (sort_idx_diff != 0) return sort_idx_diff;
  ------------------
  |  Branch (1334:7): [True: 573, False: 2.19k]
  ------------------
 1335|  2.19k|  return info_a->map_idx - info_b->map_idx;
 1336|  2.77k|}
mvref_common.c:set_ref_frame_info:
 1339|  1.50k|                                      REF_FRAME_INFO *ref_info) {
 1340|  1.50k|  assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME);
 1341|       |
 1342|  1.50k|  remapped_ref_idx[frame_idx] = ref_info->map_idx;
 1343|  1.50k|}

decodemv.c:av1_find_ref_dv:
  268|  5.17k|                                   int mib_size, int mi_row) {
  269|  5.17k|  if (mi_row - mib_size < tile->mi_row_start) {
  ------------------
  |  Branch (269:7): [True: 680, False: 4.49k]
  ------------------
  270|    680|    ref_dv->as_fullmv.row = 0;
  271|    680|    ref_dv->as_fullmv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;
  ------------------
  |  |   40|    680|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|    680|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                  ref_dv->as_fullmv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;
  ------------------
  |  |  264|    680|#define INTRABC_DELAY_PIXELS 256  //  Delay of 256 pixels
  ------------------
  272|  4.49k|  } else {
  273|  4.49k|    ref_dv->as_fullmv.row = -MI_SIZE * mib_size;
  ------------------
  |  |   40|  4.49k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  4.49k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  274|  4.49k|    ref_dv->as_fullmv.col = 0;
  275|  4.49k|  }
  276|  5.17k|  convert_fullmv_to_mv(ref_dv);
  277|  5.17k|}
decodemv.c:av1_is_dv_valid:
  281|  11.3k|                                  BLOCK_SIZE bsize, int mib_size_log2) {
  282|  11.3k|  const int bw = block_size_wide[bsize];
  283|  11.3k|  const int bh = block_size_high[bsize];
  284|  11.3k|  const int SCALE_PX_TO_MV = 8;
  285|       |  // Disallow subpixel for now
  286|       |  // SUBPEL_MASK is not the correct scale
  287|  11.3k|  if (((dv.row & (SCALE_PX_TO_MV - 1)) || (dv.col & (SCALE_PX_TO_MV - 1))))
  ------------------
  |  Branch (287:8): [True: 0, False: 11.3k]
  |  Branch (287:43): [True: 0, False: 11.3k]
  ------------------
  288|      0|    return 0;
  289|       |
  290|  11.3k|  const TileInfo *const tile = &xd->tile;
  291|       |  // Is the source top-left inside the current tile?
  292|  11.3k|  const int src_top_edge = mi_row * MI_SIZE * SCALE_PX_TO_MV + dv.row;
  ------------------
  |  |   40|  11.3k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  11.3k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  293|  11.3k|  const int tile_top_edge = tile->mi_row_start * MI_SIZE * SCALE_PX_TO_MV;
  ------------------
  |  |   40|  11.3k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  11.3k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  294|  11.3k|  if (src_top_edge < tile_top_edge) return 0;
  ------------------
  |  Branch (294:7): [True: 62, False: 11.3k]
  ------------------
  295|  11.3k|  const int src_left_edge = mi_col * MI_SIZE * SCALE_PX_TO_MV + dv.col;
  ------------------
  |  |   40|  11.3k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  11.3k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  296|  11.3k|  const int tile_left_edge = tile->mi_col_start * MI_SIZE * SCALE_PX_TO_MV;
  ------------------
  |  |   40|  11.3k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  11.3k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  297|  11.3k|  if (src_left_edge < tile_left_edge) return 0;
  ------------------
  |  Branch (297:7): [True: 240, False: 11.0k]
  ------------------
  298|       |  // Is the bottom right inside the current tile?
  299|  11.0k|  const int src_bottom_edge = (mi_row * MI_SIZE + bh) * SCALE_PX_TO_MV + dv.row;
  ------------------
  |  |   40|  11.0k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  11.0k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  300|  11.0k|  const int tile_bottom_edge = tile->mi_row_end * MI_SIZE * SCALE_PX_TO_MV;
  ------------------
  |  |   40|  11.0k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  11.0k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  301|  11.0k|  if (src_bottom_edge > tile_bottom_edge) return 0;
  ------------------
  |  Branch (301:7): [True: 2, False: 11.0k]
  ------------------
  302|  11.0k|  const int src_right_edge = (mi_col * MI_SIZE + bw) * SCALE_PX_TO_MV + dv.col;
  ------------------
  |  |   40|  11.0k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  11.0k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  303|  11.0k|  const int tile_right_edge = tile->mi_col_end * MI_SIZE * SCALE_PX_TO_MV;
  ------------------
  |  |   40|  11.0k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  11.0k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  304|  11.0k|  if (src_right_edge > tile_right_edge) return 0;
  ------------------
  |  Branch (304:7): [True: 44, False: 11.0k]
  ------------------
  305|       |
  306|       |  // Special case for sub 8x8 chroma cases, to prevent referring to chroma
  307|       |  // pixels outside current tile.
  308|  11.0k|  if (xd->is_chroma_ref && av1_num_planes(cm) > 1) {
  ------------------
  |  Branch (308:7): [True: 9.88k, False: 1.14k]
  |  Branch (308:28): [True: 5.28k, False: 4.60k]
  ------------------
  309|  5.28k|    const struct macroblockd_plane *const pd = &xd->plane[1];
  310|  5.28k|    if (bw < 8 && pd->subsampling_x)
  ------------------
  |  Branch (310:9): [True: 886, False: 4.39k]
  |  Branch (310:19): [True: 0, False: 886]
  ------------------
  311|      0|      if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0;
  ------------------
  |  Branch (311:11): [True: 0, False: 0]
  ------------------
  312|  5.28k|    if (bh < 8 && pd->subsampling_y)
  ------------------
  |  Branch (312:9): [True: 1.58k, False: 3.70k]
  |  Branch (312:19): [True: 0, False: 1.58k]
  ------------------
  313|      0|      if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0;
  ------------------
  |  Branch (313:11): [True: 0, False: 0]
  ------------------
  314|  5.28k|  }
  315|       |
  316|       |  // Is the bottom right within an already coded SB? Also consider additional
  317|       |  // constraints to facilitate HW decoder.
  318|  11.0k|  const int max_mib_size = 1 << mib_size_log2;
  319|  11.0k|  const int active_sb_row = mi_row >> mib_size_log2;
  320|  11.0k|  const int active_sb64_col = (mi_col * MI_SIZE) >> 6;
  ------------------
  |  |   40|  11.0k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  11.0k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  321|  11.0k|  const int sb_size = max_mib_size * MI_SIZE;
  ------------------
  |  |   40|  11.0k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  11.0k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  322|  11.0k|  const int src_sb_row = ((src_bottom_edge >> 3) - 1) / sb_size;
  323|  11.0k|  const int src_sb64_col = ((src_right_edge >> 3) - 1) >> 6;
  324|  11.0k|  const int total_sb64_per_row =
  325|  11.0k|      ((tile->mi_col_end - tile->mi_col_start - 1) >> 4) + 1;
  326|  11.0k|  const int active_sb64 = active_sb_row * total_sb64_per_row + active_sb64_col;
  327|  11.0k|  const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col;
  328|  11.0k|  if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0;
  ------------------
  |  |  265|  11.0k|#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
  |  |  ------------------
  |  |  |  |  264|  11.0k|#define INTRABC_DELAY_PIXELS 256  //  Delay of 256 pixels
  |  |  ------------------
  ------------------
  |  Branch (328:7): [True: 116, False: 10.9k]
  ------------------
  329|       |
  330|       |  // Wavefront constraint: use only top left area of frame for reference.
  331|  10.9k|  const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64);
  ------------------
  |  |  265|  10.9k|#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
  |  |  ------------------
  |  |  |  |  264|  10.9k|#define INTRABC_DELAY_PIXELS 256  //  Delay of 256 pixels
  |  |  ------------------
  ------------------
  332|  10.9k|  const int wf_offset = gradient * (active_sb_row - src_sb_row);
  333|  10.9k|  if (src_sb_row > active_sb_row ||
  ------------------
  |  Branch (333:7): [True: 0, False: 10.9k]
  ------------------
  334|  10.9k|      src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset)
  ------------------
  |  |  265|  10.9k|#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
  |  |  ------------------
  |  |  |  |  264|  10.9k|#define INTRABC_DELAY_PIXELS 256  //  Delay of 256 pixels
  |  |  ------------------
  ------------------
  |  Branch (334:7): [True: 62, False: 10.8k]
  ------------------
  335|     62|    return 0;
  336|       |
  337|  10.8k|  return 1;
  338|  10.9k|}
decodemv.c:av1_collect_neighbors_ref_counts:
  209|  76.7k|static inline void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
  210|  76.7k|  av1_zero(xd->neighbors_ref_counts);
  ------------------
  |  |   43|  76.7k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
  211|       |
  212|  76.7k|  uint8_t *const ref_counts = xd->neighbors_ref_counts;
  213|       |
  214|  76.7k|  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
  215|  76.7k|  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
  216|  76.7k|  const int above_in_image = xd->up_available;
  217|  76.7k|  const int left_in_image = xd->left_available;
  218|       |
  219|       |  // Above neighbor
  220|  76.7k|  if (above_in_image && is_inter_block(above_mbmi)) {
  ------------------
  |  Branch (220:7): [True: 60.2k, False: 16.4k]
  |  Branch (220:25): [True: 58.4k, False: 1.76k]
  ------------------
  221|  58.4k|    ref_counts[above_mbmi->ref_frame[0]]++;
  222|  58.4k|    if (has_second_ref(above_mbmi)) {
  ------------------
  |  Branch (222:9): [True: 9.30k, False: 49.1k]
  ------------------
  223|  9.30k|      ref_counts[above_mbmi->ref_frame[1]]++;
  224|  9.30k|    }
  225|  58.4k|  }
  226|       |
  227|       |  // Left neighbor
  228|  76.7k|  if (left_in_image && is_inter_block(left_mbmi)) {
  ------------------
  |  Branch (228:7): [True: 65.5k, False: 11.1k]
  |  Branch (228:24): [True: 63.7k, False: 1.77k]
  ------------------
  229|  63.7k|    ref_counts[left_mbmi->ref_frame[0]]++;
  230|  63.7k|    if (has_second_ref(left_mbmi)) {
  ------------------
  |  Branch (230:9): [True: 10.4k, False: 53.2k]
  ------------------
  231|  10.4k|      ref_counts[left_mbmi->ref_frame[1]]++;
  232|  10.4k|    }
  233|  63.7k|  }
  234|  76.7k|}
decodemv.c:av1_ref_frame_type:
  113|   190k|static inline int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) {
  114|   190k|  if (rf[1] > INTRA_FRAME) {
  ------------------
  |  Branch (114:7): [True: 28.5k, False: 162k]
  ------------------
  115|  28.5k|    const int8_t uni_comp_ref_idx = get_uni_comp_ref_idx(rf);
  116|  28.5k|    if (uni_comp_ref_idx >= 0) {
  ------------------
  |  Branch (116:9): [True: 6.75k, False: 21.8k]
  ------------------
  117|  6.75k|      assert((REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx) <
  118|  6.75k|             MODE_CTX_REF_FRAMES);
  119|  6.75k|      return REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx;
  120|  21.8k|    } else {
  121|  21.8k|      return REF_FRAMES + FWD_RF_OFFSET(rf[0]) +
  ------------------
  |  |  569|  21.8k|#define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
  ------------------
  122|  21.8k|             BWD_RF_OFFSET(rf[1]) * FWD_REFS;
  ------------------
  |  |  570|  21.8k|#define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
  ------------------
  123|  21.8k|    }
  124|  28.5k|  }
  125|       |
  126|   162k|  return rf[0];
  127|   190k|}
decodemv.c:get_uni_comp_ref_idx:
   99|  28.5k|static inline int8_t get_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) {
  100|       |  // Single ref pred
  101|  28.5k|  if (rf[1] <= INTRA_FRAME) return -1;
  ------------------
  |  Branch (101:7): [True: 0, False: 28.5k]
  ------------------
  102|       |
  103|       |  // Bi-directional comp ref pred
  104|  28.5k|  if ((rf[0] < BWDREF_FRAME) && (rf[1] >= BWDREF_FRAME)) return -1;
  ------------------
  |  Branch (104:7): [True: 26.1k, False: 2.42k]
  |  Branch (104:33): [True: 21.8k, False: 4.32k]
  ------------------
  105|       |
  106|  18.3k|  for (int8_t ref_idx = 0; ref_idx < TOTAL_UNIDIR_COMP_REFS; ++ref_idx) {
  ------------------
  |  Branch (106:28): [True: 18.3k, False: 18.4E]
  ------------------
  107|  18.3k|    if (rf[0] == comp_ref0(ref_idx) && rf[1] == comp_ref1(ref_idx))
  ------------------
  |  Branch (107:9): [True: 11.0k, False: 7.29k]
  |  Branch (107:40): [True: 6.75k, False: 4.34k]
  ------------------
  108|  6.75k|      return ref_idx;
  109|  18.3k|  }
  110|  18.4E|  return -1;
  111|  6.74k|}
decodemv.c:av1_mode_context_analyzer:
  171|  75.6k|    const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf) {
  172|  75.6k|  const int8_t ref_frame = av1_ref_frame_type(rf);
  173|       |
  174|  75.6k|  if (rf[1] <= INTRA_FRAME) return mode_context[ref_frame];
  ------------------
  |  Branch (174:7): [True: 63.7k, False: 11.8k]
  ------------------
  175|       |
  176|  11.8k|  const int16_t newmv_ctx = mode_context[ref_frame] & NEWMV_CTX_MASK;
  ------------------
  |  |  490|  11.8k|#define NEWMV_CTX_MASK ((1 << GLOBALMV_OFFSET) - 1)
  |  |  ------------------
  |  |  |  |  487|  11.8k|#define GLOBALMV_OFFSET 3
  |  |  ------------------
  ------------------
  177|  11.8k|  const int16_t refmv_ctx =
  178|  11.8k|      (mode_context[ref_frame] >> REFMV_OFFSET) & REFMV_CTX_MASK;
  ------------------
  |  |  488|  11.8k|#define REFMV_OFFSET 4
  ------------------
                    (mode_context[ref_frame] >> REFMV_OFFSET) & REFMV_CTX_MASK;
  ------------------
  |  |  492|  11.8k|#define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1)
  |  |  ------------------
  |  |  |  |  488|  11.8k|#define REFMV_OFFSET 4
  |  |  ------------------
  ------------------
  179|       |
  180|  11.8k|  const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN(
  ------------------
  |  |   34|  11.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 10.6k, False: 1.18k]
  |  |  ------------------
  ------------------
  181|  11.8k|      newmv_ctx, COMP_NEWMV_CTXS - 1)];
  182|  11.8k|  return comp_ctx;
  183|  75.6k|}
decodemv.c:av1_drl_ctx:
  185|  22.9k|static inline uint8_t av1_drl_ctx(const uint16_t *ref_mv_weight, int ref_idx) {
  186|  22.9k|  if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL &&
  ------------------
  |  |  512|  45.9k|#define REF_CAT_LEVEL 640
  ------------------
  |  Branch (186:7): [True: 18.2k, False: 4.73k]
  ------------------
  187|  18.2k|      ref_mv_weight[ref_idx + 1] >= REF_CAT_LEVEL)
  ------------------
  |  |  512|  18.2k|#define REF_CAT_LEVEL 640
  ------------------
  |  Branch (187:7): [True: 7.73k, False: 10.5k]
  ------------------
  188|  7.73k|    return 0;
  189|       |
  190|  15.2k|  if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL &&
  ------------------
  |  |  512|  30.4k|#define REF_CAT_LEVEL 640
  ------------------
  |  Branch (190:7): [True: 10.5k, False: 4.73k]
  ------------------
  191|  10.5k|      ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL)
  ------------------
  |  |  512|  10.5k|#define REF_CAT_LEVEL 640
  ------------------
  |  Branch (191:7): [True: 10.5k, False: 0]
  ------------------
  192|  10.5k|    return 1;
  193|       |
  194|  4.73k|  if (ref_mv_weight[ref_idx] < REF_CAT_LEVEL &&
  ------------------
  |  |  512|  9.46k|#define REF_CAT_LEVEL 640
  ------------------
  |  Branch (194:7): [True: 4.73k, False: 0]
  ------------------
  195|  4.73k|      ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL)
  ------------------
  |  |  512|  4.73k|#define REF_CAT_LEVEL 640
  ------------------
  |  Branch (195:7): [True: 4.73k, False: 0]
  ------------------
  196|  4.73k|    return 2;
  197|       |
  198|      0|  return 0;
  199|  4.73k|}
decodemv.c:lower_mv_precision:
   88|  42.4k|static inline void lower_mv_precision(MV *mv, int allow_hp, int is_integer) {
   89|  42.4k|  if (is_integer) {
  ------------------
  |  Branch (89:7): [True: 2.80k, False: 39.6k]
  ------------------
   90|  2.80k|    integer_mv_precision(mv);
   91|  39.6k|  } else {
   92|  39.6k|    if (!allow_hp) {
  ------------------
  |  Branch (92:9): [True: 14.9k, False: 24.7k]
  ------------------
   93|  14.9k|      if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1);
  ------------------
  |  Branch (93:11): [True: 0, False: 14.9k]
  |  Branch (93:36): [True: 0, False: 0]
  ------------------
   94|  14.9k|      if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1);
  ------------------
  |  Branch (94:11): [True: 0, False: 14.9k]
  |  Branch (94:36): [True: 0, False: 0]
  ------------------
   95|  14.9k|    }
   96|  39.6k|  }
   97|  42.4k|}
decodemv.c:get_relative_dist:
   37|  12.4k|static inline int get_relative_dist(const OrderHintInfo *oh, int a, int b) {
   38|  12.4k|  if (!oh->enable_order_hint) return 0;
  ------------------
  |  Branch (38:7): [True: 0, False: 12.4k]
  ------------------
   39|       |
   40|  12.4k|  const int bits = oh->order_hint_bits_minus_1 + 1;
   41|       |
   42|  12.4k|  assert(bits >= 1);
   43|  12.4k|  assert(a >= 0 && a < (1 << bits));
   44|  12.4k|  assert(b >= 0 && b < (1 << bits));
   45|       |
   46|  12.4k|  int diff = a - b;
   47|  12.4k|  const int m = 1 << (bits - 1);
   48|  12.4k|  diff = (diff & (m - 1)) - (diff & m);
   49|  12.4k|  return diff;
   50|  12.4k|}
mvref_common.c:av1_set_ref_frame:
  153|   109k|                                     MV_REFERENCE_FRAME ref_frame_type) {
  154|   109k|  if (ref_frame_type >= REF_FRAMES) {
  ------------------
  |  Branch (154:7): [True: 25.7k, False: 84.0k]
  ------------------
  155|  25.7k|    rf[0] = ref_frame_map[ref_frame_type - REF_FRAMES][0];
  156|  25.7k|    rf[1] = ref_frame_map[ref_frame_type - REF_FRAMES][1];
  157|  84.0k|  } else {
  158|       |    assert(ref_frame_type > NONE_FRAME);
  159|  84.0k|    rf[0] = ref_frame_type;
  160|  84.0k|    rf[1] = NONE_FRAME;
  161|  84.0k|  }
  162|   109k|}
mvref_common.c:find_valid_row_offset:
   77|  71.4k|                                        int row_offset) {
   78|  71.4k|  return clamp(row_offset, tile->mi_row_start - mi_row,
   79|  71.4k|               tile->mi_row_end - mi_row - 1);
   80|  71.4k|}
mvref_common.c:find_valid_col_offset:
   83|  76.7k|                                        int col_offset) {
   84|  76.7k|  return clamp(col_offset, tile->mi_col_start - mi_col,
   85|  76.7k|               tile->mi_col_end - mi_col - 1);
   86|  76.7k|}
mvref_common.c:get_block_mv:
   62|   244k|static inline int_mv get_block_mv(const MB_MODE_INFO *candidate, int which_mv) {
   63|   244k|  return candidate->mv[which_mv];
   64|   244k|}
mvref_common.c:clamp_mv_ref:
   52|   181k|static inline void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
   53|   181k|  const SubpelMvLimits mv_limits = {
   54|   181k|    xd->mb_to_left_edge - GET_MV_SUBPEL(bw) - MV_BORDER,
  ------------------
  |  |   29|   181k|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
                  xd->mb_to_left_edge - GET_MV_SUBPEL(bw) - MV_BORDER,
  ------------------
  |  |   35|   181k|#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
  ------------------
   55|   181k|    xd->mb_to_right_edge + GET_MV_SUBPEL(bw) + MV_BORDER,
  ------------------
  |  |   29|   181k|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
                  xd->mb_to_right_edge + GET_MV_SUBPEL(bw) + MV_BORDER,
  ------------------
  |  |   35|   181k|#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
  ------------------
   56|   181k|    xd->mb_to_top_edge - GET_MV_SUBPEL(bh) - MV_BORDER,
  ------------------
  |  |   29|   181k|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
                  xd->mb_to_top_edge - GET_MV_SUBPEL(bh) - MV_BORDER,
  ------------------
  |  |   35|   181k|#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
  ------------------
   57|   181k|    xd->mb_to_bottom_edge + GET_MV_SUBPEL(bh) + MV_BORDER
  ------------------
  |  |   29|   181k|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
                  xd->mb_to_bottom_edge + GET_MV_SUBPEL(bh) + MV_BORDER
  ------------------
  |  |   35|   181k|#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
  ------------------
   58|   181k|  };
   59|   181k|  clamp_mv(mv, &mv_limits);
   60|   181k|}
mvref_common.c:lower_mv_precision:
   88|   158k|static inline void lower_mv_precision(MV *mv, int allow_hp, int is_integer) {
   89|   158k|  if (is_integer) {
  ------------------
  |  Branch (89:7): [True: 9.46k, False: 149k]
  ------------------
   90|  9.46k|    integer_mv_precision(mv);
   91|   149k|  } else {
   92|   149k|    if (!allow_hp) {
  ------------------
  |  Branch (92:9): [True: 51.7k, False: 97.6k]
  ------------------
   93|  51.7k|      if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1);
  ------------------
  |  Branch (93:11): [True: 0, False: 51.7k]
  |  Branch (93:36): [True: 0, False: 0]
  ------------------
   94|  51.7k|      if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1);
  ------------------
  |  Branch (94:11): [True: 0, False: 51.7k]
  |  Branch (94:36): [True: 0, False: 0]
  ------------------
   95|  51.7k|    }
   96|   149k|  }
   97|   158k|}
mvref_common.c:get_relative_dist:
   37|   275k|static inline int get_relative_dist(const OrderHintInfo *oh, int a, int b) {
   38|   275k|  if (!oh->enable_order_hint) return 0;
  ------------------
  |  Branch (38:7): [True: 0, False: 275k]
  ------------------
   39|       |
   40|   275k|  const int bits = oh->order_hint_bits_minus_1 + 1;
   41|       |
   42|   275k|  assert(bits >= 1);
   43|   275k|  assert(a >= 0 && a < (1 << bits));
   44|   275k|  assert(b >= 0 && b < (1 << bits));
   45|       |
   46|   275k|  int diff = a - b;
   47|   275k|  const int m = 1 << (bits - 1);
   48|   275k|  diff = (diff & (m - 1)) - (diff & m);
   49|   275k|  return diff;
   50|   275k|}
mvref_common.c:is_inside:
   69|   488k|                            const POSITION *mi_pos) {
   70|   488k|  return !(mi_row + mi_pos->row < tile->mi_row_start ||
  ------------------
  |  Branch (70:12): [True: 44.5k, False: 443k]
  ------------------
   71|   443k|           mi_col + mi_pos->col < tile->mi_col_start ||
  ------------------
  |  Branch (71:12): [True: 6.44k, False: 437k]
  ------------------
   72|   437k|           mi_row + mi_pos->row >= tile->mi_row_end ||
  ------------------
  |  Branch (72:12): [True: 9.36k, False: 428k]
  ------------------
   73|   428k|           mi_col + mi_pos->col >= tile->mi_col_end);
  ------------------
  |  Branch (73:12): [True: 9.16k, False: 418k]
  ------------------
   74|   488k|}
reconinter.c:get_relative_dist:
   37|  11.0k|static inline int get_relative_dist(const OrderHintInfo *oh, int a, int b) {
   38|  11.0k|  if (!oh->enable_order_hint) return 0;
  ------------------
  |  Branch (38:7): [True: 0, False: 11.0k]
  ------------------
   39|       |
   40|  11.0k|  const int bits = oh->order_hint_bits_minus_1 + 1;
   41|       |
   42|  11.0k|  assert(bits >= 1);
   43|  11.0k|  assert(a >= 0 && a < (1 << bits));
   44|  11.0k|  assert(b >= 0 && b < (1 << bits));
   45|       |
   46|  11.0k|  int diff = a - b;
   47|  11.0k|  const int m = 1 << (bits - 1);
   48|  11.0k|  diff = (diff & (m - 1)) - (diff & m);
   49|  11.0k|  return diff;
   50|  11.0k|}

decodeframe.c:foreach_overlappable_nb_above:
   23|  6.36k|                                                 void *fun_ctxt) {
   24|  6.36k|  if (!xd->up_available) return;
  ------------------
  |  Branch (24:7): [True: 0, False: 6.36k]
  ------------------
   25|       |
   26|  6.36k|  const int num_planes = av1_num_planes(cm);
   27|  6.36k|  int nb_count = 0;
   28|  6.36k|  const int mi_col = xd->mi_col;
   29|       |  // prev_row_mi points into the mi array, starting at the beginning of the
   30|       |  // previous row.
   31|  6.36k|  MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
   32|  6.36k|  const int end_col = AOMMIN(mi_col + xd->width, cm->mi_params.mi_cols);
  ------------------
  |  |   34|  6.36k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 6.12k, False: 249]
  |  |  ------------------
  ------------------
   33|  6.36k|  uint8_t mi_step;
   34|  13.3k|  for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
  ------------------
  |  Branch (34:35): [True: 6.98k, False: 6.35k]
  |  Branch (34:61): [True: 6.97k, False: 18]
  ------------------
   35|  6.97k|       above_mi_col += mi_step) {
   36|  6.97k|    MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col;
   37|  6.97k|    mi_step =
   38|  6.97k|        AOMMIN(mi_size_wide[above_mi[0]->bsize], mi_size_wide[BLOCK_64X64]);
  ------------------
  |  |   34|  6.97k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 6.76k, False: 211]
  |  |  ------------------
  ------------------
   39|       |    // If we're considering a block with width 4, it should be treated as
   40|       |    // half of a pair of blocks with chroma information in the second. Move
   41|       |    // above_mi_col back to the start of the pair if needed, set above_mbmi
   42|       |    // to point at the block with chroma information, and set mi_step to 2 to
   43|       |    // step over the entire pair at the end of the iteration.
   44|  6.97k|    if (mi_step == 1) {
  ------------------
  |  Branch (44:9): [True: 407, False: 6.56k]
  ------------------
   45|    407|      above_mi_col &= ~1;
   46|    407|      above_mi = prev_row_mi + above_mi_col + 1;
   47|    407|      mi_step = 2;
   48|    407|    }
   49|  6.97k|    if (is_neighbor_overlappable(*above_mi)) {
  ------------------
  |  Branch (49:9): [True: 6.80k, False: 167]
  ------------------
   50|  6.80k|      ++nb_count;
   51|  6.80k|      fun(xd, 0, above_mi_col - mi_col, AOMMIN(xd->width, mi_step), 0,
  ------------------
  |  |   34|  6.80k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.76k, False: 5.03k]
  |  |  ------------------
  ------------------
   52|  6.80k|          *above_mi, fun_ctxt, num_planes);
   53|  6.80k|    }
   54|  6.97k|  }
   55|  6.36k|}
decodeframe.c:foreach_overlappable_nb_left:
   60|  6.83k|                                                void *fun_ctxt) {
   61|  6.83k|  if (!xd->left_available) return;
  ------------------
  |  Branch (61:7): [True: 0, False: 6.83k]
  ------------------
   62|       |
   63|  6.83k|  const int num_planes = av1_num_planes(cm);
   64|  6.83k|  int nb_count = 0;
   65|       |  // prev_col_mi points into the mi array, starting at the top of the
   66|       |  // previous column
   67|  6.83k|  const int mi_row = xd->mi_row;
   68|  6.83k|  MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
   69|  6.83k|  const int end_row = AOMMIN(mi_row + xd->height, cm->mi_params.mi_rows);
  ------------------
  |  |   34|  6.83k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 6.55k, False: 285]
  |  |  ------------------
  ------------------
   70|  6.83k|  uint8_t mi_step;
   71|  14.2k|  for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
  ------------------
  |  Branch (71:34): [True: 7.43k, False: 6.81k]
  |  Branch (71:59): [True: 7.41k, False: 26]
  ------------------
   72|  7.41k|       left_mi_row += mi_step) {
   73|  7.41k|    MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
   74|  7.41k|    mi_step =
   75|  7.41k|        AOMMIN(mi_size_high[left_mi[0]->bsize], mi_size_high[BLOCK_64X64]);
  ------------------
  |  |   34|  7.41k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 6.83k, False: 572]
  |  |  ------------------
  ------------------
   76|  7.41k|    if (mi_step == 1) {
  ------------------
  |  Branch (76:9): [True: 597, False: 6.81k]
  ------------------
   77|    597|      left_mi_row &= ~1;
   78|    597|      left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride;
   79|    597|      mi_step = 2;
   80|    597|    }
   81|  7.41k|    if (is_neighbor_overlappable(*left_mi)) {
  ------------------
  |  Branch (81:9): [True: 7.21k, False: 193]
  ------------------
   82|  7.21k|      ++nb_count;
   83|  7.21k|      fun(xd, left_mi_row - mi_row, 0, AOMMIN(xd->height, mi_step), 1, *left_mi,
  ------------------
  |  |   34|  7.21k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.24k, False: 4.97k]
  |  |  ------------------
  ------------------
   84|  7.21k|          fun_ctxt, num_planes);
   85|  7.21k|    }
   86|  7.41k|  }
   87|  6.83k|}
reconinter.c:foreach_overlappable_nb_above:
   23|  59.6k|                                                 void *fun_ctxt) {
   24|  59.6k|  if (!xd->up_available) return;
  ------------------
  |  Branch (24:7): [True: 15.6k, False: 44.0k]
  ------------------
   25|       |
   26|  44.0k|  const int num_planes = av1_num_planes(cm);
   27|  44.0k|  int nb_count = 0;
   28|  44.0k|  const int mi_col = xd->mi_col;
   29|       |  // prev_row_mi points into the mi array, starting at the beginning of the
   30|       |  // previous row.
   31|  44.0k|  MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
   32|  44.0k|  const int end_col = AOMMIN(mi_col + xd->width, cm->mi_params.mi_cols);
  ------------------
  |  |   34|  44.0k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 42.1k, False: 1.95k]
  |  |  ------------------
  ------------------
   33|  44.0k|  uint8_t mi_step;
   34|  92.7k|  for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
  ------------------
  |  Branch (34:35): [True: 48.7k, False: 44.0k]
  |  Branch (34:61): [True: 48.6k, False: 14]
  ------------------
   35|  48.6k|       above_mi_col += mi_step) {
   36|  48.6k|    MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col;
   37|  48.6k|    mi_step =
   38|  48.6k|        AOMMIN(mi_size_wide[above_mi[0]->bsize], mi_size_wide[BLOCK_64X64]);
  ------------------
  |  |   34|  48.6k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 45.8k, False: 2.87k]
  |  |  ------------------
  ------------------
   39|       |    // If we're considering a block with width 4, it should be treated as
   40|       |    // half of a pair of blocks with chroma information in the second. Move
   41|       |    // above_mi_col back to the start of the pair if needed, set above_mbmi
   42|       |    // to point at the block with chroma information, and set mi_step to 2 to
   43|       |    // step over the entire pair at the end of the iteration.
   44|  48.6k|    if (mi_step == 1) {
  ------------------
  |  Branch (44:9): [True: 2.50k, False: 46.1k]
  ------------------
   45|  2.50k|      above_mi_col &= ~1;
   46|  2.50k|      above_mi = prev_row_mi + above_mi_col + 1;
   47|  2.50k|      mi_step = 2;
   48|  2.50k|    }
   49|  48.6k|    if (is_neighbor_overlappable(*above_mi)) {
  ------------------
  |  Branch (49:9): [True: 47.2k, False: 1.45k]
  ------------------
   50|  47.2k|      ++nb_count;
   51|  47.2k|      fun(xd, 0, above_mi_col - mi_col, AOMMIN(xd->width, mi_step), 0,
  ------------------
  |  |   34|  47.2k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 11.7k, False: 35.5k]
  |  |  ------------------
  ------------------
   52|  47.2k|          *above_mi, fun_ctxt, num_planes);
   53|  47.2k|    }
   54|  48.6k|  }
   55|  44.0k|}
reconinter.c:foreach_overlappable_nb_left:
   60|  22.9k|                                                void *fun_ctxt) {
   61|  22.9k|  if (!xd->left_available) return;
  ------------------
  |  Branch (61:7): [True: 5.51k, False: 17.4k]
  ------------------
   62|       |
   63|  17.4k|  const int num_planes = av1_num_planes(cm);
   64|  17.4k|  int nb_count = 0;
   65|       |  // prev_col_mi points into the mi array, starting at the top of the
   66|       |  // previous column
   67|  17.4k|  const int mi_row = xd->mi_row;
   68|  17.4k|  MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
   69|  17.4k|  const int end_row = AOMMIN(mi_row + xd->height, cm->mi_params.mi_rows);
  ------------------
  |  |   34|  17.4k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 15.8k, False: 1.63k]
  |  |  ------------------
  ------------------
   70|  17.4k|  uint8_t mi_step;
   71|  37.0k|  for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
  ------------------
  |  Branch (71:34): [True: 19.5k, False: 17.4k]
  |  Branch (71:59): [True: 19.5k, False: 25]
  ------------------
   72|  19.5k|       left_mi_row += mi_step) {
   73|  19.5k|    MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
   74|  19.5k|    mi_step =
   75|  19.5k|        AOMMIN(mi_size_high[left_mi[0]->bsize], mi_size_high[BLOCK_64X64]);
  ------------------
  |  |   34|  19.5k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 17.1k, False: 2.40k]
  |  |  ------------------
  ------------------
   76|  19.5k|    if (mi_step == 1) {
  ------------------
  |  Branch (76:9): [True: 1.24k, False: 18.3k]
  ------------------
   77|  1.24k|      left_mi_row &= ~1;
   78|  1.24k|      left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride;
   79|  1.24k|      mi_step = 2;
   80|  1.24k|    }
   81|  19.5k|    if (is_neighbor_overlappable(*left_mi)) {
  ------------------
  |  Branch (81:9): [True: 18.8k, False: 699]
  ------------------
   82|  18.8k|      ++nb_count;
   83|  18.8k|      fun(xd, left_mi_row - mi_row, 0, AOMMIN(xd->height, mi_step), 1, *left_mi,
  ------------------
  |  |   34|  18.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 4.74k, False: 14.1k]
  |  |  ------------------
  ------------------
   84|  18.8k|          fun_ctxt, num_planes);
   85|  18.8k|    }
   86|  19.5k|  }
   87|  17.4k|}

aom_read_obu_header_and_size:
   95|   176k|                                             size_t *const bytes_read) {
   96|   176k|  size_t length_field_size_obu = 0;
   97|   176k|  size_t length_field_size_payload = 0;
   98|   176k|  size_t obu_size = 0;
   99|   176k|  aom_codec_err_t status;
  100|       |
  101|   176k|  if (is_annexb) {
  ------------------
  |  Branch (101:7): [True: 0, False: 176k]
  ------------------
  102|       |    // Size field comes before the OBU header, and includes the OBU header
  103|      0|    status =
  104|      0|        read_obu_size(data, bytes_available, &obu_size, &length_field_size_obu);
  105|       |
  106|      0|    if (status != AOM_CODEC_OK) return status;
  ------------------
  |  Branch (106:9): [True: 0, False: 0]
  ------------------
  107|      0|  }
  108|       |
  109|   176k|  struct aom_read_bit_buffer rb = { data + length_field_size_obu,
  110|   176k|                                    data + bytes_available, 0, NULL, NULL };
  111|       |
  112|   176k|  status = read_obu_header(&rb, is_annexb, obu_header);
  113|   176k|  if (status != AOM_CODEC_OK) return status;
  ------------------
  |  Branch (113:7): [True: 1.21k, False: 175k]
  ------------------
  114|       |
  115|   175k|  if (!obu_header->has_size_field) {
  ------------------
  |  Branch (115:7): [True: 0, False: 175k]
  ------------------
  116|      0|    assert(is_annexb);
  117|       |    // Derive the payload size from the data we've already read
  118|      0|    if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME;
  ------------------
  |  Branch (118:9): [True: 0, False: 0]
  ------------------
  119|       |
  120|      0|    *payload_size = obu_size - obu_header->size;
  121|   175k|  } else {
  122|       |    // Size field comes after the OBU header, and is just the payload size
  123|   175k|    status = read_obu_size(
  124|   175k|        data + length_field_size_obu + obu_header->size,
  125|   175k|        bytes_available - length_field_size_obu - obu_header->size,
  126|   175k|        payload_size, &length_field_size_payload);
  127|   175k|    if (status != AOM_CODEC_OK) return status;
  ------------------
  |  Branch (127:9): [True: 227, False: 175k]
  ------------------
  128|   175k|  }
  129|       |
  130|   175k|  *bytes_read =
  131|   175k|      length_field_size_obu + obu_header->size + length_field_size_payload;
  132|   175k|  return AOM_CODEC_OK;
  133|   175k|}
obu_util.c:read_obu_header:
   34|   176k|                                       int is_annexb, ObuHeader *header) {
   35|   176k|  if (!rb || !header) return AOM_CODEC_INVALID_PARAM;
  ------------------
  |  Branch (35:7): [True: 0, False: 176k]
  |  Branch (35:14): [True: 0, False: 176k]
  ------------------
   36|       |
   37|   176k|  const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer;
   38|   176k|  if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME;
  ------------------
  |  Branch (38:7): [True: 9, False: 176k]
  ------------------
   39|       |
   40|   176k|  header->size = 1;
   41|       |
   42|   176k|  if (aom_rb_read_bit(rb) != 0) {
  ------------------
  |  Branch (42:7): [True: 727, False: 176k]
  ------------------
   43|       |    // Forbidden bit. Must not be set.
   44|    727|    return AOM_CODEC_CORRUPT_FRAME;
   45|    727|  }
   46|       |
   47|   176k|  header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);
   48|   176k|  header->has_extension = aom_rb_read_bit(rb);
   49|   176k|  header->has_size_field = aom_rb_read_bit(rb);
   50|       |
   51|   176k|  if (!header->has_size_field && !is_annexb) {
  ------------------
  |  Branch (51:7): [True: 478, False: 175k]
  |  Branch (51:34): [True: 478, False: 0]
  ------------------
   52|       |    // section 5 obu streams must have obu_size field set.
   53|    478|    return AOM_CODEC_UNSUP_BITSTREAM;
   54|    478|  }
   55|       |
   56|       |  // obu_reserved_1bit must be set to 0. The value is ignored by a decoder.
   57|   175k|  aom_rb_read_bit(rb);
   58|       |
   59|   175k|  if (header->has_extension) {
  ------------------
  |  Branch (59:7): [True: 3.10k, False: 172k]
  ------------------
   60|  3.10k|    if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;
  ------------------
  |  Branch (60:9): [True: 5, False: 3.09k]
  ------------------
   61|       |
   62|  3.09k|    header->size += 1;
   63|  3.09k|    header->temporal_layer_id = aom_rb_read_literal(rb, 3);
   64|  3.09k|    header->spatial_layer_id = aom_rb_read_literal(rb, 2);
   65|       |    // extension_header_reserved_3bits must be set to 0. The value is ignored by
   66|       |    // a decoder.
   67|  3.09k|    aom_rb_read_literal(rb, 3);
   68|   172k|  } else {
   69|   172k|    header->temporal_layer_id = 0;
   70|   172k|    header->spatial_layer_id = 0;
   71|   172k|  }
   72|       |
   73|   175k|  return AOM_CODEC_OK;
   74|   175k|}
obu_util.c:read_obu_size:
   20|   175k|                                     size_t *const length_field_size) {
   21|   175k|  uint64_t u_obu_size = 0;
   22|   175k|  if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) !=
  ------------------
  |  Branch (22:7): [True: 227, False: 175k]
  ------------------
   23|   175k|      0) {
   24|    227|    return AOM_CODEC_CORRUPT_FRAME;
   25|    227|  }
   26|       |
   27|   175k|  if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME;
  ------------------
  |  Branch (27:7): [True: 0, False: 175k]
  ------------------
   28|   175k|  *obu_size = (size_t)u_obu_size;
   29|   175k|  return AOM_CODEC_OK;
   30|   175k|}

av1_get_pred_context_switchable_interp:
   30|  31.7k|int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) {
   31|  31.7k|  const MB_MODE_INFO *const mbmi = xd->mi[0];
   32|  31.7k|  const int ctx_offset =
   33|  31.7k|      (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET;
  ------------------
  |  |  101|  31.7k|#define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1)
  ------------------
   34|  31.7k|  assert(dir == 0 || dir == 1);
   35|  31.7k|  const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
   36|       |  // Note:
   37|       |  // The mode info data structure has a one element border above and to the
   38|       |  // left of the entries corresponding to real macroblocks.
   39|       |  // The prediction flags in these dummy entries are initialized to 0.
   40|  31.7k|  int filter_type_ctx = ctx_offset + (dir & 0x01) * INTER_FILTER_DIR_OFFSET;
  ------------------
  |  |  102|  31.7k|#define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2)
  ------------------
   41|  31.7k|  int left_type = SWITCHABLE_FILTERS;
   42|  31.7k|  int above_type = SWITCHABLE_FILTERS;
   43|       |
   44|  31.7k|  if (xd->left_available)
  ------------------
  |  Branch (44:7): [True: 26.0k, False: 5.73k]
  ------------------
   45|  26.0k|    left_type = get_ref_filter_type(xd->mi[-1], xd, dir, ref_frame);
   46|       |
   47|  31.7k|  if (xd->up_available)
  ------------------
  |  Branch (47:7): [True: 24.2k, False: 7.54k]
  ------------------
   48|  24.2k|    above_type =
   49|  24.2k|        get_ref_filter_type(xd->mi[-xd->mi_stride], xd, dir, ref_frame);
   50|       |
   51|  31.7k|  if (left_type == above_type) {
  ------------------
  |  Branch (51:7): [True: 16.4k, False: 15.3k]
  ------------------
   52|  16.4k|    filter_type_ctx += left_type;
   53|  16.4k|  } else if (left_type == SWITCHABLE_FILTERS) {
  ------------------
  |  Branch (53:14): [True: 6.32k, False: 9.01k]
  ------------------
   54|  6.32k|    assert(above_type != SWITCHABLE_FILTERS);
   55|  6.32k|    filter_type_ctx += above_type;
   56|  9.01k|  } else if (above_type == SWITCHABLE_FILTERS) {
  ------------------
  |  Branch (56:14): [True: 7.89k, False: 1.12k]
  ------------------
   57|  7.89k|    assert(left_type != SWITCHABLE_FILTERS);
   58|  7.89k|    filter_type_ctx += left_type;
   59|  7.89k|  } else {
   60|  1.12k|    filter_type_ctx += SWITCHABLE_FILTERS;
   61|  1.12k|  }
   62|       |
   63|  31.7k|  return filter_type_ctx;
   64|  31.7k|}
av1_get_palette_cache:
   74|  62.4k|                          uint16_t *cache) {
   75|  62.4k|  const int row = -xd->mb_to_top_edge >> 3;
   76|       |  // Do not refer to above SB row when on SB boundary.
   77|  62.4k|  const MB_MODE_INFO *const above_mi =
   78|  62.4k|      (row % (1 << MIN_SB_SIZE_LOG2)) ? xd->above_mbmi : NULL;
  ------------------
  |  |   36|  62.4k|#define MIN_SB_SIZE_LOG2 6
  ------------------
  |  Branch (78:7): [True: 45.0k, False: 17.4k]
  ------------------
   79|  62.4k|  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
   80|  62.4k|  int above_n = 0, left_n = 0;
   81|  62.4k|  if (above_mi) above_n = above_mi->palette_mode_info.palette_size[plane != 0];
  ------------------
  |  Branch (81:7): [True: 45.0k, False: 17.4k]
  ------------------
   82|  62.4k|  if (left_mi) left_n = left_mi->palette_mode_info.palette_size[plane != 0];
  ------------------
  |  Branch (82:7): [True: 54.7k, False: 7.78k]
  ------------------
   83|  62.4k|  if (above_n == 0 && left_n == 0) return 0;
  ------------------
  |  Branch (83:7): [True: 42.4k, False: 20.0k]
  |  Branch (83:23): [True: 24.5k, False: 17.8k]
  ------------------
   84|  37.9k|  int above_idx = plane * PALETTE_MAX_SIZE;
  ------------------
  |  |   63|  37.9k|#define PALETTE_MAX_SIZE 8
  ------------------
   85|  37.9k|  int left_idx = plane * PALETTE_MAX_SIZE;
  ------------------
  |  |   63|  37.9k|#define PALETTE_MAX_SIZE 8
  ------------------
   86|  37.9k|  int n = 0;
   87|  37.9k|  const uint16_t *above_colors =
   88|  37.9k|      above_mi ? above_mi->palette_mode_info.palette_colors : NULL;
  ------------------
  |  Branch (88:7): [True: 30.5k, False: 7.34k]
  ------------------
   89|  37.9k|  const uint16_t *left_colors =
   90|  37.9k|      left_mi ? left_mi->palette_mode_info.palette_colors : NULL;
  ------------------
  |  Branch (90:7): [True: 36.9k, False: 935]
  ------------------
   91|       |  // Merge the sorted lists of base colors from above and left to get
   92|       |  // combined sorted color cache.
   93|  87.8k|  while (above_n > 0 && left_n > 0) {
  ------------------
  |  Branch (93:10): [True: 64.4k, False: 23.3k]
  |  Branch (93:25): [True: 49.9k, False: 14.5k]
  ------------------
   94|  49.9k|    uint16_t v_above = above_colors[above_idx];
   95|  49.9k|    uint16_t v_left = left_colors[left_idx];
   96|  49.9k|    if (v_left < v_above) {
  ------------------
  |  Branch (96:9): [True: 18.5k, False: 31.4k]
  ------------------
   97|  18.5k|      palette_add_to_cache(cache, &n, v_left);
   98|  18.5k|      ++left_idx, --left_n;
   99|  31.4k|    } else {
  100|  31.4k|      palette_add_to_cache(cache, &n, v_above);
  101|  31.4k|      ++above_idx, --above_n;
  102|  31.4k|      if (v_left == v_above) ++left_idx, --left_n;
  ------------------
  |  Branch (102:11): [True: 13.0k, False: 18.3k]
  ------------------
  103|  31.4k|    }
  104|  49.9k|  }
  105|  95.1k|  while (above_n-- > 0) {
  ------------------
  |  Branch (105:10): [True: 57.2k, False: 37.9k]
  ------------------
  106|  57.2k|    uint16_t val = above_colors[above_idx++];
  107|  57.2k|    palette_add_to_cache(cache, &n, val);
  108|  57.2k|  }
  109|   118k|  while (left_n-- > 0) {
  ------------------
  |  Branch (109:10): [True: 80.8k, False: 37.9k]
  ------------------
  110|  80.8k|    uint16_t val = left_colors[left_idx++];
  111|  80.8k|    palette_add_to_cache(cache, &n, val);
  112|  80.8k|  }
  113|       |  assert(n <= 2 * PALETTE_MAX_SIZE);
  114|  37.9k|  return n;
  115|  62.4k|}
av1_get_intra_inter_context:
  124|   139k|int av1_get_intra_inter_context(const MACROBLOCKD *xd) {
  125|   139k|  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
  126|   139k|  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
  127|   139k|  const int has_above = xd->up_available;
  128|   139k|  const int has_left = xd->left_available;
  129|       |
  130|   139k|  if (has_above && has_left) {  // both edges available
  ------------------
  |  Branch (130:7): [True: 110k, False: 28.8k]
  |  Branch (130:20): [True: 95.0k, False: 15.3k]
  ------------------
  131|  95.0k|    const int above_intra = !is_inter_block(above_mbmi);
  132|  95.0k|    const int left_intra = !is_inter_block(left_mbmi);
  133|  95.0k|    return left_intra && above_intra ? 3 : left_intra || above_intra;
  ------------------
  |  Branch (133:12): [True: 40.4k, False: 54.6k]
  |  Branch (133:26): [True: 38.1k, False: 2.27k]
  |  Branch (133:44): [True: 2.27k, False: 54.6k]
  |  Branch (133:58): [True: 2.39k, False: 52.2k]
  ------------------
  134|  95.0k|  } else if (has_above || has_left) {  // one edge available
  ------------------
  |  Branch (134:14): [True: 15.3k, False: 28.8k]
  |  Branch (134:27): [True: 20.7k, False: 8.06k]
  ------------------
  135|  36.1k|    return 2 * !is_inter_block(has_above ? above_mbmi : left_mbmi);
  ------------------
  |  Branch (135:32): [True: 15.3k, False: 20.7k]
  ------------------
  136|  36.1k|  } else {
  137|  8.06k|    return 0;
  138|  8.06k|  }
  139|   139k|}
av1_get_reference_mode_context:
  145|  22.6k|int av1_get_reference_mode_context(const MACROBLOCKD *xd) {
  146|  22.6k|  int ctx;
  147|  22.6k|  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
  148|  22.6k|  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
  149|  22.6k|  const int has_above = xd->up_available;
  150|  22.6k|  const int has_left = xd->left_available;
  151|       |
  152|       |  // Note:
  153|       |  // The mode info data structure has a one element border above and to the
  154|       |  // left of the entries corresponding to real macroblocks.
  155|       |  // The prediction flags in these dummy entries are initialized to 0.
  156|  22.6k|  if (has_above && has_left) {  // both edges available
  ------------------
  |  Branch (156:7): [True: 16.1k, False: 6.47k]
  |  Branch (156:20): [True: 13.6k, False: 2.58k]
  ------------------
  157|  13.6k|    if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi))
  ------------------
  |  Branch (157:9): [True: 7.12k, False: 6.48k]
  |  Branch (157:40): [True: 4.81k, False: 2.31k]
  ------------------
  158|       |      // neither edge uses comp pred (0/1)
  159|  4.81k|      ctx = IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ^
  ------------------
  |  |  143|  4.81k|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|  4.81k|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 2.07k, False: 2.73k]
  |  |  |  |  |  Branch (142:37): [True: 2.07k, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  160|  4.81k|            IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]);
  ------------------
  |  |  143|  4.81k|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|  4.81k|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 2.07k, False: 2.74k]
  |  |  |  |  |  Branch (142:37): [True: 2.07k, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  161|  8.79k|    else if (!has_second_ref(above_mbmi))
  ------------------
  |  Branch (161:14): [True: 2.31k, False: 6.48k]
  ------------------
  162|       |      // one of two edges uses comp pred (2/3)
  163|  2.31k|      ctx = 2 + (IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ||
  ------------------
  |  |  143|  2.31k|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|  4.63k|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 1.01k, False: 1.30k]
  |  |  |  |  |  Branch (142:37): [True: 1.01k, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  164|  1.30k|                 !is_inter_block(above_mbmi));
  ------------------
  |  Branch (164:18): [True: 137, False: 1.16k]
  ------------------
  165|  6.48k|    else if (!has_second_ref(left_mbmi))
  ------------------
  |  Branch (165:14): [True: 2.65k, False: 3.83k]
  ------------------
  166|       |      // one of two edges uses comp pred (2/3)
  167|  2.65k|      ctx = 2 + (IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]) ||
  ------------------
  |  |  143|  2.65k|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|  5.30k|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 1.21k, False: 1.44k]
  |  |  |  |  |  Branch (142:37): [True: 1.21k, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  168|  1.44k|                 !is_inter_block(left_mbmi));
  ------------------
  |  Branch (168:18): [True: 113, False: 1.32k]
  ------------------
  169|  3.83k|    else  // both edges use comp pred (4)
  170|  3.83k|      ctx = 4;
  171|  13.6k|  } else if (has_above || has_left) {  // one edge available
  ------------------
  |  Branch (171:14): [True: 2.57k, False: 6.48k]
  |  Branch (171:27): [True: 4.58k, False: 1.90k]
  ------------------
  172|  7.16k|    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
  ------------------
  |  Branch (172:37): [True: 2.58k, False: 4.58k]
  ------------------
  173|       |
  174|  7.16k|    if (!has_second_ref(edge_mbmi))
  ------------------
  |  Branch (174:9): [True: 4.42k, False: 2.73k]
  ------------------
  175|       |      // edge does not use comp pred (0/1)
  176|  4.42k|      ctx = IS_BACKWARD_REF_FRAME(edge_mbmi->ref_frame[0]);
  ------------------
  |  |  143|  4.42k|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|  4.42k|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 2.02k, False: 2.40k]
  |  |  |  |  |  Branch (142:37): [True: 2.02k, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  177|  2.73k|    else
  178|       |      // edge uses comp pred (3)
  179|  2.73k|      ctx = 3;
  180|  7.16k|  } else {  // no edges available (1)
  181|  1.88k|    ctx = 1;
  182|  1.88k|  }
  183|       |  assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS);
  184|  22.6k|  return ctx;
  185|  22.6k|}
av1_get_comp_reference_type_context:
  187|  11.8k|int av1_get_comp_reference_type_context(const MACROBLOCKD *xd) {
  188|  11.8k|  int pred_context;
  189|  11.8k|  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
  190|  11.8k|  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
  191|  11.8k|  const int above_in_image = xd->up_available;
  192|  11.8k|  const int left_in_image = xd->left_available;
  193|       |
  194|  11.8k|  if (above_in_image && left_in_image) {  // both edges available
  ------------------
  |  Branch (194:7): [True: 8.95k, False: 2.93k]
  |  Branch (194:25): [True: 7.87k, False: 1.07k]
  ------------------
  195|  7.87k|    const int above_intra = !is_inter_block(above_mbmi);
  196|  7.87k|    const int left_intra = !is_inter_block(left_mbmi);
  197|       |
  198|  7.87k|    if (above_intra && left_intra) {  // intra/intra
  ------------------
  |  Branch (198:9): [True: 150, False: 7.72k]
  |  Branch (198:24): [True: 17, False: 133]
  ------------------
  199|     17|      pred_context = 2;
  200|  7.85k|    } else if (above_intra || left_intra) {  // intra/inter
  ------------------
  |  Branch (200:16): [True: 133, False: 7.72k]
  |  Branch (200:31): [True: 121, False: 7.60k]
  ------------------
  201|    254|      const MB_MODE_INFO *inter_mbmi = above_intra ? left_mbmi : above_mbmi;
  ------------------
  |  Branch (201:40): [True: 133, False: 121]
  ------------------
  202|       |
  203|    254|      if (!has_second_ref(inter_mbmi))  // single pred
  ------------------
  |  Branch (203:11): [True: 65, False: 189]
  ------------------
  204|     65|        pred_context = 2;
  205|    189|      else  // comp pred
  206|    189|        pred_context = 1 + 2 * has_uni_comp_refs(inter_mbmi);
  207|  7.60k|    } else {  // inter/inter
  208|  7.60k|      const int a_sg = !has_second_ref(above_mbmi);
  209|  7.60k|      const int l_sg = !has_second_ref(left_mbmi);
  210|  7.60k|      const MV_REFERENCE_FRAME frfa = above_mbmi->ref_frame[0];
  211|  7.60k|      const MV_REFERENCE_FRAME frfl = left_mbmi->ref_frame[0];
  212|       |
  213|  7.60k|      if (a_sg && l_sg) {  // single/single
  ------------------
  |  Branch (213:11): [True: 2.39k, False: 5.21k]
  |  Branch (213:19): [True: 966, False: 1.42k]
  ------------------
  214|    966|        pred_context = 1 + 2 * (!(IS_BACKWARD_REF_FRAME(frfa) ^
  ------------------
  |  |  143|    966|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|    966|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 476, False: 490]
  |  |  |  |  |  Branch (142:37): [True: 476, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  215|    966|                                  IS_BACKWARD_REF_FRAME(frfl)));
  ------------------
  |  |  143|    966|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|    966|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 457, False: 509]
  |  |  |  |  |  Branch (142:37): [True: 457, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  216|  6.63k|      } else if (l_sg || a_sg) {  // single/comp
  ------------------
  |  Branch (216:18): [True: 1.66k, False: 4.97k]
  |  Branch (216:26): [True: 1.42k, False: 3.54k]
  ------------------
  217|  3.09k|        const int uni_rfc =
  218|  3.09k|            a_sg ? has_uni_comp_refs(left_mbmi) : has_uni_comp_refs(above_mbmi);
  ------------------
  |  Branch (218:13): [True: 1.42k, False: 1.66k]
  ------------------
  219|       |
  220|  3.09k|        if (!uni_rfc)  // comp bidir
  ------------------
  |  Branch (220:13): [True: 2.31k, False: 782]
  ------------------
  221|  2.31k|          pred_context = 1;
  222|    782|        else  // comp unidir
  223|    782|          pred_context = 3 + (!(IS_BACKWARD_REF_FRAME(frfa) ^
  ------------------
  |  |  143|    782|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|    782|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 325, False: 457]
  |  |  |  |  |  Branch (142:37): [True: 325, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  224|    782|                                IS_BACKWARD_REF_FRAME(frfl)));
  ------------------
  |  |  143|    782|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|    782|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 353, False: 429]
  |  |  |  |  |  Branch (142:37): [True: 353, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  225|  3.54k|      } else {  // comp/comp
  226|  3.54k|        const int a_uni_rfc = has_uni_comp_refs(above_mbmi);
  227|  3.54k|        const int l_uni_rfc = has_uni_comp_refs(left_mbmi);
  228|       |
  229|  3.54k|        if (!a_uni_rfc && !l_uni_rfc)  // bidir/bidir
  ------------------
  |  Branch (229:13): [True: 2.54k, False: 1.00k]
  |  Branch (229:27): [True: 2.21k, False: 330]
  ------------------
  230|  2.21k|          pred_context = 0;
  231|  1.33k|        else if (!a_uni_rfc || !l_uni_rfc)  // unidir/bidir
  ------------------
  |  Branch (231:18): [True: 328, False: 1.00k]
  |  Branch (231:32): [True: 495, False: 508]
  ------------------
  232|    824|          pred_context = 2;
  233|    507|        else  // unidir/unidir
  234|    507|          pred_context =
  235|    507|              3 + (!((frfa == BWDREF_FRAME) ^ (frfl == BWDREF_FRAME)));
  236|  3.54k|      }
  237|  7.60k|    }
  238|  7.87k|  } else if (above_in_image || left_in_image) {  // one edge available
  ------------------
  |  Branch (238:14): [True: 1.07k, False: 2.93k]
  |  Branch (238:32): [True: 2.28k, False: 658]
  ------------------
  239|  3.35k|    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
  ------------------
  |  Branch (239:37): [True: 1.07k, False: 2.28k]
  ------------------
  240|       |
  241|  3.35k|    if (!is_inter_block(edge_mbmi)) {  // intra
  ------------------
  |  Branch (241:9): [True: 38, False: 3.32k]
  ------------------
  242|     38|      pred_context = 2;
  243|  3.32k|    } else {                           // inter
  244|  3.32k|      if (!has_second_ref(edge_mbmi))  // single pred
  ------------------
  |  Branch (244:11): [True: 1.28k, False: 2.04k]
  ------------------
  245|  1.28k|        pred_context = 2;
  246|  2.04k|      else  // comp pred
  247|  2.04k|        pred_context = 4 * has_uni_comp_refs(edge_mbmi);
  248|  3.32k|    }
  249|  3.35k|  } else {  // no edges available
  250|    656|    pred_context = 2;
  251|    656|  }
  252|       |
  253|       |  assert(pred_context >= 0 && pred_context < COMP_REF_TYPE_CONTEXTS);
  254|  11.8k|  return pred_context;
  255|  11.8k|}
av1_get_pred_context_uni_comp_ref_p:
  265|  2.75k|int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd) {
  266|  2.75k|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  267|       |
  268|       |  // Count of forward references (L, L2, L3, or G)
  269|  2.75k|  const int frf_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] +
  270|  2.75k|                        ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME];
  271|       |  // Count of backward references (B or A)
  272|  2.75k|  const int brf_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] +
  273|  2.75k|                        ref_counts[ALTREF_FRAME];
  274|       |
  275|  2.75k|  const int pred_context =
  276|  2.75k|      (frf_count == brf_count) ? 1 : ((frf_count < brf_count) ? 0 : 2);
  ------------------
  |  Branch (276:7): [True: 345, False: 2.40k]
  |  Branch (276:39): [True: 1.01k, False: 1.39k]
  ------------------
  277|       |
  278|       |  assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS);
  279|  2.75k|  return pred_context;
  280|  2.75k|}
av1_get_pred_context_uni_comp_ref_p1:
  290|  1.75k|int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd) {
  291|  1.75k|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  292|       |
  293|       |  // Count of LAST2
  294|  1.75k|  const int last2_count = ref_counts[LAST2_FRAME];
  295|       |  // Count of LAST3 or GOLDEN
  296|  1.75k|  const int last3_or_gld_count =
  297|  1.75k|      ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME];
  298|       |
  299|  1.75k|  const int pred_context = (last2_count == last3_or_gld_count)
  ------------------
  |  Branch (299:28): [True: 558, False: 1.19k]
  ------------------
  300|  1.75k|                               ? 1
  301|  1.75k|                               : ((last2_count < last3_or_gld_count) ? 0 : 2);
  ------------------
  |  Branch (301:35): [True: 864, False: 334]
  ------------------
  302|       |
  303|       |  assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS);
  304|  1.75k|  return pred_context;
  305|  1.75k|}
av1_get_pred_context_uni_comp_ref_p2:
  315|  1.23k|int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd) {
  316|  1.23k|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  317|       |
  318|       |  // Count of LAST3
  319|  1.23k|  const int last3_count = ref_counts[LAST3_FRAME];
  320|       |  // Count of GOLDEN
  321|  1.23k|  const int gld_count = ref_counts[GOLDEN_FRAME];
  322|       |
  323|  1.23k|  const int pred_context =
  324|  1.23k|      (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2);
  ------------------
  |  Branch (324:7): [True: 444, False: 786]
  |  Branch (324:41): [True: 323, False: 463]
  ------------------
  325|       |
  326|       |  assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS);
  327|  1.23k|  return pred_context;
  328|  1.23k|}
av1_get_pred_context_comp_ref_p:
  421|  9.13k|int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd) {
  422|  9.13k|  return get_pred_context_ll2_or_l3gld(xd);
  423|  9.13k|}
av1_get_pred_context_comp_ref_p1:
  428|  6.19k|int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd) {
  429|  6.19k|  return get_pred_context_last_or_last2(xd);
  430|  6.19k|}
av1_get_pred_context_comp_ref_p2:
  435|  2.94k|int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd) {
  436|  2.94k|  return get_pred_context_last3_or_gld(xd);
  437|  2.94k|}
av1_get_pred_context_comp_bwdref_p:
  441|  9.13k|int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd) {
  442|  9.13k|  return get_pred_context_brfarf2_or_arf(xd);
  443|  9.13k|}
av1_get_pred_context_comp_bwdref_p1:
  447|  4.75k|int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd) {
  448|  4.75k|  return get_pred_context_brf_or_arf2(xd);
  449|  4.75k|}
av1_get_pred_context_single_ref_p1:
  455|  63.7k|int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
  456|  63.7k|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  457|       |
  458|       |  // Count of forward reference frames
  459|  63.7k|  const int fwd_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] +
  460|  63.7k|                        ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME];
  461|       |  // Count of backward reference frames
  462|  63.7k|  const int bwd_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] +
  463|  63.7k|                        ref_counts[ALTREF_FRAME];
  464|       |
  465|  63.7k|  const int pred_context =
  466|  63.7k|      (fwd_count == bwd_count) ? 1 : ((fwd_count < bwd_count) ? 0 : 2);
  ------------------
  |  Branch (466:7): [True: 13.4k, False: 50.2k]
  |  Branch (466:39): [True: 17.1k, False: 33.1k]
  ------------------
  467|       |
  468|       |  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
  469|  63.7k|  return pred_context;
  470|  63.7k|}
av1_get_pred_context_single_ref_p2:
  475|  22.8k|int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
  476|  22.8k|  return get_pred_context_brfarf2_or_arf(xd);
  477|  22.8k|}
av1_get_pred_context_single_ref_p3:
  481|  40.8k|int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) {
  482|  40.8k|  return get_pred_context_ll2_or_l3gld(xd);
  483|  40.8k|}
av1_get_pred_context_single_ref_p4:
  487|  31.4k|int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) {
  488|  31.4k|  return get_pred_context_last_or_last2(xd);
  489|  31.4k|}
av1_get_pred_context_single_ref_p5:
  493|  9.40k|int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) {
  494|  9.40k|  return get_pred_context_last3_or_gld(xd);
  495|  9.40k|}
av1_get_pred_context_single_ref_p6:
  499|  10.2k|int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd) {
  500|  10.2k|  return get_pred_context_brf_or_arf2(xd);
  501|  10.2k|}
pred_common.c:get_ref_filter_type:
   21|  50.3k|                                        MV_REFERENCE_FRAME ref_frame) {
   22|  50.3k|  (void)xd;
   23|       |
   24|  50.3k|  return ((ref_mbmi->ref_frame[0] == ref_frame ||
  ------------------
  |  Branch (24:12): [True: 32.0k, False: 18.2k]
  ------------------
   25|  18.2k|           ref_mbmi->ref_frame[1] == ref_frame)
  ------------------
  |  Branch (25:12): [True: 2.27k, False: 15.9k]
  ------------------
   26|  50.3k|              ? av1_extract_interp_filter(ref_mbmi->interp_filters, dir & 0x01)
   27|  50.3k|              : SWITCHABLE_FILTERS);
   28|  50.3k|}
pred_common.c:palette_add_to_cache:
   66|   188k|static void palette_add_to_cache(uint16_t *cache, int *n, uint16_t val) {
   67|       |  // Do not add an already existing value
   68|   188k|  if (*n > 0 && val == cache[*n - 1]) return;
  ------------------
  |  Branch (68:7): [True: 150k, False: 37.9k]
  |  Branch (68:17): [True: 24.8k, False: 125k]
  ------------------
   69|       |
   70|   163k|  cache[(*n)++] = val;
   71|   163k|}
pred_common.c:get_pred_context_ll2_or_l3gld:
  334|  50.0k|static int get_pred_context_ll2_or_l3gld(const MACROBLOCKD *xd) {
  335|  50.0k|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  336|       |
  337|       |  // Count of LAST + LAST2
  338|  50.0k|  const int last_last2_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME];
  339|       |  // Count of LAST3 + GOLDEN
  340|  50.0k|  const int last3_gld_count =
  341|  50.0k|      ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME];
  342|       |
  343|  50.0k|  const int pred_context = (last_last2_count == last3_gld_count)
  ------------------
  |  Branch (343:28): [True: 10.5k, False: 39.5k]
  ------------------
  344|  50.0k|                               ? 1
  345|  50.0k|                               : ((last_last2_count < last3_gld_count) ? 0 : 2);
  ------------------
  |  Branch (345:35): [True: 8.66k, False: 30.8k]
  ------------------
  346|       |
  347|       |  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
  348|  50.0k|  return pred_context;
  349|  50.0k|}
pred_common.c:get_pred_context_last_or_last2:
  352|  37.6k|static int get_pred_context_last_or_last2(const MACROBLOCKD *xd) {
  353|  37.6k|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  354|       |
  355|       |  // Count of LAST
  356|  37.6k|  const int last_count = ref_counts[LAST_FRAME];
  357|       |  // Count of LAST2
  358|  37.6k|  const int last2_count = ref_counts[LAST2_FRAME];
  359|       |
  360|  37.6k|  const int pred_context =
  361|  37.6k|      (last_count == last2_count) ? 1 : ((last_count < last2_count) ? 0 : 2);
  ------------------
  |  Branch (361:7): [True: 7.24k, False: 30.4k]
  |  Branch (361:42): [True: 2.44k, False: 27.9k]
  ------------------
  362|       |
  363|       |  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
  364|  37.6k|  return pred_context;
  365|  37.6k|}
pred_common.c:get_pred_context_last3_or_gld:
  368|  12.3k|static int get_pred_context_last3_or_gld(const MACROBLOCKD *xd) {
  369|  12.3k|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  370|       |
  371|       |  // Count of LAST3
  372|  12.3k|  const int last3_count = ref_counts[LAST3_FRAME];
  373|       |  // Count of GOLDEN
  374|  12.3k|  const int gld_count = ref_counts[GOLDEN_FRAME];
  375|       |
  376|  12.3k|  const int pred_context =
  377|  12.3k|      (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2);
  ------------------
  |  Branch (377:7): [True: 3.71k, False: 8.63k]
  |  Branch (377:41): [True: 5.40k, False: 3.23k]
  ------------------
  378|       |
  379|       |  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
  380|  12.3k|  return pred_context;
  381|  12.3k|}
pred_common.c:get_pred_context_brfarf2_or_arf:
  385|  32.0k|static int get_pred_context_brfarf2_or_arf(const MACROBLOCKD *xd) {
  386|  32.0k|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  387|       |
  388|       |  // Counts of BWDREF, ALTREF2, or ALTREF frames (B, A2, or A)
  389|  32.0k|  const int brfarf2_count =
  390|  32.0k|      ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME];
  391|  32.0k|  const int arf_count = ref_counts[ALTREF_FRAME];
  392|       |
  393|  32.0k|  const int pred_context =
  394|  32.0k|      (brfarf2_count == arf_count) ? 1 : ((brfarf2_count < arf_count) ? 0 : 2);
  ------------------
  |  Branch (394:7): [True: 7.65k, False: 24.3k]
  |  Branch (394:43): [True: 13.2k, False: 11.1k]
  ------------------
  395|       |
  396|       |  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
  397|  32.0k|  return pred_context;
  398|  32.0k|}
pred_common.c:get_pred_context_brf_or_arf2:
  401|  15.0k|static int get_pred_context_brf_or_arf2(const MACROBLOCKD *xd) {
  402|  15.0k|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  403|       |
  404|       |  // Count of BWDREF frames (B)
  405|  15.0k|  const int brf_count = ref_counts[BWDREF_FRAME];
  406|       |  // Count of ALTREF2 frames (A2)
  407|  15.0k|  const int arf2_count = ref_counts[ALTREF2_FRAME];
  408|       |
  409|  15.0k|  const int pred_context =
  410|  15.0k|      (brf_count == arf2_count) ? 1 : ((brf_count < arf2_count) ? 0 : 2);
  ------------------
  |  Branch (410:7): [True: 3.93k, False: 11.0k]
  |  Branch (410:40): [True: 5.94k, False: 5.14k]
  ------------------
  411|       |
  412|       |  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
  413|  15.0k|  return pred_context;
  414|  15.0k|}

decodeframe.c:get_tx_size_context:
  342|   646k|static inline int get_tx_size_context(const MACROBLOCKD *xd) {
  343|   646k|  const MB_MODE_INFO *mbmi = xd->mi[0];
  344|   646k|  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
  345|   646k|  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
  346|   646k|  const TX_SIZE max_tx_size = max_txsize_rect_lookup[mbmi->bsize];
  347|   646k|  const int max_tx_wide = tx_size_wide[max_tx_size];
  348|   646k|  const int max_tx_high = tx_size_high[max_tx_size];
  349|   646k|  const int has_above = xd->up_available;
  350|   646k|  const int has_left = xd->left_available;
  351|       |
  352|   646k|  int above = xd->above_txfm_context[0] >= max_tx_wide;
  353|   646k|  int left = xd->left_txfm_context[0] >= max_tx_high;
  354|       |
  355|   646k|  if (has_above)
  ------------------
  |  Branch (355:7): [True: 601k, False: 44.4k]
  ------------------
  356|   601k|    if (is_inter_block(above_mbmi))
  ------------------
  |  Branch (356:9): [True: 7.20k, False: 594k]
  ------------------
  357|  7.20k|      above = block_size_wide[above_mbmi->bsize] >= max_tx_wide;
  358|       |
  359|   646k|  if (has_left)
  ------------------
  |  Branch (359:7): [True: 611k, False: 34.9k]
  ------------------
  360|   611k|    if (is_inter_block(left_mbmi))
  ------------------
  |  Branch (360:9): [True: 7.22k, False: 604k]
  ------------------
  361|  7.22k|      left = block_size_high[left_mbmi->bsize] >= max_tx_high;
  362|       |
  363|   646k|  if (has_above && has_left)
  ------------------
  |  Branch (363:7): [True: 601k, False: 44.4k]
  |  Branch (363:20): [True: 574k, False: 27.7k]
  ------------------
  364|   574k|    return (above + left);
  365|  72.1k|  else if (has_above)
  ------------------
  |  Branch (365:12): [True: 27.7k, False: 44.4k]
  ------------------
  366|  27.7k|    return above;
  367|  44.4k|  else if (has_left)
  ------------------
  |  Branch (367:12): [True: 37.2k, False: 7.18k]
  ------------------
  368|  37.2k|    return left;
  369|  7.18k|  else
  370|  7.18k|    return 0;
  371|   646k|}
decodemv.c:av1_get_spatial_seg_pred:
   51|   444k|                                               int skip_over4x4) {
   52|   444k|  const int step_size = skip_over4x4 ? 2 : 1;
  ------------------
  |  Branch (52:25): [True: 0, False: 444k]
  ------------------
   53|   444k|  uint8_t prev_ul = UINT8_MAX;  // top left segment_id
   54|   444k|  uint8_t prev_l = UINT8_MAX;   // left segment_id
   55|   444k|  uint8_t prev_u = UINT8_MAX;   // top segment_id
   56|   444k|  const int mi_row = xd->mi_row;
   57|   444k|  const int mi_col = xd->mi_col;
   58|   444k|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   59|   444k|  const uint8_t *seg_map = cm->cur_frame->seg_map;
   60|   444k|  if ((xd->up_available) && (xd->left_available)) {
  ------------------
  |  Branch (60:7): [True: 423k, False: 20.9k]
  |  Branch (60:29): [True: 408k, False: 15.0k]
  ------------------
   61|   408k|    prev_ul = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - step_size,
   62|   408k|                             mi_col - step_size);
   63|   408k|  }
   64|   444k|  if (xd->up_available) {
  ------------------
  |  Branch (64:7): [True: 423k, False: 20.9k]
  ------------------
   65|   423k|    prev_u = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - step_size,
   66|   423k|                            mi_col - 0);
   67|   423k|  }
   68|   444k|  if (xd->left_available) {
  ------------------
  |  Branch (68:7): [True: 426k, False: 18.1k]
  ------------------
   69|   426k|    prev_l = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 0,
   70|   426k|                            mi_col - step_size);
   71|   426k|  }
   72|   444k|  assert(IMPLIES(prev_ul != UINT8_MAX,
   73|   444k|                 prev_u != UINT8_MAX && prev_l != UINT8_MAX));
   74|       |
   75|       |  // Pick CDF index based on number of matching/out-of-bounds segment IDs.
   76|   444k|  if (prev_ul == UINT8_MAX) /* Edge cases */
  ------------------
  |  Branch (76:7): [True: 35.9k, False: 408k]
  ------------------
   77|  35.9k|    *cdf_index = 0;
   78|   408k|  else if ((prev_ul == prev_u) && (prev_ul == prev_l))
  ------------------
  |  Branch (78:12): [True: 198k, False: 209k]
  |  Branch (78:35): [True: 96.9k, False: 101k]
  ------------------
   79|  96.9k|    *cdf_index = 2;
   80|   311k|  else if ((prev_ul == prev_u) || (prev_ul == prev_l) || (prev_u == prev_l))
  ------------------
  |  Branch (80:12): [True: 101k, False: 209k]
  |  Branch (80:35): [True: 103k, False: 106k]
  |  Branch (80:58): [True: 19.5k, False: 86.9k]
  ------------------
   81|   224k|    *cdf_index = 1;
   82|  86.9k|  else
   83|  86.9k|    *cdf_index = 0;
   84|       |
   85|       |  // If 2 or more are identical returns that as predictor, otherwise prev_l.
   86|   444k|  if (prev_u == UINT8_MAX)  // edge case
  ------------------
  |  Branch (86:7): [True: 20.9k, False: 423k]
  ------------------
   87|  20.9k|    return prev_l == UINT8_MAX ? 0 : prev_l;
  ------------------
  |  Branch (87:12): [True: 3.10k, False: 17.8k]
  ------------------
   88|   423k|  if (prev_l == UINT8_MAX)  // edge case
  ------------------
  |  Branch (88:7): [True: 15.0k, False: 408k]
  ------------------
   89|  15.0k|    return prev_u;
   90|   408k|  return (prev_ul == prev_u) ? prev_u : prev_l;
  ------------------
  |  Branch (90:10): [True: 198k, False: 209k]
  ------------------
   91|   423k|}
decodemv.c:get_segment_id:
   28|  1.25M|    BLOCK_SIZE bsize, int mi_row, int mi_col) {
   29|  1.25M|  const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
   30|  1.25M|  const int bw = mi_size_wide[bsize];
   31|  1.25M|  const int bh = mi_size_high[bsize];
   32|  1.25M|  const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw);
  ------------------
  |  |   34|  1.25M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 1.25M]
  |  |  ------------------
  ------------------
   33|  1.25M|  const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh);
  ------------------
  |  |   34|  1.25M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 1.25M]
  |  |  ------------------
  ------------------
   34|  1.25M|  const int seg_stride = mi_params->mi_cols;
   35|  1.25M|  uint8_t segment_id = MAX_SEGMENTS;
  ------------------
  |  |   21|  1.25M|#define MAX_SEGMENTS 8
  ------------------
   36|       |
   37|  2.51M|  for (int y = 0; y < ymis; ++y) {
  ------------------
  |  Branch (37:19): [True: 1.25M, False: 1.25M]
  ------------------
   38|  2.51M|    for (int x = 0; x < xmis; ++x) {
  ------------------
  |  Branch (38:21): [True: 1.25M, False: 1.25M]
  ------------------
   39|  1.25M|      segment_id =
   40|  1.25M|          AOMMIN(segment_id, segment_ids[mi_offset + y * seg_stride + x]);
  ------------------
  |  |   34|  1.25M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 1.25M]
  |  |  ------------------
  ------------------
   41|  1.25M|    }
   42|  1.25M|  }
   43|       |
   44|       |  assert(segment_id < MAX_SEGMENTS);
   45|  1.25M|  return segment_id;
   46|  1.25M|}
decodemv.c:av1_get_skip_txfm_context:
  175|  1.56M|static inline int av1_get_skip_txfm_context(const MACROBLOCKD *xd) {
  176|  1.56M|  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
  177|  1.56M|  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
  178|  1.56M|  const int above_skip_txfm = above_mi ? above_mi->skip_txfm : 0;
  ------------------
  |  Branch (178:31): [True: 1.44M, False: 120k]
  ------------------
  179|  1.56M|  const int left_skip_txfm = left_mi ? left_mi->skip_txfm : 0;
  ------------------
  |  Branch (179:30): [True: 1.46M, False: 99.3k]
  ------------------
  180|  1.56M|  return above_skip_txfm + left_skip_txfm;
  181|  1.56M|}
decodemv.c:av1_get_palette_bsize_ctx:
  192|   601k|static inline int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) {
  193|       |  assert(bsize < BLOCK_SIZES_ALL);
  194|   601k|  return num_pels_log2_lookup[bsize] - num_pels_log2_lookup[BLOCK_8X8];
  195|   601k|}
decodemv.c:av1_get_palette_mode_ctx:
  197|   264k|static inline int av1_get_palette_mode_ctx(const MACROBLOCKD *xd) {
  198|   264k|  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
  199|   264k|  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
  200|   264k|  int ctx = 0;
  201|   264k|  if (above_mi) ctx += (above_mi->palette_mode_info.palette_size[0] > 0);
  ------------------
  |  Branch (201:7): [True: 240k, False: 23.8k]
  ------------------
  202|   264k|  if (left_mi) ctx += (left_mi->palette_mode_info.palette_size[0] > 0);
  ------------------
  |  Branch (202:7): [True: 246k, False: 18.4k]
  ------------------
  203|   264k|  return ctx;
  204|   264k|}
decodemv.c:av1_get_pred_context_seg_id:
   93|  1.03k|static inline uint8_t av1_get_pred_context_seg_id(const MACROBLOCKD *xd) {
   94|  1.03k|  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
   95|  1.03k|  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
   96|  1.03k|  const int above_sip = (above_mi != NULL) ? above_mi->seg_id_predicted : 0;
  ------------------
  |  Branch (96:25): [True: 860, False: 178]
  ------------------
   97|  1.03k|  const int left_sip = (left_mi != NULL) ? left_mi->seg_id_predicted : 0;
  ------------------
  |  Branch (97:24): [True: 930, False: 108]
  ------------------
   98|       |
   99|  1.03k|  return above_sip + left_sip;
  100|  1.03k|}
decodemv.c:av1_get_skip_mode_context:
  167|  5.54k|static inline int av1_get_skip_mode_context(const MACROBLOCKD *xd) {
  168|  5.54k|  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
  169|  5.54k|  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
  170|  5.54k|  const int above_skip_mode = above_mi ? above_mi->skip_mode : 0;
  ------------------
  |  Branch (170:31): [True: 4.37k, False: 1.16k]
  ------------------
  171|  5.54k|  const int left_skip_mode = left_mi ? left_mi->skip_mode : 0;
  ------------------
  |  Branch (171:30): [True: 4.78k, False: 758]
  ------------------
  172|  5.54k|  return above_skip_mode + left_skip_mode;
  173|  5.54k|}
decodemv.c:av1_get_pred_cdf_uni_comp_ref_p:
  235|  2.75k|    const MACROBLOCKD *xd) {
  236|  2.75k|  const int pred_context = av1_get_pred_context_uni_comp_ref_p(xd);
  237|  2.75k|  return xd->tile_ctx->uni_comp_ref_cdf[pred_context][0];
  238|  2.75k|}
decodemv.c:av1_get_pred_cdf_uni_comp_ref_p1:
  241|  1.75k|    const MACROBLOCKD *xd) {
  242|  1.75k|  const int pred_context = av1_get_pred_context_uni_comp_ref_p1(xd);
  243|  1.75k|  return xd->tile_ctx->uni_comp_ref_cdf[pred_context][1];
  244|  1.75k|}
decodemv.c:av1_get_pred_cdf_uni_comp_ref_p2:
  247|  1.23k|    const MACROBLOCKD *xd) {
  248|  1.23k|  const int pred_context = av1_get_pred_context_uni_comp_ref_p2(xd);
  249|  1.23k|  return xd->tile_ctx->uni_comp_ref_cdf[pred_context][2];
  250|  1.23k|}
decodemv.c:av1_get_pred_cdf_comp_ref_p:
  264|  9.13k|static inline aom_cdf_prob *av1_get_pred_cdf_comp_ref_p(const MACROBLOCKD *xd) {
  265|  9.13k|  const int pred_context = av1_get_pred_context_comp_ref_p(xd);
  266|  9.13k|  return xd->tile_ctx->comp_ref_cdf[pred_context][0];
  267|  9.13k|}
decodemv.c:av1_get_pred_cdf_comp_ref_p1:
  270|  6.19k|    const MACROBLOCKD *xd) {
  271|  6.19k|  const int pred_context = av1_get_pred_context_comp_ref_p1(xd);
  272|  6.19k|  return xd->tile_ctx->comp_ref_cdf[pred_context][1];
  273|  6.19k|}
decodemv.c:av1_get_pred_cdf_comp_ref_p2:
  276|  2.94k|    const MACROBLOCKD *xd) {
  277|  2.94k|  const int pred_context = av1_get_pred_context_comp_ref_p2(xd);
  278|  2.94k|  return xd->tile_ctx->comp_ref_cdf[pred_context][2];
  279|  2.94k|}
decodemv.c:av1_get_pred_cdf_comp_bwdref_p:
  282|  9.13k|    const MACROBLOCKD *xd) {
  283|  9.13k|  const int pred_context = av1_get_pred_context_comp_bwdref_p(xd);
  284|  9.13k|  return xd->tile_ctx->comp_bwdref_cdf[pred_context][0];
  285|  9.13k|}
decodemv.c:av1_get_pred_cdf_comp_bwdref_p1:
  288|  4.75k|    const MACROBLOCKD *xd) {
  289|  4.75k|  const int pred_context = av1_get_pred_context_comp_bwdref_p1(xd);
  290|  4.75k|  return xd->tile_ctx->comp_bwdref_cdf[pred_context][1];
  291|  4.75k|}
decodemv.c:av1_get_pred_cdf_single_ref_p1:
  308|  63.7k|    const MACROBLOCKD *xd) {
  309|  63.7k|  return xd->tile_ctx
  310|  63.7k|      ->single_ref_cdf[av1_get_pred_context_single_ref_p1(xd)][0];
  311|  63.7k|}
decodemv.c:av1_get_pred_cdf_single_ref_p2:
  313|  22.8k|    const MACROBLOCKD *xd) {
  314|  22.8k|  return xd->tile_ctx
  315|  22.8k|      ->single_ref_cdf[av1_get_pred_context_single_ref_p2(xd)][1];
  316|  22.8k|}
decodemv.c:av1_get_pred_cdf_single_ref_p6:
  333|  10.2k|    const MACROBLOCKD *xd) {
  334|  10.2k|  return xd->tile_ctx
  335|  10.2k|      ->single_ref_cdf[av1_get_pred_context_single_ref_p6(xd)][5];
  336|  10.2k|}
decodemv.c:av1_get_pred_cdf_single_ref_p3:
  318|  40.8k|    const MACROBLOCKD *xd) {
  319|  40.8k|  return xd->tile_ctx
  320|  40.8k|      ->single_ref_cdf[av1_get_pred_context_single_ref_p3(xd)][2];
  321|  40.8k|}
decodemv.c:av1_get_pred_cdf_single_ref_p5:
  328|  9.40k|    const MACROBLOCKD *xd) {
  329|  9.40k|  return xd->tile_ctx
  330|  9.40k|      ->single_ref_cdf[av1_get_pred_context_single_ref_p5(xd)][4];
  331|  9.40k|}
decodemv.c:av1_get_pred_cdf_single_ref_p4:
  323|  31.4k|    const MACROBLOCKD *xd) {
  324|  31.4k|  return xd->tile_ctx
  325|  31.4k|      ->single_ref_cdf[av1_get_pred_context_single_ref_p4(xd)][3];
  326|  31.4k|}
decodemv.c:get_comp_group_idx_context:
  141|  7.69k|static inline int get_comp_group_idx_context(const MACROBLOCKD *xd) {
  142|  7.69k|  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
  143|  7.69k|  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
  144|  7.69k|  int above_ctx = 0, left_ctx = 0;
  145|       |
  146|  7.69k|  if (above_mi) {
  ------------------
  |  Branch (146:7): [True: 5.89k, False: 1.79k]
  ------------------
  147|  5.89k|    if (has_second_ref(above_mi))
  ------------------
  |  Branch (147:9): [True: 4.05k, False: 1.84k]
  ------------------
  148|  4.05k|      above_ctx = above_mi->comp_group_idx;
  149|  1.84k|    else if (above_mi->ref_frame[0] == ALTREF_FRAME)
  ------------------
  |  Branch (149:14): [True: 435, False: 1.40k]
  ------------------
  150|    435|      above_ctx = 3;
  151|  5.89k|  }
  152|  7.69k|  if (left_mi) {
  ------------------
  |  Branch (152:7): [True: 6.70k, False: 984]
  ------------------
  153|  6.70k|    if (has_second_ref(left_mi))
  ------------------
  |  Branch (153:9): [True: 4.30k, False: 2.40k]
  ------------------
  154|  4.30k|      left_ctx = left_mi->comp_group_idx;
  155|  2.40k|    else if (left_mi->ref_frame[0] == ALTREF_FRAME)
  ------------------
  |  Branch (155:14): [True: 708, False: 1.69k]
  ------------------
  156|    708|      left_ctx = 3;
  157|  6.70k|  }
  158|       |
  159|  7.69k|  return AOMMIN(5, above_ctx + left_ctx);
  ------------------
  |  |   34|  7.69k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 111, False: 7.57k]
  |  |  ------------------
  ------------------
  160|  7.69k|}
decodemv.c:get_comp_index_context:
  103|  6.20k|                                         const MACROBLOCKD *xd) {
  104|  6.20k|  MB_MODE_INFO *mbmi = xd->mi[0];
  105|  6.20k|  const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
  106|  6.20k|  const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
  107|  6.20k|  int bck_frame_index = 0, fwd_frame_index = 0;
  108|  6.20k|  int cur_frame_index = cm->cur_frame->order_hint;
  109|       |
  110|  6.20k|  if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint;
  ------------------
  |  Branch (110:7): [True: 6.20k, False: 18.4E]
  ------------------
  111|  6.20k|  if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint;
  ------------------
  |  Branch (111:7): [True: 6.20k, False: 18.4E]
  ------------------
  112|       |
  113|  6.20k|  int fwd = abs(get_relative_dist(&cm->seq_params->order_hint_info,
  114|  6.20k|                                  fwd_frame_index, cur_frame_index));
  115|  6.20k|  int bck = abs(get_relative_dist(&cm->seq_params->order_hint_info,
  116|  6.20k|                                  cur_frame_index, bck_frame_index));
  117|       |
  118|  6.20k|  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
  119|  6.20k|  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
  120|       |
  121|  6.20k|  int above_ctx = 0, left_ctx = 0;
  122|  6.20k|  const int offset = (fwd == bck);
  123|       |
  124|  6.20k|  if (above_mi != NULL) {
  ------------------
  |  Branch (124:7): [True: 4.86k, False: 1.33k]
  ------------------
  125|  4.86k|    if (has_second_ref(above_mi))
  ------------------
  |  Branch (125:9): [True: 3.02k, False: 1.83k]
  ------------------
  126|  3.02k|      above_ctx = above_mi->compound_idx;
  127|  1.83k|    else if (above_mi->ref_frame[0] == ALTREF_FRAME)
  ------------------
  |  Branch (127:14): [True: 487, False: 1.35k]
  ------------------
  128|    487|      above_ctx = 1;
  129|  4.86k|  }
  130|       |
  131|  6.20k|  if (left_mi != NULL) {
  ------------------
  |  Branch (131:7): [True: 5.20k, False: 1.00k]
  ------------------
  132|  5.20k|    if (has_second_ref(left_mi))
  ------------------
  |  Branch (132:9): [True: 3.28k, False: 1.92k]
  ------------------
  133|  3.28k|      left_ctx = left_mi->compound_idx;
  134|  1.92k|    else if (left_mi->ref_frame[0] == ALTREF_FRAME)
  ------------------
  |  Branch (134:14): [True: 417, False: 1.50k]
  ------------------
  135|    417|      left_ctx = 1;
  136|  5.20k|  }
  137|       |
  138|  6.20k|  return above_ctx + left_ctx + 3 * offset;
  139|  6.20k|}

av1_dc_quant_QTX:
  198|  13.6M|int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
  199|  13.6M|  const int q_clamped = clamp(qindex + delta, 0, MAXQ);
  ------------------
  |  |   26|  13.6M|#define MAXQ 255
  ------------------
  200|  13.6M|  switch (bit_depth) {
  201|  6.82M|    case AOM_BITS_8: return dc_qlookup_QTX[q_clamped];
  ------------------
  |  Branch (201:5): [True: 6.82M, False: 6.85M]
  ------------------
  202|  6.84M|    case AOM_BITS_10: return dc_qlookup_10_QTX[q_clamped];
  ------------------
  |  Branch (202:5): [True: 6.84M, False: 6.83M]
  ------------------
  203|  7.26k|    case AOM_BITS_12: return dc_qlookup_12_QTX[q_clamped];
  ------------------
  |  Branch (203:5): [True: 7.26k, False: 13.6M]
  ------------------
  204|      0|    default:
  ------------------
  |  Branch (204:5): [True: 0, False: 13.6M]
  ------------------
  205|       |      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
  206|      0|      return -1;
  207|  13.6M|  }
  208|  13.6M|}
av1_ac_quant_QTX:
  210|  13.6M|int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
  211|  13.6M|  const int q_clamped = clamp(qindex + delta, 0, MAXQ);
  ------------------
  |  |   26|  13.6M|#define MAXQ 255
  ------------------
  212|  13.6M|  switch (bit_depth) {
  213|  6.82M|    case AOM_BITS_8: return ac_qlookup_QTX[q_clamped];
  ------------------
  |  Branch (213:5): [True: 6.82M, False: 6.85M]
  ------------------
  214|  6.84M|    case AOM_BITS_10: return ac_qlookup_10_QTX[q_clamped];
  ------------------
  |  Branch (214:5): [True: 6.84M, False: 6.83M]
  ------------------
  215|  7.26k|    case AOM_BITS_12: return ac_qlookup_12_QTX[q_clamped];
  ------------------
  |  Branch (215:5): [True: 7.26k, False: 13.6M]
  ------------------
  216|      0|    default:
  ------------------
  |  Branch (216:5): [True: 0, False: 13.6M]
  ------------------
  217|       |      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
  218|      0|      return -1;
  219|  13.6M|  }
  220|  13.6M|}
av1_get_qindex:
  223|  6.76M|                   int base_qindex) {
  224|  6.76M|  if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
  ------------------
  |  Branch (224:7): [True: 1.37M, False: 5.38M]
  ------------------
  225|  1.37M|    const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
  226|  1.37M|    const int seg_qindex = base_qindex + data;
  227|  1.37M|    return clamp(seg_qindex, 0, MAXQ);
  ------------------
  |  |   26|  1.37M|#define MAXQ 255
  ------------------
  228|  5.38M|  } else {
  229|  5.38M|    return base_qindex;
  230|  5.38M|  }
  231|  6.76M|}
av1_use_qmatrix:
  234|  47.2k|                     const struct macroblockd *xd, int segment_id) {
  235|       |  // True if explicit Q matrix levels and this is not a lossless segment.
  236|  47.2k|  return quant_params->using_qmatrix && !xd->lossless[segment_id];
  ------------------
  |  Branch (236:10): [True: 10.4k, False: 36.7k]
  |  Branch (236:41): [True: 9.91k, False: 572]
  ------------------
  237|  47.2k|}
av1_get_iqmatrix:
  245|  2.77M|                                 TX_SIZE tx_size, TX_TYPE tx_type) {
  246|  2.77M|  const struct macroblockd_plane *const pd = &xd->plane[plane];
  247|  2.77M|  const MB_MODE_INFO *const mbmi = xd->mi[0];
  248|  2.77M|  const int seg_id = mbmi->segment_id;
  249|  2.77M|  const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
  250|       |  // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
  251|  2.77M|  return is_2d_transform(tx_type)
  ------------------
  |  Branch (251:10): [True: 2.53M, False: 237k]
  ------------------
  252|  2.77M|             ? pd->seg_iqmatrix[seg_id][qm_tx_size]
  253|  2.77M|             : quant_params->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
  ------------------
  |  |   31|   237k|#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
  |  |  ------------------
  |  |  |  |   30|   237k|#define QM_LEVEL_BITS 4
  |  |  ------------------
  ------------------
  254|  2.77M|}
av1_qm_init:
  277|  17.9k|void av1_qm_init(CommonQuantParams *quant_params, int num_planes) {
  278|  17.9k|#if CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER
  279|   305k|  for (int q = 0; q < NUM_QM_LEVELS; ++q) {
  ------------------
  |  |   31|   305k|#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
  |  |  ------------------
  |  |  |  |   30|   305k|#define QM_LEVEL_BITS 4
  |  |  ------------------
  ------------------
  |  Branch (279:19): [True: 287k, False: 17.9k]
  ------------------
  280|  1.14M|    for (int c = 0; c < num_planes; ++c) {
  ------------------
  |  Branch (280:21): [True: 861k, False: 287k]
  ------------------
  281|   861k|      int current = 0;
  282|  17.2M|      for (int t = 0; t < TX_SIZES_ALL; ++t) {
  ------------------
  |  Branch (282:23): [True: 16.3M, False: 861k]
  ------------------
  283|  16.3M|        const int size = tx_size_2d[t];
  284|  16.3M|        const int qm_tx_size = av1_get_adjusted_tx_size(t);
  285|  16.3M|        if (q == NUM_QM_LEVELS - 1) {
  ------------------
  |  |   31|  16.3M|#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
  |  |  ------------------
  |  |  |  |   30|  16.3M|#define QM_LEVEL_BITS 4
  |  |  ------------------
  ------------------
  |  Branch (285:13): [True: 1.02M, False: 15.3M]
  ------------------
  286|  1.02M|          quant_params->gqmatrix[q][c][t] = NULL;
  287|  1.02M|          quant_params->giqmatrix[q][c][t] = NULL;
  288|  15.3M|        } else if (t != qm_tx_size) {  // Reuse matrices for 'qm_tx_size'
  ------------------
  |  Branch (288:20): [True: 4.03M, False: 11.3M]
  ------------------
  289|  4.03M|          assert(t > qm_tx_size);
  290|  4.03M|          quant_params->gqmatrix[q][c][t] =
  291|  4.03M|              quant_params->gqmatrix[q][c][qm_tx_size];
  292|  4.03M|          quant_params->giqmatrix[q][c][t] =
  293|  4.03M|              quant_params->giqmatrix[q][c][qm_tx_size];
  294|  11.3M|        } else {
  295|  11.3M|          assert(current + size <= QM_TOTAL_SIZE);
  296|  11.3M|          quant_params->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current];
  297|  11.3M|          quant_params->giqmatrix[q][c][t] =
  298|  11.3M|              &iwt_matrix_ref[q][c >= 1][current];
  299|  11.3M|          current += size;
  300|  11.3M|        }
  301|  16.3M|      }
  302|   861k|    }
  303|   287k|  }
  304|       |#else
  305|       |  (void)quant_params;
  306|       |  (void)num_planes;
  307|       |#endif  // CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER
  308|  17.9k|}
quant_common.c:is_2d_transform:
  241|  2.77M|static inline bool is_2d_transform(TX_TYPE tx_type) { return (tx_type < IDTX); }

av1_init_warp_params:
   60|   139k|                          const MACROBLOCKD *xd, const MB_MODE_INFO *mi) {
   61|   139k|  if (inter_pred_params->block_height < 8 || inter_pred_params->block_width < 8)
  ------------------
  |  Branch (61:7): [True: 43.8k, False: 95.5k]
  |  Branch (61:46): [True: 15.7k, False: 79.7k]
  ------------------
   62|  59.5k|    return;
   63|       |
   64|  79.7k|  if (xd->cur_frame_force_integer_mv) return;
  ------------------
  |  Branch (64:7): [True: 5.69k, False: 74.0k]
  ------------------
   65|       |
   66|  74.0k|  if (allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], 0,
  ------------------
  |  Branch (66:7): [True: 4.17k, False: 69.9k]
  ------------------
   67|  74.0k|                 inter_pred_params->scale_factors,
   68|  74.0k|                 &inter_pred_params->warp_params)) {
   69|       |#if CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
   70|       |    aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_FEATURE,
   71|       |                       "Warped motion is disabled in realtime only build.");
   72|       |#endif  // CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
   73|  4.17k|    inter_pred_params->mode = WARP_PRED;
   74|  4.17k|  }
   75|  74.0k|}
av1_make_inter_predictor:
   80|   194k|                              const SubpelParams *subpel_params) {
   81|   194k|  assert(IMPLIES(inter_pred_params->conv_params.is_compound,
   82|   194k|                 inter_pred_params->conv_params.dst != NULL));
   83|       |
   84|   194k|  if (inter_pred_params->mode == TRANSLATION_PRED) {
  ------------------
  |  Branch (84:7): [True: 189k, False: 4.17k]
  ------------------
   85|   189k|#if CONFIG_AV1_HIGHBITDEPTH
   86|   189k|    if (inter_pred_params->use_hbd_buf) {
  ------------------
  |  Branch (86:9): [True: 36.2k, False: 153k]
  ------------------
   87|  36.2k|      highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
   88|  36.2k|                             inter_pred_params->block_width,
   89|  36.2k|                             inter_pred_params->block_height,
   90|  36.2k|                             &inter_pred_params->conv_params,
   91|  36.2k|                             inter_pred_params->interp_filter_params,
   92|  36.2k|                             inter_pred_params->bit_depth);
   93|   153k|    } else {
   94|   153k|      inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
   95|   153k|                      inter_pred_params->block_width,
   96|   153k|                      inter_pred_params->block_height,
   97|   153k|                      &inter_pred_params->conv_params,
   98|   153k|                      inter_pred_params->interp_filter_params);
   99|   153k|    }
  100|       |#else
  101|       |    inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
  102|       |                    inter_pred_params->block_width,
  103|       |                    inter_pred_params->block_height,
  104|       |                    &inter_pred_params->conv_params,
  105|       |                    inter_pred_params->interp_filter_params);
  106|       |#endif
  107|   189k|  }
  108|  4.17k|#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
  109|       |  // TODO(jingning): av1_warp_plane() can be further cleaned up.
  110|  4.17k|  else if (inter_pred_params->mode == WARP_PRED) {
  ------------------
  |  Branch (110:12): [True: 4.17k, False: 0]
  ------------------
  111|  4.17k|    av1_warp_plane(
  112|  4.17k|        &inter_pred_params->warp_params, inter_pred_params->use_hbd_buf,
  113|  4.17k|        inter_pred_params->bit_depth, inter_pred_params->ref_frame_buf.buf0,
  114|  4.17k|        inter_pred_params->ref_frame_buf.width,
  115|  4.17k|        inter_pred_params->ref_frame_buf.height,
  116|  4.17k|        inter_pred_params->ref_frame_buf.stride, dst,
  117|  4.17k|        inter_pred_params->pix_col, inter_pred_params->pix_row,
  118|  4.17k|        inter_pred_params->block_width, inter_pred_params->block_height,
  119|  4.17k|        dst_stride, inter_pred_params->subsampling_x,
  120|  4.17k|        inter_pred_params->subsampling_y, &inter_pred_params->conv_params);
  121|  4.17k|  }
  122|      0|#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
  123|      0|  else {
  124|       |    assert(0 && "Unsupported inter_pred_params->mode");
  125|      0|  }
  126|   194k|}
av1_get_compound_type_mask:
  291|  3.19k|    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type) {
  292|  3.19k|  (void)sb_type;
  293|  3.19k|  switch (comp_data->type) {
  294|  1.38k|    case COMPOUND_WEDGE:
  ------------------
  |  Branch (294:5): [True: 1.38k, False: 1.80k]
  ------------------
  295|  1.38k|      return av1_get_contiguous_soft_mask(comp_data->wedge_index,
  296|  1.38k|                                          comp_data->wedge_sign, sb_type);
  297|  1.80k|    default: return comp_data->seg_mask;
  ------------------
  |  Branch (297:5): [True: 1.80k, False: 1.38k]
  ------------------
  298|  3.19k|  }
  299|  3.19k|}
av1_init_wedge_masks:
  600|  17.9k|void av1_init_wedge_masks(void) { aom_once(init_all_wedge_masks); }
av1_make_masked_inter_predictor:
  632|  3.19k|                                     const SubpelParams *subpel_params) {
  633|  3.19k|  const INTERINTER_COMPOUND_DATA *comp_data = &inter_pred_params->mask_comp;
  634|  3.19k|  BLOCK_SIZE sb_type = inter_pred_params->sb_type;
  635|       |
  636|       |  // We're going to call av1_make_inter_predictor to generate a prediction into
  637|       |  // a temporary buffer, then will blend that temporary buffer with that from
  638|       |  // the other reference.
  639|  3.19k|  DECLARE_ALIGNED(32, uint8_t, tmp_buf[2 * MAX_SB_SQUARE]);
  ------------------
  |  |   19|  3.19k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  640|  3.19k|  uint8_t *tmp_dst =
  641|  3.19k|      inter_pred_params->use_hbd_buf ? CONVERT_TO_BYTEPTR(tmp_buf) : tmp_buf;
  ------------------
  |  |   76|    498|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  |  Branch (641:7): [True: 498, False: 2.69k]
  ------------------
  642|       |
  643|  3.19k|  const int tmp_buf_stride = MAX_SB_SIZE;
  ------------------
  |  |   32|  3.19k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  3.19k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  644|  3.19k|  CONV_BUF_TYPE *org_dst = inter_pred_params->conv_params.dst;
  645|  3.19k|  int org_dst_stride = inter_pred_params->conv_params.dst_stride;
  646|  3.19k|  CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf;
  647|  3.19k|  inter_pred_params->conv_params.dst = tmp_buf16;
  648|  3.19k|  inter_pred_params->conv_params.dst_stride = tmp_buf_stride;
  649|  3.19k|  assert(inter_pred_params->conv_params.do_average == 0);
  650|       |
  651|       |  // This will generate a prediction in tmp_buf for the second reference
  652|  3.19k|  av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE,
  ------------------
  |  |   32|  3.19k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  3.19k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  653|  3.19k|                           inter_pred_params, subpel_params);
  654|       |
  655|  3.19k|  if (!inter_pred_params->conv_params.plane &&
  ------------------
  |  Branch (655:7): [True: 1.07k, False: 2.12k]
  ------------------
  656|  1.07k|      comp_data->type == COMPOUND_DIFFWTD) {
  ------------------
  |  Branch (656:7): [True: 604, False: 469]
  ------------------
  657|    604|    av1_build_compound_diffwtd_mask_d16(
  658|    604|        comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride,
  659|    604|        tmp_buf16, tmp_buf_stride, inter_pred_params->block_height,
  660|    604|        inter_pred_params->block_width, &inter_pred_params->conv_params,
  661|    604|        inter_pred_params->bit_depth);
  662|    604|  }
  663|  3.19k|  build_masked_compound_no_round(
  664|  3.19k|      dst, dst_stride, org_dst, org_dst_stride, tmp_buf16, tmp_buf_stride,
  665|  3.19k|      comp_data, sb_type, inter_pred_params->block_height,
  666|  3.19k|      inter_pred_params->block_width, inter_pred_params);
  667|  3.19k|}
av1_dist_wtd_comp_weight_assign:
  673|   171k|                                     int is_compound) {
  674|   171k|  assert(fwd_offset != NULL && bck_offset != NULL);
  675|   171k|  if (!is_compound || mbmi->compound_idx) {
  ------------------
  |  Branch (675:7): [True: 135k, False: 35.9k]
  |  Branch (675:23): [True: 30.3k, False: 5.52k]
  ------------------
  676|   166k|    *fwd_offset = 8;
  677|   166k|    *bck_offset = 8;
  678|   166k|    *use_dist_wtd_comp_avg = 0;
  679|   166k|    return;
  680|   166k|  }
  681|       |
  682|  5.52k|  *use_dist_wtd_comp_avg = 1;
  683|  5.52k|  const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
  684|  5.52k|  const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
  685|  5.52k|  const int cur_frame_index = cm->cur_frame->order_hint;
  686|  5.52k|  int bck_frame_index = 0, fwd_frame_index = 0;
  687|       |
  688|  5.52k|  if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint;
  ------------------
  |  Branch (688:7): [True: 5.52k, False: 0]
  ------------------
  689|  5.52k|  if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint;
  ------------------
  |  Branch (689:7): [True: 5.52k, False: 0]
  ------------------
  690|       |
  691|  5.52k|  int d0 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info,
  692|  5.52k|                                       fwd_frame_index, cur_frame_index)),
  693|  5.52k|                 0, MAX_FRAME_DISTANCE);
  ------------------
  |  |   68|  5.52k|#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1)
  |  |  ------------------
  |  |  |  |   67|  5.52k|#define FRAME_OFFSET_BITS 5
  |  |  ------------------
  ------------------
  694|  5.52k|  int d1 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info,
  695|  5.52k|                                       cur_frame_index, bck_frame_index)),
  696|  5.52k|                 0, MAX_FRAME_DISTANCE);
  ------------------
  |  |   68|  5.52k|#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1)
  |  |  ------------------
  |  |  |  |   67|  5.52k|#define FRAME_OFFSET_BITS 5
  |  |  ------------------
  ------------------
  697|       |
  698|  5.52k|  const int order = d0 <= d1;
  699|       |
  700|  5.52k|  if (d0 == 0 || d1 == 0) {
  ------------------
  |  Branch (700:7): [True: 0, False: 5.52k]
  |  Branch (700:18): [True: 0, False: 5.52k]
  ------------------
  701|      0|    *fwd_offset = quant_dist_lookup_table[3][order];
  702|      0|    *bck_offset = quant_dist_lookup_table[3][1 - order];
  703|      0|    return;
  704|      0|  }
  705|       |
  706|  5.52k|  int i;
  707|  6.11k|  for (i = 0; i < 3; ++i) {
  ------------------
  |  Branch (707:15): [True: 5.96k, False: 156]
  ------------------
  708|  5.96k|    int c0 = quant_dist_weight[i][order];
  709|  5.96k|    int c1 = quant_dist_weight[i][!order];
  710|  5.96k|    int d0_c0 = d0 * c0;
  711|  5.96k|    int d1_c1 = d1 * c1;
  712|  5.96k|    if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break;
  ------------------
  |  Branch (712:10): [True: 480, False: 5.48k]
  |  Branch (712:21): [True: 72, False: 408]
  |  Branch (712:40): [True: 5.48k, False: 408]
  |  Branch (712:52): [True: 5.30k, False: 180]
  ------------------
  713|  5.96k|  }
  714|       |
  715|  5.52k|  *fwd_offset = quant_dist_lookup_table[i][order];
  716|  5.52k|  *bck_offset = quant_dist_lookup_table[i][1 - order];
  717|  5.52k|}
av1_setup_dst_planes:
  721|  2.54M|                          const int plane_start, const int plane_end) {
  722|       |  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
  723|       |  // the static analysis warnings.
  724|  8.08M|  for (int i = plane_start; i < AOMMIN(plane_end, MAX_MB_PLANE); ++i) {
  ------------------
  |  |   34|  8.08M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.07M, False: 6.01M]
  |  |  ------------------
  ------------------
  |  Branch (724:29): [True: 5.54M, False: 2.54M]
  ------------------
  725|  5.54M|    struct macroblockd_plane *const pd = &planes[i];
  726|  5.54M|    const int is_uv = i > 0;
  727|  5.54M|    setup_pred_plane(&pd->dst, bsize, src->buffers[i], src->crop_widths[is_uv],
  728|  5.54M|                     src->crop_heights[is_uv], src->strides[is_uv], mi_row,
  729|       |                     mi_col, NULL, pd->subsampling_x, pd->subsampling_y);
  730|  5.54M|  }
  731|  2.54M|}
av1_setup_pre_planes:
  736|  65.2k|                          const int num_planes) {
  737|  65.2k|  if (src != NULL) {
  ------------------
  |  Branch (737:7): [True: 65.2k, False: 0]
  ------------------
  738|       |    // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
  739|       |    // the static analysis warnings.
  740|   256k|    for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
  ------------------
  |  |   34|   256k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 4.85k, False: 251k]
  |  |  ------------------
  ------------------
  |  Branch (740:21): [True: 190k, False: 65.2k]
  ------------------
  741|   190k|      struct macroblockd_plane *const pd = &xd->plane[i];
  742|   190k|      const int is_uv = i > 0;
  743|   190k|      setup_pred_plane(&pd->pre[idx], xd->mi[0]->bsize, src->buffers[i],
  744|   190k|                       src->crop_widths[is_uv], src->crop_heights[is_uv],
  745|   190k|                       src->strides[is_uv], mi_row, mi_col, sf,
  746|   190k|                       pd->subsampling_x, pd->subsampling_y);
  747|   190k|    }
  748|  65.2k|  }
  749|  65.2k|}
av1_get_obmc_mask:
  774|  32.3k|const uint8_t *av1_get_obmc_mask(int length) {
  775|  32.3k|  switch (length) {
  776|      0|    case 1: return obmc_mask_1;
  ------------------
  |  Branch (776:5): [True: 0, False: 32.3k]
  ------------------
  777|  7.51k|    case 2: return obmc_mask_2;
  ------------------
  |  Branch (777:5): [True: 7.51k, False: 24.8k]
  ------------------
  778|  16.0k|    case 4: return obmc_mask_4;
  ------------------
  |  Branch (778:5): [True: 16.0k, False: 16.2k]
  ------------------
  779|  7.26k|    case 8: return obmc_mask_8;
  ------------------
  |  Branch (779:5): [True: 7.26k, False: 25.0k]
  ------------------
  780|  1.40k|    case 16: return obmc_mask_16;
  ------------------
  |  Branch (780:5): [True: 1.40k, False: 30.9k]
  ------------------
  781|    103|    case 32: return obmc_mask_32;
  ------------------
  |  Branch (781:5): [True: 103, False: 32.2k]
  ------------------
  782|      0|    case 64: return obmc_mask_64;
  ------------------
  |  Branch (782:5): [True: 0, False: 32.3k]
  ------------------
  783|      0|    default: assert(0); return NULL;
  ------------------
  |  Branch (783:5): [True: 0, False: 32.3k]
  ------------------
  784|  32.3k|  }
  785|  32.3k|}
av1_count_overlappable_neighbors:
  801|  76.7k|void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd) {
  802|  76.7k|  MB_MODE_INFO *mbmi = xd->mi[0];
  803|       |
  804|  76.7k|  mbmi->overlappable_neighbors = 0;
  805|       |
  806|  76.7k|  if (!is_motion_variation_allowed_bsize(mbmi->bsize)) return;
  ------------------
  |  Branch (806:7): [True: 24.5k, False: 52.1k]
  ------------------
  807|       |
  808|  52.1k|  foreach_overlappable_nb_above(cm, xd, INT_MAX, increment_int_ptr,
  809|  52.1k|                                &mbmi->overlappable_neighbors);
  810|  52.1k|  if (mbmi->overlappable_neighbors) return;
  ------------------
  |  Branch (810:7): [True: 36.6k, False: 15.4k]
  ------------------
  811|  15.4k|  foreach_overlappable_nb_left(cm, xd, INT_MAX, increment_int_ptr,
  812|  15.4k|                               &mbmi->overlappable_neighbors);
  813|  15.4k|}
av1_skip_u4x4_pred_in_obmc:
  821|  83.1k|                               const struct macroblockd_plane *pd, int dir) {
  822|  83.1k|  assert(is_motion_variation_allowed_bsize(bsize));
  823|       |
  824|  83.1k|  const BLOCK_SIZE bsize_plane =
  825|  83.1k|      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
  826|  83.1k|  switch (bsize_plane) {
  827|       |#if DISABLE_CHROMA_U8X8_OBMC
  828|       |    case BLOCK_4X4:
  829|       |    case BLOCK_8X4:
  830|       |    case BLOCK_4X8: return 1;
  831|       |#else
  832|  16.9k|    case BLOCK_4X4:
  ------------------
  |  Branch (832:5): [True: 16.9k, False: 66.2k]
  ------------------
  833|  28.5k|    case BLOCK_8X4:
  ------------------
  |  Branch (833:5): [True: 11.5k, False: 71.5k]
  ------------------
  834|  37.3k|    case BLOCK_4X8: return dir == 0;
  ------------------
  |  Branch (834:5): [True: 8.78k, False: 74.3k]
  ------------------
  835|      0|#endif
  836|  45.8k|    default: return 0;
  ------------------
  |  Branch (836:5): [True: 45.8k, False: 37.3k]
  ------------------
  837|  83.1k|  }
  838|  83.1k|}
av1_build_obmc_inter_prediction:
  939|  7.49k|                                     int left_stride[MAX_MB_PLANE]) {
  940|  7.49k|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
  941|       |
  942|       |  // handle above row
  943|  7.49k|  struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride };
  944|  7.49k|  foreach_overlappable_nb_above(cm, xd,
  945|  7.49k|                                max_neighbor_obmc[mi_size_wide_log2[bsize]],
  946|  7.49k|                                build_obmc_inter_pred_above, &ctxt_above);
  947|       |
  948|       |  // handle left column
  949|  7.49k|  struct obmc_inter_pred_ctxt ctxt_left = { left, left_stride };
  950|  7.49k|  foreach_overlappable_nb_left(cm, xd,
  951|  7.49k|                               max_neighbor_obmc[mi_size_high_log2[bsize]],
  952|  7.49k|                               build_obmc_inter_pred_left, &ctxt_left);
  953|  7.49k|}
av1_setup_obmc_dst_bufs:
  956|  7.49k|                             uint8_t **dst_buf2) {
  957|  7.49k|  if (is_cur_buf_hbd(xd)) {
  ------------------
  |  Branch (957:7): [True: 1.57k, False: 5.92k]
  ------------------
  958|  1.57k|    int len = sizeof(uint16_t);
  959|  1.57k|    dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
  ------------------
  |  |   76|  1.57k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  960|  1.57k|    dst_buf1[1] =
  961|  1.57k|        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
  ------------------
  |  |   76|  1.57k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  962|  1.57k|    dst_buf1[2] =
  963|  1.57k|        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
  ------------------
  |  |   76|  1.57k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  964|  1.57k|    dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
  ------------------
  |  |   76|  1.57k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  965|  1.57k|    dst_buf2[1] =
  966|  1.57k|        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
  ------------------
  |  |   76|  1.57k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  967|  1.57k|    dst_buf2[2] =
  968|  1.57k|        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
  ------------------
  |  |   76|  1.57k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  969|  5.92k|  } else {
  970|  5.92k|    dst_buf1[0] = xd->tmp_obmc_bufs[0];
  971|  5.92k|    dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
  ------------------
  |  |   33|  5.92k|#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
  |  |  ------------------
  |  |  |  |   32|  5.92k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  5.92k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |               #define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
  |  |  ------------------
  |  |  |  |   32|  5.92k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  5.92k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  972|  5.92k|    dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
  ------------------
  |  |   33|  5.92k|#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
  |  |  ------------------
  |  |  |  |   32|  5.92k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  5.92k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |               #define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
  |  |  ------------------
  |  |  |  |   32|  5.92k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  5.92k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  973|  5.92k|    dst_buf2[0] = xd->tmp_obmc_bufs[1];
  974|  5.92k|    dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
  ------------------
  |  |   33|  5.92k|#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
  |  |  ------------------
  |  |  |  |   32|  5.92k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  5.92k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |               #define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
  |  |  ------------------
  |  |  |  |   32|  5.92k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  5.92k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  975|  5.92k|    dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
  ------------------
  |  |   33|  5.92k|#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
  |  |  ------------------
  |  |  |  |   32|  5.92k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  5.92k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |               #define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
  |  |  ------------------
  |  |  |  |   32|  5.92k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  5.92k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  976|  5.92k|  }
  977|  7.49k|}
av1_setup_build_prediction_by_above_pred:
  983|  6.80k|    const int num_planes) {
  984|  6.80k|  const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->bsize);
  ------------------
  |  |   35|  6.80k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 520, False: 6.28k]
  |  |  ------------------
  ------------------
  985|  6.80k|  const int above_mi_col = xd->mi_col + rel_mi_col;
  986|       |
  987|  6.80k|  modify_neighbor_predictor_for_obmc(above_mbmi);
  988|       |
  989|  27.0k|  for (int j = 0; j < num_planes; ++j) {
  ------------------
  |  Branch (989:19): [True: 20.2k, False: 6.80k]
  ------------------
  990|  20.2k|    struct macroblockd_plane *const pd = &xd->plane[j];
  991|  20.2k|    setup_pred_plane(&pd->dst, a_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j],
  992|  20.2k|                     ctxt->tmp_height[j], ctxt->tmp_stride[j], 0, rel_mi_col,
  993|  20.2k|                     NULL, pd->subsampling_x, pd->subsampling_y);
  994|  20.2k|  }
  995|       |
  996|  6.80k|  const int num_refs = 1 + has_second_ref(above_mbmi);
  997|       |
  998|  13.6k|  for (int ref = 0; ref < num_refs; ++ref) {
  ------------------
  |  Branch (998:21): [True: 6.80k, False: 6.80k]
  ------------------
  999|  6.80k|    const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
 1000|       |
 1001|  6.80k|    const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
 1002|  6.80k|    const struct scale_factors *const sf =
 1003|  6.80k|        get_ref_scale_factors_const(ctxt->cm, frame);
 1004|  6.80k|    xd->block_ref_scale_factors[ref] = sf;
 1005|  6.80k|    if ((!av1_is_valid_scale(sf)))
  ------------------
  |  Branch (1005:9): [True: 0, False: 6.80k]
  ------------------
 1006|      0|      aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
 1007|      0|                         "Reference frame has invalid dimensions");
 1008|  6.80k|    av1_setup_pre_planes(xd, ref, &ref_buf->buf, xd->mi_row, above_mi_col, sf,
 1009|  6.80k|                         num_planes);
 1010|  6.80k|  }
 1011|       |
 1012|  6.80k|  xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
  ------------------
  |  |   40|  6.80k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  6.80k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1013|  6.80k|  xd->mb_to_right_edge =
 1014|  6.80k|      ctxt->mb_to_far_edge +
 1015|  6.80k|      (xd->width - rel_mi_col - above_mi_width) * MI_SIZE * 8;
  ------------------
  |  |   40|  6.80k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  6.80k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1016|  6.80k|}
av1_setup_build_prediction_by_left_pred:
 1022|  7.21k|                                             const int num_planes) {
 1023|  7.21k|  const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->bsize);
  ------------------
  |  |   35|  7.21k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 529, False: 6.68k]
  |  |  ------------------
  ------------------
 1024|  7.21k|  const int left_mi_row = xd->mi_row + rel_mi_row;
 1025|       |
 1026|  7.21k|  modify_neighbor_predictor_for_obmc(left_mbmi);
 1027|       |
 1028|  28.5k|  for (int j = 0; j < num_planes; ++j) {
  ------------------
  |  Branch (1028:19): [True: 21.3k, False: 7.21k]
  ------------------
 1029|  21.3k|    struct macroblockd_plane *const pd = &xd->plane[j];
 1030|  21.3k|    setup_pred_plane(&pd->dst, l_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j],
 1031|  21.3k|                     ctxt->tmp_height[j], ctxt->tmp_stride[j], rel_mi_row, 0,
 1032|  21.3k|                     NULL, pd->subsampling_x, pd->subsampling_y);
 1033|  21.3k|  }
 1034|       |
 1035|  7.21k|  const int num_refs = 1 + has_second_ref(left_mbmi);
 1036|       |
 1037|  14.4k|  for (int ref = 0; ref < num_refs; ++ref) {
  ------------------
  |  Branch (1037:21): [True: 7.21k, False: 7.21k]
  ------------------
 1038|  7.21k|    const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
 1039|       |
 1040|  7.21k|    const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
 1041|  7.21k|    const struct scale_factors *const ref_scale_factors =
 1042|  7.21k|        get_ref_scale_factors_const(ctxt->cm, frame);
 1043|       |
 1044|  7.21k|    xd->block_ref_scale_factors[ref] = ref_scale_factors;
 1045|  7.21k|    if ((!av1_is_valid_scale(ref_scale_factors)))
  ------------------
  |  Branch (1045:9): [True: 0, False: 7.21k]
  ------------------
 1046|      0|      aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
 1047|      0|                         "Reference frame has invalid dimensions");
 1048|  7.21k|    av1_setup_pre_planes(xd, ref, &ref_buf->buf, left_mi_row, xd->mi_col,
 1049|  7.21k|                         ref_scale_factors, num_planes);
 1050|  7.21k|  }
 1051|       |
 1052|  7.21k|  xd->mb_to_top_edge = GET_MV_SUBPEL(MI_SIZE * (-left_mi_row));
  ------------------
  |  |   29|  7.21k|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
 1053|  7.21k|  xd->mb_to_bottom_edge =
 1054|  7.21k|      ctxt->mb_to_far_edge +
 1055|  7.21k|      GET_MV_SUBPEL((xd->height - rel_mi_row - left_mi_height) * MI_SIZE);
  ------------------
  |  |   29|  7.21k|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
 1056|  7.21k|}
av1_build_intra_predictors_for_interintra:
 1119|  4.89k|                                               uint8_t *dst, int dst_stride) {
 1120|  4.89k|  struct macroblockd_plane *const pd = &xd->plane[plane];
 1121|  4.89k|  const int ssx = xd->plane[plane].subsampling_x;
 1122|  4.89k|  const int ssy = xd->plane[plane].subsampling_y;
 1123|  4.89k|  BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
 1124|  4.89k|  PREDICTION_MODE mode = interintra_to_intra_mode[xd->mi[0]->interintra_mode];
 1125|  4.89k|  assert(xd->mi[0]->angle_delta[PLANE_TYPE_Y] == 0);
 1126|  4.89k|  assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0);
 1127|  4.89k|  assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0);
 1128|  4.89k|  assert(xd->mi[0]->use_intrabc == 0);
 1129|  4.89k|  const SequenceHeader *seq_params = cm->seq_params;
 1130|       |
 1131|  4.89k|  av1_predict_intra_block(xd, seq_params->sb_size,
 1132|  4.89k|                          seq_params->enable_intra_edge_filter, pd->width,
 1133|  4.89k|                          pd->height, max_txsize_rect_lookup[plane_bsize], mode,
 1134|  4.89k|                          0, 0, FILTER_INTRA_MODES, ctx->plane[plane],
 1135|  4.89k|                          ctx->stride[plane], dst, dst_stride, 0, 0, plane);
 1136|  4.89k|}
av1_combine_interintra:
 1140|  4.89k|                            const uint8_t *intra_pred, int intra_stride) {
 1141|  4.89k|  const int ssx = xd->plane[plane].subsampling_x;
 1142|  4.89k|  const int ssy = xd->plane[plane].subsampling_y;
 1143|  4.89k|  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
 1144|  4.89k|#if CONFIG_AV1_HIGHBITDEPTH
 1145|  4.89k|  if (is_cur_buf_hbd(xd)) {
  ------------------
  |  Branch (1145:7): [True: 472, False: 4.42k]
  ------------------
 1146|    472|    combine_interintra_highbd(
 1147|    472|        xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
 1148|    472|        xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize,
  ------------------
  |  |   40|    472|#define INTERINTRA_WEDGE_SIGN 0
  ------------------
 1149|    472|        plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
 1150|    472|        inter_pred, inter_stride, intra_pred, intra_stride, xd->bd);
 1151|    472|    return;
 1152|    472|  }
 1153|  4.42k|#endif
 1154|  4.42k|  combine_interintra(
 1155|  4.42k|      xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
 1156|  4.42k|      xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize,
  ------------------
  |  |   40|  4.42k|#define INTERINTRA_WEDGE_SIGN 0
  ------------------
 1157|  4.42k|      plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
 1158|  4.42k|      inter_pred, inter_stride, intra_pred, intra_stride);
 1159|  4.42k|}
av1_build_interintra_predictor:
 1165|  4.89k|                                    BLOCK_SIZE bsize) {
 1166|  4.89k|  assert(bsize < BLOCK_SIZES_ALL);
 1167|  4.89k|  if (is_cur_buf_hbd(xd)) {
  ------------------
  |  Branch (1167:7): [True: 472, False: 4.42k]
  ------------------
 1168|    472|    DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
  ------------------
  |  |   19|    472|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1169|    472|    av1_build_intra_predictors_for_interintra(
 1170|    472|        cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor),
  ------------------
  |  |   76|    472|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
 1171|    472|        MAX_SB_SIZE);
  ------------------
  |  |   32|    472|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|    472|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
 1172|    472|    av1_combine_interintra(xd, bsize, plane, pred, stride,
 1173|    472|                           CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
  ------------------
  |  |   76|    472|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
                                         CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
  ------------------
  |  |   32|    472|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|    472|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
 1174|  4.42k|  } else {
 1175|  4.42k|    DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
  ------------------
  |  |   19|  4.42k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1176|  4.42k|    av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx,
 1177|  4.42k|                                              intrapredictor, MAX_SB_SIZE);
  ------------------
  |  |   32|  4.42k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  4.42k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
 1178|  4.42k|    av1_combine_interintra(xd, bsize, plane, pred, stride, intrapredictor,
 1179|  4.42k|                           MAX_SB_SIZE);
  ------------------
  |  |   32|  4.42k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  4.42k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
 1180|  4.42k|  }
 1181|  4.89k|}
reconinter.c:allow_warp:
   37|  74.0k|                      WarpedMotionParams *final_warp_params) {
   38|       |  // Note: As per the spec, we must test the fixed point scales here, which are
   39|       |  // at a higher precision (1 << 14) than the xs and ys in subpel_params (that
   40|       |  // have 1 << 10 precision).
   41|  74.0k|  if (av1_is_scaled(sf)) return 0;
  ------------------
  |  Branch (41:7): [True: 0, False: 74.0k]
  ------------------
   42|       |
   43|  74.0k|  if (final_warp_params != NULL) *final_warp_params = default_warp_params;
  ------------------
  |  Branch (43:7): [True: 74.0k, False: 0]
  ------------------
   44|       |
   45|  74.0k|  if (build_for_obmc) return 0;
  ------------------
  |  Branch (45:7): [True: 0, False: 74.0k]
  ------------------
   46|       |
   47|  74.0k|  if (warp_types->local_warp_allowed && !mbmi->wm_params.invalid) {
  ------------------
  |  Branch (47:7): [True: 3.79k, False: 70.2k]
  |  Branch (47:41): [True: 3.63k, False: 166]
  ------------------
   48|  3.63k|    if (final_warp_params != NULL) *final_warp_params = mbmi->wm_params;
  ------------------
  |  Branch (48:9): [True: 3.63k, False: 0]
  ------------------
   49|  3.63k|    return 1;
   50|  70.4k|  } else if (warp_types->global_warp_allowed && !gm_params->invalid) {
  ------------------
  |  Branch (50:14): [True: 546, False: 69.9k]
  |  Branch (50:49): [True: 546, False: 0]
  ------------------
   51|    546|    if (final_warp_params != NULL) *final_warp_params = *gm_params;
  ------------------
  |  Branch (51:9): [True: 546, False: 0]
  ------------------
   52|    546|    return 1;
   53|    546|  }
   54|       |
   55|  69.9k|  return 0;
   56|  74.0k|}
reconinter.c:init_all_wedge_masks:
  594|      1|static void init_all_wedge_masks(void) {
  595|      1|  init_wedge_master_masks();
  596|      1|  init_wedge_masks();
  597|      1|  init_smooth_interintra_masks();
  598|      1|}
reconinter.c:init_wedge_master_masks:
  449|      1|static inline void init_wedge_master_masks(void) {
  450|      1|  int i, j;
  451|      1|  const int w = MASK_MASTER_SIZE;
  ------------------
  |  |  451|      1|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  ------------------
  |  |  |  |   41|      1|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   40|      1|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  452|      1|  const int h = MASK_MASTER_SIZE;
  ------------------
  |  |  451|      1|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  ------------------
  |  |  |  |   41|      1|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   40|      1|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  453|      1|  const int stride = MASK_MASTER_STRIDE;
  ------------------
  |  |  452|      1|#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
  |  |  ------------------
  |  |  |  |  451|      1|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  |  |  ------------------
  |  |  |  |  |  |   41|      1|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   40|      1|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  454|       |  // Note: index [0] stores the masters, and [1] its complement.
  455|       |  // Generate prototype by shifting the masters
  456|      1|  int shift = h / 4;
  457|     33|  for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (457:15): [True: 32, False: 1]
  ------------------
  458|     32|    shift_copy(wedge_master_oblique_even,
  459|     32|               &wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride], shift,
  460|     32|               MASK_MASTER_SIZE);
  ------------------
  |  |  451|     32|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  ------------------
  |  |  |  |   41|     32|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   40|     32|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  461|     32|    shift--;
  462|     32|    shift_copy(wedge_master_oblique_odd,
  463|     32|               &wedge_mask_obl[0][WEDGE_OBLIQUE63][(i + 1) * stride], shift,
  464|     32|               MASK_MASTER_SIZE);
  ------------------
  |  |  451|     32|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  ------------------
  |  |  |  |   41|     32|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   40|     32|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  465|     32|    memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][i * stride],
  466|     32|           wedge_master_vertical,
  467|     32|           MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0]));
  ------------------
  |  |  451|     32|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  ------------------
  |  |  |  |   41|     32|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   40|     32|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  468|     32|    memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][(i + 1) * stride],
  469|     32|           wedge_master_vertical,
  470|     32|           MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0]));
  ------------------
  |  |  451|     32|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  ------------------
  |  |  |  |   41|     32|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   40|     32|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  471|     32|  }
  472|       |
  473|     65|  for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (473:15): [True: 64, False: 1]
  ------------------
  474|  4.16k|    for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (474:17): [True: 4.09k, False: 64]
  ------------------
  475|  4.09k|      const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j];
  476|  4.09k|      wedge_mask_obl[0][WEDGE_OBLIQUE27][j * stride + i] = msk;
  477|  4.09k|      wedge_mask_obl[0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
  478|  4.09k|          wedge_mask_obl[0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
  479|  4.09k|              (1 << WEDGE_WEIGHT_BITS) - msk;
  ------------------
  |  |   44|  4.09k|#define WEDGE_WEIGHT_BITS 6
  ------------------
  480|  4.09k|      wedge_mask_obl[1][WEDGE_OBLIQUE63][i * stride + j] =
  481|  4.09k|          wedge_mask_obl[1][WEDGE_OBLIQUE27][j * stride + i] =
  482|  4.09k|              (1 << WEDGE_WEIGHT_BITS) - msk;
  ------------------
  |  |   44|  4.09k|#define WEDGE_WEIGHT_BITS 6
  ------------------
  483|  4.09k|      wedge_mask_obl[1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
  484|  4.09k|          wedge_mask_obl[1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = msk;
  485|  4.09k|      const int mskx = wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j];
  486|  4.09k|      wedge_mask_obl[0][WEDGE_HORIZONTAL][j * stride + i] = mskx;
  487|  4.09k|      wedge_mask_obl[1][WEDGE_VERTICAL][i * stride + j] =
  488|  4.09k|          wedge_mask_obl[1][WEDGE_HORIZONTAL][j * stride + i] =
  489|  4.09k|              (1 << WEDGE_WEIGHT_BITS) - mskx;
  ------------------
  |  |   44|  4.09k|#define WEDGE_WEIGHT_BITS 6
  ------------------
  490|  4.09k|    }
  491|     64|  }
  492|      1|}
reconinter.c:shift_copy:
  148|     64|                              int width) {
  149|     64|  if (shift >= 0) {
  ------------------
  |  Branch (149:7): [True: 33, False: 31]
  ------------------
  150|     33|    memcpy(dst + shift, src, width - shift);
  151|     33|    memset(dst, src[0], shift);
  152|     33|  } else {
  153|     31|    shift = -shift;
  154|     31|    memcpy(dst, src + shift, width - shift);
  155|     31|    memset(dst + width - shift, src[width - 1], shift);
  156|     31|  }
  157|     64|}
reconinter.c:init_wedge_masks:
  494|      1|static inline void init_wedge_masks(void) {
  495|      1|  uint8_t *dst = wedge_mask_buf;
  496|      1|  BLOCK_SIZE bsize;
  497|      1|  memset(wedge_masks, 0, sizeof(wedge_masks));
  498|     23|  for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
  ------------------
  |  Branch (498:27): [True: 22, False: 1]
  ------------------
  499|     22|    const wedge_params_type *wedge_params = &av1_wedge_params_lookup[bsize];
  500|     22|    const int wtypes = wedge_params->wedge_types;
  501|     22|    if (wtypes == 0) continue;
  ------------------
  |  Branch (501:9): [True: 13, False: 9]
  ------------------
  502|      9|    const uint8_t *mask;
  503|      9|    const int bw = block_size_wide[bsize];
  504|      9|    const int bh = block_size_high[bsize];
  505|      9|    int w;
  506|    153|    for (w = 0; w < wtypes; ++w) {
  ------------------
  |  Branch (506:17): [True: 144, False: 9]
  ------------------
  507|    144|      mask = get_wedge_mask_inplace(w, 0, bsize);
  508|    144|      aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw /* dst_stride */, bw,
  ------------------
  |  |  452|    144|#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
  |  |  ------------------
  |  |  |  |  451|    144|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  |  |  ------------------
  |  |  |  |  |  |   41|    144|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   40|    144|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  509|    144|                        bh);
  510|    144|      wedge_params->masks[0][w] = dst;
  511|    144|      dst += bw * bh;
  512|       |
  513|    144|      mask = get_wedge_mask_inplace(w, 1, bsize);
  514|    144|      aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw /* dst_stride */, bw,
  ------------------
  |  |  452|    144|#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
  |  |  ------------------
  |  |  |  |  451|    144|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  |  |  ------------------
  |  |  |  |  |  |   41|    144|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   40|    144|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  515|    144|                        bh);
  516|    144|      wedge_params->masks[1][w] = dst;
  517|    144|      dst += bw * bh;
  518|    144|    }
  519|       |    assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf));
  520|      9|  }
  521|      1|}
reconinter.c:get_wedge_mask_inplace:
  271|    288|                                             BLOCK_SIZE sb_type) {
  272|    288|  const uint8_t *master;
  273|    288|  const int bh = block_size_high[sb_type];
  274|    288|  const int bw = block_size_wide[sb_type];
  275|    288|  const wedge_code_type *a =
  276|    288|      av1_wedge_params_lookup[sb_type].codebook + wedge_index;
  277|    288|  int woff, hoff;
  278|    288|  const uint8_t wsignflip =
  279|    288|      av1_wedge_params_lookup[sb_type].signflip[wedge_index];
  280|       |
  281|    288|  assert(wedge_index >= 0 && wedge_index < get_wedge_types_lookup(sb_type));
  282|    288|  woff = (a->x_offset * bw) >> 3;
  283|    288|  hoff = (a->y_offset * bh) >> 3;
  284|    288|  master = wedge_mask_obl[neg ^ wsignflip][a->direction] +
  285|    288|           MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) +
  ------------------
  |  |  452|    288|#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
  |  |  ------------------
  |  |  |  |  451|    288|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  |  |  ------------------
  |  |  |  |  |  |   41|    288|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   40|    288|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
                         MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) +
  ------------------
  |  |  451|    288|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  ------------------
  |  |  |  |   41|    288|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   40|    288|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  286|    288|           MASK_MASTER_SIZE / 2 - woff;
  ------------------
  |  |  451|    288|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  ------------------
  |  |  |  |   41|    288|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   40|    288|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  287|    288|  return master;
  288|    288|}
reconinter.c:init_smooth_interintra_masks:
  581|      1|static inline void init_smooth_interintra_masks(void) {
  582|      5|  for (int m = 0; m < INTERINTRA_MODES; ++m) {
  ------------------
  |  Branch (582:19): [True: 4, False: 1]
  ------------------
  583|     92|    for (int bs = 0; bs < BLOCK_SIZES_ALL; ++bs) {
  ------------------
  |  Branch (583:22): [True: 88, False: 4]
  ------------------
  584|     88|      const int bw = block_size_wide[bs];
  585|     88|      const int bh = block_size_high[bs];
  586|     88|      if (bw > MAX_WEDGE_SIZE || bh > MAX_WEDGE_SIZE) continue;
  ------------------
  |  |   41|    176|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   40|     88|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  ------------------
  ------------------
                    if (bw > MAX_WEDGE_SIZE || bh > MAX_WEDGE_SIZE) continue;
  ------------------
  |  |   41|     64|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   40|     64|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  ------------------
  ------------------
  |  Branch (586:11): [True: 24, False: 64]
  |  Branch (586:34): [True: 8, False: 56]
  ------------------
  587|     56|      build_smooth_interintra_mask(smooth_interintra_mask_buf[m][bs], bw, bs,
  588|     56|                                   m);
  589|     56|    }
  590|      4|  }
  591|      1|}
reconinter.c:build_smooth_interintra_mask:
  542|    440|                                                INTERINTRA_MODE mode) {
  543|    440|  int i, j;
  544|    440|  const int bw = block_size_wide[plane_bsize];
  545|    440|  const int bh = block_size_high[plane_bsize];
  546|    440|  const int size_scale = ii_size_scales[plane_bsize];
  547|       |
  548|    440|  switch (mode) {
  549|    112|    case II_V_PRED:
  ------------------
  |  Branch (549:5): [True: 112, False: 328]
  ------------------
  550|  1.32k|      for (i = 0; i < bh; ++i) {
  ------------------
  |  Branch (550:19): [True: 1.21k, False: 112]
  ------------------
  551|  1.21k|        memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0]));
  552|  1.21k|        mask += stride;
  553|  1.21k|      }
  554|    112|      break;
  555|       |
  556|    196|    case II_H_PRED:
  ------------------
  |  Branch (556:5): [True: 196, False: 244]
  ------------------
  557|  3.58k|      for (i = 0; i < bh; ++i) {
  ------------------
  |  Branch (557:19): [True: 3.38k, False: 196]
  ------------------
  558|  60.5k|        for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale];
  ------------------
  |  Branch (558:21): [True: 57.1k, False: 3.38k]
  ------------------
  559|  3.38k|        mask += stride;
  560|  3.38k|      }
  561|    196|      break;
  562|       |
  563|     76|    case II_SMOOTH_PRED:
  ------------------
  |  Branch (563:5): [True: 76, False: 364]
  ------------------
  564|  1.35k|      for (i = 0; i < bh; ++i) {
  ------------------
  |  Branch (564:19): [True: 1.27k, False: 76]
  ------------------
  565|  21.2k|        for (j = 0; j < bw; ++j)
  ------------------
  |  Branch (565:21): [True: 19.9k, False: 1.27k]
  ------------------
  566|  19.9k|          mask[j] = ii_weights1d[(i < j ? i : j) * size_scale];
  ------------------
  |  Branch (566:35): [True: 8.09k, False: 11.8k]
  ------------------
  567|  1.27k|        mask += stride;
  568|  1.27k|      }
  569|     76|      break;
  570|       |
  571|     56|    case II_DC_PRED:
  ------------------
  |  Branch (571:5): [True: 56, False: 384]
  ------------------
  572|     56|    default:
  ------------------
  |  Branch (572:5): [True: 0, False: 440]
  ------------------
  573|    884|      for (i = 0; i < bh; ++i) {
  ------------------
  |  Branch (573:19): [True: 828, False: 56]
  ------------------
  574|    828|        memset(mask, 32, bw * sizeof(mask[0]));
  575|    828|        mask += stride;
  576|    828|      }
  577|     56|      break;
  578|    440|  }
  579|    440|}
reconinter.c:build_masked_compound_no_round:
  606|  3.19k|    int w, InterPredParams *inter_pred_params) {
  607|  3.19k|  const int ssy = inter_pred_params->subsampling_y;
  608|  3.19k|  const int ssx = inter_pred_params->subsampling_x;
  609|  3.19k|  const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
  610|  3.19k|  const int mask_stride = block_size_wide[sb_type];
  611|  3.19k|#if CONFIG_AV1_HIGHBITDEPTH
  612|  3.19k|  if (inter_pred_params->use_hbd_buf) {
  ------------------
  |  Branch (612:7): [True: 498, False: 2.69k]
  ------------------
  613|    498|    aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
  614|    498|                                  src1_stride, mask, mask_stride, w, h, ssx,
  615|    498|                                  ssy, &inter_pred_params->conv_params,
  616|    498|                                  inter_pred_params->bit_depth);
  617|  2.69k|  } else {
  618|  2.69k|    aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
  619|  2.69k|                                 src1_stride, mask, mask_stride, w, h, ssx, ssy,
  620|  2.69k|                                 &inter_pred_params->conv_params);
  621|  2.69k|  }
  622|       |#else
  623|       |  aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
  624|       |                               src1_stride, mask, mask_stride, w, h, ssx, ssy,
  625|       |                               &inter_pred_params->conv_params);
  626|       |#endif
  627|  3.19k|}
reconinter.c:increment_int_ptr:
  790|  52.0k|                                     const int num_planes) {
  791|  52.0k|  (void)xd;
  792|  52.0k|  (void)rel_mi_row;
  793|  52.0k|  (void)rel_mi_col;
  794|  52.0k|  (void)op_mi_size;
  795|  52.0k|  (void)dir;
  796|  52.0k|  (void)mi;
  797|  52.0k|  ++*(uint8_t *)fun_ctxt;
  798|  52.0k|  (void)num_planes;
  799|  52.0k|}
reconinter.c:build_obmc_inter_pred_above:
  854|  6.80k|    int dir, MB_MODE_INFO *above_mi, void *fun_ctxt, const int num_planes) {
  855|  6.80k|  (void)above_mi;
  856|  6.80k|  (void)rel_mi_row;
  857|  6.80k|  (void)dir;
  858|  6.80k|  struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
  859|  6.80k|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
  860|  6.80k|  const int overlap =
  861|  6.80k|      AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
  ------------------
  |  |   34|  6.80k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 6.77k, False: 32]
  |  |  ------------------
  ------------------
  862|       |
  863|  27.0k|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (863:23): [True: 20.2k, False: 6.80k]
  ------------------
  864|  20.2k|    const struct macroblockd_plane *pd = &xd->plane[plane];
  865|  20.2k|    const int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x;
  ------------------
  |  |   40|  20.2k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  20.2k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  866|  20.2k|    const int bh = overlap >> pd->subsampling_y;
  867|  20.2k|    const int plane_col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x;
  ------------------
  |  |   40|  20.2k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  20.2k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  868|       |
  869|  20.2k|    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
  ------------------
  |  Branch (869:9): [True: 9.26k, False: 10.9k]
  ------------------
  870|       |
  871|  10.9k|    const int dst_stride = pd->dst.stride;
  872|  10.9k|    uint8_t *const dst = &pd->dst.buf[plane_col];
  873|  10.9k|    const int tmp_stride = ctxt->adjacent_stride[plane];
  874|  10.9k|    const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col];
  875|  10.9k|    const uint8_t *const mask = av1_get_obmc_mask(bh);
  876|  10.9k|#if CONFIG_AV1_HIGHBITDEPTH
  877|  10.9k|    const int is_hbd = is_cur_buf_hbd(xd);
  878|  10.9k|    if (is_hbd)
  ------------------
  |  Branch (878:9): [True: 2.38k, False: 8.56k]
  ------------------
  879|  2.38k|      aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
  880|  2.38k|                                 tmp_stride, mask, bw, bh, xd->bd);
  881|  8.56k|    else
  882|  8.56k|      aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
  883|  8.56k|                          mask, bw, bh);
  884|       |#else
  885|       |    aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask,
  886|       |                        bw, bh);
  887|       |#endif
  888|  10.9k|  }
  889|  6.80k|}
reconinter.c:build_obmc_inter_pred_left:
  893|  7.21k|    int dir, MB_MODE_INFO *left_mi, void *fun_ctxt, const int num_planes) {
  894|  7.21k|  (void)left_mi;
  895|  7.21k|  (void)rel_mi_col;
  896|  7.21k|  (void)dir;
  897|  7.21k|  struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
  898|  7.21k|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
  899|  7.21k|  const int overlap =
  900|  7.21k|      AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
  ------------------
  |  |   34|  7.21k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 7.14k, False: 71]
  |  |  ------------------
  ------------------
  901|       |
  902|  28.5k|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (902:23): [True: 21.3k, False: 7.21k]
  ------------------
  903|  21.3k|    const struct macroblockd_plane *pd = &xd->plane[plane];
  904|  21.3k|    const int bw = overlap >> pd->subsampling_x;
  905|  21.3k|    const int bh = (op_mi_size * MI_SIZE) >> pd->subsampling_y;
  ------------------
  |  |   40|  21.3k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  21.3k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  906|  21.3k|    const int plane_row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y;
  ------------------
  |  |   40|  21.3k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  21.3k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  907|       |
  908|  21.3k|    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
  ------------------
  |  Branch (908:9): [True: 0, False: 21.3k]
  ------------------
  909|       |
  910|  21.3k|    const int dst_stride = pd->dst.stride;
  911|  21.3k|    uint8_t *const dst = &pd->dst.buf[plane_row * dst_stride];
  912|  21.3k|    const int tmp_stride = ctxt->adjacent_stride[plane];
  913|  21.3k|    const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride];
  914|  21.3k|    const uint8_t *const mask = av1_get_obmc_mask(bw);
  915|       |
  916|  21.3k|#if CONFIG_AV1_HIGHBITDEPTH
  917|  21.3k|    const int is_hbd = is_cur_buf_hbd(xd);
  918|  21.3k|    if (is_hbd)
  ------------------
  |  Branch (918:9): [True: 4.48k, False: 16.8k]
  ------------------
  919|  4.48k|      aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
  920|  4.48k|                                 tmp_stride, mask, bw, bh, xd->bd);
  921|  16.8k|    else
  922|  16.8k|      aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
  923|  16.8k|                          mask, bw, bh);
  924|       |#else
  925|       |    aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask,
  926|       |                        bw, bh);
  927|       |#endif
  928|  21.3k|  }
  929|  7.21k|}
reconinter.c:modify_neighbor_predictor_for_obmc:
  841|  14.0k|static void modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
  842|  14.0k|  mbmi->ref_frame[1] = NONE_FRAME;
  843|  14.0k|  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
  844|  14.0k|}
reconinter.c:combine_interintra_highbd:
 1090|    472|    int interstride, const uint8_t *intrapred8, int intrastride, int bd) {
 1091|    472|  const int bw = block_size_wide[plane_bsize];
 1092|    472|  const int bh = block_size_high[plane_bsize];
 1093|       |
 1094|    472|  if (use_wedge_interintra) {
  ------------------
  |  Branch (1094:7): [True: 88, False: 384]
  ------------------
 1095|     88|    if (av1_is_wedge_used(bsize)) {
  ------------------
  |  Branch (1095:9): [True: 88, False: 0]
  ------------------
 1096|     88|      const uint8_t *mask =
 1097|     88|          av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
 1098|     88|      const int subh = 2 * mi_size_high[bsize] == bh;
 1099|     88|      const int subw = 2 * mi_size_wide[bsize] == bw;
 1100|     88|      aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride,
 1101|     88|                                interpred8, interstride, mask,
 1102|     88|                                block_size_wide[bsize], bw, bh, subw, subh, bd);
 1103|     88|    }
 1104|     88|    return;
 1105|     88|  }
 1106|       |
 1107|    384|  uint8_t mask[MAX_SB_SQUARE];
 1108|    384|  build_smooth_interintra_mask(mask, bw, plane_bsize, mode);
 1109|    384|  aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride,
 1110|    384|                            interpred8, interstride, mask, bw, bw, bh, 0, 0,
 1111|    384|                            bd);
 1112|    384|}
reconinter.c:combine_interintra:
 1063|  4.42k|    int interstride, const uint8_t *intrapred, int intrastride) {
 1064|  4.42k|  const int bw = block_size_wide[plane_bsize];
 1065|  4.42k|  const int bh = block_size_high[plane_bsize];
 1066|       |
 1067|  4.42k|  if (use_wedge_interintra) {
  ------------------
  |  Branch (1067:7): [True: 1.25k, False: 3.17k]
  ------------------
 1068|  1.25k|    if (av1_is_wedge_used(bsize)) {
  ------------------
  |  Branch (1068:9): [True: 1.25k, False: 0]
  ------------------
 1069|  1.25k|      const uint8_t *mask =
 1070|  1.25k|          av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
 1071|  1.25k|      const int subw = 2 * mi_size_wide[bsize] == bw;
 1072|  1.25k|      const int subh = 2 * mi_size_high[bsize] == bh;
 1073|  1.25k|      aom_blend_a64_mask(comppred, compstride, intrapred, intrastride,
 1074|  1.25k|                         interpred, interstride, mask, block_size_wide[bsize],
 1075|  1.25k|                         bw, bh, subw, subh);
 1076|  1.25k|    }
 1077|  1.25k|    return;
 1078|  1.25k|  }
 1079|       |
 1080|  3.17k|  const uint8_t *mask = smooth_interintra_mask_buf[mode][plane_bsize];
 1081|  3.17k|  aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred,
 1082|  3.17k|                     interstride, mask, bw, bw, bh, 0, 0);
 1083|  3.17k|}

decodeframe.c:av1_init_inter_params:
  221|   193k|    int_interpfilters interp_filters) {
  222|   193k|  init_inter_block_params(inter_pred_params, block_width, block_height, pix_row,
  223|   193k|                          pix_col, subsampling_x, subsampling_y, bit_depth,
  224|   193k|                          use_hbd_buf, is_intrabc);
  225|   193k|  init_interp_filter_params(inter_pred_params->interp_filter_params,
  226|   193k|                            &interp_filters.as_filters, block_width,
  227|   193k|                            block_height, is_intrabc);
  228|   193k|  inter_pred_params->scale_factors = sf;
  229|   193k|  inter_pred_params->ref_frame_buf = *ref_buf;
  230|   193k|}
decodeframe.c:init_inter_block_params:
  199|   193k|                                           int is_intrabc) {
  200|   193k|  inter_pred_params->block_width = block_width;
  201|   193k|  inter_pred_params->block_height = block_height;
  202|   193k|  inter_pred_params->pix_row = pix_row;
  203|   193k|  inter_pred_params->pix_col = pix_col;
  204|   193k|  inter_pred_params->subsampling_x = subsampling_x;
  205|   193k|  inter_pred_params->subsampling_y = subsampling_y;
  206|   193k|  inter_pred_params->bit_depth = bit_depth;
  207|   193k|  inter_pred_params->use_hbd_buf = use_hbd_buf;
  208|   193k|  inter_pred_params->is_intrabc = is_intrabc;
  209|   193k|  inter_pred_params->mode = TRANSLATION_PRED;
  210|   193k|  inter_pred_params->comp_mode = UNIFORM_SINGLE;
  211|   193k|  inter_pred_params->top = -AOM_LEFT_TOP_MARGIN_SCALED(subsampling_y);
  ------------------
  |  |   32|   193k|  (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   30|   193k|  ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
  |  |  |  |  ------------------
  |  |  |  |  |  |   32|   193k|#define AOM_BORDER_IN_PIXELS 288
  |  |  |  |  ------------------
  |  |  |  |                 ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|   193k|#define AOM_INTERP_EXTEND 4
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|   193k|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  ------------------
  212|   193k|  inter_pred_params->left = -AOM_LEFT_TOP_MARGIN_SCALED(subsampling_x);
  ------------------
  |  |   32|   193k|  (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   30|   193k|  ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
  |  |  |  |  ------------------
  |  |  |  |  |  |   32|   193k|#define AOM_BORDER_IN_PIXELS 288
  |  |  |  |  ------------------
  |  |  |  |                 ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|   193k|#define AOM_INTERP_EXTEND 4
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|   193k|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  ------------------
  213|   193k|}
decodeframe.c:init_interp_filter_params:
  172|   193k|    int is_intrabc) {
  173|   193k|  if (UNLIKELY(is_intrabc)) {
  ------------------
  |  |   55|   193k|#define UNLIKELY(v) __builtin_expect(v, 0)
  |  |  ------------------
  |  |  |  Branch (55:21): [True: 13.1k, False: 180k]
  |  |  ------------------
  ------------------
  174|  13.1k|    interp_filter_params[0] = &av1_intrabc_filter_params;
  175|  13.1k|    interp_filter_params[1] = &av1_intrabc_filter_params;
  176|   180k|  } else {
  177|   180k|    interp_filter_params[0] = av1_get_interp_filter_params_with_block_size(
  178|   180k|        (InterpFilter)filter->x_filter, block_width);
  179|   180k|    interp_filter_params[1] = av1_get_interp_filter_params_with_block_size(
  180|   180k|        (InterpFilter)filter->y_filter, block_height);
  181|   180k|  }
  182|   193k|}
decodeframe.c:clamp_mv_to_umv_border_sb:
  345|   193k|                                           int ss_x, int ss_y) {
  346|       |  // If the MV points so far into the UMV border that no visible pixels
  347|       |  // are used for reconstruction, the subpel part of the MV can be
  348|       |  // discarded and the MV limited to 16 pixels with equivalent results.
  349|   193k|  const int spel_left = (AOM_INTERP_EXTEND + bw) << SUBPEL_BITS;
  ------------------
  |  |   31|   193k|#define AOM_INTERP_EXTEND 4
  ------------------
                const int spel_left = (AOM_INTERP_EXTEND + bw) << SUBPEL_BITS;
  ------------------
  |  |   23|   193k|#define SUBPEL_BITS 4
  ------------------
  350|   193k|  const int spel_right = spel_left - SUBPEL_SHIFTS;
  ------------------
  |  |   25|   193k|#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|   193k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  351|   193k|  const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS;
  ------------------
  |  |   31|   193k|#define AOM_INTERP_EXTEND 4
  ------------------
                const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS;
  ------------------
  |  |   23|   193k|#define SUBPEL_BITS 4
  ------------------
  352|   193k|  const int spel_bottom = spel_top - SUBPEL_SHIFTS;
  ------------------
  |  |   25|   193k|#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|   193k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  353|   193k|  MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))),
  354|   193k|                    (int16_t)(src_mv->col * (1 << (1 - ss_x))) };
  355|   193k|  assert(ss_x <= 1);
  356|   193k|  assert(ss_y <= 1);
  357|   193k|  const SubpelMvLimits mv_limits = {
  358|   193k|    xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left,
  359|   193k|    xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right,
  360|   193k|    xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top,
  361|   193k|    xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom
  362|   193k|  };
  363|       |
  364|   193k|  clamp_mv(&clamped_mv, &mv_limits);
  365|       |
  366|   193k|  return clamped_mv;
  367|   193k|}
decodeframe.c:av1_init_comp_mode:
  232|  35.9k|static inline void av1_init_comp_mode(InterPredParams *inter_pred_params) {
  233|  35.9k|  inter_pred_params->comp_mode = UNIFORM_COMP;
  234|  35.9k|}
decodemv.c:av1_is_wedge_used:
  329|  2.50k|static inline int av1_is_wedge_used(BLOCK_SIZE sb_type) {
  330|  2.50k|  return av1_wedge_params_lookup[sb_type].wedge_types > 0;
  331|  2.50k|}
decodemv.c:is_any_masked_compound_used:
  312|  11.8k|static inline int is_any_masked_compound_used(BLOCK_SIZE sb_type) {
  313|  11.8k|  COMPOUND_TYPE comp_type;
  314|  11.8k|  int i;
  315|  11.8k|  if (!is_comp_ref_allowed(sb_type)) return 0;
  ------------------
  |  Branch (315:7): [True: 0, False: 11.8k]
  ------------------
  316|  36.3k|  for (i = 0; i < COMPOUND_TYPES; i++) {
  ------------------
  |  Branch (316:15): [True: 36.3k, False: 12]
  ------------------
  317|  36.3k|    comp_type = (COMPOUND_TYPE)i;
  318|  36.3k|    if (is_masked_compound_type(comp_type) &&
  ------------------
  |  Branch (318:9): [True: 12.6k, False: 23.7k]
  ------------------
  319|  12.6k|        is_interinter_compound_used(comp_type, sb_type))
  ------------------
  |  Branch (319:9): [True: 11.8k, False: 745]
  ------------------
  320|  11.8k|      return 1;
  321|  36.3k|  }
  322|     12|  return 0;
  323|  11.8k|}
decodemv.c:is_interinter_compound_used:
  300|  14.2k|                                              BLOCK_SIZE sb_type) {
  301|  14.2k|  const int comp_allowed = is_comp_ref_allowed(sb_type);
  302|  14.2k|  switch (type) {
  303|      0|    case COMPOUND_AVERAGE:
  ------------------
  |  Branch (303:5): [True: 0, False: 14.2k]
  ------------------
  304|      0|    case COMPOUND_DISTWTD:
  ------------------
  |  Branch (304:5): [True: 0, False: 14.2k]
  ------------------
  305|    740|    case COMPOUND_DIFFWTD: return comp_allowed;
  ------------------
  |  Branch (305:5): [True: 740, False: 13.5k]
  ------------------
  306|  13.5k|    case COMPOUND_WEDGE:
  ------------------
  |  Branch (306:5): [True: 13.5k, False: 741]
  ------------------
  307|  13.5k|      return comp_allowed && av1_wedge_params_lookup[sb_type].wedge_types > 0;
  ------------------
  |  Branch (307:14): [True: 13.5k, False: 0]
  |  Branch (307:30): [True: 12.6k, False: 870]
  ------------------
  308|      0|    default: assert(0); return 0;
  ------------------
  |  Branch (308:5): [True: 0, False: 14.2k]
  ------------------
  309|  14.2k|  }
  310|  14.2k|}
decodemv.c:av1_is_interp_needed:
  420|  76.6k|static inline int av1_is_interp_needed(const MACROBLOCKD *const xd) {
  421|  76.6k|  const MB_MODE_INFO *const mbmi = xd->mi[0];
  422|  76.6k|  if (mbmi->skip_mode) return 0;
  ------------------
  |  Branch (422:7): [True: 110, False: 76.5k]
  ------------------
  423|  76.5k|  if (mbmi->motion_mode == WARPED_CAUSAL) return 0;
  ------------------
  |  Branch (423:7): [True: 3.45k, False: 73.1k]
  ------------------
  424|  73.1k|  if (is_nontrans_global_motion(xd, xd->mi[0])) return 0;
  ------------------
  |  Branch (424:7): [True: 2.57k, False: 70.5k]
  ------------------
  425|  70.5k|  return 1;
  426|  73.1k|}
decodemv.c:set_default_interp_filters:
  415|  6.14k|    MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) {
  416|  6.14k|  mbmi->interp_filters =
  417|  6.14k|      av1_broadcast_interp_filter(av1_unswitchable_filter(frame_interp_filter));
  418|  6.14k|}
reconinter.c:highbd_inter_predictor:
  279|  36.2k|    int bd) {
  280|  36.2k|  assert(conv_params->do_average == 0 || conv_params->do_average == 1);
  281|  36.2k|  const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
  282|  36.2k|  if (is_scaled) {
  ------------------
  |  Branch (282:7): [True: 0, False: 36.2k]
  ------------------
  283|      0|    av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
  284|      0|                                  interp_filters, subpel_params->subpel_x,
  285|      0|                                  subpel_params->xs, subpel_params->subpel_y,
  286|      0|                                  subpel_params->ys, 1, conv_params, bd);
  287|  36.2k|  } else {
  288|  36.2k|    SubpelParams sp = *subpel_params;
  289|  36.2k|    revert_scale_extra_bits(&sp);
  290|  36.2k|    av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
  291|  36.2k|                                  interp_filters, sp.subpel_x, sp.xs,
  292|  36.2k|                                  sp.subpel_y, sp.ys, 0, conv_params, bd);
  293|  36.2k|  }
  294|  36.2k|}
reconinter.c:has_scale:
  240|   189k|static inline int has_scale(int xs, int ys) {
  241|   189k|  return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS;
  ------------------
  |  |   29|   379k|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|   189k|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  ------------------
                return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS;
  ------------------
  |  |   29|   189k|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|   189k|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  ------------------
  |  Branch (241:10): [True: 24, False: 189k]
  |  Branch (241:39): [True: 0, False: 189k]
  ------------------
  242|   189k|}
reconinter.c:revert_scale_extra_bits:
  244|   189k|static inline void revert_scale_extra_bits(SubpelParams *sp) {
  245|   189k|  sp->subpel_x >>= SCALE_EXTRA_BITS;
  ------------------
  |  |   31|   189k|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|   189k|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|   189k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  246|   189k|  sp->subpel_y >>= SCALE_EXTRA_BITS;
  ------------------
  |  |   31|   189k|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|   189k|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|   189k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  247|   189k|  sp->xs >>= SCALE_EXTRA_BITS;
  ------------------
  |  |   31|   189k|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|   189k|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|   189k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  248|   189k|  sp->ys >>= SCALE_EXTRA_BITS;
  ------------------
  |  |   31|   189k|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|   189k|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|   189k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  249|   189k|  assert(sp->subpel_x < SUBPEL_SHIFTS);
  250|   189k|  assert(sp->subpel_y < SUBPEL_SHIFTS);
  251|   189k|  assert(sp->xs <= SUBPEL_SHIFTS);
  252|       |  assert(sp->ys <= SUBPEL_SHIFTS);
  253|   189k|}
reconinter.c:inter_predictor:
  258|   153k|    ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2]) {
  259|   153k|  assert(conv_params->do_average == 0 || conv_params->do_average == 1);
  260|   153k|  const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
  261|   153k|  if (is_scaled) {
  ------------------
  |  Branch (261:7): [True: 24, False: 153k]
  ------------------
  262|     24|    av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
  263|     24|                           interp_filters, subpel_params->subpel_x,
  264|     24|                           subpel_params->xs, subpel_params->subpel_y,
  265|     24|                           subpel_params->ys, 1, conv_params);
  266|   153k|  } else {
  267|   153k|    SubpelParams sp = *subpel_params;
  268|   153k|    revert_scale_extra_bits(&sp);
  269|   153k|    av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
  270|   153k|                           interp_filters, sp.subpel_x, sp.xs, sp.subpel_y,
  271|   153k|                           sp.ys, 0, conv_params);
  272|   153k|  }
  273|   153k|}
reconinter.c:av1_get_contiguous_soft_mask:
  458|  2.72k|                                                          BLOCK_SIZE sb_type) {
  459|  2.72k|  return av1_wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index];
  460|  2.72k|}
reconinter.c:setup_pred_plane:
  390|  5.77M|                                    int subsampling_x, int subsampling_y) {
  391|       |  // Offset the buffer pointer
  392|  5.77M|  if (subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
  ------------------
  |  Branch (392:7): [True: 549k, False: 5.22M]
  |  Branch (392:24): [True: 25.8k, False: 523k]
  |  Branch (392:43): [True: 25.8k, False: 0]
  ------------------
  393|  25.8k|    mi_row -= 1;
  394|  5.77M|  if (subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
  ------------------
  |  Branch (394:7): [True: 568k, False: 5.20M]
  |  Branch (394:24): [True: 20.0k, False: 548k]
  |  Branch (394:43): [True: 20.0k, False: 0]
  ------------------
  395|  20.0k|    mi_col -= 1;
  396|       |
  397|  5.77M|  const int x = (MI_SIZE * mi_col) >> subsampling_x;
  ------------------
  |  |   40|  5.77M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  5.77M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  398|  5.77M|  const int y = (MI_SIZE * mi_row) >> subsampling_y;
  ------------------
  |  |   40|  5.77M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  5.77M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  399|  5.77M|  dst->buf = src + scaled_buffer_offset(x, y, stride, scale);
  400|  5.77M|  dst->buf0 = src;
  401|  5.77M|  dst->width = width;
  402|  5.77M|  dst->height = height;
  403|  5.77M|  dst->stride = stride;
  404|  5.77M|}
reconinter.c:scaled_buffer_offset:
  371|  5.77M|                                           const struct scale_factors *sf) {
  372|  5.77M|  int x, y;
  373|  5.77M|  if (!sf) {
  ------------------
  |  Branch (373:7): [True: 5.58M, False: 190k]
  ------------------
  374|  5.58M|    x = x_offset;
  375|  5.58M|    y = y_offset;
  376|  5.58M|  } else if (av1_is_scaled(sf)) {
  ------------------
  |  Branch (376:14): [True: 24, False: 190k]
  ------------------
  377|     24|    x = av1_scaled_x(x_offset, sf) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   31|     24|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|     24|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  378|     24|    y = av1_scaled_y(y_offset, sf) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   31|     24|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|     24|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  379|   190k|  } else {
  380|   190k|    x = av1_unscaled_value(x_offset, sf) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   31|   190k|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|   190k|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|   190k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  381|   190k|    y = av1_unscaled_value(y_offset, sf) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   31|   190k|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|   190k|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|   190k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  382|   190k|  }
  383|  5.77M|  return (int64_t)y * stride + x;
  384|  5.77M|}
reconinter.c:av1_is_wedge_used:
  329|  1.33k|static inline int av1_is_wedge_used(BLOCK_SIZE sb_type) {
  330|  1.33k|  return av1_wedge_params_lookup[sb_type].wedge_types > 0;
  331|  1.33k|}

decodeframe.c:build_inter_predictors:
  246|   163k|                                          uint8_t **mc_buf) {
  247|   163k|  if (is_sub8x8_inter(xd, plane, mi->bsize, is_intrabc_block(mi),
  ------------------
  |  Branch (247:7): [True: 10.0k, False: 153k]
  ------------------
  248|   163k|                      build_for_obmc)) {
  249|  10.0k|    assert(bw < 8 || bh < 8);
  250|  10.0k|    build_inter_predictors_sub8x8(cm, xd, plane, mi, mi_x, mi_y, mc_buf);
  251|   153k|  } else {
  252|   153k|    build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw,
  253|   153k|                                          bh, mi_x, mi_y, mc_buf);
  254|   153k|  }
  255|   163k|}
decodeframe.c:is_sub8x8_inter:
   55|   163k|                            int is_intrabc, int build_for_obmc) {
   56|   163k|  if (is_intrabc || build_for_obmc) {
  ------------------
  |  Branch (56:7): [True: 13.1k, False: 150k]
  |  Branch (56:21): [True: 32.3k, False: 118k]
  ------------------
   57|  45.4k|    return false;
   58|  45.4k|  }
   59|       |
   60|   118k|  const struct macroblockd_plane *const pd = &xd->plane[plane];
   61|   118k|  const int ss_x = pd->subsampling_x;
   62|   118k|  const int ss_y = pd->subsampling_y;
   63|   118k|  const int is_sub4_x = (block_size_wide[bsize] == 4) && ss_x;
  ------------------
  |  Branch (63:25): [True: 13.7k, False: 104k]
  |  Branch (63:58): [True: 4.94k, False: 8.84k]
  ------------------
   64|   118k|  const int is_sub4_y = (block_size_high[bsize] == 4) && ss_y;
  ------------------
  |  Branch (64:25): [True: 19.0k, False: 99.2k]
  |  Branch (64:58): [True: 6.80k, False: 12.2k]
  ------------------
   65|   118k|  if (!is_sub4_x && !is_sub4_y) {
  ------------------
  |  Branch (65:7): [True: 113k, False: 4.94k]
  |  Branch (65:21): [True: 107k, False: 5.52k]
  ------------------
   66|   107k|    return false;
   67|   107k|  }
   68|       |
   69|       |  // For sub8x8 chroma blocks, we may be covering more than one luma block's
   70|       |  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
   71|       |  // the top-left corner of the prediction source - the correct top-left corner
   72|       |  // is at (pre_x, pre_y).
   73|  10.4k|  const int row_start = is_sub4_y ? -1 : 0;
  ------------------
  |  Branch (73:25): [True: 6.80k, False: 3.67k]
  ------------------
   74|  10.4k|  const int col_start = is_sub4_x ? -1 : 0;
  ------------------
  |  Branch (74:25): [True: 4.94k, False: 5.53k]
  ------------------
   75|       |
   76|  26.9k|  for (int row = row_start; row <= 0; ++row) {
  ------------------
  |  Branch (76:29): [True: 16.9k, False: 10.0k]
  ------------------
   77|  39.3k|    for (int col = col_start; col <= 0; ++col) {
  ------------------
  |  Branch (77:31): [True: 22.8k, False: 16.5k]
  ------------------
   78|  22.8k|      const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
   79|  22.8k|      if (!is_inter_block(this_mbmi)) return false;
  ------------------
  |  Branch (79:11): [True: 452, False: 22.4k]
  ------------------
   80|  22.4k|      if (is_intrabc_block(this_mbmi)) return false;
  ------------------
  |  Branch (80:11): [True: 0, False: 22.4k]
  ------------------
   81|  22.4k|    }
   82|  16.9k|  }
   83|  10.0k|  return true;
   84|  10.4k|}
decodeframe.c:build_inter_predictors_sub8x8:
   91|  10.0k|                                                 uint8_t **mc_buf) {
   92|       |#else
   93|       |static inline void build_inter_predictors_sub8x8(const AV1_COMMON *cm,
   94|       |                                                 MACROBLOCKD *xd, int plane,
   95|       |                                                 const MB_MODE_INFO *mi,
   96|       |                                                 int mi_x, int mi_y) {
   97|       |#endif  // IS_DEC
   98|  10.0k|  const BLOCK_SIZE bsize = mi->bsize;
   99|  10.0k|  struct macroblockd_plane *const pd = &xd->plane[plane];
  100|  10.0k|  const bool ss_x = pd->subsampling_x;
  101|  10.0k|  const bool ss_y = pd->subsampling_y;
  102|  10.0k|  const int b4_w = block_size_wide[bsize] >> ss_x;
  103|  10.0k|  const int b4_h = block_size_high[bsize] >> ss_y;
  104|  10.0k|  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
  105|  10.0k|  const int b8_w = block_size_wide[plane_bsize];
  106|  10.0k|  const int b8_h = block_size_high[plane_bsize];
  107|  10.0k|  const int is_compound = has_second_ref(mi);
  108|  10.0k|  assert(!is_compound);
  109|  10.0k|  assert(!is_intrabc_block(mi));
  110|       |
  111|       |  // For sub8x8 chroma blocks, we may be covering more than one luma block's
  112|       |  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
  113|       |  // the top-left corner of the prediction source - the correct top-left corner
  114|       |  // is at (pre_x, pre_y).
  115|  10.0k|  const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0;
  ------------------
  |  Branch (115:25): [True: 6.49k, False: 3.52k]
  |  Branch (115:58): [True: 6.46k, False: 36]
  ------------------
  116|  10.0k|  const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0;
  ------------------
  |  Branch (116:25): [True: 4.69k, False: 5.32k]
  |  Branch (116:58): [True: 4.69k, False: 0]
  ------------------
  117|  10.0k|  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
  ------------------
  |  |   40|  10.0k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  10.0k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  118|  10.0k|  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
  ------------------
  |  |   40|  10.0k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  10.0k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  119|       |
  120|  10.0k|  int row = row_start;
  121|  26.4k|  for (int y = 0; y < b8_h; y += b4_h) {
  ------------------
  |  Branch (121:19): [True: 16.4k, False: 10.0k]
  ------------------
  122|  16.4k|    int col = col_start;
  123|  38.7k|    for (int x = 0; x < b8_w; x += b4_w) {
  ------------------
  |  Branch (123:21): [True: 22.3k, False: 16.4k]
  ------------------
  124|  22.3k|      MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
  125|  22.3k|      struct buf_2d *const dst_buf = &pd->dst;
  126|  22.3k|      uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
  127|  22.3k|      int ref = 0;
  128|  22.3k|      const RefCntBuffer *ref_buf =
  129|  22.3k|          get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]);
  130|  22.3k|      const struct scale_factors *ref_scale_factors =
  131|  22.3k|          get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]);
  132|  22.3k|      const struct scale_factors *const sf = ref_scale_factors;
  133|  22.3k|      const struct buf_2d pre_buf = {
  134|  22.3k|        NULL,
  135|  22.3k|        (plane == 1) ? ref_buf->buf.u_buffer : ref_buf->buf.v_buffer,
  ------------------
  |  Branch (135:9): [True: 11.1k, False: 11.1k]
  ------------------
  136|  22.3k|        ref_buf->buf.uv_crop_width,
  137|  22.3k|        ref_buf->buf.uv_crop_height,
  138|  22.3k|        ref_buf->buf.uv_stride,
  139|  22.3k|      };
  140|       |
  141|  22.3k|      const MV mv = this_mbmi->mv[ref].as_mv;
  142|       |
  143|  22.3k|      InterPredParams inter_pred_params;
  144|  22.3k|      av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y,
  145|  22.3k|                            pre_x + x, pd->subsampling_x, pd->subsampling_y,
  146|  22.3k|                            xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf,
  147|  22.3k|                            &pre_buf, this_mbmi->interp_filters);
  148|  22.3k|      inter_pred_params.conv_params =
  149|  22.3k|          get_conv_params_no_round(ref, plane, NULL, 0, is_compound, xd->bd);
  150|       |
  151|  22.3k|#if IS_DEC
  152|  22.3k|      build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params,
  153|  22.3k|                                xd, mi_x + x, mi_y + y, ref, mc_buf);
  154|       |#else
  155|       |      build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params);
  156|       |#endif  // IS_DEC
  157|       |
  158|  22.3k|      ++col;
  159|  22.3k|    }
  160|  16.4k|    ++row;
  161|  16.4k|  }
  162|  10.0k|}
decodeframe.c:build_one_inter_predictor:
   22|   193k|                                             uint8_t **mc_buf) {
   23|       |#else
   24|       |static inline void build_one_inter_predictor(
   25|       |    uint8_t *dst, int dst_stride, const MV *src_mv,
   26|       |    InterPredParams *inter_pred_params) {
   27|       |#endif  // IS_DEC
   28|   193k|  SubpelParams subpel_params;
   29|   193k|  uint8_t *src;
   30|   193k|  int src_stride;
   31|   193k|#if IS_DEC
   32|   193k|  dec_calc_subpel_params_and_extend(src_mv, inter_pred_params, xd, mi_x, mi_y,
   33|   193k|                                    ref, mc_buf, &src, &subpel_params,
   34|   193k|                                    &src_stride);
   35|       |#else
   36|       |  enc_calc_subpel_params(src_mv, inter_pred_params, &src, &subpel_params,
   37|       |                         &src_stride);
   38|       |#endif  // IS_DEC
   39|   193k|  if (inter_pred_params->comp_mode == UNIFORM_SINGLE ||
  ------------------
  |  Branch (39:7): [True: 158k, False: 35.9k]
  ------------------
   40|   190k|      inter_pred_params->comp_mode == UNIFORM_COMP) {
  ------------------
  |  Branch (40:7): [True: 32.7k, False: 3.19k]
  ------------------
   41|   190k|    av1_make_inter_predictor(src, src_stride, dst, dst_stride,
   42|   190k|                             inter_pred_params, &subpel_params);
   43|   190k|  } else {
   44|  3.19k|    av1_make_masked_inter_predictor(src, src_stride, dst, dst_stride,
   45|  3.19k|                                    inter_pred_params, &subpel_params);
   46|  3.19k|  }
   47|   193k|}
decodeframe.c:build_inter_predictors_8x8_and_bigger:
  167|   153k|    int build_for_obmc, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf) {
  168|       |#else
  169|       |static inline void build_inter_predictors_8x8_and_bigger(
  170|       |    const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
  171|       |    int build_for_obmc, int bw, int bh, int mi_x, int mi_y) {
  172|       |#endif  // IS_DEC
  173|   153k|  const int is_compound = has_second_ref(mi);
  174|   153k|  const int is_intrabc = is_intrabc_block(mi);
  175|   153k|  assert(IMPLIES(is_intrabc, !is_compound));
  176|   153k|  struct macroblockd_plane *const pd = &xd->plane[plane];
  177|   153k|  struct buf_2d *const dst_buf = &pd->dst;
  178|   153k|  uint8_t *const dst = dst_buf->buf;
  179|       |
  180|   153k|  int is_global[2] = { 0, 0 };
  181|   325k|  for (int ref = 0; ref < 1 + is_compound; ++ref) {
  ------------------
  |  Branch (181:21): [True: 171k, False: 153k]
  ------------------
  182|   171k|    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
  183|   171k|    is_global[ref] = is_global_mv_block(mi, wm->wmtype);
  184|   171k|  }
  185|       |
  186|   153k|  const BLOCK_SIZE bsize = mi->bsize;
  187|   153k|  const int ss_x = pd->subsampling_x;
  188|   153k|  const int ss_y = pd->subsampling_y;
  189|   153k|  const int row_start =
  190|   153k|      (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
  ------------------
  |  Branch (190:7): [True: 18.6k, False: 135k]
  |  Branch (190:40): [True: 1.59k, False: 17.0k]
  |  Branch (190:48): [True: 340, False: 1.25k]
  ------------------
  191|   153k|  const int col_start =
  192|   153k|      (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
  ------------------
  |  Branch (192:7): [True: 13.2k, False: 140k]
  |  Branch (192:40): [True: 1.26k, False: 11.9k]
  |  Branch (192:48): [True: 244, False: 1.01k]
  ------------------
  193|   153k|  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
  ------------------
  |  |   40|   153k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   153k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  194|   153k|  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
  ------------------
  |  |   40|   153k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   153k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  195|       |
  196|   325k|  for (int ref = 0; ref < 1 + is_compound; ++ref) {
  ------------------
  |  Branch (196:21): [True: 171k, False: 153k]
  ------------------
  197|   171k|    const struct scale_factors *const sf =
  198|   171k|        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
  ------------------
  |  Branch (198:9): [True: 13.1k, False: 158k]
  ------------------
  199|   171k|    struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
  ------------------
  |  Branch (199:36): [True: 13.1k, False: 158k]
  ------------------
  200|   171k|    const MV mv = mi->mv[ref].as_mv;
  201|   171k|    const WarpTypesAllowed warp_types = { is_global[ref],
  202|   171k|                                          mi->motion_mode == WARPED_CAUSAL };
  203|       |
  204|   171k|    InterPredParams inter_pred_params;
  205|   171k|    av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x,
  206|   171k|                          pd->subsampling_x, pd->subsampling_y, xd->bd,
  207|   171k|                          is_cur_buf_hbd(xd), mi->use_intrabc, sf, pre_buf,
  208|   171k|                          mi->interp_filters);
  209|   171k|    if (is_compound) av1_init_comp_mode(&inter_pred_params);
  ------------------
  |  Branch (209:9): [True: 35.9k, False: 135k]
  ------------------
  210|   171k|    inter_pred_params.conv_params = get_conv_params_no_round(
  211|   171k|        ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
  ------------------
  |  |   32|   171k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   171k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  212|       |
  213|   171k|    av1_dist_wtd_comp_weight_assign(
  214|   171k|        cm, mi, &inter_pred_params.conv_params.fwd_offset,
  215|   171k|        &inter_pred_params.conv_params.bck_offset,
  216|   171k|        &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound);
  217|       |
  218|   171k|    if (!build_for_obmc)
  ------------------
  |  Branch (218:9): [True: 139k, False: 32.3k]
  ------------------
  219|   139k|      av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi);
  220|       |
  221|   171k|    if (is_masked_compound_type(mi->interinter_comp.type)) {
  ------------------
  |  Branch (221:9): [True: 6.38k, False: 165k]
  ------------------
  222|  6.38k|      inter_pred_params.sb_type = mi->bsize;
  223|  6.38k|      inter_pred_params.mask_comp = mi->interinter_comp;
  224|  6.38k|      if (ref == 1) {
  ------------------
  |  Branch (224:11): [True: 3.19k, False: 3.19k]
  ------------------
  225|  3.19k|        inter_pred_params.conv_params.do_average = 0;
  226|  3.19k|        inter_pred_params.comp_mode = MASK_COMP;
  227|  3.19k|      }
  228|       |      // Assign physical buffer.
  229|  6.38k|      inter_pred_params.mask_comp.seg_mask = xd->seg_mask;
  230|  6.38k|    }
  231|       |
  232|   171k|#if IS_DEC
  233|   171k|    build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params, xd,
  234|   171k|                              mi_x, mi_y, ref, mc_buf);
  235|       |#else
  236|       |    build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params);
  237|       |#endif  // IS_DEC
  238|   171k|  }
  239|   153k|}

av1_predict_intra_block:
 1695|  4.53M|                             int plane) {
 1696|  4.53M|  const MB_MODE_INFO *const mbmi = xd->mi[0];
 1697|  4.53M|  const int txwpx = tx_size_wide[tx_size];
 1698|  4.53M|  const int txhpx = tx_size_high[tx_size];
 1699|  4.53M|  const int x = col_off << MI_SIZE_LOG2;
  ------------------
  |  |   39|  4.53M|#define MI_SIZE_LOG2 2
  ------------------
 1700|  4.53M|  const int y = row_off << MI_SIZE_LOG2;
  ------------------
  |  |   39|  4.53M|#define MI_SIZE_LOG2 2
  ------------------
 1701|  4.53M|  const int is_hbd = is_cur_buf_hbd(xd);
 1702|       |
 1703|  4.53M|  assert(mode < INTRA_MODES);
 1704|       |
 1705|  4.53M|  if (use_palette) {
  ------------------
  |  Branch (1705:7): [True: 138k, False: 4.39M]
  ------------------
 1706|   138k|    int r, c;
 1707|   138k|    const uint8_t *const map = xd->plane[plane != 0].color_index_map +
 1708|   138k|                               xd->color_index_map_offset[plane != 0];
 1709|   138k|    const uint16_t *const palette =
 1710|   138k|        mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE;
  ------------------
  |  |   63|   138k|#define PALETTE_MAX_SIZE 8
  ------------------
 1711|   138k|    if (is_hbd) {
  ------------------
  |  Branch (1711:9): [True: 37.7k, False: 101k]
  ------------------
 1712|  37.7k|      uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
  ------------------
  |  |   75|  37.7k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1713|   353k|      for (r = 0; r < txhpx; ++r) {
  ------------------
  |  Branch (1713:19): [True: 315k, False: 37.7k]
  ------------------
 1714|  4.10M|        for (c = 0; c < txwpx; ++c) {
  ------------------
  |  Branch (1714:21): [True: 3.78M, False: 315k]
  ------------------
 1715|  3.78M|          dst16[r * dst_stride + c] = palette[map[(r + y) * wpx + c + x]];
 1716|  3.78M|        }
 1717|   315k|      }
 1718|   101k|    } else {
 1719|   694k|      for (r = 0; r < txhpx; ++r) {
  ------------------
  |  Branch (1719:19): [True: 593k, False: 101k]
  ------------------
 1720|  5.91M|        for (c = 0; c < txwpx; ++c) {
  ------------------
  |  Branch (1720:21): [True: 5.31M, False: 593k]
  ------------------
 1721|  5.31M|          dst[r * dst_stride + c] =
 1722|  5.31M|              (uint8_t)palette[map[(r + y) * wpx + c + x]];
 1723|  5.31M|        }
 1724|   593k|      }
 1725|   101k|    }
 1726|   138k|    return;
 1727|   138k|  }
 1728|       |
 1729|  4.39M|  const struct macroblockd_plane *const pd = &xd->plane[plane];
 1730|  4.39M|  const int ss_x = pd->subsampling_x;
 1731|  4.39M|  const int ss_y = pd->subsampling_y;
 1732|  4.39M|  const int have_top =
 1733|  4.39M|      row_off || (ss_y ? xd->chroma_up_available : xd->up_available);
  ------------------
  |  Branch (1733:7): [True: 1.17M, False: 3.21M]
  |  Branch (1733:18): [True: 2.92M, False: 296k]
  |  Branch (1733:19): [True: 169k, False: 3.04M]
  ------------------
 1734|  4.39M|  const int have_left =
 1735|  4.39M|      col_off || (ss_x ? xd->chroma_left_available : xd->left_available);
  ------------------
  |  Branch (1735:7): [True: 1.21M, False: 3.17M]
  |  Branch (1735:18): [True: 2.91M, False: 252k]
  |  Branch (1735:19): [True: 179k, False: 2.99M]
  ------------------
 1736|       |
 1737|       |  // Distance between the right edge of this prediction block to
 1738|       |  // the frame right edge
 1739|  4.39M|  const int xr = (xd->mb_to_right_edge >> (3 + ss_x)) + wpx - x - txwpx;
 1740|       |  // Distance between the bottom edge of this prediction block to
 1741|       |  // the frame bottom edge
 1742|  4.39M|  const int yd = (xd->mb_to_bottom_edge >> (3 + ss_y)) + hpx - y - txhpx;
 1743|  4.39M|  const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
 1744|  4.39M|  const int is_dr_mode = av1_is_directional_mode(mode);
 1745|       |
 1746|       |  // The computations in this function, as well as in build_intra_predictors(),
 1747|       |  // are generalized for all intra modes. Some of these operations are not
 1748|       |  // required since non-directional intra modes (i.e., DC, SMOOTH, SMOOTH_H,
 1749|       |  // SMOOTH_V, and PAETH) specifically require left and top neighbors. Hence, a
 1750|       |  // separate function build_non_directional_intra_predictors() is introduced
 1751|       |  // for these modes to avoid redundant computations while generating pred data.
 1752|       |
 1753|  4.39M|  const int n_top_px = have_top ? AOMMIN(txwpx, xr + txwpx) : 0;
  ------------------
  |  |   34|  4.09M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.85M, False: 238k]
  |  |  ------------------
  ------------------
  |  Branch (1753:24): [True: 4.09M, False: 295k]
  ------------------
 1754|  4.39M|  const int n_left_px = have_left ? AOMMIN(txhpx, yd + txhpx) : 0;
  ------------------
  |  |   34|  4.13M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.95M, False: 182k]
  |  |  ------------------
  ------------------
  |  Branch (1754:25): [True: 4.13M, False: 252k]
  ------------------
 1755|  4.39M|  if (!use_filter_intra && !is_dr_mode) {
  ------------------
  |  Branch (1755:7): [True: 4.00M, False: 384k]
  |  Branch (1755:28): [True: 2.71M, False: 1.29M]
  ------------------
 1756|  2.71M|#if CONFIG_AV1_HIGHBITDEPTH
 1757|  2.71M|    if (is_hbd) {
  ------------------
  |  Branch (1757:9): [True: 1.01M, False: 1.69M]
  ------------------
 1758|  1.01M|      highbd_build_non_directional_intra_predictors(
 1759|  1.01M|          ref, ref_stride, dst, dst_stride, mode, tx_size, n_top_px, n_left_px,
 1760|  1.01M|          xd->bd);
 1761|  1.01M|      return;
 1762|  1.01M|    }
 1763|  1.69M|#endif  // CONFIG_AV1_HIGHBITDEPTH
 1764|  1.69M|    build_non_directional_intra_predictors(ref, ref_stride, dst, dst_stride,
 1765|  1.69M|                                           mode, tx_size, n_top_px, n_left_px);
 1766|  1.69M|    return;
 1767|  2.71M|  }
 1768|       |
 1769|  1.68M|  const int txw = tx_size_wide_unit[tx_size];
 1770|  1.68M|  const int txh = tx_size_high_unit[tx_size];
 1771|  1.68M|  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
  ------------------
  |  |   39|  1.68M|#define MI_SIZE_LOG2 2
  ------------------
 1772|  1.68M|  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
  ------------------
  |  |   39|  1.68M|#define MI_SIZE_LOG2 2
  ------------------
 1773|  1.68M|  const int right_available =
 1774|  1.68M|      mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end;
 1775|  1.68M|  const int bottom_available =
 1776|  1.68M|      (yd > 0) && (mi_row + ((row_off + txh) << ss_y) < xd->tile.mi_row_end);
  ------------------
  |  Branch (1776:7): [True: 1.61M, False: 67.5k]
  |  Branch (1776:19): [True: 1.61M, False: 486]
  ------------------
 1777|       |
 1778|  1.68M|  const PARTITION_TYPE partition = mbmi->partition;
 1779|       |
 1780|  1.68M|  BLOCK_SIZE bsize = mbmi->bsize;
 1781|       |  // force 4x4 chroma component block size.
 1782|  1.68M|  if (ss_x || ss_y) {
  ------------------
  |  Branch (1782:7): [True: 6.99k, False: 1.67M]
  |  Branch (1782:15): [True: 0, False: 1.67M]
  ------------------
 1783|  7.28k|    bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
 1784|  7.28k|  }
 1785|       |
 1786|  1.68M|  int p_angle = 0;
 1787|  1.68M|  int need_top_right = extend_modes[mode] & NEED_ABOVERIGHT;
 1788|  1.68M|  int need_bottom_left = extend_modes[mode] & NEED_BOTTOMLEFT;
 1789|       |
 1790|  1.68M|  if (use_filter_intra) {
  ------------------
  |  Branch (1790:7): [True: 384k, False: 1.29M]
  ------------------
 1791|   384k|    need_top_right = 0;
 1792|   384k|    need_bottom_left = 0;
 1793|   384k|  }
 1794|  1.68M|  if (is_dr_mode) {
  ------------------
  |  Branch (1794:7): [True: 1.29M, False: 384k]
  ------------------
 1795|  1.29M|    p_angle = mode_to_angle_map[mode] + angle_delta;
 1796|  1.29M|    need_top_right = p_angle < 90;
 1797|  1.29M|    need_bottom_left = p_angle > 180;
 1798|  1.29M|  }
 1799|       |
 1800|       |  // Possible states for have_top_right(TR) and have_bottom_left(BL)
 1801|       |  // -1 : TR and BL are not needed
 1802|       |  //  0 : TR and BL are needed but not available
 1803|       |  // > 0 : TR and BL are needed and pixels are available
 1804|  1.68M|  const int have_top_right =
 1805|  1.68M|      need_top_right ? has_top_right(sb_size, bsize, mi_row, mi_col, have_top,
  ------------------
  |  Branch (1805:7): [True: 237k, False: 1.44M]
  ------------------
 1806|   237k|                                     right_available, partition, tx_size,
 1807|   237k|                                     row_off, col_off, ss_x, ss_y)
 1808|  1.68M|                     : -1;
 1809|  1.68M|  const int have_bottom_left =
 1810|  1.68M|      need_bottom_left ? has_bottom_left(sb_size, bsize, mi_row, mi_col,
  ------------------
  |  Branch (1810:7): [True: 283k, False: 1.39M]
  ------------------
 1811|   283k|                                         bottom_available, have_left, partition,
 1812|   283k|                                         tx_size, row_off, col_off, ss_x, ss_y)
 1813|  1.68M|                       : -1;
 1814|       |
 1815|  1.68M|  const int disable_edge_filter = !enable_intra_edge_filter;
 1816|  1.68M|  const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane);
 1817|  1.68M|  const int n_topright_px =
 1818|  1.68M|      have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right;
  ------------------
  |  |   34|   144k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 137k, False: 7.01k]
  |  |  ------------------
  ------------------
  |  Branch (1818:7): [True: 144k, False: 1.53M]
  ------------------
 1819|  1.68M|  const int n_bottomleft_px =
 1820|  1.68M|      have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left;
  ------------------
  |  |   34|  91.3k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 87.9k, False: 3.40k]
  |  |  ------------------
  ------------------
  |  Branch (1820:7): [True: 91.3k, False: 1.58M]
  ------------------
 1821|  1.68M|#if CONFIG_AV1_HIGHBITDEPTH
 1822|  1.68M|  if (is_hbd) {
  ------------------
  |  Branch (1822:7): [True: 783k, False: 897k]
  ------------------
 1823|   783k|    highbd_build_directional_and_filter_intra_predictors(
 1824|   783k|        ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode,
 1825|   783k|        tx_size, disable_edge_filter, n_top_px, n_topright_px, n_left_px,
 1826|   783k|        n_bottomleft_px, intra_edge_filter_type, xd->bd);
 1827|   783k|    return;
 1828|   783k|  }
 1829|   897k|#endif
 1830|   897k|  build_directional_and_filter_intra_predictors(
 1831|   897k|      ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode,
 1832|   897k|      tx_size, disable_edge_filter, n_top_px, n_topright_px, n_left_px,
 1833|   897k|      n_bottomleft_px, intra_edge_filter_type);
 1834|   897k|}
av1_predict_intra_block_facade:
 1838|  4.52M|                                    TX_SIZE tx_size) {
 1839|  4.52M|  const MB_MODE_INFO *const mbmi = xd->mi[0];
 1840|  4.52M|  struct macroblockd_plane *const pd = &xd->plane[plane];
 1841|  4.52M|  const int dst_stride = pd->dst.stride;
 1842|  4.52M|  uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
  ------------------
  |  |   39|  4.52M|#define MI_SIZE_LOG2 2
  ------------------
 1843|  4.52M|  const PREDICTION_MODE mode =
 1844|  4.52M|      (plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
  ------------------
  |  |  210|  4.52M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (1844:7): [True: 2.40M, False: 2.12M]
  ------------------
 1845|  4.52M|  const int use_palette = mbmi->palette_mode_info.palette_size[plane != 0] > 0;
 1846|  4.52M|  const FILTER_INTRA_MODE filter_intra_mode =
 1847|  4.52M|      (plane == AOM_PLANE_Y && mbmi->filter_intra_mode_info.use_filter_intra)
  ------------------
  |  |  210|  9.05M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (1847:8): [True: 2.40M, False: 2.12M]
  |  Branch (1847:32): [True: 384k, False: 2.01M]
  ------------------
 1848|  4.52M|          ? mbmi->filter_intra_mode_info.filter_intra_mode
 1849|  4.52M|          : FILTER_INTRA_MODES;
 1850|  4.52M|  const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP;
  ------------------
  |  |  210|  4.52M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
                const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP;
  ------------------
  |  |  468|  4.52M|#define ANGLE_STEP 3
  ------------------
 1851|  4.52M|  const SequenceHeader *seq_params = cm->seq_params;
 1852|       |
 1853|  4.52M|#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 1854|  4.52M|  if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) {
  ------------------
  |  |  210|  9.05M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (1854:7): [True: 2.12M, False: 2.40M]
  |  Branch (1854:31): [True: 272k, False: 1.85M]
  ------------------
 1855|       |#if CONFIG_DEBUG
 1856|       |    assert(is_cfl_allowed(xd));
 1857|       |    const BLOCK_SIZE plane_bsize =
 1858|       |        get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
 1859|       |    (void)plane_bsize;
 1860|       |    assert(plane_bsize < BLOCK_SIZES_ALL);
 1861|       |    if (!xd->lossless[mbmi->segment_id]) {
 1862|       |      assert(blk_col == 0);
 1863|       |      assert(blk_row == 0);
 1864|       |      assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
 1865|       |      assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
 1866|       |    }
 1867|       |#endif
 1868|   272k|    CFL_CTX *const cfl = &xd->cfl;
 1869|   272k|    CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane);
 1870|   272k|    if (!cfl->dc_pred_is_cached[pred_plane]) {
  ------------------
  |  Branch (1870:9): [True: 272k, False: 0]
  ------------------
 1871|   272k|      av1_predict_intra_block(xd, seq_params->sb_size,
 1872|   272k|                              seq_params->enable_intra_edge_filter, pd->width,
 1873|   272k|                              pd->height, tx_size, mode, angle_delta,
 1874|   272k|                              use_palette, filter_intra_mode, dst, dst_stride,
 1875|   272k|                              dst, dst_stride, blk_col, blk_row, plane);
 1876|   272k|      if (cfl->use_dc_pred_cache) {
  ------------------
  |  Branch (1876:11): [True: 0, False: 272k]
  ------------------
 1877|      0|        cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]);
 1878|      0|        cfl->dc_pred_is_cached[pred_plane] = true;
 1879|      0|      }
 1880|   272k|    } else {
 1881|      0|      cfl_load_dc_pred(xd, dst, dst_stride, tx_size, pred_plane);
 1882|      0|    }
 1883|   272k|    av1_cfl_predict_block(xd, dst, dst_stride, tx_size, plane);
 1884|   272k|    return;
 1885|   272k|  }
 1886|  4.25M|#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 1887|  4.25M|  av1_predict_intra_block(
 1888|  4.25M|      xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width,
 1889|  4.25M|      pd->height, tx_size, mode, angle_delta, use_palette, filter_intra_mode,
 1890|  4.25M|      dst, dst_stride, dst, dst_stride, blk_col, blk_row, plane);
 1891|  4.25M|}
av1_init_intra_predictors:
 1893|  17.9k|void av1_init_intra_predictors(void) {
 1894|  17.9k|  aom_once(init_intra_predictors_internal);
 1895|  17.9k|}
reconintra.c:highbd_build_non_directional_intra_predictors:
 1557|  1.01M|    int bit_depth) {
 1558|  1.01M|  int i = 0;
 1559|  1.01M|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  ------------------
  |  |   75|  1.01M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1560|  1.01M|  const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
  ------------------
  |  |   75|  1.01M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1561|  1.01M|  const int txwpx = tx_size_wide[tx_size];
 1562|  1.01M|  const int txhpx = tx_size_high[tx_size];
 1563|  1.01M|  int need_left = extend_modes[mode] & NEED_LEFT;
 1564|  1.01M|  int need_above = extend_modes[mode] & NEED_ABOVE;
 1565|  1.01M|  int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
 1566|  1.01M|  const uint16_t *above_ref = ref - ref_stride;
 1567|  1.01M|  const uint16_t *left_ref = ref - 1;
 1568|  1.01M|  const int base = 128 << (bit_depth - 8);
 1569|       |
 1570|  1.01M|  assert(n_top_px >= 0);
 1571|  1.01M|  assert(n_left_px >= 0);
 1572|  1.01M|  assert(mode == DC_PRED || mode == SMOOTH_PRED || mode == SMOOTH_V_PRED ||
 1573|  1.01M|         mode == SMOOTH_H_PRED || mode == PAETH_PRED);
 1574|       |
 1575|  1.01M|  if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
  ------------------
  |  Branch (1575:8): [True: 0, False: 1.01M]
  |  Branch (1575:23): [True: 0, False: 0]
  |  Branch (1575:43): [True: 0, False: 1.01M]
  |  Branch (1575:57): [True: 0, False: 0]
  ------------------
 1576|      0|    int val = 0;
 1577|      0|    if (need_left) {
  ------------------
  |  Branch (1577:9): [True: 0, False: 0]
  ------------------
 1578|      0|      val = (n_top_px > 0) ? above_ref[0] : base + 1;
  ------------------
  |  Branch (1578:13): [True: 0, False: 0]
  ------------------
 1579|      0|    } else {
 1580|      0|      val = (n_left_px > 0) ? left_ref[0] : base - 1;
  ------------------
  |  Branch (1580:13): [True: 0, False: 0]
  ------------------
 1581|      0|    }
 1582|      0|    for (i = 0; i < txhpx; ++i) {
  ------------------
  |  Branch (1582:17): [True: 0, False: 0]
  ------------------
 1583|      0|      aom_memset16(dst, val, txwpx);
 1584|      0|      dst += dst_stride;
 1585|      0|    }
 1586|      0|    return;
 1587|      0|  }
 1588|       |
 1589|  1.01M|  DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
  ------------------
  |  |   19|  1.01M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1590|  1.01M|  DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
  ------------------
  |  |   19|  1.01M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1591|  1.01M|  uint16_t *const above_row = above_data + 16;
 1592|  1.01M|  uint16_t *const left_col = left_data + 16;
 1593|       |
 1594|  1.01M|  if (need_left) {
  ------------------
  |  Branch (1594:7): [True: 1.01M, False: 6]
  ------------------
 1595|  1.01M|    aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS);
  ------------------
  |  |   38|  1.01M|#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
  |  |  ------------------
  |  |  |  |  183|  1.01M|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  182|  1.01M|#define MAX_TX_SIZE_LOG2 (6)
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1596|  1.01M|    if (n_left_px > 0) {
  ------------------
  |  Branch (1596:9): [True: 937k, False: 80.5k]
  ------------------
 1597|  10.5M|      for (i = 0; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
  ------------------
  |  Branch (1597:19): [True: 9.57M, False: 937k]
  ------------------
 1598|   937k|      if (i < txhpx) aom_memset16(&left_col[i], left_col[i - 1], txhpx - i);
  ------------------
  |  Branch (1598:11): [True: 8.65k, False: 928k]
  ------------------
 1599|   937k|    } else if (n_top_px > 0) {
  ------------------
  |  Branch (1599:16): [True: 63.0k, False: 17.5k]
  ------------------
 1600|  63.0k|      aom_memset16(left_col, above_ref[0], txhpx);
 1601|  63.0k|    }
 1602|  1.01M|  }
 1603|       |
 1604|  1.01M|  if (need_above) {
  ------------------
  |  Branch (1604:7): [True: 1.01M, False: 18.4E]
  ------------------
 1605|  1.01M|    aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS);
  ------------------
  |  |   38|  1.01M|#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
  |  |  ------------------
  |  |  |  |  183|  1.01M|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  182|  1.01M|#define MAX_TX_SIZE_LOG2 (6)
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1606|  1.01M|    if (n_top_px > 0) {
  ------------------
  |  Branch (1606:9): [True: 933k, False: 84.8k]
  ------------------
 1607|   933k|      memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
 1608|   933k|      i = n_top_px;
 1609|   933k|      if (i < txwpx) aom_memset16(&above_row[i], above_row[i - 1], (txwpx - i));
  ------------------
  |  Branch (1609:11): [True: 17.0k, False: 916k]
  ------------------
 1610|   933k|    } else if (n_left_px > 0) {
  ------------------
  |  Branch (1610:16): [True: 67.2k, False: 17.5k]
  ------------------
 1611|  67.2k|      aom_memset16(above_row, left_ref[0], txwpx);
 1612|  67.2k|    }
 1613|  1.01M|  }
 1614|       |
 1615|  1.01M|  if (need_above_left) {
  ------------------
  |  Branch (1615:7): [True: 93.9k, False: 924k]
  ------------------
 1616|  93.9k|    if (n_top_px > 0 && n_left_px > 0) {
  ------------------
  |  Branch (1616:9): [True: 87.4k, False: 6.43k]
  |  Branch (1616:25): [True: 82.6k, False: 4.84k]
  ------------------
 1617|  82.6k|      above_row[-1] = above_ref[-1];
 1618|  82.6k|    } else if (n_top_px > 0) {
  ------------------
  |  Branch (1618:16): [True: 4.84k, False: 6.43k]
  ------------------
 1619|  4.84k|      above_row[-1] = above_ref[0];
 1620|  6.43k|    } else if (n_left_px > 0) {
  ------------------
  |  Branch (1620:16): [True: 5.59k, False: 835]
  ------------------
 1621|  5.59k|      above_row[-1] = left_ref[0];
 1622|  5.59k|    } else {
 1623|    835|      above_row[-1] = base;
 1624|    835|    }
 1625|  93.9k|    left_col[-1] = above_row[-1];
 1626|  93.9k|  }
 1627|       |
 1628|  1.01M|  if (mode == DC_PRED) {
  ------------------
  |  Branch (1628:7): [True: 625k, False: 392k]
  ------------------
 1629|   625k|    dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](
 1630|   625k|        dst, dst_stride, above_row, left_col, bit_depth);
 1631|   625k|  } else {
 1632|   392k|    pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, bit_depth);
 1633|   392k|  }
 1634|  1.01M|}
reconintra.c:build_non_directional_intra_predictors:
 1250|  1.69M|    PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px, int n_left_px) {
 1251|  1.69M|  const uint8_t *above_ref = ref - ref_stride;
 1252|  1.69M|  const uint8_t *left_ref = ref - 1;
 1253|  1.69M|  const int txwpx = tx_size_wide[tx_size];
 1254|  1.69M|  const int txhpx = tx_size_high[tx_size];
 1255|  1.69M|  const int need_left = extend_modes[mode] & NEED_LEFT;
 1256|  1.69M|  const int need_above = extend_modes[mode] & NEED_ABOVE;
 1257|  1.69M|  const int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
 1258|  1.69M|  int i = 0;
 1259|  1.69M|  assert(n_top_px >= 0);
 1260|  1.69M|  assert(n_left_px >= 0);
 1261|  1.69M|  assert(mode == DC_PRED || mode == SMOOTH_PRED || mode == SMOOTH_V_PRED ||
 1262|  1.69M|         mode == SMOOTH_H_PRED || mode == PAETH_PRED);
 1263|       |
 1264|  1.69M|  if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
  ------------------
  |  Branch (1264:8): [True: 0, False: 1.69M]
  |  Branch (1264:23): [True: 0, False: 0]
  |  Branch (1264:43): [True: 0, False: 1.69M]
  |  Branch (1264:57): [True: 0, False: 0]
  ------------------
 1265|      0|    int val = 0;
 1266|      0|    if (need_left) {
  ------------------
  |  Branch (1266:9): [True: 0, False: 0]
  ------------------
 1267|      0|      val = (n_top_px > 0) ? above_ref[0] : 129;
  ------------------
  |  Branch (1267:13): [True: 0, False: 0]
  ------------------
 1268|      0|    } else {
 1269|      0|      val = (n_left_px > 0) ? left_ref[0] : 127;
  ------------------
  |  Branch (1269:13): [True: 0, False: 0]
  ------------------
 1270|      0|    }
 1271|      0|    for (i = 0; i < txhpx; ++i) {
  ------------------
  |  Branch (1271:17): [True: 0, False: 0]
  ------------------
 1272|      0|      memset(dst, val, txwpx);
 1273|      0|      dst += dst_stride;
 1274|      0|    }
 1275|      0|    return;
 1276|      0|  }
 1277|       |
 1278|  1.69M|  DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
  ------------------
  |  |   19|  1.69M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1279|  1.69M|  DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
  ------------------
  |  |   19|  1.69M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1280|  1.69M|  uint8_t *const above_row = above_data + 16;
 1281|  1.69M|  uint8_t *const left_col = left_data + 16;
 1282|       |
 1283|  1.69M|  if (need_left) {
  ------------------
  |  Branch (1283:7): [True: 1.69M, False: 18.4E]
  ------------------
 1284|  1.69M|    memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS);
  ------------------
  |  |   38|  1.69M|#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
  |  |  ------------------
  |  |  |  |  183|  1.69M|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  182|  1.69M|#define MAX_TX_SIZE_LOG2 (6)
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1285|  1.69M|    if (n_left_px > 0) {
  ------------------
  |  Branch (1285:9): [True: 1.58M, False: 110k]
  ------------------
 1286|  17.3M|      for (i = 0; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
  ------------------
  |  Branch (1286:19): [True: 15.7M, False: 1.58M]
  ------------------
 1287|  1.58M|      if (i < txhpx) memset(&left_col[i], left_col[i - 1], txhpx - i);
  ------------------
  |  Branch (1287:11): [True: 33.9k, False: 1.54M]
  ------------------
 1288|  1.58M|    } else if (n_top_px > 0) {
  ------------------
  |  Branch (1288:16): [True: 88.1k, False: 22.0k]
  ------------------
 1289|  88.1k|      memset(left_col, above_ref[0], txhpx);
 1290|  88.1k|    }
 1291|  1.69M|  }
 1292|       |
 1293|  1.69M|  if (need_above) {
  ------------------
  |  Branch (1293:7): [True: 1.69M, False: 31]
  ------------------
 1294|  1.69M|    memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS);
  ------------------
  |  |   38|  1.69M|#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
  |  |  ------------------
  |  |  |  |  183|  1.69M|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  182|  1.69M|#define MAX_TX_SIZE_LOG2 (6)
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1295|  1.69M|    if (n_top_px > 0) {
  ------------------
  |  Branch (1295:9): [True: 1.55M, False: 135k]
  ------------------
 1296|  1.55M|      memcpy(above_row, above_ref, n_top_px);
 1297|  1.55M|      i = n_top_px;
 1298|  1.55M|      if (i < txwpx) memset(&above_row[i], above_row[i - 1], txwpx - i);
  ------------------
  |  Branch (1298:11): [True: 47.3k, False: 1.51M]
  ------------------
 1299|  1.55M|    } else if (n_left_px > 0) {
  ------------------
  |  Branch (1299:16): [True: 113k, False: 22.0k]
  ------------------
 1300|   113k|      memset(above_row, left_ref[0], txwpx);
 1301|   113k|    }
 1302|  1.69M|  }
 1303|       |
 1304|  1.69M|  if (need_above_left) {
  ------------------
  |  Branch (1304:7): [True: 121k, False: 1.57M]
  ------------------
 1305|   121k|    if (n_top_px > 0 && n_left_px > 0) {
  ------------------
  |  Branch (1305:9): [True: 112k, False: 9.57k]
  |  Branch (1305:25): [True: 104k, False: 7.69k]
  ------------------
 1306|   104k|      above_row[-1] = above_ref[-1];
 1307|   104k|    } else if (n_top_px > 0) {
  ------------------
  |  Branch (1307:16): [True: 7.69k, False: 9.57k]
  ------------------
 1308|  7.69k|      above_row[-1] = above_ref[0];
 1309|  9.57k|    } else if (n_left_px > 0) {
  ------------------
  |  Branch (1309:16): [True: 8.76k, False: 809]
  ------------------
 1310|  8.76k|      above_row[-1] = left_ref[0];
 1311|  8.76k|    } else {
 1312|    809|      above_row[-1] = 128;
 1313|    809|    }
 1314|   121k|    left_col[-1] = above_row[-1];
 1315|   121k|  }
 1316|       |
 1317|  1.69M|  if (mode == DC_PRED) {
  ------------------
  |  Branch (1317:7): [True: 1.25M, False: 436k]
  ------------------
 1318|  1.25M|    dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride, above_row,
 1319|  1.25M|                                                  left_col);
 1320|  1.25M|  } else {
 1321|   436k|    pred[mode][tx_size](dst, dst_stride, above_row, left_col);
 1322|   436k|  }
 1323|  1.69M|}
reconintra.c:scale_chroma_bsize:
 1638|  7.28k|                                            int subsampling_y) {
 1639|  7.28k|  assert(subsampling_x >= 0 && subsampling_x < 2);
 1640|  7.28k|  assert(subsampling_y >= 0 && subsampling_y < 2);
 1641|  7.28k|  BLOCK_SIZE bs = bsize;
 1642|  7.28k|  switch (bsize) {
 1643|    100|    case BLOCK_4X4:
  ------------------
  |  Branch (1643:5): [True: 100, False: 7.18k]
  ------------------
 1644|    100|      if (subsampling_x == 1 && subsampling_y == 1)
  ------------------
  |  Branch (1644:11): [True: 100, False: 0]
  |  Branch (1644:33): [True: 52, False: 48]
  ------------------
 1645|     52|        bs = BLOCK_8X8;
 1646|     48|      else if (subsampling_x == 1)
  ------------------
  |  Branch (1646:16): [True: 48, False: 0]
  ------------------
 1647|     48|        bs = BLOCK_8X4;
 1648|      0|      else if (subsampling_y == 1)
  ------------------
  |  Branch (1648:16): [True: 0, False: 0]
  ------------------
 1649|      0|        bs = BLOCK_4X8;
 1650|    100|      break;
 1651|     72|    case BLOCK_4X8:
  ------------------
  |  Branch (1651:5): [True: 72, False: 7.21k]
  ------------------
 1652|     72|      if (subsampling_x == 1 && subsampling_y == 1)
  ------------------
  |  Branch (1652:11): [True: 72, False: 0]
  |  Branch (1652:33): [True: 72, False: 0]
  ------------------
 1653|     72|        bs = BLOCK_8X8;
 1654|      0|      else if (subsampling_x == 1)
  ------------------
  |  Branch (1654:16): [True: 0, False: 0]
  ------------------
 1655|      0|        bs = BLOCK_8X8;
 1656|      0|      else if (subsampling_y == 1)
  ------------------
  |  Branch (1656:16): [True: 0, False: 0]
  ------------------
 1657|      0|        bs = BLOCK_4X8;
 1658|     72|      break;
 1659|    276|    case BLOCK_8X4:
  ------------------
  |  Branch (1659:5): [True: 276, False: 7.00k]
  ------------------
 1660|    276|      if (subsampling_x == 1 && subsampling_y == 1)
  ------------------
  |  Branch (1660:11): [True: 276, False: 0]
  |  Branch (1660:33): [True: 128, False: 148]
  ------------------
 1661|    128|        bs = BLOCK_8X8;
 1662|    148|      else if (subsampling_x == 1)
  ------------------
  |  Branch (1662:16): [True: 148, False: 0]
  ------------------
 1663|    148|        bs = BLOCK_8X4;
 1664|      0|      else if (subsampling_y == 1)
  ------------------
  |  Branch (1664:16): [True: 0, False: 0]
  ------------------
 1665|      0|        bs = BLOCK_8X8;
 1666|    276|      break;
 1667|    180|    case BLOCK_4X16:
  ------------------
  |  Branch (1667:5): [True: 180, False: 7.10k]
  ------------------
 1668|    180|      if (subsampling_x == 1 && subsampling_y == 1)
  ------------------
  |  Branch (1668:11): [True: 180, False: 0]
  |  Branch (1668:33): [True: 180, False: 0]
  ------------------
 1669|    180|        bs = BLOCK_8X16;
 1670|      0|      else if (subsampling_x == 1)
  ------------------
  |  Branch (1670:16): [True: 0, False: 0]
  ------------------
 1671|      0|        bs = BLOCK_8X16;
 1672|      0|      else if (subsampling_y == 1)
  ------------------
  |  Branch (1672:16): [True: 0, False: 0]
  ------------------
 1673|      0|        bs = BLOCK_4X16;
 1674|    180|      break;
 1675|    312|    case BLOCK_16X4:
  ------------------
  |  Branch (1675:5): [True: 312, False: 6.97k]
  ------------------
 1676|    312|      if (subsampling_x == 1 && subsampling_y == 1)
  ------------------
  |  Branch (1676:11): [True: 312, False: 0]
  |  Branch (1676:33): [True: 116, False: 196]
  ------------------
 1677|    116|        bs = BLOCK_16X8;
 1678|    196|      else if (subsampling_x == 1)
  ------------------
  |  Branch (1678:16): [True: 196, False: 0]
  ------------------
 1679|    196|        bs = BLOCK_16X4;
 1680|      0|      else if (subsampling_y == 1)
  ------------------
  |  Branch (1680:16): [True: 0, False: 0]
  ------------------
 1681|      0|        bs = BLOCK_16X8;
 1682|    312|      break;
 1683|  6.34k|    default: break;
  ------------------
  |  Branch (1683:5): [True: 6.34k, False: 940]
  ------------------
 1684|  7.28k|  }
 1685|  7.28k|  return bs;
 1686|  7.28k|}
reconintra.c:has_top_right:
  199|   237k|                         int col_off, int ss_x, int ss_y) {
  200|   237k|  if (!top_available || !right_available) return 0;
  ------------------
  |  Branch (200:7): [True: 9.80k, False: 228k]
  |  Branch (200:25): [True: 10.9k, False: 217k]
  ------------------
  201|       |
  202|   217k|  const int bw_unit = mi_size_wide[bsize];
  203|   217k|  const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1);
  ------------------
  |  |   35|   217k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 193k, False: 23.6k]
  |  |  ------------------
  ------------------
  204|   217k|  const int top_right_count_unit = tx_size_wide_unit[txsz];
  205|       |
  206|   217k|  if (row_off > 0) {  // Just need to check if enough pixels on the right.
  ------------------
  |  Branch (206:7): [True: 78.1k, False: 138k]
  ------------------
  207|  78.1k|    if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64]) {
  ------------------
  |  Branch (207:9): [True: 12.2k, False: 65.9k]
  ------------------
  208|       |      // Special case: For 128x128 blocks, the transform unit whose
  209|       |      // top-right corner is at the center of the block does in fact have
  210|       |      // pixels available at its top-right corner.
  211|  12.2k|      if (row_off == mi_size_high[BLOCK_64X64] >> ss_y &&
  ------------------
  |  Branch (211:11): [True: 1.05k, False: 11.2k]
  ------------------
  212|  1.05k|          col_off + top_right_count_unit == mi_size_wide[BLOCK_64X64] >> ss_x) {
  ------------------
  |  Branch (212:11): [True: 174, False: 879]
  ------------------
  213|    174|        return 1;
  214|    174|      }
  215|  12.0k|      const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x;
  216|  12.0k|      const int col_off_64 = col_off % plane_bw_unit_64;
  217|  12.0k|      return col_off_64 + top_right_count_unit < plane_bw_unit_64;
  218|  12.2k|    }
  219|  65.9k|    return col_off + top_right_count_unit < plane_bw_unit;
  220|   138k|  } else {
  221|       |    // All top-right pixels are in the block above, which is already available.
  222|   138k|    if (col_off + top_right_count_unit < plane_bw_unit) return 1;
  ------------------
  |  Branch (222:9): [True: 23.1k, False: 115k]
  ------------------
  223|       |
  224|   115k|    const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
  225|   115k|    const int bh_in_mi_log2 = mi_size_high_log2[bsize];
  226|   115k|    const int sb_mi_size = mi_size_high[sb_size];
  227|   115k|    const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
  228|   115k|    const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
  229|       |
  230|       |    // Top row of superblock: so top-right pixels are in the top and/or
  231|       |    // top-right superblocks, both of which are already available.
  232|   115k|    if (blk_row_in_sb == 0) return 1;
  ------------------
  |  Branch (232:9): [True: 14.7k, False: 101k]
  ------------------
  233|       |
  234|       |    // Rightmost column of superblock (and not the top row): so top-right pixels
  235|       |    // fall in the right superblock, which is not available yet.
  236|   101k|    if (((blk_col_in_sb + 1) << bw_in_mi_log2) >= sb_mi_size) {
  ------------------
  |  Branch (236:9): [True: 18.4k, False: 82.5k]
  ------------------
  237|  18.4k|      return 0;
  238|  18.4k|    }
  239|       |
  240|       |    // General case (neither top row nor rightmost column): check if the
  241|       |    // top-right block is coded before the current block.
  242|  82.5k|    const int this_blk_index =
  243|  82.5k|        ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
  ------------------
  |  |   43|  82.5k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  82.5k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  82.5k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  244|  82.5k|        blk_col_in_sb + 0;
  245|  82.5k|    const int idx1 = this_blk_index / 8;
  246|  82.5k|    const int idx2 = this_blk_index % 8;
  247|  82.5k|    const uint8_t *has_tr_table = get_has_tr_table(partition, bsize);
  248|  82.5k|    return (has_tr_table[idx1] >> idx2) & 1;
  249|   101k|  }
  250|   217k|}
reconintra.c:get_has_tr_table:
  183|  82.5k|                                       BLOCK_SIZE bsize) {
  184|  82.5k|  const uint8_t *ret = NULL;
  185|       |  // If this is a mixed vertical partition, look up bsize in orders_vert.
  186|  82.5k|  if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) {
  ------------------
  |  Branch (186:7): [True: 3.14k, False: 79.4k]
  |  Branch (186:40): [True: 2.84k, False: 76.6k]
  ------------------
  187|  5.98k|    assert(bsize < BLOCK_SIZES);
  188|  5.98k|    ret = has_tr_vert_tables[bsize];
  189|  76.6k|  } else {
  190|  76.6k|    ret = has_tr_tables[bsize];
  191|  76.6k|  }
  192|       |  assert(ret);
  193|  82.5k|  return ret;
  194|  82.5k|}
reconintra.c:has_bottom_left:
  384|   283k|                           int col_off, int ss_x, int ss_y) {
  385|   283k|  if (!bottom_available || !left_available) return 0;
  ------------------
  |  Branch (385:7): [True: 9.83k, False: 273k]
  |  Branch (385:28): [True: 8.40k, False: 265k]
  ------------------
  386|       |
  387|       |  // Special case for 128x* blocks, when col_off is half the block width.
  388|       |  // This is needed because 128x* superblocks are divided into 64x* blocks in
  389|       |  // raster order
  390|   265k|  if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64] && col_off > 0) {
  ------------------
  |  Branch (390:7): [True: 10.7k, False: 254k]
  |  Branch (390:64): [True: 10.3k, False: 408]
  ------------------
  391|  10.3k|    const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x;
  392|  10.3k|    const int col_off_64 = col_off % plane_bw_unit_64;
  393|  10.3k|    if (col_off_64 == 0) {
  ------------------
  |  Branch (393:9): [True: 890, False: 9.44k]
  ------------------
  394|       |      // We are at the left edge of top-right or bottom-right 64x* block.
  395|    890|      const int plane_bh_unit_64 = mi_size_high[BLOCK_64X64] >> ss_y;
  396|    890|      const int row_off_64 = row_off % plane_bh_unit_64;
  397|    890|      const int plane_bh_unit =
  398|    890|          AOMMIN(mi_size_high[bsize] >> ss_y, plane_bh_unit_64);
  ------------------
  |  |   34|    890|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 890]
  |  |  ------------------
  ------------------
  399|       |      // Check if all bottom-left pixels are in the left 64x* block (which is
  400|       |      // already coded).
  401|    890|      return row_off_64 + tx_size_high_unit[txsz] < plane_bh_unit;
  402|    890|    }
  403|  10.3k|  }
  404|       |
  405|   264k|  if (col_off > 0) {
  ------------------
  |  Branch (405:7): [True: 90.3k, False: 173k]
  ------------------
  406|       |    // Bottom-left pixels are in the bottom-left block, which is not available.
  407|  90.3k|    return 0;
  408|   173k|  } else {
  409|   173k|    const int bh_unit = mi_size_high[bsize];
  410|   173k|    const int plane_bh_unit = AOMMAX(bh_unit >> ss_y, 1);
  ------------------
  |  |   35|   173k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 137k, False: 36.1k]
  |  |  ------------------
  ------------------
  411|   173k|    const int bottom_left_count_unit = tx_size_high_unit[txsz];
  412|       |
  413|       |    // All bottom-left pixels are in the left block, which is already available.
  414|   173k|    if (row_off + bottom_left_count_unit < plane_bh_unit) return 1;
  ------------------
  |  Branch (414:9): [True: 23.6k, False: 150k]
  ------------------
  415|       |
  416|   150k|    const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
  417|   150k|    const int bh_in_mi_log2 = mi_size_high_log2[bsize];
  418|   150k|    const int sb_mi_size = mi_size_high[sb_size];
  419|   150k|    const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
  420|   150k|    const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
  421|       |
  422|       |    // Leftmost column of superblock: so bottom-left pixels maybe in the left
  423|       |    // and/or bottom-left superblocks. But only the left superblock is
  424|       |    // available, so check if all required pixels fall in that superblock.
  425|   150k|    if (blk_col_in_sb == 0) {
  ------------------
  |  Branch (425:9): [True: 27.8k, False: 122k]
  ------------------
  426|  27.8k|      const int blk_start_row_off =
  427|  27.8k|          blk_row_in_sb << (bh_in_mi_log2 + MI_SIZE_LOG2 - MI_SIZE_LOG2) >>
  ------------------
  |  |   39|  27.8k|#define MI_SIZE_LOG2 2
  ------------------
                        blk_row_in_sb << (bh_in_mi_log2 + MI_SIZE_LOG2 - MI_SIZE_LOG2) >>
  ------------------
  |  |   39|  27.8k|#define MI_SIZE_LOG2 2
  ------------------
  428|  27.8k|          ss_y;
  429|  27.8k|      const int row_off_in_sb = blk_start_row_off + row_off;
  430|  27.8k|      const int sb_height_unit = sb_mi_size >> ss_y;
  431|  27.8k|      return row_off_in_sb + bottom_left_count_unit < sb_height_unit;
  432|  27.8k|    }
  433|       |
  434|       |    // Bottom row of superblock (and not the leftmost column): so bottom-left
  435|       |    // pixels fall in the bottom superblock, which is not available yet.
  436|   122k|    if (((blk_row_in_sb + 1) << bh_in_mi_log2) >= sb_mi_size) return 0;
  ------------------
  |  Branch (436:9): [True: 19.1k, False: 103k]
  ------------------
  437|       |
  438|       |    // General case (neither leftmost column nor bottom row): check if the
  439|       |    // bottom-left block is coded before the current block.
  440|   103k|    const int this_blk_index =
  441|   103k|        ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
  ------------------
  |  |   43|   103k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   103k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   103k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  442|   103k|        blk_col_in_sb + 0;
  443|   103k|    const int idx1 = this_blk_index / 8;
  444|   103k|    const int idx2 = this_blk_index % 8;
  445|   103k|    const uint8_t *has_bl_table = get_has_bl_table(partition, bsize);
  446|   103k|    return (has_bl_table[idx1] >> idx2) & 1;
  447|   122k|  }
  448|   264k|}
reconintra.c:get_has_bl_table:
  368|   103k|                                       BLOCK_SIZE bsize) {
  369|   103k|  const uint8_t *ret = NULL;
  370|       |  // If this is a mixed vertical partition, look up bsize in orders_vert.
  371|   103k|  if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) {
  ------------------
  |  Branch (371:7): [True: 3.39k, False: 99.8k]
  |  Branch (371:40): [True: 4.13k, False: 95.7k]
  ------------------
  372|  7.53k|    assert(bsize < BLOCK_SIZES);
  373|  7.53k|    ret = has_bl_vert_tables[bsize];
  374|  95.7k|  } else {
  375|  95.7k|    ret = has_bl_tables[bsize];
  376|  95.7k|  }
  377|       |  assert(ret);
  378|   103k|  return ret;
  379|   103k|}
reconintra.c:get_intra_edge_filter_type:
  974|  1.68M|static int get_intra_edge_filter_type(const MACROBLOCKD *xd, int plane) {
  975|  1.68M|  const MB_MODE_INFO *above;
  976|  1.68M|  const MB_MODE_INFO *left;
  977|       |
  978|  1.68M|  if (plane == 0) {
  ------------------
  |  Branch (978:7): [True: 1.18M, False: 496k]
  ------------------
  979|  1.18M|    above = xd->above_mbmi;
  980|  1.18M|    left = xd->left_mbmi;
  981|  1.18M|  } else {
  982|   496k|    above = xd->chroma_above_mbmi;
  983|   496k|    left = xd->chroma_left_mbmi;
  984|   496k|  }
  985|       |
  986|  1.68M|  return (above && is_smooth(above, plane)) || (left && is_smooth(left, plane));
  ------------------
  |  Branch (986:11): [True: 1.48M, False: 191k]
  |  Branch (986:20): [True: 250k, False: 1.23M]
  |  Branch (986:49): [True: 1.29M, False: 134k]
  |  Branch (986:57): [True: 215k, False: 1.07M]
  ------------------
  987|  1.68M|}
reconintra.c:is_smooth:
  958|  2.78M|static int is_smooth(const MB_MODE_INFO *mbmi, int plane) {
  959|  2.78M|  if (plane == 0) {
  ------------------
  |  Branch (959:7): [True: 1.98M, False: 796k]
  ------------------
  960|  1.98M|    const PREDICTION_MODE mode = mbmi->mode;
  961|  1.98M|    return (mode == SMOOTH_PRED || mode == SMOOTH_V_PRED ||
  ------------------
  |  Branch (961:13): [True: 204k, False: 1.78M]
  |  Branch (961:36): [True: 53.2k, False: 1.73M]
  ------------------
  962|  1.73M|            mode == SMOOTH_H_PRED);
  ------------------
  |  Branch (962:13): [True: 94.7k, False: 1.63M]
  ------------------
  963|  1.98M|  } else {
  964|       |    // uv_mode is not set for inter blocks, so need to explicitly
  965|       |    // detect that case.
  966|   796k|    if (is_inter_block(mbmi)) return 0;
  ------------------
  |  Branch (966:9): [True: 8.05k, False: 788k]
  ------------------
  967|       |
  968|   788k|    const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
  969|   788k|    return (uv_mode == UV_SMOOTH_PRED || uv_mode == UV_SMOOTH_V_PRED ||
  ------------------
  |  Branch (969:13): [True: 60.8k, False: 727k]
  |  Branch (969:42): [True: 23.9k, False: 703k]
  ------------------
  970|   703k|            uv_mode == UV_SMOOTH_H_PRED);
  ------------------
  |  Branch (970:13): [True: 29.2k, False: 674k]
  ------------------
  971|   796k|  }
  972|  2.78M|}
reconintra.c:highbd_build_directional_and_filter_intra_predictors:
 1389|   783k|    int bit_depth) {
 1390|   783k|  int i;
 1391|   783k|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  ------------------
  |  |   75|   783k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1392|   783k|  const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
  ------------------
  |  |   75|   783k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1393|   783k|  DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
  ------------------
  |  |   19|   783k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1394|   783k|  DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
  ------------------
  |  |   19|   783k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1395|   783k|  uint16_t *const above_row = above_data + 16;
 1396|   783k|  uint16_t *const left_col = left_data + 16;
 1397|   783k|  const int txwpx = tx_size_wide[tx_size];
 1398|   783k|  const int txhpx = tx_size_high[tx_size];
 1399|   783k|  int need_left = extend_modes[mode] & NEED_LEFT;
 1400|   783k|  int need_above = extend_modes[mode] & NEED_ABOVE;
 1401|   783k|  int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
 1402|   783k|  const uint16_t *above_ref = ref - ref_stride;
 1403|   783k|  const uint16_t *left_ref = ref - 1;
 1404|   783k|  const int is_dr_mode = av1_is_directional_mode(mode);
 1405|   783k|  const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
 1406|   783k|  assert(use_filter_intra || is_dr_mode);
 1407|   783k|  const int base = 128 << (bit_depth - 8);
 1408|       |  // The left_data, above_data buffers must be zeroed to fix some intermittent
 1409|       |  // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
 1410|       |  // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are
 1411|       |  // seen to be the potential reason for this issue.
 1412|   783k|  aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS);
  ------------------
  |  |   38|   783k|#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
  |  |  ------------------
  |  |  |  |  183|   783k|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  182|   783k|#define MAX_TX_SIZE_LOG2 (6)
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1413|   783k|  aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS);
  ------------------
  |  |   38|   783k|#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
  |  |  ------------------
  |  |  |  |  183|   783k|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  182|   783k|#define MAX_TX_SIZE_LOG2 (6)
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1414|       |
 1415|       |  // The default values if ref pixels are not available:
 1416|       |  // base   base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
 1417|       |  // base+1   A      B  ..     Y      Z
 1418|       |  // base+1   C      D  ..     W      X
 1419|       |  // base+1   E      F  ..     U      V
 1420|       |  // base+1   G      H  ..     S      T      T      T      T      T
 1421|       |
 1422|   783k|  if (is_dr_mode) {
  ------------------
  |  Branch (1422:7): [True: 594k, False: 188k]
  ------------------
 1423|   594k|    if (p_angle <= 90)
  ------------------
  |  Branch (1423:9): [True: 171k, False: 422k]
  ------------------
 1424|   171k|      need_above = 1, need_left = 0, need_above_left = 1;
 1425|   422k|    else if (p_angle < 180)
  ------------------
  |  Branch (1425:14): [True: 183k, False: 238k]
  ------------------
 1426|   183k|      need_above = 1, need_left = 1, need_above_left = 1;
 1427|   238k|    else
 1428|   238k|      need_above = 0, need_left = 1, need_above_left = 1;
 1429|   594k|  }
 1430|   783k|  if (use_filter_intra) need_left = need_above = need_above_left = 1;
  ------------------
  |  Branch (1430:7): [True: 188k, False: 594k]
  ------------------
 1431|       |
 1432|   783k|  assert(n_top_px >= 0);
 1433|   783k|  assert(n_topright_px >= -1);
 1434|   783k|  assert(n_left_px >= 0);
 1435|   783k|  assert(n_bottomleft_px >= -1);
 1436|       |
 1437|   783k|  if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
  ------------------
  |  Branch (1437:8): [True: 238k, False: 544k]
  |  Branch (1437:23): [True: 9.11k, False: 229k]
  |  Branch (1437:43): [True: 171k, False: 602k]
  |  Branch (1437:57): [True: 7.16k, False: 164k]
  ------------------
 1438|  16.2k|    int val;
 1439|  16.2k|    if (need_left) {
  ------------------
  |  Branch (1439:9): [True: 9.11k, False: 7.16k]
  ------------------
 1440|  9.11k|      val = (n_top_px > 0) ? above_ref[0] : base + 1;
  ------------------
  |  Branch (1440:13): [True: 7.99k, False: 1.12k]
  ------------------
 1441|  9.11k|    } else {
 1442|  7.16k|      val = (n_left_px > 0) ? left_ref[0] : base - 1;
  ------------------
  |  Branch (1442:13): [True: 6.73k, False: 428]
  ------------------
 1443|  7.16k|    }
 1444|   283k|    for (i = 0; i < txhpx; ++i) {
  ------------------
  |  Branch (1444:17): [True: 267k, False: 16.2k]
  ------------------
 1445|   267k|      aom_memset16(dst, val, txwpx);
 1446|   267k|      dst += dst_stride;
 1447|   267k|    }
 1448|  16.2k|    return;
 1449|  16.2k|  }
 1450|       |
 1451|       |  // NEED_LEFT
 1452|   766k|  if (need_left) {
  ------------------
  |  Branch (1452:7): [True: 602k, False: 164k]
  ------------------
 1453|   602k|    const int num_left_pixels_needed =
 1454|   602k|        txhpx + (n_bottomleft_px >= 0 ? txwpx : 0);
  ------------------
  |  Branch (1454:18): [True: 124k, False: 477k]
  ------------------
 1455|   602k|    i = 0;
 1456|   602k|    if (n_left_px > 0) {
  ------------------
  |  Branch (1456:9): [True: 588k, False: 13.9k]
  ------------------
 1457|  5.60M|      for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
  ------------------
  |  Branch (1457:14): [True: 5.01M, False: 588k]
  ------------------
 1458|   588k|      if (n_bottomleft_px > 0) {
  ------------------
  |  Branch (1458:11): [True: 42.9k, False: 545k]
  ------------------
 1459|  42.9k|        assert(i == txhpx);
 1460|   394k|        for (; i < txhpx + n_bottomleft_px; i++)
  ------------------
  |  Branch (1460:16): [True: 352k, False: 42.9k]
  ------------------
 1461|   352k|          left_col[i] = left_ref[i * ref_stride];
 1462|  42.9k|      }
 1463|   588k|      if (i < num_left_pixels_needed)
  ------------------
  |  Branch (1463:11): [True: 100k, False: 488k]
  ------------------
 1464|   100k|        aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
 1465|   588k|    } else if (n_top_px > 0) {
  ------------------
  |  Branch (1465:16): [True: 13.1k, False: 811]
  ------------------
 1466|  13.1k|      aom_memset16(left_col, above_ref[0], num_left_pixels_needed);
 1467|  13.1k|    }
 1468|   602k|  }
 1469|       |
 1470|       |  // NEED_ABOVE
 1471|   766k|  if (need_above) {
  ------------------
  |  Branch (1471:7): [True: 537k, False: 229k]
  ------------------
 1472|   537k|    const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0);
  ------------------
  |  Branch (1472:48): [True: 101k, False: 435k]
  ------------------
 1473|   537k|    if (n_top_px > 0) {
  ------------------
  |  Branch (1473:9): [True: 518k, False: 18.2k]
  ------------------
 1474|   518k|      memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
 1475|   518k|      i = n_top_px;
 1476|   518k|      if (n_topright_px > 0) {
  ------------------
  |  Branch (1476:11): [True: 61.0k, False: 457k]
  ------------------
 1477|  61.0k|        assert(n_top_px == txwpx);
 1478|  61.0k|        memcpy(above_row + txwpx, above_ref + txwpx,
 1479|  61.0k|               n_topright_px * sizeof(above_ref[0]));
 1480|  61.0k|        i += n_topright_px;
 1481|  61.0k|      }
 1482|   518k|      if (i < num_top_pixels_needed)
  ------------------
  |  Branch (1482:11): [True: 51.8k, False: 467k]
  ------------------
 1483|  51.8k|        aom_memset16(&above_row[i], above_row[i - 1],
 1484|  51.8k|                     num_top_pixels_needed - i);
 1485|   518k|    } else if (n_left_px > 0) {
  ------------------
  |  Branch (1485:16): [True: 17.4k, False: 810]
  ------------------
 1486|  17.4k|      aom_memset16(above_row, left_ref[0], num_top_pixels_needed);
 1487|  17.4k|    }
 1488|   537k|  }
 1489|       |
 1490|   766k|  if (need_above_left) {
  ------------------
  |  Branch (1490:7): [True: 766k, False: 16]
  ------------------
 1491|   766k|    if (n_top_px > 0 && n_left_px > 0) {
  ------------------
  |  Branch (1491:9): [True: 737k, False: 28.9k]
  |  Branch (1491:25): [True: 716k, False: 21.8k]
  ------------------
 1492|   716k|      above_row[-1] = above_ref[-1];
 1493|   716k|    } else if (n_top_px > 0) {
  ------------------
  |  Branch (1493:16): [True: 21.8k, False: 28.9k]
  ------------------
 1494|  21.8k|      above_row[-1] = above_ref[0];
 1495|  28.9k|    } else if (n_left_px > 0) {
  ------------------
  |  Branch (1495:16): [True: 28.1k, False: 806]
  ------------------
 1496|  28.1k|      above_row[-1] = left_ref[0];
 1497|  28.1k|    } else {
 1498|    806|      above_row[-1] = base;
 1499|    806|    }
 1500|   766k|    left_col[-1] = above_row[-1];
 1501|   766k|  }
 1502|       |
 1503|   766k|  if (use_filter_intra) {
  ------------------
  |  Branch (1503:7): [True: 188k, False: 577k]
  ------------------
 1504|   188k|    highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
 1505|   188k|                                  filter_intra_mode, bit_depth);
 1506|   188k|    return;
 1507|   188k|  }
 1508|       |
 1509|   766k|  assert(is_dr_mode);
 1510|   577k|  int upsample_above = 0;
 1511|   577k|  int upsample_left = 0;
 1512|   577k|  if (!disable_edge_filter) {
  ------------------
  |  Branch (1512:7): [True: 490k, False: 87.2k]
  ------------------
 1513|   490k|    const int need_right = p_angle < 90;
 1514|   490k|    const int need_bottom = p_angle > 180;
 1515|   490k|    if (p_angle != 90 && p_angle != 180) {
  ------------------
  |  Branch (1515:9): [True: 437k, False: 53.1k]
  |  Branch (1515:26): [True: 346k, False: 91.1k]
  ------------------
 1516|   346k|      assert(need_above_left);
 1517|   346k|      const int ab_le = 1;
 1518|   346k|      if (need_above && need_left && (txwpx + txhpx >= 24)) {
  ------------------
  |  Branch (1518:11): [True: 243k, False: 103k]
  |  Branch (1518:25): [True: 156k, False: 86.5k]
  |  Branch (1518:38): [True: 43.1k, False: 113k]
  ------------------
 1519|  43.1k|        highbd_filter_intra_edge_corner(above_row, left_col);
 1520|  43.1k|      }
 1521|   346k|      if (need_above && n_top_px > 0) {
  ------------------
  |  Branch (1521:11): [True: 243k, False: 103k]
  |  Branch (1521:25): [True: 235k, False: 8.00k]
  ------------------
 1522|   235k|        const int strength = intra_edge_filter_strength(
 1523|   235k|            txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
 1524|   235k|        const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
  ------------------
  |  Branch (1524:46): [True: 86.5k, False: 148k]
  ------------------
 1525|   235k|        av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength);
 1526|   235k|      }
 1527|   346k|      if (need_left && n_left_px > 0) {
  ------------------
  |  Branch (1527:11): [True: 259k, False: 86.5k]
  |  Branch (1527:24): [True: 252k, False: 7.25k]
  ------------------
 1528|   252k|        const int strength = intra_edge_filter_strength(
 1529|   252k|            txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
 1530|   252k|        const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
  ------------------
  |  Branch (1530:47): [True: 103k, False: 149k]
  ------------------
 1531|   252k|        av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength);
 1532|   252k|      }
 1533|   346k|    }
 1534|   490k|    upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
 1535|   490k|                                                 intra_edge_filter_type);
 1536|   490k|    if (need_above && upsample_above) {
  ------------------
  |  Branch (1536:9): [True: 296k, False: 194k]
  |  Branch (1536:23): [True: 73.0k, False: 223k]
  ------------------
 1537|  73.0k|      const int n_px = txwpx + (need_right ? txhpx : 0);
  ------------------
  |  Branch (1537:33): [True: 36.7k, False: 36.2k]
  ------------------
 1538|  73.0k|      av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth);
 1539|  73.0k|    }
 1540|   490k|    upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
 1541|   490k|                                                intra_edge_filter_type);
 1542|   490k|    if (need_left && upsample_left) {
  ------------------
  |  Branch (1542:9): [True: 351k, False: 139k]
  |  Branch (1542:22): [True: 98.9k, False: 252k]
  ------------------
 1543|  98.9k|      const int n_px = txhpx + (need_bottom ? txwpx : 0);
  ------------------
  |  Branch (1543:33): [True: 56.5k, False: 42.3k]
  ------------------
 1544|  98.9k|      av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth);
 1545|  98.9k|    }
 1546|   490k|  }
 1547|   577k|  highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
 1548|   577k|                      upsample_above, upsample_left, p_angle, bit_depth);
 1549|   577k|}
reconintra.c:highbd_filter_intra_predictor:
  912|   188k|                                          int bd) {
  913|   188k|  int r, c;
  914|   188k|  uint16_t buffer[33][33];
  915|   188k|  const int bw = tx_size_wide[tx_size];
  916|   188k|  const int bh = tx_size_high[tx_size];
  917|       |
  918|   188k|  assert(bw <= 32 && bh <= 32);
  919|       |
  920|  1.62M|  for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
  ------------------
  |  Branch (920:15): [True: 1.43M, False: 188k]
  ------------------
  921|   188k|  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0]));
  922|       |
  923|   906k|  for (r = 1; r < bh + 1; r += 2)
  ------------------
  |  Branch (923:15): [True: 717k, False: 188k]
  ------------------
  924|  2.58M|    for (c = 1; c < bw + 1; c += 4) {
  ------------------
  |  Branch (924:17): [True: 1.86M, False: 717k]
  ------------------
  925|  1.86M|      const uint16_t p0 = buffer[r - 1][c - 1];
  926|  1.86M|      const uint16_t p1 = buffer[r - 1][c];
  927|  1.86M|      const uint16_t p2 = buffer[r - 1][c + 1];
  928|  1.86M|      const uint16_t p3 = buffer[r - 1][c + 2];
  929|  1.86M|      const uint16_t p4 = buffer[r - 1][c + 3];
  930|  1.86M|      const uint16_t p5 = buffer[r][c - 1];
  931|  1.86M|      const uint16_t p6 = buffer[r + 1][c - 1];
  932|  16.8M|      for (int k = 0; k < 8; ++k) {
  ------------------
  |  Branch (932:23): [True: 14.9M, False: 1.86M]
  ------------------
  933|  14.9M|        int r_offset = k >> 2;
  934|  14.9M|        int c_offset = k & 0x03;
  935|  14.9M|        int pr = av1_filter_intra_taps[mode][k][0] * p0 +
  936|  14.9M|                 av1_filter_intra_taps[mode][k][1] * p1 +
  937|  14.9M|                 av1_filter_intra_taps[mode][k][2] * p2 +
  938|  14.9M|                 av1_filter_intra_taps[mode][k][3] * p3 +
  939|  14.9M|                 av1_filter_intra_taps[mode][k][4] * p4 +
  940|  14.9M|                 av1_filter_intra_taps[mode][k][5] * p5 +
  941|  14.9M|                 av1_filter_intra_taps[mode][k][6] * p6;
  942|       |        // Section 7.11.2.3 specifies the right-hand side of the assignment as
  943|       |        //   Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ).
  944|       |        // Since Clip1() clips a negative value to 0, it is safe to replace
  945|       |        // Round2Signed() with Round2().
  946|  14.9M|        buffer[r + r_offset][c + c_offset] = clip_pixel_highbd(
  947|  14.9M|            ROUND_POWER_OF_TWO(pr, FILTER_INTRA_SCALE_BITS), bd);
  ------------------
  |  |   41|  14.9M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  948|  14.9M|      }
  949|  1.86M|    }
  950|       |
  951|  1.62M|  for (r = 0; r < bh; ++r) {
  ------------------
  |  Branch (951:15): [True: 1.43M, False: 188k]
  ------------------
  952|  1.43M|    memcpy(dst, &buffer[r + 1][1], bw * sizeof(dst[0]));
  953|  1.43M|    dst += stride;
  954|  1.43M|  }
  955|   188k|}
reconintra.c:highbd_filter_intra_edge_corner:
 1350|  43.1k|                                            uint16_t *p_left) {
 1351|  43.1k|  const int kernel[3] = { 5, 6, 5 };
 1352|       |
 1353|  43.1k|  int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) +
 1354|  43.1k|          (p_above[0] * kernel[2]);
 1355|  43.1k|  s = (s + 8) >> 4;
 1356|  43.1k|  p_above[-1] = s;
 1357|  43.1k|  p_left[-1] = s;
 1358|  43.1k|}
reconintra.c:intra_edge_filter_strength:
  989|  1.14M|static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) {
  990|  1.14M|  const int d = abs(delta);
  991|  1.14M|  int strength = 0;
  992|       |
  993|  1.14M|  const int blk_wh = bs0 + bs1;
  994|  1.14M|  if (type == 0) {
  ------------------
  |  Branch (994:7): [True: 841k, False: 305k]
  ------------------
  995|   841k|    if (blk_wh <= 8) {
  ------------------
  |  Branch (995:9): [True: 306k, False: 535k]
  ------------------
  996|   306k|      if (d >= 56) strength = 1;
  ------------------
  |  Branch (996:11): [True: 68.0k, False: 238k]
  ------------------
  997|   535k|    } else if (blk_wh <= 12) {
  ------------------
  |  Branch (997:16): [True: 65.4k, False: 469k]
  ------------------
  998|  65.4k|      if (d >= 40) strength = 1;
  ------------------
  |  Branch (998:11): [True: 30.3k, False: 35.1k]
  ------------------
  999|   469k|    } else if (blk_wh <= 16) {
  ------------------
  |  Branch (999:16): [True: 186k, False: 282k]
  ------------------
 1000|   186k|      if (d >= 40) strength = 1;
  ------------------
  |  Branch (1000:11): [True: 68.6k, False: 118k]
  ------------------
 1001|   282k|    } else if (blk_wh <= 24) {
  ------------------
  |  Branch (1001:16): [True: 145k, False: 137k]
  ------------------
 1002|   145k|      if (d >= 8) strength = 1;
  ------------------
  |  Branch (1002:11): [True: 117k, False: 27.5k]
  ------------------
 1003|   145k|      if (d >= 16) strength = 2;
  ------------------
  |  Branch (1003:11): [True: 104k, False: 40.1k]
  ------------------
 1004|   145k|      if (d >= 32) strength = 3;
  ------------------
  |  Branch (1004:11): [True: 67.3k, False: 77.7k]
  ------------------
 1005|   145k|    } else if (blk_wh <= 32) {
  ------------------
  |  Branch (1005:16): [True: 65.5k, False: 72.1k]
  ------------------
 1006|  65.5k|      if (d >= 1) strength = 1;
  ------------------
  |  Branch (1006:11): [True: 65.5k, False: 0]
  ------------------
 1007|  65.5k|      if (d >= 4) strength = 2;
  ------------------
  |  Branch (1007:11): [True: 58.4k, False: 7.12k]
  ------------------
 1008|  65.5k|      if (d >= 32) strength = 3;
  ------------------
  |  Branch (1008:11): [True: 30.7k, False: 34.7k]
  ------------------
 1009|  72.1k|    } else {
 1010|  72.1k|      if (d >= 1) strength = 3;
  ------------------
  |  Branch (1010:11): [True: 72.1k, False: 18.4E]
  ------------------
 1011|  72.1k|    }
 1012|   841k|  } else {
 1013|   305k|    if (blk_wh <= 8) {
  ------------------
  |  Branch (1013:9): [True: 93.5k, False: 211k]
  ------------------
 1014|  93.5k|      if (d >= 40) strength = 1;
  ------------------
  |  Branch (1014:11): [True: 30.6k, False: 62.9k]
  ------------------
 1015|  93.5k|      if (d >= 64) strength = 2;
  ------------------
  |  Branch (1015:11): [True: 18.6k, False: 74.9k]
  ------------------
 1016|   211k|    } else if (blk_wh <= 16) {
  ------------------
  |  Branch (1016:16): [True: 100k, False: 111k]
  ------------------
 1017|   100k|      if (d >= 20) strength = 1;
  ------------------
  |  Branch (1017:11): [True: 67.1k, False: 33.1k]
  ------------------
 1018|   100k|      if (d >= 48) strength = 2;
  ------------------
  |  Branch (1018:11): [True: 30.0k, False: 70.2k]
  ------------------
 1019|   111k|    } else if (blk_wh <= 24) {
  ------------------
  |  Branch (1019:16): [True: 60.5k, False: 50.9k]
  ------------------
 1020|  60.5k|      if (d >= 4) strength = 3;
  ------------------
  |  Branch (1020:11): [True: 53.4k, False: 7.12k]
  ------------------
 1021|  60.5k|    } else {
 1022|  50.9k|      if (d >= 1) strength = 3;
  ------------------
  |  Branch (1022:11): [True: 50.9k, False: 18.4E]
  ------------------
 1023|  50.9k|    }
 1024|   305k|  }
 1025|  1.14M|  return strength;
 1026|  1.14M|}
reconintra.c:highbd_dr_predictor:
  782|   577k|                                int upsample_left, int angle, int bd) {
  783|   577k|  const int dx = av1_get_dx(angle);
  784|   577k|  const int dy = av1_get_dy(angle);
  785|   577k|  const int bw = tx_size_wide[tx_size];
  786|   577k|  const int bh = tx_size_high[tx_size];
  787|   577k|  assert(angle > 0 && angle < 270);
  788|       |
  789|   577k|  if (angle > 0 && angle < 90) {
  ------------------
  |  Branch (789:7): [True: 577k, False: 1]
  |  Branch (789:20): [True: 101k, False: 476k]
  ------------------
  790|   101k|    av1_highbd_dr_prediction_z1(dst, stride, bw, bh, above, left,
  791|   101k|                                upsample_above, dx, dy, bd);
  792|   476k|  } else if (angle > 90 && angle < 180) {
  ------------------
  |  Branch (792:14): [True: 413k, False: 62.8k]
  |  Branch (792:28): [True: 183k, False: 229k]
  ------------------
  793|   183k|    av1_highbd_dr_prediction_z2(dst, stride, bw, bh, above, left,
  794|   183k|                                upsample_above, upsample_left, dx, dy, bd);
  795|   292k|  } else if (angle > 180 && angle < 270) {
  ------------------
  |  Branch (795:14): [True: 124k, False: 167k]
  |  Branch (795:29): [True: 124k, False: 0]
  ------------------
  796|   124k|    av1_highbd_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left,
  797|   124k|                                dx, dy, bd);
  798|   167k|  } else if (angle == 90) {
  ------------------
  |  Branch (798:14): [True: 62.8k, False: 104k]
  ------------------
  799|  62.8k|    pred_high[V_PRED][tx_size](dst, stride, above, left, bd);
  800|   104k|  } else if (angle == 180) {
  ------------------
  |  Branch (800:14): [True: 104k, False: 18.4E]
  ------------------
  801|   104k|    pred_high[H_PRED][tx_size](dst, stride, above, left, bd);
  802|   104k|  }
  803|   577k|}
reconintra.c:build_directional_and_filter_intra_predictors:
 1088|   898k|    int n_left_px, int n_bottomleft_px, int intra_edge_filter_type) {
 1089|   898k|  int i;
 1090|   898k|  const uint8_t *above_ref = ref - ref_stride;
 1091|   898k|  const uint8_t *left_ref = ref - 1;
 1092|   898k|  DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
  ------------------
  |  |   19|   898k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1093|   898k|  DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
  ------------------
  |  |   19|   898k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1094|   898k|  uint8_t *const above_row = above_data + 16;
 1095|   898k|  uint8_t *const left_col = left_data + 16;
 1096|   898k|  const int txwpx = tx_size_wide[tx_size];
 1097|   898k|  const int txhpx = tx_size_high[tx_size];
 1098|   898k|  int need_left = extend_modes[mode] & NEED_LEFT;
 1099|   898k|  int need_above = extend_modes[mode] & NEED_ABOVE;
 1100|   898k|  int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
 1101|   898k|  const int is_dr_mode = av1_is_directional_mode(mode);
 1102|   898k|  const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
 1103|   898k|  assert(use_filter_intra || is_dr_mode);
 1104|       |  // The left_data, above_data buffers must be zeroed to fix some intermittent
 1105|       |  // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
 1106|       |  // path in av1_dr_prediction_z1_avx2()) from left_data, above_data are seen to
 1107|       |  // be the potential reason for this issue.
 1108|   898k|  memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS);
  ------------------
  |  |   38|   898k|#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
  |  |  ------------------
  |  |  |  |  183|   898k|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  182|   898k|#define MAX_TX_SIZE_LOG2 (6)
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1109|   898k|  memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS);
  ------------------
  |  |   38|   898k|#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
  |  |  ------------------
  |  |  |  |  183|   898k|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  182|   898k|#define MAX_TX_SIZE_LOG2 (6)
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1110|       |
 1111|       |  // The default values if ref pixels are not available:
 1112|       |  // 128 127 127 .. 127 127 127 127 127 127
 1113|       |  // 129  A   B  ..  Y   Z
 1114|       |  // 129  C   D  ..  W   X
 1115|       |  // 129  E   F  ..  U   V
 1116|       |  // 129  G   H  ..  S   T   T   T   T   T
 1117|       |  // ..
 1118|       |
 1119|   898k|  if (is_dr_mode) {
  ------------------
  |  Branch (1119:7): [True: 702k, False: 195k]
  ------------------
 1120|   702k|    if (p_angle <= 90)
  ------------------
  |  Branch (1120:9): [True: 216k, False: 486k]
  ------------------
 1121|   216k|      need_above = 1, need_left = 0, need_above_left = 1;
 1122|   486k|    else if (p_angle < 180)
  ------------------
  |  Branch (1122:14): [True: 205k, False: 280k]
  ------------------
 1123|   205k|      need_above = 1, need_left = 1, need_above_left = 1;
 1124|   280k|    else
 1125|   280k|      need_above = 0, need_left = 1, need_above_left = 1;
 1126|   702k|  }
 1127|   898k|  if (use_filter_intra) need_left = need_above = need_above_left = 1;
  ------------------
  |  Branch (1127:7): [True: 195k, False: 702k]
  ------------------
 1128|       |
 1129|   898k|  assert(n_top_px >= 0);
 1130|   898k|  assert(n_topright_px >= -1);
 1131|   898k|  assert(n_left_px >= 0);
 1132|   898k|  assert(n_bottomleft_px >= -1);
 1133|       |
 1134|   898k|  if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
  ------------------
  |  Branch (1134:8): [True: 280k, False: 617k]
  |  Branch (1134:23): [True: 7.46k, False: 272k]
  |  Branch (1134:43): [True: 216k, False: 674k]
  |  Branch (1134:57): [True: 7.51k, False: 209k]
  ------------------
 1135|  14.9k|    int val;
 1136|  14.9k|    if (need_left) {
  ------------------
  |  Branch (1136:9): [True: 7.46k, False: 7.51k]
  ------------------
 1137|  7.46k|      val = (n_top_px > 0) ? above_ref[0] : 129;
  ------------------
  |  Branch (1137:13): [True: 6.38k, False: 1.07k]
  ------------------
 1138|  7.51k|    } else {
 1139|  7.51k|      val = (n_left_px > 0) ? left_ref[0] : 127;
  ------------------
  |  Branch (1139:13): [True: 6.99k, False: 517]
  ------------------
 1140|  7.51k|    }
 1141|   245k|    for (i = 0; i < txhpx; ++i) {
  ------------------
  |  Branch (1141:17): [True: 230k, False: 14.9k]
  ------------------
 1142|   230k|      memset(dst, val, txwpx);
 1143|   230k|      dst += dst_stride;
 1144|   230k|    }
 1145|  14.9k|    return;
 1146|  14.9k|  }
 1147|       |
 1148|       |  // NEED_LEFT
 1149|   883k|  if (need_left) {
  ------------------
  |  Branch (1149:7): [True: 674k, False: 209k]
  ------------------
 1150|   674k|    const int num_left_pixels_needed =
 1151|   674k|        txhpx + (n_bottomleft_px >= 0 ? txwpx : 0);
  ------------------
  |  Branch (1151:18): [True: 149k, False: 524k]
  ------------------
 1152|   674k|    i = 0;
 1153|   674k|    if (n_left_px > 0) {
  ------------------
  |  Branch (1153:9): [True: 663k, False: 10.6k]
  ------------------
 1154|  6.35M|      for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
  ------------------
  |  Branch (1154:14): [True: 5.68M, False: 663k]
  ------------------
 1155|   663k|      if (n_bottomleft_px > 0) {
  ------------------
  |  Branch (1155:11): [True: 48.3k, False: 615k]
  ------------------
 1156|  48.3k|        assert(i == txhpx);
 1157|   429k|        for (; i < txhpx + n_bottomleft_px; i++)
  ------------------
  |  Branch (1157:16): [True: 381k, False: 48.3k]
  ------------------
 1158|   381k|          left_col[i] = left_ref[i * ref_stride];
 1159|  48.3k|      }
 1160|   663k|      if (i < num_left_pixels_needed)
  ------------------
  |  Branch (1160:11): [True: 120k, False: 542k]
  ------------------
 1161|   120k|        memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
 1162|   663k|    } else if (n_top_px > 0) {
  ------------------
  |  Branch (1162:16): [True: 9.86k, False: 822]
  ------------------
 1163|  9.86k|      memset(left_col, above_ref[0], num_left_pixels_needed);
 1164|  9.86k|    }
 1165|   674k|  }
 1166|       |
 1167|       |  // NEED_ABOVE
 1168|   883k|  if (need_above) {
  ------------------
  |  Branch (1168:7): [True: 610k, False: 272k]
  ------------------
 1169|   610k|    const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0);
  ------------------
  |  Branch (1169:48): [True: 126k, False: 483k]
  ------------------
 1170|   610k|    if (n_top_px > 0) {
  ------------------
  |  Branch (1170:9): [True: 592k, False: 18.1k]
  ------------------
 1171|   592k|      memcpy(above_row, above_ref, n_top_px);
 1172|   592k|      i = n_top_px;
 1173|   592k|      if (n_topright_px > 0) {
  ------------------
  |  Branch (1173:11): [True: 83.6k, False: 508k]
  ------------------
 1174|  83.6k|        assert(n_top_px == txwpx);
 1175|  83.6k|        memcpy(above_row + txwpx, above_ref + txwpx, n_topright_px);
 1176|  83.6k|        i += n_topright_px;
 1177|  83.6k|      }
 1178|   592k|      if (i < num_top_pixels_needed)
  ------------------
  |  Branch (1178:11): [True: 53.7k, False: 538k]
  ------------------
 1179|  53.7k|        memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i);
 1180|   592k|    } else if (n_left_px > 0) {
  ------------------
  |  Branch (1180:16): [True: 17.3k, False: 823]
  ------------------
 1181|  17.3k|      memset(above_row, left_ref[0], num_top_pixels_needed);
 1182|  17.3k|    }
 1183|   610k|  }
 1184|       |
 1185|   883k|  if (need_above_left) {
  ------------------
  |  Branch (1185:7): [True: 883k, False: 18.4E]
  ------------------
 1186|   883k|    if (n_top_px > 0 && n_left_px > 0) {
  ------------------
  |  Branch (1186:9): [True: 853k, False: 30.0k]
  |  Branch (1186:25): [True: 832k, False: 20.9k]
  ------------------
 1187|   832k|      above_row[-1] = above_ref[-1];
 1188|   832k|    } else if (n_top_px > 0) {
  ------------------
  |  Branch (1188:16): [True: 20.9k, False: 29.9k]
  ------------------
 1189|  20.9k|      above_row[-1] = above_ref[0];
 1190|  29.9k|    } else if (n_left_px > 0) {
  ------------------
  |  Branch (1190:16): [True: 29.1k, False: 822]
  ------------------
 1191|  29.1k|      above_row[-1] = left_ref[0];
 1192|  29.1k|    } else {
 1193|    822|      above_row[-1] = 128;
 1194|    822|    }
 1195|   883k|    left_col[-1] = above_row[-1];
 1196|   883k|  }
 1197|       |
 1198|   883k|  if (use_filter_intra) {
  ------------------
  |  Branch (1198:7): [True: 195k, False: 687k]
  ------------------
 1199|   195k|    av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
 1200|   195k|                               filter_intra_mode);
 1201|   195k|    return;
 1202|   195k|  }
 1203|       |
 1204|   883k|  assert(is_dr_mode);
 1205|   687k|  int upsample_above = 0;
 1206|   687k|  int upsample_left = 0;
 1207|   687k|  if (!disable_edge_filter) {
  ------------------
  |  Branch (1207:7): [True: 676k, False: 11.0k]
  ------------------
 1208|   676k|    const int need_right = p_angle < 90;
 1209|   676k|    const int need_bottom = p_angle > 180;
 1210|   676k|    if (p_angle != 90 && p_angle != 180) {
  ------------------
  |  Branch (1210:9): [True: 594k, False: 81.9k]
  |  Branch (1210:26): [True: 473k, False: 121k]
  ------------------
 1211|   473k|      assert(need_above_left);
 1212|   473k|      const int ab_le = 1;
 1213|   473k|      if (need_above && need_left && (txwpx + txhpx >= 24)) {
  ------------------
  |  Branch (1213:11): [True: 327k, False: 146k]
  |  Branch (1213:25): [True: 201k, False: 125k]
  |  Branch (1213:38): [True: 59.2k, False: 142k]
  ------------------
 1214|  59.2k|        filter_intra_edge_corner(above_row, left_col);
 1215|  59.2k|      }
 1216|   473k|      if (need_above && n_top_px > 0) {
  ------------------
  |  Branch (1216:11): [True: 327k, False: 146k]
  |  Branch (1216:25): [True: 317k, False: 10.1k]
  ------------------
 1217|   317k|        const int strength = intra_edge_filter_strength(
 1218|   317k|            txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
 1219|   317k|        const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
  ------------------
  |  Branch (1219:46): [True: 125k, False: 191k]
  ------------------
 1220|   317k|        av1_filter_intra_edge(above_row - ab_le, n_px, strength);
 1221|   317k|      }
 1222|   473k|      if (need_left && n_left_px > 0) {
  ------------------
  |  Branch (1222:11): [True: 348k, False: 125k]
  |  Branch (1222:24): [True: 342k, False: 5.99k]
  ------------------
 1223|   342k|        const int strength = intra_edge_filter_strength(
 1224|   342k|            txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
 1225|   342k|        const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
  ------------------
  |  Branch (1225:47): [True: 146k, False: 195k]
  ------------------
 1226|   342k|        av1_filter_intra_edge(left_col - ab_le, n_px, strength);
 1227|   342k|      }
 1228|   473k|    }
 1229|   676k|    upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
 1230|   676k|                                                 intra_edge_filter_type);
 1231|   676k|    if (need_above && upsample_above) {
  ------------------
  |  Branch (1231:9): [True: 409k, False: 267k]
  |  Branch (1231:23): [True: 101k, False: 307k]
  ------------------
 1232|   101k|      const int n_px = txwpx + (need_right ? txhpx : 0);
  ------------------
  |  Branch (1232:33): [True: 58.6k, False: 43.2k]
  ------------------
 1233|   101k|      av1_upsample_intra_edge(above_row, n_px);
 1234|   101k|    }
 1235|   676k|    upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
 1236|   676k|                                                intra_edge_filter_type);
 1237|   676k|    if (need_left && upsample_left) {
  ------------------
  |  Branch (1237:9): [True: 469k, False: 207k]
  |  Branch (1237:22): [True: 140k, False: 329k]
  ------------------
 1238|   140k|      const int n_px = txhpx + (need_bottom ? txwpx : 0);
  ------------------
  |  Branch (1238:33): [True: 85.4k, False: 54.8k]
  ------------------
 1239|   140k|      av1_upsample_intra_edge(left_col, n_px);
 1240|   140k|    }
 1241|   676k|  }
 1242|   687k|  dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above,
 1243|   687k|               upsample_left, p_angle);
 1244|   687k|}
reconintra.c:filter_intra_edge_corner:
 1051|  59.2k|static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) {
 1052|  59.2k|  const int kernel[3] = { 5, 6, 5 };
 1053|       |
 1054|  59.2k|  int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) +
 1055|  59.2k|          (p_above[0] * kernel[2]);
 1056|  59.2k|  s = (s + 8) >> 4;
 1057|  59.2k|  p_above[-1] = s;
 1058|  59.2k|  p_left[-1] = s;
 1059|  59.2k|}
reconintra.c:dr_predictor:
  642|   687k|                         int upsample_above, int upsample_left, int angle) {
  643|   687k|  const int dx = av1_get_dx(angle);
  644|   687k|  const int dy = av1_get_dy(angle);
  645|   687k|  const int bw = tx_size_wide[tx_size];
  646|   687k|  const int bh = tx_size_high[tx_size];
  647|   687k|  assert(angle > 0 && angle < 270);
  648|       |
  649|   687k|  if (angle > 0 && angle < 90) {
  ------------------
  |  Branch (649:7): [True: 687k, False: 1]
  |  Branch (649:20): [True: 126k, False: 561k]
  ------------------
  650|   126k|    av1_dr_prediction_z1(dst, stride, bw, bh, above, left, upsample_above, dx,
  651|   126k|                         dy);
  652|   561k|  } else if (angle > 90 && angle < 180) {
  ------------------
  |  Branch (652:14): [True: 478k, False: 82.6k]
  |  Branch (652:28): [True: 205k, False: 272k]
  ------------------
  653|   205k|    av1_dr_prediction_z2(dst, stride, bw, bh, above, left, upsample_above,
  654|   205k|                         upsample_left, dx, dy);
  655|   355k|  } else if (angle > 180 && angle < 270) {
  ------------------
  |  Branch (655:14): [True: 149k, False: 206k]
  |  Branch (655:29): [True: 149k, False: 0]
  ------------------
  656|   149k|    av1_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, dx,
  657|   149k|                         dy);
  658|   206k|  } else if (angle == 90) {
  ------------------
  |  Branch (658:14): [True: 82.7k, False: 123k]
  ------------------
  659|  82.7k|    pred[V_PRED][tx_size](dst, stride, above, left);
  660|   123k|  } else if (angle == 180) {
  ------------------
  |  Branch (660:14): [True: 123k, False: 18.4E]
  ------------------
  661|   123k|    pred[H_PRED][tx_size](dst, stride, above, left);
  662|   123k|  }
  663|   687k|}
reconintra.c:init_intra_predictors_internal:
  464|      1|static void init_intra_predictors_internal(void) {
  465|      1|  assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES);
  466|       |
  467|       |#if CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
  468|       |#define INIT_RECTANGULAR(p, type)             \
  469|       |  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  470|       |  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  471|       |  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  472|       |  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  473|       |  p[TX_16X32] = aom_##type##_predictor_16x32; \
  474|       |  p[TX_32X16] = aom_##type##_predictor_32x16; \
  475|       |  p[TX_32X64] = aom_##type##_predictor_32x64; \
  476|       |  p[TX_64X32] = aom_##type##_predictor_64x32;
  477|       |#else
  478|      1|#define INIT_RECTANGULAR(p, type)             \
  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  493|      1|#endif  // CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
  494|       |
  495|      1|#define INIT_NO_4X4(p, type)                  \
  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  500|      1|  INIT_RECTANGULAR(p, type)
  501|       |
  502|      1|#define INIT_ALL_SIZES(p, type)           \
  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  504|      1|  INIT_NO_4X4(p, type)
  505|       |
  506|      1|  INIT_ALL_SIZES(pred[V_PRED], v)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  ------------------
  |  |  |  | 5469|      1|#define aom_v_predictor_4x4 aom_v_predictor_4x4_sse2
  |  |  ------------------
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  ------------------
  |  |  |  |  |  | 5504|      1|#define aom_v_predictor_8x8 aom_v_predictor_8x8_sse2
  |  |  |  |  ------------------
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 5426|      1|#define aom_v_predictor_16x16 aom_v_predictor_16x16_sse2
  |  |  |  |  ------------------
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 5473|      1|#define aom_v_predictor_4x8 aom_v_predictor_4x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 5500|      1|#define aom_v_predictor_8x4 aom_v_predictor_8x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 5492|      1|#define aom_v_predictor_8x16 aom_v_predictor_8x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 5442|      1|#define aom_v_predictor_16x8 aom_v_predictor_16x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 5430|      1|#define aom_v_predictor_16x32 aom_v_predictor_16x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 5465|      1|#define aom_v_predictor_4x16 aom_v_predictor_4x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 5434|      1|#define aom_v_predictor_16x4 aom_v_predictor_16x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 5496|      1|#define aom_v_predictor_8x32 aom_v_predictor_8x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 5461|      1|#define aom_v_predictor_32x8 aom_v_predictor_32x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 5438|      1|#define aom_v_predictor_16x64 aom_v_predictor_16x64_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  507|      1|  INIT_ALL_SIZES(pred[H_PRED], h)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  ------------------
  |  |  |  |  521|      1|#define aom_h_predictor_4x4 aom_h_predictor_4x4_sse2
  |  |  ------------------
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  ------------------
  |  |  |  |  |  |  553|      1|#define aom_h_predictor_8x8 aom_h_predictor_8x8_sse2
  |  |  |  |  ------------------
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|#define aom_h_predictor_16x16 aom_h_predictor_16x16_sse2
  |  |  |  |  ------------------
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  ------------------
  |  |  |  |  |  |  537|      1|#define aom_h_predictor_64x64 aom_h_predictor_64x64_sse2
  |  |  |  |  ------------------
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  525|      1|#define aom_h_predictor_4x8 aom_h_predictor_4x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  549|      1|#define aom_h_predictor_8x4 aom_h_predictor_8x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  541|      1|#define aom_h_predictor_8x16 aom_h_predictor_8x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  496|      1|#define aom_h_predictor_16x8 aom_h_predictor_16x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  484|      1|#define aom_h_predictor_16x32 aom_h_predictor_16x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  500|      1|#define aom_h_predictor_32x16 aom_h_predictor_32x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  509|      1|#define aom_h_predictor_32x64 aom_h_predictor_32x64_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  533|      1|#define aom_h_predictor_64x32 aom_h_predictor_64x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  517|      1|#define aom_h_predictor_4x16 aom_h_predictor_4x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  488|      1|#define aom_h_predictor_16x4 aom_h_predictor_16x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  545|      1|#define aom_h_predictor_8x32 aom_h_predictor_8x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  513|      1|#define aom_h_predictor_32x8 aom_h_predictor_32x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  492|      1|#define aom_h_predictor_16x64 aom_h_predictor_16x64_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  529|      1|#define aom_h_predictor_64x16 aom_h_predictor_64x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  508|      1|  INIT_ALL_SIZES(pred[PAETH_PRED], paeth)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  509|      1|  INIT_ALL_SIZES(pred[SMOOTH_PRED], smooth)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  510|      1|  INIT_ALL_SIZES(pred[SMOOTH_V_PRED], smooth_v)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  511|      1|  INIT_ALL_SIZES(pred[SMOOTH_H_PRED], smooth_h)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  512|      1|  INIT_ALL_SIZES(dc_pred[0][0], dc_128)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  ------------------
  |  |  |  |  146|      1|#define aom_dc_128_predictor_4x4 aom_dc_128_predictor_4x4_sse2
  |  |  ------------------
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  ------------------
  |  |  |  |  |  |  181|      1|#define aom_dc_128_predictor_8x8 aom_dc_128_predictor_8x8_sse2
  |  |  |  |  ------------------
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  ------------------
  |  |  |  |  |  |  103|      1|#define aom_dc_128_predictor_16x16 aom_dc_128_predictor_16x16_sse2
  |  |  |  |  ------------------
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  150|      1|#define aom_dc_128_predictor_4x8 aom_dc_128_predictor_4x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  177|      1|#define aom_dc_128_predictor_8x4 aom_dc_128_predictor_8x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  169|      1|#define aom_dc_128_predictor_8x16 aom_dc_128_predictor_8x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  119|      1|#define aom_dc_128_predictor_16x8 aom_dc_128_predictor_16x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  107|      1|#define aom_dc_128_predictor_16x32 aom_dc_128_predictor_16x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  142|      1|#define aom_dc_128_predictor_4x16 aom_dc_128_predictor_4x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  111|      1|#define aom_dc_128_predictor_16x4 aom_dc_128_predictor_16x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  173|      1|#define aom_dc_128_predictor_8x32 aom_dc_128_predictor_8x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  138|      1|#define aom_dc_128_predictor_32x8 aom_dc_128_predictor_32x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  115|      1|#define aom_dc_128_predictor_16x64 aom_dc_128_predictor_16x64_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  513|      1|  INIT_ALL_SIZES(dc_pred[0][1], dc_top)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  ------------------
  |  |  |  |  392|      1|#define aom_dc_top_predictor_4x4 aom_dc_top_predictor_4x4_sse2
  |  |  ------------------
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  ------------------
  |  |  |  |  |  |  427|      1|#define aom_dc_top_predictor_8x8 aom_dc_top_predictor_8x8_sse2
  |  |  |  |  ------------------
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  ------------------
  |  |  |  |  |  |  349|      1|#define aom_dc_top_predictor_16x16 aom_dc_top_predictor_16x16_sse2
  |  |  |  |  ------------------
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  396|      1|#define aom_dc_top_predictor_4x8 aom_dc_top_predictor_4x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  423|      1|#define aom_dc_top_predictor_8x4 aom_dc_top_predictor_8x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  415|      1|#define aom_dc_top_predictor_8x16 aom_dc_top_predictor_8x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  365|      1|#define aom_dc_top_predictor_16x8 aom_dc_top_predictor_16x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  353|      1|#define aom_dc_top_predictor_16x32 aom_dc_top_predictor_16x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  388|      1|#define aom_dc_top_predictor_4x16 aom_dc_top_predictor_4x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  357|      1|#define aom_dc_top_predictor_16x4 aom_dc_top_predictor_16x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  419|      1|#define aom_dc_top_predictor_8x32 aom_dc_top_predictor_8x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  384|      1|#define aom_dc_top_predictor_32x8 aom_dc_top_predictor_32x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  361|      1|#define aom_dc_top_predictor_16x64 aom_dc_top_predictor_16x64_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  514|      1|  INIT_ALL_SIZES(dc_pred[1][0], dc_left)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  ------------------
  |  |  |  |  228|      1|#define aom_dc_left_predictor_4x4 aom_dc_left_predictor_4x4_sse2
  |  |  ------------------
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  ------------------
  |  |  |  |  |  |  263|      1|#define aom_dc_left_predictor_8x8 aom_dc_left_predictor_8x8_sse2
  |  |  |  |  ------------------
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  ------------------
  |  |  |  |  |  |  185|      1|#define aom_dc_left_predictor_16x16 aom_dc_left_predictor_16x16_sse2
  |  |  |  |  ------------------
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  232|      1|#define aom_dc_left_predictor_4x8 aom_dc_left_predictor_4x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  259|      1|#define aom_dc_left_predictor_8x4 aom_dc_left_predictor_8x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  251|      1|#define aom_dc_left_predictor_8x16 aom_dc_left_predictor_8x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  201|      1|#define aom_dc_left_predictor_16x8 aom_dc_left_predictor_16x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  189|      1|#define aom_dc_left_predictor_16x32 aom_dc_left_predictor_16x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  224|      1|#define aom_dc_left_predictor_4x16 aom_dc_left_predictor_4x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  193|      1|#define aom_dc_left_predictor_16x4 aom_dc_left_predictor_16x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  255|      1|#define aom_dc_left_predictor_8x32 aom_dc_left_predictor_8x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  220|      1|#define aom_dc_left_predictor_32x8 aom_dc_left_predictor_32x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  197|      1|#define aom_dc_left_predictor_16x64 aom_dc_left_predictor_16x64_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  515|      1|  INIT_ALL_SIZES(dc_pred[1][1], dc)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  ------------------
  |  |  |  |  310|      1|#define aom_dc_predictor_4x4 aom_dc_predictor_4x4_sse2
  |  |  ------------------
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  ------------------
  |  |  |  |  |  |  345|      1|#define aom_dc_predictor_8x8 aom_dc_predictor_8x8_sse2
  |  |  |  |  ------------------
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  ------------------
  |  |  |  |  |  |  267|      1|#define aom_dc_predictor_16x16 aom_dc_predictor_16x16_sse2
  |  |  |  |  ------------------
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  314|      1|#define aom_dc_predictor_4x8 aom_dc_predictor_4x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  341|      1|#define aom_dc_predictor_8x4 aom_dc_predictor_8x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  333|      1|#define aom_dc_predictor_8x16 aom_dc_predictor_8x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  283|      1|#define aom_dc_predictor_16x8 aom_dc_predictor_16x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  271|      1|#define aom_dc_predictor_16x32 aom_dc_predictor_16x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  306|      1|#define aom_dc_predictor_4x16 aom_dc_predictor_4x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  275|      1|#define aom_dc_predictor_16x4 aom_dc_predictor_16x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  337|      1|#define aom_dc_predictor_8x32 aom_dc_predictor_8x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  302|      1|#define aom_dc_predictor_32x8 aom_dc_predictor_32x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  279|      1|#define aom_dc_predictor_16x64 aom_dc_predictor_16x64_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  516|      1|#if CONFIG_AV1_HIGHBITDEPTH
  517|      1|  INIT_ALL_SIZES(pred_high[V_PRED], highbd_v)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  ------------------
  |  |  |  | 3635|      1|#define aom_highbd_v_predictor_4x4 aom_highbd_v_predictor_4x4_sse2
  |  |  ------------------
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  ------------------
  |  |  |  |  |  | 3663|      1|#define aom_highbd_v_predictor_8x8 aom_highbd_v_predictor_8x8_sse2
  |  |  |  |  ------------------
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 3600|      1|#define aom_highbd_v_predictor_16x16 aom_highbd_v_predictor_16x16_sse2
  |  |  |  |  ------------------
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 3622|      1|#define aom_highbd_v_predictor_32x32 aom_highbd_v_predictor_32x32_sse2
  |  |  |  |  ------------------
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 3648|      1|#define aom_highbd_v_predictor_64x64 aom_highbd_v_predictor_64x64_c
  |  |  |  |  ------------------
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3639|      1|#define aom_highbd_v_predictor_4x8 aom_highbd_v_predictor_4x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3659|      1|#define aom_highbd_v_predictor_8x4 aom_highbd_v_predictor_8x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3652|      1|#define aom_highbd_v_predictor_8x16 aom_highbd_v_predictor_8x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3614|      1|#define aom_highbd_v_predictor_16x8 aom_highbd_v_predictor_16x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3604|      1|#define aom_highbd_v_predictor_16x32 aom_highbd_v_predictor_16x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3618|      1|#define aom_highbd_v_predictor_32x16 aom_highbd_v_predictor_32x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3625|      1|#define aom_highbd_v_predictor_32x64 aom_highbd_v_predictor_32x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3645|      1|#define aom_highbd_v_predictor_64x32 aom_highbd_v_predictor_64x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3631|      1|#define aom_highbd_v_predictor_4x16 aom_highbd_v_predictor_4x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3607|      1|#define aom_highbd_v_predictor_16x4 aom_highbd_v_predictor_16x4_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3655|      1|#define aom_highbd_v_predictor_8x32 aom_highbd_v_predictor_8x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3628|      1|#define aom_highbd_v_predictor_32x8 aom_highbd_v_predictor_32x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3610|      1|#define aom_highbd_v_predictor_16x64 aom_highbd_v_predictor_16x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3642|      1|#define aom_highbd_v_predictor_64x16 aom_highbd_v_predictor_64x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  518|      1|  INIT_ALL_SIZES(pred_high[H_PRED], highbd_h)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  ------------------
  |  |  |  | 2496|      1|#define aom_highbd_h_predictor_4x4 aom_highbd_h_predictor_4x4_sse2
  |  |  ------------------
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2524|      1|#define aom_highbd_h_predictor_8x8 aom_highbd_h_predictor_8x8_sse2
  |  |  |  |  ------------------
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2461|      1|#define aom_highbd_h_predictor_16x16 aom_highbd_h_predictor_16x16_sse2
  |  |  |  |  ------------------
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2483|      1|#define aom_highbd_h_predictor_32x32 aom_highbd_h_predictor_32x32_sse2
  |  |  |  |  ------------------
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2509|      1|#define aom_highbd_h_predictor_64x64 aom_highbd_h_predictor_64x64_c
  |  |  |  |  ------------------
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2500|      1|#define aom_highbd_h_predictor_4x8 aom_highbd_h_predictor_4x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2520|      1|#define aom_highbd_h_predictor_8x4 aom_highbd_h_predictor_8x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2513|      1|#define aom_highbd_h_predictor_8x16 aom_highbd_h_predictor_8x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2475|      1|#define aom_highbd_h_predictor_16x8 aom_highbd_h_predictor_16x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2465|      1|#define aom_highbd_h_predictor_16x32 aom_highbd_h_predictor_16x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2479|      1|#define aom_highbd_h_predictor_32x16 aom_highbd_h_predictor_32x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2486|      1|#define aom_highbd_h_predictor_32x64 aom_highbd_h_predictor_32x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2506|      1|#define aom_highbd_h_predictor_64x32 aom_highbd_h_predictor_64x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2492|      1|#define aom_highbd_h_predictor_4x16 aom_highbd_h_predictor_4x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2468|      1|#define aom_highbd_h_predictor_16x4 aom_highbd_h_predictor_16x4_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2516|      1|#define aom_highbd_h_predictor_8x32 aom_highbd_h_predictor_8x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2489|      1|#define aom_highbd_h_predictor_32x8 aom_highbd_h_predictor_32x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2471|      1|#define aom_highbd_h_predictor_16x64 aom_highbd_h_predictor_16x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2503|      1|#define aom_highbd_h_predictor_64x16 aom_highbd_h_predictor_64x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  519|      1|  INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  ------------------
  |  |  |  | 2862|      1|#define aom_highbd_paeth_predictor_4x4 aom_highbd_paeth_predictor_4x4_c
  |  |  ------------------
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2886|      1|#define aom_highbd_paeth_predictor_8x8 aom_highbd_paeth_predictor_8x8_c
  |  |  |  |  ------------------
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2832|      1|#define aom_highbd_paeth_predictor_16x16 aom_highbd_paeth_predictor_16x16_c
  |  |  |  |  ------------------
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2850|      1|#define aom_highbd_paeth_predictor_32x32 aom_highbd_paeth_predictor_32x32_c
  |  |  |  |  ------------------
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2874|      1|#define aom_highbd_paeth_predictor_64x64 aom_highbd_paeth_predictor_64x64_c
  |  |  |  |  ------------------
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2865|      1|#define aom_highbd_paeth_predictor_4x8 aom_highbd_paeth_predictor_4x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2883|      1|#define aom_highbd_paeth_predictor_8x4 aom_highbd_paeth_predictor_8x4_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2877|      1|#define aom_highbd_paeth_predictor_8x16 aom_highbd_paeth_predictor_8x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2844|      1|#define aom_highbd_paeth_predictor_16x8 aom_highbd_paeth_predictor_16x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2835|      1|#define aom_highbd_paeth_predictor_16x32 aom_highbd_paeth_predictor_16x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2847|      1|#define aom_highbd_paeth_predictor_32x16 aom_highbd_paeth_predictor_32x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2853|      1|#define aom_highbd_paeth_predictor_32x64 aom_highbd_paeth_predictor_32x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2871|      1|#define aom_highbd_paeth_predictor_64x32 aom_highbd_paeth_predictor_64x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2859|      1|#define aom_highbd_paeth_predictor_4x16 aom_highbd_paeth_predictor_4x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2838|      1|#define aom_highbd_paeth_predictor_16x4 aom_highbd_paeth_predictor_16x4_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2880|      1|#define aom_highbd_paeth_predictor_8x32 aom_highbd_paeth_predictor_8x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2856|      1|#define aom_highbd_paeth_predictor_32x8 aom_highbd_paeth_predictor_32x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2841|      1|#define aom_highbd_paeth_predictor_16x64 aom_highbd_paeth_predictor_16x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2868|      1|#define aom_highbd_paeth_predictor_64x16 aom_highbd_paeth_predictor_64x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  520|      1|  INIT_ALL_SIZES(pred_high[SMOOTH_PRED], highbd_smooth)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  ------------------
  |  |  |  | 3503|      1|#define aom_highbd_smooth_predictor_4x4 aom_highbd_smooth_predictor_4x4_c
  |  |  ------------------
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  ------------------
  |  |  |  |  |  | 3527|      1|#define aom_highbd_smooth_predictor_8x8 aom_highbd_smooth_predictor_8x8_c
  |  |  |  |  ------------------
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 3473|      1|#define aom_highbd_smooth_predictor_16x16 aom_highbd_smooth_predictor_16x16_c
  |  |  |  |  ------------------
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 3491|      1|#define aom_highbd_smooth_predictor_32x32 aom_highbd_smooth_predictor_32x32_c
  |  |  |  |  ------------------
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 3515|      1|#define aom_highbd_smooth_predictor_64x64 aom_highbd_smooth_predictor_64x64_c
  |  |  |  |  ------------------
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3506|      1|#define aom_highbd_smooth_predictor_4x8 aom_highbd_smooth_predictor_4x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3524|      1|#define aom_highbd_smooth_predictor_8x4 aom_highbd_smooth_predictor_8x4_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3518|      1|#define aom_highbd_smooth_predictor_8x16 aom_highbd_smooth_predictor_8x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3485|      1|#define aom_highbd_smooth_predictor_16x8 aom_highbd_smooth_predictor_16x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3476|      1|#define aom_highbd_smooth_predictor_16x32 aom_highbd_smooth_predictor_16x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3488|      1|#define aom_highbd_smooth_predictor_32x16 aom_highbd_smooth_predictor_32x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3494|      1|#define aom_highbd_smooth_predictor_32x64 aom_highbd_smooth_predictor_32x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3512|      1|#define aom_highbd_smooth_predictor_64x32 aom_highbd_smooth_predictor_64x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3500|      1|#define aom_highbd_smooth_predictor_4x16 aom_highbd_smooth_predictor_4x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3479|      1|#define aom_highbd_smooth_predictor_16x4 aom_highbd_smooth_predictor_16x4_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3521|      1|#define aom_highbd_smooth_predictor_8x32 aom_highbd_smooth_predictor_8x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3497|      1|#define aom_highbd_smooth_predictor_32x8 aom_highbd_smooth_predictor_32x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3482|      1|#define aom_highbd_smooth_predictor_16x64 aom_highbd_smooth_predictor_16x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3509|      1|#define aom_highbd_smooth_predictor_64x16 aom_highbd_smooth_predictor_64x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  521|      1|  INIT_ALL_SIZES(pred_high[SMOOTH_V_PRED], highbd_smooth_v)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  ------------------
  |  |  |  | 3560|      1|#define aom_highbd_smooth_v_predictor_4x4 aom_highbd_smooth_v_predictor_4x4_c
  |  |  ------------------
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  ------------------
  |  |  |  |  |  | 3584|      1|#define aom_highbd_smooth_v_predictor_8x8 aom_highbd_smooth_v_predictor_8x8_c
  |  |  |  |  ------------------
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 3530|      1|#define aom_highbd_smooth_v_predictor_16x16 aom_highbd_smooth_v_predictor_16x16_c
  |  |  |  |  ------------------
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 3548|      1|#define aom_highbd_smooth_v_predictor_32x32 aom_highbd_smooth_v_predictor_32x32_c
  |  |  |  |  ------------------
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 3572|      1|#define aom_highbd_smooth_v_predictor_64x64 aom_highbd_smooth_v_predictor_64x64_c
  |  |  |  |  ------------------
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3563|      1|#define aom_highbd_smooth_v_predictor_4x8 aom_highbd_smooth_v_predictor_4x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3581|      1|#define aom_highbd_smooth_v_predictor_8x4 aom_highbd_smooth_v_predictor_8x4_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3575|      1|#define aom_highbd_smooth_v_predictor_8x16 aom_highbd_smooth_v_predictor_8x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3542|      1|#define aom_highbd_smooth_v_predictor_16x8 aom_highbd_smooth_v_predictor_16x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3533|      1|#define aom_highbd_smooth_v_predictor_16x32 aom_highbd_smooth_v_predictor_16x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3545|      1|#define aom_highbd_smooth_v_predictor_32x16 aom_highbd_smooth_v_predictor_32x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3551|      1|#define aom_highbd_smooth_v_predictor_32x64 aom_highbd_smooth_v_predictor_32x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3569|      1|#define aom_highbd_smooth_v_predictor_64x32 aom_highbd_smooth_v_predictor_64x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3557|      1|#define aom_highbd_smooth_v_predictor_4x16 aom_highbd_smooth_v_predictor_4x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3536|      1|#define aom_highbd_smooth_v_predictor_16x4 aom_highbd_smooth_v_predictor_16x4_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3578|      1|#define aom_highbd_smooth_v_predictor_8x32 aom_highbd_smooth_v_predictor_8x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3554|      1|#define aom_highbd_smooth_v_predictor_32x8 aom_highbd_smooth_v_predictor_32x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3539|      1|#define aom_highbd_smooth_v_predictor_16x64 aom_highbd_smooth_v_predictor_16x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3566|      1|#define aom_highbd_smooth_v_predictor_64x16 aom_highbd_smooth_v_predictor_64x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  522|      1|  INIT_ALL_SIZES(pred_high[SMOOTH_H_PRED], highbd_smooth_h)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  ------------------
  |  |  |  | 3446|      1|#define aom_highbd_smooth_h_predictor_4x4 aom_highbd_smooth_h_predictor_4x4_c
  |  |  ------------------
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  ------------------
  |  |  |  |  |  | 3470|      1|#define aom_highbd_smooth_h_predictor_8x8 aom_highbd_smooth_h_predictor_8x8_c
  |  |  |  |  ------------------
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 3416|      1|#define aom_highbd_smooth_h_predictor_16x16 aom_highbd_smooth_h_predictor_16x16_c
  |  |  |  |  ------------------
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 3434|      1|#define aom_highbd_smooth_h_predictor_32x32 aom_highbd_smooth_h_predictor_32x32_c
  |  |  |  |  ------------------
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 3458|      1|#define aom_highbd_smooth_h_predictor_64x64 aom_highbd_smooth_h_predictor_64x64_c
  |  |  |  |  ------------------
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3449|      1|#define aom_highbd_smooth_h_predictor_4x8 aom_highbd_smooth_h_predictor_4x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3467|      1|#define aom_highbd_smooth_h_predictor_8x4 aom_highbd_smooth_h_predictor_8x4_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3461|      1|#define aom_highbd_smooth_h_predictor_8x16 aom_highbd_smooth_h_predictor_8x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3428|      1|#define aom_highbd_smooth_h_predictor_16x8 aom_highbd_smooth_h_predictor_16x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3419|      1|#define aom_highbd_smooth_h_predictor_16x32 aom_highbd_smooth_h_predictor_16x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3431|      1|#define aom_highbd_smooth_h_predictor_32x16 aom_highbd_smooth_h_predictor_32x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3437|      1|#define aom_highbd_smooth_h_predictor_32x64 aom_highbd_smooth_h_predictor_32x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3455|      1|#define aom_highbd_smooth_h_predictor_64x32 aom_highbd_smooth_h_predictor_64x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3443|      1|#define aom_highbd_smooth_h_predictor_4x16 aom_highbd_smooth_h_predictor_4x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3422|      1|#define aom_highbd_smooth_h_predictor_16x4 aom_highbd_smooth_h_predictor_16x4_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3464|      1|#define aom_highbd_smooth_h_predictor_8x32 aom_highbd_smooth_h_predictor_8x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3440|      1|#define aom_highbd_smooth_h_predictor_32x8 aom_highbd_smooth_h_predictor_32x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3425|      1|#define aom_highbd_smooth_h_predictor_16x64 aom_highbd_smooth_h_predictor_16x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3452|      1|#define aom_highbd_smooth_h_predictor_64x16 aom_highbd_smooth_h_predictor_64x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  523|      1|  INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  ------------------
  |  |  |  | 2228|      1|#define aom_highbd_dc_128_predictor_4x4 aom_highbd_dc_128_predictor_4x4_sse2
  |  |  ------------------
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2256|      1|#define aom_highbd_dc_128_predictor_8x8 aom_highbd_dc_128_predictor_8x8_sse2
  |  |  |  |  ------------------
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2193|      1|#define aom_highbd_dc_128_predictor_16x16 aom_highbd_dc_128_predictor_16x16_sse2
  |  |  |  |  ------------------
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2215|      1|#define aom_highbd_dc_128_predictor_32x32 aom_highbd_dc_128_predictor_32x32_sse2
  |  |  |  |  ------------------
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2241|      1|#define aom_highbd_dc_128_predictor_64x64 aom_highbd_dc_128_predictor_64x64_c
  |  |  |  |  ------------------
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2232|      1|#define aom_highbd_dc_128_predictor_4x8 aom_highbd_dc_128_predictor_4x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2252|      1|#define aom_highbd_dc_128_predictor_8x4 aom_highbd_dc_128_predictor_8x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2245|      1|#define aom_highbd_dc_128_predictor_8x16 aom_highbd_dc_128_predictor_8x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2207|      1|#define aom_highbd_dc_128_predictor_16x8 aom_highbd_dc_128_predictor_16x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2197|      1|#define aom_highbd_dc_128_predictor_16x32 aom_highbd_dc_128_predictor_16x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2211|      1|#define aom_highbd_dc_128_predictor_32x16 aom_highbd_dc_128_predictor_32x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2218|      1|#define aom_highbd_dc_128_predictor_32x64 aom_highbd_dc_128_predictor_32x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2238|      1|#define aom_highbd_dc_128_predictor_64x32 aom_highbd_dc_128_predictor_64x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2224|      1|#define aom_highbd_dc_128_predictor_4x16 aom_highbd_dc_128_predictor_4x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2200|      1|#define aom_highbd_dc_128_predictor_16x4 aom_highbd_dc_128_predictor_16x4_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2248|      1|#define aom_highbd_dc_128_predictor_8x32 aom_highbd_dc_128_predictor_8x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2221|      1|#define aom_highbd_dc_128_predictor_32x8 aom_highbd_dc_128_predictor_32x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2203|      1|#define aom_highbd_dc_128_predictor_16x64 aom_highbd_dc_128_predictor_16x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2235|      1|#define aom_highbd_dc_128_predictor_64x16 aom_highbd_dc_128_predictor_64x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  524|      1|  INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  ------------------
  |  |  |  | 2429|      1|#define aom_highbd_dc_top_predictor_4x4 aom_highbd_dc_top_predictor_4x4_sse2
  |  |  ------------------
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2457|      1|#define aom_highbd_dc_top_predictor_8x8 aom_highbd_dc_top_predictor_8x8_sse2
  |  |  |  |  ------------------
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2394|      1|#define aom_highbd_dc_top_predictor_16x16 aom_highbd_dc_top_predictor_16x16_sse2
  |  |  |  |  ------------------
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2416|      1|#define aom_highbd_dc_top_predictor_32x32 aom_highbd_dc_top_predictor_32x32_sse2
  |  |  |  |  ------------------
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2442|      1|#define aom_highbd_dc_top_predictor_64x64 aom_highbd_dc_top_predictor_64x64_c
  |  |  |  |  ------------------
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2433|      1|#define aom_highbd_dc_top_predictor_4x8 aom_highbd_dc_top_predictor_4x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2453|      1|#define aom_highbd_dc_top_predictor_8x4 aom_highbd_dc_top_predictor_8x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2446|      1|#define aom_highbd_dc_top_predictor_8x16 aom_highbd_dc_top_predictor_8x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2408|      1|#define aom_highbd_dc_top_predictor_16x8 aom_highbd_dc_top_predictor_16x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2398|      1|#define aom_highbd_dc_top_predictor_16x32 aom_highbd_dc_top_predictor_16x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2412|      1|#define aom_highbd_dc_top_predictor_32x16 aom_highbd_dc_top_predictor_32x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2419|      1|#define aom_highbd_dc_top_predictor_32x64 aom_highbd_dc_top_predictor_32x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2439|      1|#define aom_highbd_dc_top_predictor_64x32 aom_highbd_dc_top_predictor_64x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2425|      1|#define aom_highbd_dc_top_predictor_4x16 aom_highbd_dc_top_predictor_4x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2401|      1|#define aom_highbd_dc_top_predictor_16x4 aom_highbd_dc_top_predictor_16x4_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2449|      1|#define aom_highbd_dc_top_predictor_8x32 aom_highbd_dc_top_predictor_8x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2422|      1|#define aom_highbd_dc_top_predictor_32x8 aom_highbd_dc_top_predictor_32x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2404|      1|#define aom_highbd_dc_top_predictor_16x64 aom_highbd_dc_top_predictor_16x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2436|      1|#define aom_highbd_dc_top_predictor_64x16 aom_highbd_dc_top_predictor_64x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  525|      1|  INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  ------------------
  |  |  |  | 2295|      1|#define aom_highbd_dc_left_predictor_4x4 aom_highbd_dc_left_predictor_4x4_sse2
  |  |  ------------------
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2323|      1|#define aom_highbd_dc_left_predictor_8x8 aom_highbd_dc_left_predictor_8x8_sse2
  |  |  |  |  ------------------
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2260|      1|#define aom_highbd_dc_left_predictor_16x16 aom_highbd_dc_left_predictor_16x16_sse2
  |  |  |  |  ------------------
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2282|      1|#define aom_highbd_dc_left_predictor_32x32 aom_highbd_dc_left_predictor_32x32_sse2
  |  |  |  |  ------------------
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2308|      1|#define aom_highbd_dc_left_predictor_64x64 aom_highbd_dc_left_predictor_64x64_c
  |  |  |  |  ------------------
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2299|      1|#define aom_highbd_dc_left_predictor_4x8 aom_highbd_dc_left_predictor_4x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2319|      1|#define aom_highbd_dc_left_predictor_8x4 aom_highbd_dc_left_predictor_8x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2312|      1|#define aom_highbd_dc_left_predictor_8x16 aom_highbd_dc_left_predictor_8x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2274|      1|#define aom_highbd_dc_left_predictor_16x8 aom_highbd_dc_left_predictor_16x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2264|      1|#define aom_highbd_dc_left_predictor_16x32 aom_highbd_dc_left_predictor_16x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2278|      1|#define aom_highbd_dc_left_predictor_32x16 aom_highbd_dc_left_predictor_32x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2285|      1|#define aom_highbd_dc_left_predictor_32x64 aom_highbd_dc_left_predictor_32x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2305|      1|#define aom_highbd_dc_left_predictor_64x32 aom_highbd_dc_left_predictor_64x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2291|      1|#define aom_highbd_dc_left_predictor_4x16 aom_highbd_dc_left_predictor_4x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2267|      1|#define aom_highbd_dc_left_predictor_16x4 aom_highbd_dc_left_predictor_16x4_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2315|      1|#define aom_highbd_dc_left_predictor_8x32 aom_highbd_dc_left_predictor_8x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2288|      1|#define aom_highbd_dc_left_predictor_32x8 aom_highbd_dc_left_predictor_32x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2270|      1|#define aom_highbd_dc_left_predictor_16x64 aom_highbd_dc_left_predictor_16x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2302|      1|#define aom_highbd_dc_left_predictor_64x16 aom_highbd_dc_left_predictor_64x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  526|      1|  INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  ------------------
  |  |  |  | 2362|      1|#define aom_highbd_dc_predictor_4x4 aom_highbd_dc_predictor_4x4_sse2
  |  |  ------------------
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2390|      1|#define aom_highbd_dc_predictor_8x8 aom_highbd_dc_predictor_8x8_sse2
  |  |  |  |  ------------------
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2327|      1|#define aom_highbd_dc_predictor_16x16 aom_highbd_dc_predictor_16x16_sse2
  |  |  |  |  ------------------
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2349|      1|#define aom_highbd_dc_predictor_32x32 aom_highbd_dc_predictor_32x32_sse2
  |  |  |  |  ------------------
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  ------------------
  |  |  |  |  |  | 2375|      1|#define aom_highbd_dc_predictor_64x64 aom_highbd_dc_predictor_64x64_c
  |  |  |  |  ------------------
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2366|      1|#define aom_highbd_dc_predictor_4x8 aom_highbd_dc_predictor_4x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2386|      1|#define aom_highbd_dc_predictor_8x4 aom_highbd_dc_predictor_8x4_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2379|      1|#define aom_highbd_dc_predictor_8x16 aom_highbd_dc_predictor_8x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2341|      1|#define aom_highbd_dc_predictor_16x8 aom_highbd_dc_predictor_16x8_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2331|      1|#define aom_highbd_dc_predictor_16x32 aom_highbd_dc_predictor_16x32_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2345|      1|#define aom_highbd_dc_predictor_32x16 aom_highbd_dc_predictor_32x16_sse2
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2352|      1|#define aom_highbd_dc_predictor_32x64 aom_highbd_dc_predictor_32x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2372|      1|#define aom_highbd_dc_predictor_64x32 aom_highbd_dc_predictor_64x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2358|      1|#define aom_highbd_dc_predictor_4x16 aom_highbd_dc_predictor_4x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2334|      1|#define aom_highbd_dc_predictor_16x4 aom_highbd_dc_predictor_16x4_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2382|      1|#define aom_highbd_dc_predictor_8x32 aom_highbd_dc_predictor_8x32_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2355|      1|#define aom_highbd_dc_predictor_32x8 aom_highbd_dc_predictor_32x8_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2337|      1|#define aom_highbd_dc_predictor_16x64 aom_highbd_dc_predictor_16x64_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 2369|      1|#define aom_highbd_dc_predictor_64x16 aom_highbd_dc_predictor_64x16_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  527|      1|#endif
  528|      1|#undef intra_pred_allsizes
  529|      1|}

decodeframe.c:av1_allow_intrabc:
   63|  26.3k|static inline int av1_allow_intrabc(const AV1_COMMON *const cm) {
   64|  26.3k|  return frame_is_intra_only(cm) && cm->features.allow_screen_content_tools &&
  ------------------
  |  Branch (64:10): [True: 18.3k, False: 8.05k]
  |  Branch (64:37): [True: 8.01k, False: 10.3k]
  ------------------
   65|  8.01k|         cm->features.allow_intrabc;
  ------------------
  |  Branch (65:10): [True: 1.67k, False: 6.34k]
  ------------------
   66|  26.3k|}
decodemv.c:av1_allow_intrabc:
   63|  1.61M|static inline int av1_allow_intrabc(const AV1_COMMON *const cm) {
   64|  1.61M|  return frame_is_intra_only(cm) && cm->features.allow_screen_content_tools &&
  ------------------
  |  Branch (64:10): [True: 1.61M, False: 5]
  |  Branch (64:37): [True: 793k, False: 823k]
  ------------------
   65|   793k|         cm->features.allow_intrabc;
  ------------------
  |  Branch (65:10): [True: 353k, False: 439k]
  ------------------
   66|  1.61M|}
decodemv.c:av1_use_angle_delta:
   59|  1.66M|static inline int av1_use_angle_delta(BLOCK_SIZE bsize) {
   60|  1.66M|  return bsize >= BLOCK_8X8;
   61|  1.66M|}
decodemv.c:av1_is_directional_mode:
   51|  2.05M|static inline int av1_is_directional_mode(PREDICTION_MODE mode) {
   52|  2.05M|  return mode >= V_PRED && mode <= D67_PRED;
  ------------------
  |  Branch (52:10): [True: 1.00M, False: 1.05M]
  |  Branch (52:28): [True: 604k, False: 397k]
  ------------------
   53|  2.05M|}
decodemv.c:av1_filter_intra_allowed:
   76|  1.66M|                                           const MB_MODE_INFO *mbmi) {
   77|  1.66M|  return mbmi->mode == DC_PRED &&
  ------------------
  |  Branch (77:10): [True: 734k, False: 933k]
  ------------------
   78|   734k|         mbmi->palette_mode_info.palette_size[0] == 0 &&
  ------------------
  |  Branch (78:10): [True: 686k, False: 47.9k]
  ------------------
   79|   686k|         av1_filter_intra_allowed_bsize(cm, mbmi->bsize);
  ------------------
  |  Branch (79:10): [True: 607k, False: 79.4k]
  ------------------
   80|  1.66M|}
decodemv.c:av1_filter_intra_allowed_bsize:
   69|   686k|                                                 BLOCK_SIZE bs) {
   70|   686k|  if (!cm->seq_params->enable_filter_intra || bs == BLOCK_INVALID) return 0;
  ------------------
  |  Branch (70:7): [True: 22.5k, False: 664k]
  |  Branch (70:47): [True: 18.4E, False: 664k]
  ------------------
   71|       |
   72|   664k|  return block_size_wide[bs] <= 32 && block_size_high[bs] <= 32;
  ------------------
  |  Branch (72:10): [True: 617k, False: 46.3k]
  |  Branch (72:39): [True: 607k, False: 10.5k]
  ------------------
   73|   686k|}
reconintra.c:av1_is_directional_mode:
   51|  6.07M|static inline int av1_is_directional_mode(PREDICTION_MODE mode) {
   52|  6.07M|  return mode >= V_PRED && mode <= D67_PRED;
  ------------------
  |  Branch (52:10): [True: 3.42M, False: 2.64M]
  |  Branch (52:28): [True: 2.59M, False: 829k]
  ------------------
   53|  6.07M|}
reconintra.c:av1_use_intra_edge_upsample:
  149|  2.33M|                                              int type) {
  150|  2.33M|  const int d = abs(delta);
  151|  2.33M|  const int blk_wh = bs0 + bs1;
  152|  2.33M|  if (d == 0 || d >= 40) return 0;
  ------------------
  |  Branch (152:7): [True: 347k, False: 1.98M]
  |  Branch (152:17): [True: 1.25M, False: 734k]
  ------------------
  153|   734k|  return type ? (blk_wh <= 8) : (blk_wh <= 16);
  ------------------
  |  Branch (153:10): [True: 197k, False: 536k]
  ------------------
  154|  2.33M|}
reconintra.c:av1_get_dx:
  122|  1.26M|static inline int av1_get_dx(int angle) {
  123|  1.26M|  if (angle > 0 && angle < 90) {
  ------------------
  |  Branch (123:7): [True: 1.26M, False: 0]
  |  Branch (123:20): [True: 228k, False: 1.03M]
  ------------------
  124|   228k|    return dr_intra_derivative[angle];
  125|  1.03M|  } else if (angle > 90 && angle < 180) {
  ------------------
  |  Branch (125:14): [True: 892k, False: 145k]
  |  Branch (125:28): [True: 389k, False: 502k]
  ------------------
  126|   389k|    return dr_intra_derivative[180 - angle];
  127|   648k|  } else {
  128|       |    // In this case, we are not really going to use dx. We may return any value.
  129|   648k|    return 1;
  130|   648k|  }
  131|  1.26M|}
reconintra.c:av1_get_dy:
  137|  1.26M|static inline int av1_get_dy(int angle) {
  138|  1.26M|  if (angle > 90 && angle < 180) {
  ------------------
  |  Branch (138:7): [True: 892k, False: 373k]
  |  Branch (138:21): [True: 389k, False: 502k]
  ------------------
  139|   389k|    return dr_intra_derivative[angle - 90];
  140|   876k|  } else if (angle > 180 && angle < 270) {
  ------------------
  |  Branch (140:14): [True: 274k, False: 601k]
  |  Branch (140:29): [True: 274k, False: 0]
  ------------------
  141|   274k|    return dr_intra_derivative[270 - angle];
  142|   601k|  } else {
  143|       |    // In this case, we are not really going to use dy. We may return any value.
  144|   601k|    return 1;
  145|   601k|  }
  146|  1.26M|}
thread_common.c:av1_allow_intrabc:
   63|  14.6k|static inline int av1_allow_intrabc(const AV1_COMMON *const cm) {
   64|  14.6k|  return frame_is_intra_only(cm) && cm->features.allow_screen_content_tools &&
  ------------------
  |  Branch (64:10): [True: 10.7k, False: 3.93k]
  |  Branch (64:37): [True: 5.93k, False: 4.79k]
  ------------------
   65|  5.93k|         cm->features.allow_intrabc;
  ------------------
  |  Branch (65:10): [True: 1.57k, False: 4.35k]
  ------------------
   66|  14.6k|}

av1_get_upscale_convolve_step:
  324|    600|int32_t av1_get_upscale_convolve_step(int in_length, int out_length) {
  325|    600|  return ((in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length;
  ------------------
  |  |   36|    600|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
  326|    600|}
av1_upscale_normative_rows:
 1121|    600|                                int plane, int rows) {
 1122|    600|  const int is_uv = (plane > 0);
 1123|    600|  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  ------------------
  |  Branch (1123:20): [True: 400, False: 200]
  |  Branch (1123:29): [True: 0, False: 400]
  ------------------
 1124|    600|  const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x);
  ------------------
  |  |   41|    600|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1125|    600|  const int upscaled_plane_width =
 1126|    600|      ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
  ------------------
  |  |   41|    600|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1127|    600|  const int superres_denom = cm->superres_scale_denominator;
 1128|       |
 1129|    600|  TileInfo tile_col;
 1130|    600|  const int32_t x_step_qn = av1_get_upscale_convolve_step(
 1131|    600|      downscaled_plane_width, upscaled_plane_width);
 1132|    600|  int32_t x0_qn = get_upscale_convolve_x0(downscaled_plane_width,
 1133|    600|                                          upscaled_plane_width, x_step_qn);
 1134|       |
 1135|  1.20k|  for (int j = 0; j < cm->tiles.cols; j++) {
  ------------------
  |  Branch (1135:19): [True: 600, False: 600]
  ------------------
 1136|    600|    av1_tile_set_col(&tile_col, cm, j);
 1137|       |    // Determine the limits of this tile column in both the source
 1138|       |    // and destination images.
 1139|       |    // Note: The actual location which we start sampling from is
 1140|       |    // (downscaled_x0 - 1 + (x0_qn/2^14)), and this quantity increases
 1141|       |    // by exactly dst_width * (x_step_qn/2^14) pixels each iteration.
 1142|    600|    const int downscaled_x0 = tile_col.mi_col_start << (MI_SIZE_LOG2 - ss_x);
  ------------------
  |  |   39|    600|#define MI_SIZE_LOG2 2
  ------------------
 1143|    600|    const int downscaled_x1 = tile_col.mi_col_end << (MI_SIZE_LOG2 - ss_x);
  ------------------
  |  |   39|    600|#define MI_SIZE_LOG2 2
  ------------------
 1144|    600|    const int src_width = downscaled_x1 - downscaled_x0;
 1145|       |
 1146|    600|    const int upscaled_x0 = (downscaled_x0 * superres_denom) / SCALE_NUMERATOR;
  ------------------
  |  |   22|    600|#define SCALE_NUMERATOR 8
  ------------------
 1147|    600|    int upscaled_x1;
 1148|    600|    if (j == cm->tiles.cols - 1) {
  ------------------
  |  Branch (1148:9): [True: 600, False: 0]
  ------------------
 1149|       |      // Note that we can't just use AOMMIN here - due to rounding,
 1150|       |      // (downscaled_x1 * superres_denom) / SCALE_NUMERATOR may be less than
 1151|       |      // upscaled_plane_width.
 1152|    600|      upscaled_x1 = upscaled_plane_width;
 1153|    600|    } else {
 1154|      0|      upscaled_x1 = (downscaled_x1 * superres_denom) / SCALE_NUMERATOR;
  ------------------
  |  |   22|      0|#define SCALE_NUMERATOR 8
  ------------------
 1155|      0|    }
 1156|       |
 1157|    600|    const uint8_t *const src_ptr = src + downscaled_x0;
 1158|    600|    uint8_t *const dst_ptr = dst + upscaled_x0;
 1159|    600|    const int dst_width = upscaled_x1 - upscaled_x0;
 1160|       |
 1161|    600|    const int pad_left = (j == 0);
 1162|    600|    const int pad_right = (j == cm->tiles.cols - 1);
 1163|       |
 1164|    600|    bool success;
 1165|    600|#if CONFIG_AV1_HIGHBITDEPTH
 1166|    600|    if (cm->seq_params->use_highbitdepth)
  ------------------
  |  Branch (1166:9): [True: 0, False: 600]
  ------------------
 1167|      0|      success = highbd_upscale_normative_rect(
 1168|      0|          src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width,
 1169|      0|          dst_stride, x_step_qn, x0_qn, pad_left, pad_right,
 1170|      0|          cm->seq_params->bit_depth);
 1171|    600|    else
 1172|    600|      success = upscale_normative_rect(src_ptr, rows, src_width, src_stride,
 1173|    600|                                       dst_ptr, rows, dst_width, dst_stride,
 1174|    600|                                       x_step_qn, x0_qn, pad_left, pad_right);
 1175|       |#else
 1176|       |    success = upscale_normative_rect(src_ptr, rows, src_width, src_stride,
 1177|       |                                     dst_ptr, rows, dst_width, dst_stride,
 1178|       |                                     x_step_qn, x0_qn, pad_left, pad_right);
 1179|       |#endif
 1180|    600|    if (!success) {
  ------------------
  |  Branch (1180:9): [True: 0, False: 600]
  ------------------
 1181|      0|      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
 1182|      0|                         "Error upscaling frame");
 1183|      0|    }
 1184|       |    // Update the fractional pixel offset to prepare for the next tile column.
 1185|    600|    x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS);
  ------------------
  |  |   36|    600|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
 1186|    600|  }
 1187|    600|}
av1_calculate_scaled_superres_size:
 1297|    185|                                        int superres_denom) {
 1298|    185|  (void)height;
 1299|    185|  calculate_scaled_size_helper(width, superres_denom);
 1300|    185|}
av1_superres_upscale:
 1318|     32|                          bool alloc_pyramid) {
 1319|     32|  const int num_planes = av1_num_planes(cm);
 1320|     32|  if (!av1_superres_scaled(cm)) return;
  ------------------
  |  Branch (1320:7): [True: 0, False: 32]
  ------------------
 1321|     32|  const SequenceHeader *const seq_params = cm->seq_params;
 1322|     32|  const int byte_alignment = cm->features.byte_alignment;
 1323|       |
 1324|     32|  YV12_BUFFER_CONFIG copy_buffer;
 1325|     32|  memset(&copy_buffer, 0, sizeof(copy_buffer));
 1326|       |
 1327|     32|  YV12_BUFFER_CONFIG *const frame_to_show = &cm->cur_frame->buf;
 1328|       |
 1329|     32|  const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3);
  ------------------
  |  |   69|     32|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
 1330|     32|  if (aom_alloc_frame_buffer(
  ------------------
  |  Branch (1330:7): [True: 0, False: 32]
  ------------------
 1331|     32|          &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x,
 1332|     32|          seq_params->subsampling_y, seq_params->use_highbitdepth,
 1333|     32|          AOM_BORDER_IN_PIXELS, byte_alignment, false, 0))
  ------------------
  |  |   32|     32|#define AOM_BORDER_IN_PIXELS 288
  ------------------
 1334|      0|    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
 1335|      0|                       "Failed to allocate copy buffer for superres upscaling");
 1336|       |
 1337|       |  // Copy function assumes the frames are the same size.
 1338|       |  // Note that it does not copy YV12_BUFFER_CONFIG config data.
 1339|     32|  aom_yv12_copy_frame(frame_to_show, &copy_buffer, num_planes);
  ------------------
  |  |   37|     32|#define aom_yv12_copy_frame aom_yv12_copy_frame_c
  ------------------
 1340|       |
 1341|     32|  assert(copy_buffer.y_crop_width == aligned_width);
 1342|     32|  assert(copy_buffer.y_crop_height == cm->height);
 1343|       |
 1344|       |  // Realloc the current frame buffer at a higher resolution in place.
 1345|     32|  if (pool != NULL) {
  ------------------
  |  Branch (1345:7): [True: 32, False: 0]
  ------------------
 1346|       |    // Use callbacks if on the decoder.
 1347|     32|    aom_codec_frame_buffer_t *fb = &cm->cur_frame->raw_frame_buffer;
 1348|     32|    aom_release_frame_buffer_cb_fn_t release_fb_cb = pool->release_fb_cb;
 1349|     32|    aom_get_frame_buffer_cb_fn_t cb = pool->get_fb_cb;
 1350|     32|    void *cb_priv = pool->cb_priv;
 1351|       |
 1352|     32|    lock_buffer_pool(pool);
 1353|       |    // Realloc with callback does not release the frame buffer - release first.
 1354|     32|    if (release_fb_cb(cb_priv, fb)) {
  ------------------
  |  Branch (1354:9): [True: 0, False: 32]
  ------------------
 1355|      0|      unlock_buffer_pool(pool);
 1356|      0|      aom_internal_error(
 1357|      0|          cm->error, AOM_CODEC_MEM_ERROR,
 1358|      0|          "Failed to free current frame buffer before superres upscaling");
 1359|      0|    }
 1360|       |    // aom_realloc_frame_buffer() leaves config data for frame_to_show intact
 1361|     32|    if (aom_realloc_frame_buffer(
  ------------------
  |  Branch (1361:9): [True: 0, False: 32]
  ------------------
 1362|     32|            frame_to_show, cm->superres_upscaled_width,
 1363|     32|            cm->superres_upscaled_height, seq_params->subsampling_x,
 1364|     32|            seq_params->subsampling_y, seq_params->use_highbitdepth,
 1365|     32|            AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv,
  ------------------
  |  |   32|     32|#define AOM_BORDER_IN_PIXELS 288
  ------------------
 1366|     32|            alloc_pyramid, 0)) {
 1367|      0|      unlock_buffer_pool(pool);
 1368|      0|      aom_internal_error(
 1369|      0|          cm->error, AOM_CODEC_MEM_ERROR,
 1370|      0|          "Failed to allocate current frame buffer for superres upscaling");
 1371|      0|    }
 1372|     32|    unlock_buffer_pool(pool);
 1373|     32|  } else {
 1374|       |    // Make a copy of the config data for frame_to_show in copy_buffer
 1375|      0|    copy_buffer_config(frame_to_show, &copy_buffer);
 1376|       |
 1377|       |    // Don't use callbacks on the encoder.
 1378|       |    // aom_alloc_frame_buffer() clears the config data for frame_to_show
 1379|      0|    if (aom_alloc_frame_buffer(
  ------------------
  |  Branch (1379:9): [True: 0, False: 0]
  ------------------
 1380|      0|            frame_to_show, cm->superres_upscaled_width,
 1381|      0|            cm->superres_upscaled_height, seq_params->subsampling_x,
 1382|      0|            seq_params->subsampling_y, seq_params->use_highbitdepth,
 1383|      0|            AOM_BORDER_IN_PIXELS, byte_alignment, alloc_pyramid, 0))
  ------------------
  |  |   32|      0|#define AOM_BORDER_IN_PIXELS 288
  ------------------
 1384|      0|      aom_internal_error(
 1385|      0|          cm->error, AOM_CODEC_MEM_ERROR,
 1386|      0|          "Failed to reallocate current frame buffer for superres upscaling");
 1387|       |
 1388|       |    // Restore config data back to frame_to_show
 1389|      0|    copy_buffer_config(&copy_buffer, frame_to_show);
 1390|      0|  }
 1391|       |  // TODO(afergs): verify frame_to_show is correct after realloc
 1392|       |  //               encoder:
 1393|       |  //               decoder:
 1394|       |
 1395|     32|  assert(frame_to_show->y_crop_width == cm->superres_upscaled_width);
 1396|     32|  assert(frame_to_show->y_crop_height == cm->superres_upscaled_height);
 1397|       |
 1398|       |  // Scale up and back into frame_to_show.
 1399|     32|  assert(frame_to_show->y_crop_width != cm->width);
 1400|     32|  upscale_normative_and_extend_frame(cm, &copy_buffer, frame_to_show);
 1401|       |
 1402|       |  // Free the copy buffer
 1403|     32|  aom_free_frame_buffer(&copy_buffer);
 1404|     32|}
resize.c:get_upscale_convolve_x0:
  329|    600|                                       int32_t x_step_qn) {
  330|    600|  const int err = out_length * x_step_qn - (in_length << RS_SCALE_SUBPEL_BITS);
  ------------------
  |  |   36|    600|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
  331|    600|  const int32_t x0 =
  332|    600|      (-((out_length - in_length) << (RS_SCALE_SUBPEL_BITS - 1)) +
  ------------------
  |  |   36|    600|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
  333|    600|       out_length / 2) /
  334|    600|          out_length +
  335|    600|      RS_SCALE_EXTRA_OFF - err / 2;
  ------------------
  |  |   39|    600|#define RS_SCALE_EXTRA_OFF (1 << (RS_SCALE_EXTRA_BITS - 1))
  |  |  ------------------
  |  |  |  |   38|    600|#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|    600|#define RS_SCALE_SUBPEL_BITS 14
  |  |  |  |  ------------------
  |  |  |  |               #define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   34|    600|#define RS_SUBPEL_BITS 6
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  336|    600|  return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK);
  ------------------
  |  |   37|    600|#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   36|    600|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  ------------------
  337|    600|}
resize.c:upscale_normative_rect:
  617|    600|                                   int pad_right) {
  618|    600|  assert(width > 0);
  619|    600|  assert(height > 0);
  620|    600|  assert(width2 > 0);
  621|    600|  assert(height2 > 0);
  622|    600|  assert(height2 == height);
  623|       |
  624|       |  // Extend the left/right pixels of the tile column if needed
  625|       |  // (either because we can't sample from other tiles, or because we're at
  626|       |  // a frame edge).
  627|       |  // Save the overwritten pixels into tmp_left and tmp_right.
  628|       |  // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra
  629|       |  // column of border pixels compared to what we'd naively think.
  630|    600|  const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1;
  ------------------
  |  |  101|    600|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
  631|    600|  uint8_t *tmp_left =
  632|    600|      NULL;  // Silence spurious "may be used uninitialized" warnings
  633|    600|  uint8_t *tmp_right = NULL;
  634|    600|  uint8_t *const in_tl = (uint8_t *)(input - border_cols);  // Cast off 'const'
  635|    600|  uint8_t *const in_tr = (uint8_t *)(input + width);
  636|    600|  if (pad_left) {
  ------------------
  |  Branch (636:7): [True: 600, False: 0]
  ------------------
  637|    600|    tmp_left = (uint8_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height);
  638|    600|    if (!tmp_left) return false;
  ------------------
  |  Branch (638:9): [True: 0, False: 600]
  ------------------
  639|  18.1k|    for (int i = 0; i < height; i++) {
  ------------------
  |  Branch (639:21): [True: 17.5k, False: 600]
  ------------------
  640|  17.5k|      memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_cols);
  641|  17.5k|      memset(in_tl + i * in_stride, input[i * in_stride], border_cols);
  642|  17.5k|    }
  643|    600|  }
  644|    600|  if (pad_right) {
  ------------------
  |  Branch (644:7): [True: 600, False: 0]
  ------------------
  645|    600|    tmp_right =
  646|    600|        (uint8_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height);
  647|    600|    if (!tmp_right) {
  ------------------
  |  Branch (647:9): [True: 0, False: 600]
  ------------------
  648|      0|      aom_free(tmp_left);
  649|      0|      return false;
  650|      0|    }
  651|  18.1k|    for (int i = 0; i < height; i++) {
  ------------------
  |  Branch (651:21): [True: 17.5k, False: 600]
  ------------------
  652|  17.5k|      memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_cols);
  653|  17.5k|      memset(in_tr + i * in_stride, input[i * in_stride + width - 1],
  654|  17.5k|             border_cols);
  655|  17.5k|    }
  656|    600|  }
  657|       |
  658|    600|  av1_convolve_horiz_rs(input - 1, in_stride, output, out_stride, width2,
  659|    600|                        height2, &av1_resize_filter_normative[0][0], x0_qn,
  660|    600|                        x_step_qn);
  661|       |
  662|       |  // Restore the left/right border pixels
  663|    600|  if (pad_left) {
  ------------------
  |  Branch (663:7): [True: 600, False: 0]
  ------------------
  664|  18.1k|    for (int i = 0; i < height; i++) {
  ------------------
  |  Branch (664:21): [True: 17.5k, False: 600]
  ------------------
  665|  17.5k|      memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_cols);
  666|  17.5k|    }
  667|    600|    aom_free(tmp_left);
  668|    600|  }
  669|    600|  if (pad_right) {
  ------------------
  |  Branch (669:7): [True: 600, False: 0]
  ------------------
  670|  18.1k|    for (int i = 0; i < height; i++) {
  ------------------
  |  Branch (670:21): [True: 17.5k, False: 600]
  ------------------
  671|  17.5k|      memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_cols);
  672|  17.5k|    }
  673|    600|    aom_free(tmp_right);
  674|    600|  }
  675|       |  return true;
  676|    600|}
resize.c:calculate_scaled_size_helper:
 1273|    185|static void calculate_scaled_size_helper(int *dim, int denom) {
 1274|    185|  if (denom != SCALE_NUMERATOR) {
  ------------------
  |  |   22|    185|#define SCALE_NUMERATOR 8
  ------------------
  |  Branch (1274:7): [True: 185, False: 0]
  ------------------
 1275|       |    // We need to ensure the constraint in "Appendix A" of the spec:
 1276|       |    // * FrameWidth is greater than or equal to 16
 1277|       |    // * FrameHeight is greater than or equal to 16
 1278|       |    // For this, we clamp the downscaled dimension to at least 16. One
 1279|       |    // exception: if original dimension itself was < 16, then we keep the
 1280|       |    // downscaled dimension to be same as the original, to ensure that resizing
 1281|       |    // is valid.
 1282|    185|    const int min_dim = AOMMIN(16, *dim);
  ------------------
  |  |   34|    185|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 174, False: 11]
  |  |  ------------------
  ------------------
 1283|       |    // Use this version if we need *dim to be even
 1284|       |    // *width = (*width * SCALE_NUMERATOR + denom) / (2 * denom);
 1285|       |    // *width <<= 1;
 1286|    185|    *dim = (*dim * SCALE_NUMERATOR + denom / 2) / (denom);
  ------------------
  |  |   22|    185|#define SCALE_NUMERATOR 8
  ------------------
 1287|    185|    *dim = AOMMAX(*dim, min_dim);
  ------------------
  |  |   35|    185|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 172, False: 13]
  |  |  ------------------
  ------------------
 1288|    185|  }
 1289|    185|}
resize.c:upscale_normative_and_extend_frame:
 1191|     32|                                               YV12_BUFFER_CONFIG *dst) {
 1192|     32|  const int num_planes = av1_num_planes(cm);
 1193|    128|  for (int i = 0; i < num_planes; ++i) {
  ------------------
  |  Branch (1193:19): [True: 96, False: 32]
  ------------------
 1194|     96|    const int is_uv = (i > 0);
 1195|     96|    av1_upscale_normative_rows(cm, src->buffers[i], src->strides[is_uv],
 1196|     96|                               dst->buffers[i], dst->strides[is_uv], i,
 1197|     96|                               src->crop_heights[is_uv]);
 1198|     96|  }
 1199|       |
 1200|     32|  aom_extend_frame_borders(dst, num_planes);
  ------------------
  |  |   31|     32|#define aom_extend_frame_borders aom_extend_frame_borders_c
  ------------------
 1201|     32|}

decodeframe.c:av1_superres_scaled:
   66|  26.4k|static inline int av1_superres_scaled(const AV1_COMMON *cm) {
   67|       |  // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling
   68|       |  // required even though cm->superres_scale_denominator != SCALE_NUMERATOR.
   69|       |  // So, the following check is more accurate.
   70|  26.4k|  return (cm->width != cm->superres_upscaled_width);
   71|  26.4k|}
resize.c:av1_superres_scaled:
   66|     32|static inline int av1_superres_scaled(const AV1_COMMON *cm) {
   67|       |  // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling
   68|       |  // required even though cm->superres_scale_denominator != SCALE_NUMERATOR.
   69|       |  // So, the following check is more accurate.
   70|     32|  return (cm->width != cm->superres_upscaled_width);
   71|     32|}
restoration.c:av1_superres_scaled:
   66|  58.6k|static inline int av1_superres_scaled(const AV1_COMMON *cm) {
   67|       |  // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling
   68|       |  // required even though cm->superres_scale_denominator != SCALE_NUMERATOR.
   69|       |  // So, the following check is more accurate.
   70|  58.6k|  return (cm->width != cm->superres_upscaled_width);
   71|  58.6k|}
tile_common.c:av1_superres_scaled:
   66|    437|static inline int av1_superres_scaled(const AV1_COMMON *cm) {
   67|       |  // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling
   68|       |  // required even though cm->superres_scale_denominator != SCALE_NUMERATOR.
   69|       |  // So, the following check is more accurate.
   70|    437|  return (cm->width != cm->superres_upscaled_width);
   71|    437|}

av1_get_upsampled_plane_size:
   48|  14.3k|                                  int *plane_h) {
   49|  14.3k|  int ss_x = is_uv && cm->seq_params->subsampling_x;
  ------------------
  |  Branch (49:14): [True: 8.71k, False: 5.61k]
  |  Branch (49:23): [True: 1.84k, False: 6.87k]
  ------------------
   50|  14.3k|  int ss_y = is_uv && cm->seq_params->subsampling_y;
  ------------------
  |  Branch (50:14): [True: 8.71k, False: 5.61k]
  |  Branch (50:23): [True: 464, False: 8.25k]
  ------------------
   51|  14.3k|  *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
  ------------------
  |  |   41|  14.3k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   52|  14.3k|  *plane_h = ROUND_POWER_OF_TWO(cm->height, ss_y);
  ------------------
  |  |   41|  14.3k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   53|  14.3k|}
av1_lr_count_units:
   63|  20.7k|int av1_lr_count_units(int unit_size, int plane_size) {
   64|  20.7k|  return AOMMAX((plane_size + (unit_size >> 1)) / unit_size, 1);
  ------------------
  |  |   35|  20.7k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 6.32k, False: 14.4k]
  |  |  ------------------
  ------------------
   65|  20.7k|}
av1_alloc_restoration_struct:
   68|  8.90k|                                  int is_uv) {
   69|  8.90k|  int plane_w, plane_h;
   70|  8.90k|  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
   71|       |
   72|  8.90k|  const int unit_size = rsi->restoration_unit_size;
   73|  8.90k|  const int horz_units = av1_lr_count_units(unit_size, plane_w);
   74|  8.90k|  const int vert_units = av1_lr_count_units(unit_size, plane_h);
   75|       |
   76|  8.90k|  rsi->num_rest_units = horz_units * vert_units;
   77|  8.90k|  rsi->horz_units = horz_units;
   78|  8.90k|  rsi->vert_units = vert_units;
   79|       |
   80|  8.90k|  aom_free(rsi->unit_info);
   81|  8.90k|  CHECK_MEM_ERROR(cm, rsi->unit_info,
  ------------------
  |  |   51|  8.90k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  8.90k|  do {                                                    \
  |  |  |  |   69|  8.90k|    lval = (expr);                                        \
  |  |  |  |   70|  8.90k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 8.90k]
  |  |  |  |  ------------------
  |  |  |  |   71|  8.90k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  8.90k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 8.90k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   82|  8.90k|                  (RestorationUnitInfo *)aom_memalign(
   83|  8.90k|                      16, sizeof(*rsi->unit_info) * rsi->num_rest_units));
   84|  8.90k|}
av1_free_restoration_struct:
   86|  53.8k|void av1_free_restoration_struct(RestorationInfo *rst_info) {
   87|  53.8k|  aom_free(rst_info->unit_info);
   88|       |  rst_info->unit_info = NULL;
   89|  53.8k|}
av1_loop_restoration_precal:
  116|  17.9k|void av1_loop_restoration_precal(void) {
  117|       |#if 0
  118|       |  GenSgrprojVtable();
  119|       |#endif
  120|  17.9k|}
av1_extend_frame:
  173|  1.64k|                      int border_horz, int border_vert, int highbd) {
  174|  1.64k|#if CONFIG_AV1_HIGHBITDEPTH
  175|  1.64k|  if (highbd) {
  ------------------
  |  Branch (175:7): [True: 710, False: 932]
  ------------------
  176|    710|    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
  ------------------
  |  |   75|    710|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  177|    710|                        border_horz, border_vert);
  178|    710|    return;
  179|    710|  }
  180|    932|#endif
  181|    932|  (void)highbd;
  182|    932|  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
  183|    932|}
av1_decode_xq:
  584|  2.65k|void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
  585|  2.65k|  if (params->r[0] == 0) {
  ------------------
  |  Branch (585:7): [True: 987, False: 1.66k]
  ------------------
  586|    987|    xq[0] = 0;
  587|    987|    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
  ------------------
  |  |   99|    987|#define SGRPROJ_PRJ_BITS 7
  ------------------
  588|  1.66k|  } else if (params->r[1] == 0) {
  ------------------
  |  Branch (588:14): [True: 640, False: 1.02k]
  ------------------
  589|    640|    xq[0] = xqd[0];
  590|    640|    xq[1] = 0;
  591|  1.02k|  } else {
  592|  1.02k|    xq[0] = xqd[0];
  593|  1.02k|    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
  ------------------
  |  |   99|  1.02k|#define SGRPROJ_PRJ_BITS 7
  ------------------
  594|  1.02k|  }
  595|  2.65k|}
av1_loop_restoration_filter_unit:
  992|  5.81k|    int optimized_lr, struct aom_internal_error_info *error_info) {
  993|  5.81k|  RestorationType unit_rtype = rui->restoration_type;
  994|       |
  995|  5.81k|  int unit_h = limits->v_end - limits->v_start;
  996|  5.81k|  int unit_w = limits->h_end - limits->h_start;
  997|  5.81k|  uint8_t *data8_tl =
  998|  5.81k|      data8 + limits->v_start * (ptrdiff_t)stride + limits->h_start;
  999|  5.81k|  uint8_t *dst8_tl =
 1000|  5.81k|      dst8 + limits->v_start * (ptrdiff_t)dst_stride + limits->h_start;
 1001|       |
 1002|  5.81k|  if (unit_rtype == RESTORE_NONE) {
  ------------------
  |  Branch (1002:7): [True: 1.90k, False: 3.90k]
  ------------------
 1003|  1.90k|    copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride,
 1004|  1.90k|                   highbd);
 1005|  1.90k|    return;
 1006|  1.90k|  }
 1007|       |
 1008|  3.90k|  const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
 1009|  3.90k|  assert(filter_idx < NUM_STRIPE_FILTERS);
 1010|  3.90k|  const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
 1011|       |
 1012|  3.90k|  const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
  ------------------
  |  |   34|  3.90k|#define RESTORATION_PROC_UNIT_SIZE 64
  ------------------
 1013|       |
 1014|       |  // Filter the whole image one stripe at a time
 1015|  3.90k|  RestorationTileLimits remaining_stripes = *limits;
 1016|  3.90k|  int i = 0;
 1017|  8.91k|  while (i < unit_h) {
  ------------------
  |  Branch (1017:10): [True: 5.00k, False: 3.90k]
  ------------------
 1018|  5.00k|    int copy_above, copy_below;
 1019|  5.00k|    remaining_stripes.v_start = limits->v_start + i;
 1020|       |
 1021|  5.00k|    get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y,
 1022|  5.00k|                             &copy_above, &copy_below);
 1023|       |
 1024|  5.00k|    const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  ------------------
  |  |   34|  5.00k|#define RESTORATION_PROC_UNIT_SIZE 64
  ------------------
 1025|  5.00k|    const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
  ------------------
  |  |   37|  5.00k|#define RESTORATION_UNIT_OFFSET 8
  ------------------
 1026|       |
 1027|       |    // Work out where this stripe's boundaries are within
 1028|       |    // rsb->stripe_boundary_{above,below}
 1029|  5.00k|    const int frame_stripe =
 1030|  5.00k|        (remaining_stripes.v_start + runit_offset) / full_stripe_height;
 1031|  5.00k|    const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
  ------------------
  |  |   66|  5.00k|#define RESTORATION_CTX_VERT 2
  ------------------
 1032|       |
 1033|       |    // Calculate this stripe's height, based on two rules:
 1034|       |    // * The topmost stripe in the frame is 8 luma pixels shorter than usual.
 1035|       |    // * We can't extend past the end of the current restoration unit
 1036|  5.00k|    const int nominal_stripe_height =
 1037|  5.00k|        full_stripe_height - ((frame_stripe == 0) ? runit_offset : 0);
  ------------------
  |  Branch (1037:31): [True: 1.61k, False: 3.39k]
  ------------------
 1038|  5.00k|    const int h = AOMMIN(nominal_stripe_height,
  ------------------
  |  |   34|  5.00k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.08k, False: 3.91k]
  |  |  ------------------
  ------------------
 1039|  5.00k|                         remaining_stripes.v_end - remaining_stripes.v_start);
 1040|       |
 1041|  5.00k|    setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
 1042|  5.00k|                                     h, data8, stride, rlbs, copy_above,
 1043|  5.00k|                                     copy_below, optimized_lr);
 1044|       |
 1045|  5.00k|    stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
 1046|  5.00k|                  dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth,
 1047|  5.00k|                  error_info);
 1048|       |
 1049|  5.00k|    restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
 1050|  5.00k|                                       data8, stride, copy_above, copy_below,
 1051|  5.00k|                                       optimized_lr);
 1052|       |
 1053|  5.00k|    i += h;
 1054|  5.00k|  }
 1055|  3.90k|}
av1_loop_restoration_filter_frame_init:
 1074|    783|                                            int num_planes) {
 1075|    783|  const SequenceHeader *const seq_params = cm->seq_params;
 1076|    783|  const int bit_depth = seq_params->bit_depth;
 1077|    783|  const int highbd = seq_params->use_highbitdepth;
 1078|    783|  lr_ctxt->dst = &cm->rst_frame;
 1079|       |
 1080|    783|  const int frame_width = frame->crop_widths[0];
 1081|    783|  const int frame_height = frame->crop_heights[0];
 1082|    783|  if (aom_realloc_frame_buffer(
  ------------------
  |  Branch (1082:7): [True: 0, False: 783]
  ------------------
 1083|    783|          lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
 1084|    783|          seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
  ------------------
  |  |   30|    783|#define AOM_RESTORATION_FRAME_BORDER 32
  ------------------
 1085|    783|          cm->features.byte_alignment, NULL, NULL, NULL, false,
 1086|    783|          0) != AOM_CODEC_OK)
 1087|      0|    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
 1088|      0|                       "Failed to allocate restoration dst buffer");
 1089|       |
 1090|    783|  lr_ctxt->on_rest_unit = filter_frame_on_unit;
 1091|    783|  lr_ctxt->frame = frame;
 1092|  2.90k|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (1092:23): [True: 2.12k, False: 783]
  ------------------
 1093|  2.12k|    RestorationInfo *rsi = &cm->rst_info[plane];
 1094|  2.12k|    RestorationType rtype = rsi->frame_restoration_type;
 1095|  2.12k|    rsi->optimized_lr = optimized_lr;
 1096|  2.12k|    lr_ctxt->ctxt[plane].rsi = rsi;
 1097|       |
 1098|  2.12k|    if (rtype == RESTORE_NONE) {
  ------------------
  |  Branch (1098:9): [True: 483, False: 1.64k]
  ------------------
 1099|    483|      continue;
 1100|    483|    }
 1101|       |
 1102|  1.64k|    const int is_uv = plane > 0;
 1103|  1.64k|    int plane_w, plane_h;
 1104|  1.64k|    av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
 1105|  1.64k|    assert(plane_w == frame->crop_widths[is_uv]);
 1106|  1.64k|    assert(plane_h == frame->crop_heights[is_uv]);
 1107|       |
 1108|  1.64k|    av1_extend_frame(frame->buffers[plane], plane_w, plane_h,
 1109|  1.64k|                     frame->strides[is_uv], RESTORATION_BORDER,
  ------------------
  |  |   62|  1.64k|#define RESTORATION_BORDER 3
  ------------------
 1110|  1.64k|                     RESTORATION_BORDER, highbd);
  ------------------
  |  |   62|  1.64k|#define RESTORATION_BORDER 3
  ------------------
 1111|       |
 1112|  1.64k|    FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
 1113|  1.64k|    lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
  ------------------
  |  Branch (1113:27): [True: 1.02k, False: 613]
  |  Branch (1113:36): [True: 158, False: 871]
  ------------------
 1114|  1.64k|    lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
  ------------------
  |  Branch (1114:27): [True: 1.02k, False: 613]
  |  Branch (1114:36): [True: 8, False: 1.02k]
  ------------------
 1115|  1.64k|    lr_plane_ctxt->plane_w = plane_w;
 1116|  1.64k|    lr_plane_ctxt->plane_h = plane_h;
 1117|  1.64k|    lr_plane_ctxt->highbd = highbd;
 1118|  1.64k|    lr_plane_ctxt->bit_depth = bit_depth;
 1119|  1.64k|    lr_plane_ctxt->data8 = frame->buffers[plane];
 1120|  1.64k|    lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
 1121|  1.64k|    lr_plane_ctxt->data_stride = frame->strides[is_uv];
 1122|  1.64k|    lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
 1123|  1.64k|  }
 1124|    783|}
av1_loop_restoration_filter_frame:
 1199|     65|                                       void *lr_ctxt) {
 1200|     65|  assert(!cm->features.all_lossless);
 1201|     65|  const int num_planes = av1_num_planes(cm);
 1202|       |
 1203|     65|  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
 1204|       |
 1205|     65|  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
 1206|     65|                                         optimized_lr, num_planes);
 1207|       |
 1208|     65|  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
 1209|       |
 1210|     65|  loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
 1211|     65|}
av1_foreach_rest_unit_in_row:
 1219|  4.14k|    struct aom_internal_error_info *error_info) {
 1220|  4.14k|  const int ext_size = unit_size * 3 / 2;
 1221|  4.14k|  int x0 = 0, j = 0;
 1222|  9.95k|  while (x0 < plane_w) {
  ------------------
  |  Branch (1222:10): [True: 5.81k, False: 4.14k]
  ------------------
 1223|  5.81k|    int remaining_w = plane_w - x0;
 1224|  5.81k|    int w = (remaining_w < ext_size) ? remaining_w : unit_size;
  ------------------
  |  Branch (1224:13): [True: 4.14k, False: 1.67k]
  ------------------
 1225|       |
 1226|  5.81k|    limits->h_start = x0;
 1227|  5.81k|    limits->h_end = x0 + w;
 1228|  5.81k|    assert(limits->h_end <= plane_w);
 1229|       |
 1230|  5.81k|    const int unit_idx = row_number * hnum_rest_units + j;
 1231|       |
 1232|       |    // No sync for even numbered rows
 1233|       |    // For odd numbered rows, Loop Restoration of current block requires the LR
 1234|       |    // of top-right and bottom-right blocks to be completed
 1235|       |
 1236|       |    // top-right sync
 1237|  5.81k|    on_sync_read(lr_sync, row_number, j, plane);
 1238|  5.81k|    if ((row_number + 1) < vnum_rest_units)
  ------------------
  |  Branch (1238:9): [True: 3.64k, False: 2.17k]
  ------------------
 1239|       |      // bottom-right sync
 1240|  3.64k|      on_sync_read(lr_sync, row_number + 2, j, plane);
 1241|       |
 1242|  5.81k|#if CONFIG_MULTITHREAD
 1243|  5.81k|    if (lr_sync && lr_sync->num_workers > 1) {
  ------------------
  |  Branch (1243:9): [True: 5.38k, False: 435]
  |  Branch (1243:20): [True: 5.38k, False: 2]
  ------------------
 1244|  5.38k|      pthread_mutex_lock(lr_sync->job_mutex);
 1245|  5.38k|      const bool lr_mt_exit = lr_sync->lr_mt_exit;
 1246|  5.38k|      pthread_mutex_unlock(lr_sync->job_mutex);
 1247|       |      // Exit in case any worker has encountered an error.
 1248|  5.38k|      if (lr_mt_exit) return;
  ------------------
  |  Branch (1248:11): [True: 0, False: 5.38k]
  ------------------
 1249|  5.38k|    }
 1250|  5.81k|#endif
 1251|       |
 1252|  5.81k|    on_rest_unit(limits, unit_idx, priv, tmpbuf, rlbs, error_info);
 1253|       |
 1254|  5.81k|    on_sync_write(lr_sync, row_number, j, hnum_rest_units, plane);
 1255|       |
 1256|  5.81k|    x0 += w;
 1257|  5.81k|    ++j;
 1258|  5.81k|  }
 1259|  4.14k|}
av1_lr_sync_read_dummy:
 1261|  6.05k|void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
 1262|  6.05k|  (void)lr_sync;
 1263|  6.05k|  (void)r;
 1264|  6.05k|  (void)c;
 1265|  6.05k|  (void)plane;
 1266|  6.05k|}
av1_lr_sync_write_dummy:
 1269|  2.26k|                             const int sb_cols, int plane) {
 1270|  2.26k|  (void)lr_sync;
 1271|  2.26k|  (void)r;
 1272|  2.26k|  (void)c;
 1273|  2.26k|  (void)sb_cols;
 1274|  2.26k|  (void)plane;
 1275|  2.26k|}
av1_loop_restoration_corners_in_sb:
 1280|   232k|                                       int *rrow1) {
 1281|   232k|  assert(rcol0 && rcol1 && rrow0 && rrow1);
 1282|       |
 1283|   232k|  if (bsize != cm->seq_params->sb_size) return 0;
  ------------------
  |  Branch (1283:7): [True: 208k, False: 24.6k]
  ------------------
 1284|       |
 1285|   232k|  assert(!cm->features.all_lossless);
 1286|       |
 1287|  24.6k|  const int is_uv = plane > 0;
 1288|       |
 1289|       |  // Compute the mi-unit corners of the superblock
 1290|  24.6k|  const int mi_row0 = mi_row;
 1291|  24.6k|  const int mi_col0 = mi_col;
 1292|  24.6k|  const int mi_row1 = mi_row0 + mi_size_high[bsize];
 1293|  24.6k|  const int mi_col1 = mi_col0 + mi_size_wide[bsize];
 1294|       |
 1295|  24.6k|  const RestorationInfo *rsi = &cm->rst_info[plane];
 1296|  24.6k|  const int size = rsi->restoration_unit_size;
 1297|  24.6k|  const int horz_units = rsi->horz_units;
 1298|  24.6k|  const int vert_units = rsi->vert_units;
 1299|       |
 1300|       |  // The size of an MI-unit on this plane of the image
 1301|  24.6k|  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  ------------------
  |  Branch (1301:20): [True: 14.8k, False: 9.82k]
  |  Branch (1301:29): [True: 1.19k, False: 13.6k]
  ------------------
 1302|  24.6k|  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  ------------------
  |  Branch (1302:20): [True: 14.8k, False: 9.82k]
  |  Branch (1302:29): [True: 520, False: 14.3k]
  ------------------
 1303|  24.6k|  const int mi_size_x = MI_SIZE >> ss_x;
  ------------------
  |  |   40|  24.6k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  24.6k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1304|  24.6k|  const int mi_size_y = MI_SIZE >> ss_y;
  ------------------
  |  |   40|  24.6k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  24.6k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1305|       |
 1306|       |  // Write m for the relative mi column or row, D for the superres denominator
 1307|       |  // and N for the superres numerator. If u is the upscaled pixel offset then
 1308|       |  // we can write the downscaled pixel offset in two ways as:
 1309|       |  //
 1310|       |  //   MI_SIZE * m = N / D u
 1311|       |  //
 1312|       |  // from which we get u = D * MI_SIZE * m / N
 1313|  24.6k|  const int mi_to_num_x = av1_superres_scaled(cm)
  ------------------
  |  Branch (1313:27): [True: 917, False: 23.7k]
  ------------------
 1314|  24.6k|                              ? mi_size_x * cm->superres_scale_denominator
 1315|  24.6k|                              : mi_size_x;
 1316|  24.6k|  const int mi_to_num_y = mi_size_y;
 1317|  24.6k|  const int denom_x = av1_superres_scaled(cm) ? size * SCALE_NUMERATOR : size;
  ------------------
  |  |   22|    917|#define SCALE_NUMERATOR 8
  ------------------
  |  Branch (1317:23): [True: 917, False: 23.7k]
  ------------------
 1318|  24.6k|  const int denom_y = size;
 1319|       |
 1320|  24.6k|  const int rnd_x = denom_x - 1;
 1321|  24.6k|  const int rnd_y = denom_y - 1;
 1322|       |
 1323|       |  // rcol0/rrow0 should be the first column/row of restoration units that
 1324|       |  // doesn't start left/below of mi_col/mi_row. For this calculation, we need
 1325|       |  // to round up the division (if the sb starts at runit column 10.1, the first
 1326|       |  // matching runit has column index 11)
 1327|  24.6k|  *rcol0 = (mi_col0 * mi_to_num_x + rnd_x) / denom_x;
 1328|  24.6k|  *rrow0 = (mi_row0 * mi_to_num_y + rnd_y) / denom_y;
 1329|       |
 1330|       |  // rel_col1/rel_row1 is the equivalent calculation, but for the superblock
 1331|       |  // below-right. If we're at the bottom or right of the frame, this restoration
 1332|       |  // unit might not exist, in which case we'll clamp accordingly.
 1333|  24.6k|  *rcol1 = AOMMIN((mi_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
  ------------------
  |  |   34|  24.6k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 7.71k, False: 16.9k]
  |  |  ------------------
  ------------------
 1334|  24.6k|  *rrow1 = AOMMIN((mi_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
  ------------------
  |  |   34|  24.6k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 10.7k, False: 13.9k]
  |  |  ------------------
  ------------------
 1335|       |
 1336|  24.6k|  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
  ------------------
  |  Branch (1336:10): [True: 21.2k, False: 3.39k]
  |  Branch (1336:29): [True: 18.1k, False: 3.14k]
  ------------------
 1337|   232k|}
av1_loop_restoration_save_boundary_lines:
 1498|  1.36k|                                              AV1_COMMON *cm, int after_cdef) {
 1499|  1.36k|  const int num_planes = av1_num_planes(cm);
 1500|  1.36k|  const int use_highbd = cm->seq_params->use_highbitdepth;
 1501|  4.99k|  for (int p = 0; p < num_planes; ++p) {
  ------------------
  |  Branch (1501:19): [True: 3.63k, False: 1.36k]
  ------------------
 1502|  3.63k|    save_boundary_lines(frame, use_highbd, p, cm, after_cdef);
 1503|  3.63k|  }
 1504|  1.36k|}
restoration.c:extend_frame_highbd:
  145|    710|                                int border_vert) {
  146|    710|  uint16_t *data_p;
  147|    710|  int i, j;
  148|   148k|  for (i = 0; i < height; ++i) {
  ------------------
  |  Branch (148:15): [True: 148k, False: 710]
  ------------------
  149|   148k|    data_p = data + i * stride;
  150|   593k|    for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
  ------------------
  |  Branch (150:28): [True: 444k, False: 148k]
  ------------------
  151|   593k|    for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1];
  ------------------
  |  Branch (151:21): [True: 444k, False: 148k]
  ------------------
  152|   148k|  }
  153|    710|  data_p = data - border_horz;
  154|  2.84k|  for (i = -border_vert; i < 0; ++i) {
  ------------------
  |  Branch (154:26): [True: 2.13k, False: 710]
  ------------------
  155|  2.13k|    memcpy(data_p + i * stride, data_p,
  156|  2.13k|           (width + 2 * border_horz) * sizeof(uint16_t));
  157|  2.13k|  }
  158|  2.84k|  for (i = height; i < height + border_vert; ++i) {
  ------------------
  |  Branch (158:20): [True: 2.13k, False: 710]
  ------------------
  159|  2.13k|    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
  160|  2.13k|           (width + 2 * border_horz) * sizeof(uint16_t));
  161|  2.13k|  }
  162|    710|}
restoration.c:extend_frame_lowbd:
  124|    932|                               int border_vert) {
  125|    932|  uint8_t *data_p;
  126|    932|  int i;
  127|   108k|  for (i = 0; i < height; ++i) {
  ------------------
  |  Branch (127:15): [True: 107k, False: 932]
  ------------------
  128|   107k|    data_p = data + i * stride;
  129|   107k|    memset(data_p - border_horz, data_p[0], border_horz);
  130|   107k|    memset(data_p + width, data_p[width - 1], border_horz);
  131|   107k|  }
  132|    932|  data_p = data - border_horz;
  133|  3.72k|  for (i = -border_vert; i < 0; ++i) {
  ------------------
  |  Branch (133:26): [True: 2.79k, False: 932]
  ------------------
  134|  2.79k|    memcpy(data_p + i * stride, data_p, width + 2 * border_horz);
  135|  2.79k|  }
  136|  3.72k|  for (i = height; i < height + border_vert; ++i) {
  ------------------
  |  Branch (136:20): [True: 2.79k, False: 932]
  ------------------
  137|  2.79k|    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
  138|  2.79k|           width + 2 * border_horz);
  139|  2.79k|  }
  140|    932|}
restoration.c:copy_rest_unit:
  193|  1.90k|                           int highbd) {
  194|  1.90k|#if CONFIG_AV1_HIGHBITDEPTH
  195|  1.90k|  if (highbd) {
  ------------------
  |  Branch (195:7): [True: 870, False: 1.03k]
  ------------------
  196|    870|    copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
  ------------------
  |  |   75|    870|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  197|    870|                          CONVERT_TO_SHORTPTR(dst), dst_stride);
  ------------------
  |  |   75|    870|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  198|    870|    return;
  199|    870|  }
  200|  1.03k|#endif
  201|  1.03k|  (void)highbd;
  202|  1.03k|  copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
  203|  1.03k|}
restoration.c:copy_rest_unit_highbd:
  166|    870|                                  int dst_stride) {
  167|  53.1k|  for (int i = 0; i < height; ++i)
  ------------------
  |  Branch (167:19): [True: 52.2k, False: 870]
  ------------------
  168|  52.2k|    memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
  169|    870|}
restoration.c:copy_rest_unit_lowbd:
  186|  1.03k|                                 int src_stride, uint8_t *dst, int dst_stride) {
  187|  63.8k|  for (int i = 0; i < height; ++i)
  ------------------
  |  Branch (187:19): [True: 62.7k, False: 1.03k]
  ------------------
  188|  62.7k|    memcpy(dst + i * dst_stride, src + i * src_stride, width);
  189|  1.03k|}
restoration.c:wiener_filter_stripe:
  394|  1.35k|                                 struct aom_internal_error_info *error_info) {
  395|  1.35k|  (void)tmpbuf;
  396|  1.35k|  (void)bit_depth;
  397|  1.35k|  (void)error_info;
  398|  1.35k|  assert(bit_depth == 8);
  399|  1.35k|  const WienerConvolveParams conv_params = get_conv_params_wiener(8);
  400|       |
  401|  2.82k|  for (int j = 0; j < stripe_width; j += procunit_width) {
  ------------------
  |  Branch (401:19): [True: 1.47k, False: 1.35k]
  ------------------
  402|  1.47k|    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
  ------------------
  |  |   34|  1.47k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 121, False: 1.35k]
  |  |  ------------------
  ------------------
  403|  1.47k|    const uint8_t *src_p = src + j;
  404|  1.47k|    uint8_t *dst_p = dst + j;
  405|  1.47k|    av1_wiener_convolve_add_src(
  406|  1.47k|        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
  407|  1.47k|        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
  408|  1.47k|  }
  409|  1.35k|}
restoration.c:sgrproj_filter_stripe:
  909|  1.30k|                                  struct aom_internal_error_info *error_info) {
  910|  1.30k|  (void)bit_depth;
  911|  1.30k|  assert(bit_depth == 8);
  912|       |
  913|  3.01k|  for (int j = 0; j < stripe_width; j += procunit_width) {
  ------------------
  |  Branch (913:19): [True: 1.70k, False: 1.30k]
  ------------------
  914|  1.70k|    int w = AOMMIN(procunit_width, stripe_width - j);
  ------------------
  |  |   34|  1.70k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 402, False: 1.30k]
  |  |  ------------------
  ------------------
  915|  1.70k|    if (av1_apply_selfguided_restoration(
  ------------------
  |  Branch (915:9): [True: 0, False: 1.70k]
  ------------------
  916|  1.70k|            src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
  917|  1.70k|            rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth,
  918|  1.70k|            0) != 0) {
  919|      0|      aom_internal_error(
  920|      0|          error_info, AOM_CODEC_MEM_ERROR,
  921|      0|          "Error allocating buffer in av1_apply_selfguided_restoration");
  922|      0|    }
  923|  1.70k|  }
  924|  1.30k|}
restoration.c:wiener_filter_stripe_highbd:
  931|  1.42k|    struct aom_internal_error_info *error_info) {
  932|  1.42k|  (void)tmpbuf;
  933|  1.42k|  (void)error_info;
  934|  1.42k|  const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth);
  935|       |
  936|  2.84k|  for (int j = 0; j < stripe_width; j += procunit_width) {
  ------------------
  |  Branch (936:19): [True: 1.42k, False: 1.42k]
  ------------------
  937|  1.42k|    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
  ------------------
  |  |   34|  1.42k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 1.42k]
  |  |  ------------------
  ------------------
  938|  1.42k|    const uint8_t *src8_p = src8 + j;
  939|  1.42k|    uint8_t *dst8_p = dst8 + j;
  940|  1.42k|    av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
  941|  1.42k|                                       rui->wiener_info.hfilter, 16,
  942|  1.42k|                                       rui->wiener_info.vfilter, 16, w,
  943|  1.42k|                                       stripe_height, &conv_params, bit_depth);
  944|  1.42k|  }
  945|  1.42k|}
restoration.c:sgrproj_filter_stripe_highbd:
  951|    918|    struct aom_internal_error_info *error_info) {
  952|  1.86k|  for (int j = 0; j < stripe_width; j += procunit_width) {
  ------------------
  |  Branch (952:19): [True: 944, False: 918]
  ------------------
  953|    944|    int w = AOMMIN(procunit_width, stripe_width - j);
  ------------------
  |  |   34|    944|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 24, False: 920]
  |  |  ------------------
  ------------------
  954|    944|    if (av1_apply_selfguided_restoration(
  ------------------
  |  Branch (954:9): [True: 0, False: 944]
  ------------------
  955|    944|            src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
  956|    944|            rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth,
  957|    944|            1) != 0) {
  958|      0|      aom_internal_error(
  959|      0|          error_info, AOM_CODEC_MEM_ERROR,
  960|      0|          "Error allocating buffer in av1_apply_selfguided_restoration");
  961|      0|    }
  962|    944|  }
  963|    918|}
restoration.c:get_stripe_boundary_info:
  221|  5.00k|                                     int *copy_above, int *copy_below) {
  222|  5.00k|  (void)plane_w;
  223|       |
  224|  5.00k|  *copy_above = 1;
  225|  5.00k|  *copy_below = 1;
  226|       |
  227|  5.00k|  const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  ------------------
  |  |   34|  5.00k|#define RESTORATION_PROC_UNIT_SIZE 64
  ------------------
  228|  5.00k|  const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
  ------------------
  |  |   37|  5.00k|#define RESTORATION_UNIT_OFFSET 8
  ------------------
  229|       |
  230|  5.00k|  const int first_stripe_in_plane = (limits->v_start == 0);
  231|  5.00k|  const int this_stripe_height =
  232|  5.00k|      full_stripe_height - (first_stripe_in_plane ? runit_offset : 0);
  ------------------
  |  Branch (232:29): [True: 1.61k, False: 3.39k]
  ------------------
  233|  5.00k|  const int last_stripe_in_plane =
  234|  5.00k|      (limits->v_start + this_stripe_height >= plane_h);
  235|       |
  236|  5.00k|  if (first_stripe_in_plane) *copy_above = 0;
  ------------------
  |  Branch (236:7): [True: 1.61k, False: 3.39k]
  ------------------
  237|  5.00k|  if (last_stripe_in_plane) *copy_below = 0;
  ------------------
  |  Branch (237:7): [True: 1.42k, False: 3.58k]
  ------------------
  238|  5.00k|}
restoration.c:setup_processing_stripe_boundary:
  252|  5.00k|    RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
  253|       |  // Offsets within the line buffers. The buffer logically starts at column
  254|       |  // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
  255|       |  // has column x0 in the buffer.
  256|  5.00k|  const int buf_stride = rsb->stripe_boundary_stride;
  257|  5.00k|  const int buf_x0_off = limits->h_start;
  258|  5.00k|  const int line_width =
  259|  5.00k|      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
  ------------------
  |  |   70|  5.00k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
  260|  5.00k|  const int line_size = line_width << use_highbd;
  261|       |
  262|  5.00k|  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
  ------------------
  |  |   70|  5.00k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
  263|       |
  264|       |  // Replace RESTORATION_BORDER pixels above the top of the stripe
  265|       |  // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
  266|       |  // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
  267|       |  // duplicating the topmost of the 2 lines (see the AOMMAX call when
  268|       |  // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
  269|  5.00k|  if (!opt) {
  ------------------
  |  Branch (269:7): [True: 3.62k, False: 1.38k]
  ------------------
  270|  3.62k|    if (copy_above) {
  ------------------
  |  Branch (270:9): [True: 2.15k, False: 1.46k]
  ------------------
  271|  2.15k|      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
  272|       |
  273|  8.62k|      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
  ------------------
  |  |   62|  2.15k|#define RESTORATION_BORDER 3
  ------------------
  |  Branch (273:41): [True: 6.46k, False: 2.15k]
  ------------------
  274|  6.46k|        const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
  ------------------
  |  |   35|  6.46k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 2.15k, False: 4.30k]
  |  |  ------------------
  ------------------
  275|  6.46k|        const int buf_off = buf_x0_off + buf_row * buf_stride;
  276|  6.46k|        const uint8_t *buf =
  277|  6.46k|            rsb->stripe_boundary_above + (buf_off << use_highbd);
  278|  6.46k|        uint8_t *dst8 = data8_tl + i * data_stride;
  279|       |        // Save old pixels, then replace with data from stripe_boundary_above
  280|  6.46k|        memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
  ------------------
  |  |   62|  6.46k|#define RESTORATION_BORDER 3
  ------------------
  281|  6.46k|               REAL_PTR(use_highbd, dst8), line_size);
  ------------------
  |  |  205|  6.46k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  2.03k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 2.03k, False: 4.42k]
  |  |  ------------------
  ------------------
  282|  6.46k|        memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
  ------------------
  |  |  205|  6.46k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  2.03k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 2.03k, False: 4.43k]
  |  |  ------------------
  ------------------
  283|  6.46k|      }
  284|  2.15k|    }
  285|       |
  286|       |    // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
  287|       |    // The second buffer row is repeated, so src_row gets the values 0, 1, 1
  288|       |    // for i = 0, 1, 2.
  289|  3.62k|    if (copy_below) {
  ------------------
  |  Branch (289:9): [True: 2.37k, False: 1.24k]
  ------------------
  290|  2.37k|      const int stripe_end = limits->v_start + h;
  291|  2.37k|      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
  292|       |
  293|  9.46k|      for (int i = 0; i < RESTORATION_BORDER; ++i) {
  ------------------
  |  |   62|  9.46k|#define RESTORATION_BORDER 3
  ------------------
  |  Branch (293:23): [True: 7.08k, False: 2.37k]
  ------------------
  294|  7.08k|        const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
  ------------------
  |  |   34|  7.08k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.37k, False: 4.71k]
  |  |  ------------------
  ------------------
  295|  7.08k|        const int buf_off = buf_x0_off + buf_row * buf_stride;
  296|  7.08k|        const uint8_t *src =
  297|  7.08k|            rsb->stripe_boundary_below + (buf_off << use_highbd);
  298|       |
  299|  7.08k|        uint8_t *dst8 = data8_bl + i * data_stride;
  300|       |        // Save old pixels, then replace with data from stripe_boundary_below
  301|  7.08k|        memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
  ------------------
  |  |  205|  7.08k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  2.18k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 2.18k, False: 4.90k]
  |  |  ------------------
  ------------------
  302|  7.08k|        memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
  ------------------
  |  |  205|  7.08k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  2.18k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 2.18k, False: 4.90k]
  |  |  ------------------
  ------------------
  303|  7.08k|      }
  304|  2.37k|    }
  305|  3.62k|  } else {
  306|  1.38k|    if (copy_above) {
  ------------------
  |  Branch (306:9): [True: 1.23k, False: 147]
  ------------------
  307|  1.23k|      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
  308|       |
  309|       |      // Only save and overwrite i=-RESTORATION_BORDER line.
  310|  1.23k|      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
  ------------------
  |  |   62|  1.23k|#define RESTORATION_BORDER 3
  ------------------
  311|       |      // Save old pixels, then replace with data from stripe_boundary_above
  312|  1.23k|      memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
  ------------------
  |  |  205|  1.23k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  1.05k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 1.05k, False: 184]
  |  |  ------------------
  ------------------
  313|  1.23k|      memcpy(REAL_PTR(use_highbd, dst8),
  ------------------
  |  |  205|  1.23k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  1.04k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 1.04k, False: 187]
  |  |  ------------------
  ------------------
  314|  1.23k|             REAL_PTR(use_highbd,
  ------------------
  |  |  205|  1.23k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  1.04k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 1.04k, False: 187]
  |  |  ------------------
  ------------------
  315|  1.23k|                      data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
  316|  1.23k|             line_size);
  317|  1.23k|    }
  318|       |
  319|  1.38k|    if (copy_below) {
  ------------------
  |  Branch (319:9): [True: 1.20k, False: 181]
  ------------------
  320|  1.20k|      const int stripe_end = limits->v_start + h;
  321|  1.20k|      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
  322|       |
  323|       |      // Only save and overwrite i=2 line.
  324|  1.20k|      uint8_t *dst8 = data8_bl + 2 * data_stride;
  325|       |      // Save old pixels, then replace with data from stripe_boundary_below
  326|  1.20k|      memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
  ------------------
  |  |  205|  1.20k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  1.02k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 1.02k, False: 179]
  |  |  ------------------
  ------------------
  327|  1.20k|      memcpy(REAL_PTR(use_highbd, dst8),
  ------------------
  |  |  205|  1.20k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  1.01k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 1.01k, False: 184]
  |  |  ------------------
  ------------------
  328|  1.20k|             REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
  ------------------
  |  |  205|  1.20k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  1.01k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 1.01k, False: 184]
  |  |  ------------------
  ------------------
  329|  1.20k|    }
  330|  1.38k|  }
  331|  5.00k|}
restoration.c:restore_processing_stripe_boundary:
  339|  5.00k|    int copy_below, int opt) {
  340|  5.00k|  const int line_width =
  341|  5.00k|      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
  ------------------
  |  |   70|  5.00k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
  342|  5.00k|  const int line_size = line_width << use_highbd;
  343|       |
  344|  5.00k|  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
  ------------------
  |  |   70|  5.00k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
  345|       |
  346|  5.00k|  if (!opt) {
  ------------------
  |  Branch (346:7): [True: 3.62k, False: 1.38k]
  ------------------
  347|  3.62k|    if (copy_above) {
  ------------------
  |  Branch (347:9): [True: 2.15k, False: 1.46k]
  ------------------
  348|  2.15k|      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
  349|  8.63k|      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
  ------------------
  |  |   62|  2.15k|#define RESTORATION_BORDER 3
  ------------------
  |  Branch (349:41): [True: 6.47k, False: 2.15k]
  ------------------
  350|  6.47k|        uint8_t *dst8 = data8_tl + i * data_stride;
  351|  6.47k|        memcpy(REAL_PTR(use_highbd, dst8),
  ------------------
  |  |  205|  6.47k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  2.04k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 2.04k, False: 4.43k]
  |  |  ------------------
  ------------------
  352|  6.47k|               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
  ------------------
  |  |   62|  6.47k|#define RESTORATION_BORDER 3
  ------------------
  353|  6.47k|      }
  354|  2.15k|    }
  355|       |
  356|  3.62k|    if (copy_below) {
  ------------------
  |  Branch (356:9): [True: 2.37k, False: 1.24k]
  ------------------
  357|  2.37k|      const int stripe_bottom = limits->v_start + h;
  358|  2.37k|      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
  359|       |
  360|  9.49k|      for (int i = 0; i < RESTORATION_BORDER; ++i) {
  ------------------
  |  |   62|  9.49k|#define RESTORATION_BORDER 3
  ------------------
  |  Branch (360:23): [True: 7.11k, False: 2.37k]
  ------------------
  361|  7.11k|        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
  ------------------
  |  |   62|  7.11k|#define RESTORATION_BORDER 3
  ------------------
  |  Branch (361:13): [True: 0, False: 7.11k]
  ------------------
  362|       |
  363|  7.11k|        uint8_t *dst8 = data8_bl + i * data_stride;
  364|  7.11k|        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
  ------------------
  |  |  205|  7.11k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  2.18k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 2.18k, False: 4.93k]
  |  |  ------------------
  ------------------
  365|  7.11k|      }
  366|  2.37k|    }
  367|  3.62k|  } else {
  368|  1.38k|    if (copy_above) {
  ------------------
  |  Branch (368:9): [True: 1.23k, False: 151]
  ------------------
  369|  1.23k|      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
  370|       |
  371|       |      // Only restore i=-RESTORATION_BORDER line.
  372|  1.23k|      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
  ------------------
  |  |   62|  1.23k|#define RESTORATION_BORDER 3
  ------------------
  373|  1.23k|      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
  ------------------
  |  |  205|  1.23k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  1.04k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 1.04k, False: 187]
  |  |  ------------------
  ------------------
  374|  1.23k|    }
  375|       |
  376|  1.38k|    if (copy_below) {
  ------------------
  |  Branch (376:9): [True: 1.20k, False: 184]
  ------------------
  377|  1.20k|      const int stripe_bottom = limits->v_start + h;
  378|  1.20k|      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
  379|       |
  380|       |      // Only restore i=2 line.
  381|  1.20k|      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
  ------------------
  |  |   62|  1.20k|#define RESTORATION_BORDER 3
  ------------------
  |  Branch (381:11): [True: 1.20k, False: 0]
  ------------------
  382|  1.20k|        uint8_t *dst8 = data8_bl + 2 * data_stride;
  383|  1.20k|        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
  ------------------
  |  |  205|  1.20k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  1.02k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 1.02k, False: 178]
  |  |  ------------------
  ------------------
  384|  1.20k|      }
  385|  1.20k|    }
  386|  1.38k|  }
  387|  5.00k|}
restoration.c:filter_frame_on_unit:
 1060|  5.82k|                                 struct aom_internal_error_info *error_info) {
 1061|  5.82k|  FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
 1062|  5.82k|  const RestorationInfo *rsi = ctxt->rsi;
 1063|       |
 1064|  5.82k|  av1_loop_restoration_filter_unit(
 1065|  5.82k|      limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs,
 1066|  5.82k|      ctxt->plane_w, ctxt->plane_h, ctxt->ss_x, ctxt->ss_y, ctxt->highbd,
 1067|  5.82k|      ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8,
 1068|  5.82k|      ctxt->dst_stride, tmpbuf, rsi->optimized_lr, error_info);
 1069|  5.82k|}
restoration.c:foreach_rest_unit_in_planes:
 1184|     65|                                        int num_planes) {
 1185|     65|  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
 1186|       |
 1187|    240|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (1187:23): [True: 175, False: 65]
  ------------------
 1188|    175|    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) {
  ------------------
  |  Branch (1188:9): [True: 27, False: 148]
  ------------------
 1189|     27|      continue;
 1190|     27|    }
 1191|       |
 1192|    148|    foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit, &ctxt[plane],
 1193|    148|                               cm->rst_tmpbuf, cm->rlbs);
 1194|    148|  }
 1195|     65|}
restoration.c:foreach_rest_unit_in_plane:
 1147|    148|                                       RestorationLineBuffers *rlbs) {
 1148|    148|  const RestorationInfo *rsi = &cm->rst_info[plane];
 1149|    148|  const int hnum_rest_units = rsi->horz_units;
 1150|    148|  const int vnum_rest_units = rsi->vert_units;
 1151|    148|  const int unit_size = rsi->restoration_unit_size;
 1152|       |
 1153|    148|  const int is_uv = plane > 0;
 1154|    148|  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  ------------------
  |  Branch (1154:20): [True: 93, False: 55]
  |  Branch (1154:29): [True: 8, False: 85]
  ------------------
 1155|    148|  const int ext_size = unit_size * 3 / 2;
 1156|    148|  int plane_w, plane_h;
 1157|    148|  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
 1158|       |
 1159|    148|  int y0 = 0, i = 0;
 1160|    428|  while (y0 < plane_h) {
  ------------------
  |  Branch (1160:10): [True: 280, False: 148]
  ------------------
 1161|    280|    int remaining_h = plane_h - y0;
 1162|    280|    int h = (remaining_h < ext_size) ? remaining_h : unit_size;
  ------------------
  |  Branch (1162:13): [True: 148, False: 132]
  ------------------
 1163|       |
 1164|    280|    RestorationTileLimits limits;
 1165|    280|    limits.v_start = y0;
 1166|    280|    limits.v_end = y0 + h;
 1167|    280|    assert(limits.v_end <= plane_h);
 1168|       |    // Offset upwards to align with the restoration processing stripe
 1169|    280|    const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
  ------------------
  |  |   37|    280|#define RESTORATION_UNIT_OFFSET 8
  ------------------
 1170|    280|    limits.v_start = AOMMAX(0, limits.v_start - voffset);
  ------------------
  |  |   35|    280|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 148, False: 132]
  |  |  ------------------
  ------------------
 1171|    280|    if (limits.v_end < plane_h) limits.v_end -= voffset;
  ------------------
  |  Branch (1171:9): [True: 132, False: 148]
  ------------------
 1172|       |
 1173|    280|    av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size,
 1174|    280|                                 hnum_rest_units, vnum_rest_units, plane, priv,
 1175|    280|                                 tmpbuf, rlbs, av1_lr_sync_read_dummy,
 1176|    280|                                 av1_lr_sync_write_dummy, NULL, cm->error);
 1177|       |
 1178|    280|    y0 += h;
 1179|    280|    ++i;
 1180|    280|  }
 1181|    148|}
restoration.c:loop_restoration_copy_planes:
 1127|     65|                                         AV1_COMMON *cm, int num_planes) {
 1128|     65|  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
 1129|     65|                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
 1130|     65|                           int vstart, int vend);
 1131|     65|  static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
  ------------------
  |  |   58|     65|#define aom_yv12_partial_coloc_copy_y aom_yv12_partial_coloc_copy_y_c
  ------------------
 1132|     65|                                         aom_yv12_partial_coloc_copy_u,
  ------------------
  |  |   52|     65|#define aom_yv12_partial_coloc_copy_u aom_yv12_partial_coloc_copy_u_c
  ------------------
 1133|     65|                                         aom_yv12_partial_coloc_copy_v };
  ------------------
  |  |   55|     65|#define aom_yv12_partial_coloc_copy_v aom_yv12_partial_coloc_copy_v_c
  ------------------
 1134|     65|  assert(num_planes <= 3);
 1135|    240|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (1135:23): [True: 175, False: 65]
  ------------------
 1136|    175|    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
  ------------------
  |  Branch (1136:9): [True: 27, False: 148]
  ------------------
 1137|    148|    FilterFrameCtxt *lr_plane_ctxt = &loop_rest_ctxt->ctxt[plane];
 1138|    148|    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0,
 1139|    148|                     lr_plane_ctxt->plane_w, 0, lr_plane_ctxt->plane_h);
 1140|    148|  }
 1141|     65|}
restoration.c:save_boundary_lines:
 1443|  3.63k|                                int plane, AV1_COMMON *cm, int after_cdef) {
 1444|  3.63k|  const int is_uv = plane > 0;
 1445|  3.63k|  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  ------------------
  |  Branch (1445:20): [True: 2.27k, False: 1.36k]
  |  Branch (1445:29): [True: 8, False: 2.26k]
  ------------------
 1446|  3.63k|  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  ------------------
  |  |   34|  3.63k|#define RESTORATION_PROC_UNIT_SIZE 64
  ------------------
 1447|  3.63k|  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
  ------------------
  |  |   37|  3.63k|#define RESTORATION_UNIT_OFFSET 8
  ------------------
 1448|       |
 1449|  3.63k|  int plane_w, plane_h;
 1450|  3.63k|  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
 1451|       |
 1452|  3.63k|  RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;
 1453|       |
 1454|  3.63k|  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
  ------------------
  |  |   41|  3.63k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1455|       |
 1456|  3.63k|  int stripe_idx;
 1457|  12.9k|  for (stripe_idx = 0;; ++stripe_idx) {
 1458|  12.9k|    const int rel_y0 = AOMMAX(0, stripe_idx * stripe_height - stripe_off);
  ------------------
  |  |   35|  12.9k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 3.63k, False: 9.30k]
  |  |  ------------------
  ------------------
 1459|  12.9k|    const int y0 = rel_y0;
 1460|  12.9k|    if (y0 >= plane_h) break;
  ------------------
  |  Branch (1460:9): [True: 3.63k, False: 9.30k]
  ------------------
 1461|       |
 1462|  9.30k|    const int rel_y1 = (stripe_idx + 1) * stripe_height - stripe_off;
 1463|  9.30k|    const int y1 = AOMMIN(rel_y1, plane_h);
  ------------------
  |  |   34|  9.30k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.67k, False: 3.63k]
  |  |  ------------------
  ------------------
 1464|       |
 1465|       |    // Extend using CDEF pixels at the top and bottom of the frame,
 1466|       |    // and deblocked pixels at internal stripe boundaries
 1467|  9.30k|    const int use_deblock_above = (stripe_idx > 0);
 1468|  9.30k|    const int use_deblock_below = (y1 < plane_height);
 1469|       |
 1470|  9.30k|    if (!after_cdef) {
  ------------------
  |  Branch (1470:9): [True: 4.65k, False: 4.65k]
  ------------------
 1471|       |      // Save deblocked context at internal stripe boundaries
 1472|  4.65k|      if (use_deblock_above) {
  ------------------
  |  Branch (1472:11): [True: 2.83k, False: 1.81k]
  ------------------
 1473|  2.83k|        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
  ------------------
  |  |   66|  2.83k|#define RESTORATION_CTX_VERT 2
  ------------------
 1474|  2.83k|                                    stripe_idx, use_highbd, 1, boundaries);
 1475|  2.83k|      }
 1476|  4.65k|      if (use_deblock_below) {
  ------------------
  |  Branch (1476:11): [True: 2.83k, False: 1.81k]
  ------------------
 1477|  2.83k|        save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx,
 1478|  2.83k|                                    use_highbd, 0, boundaries);
 1479|  2.83k|      }
 1480|  4.65k|    } else {
 1481|       |      // Save CDEF context at frame boundaries
 1482|  4.65k|      if (!use_deblock_above) {
  ------------------
  |  Branch (1482:11): [True: 1.81k, False: 2.83k]
  ------------------
 1483|  1.81k|        save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd,
 1484|  1.81k|                                 1, boundaries);
 1485|  1.81k|      }
 1486|  4.65k|      if (!use_deblock_below) {
  ------------------
  |  Branch (1486:11): [True: 1.81k, False: 2.83k]
  ------------------
 1487|  1.81k|        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx,
 1488|  1.81k|                                 use_highbd, 0, boundaries);
 1489|  1.81k|      }
 1490|  4.65k|    }
 1491|  9.30k|  }
 1492|  3.63k|}
restoration.c:save_deblock_boundary_lines:
 1358|  5.67k|    RestorationStripeBoundaries *boundaries) {
 1359|  5.67k|  const int is_uv = plane > 0;
 1360|  5.67k|  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
  ------------------
  |  |  205|  5.67k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  1.49k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 1.49k, False: 4.18k]
  |  |  ------------------
  ------------------
 1361|  5.67k|  const int src_stride = frame->strides[is_uv] << use_highbd;
 1362|  5.67k|  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
 1363|       |
 1364|  5.67k|  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
  ------------------
  |  Branch (1364:23): [True: 2.83k, False: 2.83k]
  ------------------
 1365|  5.67k|                               : boundaries->stripe_boundary_below;
 1366|  5.67k|  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
  ------------------
  |  |   70|  5.67k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
 1367|  5.67k|  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
 1368|  5.67k|  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
  ------------------
  |  |   66|  5.67k|#define RESTORATION_CTX_VERT 2
  ------------------
 1369|       |
 1370|       |  // There is a rare case in which a processing stripe can end 1px above the
 1371|       |  // crop border. In this case, we do want to use deblocked pixels from below
 1372|       |  // the stripe (hence why we ended up in this function), but instead of
 1373|       |  // fetching 2 "below" rows we need to fetch one and duplicate it.
 1374|       |  // This is equivalent to clamping the sample locations against the crop border
 1375|  5.67k|  const int lines_to_save =
 1376|  5.67k|      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
  ------------------
  |  |   34|  5.67k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.67k, False: 6]
  |  |  ------------------
  ------------------
 1377|  5.67k|  assert(lines_to_save == 1 || lines_to_save == 2);
 1378|       |
 1379|  5.67k|  int upscaled_width;
 1380|  5.67k|  int line_bytes;
 1381|  5.67k|  if (av1_superres_scaled(cm)) {
  ------------------
  |  Branch (1381:7): [True: 504, False: 5.17k]
  ------------------
 1382|    504|    const int ss_x = is_uv && cm->seq_params->subsampling_x;
  ------------------
  |  Branch (1382:22): [True: 336, False: 168]
  |  Branch (1382:31): [True: 0, False: 336]
  ------------------
 1383|    504|    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
 1384|    504|    line_bytes = upscaled_width << use_highbd;
 1385|    504|    if (use_highbd)
  ------------------
  |  Branch (1385:9): [True: 0, False: 504]
  ------------------
 1386|      0|      av1_upscale_normative_rows(
 1387|      0|          cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
  ------------------
  |  |   76|      0|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
 1388|      0|          CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
  ------------------
  |  |   76|      0|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
 1389|      0|          plane, lines_to_save);
 1390|    504|    else
 1391|    504|      av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
 1392|    504|                                 boundaries->stripe_boundary_stride, plane,
 1393|    504|                                 lines_to_save);
 1394|  5.17k|  } else {
 1395|  5.17k|    upscaled_width = frame->crop_widths[is_uv];
 1396|  5.17k|    line_bytes = upscaled_width << use_highbd;
 1397|  15.5k|    for (int i = 0; i < lines_to_save; i++) {
  ------------------
  |  Branch (1397:21): [True: 10.3k, False: 5.17k]
  ------------------
 1398|  10.3k|      memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
 1399|  10.3k|             line_bytes);
 1400|  10.3k|    }
 1401|  5.17k|  }
 1402|       |  // If we only saved one line, then copy it into the second line buffer
 1403|  5.67k|  if (lines_to_save == 1)
  ------------------
  |  Branch (1403:7): [True: 0, False: 5.67k]
  ------------------
 1404|      0|    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
 1405|       |
 1406|  5.67k|  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
  ------------------
  |  |   66|  5.67k|#define RESTORATION_CTX_VERT 2
  ------------------
 1407|  5.67k|               RESTORATION_EXTRA_HORZ, use_highbd);
  ------------------
  |  |   70|  5.67k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
 1408|  5.67k|}
restoration.c:extend_lines:
 1341|  9.30k|                         int extend, int use_highbitdepth) {
 1342|  27.9k|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (1342:19): [True: 18.6k, False: 9.30k]
  ------------------
 1343|  18.6k|    if (use_highbitdepth) {
  ------------------
  |  Branch (1343:9): [True: 5.23k, False: 13.3k]
  ------------------
 1344|  5.23k|      uint16_t *buf16 = (uint16_t *)buf;
 1345|  5.23k|      aom_memset16(buf16 - extend, buf16[0], extend);
 1346|  5.23k|      aom_memset16(buf16 + width, buf16[width - 1], extend);
 1347|  13.3k|    } else {
 1348|  13.3k|      memset(buf - extend, buf[0], extend);
 1349|  13.3k|      memset(buf + width, buf[width - 1], extend);
 1350|  13.3k|    }
 1351|  18.6k|    buf += stride;
 1352|  18.6k|  }
 1353|  9.30k|}
restoration.c:save_cdef_boundary_lines:
 1413|  3.63k|                                     RestorationStripeBoundaries *boundaries) {
 1414|  3.63k|  const int is_uv = plane > 0;
 1415|  3.63k|  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
  ------------------
  |  |  205|  3.63k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  1.12k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 1.12k, False: 2.50k]
  |  |  ------------------
  ------------------
 1416|  3.63k|  const int src_stride = frame->strides[is_uv] << use_highbd;
 1417|  3.63k|  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
 1418|       |
 1419|  3.63k|  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
  ------------------
  |  Branch (1419:23): [True: 1.81k, False: 1.81k]
  ------------------
 1420|  3.63k|                               : boundaries->stripe_boundary_below;
 1421|  3.63k|  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
  ------------------
  |  |   70|  3.63k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
 1422|  3.63k|  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
 1423|  3.63k|  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
  ------------------
  |  |   66|  3.63k|#define RESTORATION_CTX_VERT 2
  ------------------
 1424|  3.63k|  const int src_width = frame->crop_widths[is_uv];
 1425|       |
 1426|       |  // At the point where this function is called, we've already applied
 1427|       |  // superres. So we don't need to extend the lines here, we can just
 1428|       |  // pull directly from the topmost row of the upscaled frame.
 1429|  3.63k|  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  ------------------
  |  Branch (1429:20): [True: 2.27k, False: 1.36k]
  |  Branch (1429:29): [True: 360, False: 1.91k]
  ------------------
 1430|  3.63k|  const int upscaled_width = av1_superres_scaled(cm)
  ------------------
  |  Branch (1430:30): [True: 192, False: 3.44k]
  ------------------
 1431|  3.63k|                                 ? (cm->superres_upscaled_width + ss_x) >> ss_x
 1432|  3.63k|                                 : src_width;
 1433|  3.63k|  const int line_bytes = upscaled_width << use_highbd;
 1434|  10.8k|  for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
  ------------------
  |  |   66|  10.8k|#define RESTORATION_CTX_VERT 2
  ------------------
  |  Branch (1434:19): [True: 7.26k, False: 3.63k]
  ------------------
 1435|       |    // Copy the line at 'src_rows' into both context lines
 1436|  7.26k|    memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
 1437|  7.26k|  }
 1438|  3.63k|  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
  ------------------
  |  |   66|  3.63k|#define RESTORATION_CTX_VERT 2
  ------------------
 1439|  3.63k|               RESTORATION_EXTRA_HORZ, use_highbd);
  ------------------
  |  |   70|  3.63k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
 1440|  3.63k|}

blockd.c:set_default_wiener:
  302|  71.3k|static inline void set_default_wiener(WienerInfo *wiener_info) {
  303|  71.3k|  wiener_info->vfilter[0] = wiener_info->hfilter[0] = WIENER_FILT_TAP0_MIDV;
  ------------------
  |  |  137|  71.3k|#define WIENER_FILT_TAP0_MIDV (3)
  ------------------
  304|  71.3k|  wiener_info->vfilter[1] = wiener_info->hfilter[1] = WIENER_FILT_TAP1_MIDV;
  ------------------
  |  |  138|  71.3k|#define WIENER_FILT_TAP1_MIDV (-7)
  ------------------
  305|  71.3k|  wiener_info->vfilter[2] = wiener_info->hfilter[2] = WIENER_FILT_TAP2_MIDV;
  ------------------
  |  |  139|  71.3k|#define WIENER_FILT_TAP2_MIDV (15)
  ------------------
  306|  71.3k|  wiener_info->vfilter[WIENER_HALFWIN] = wiener_info->hfilter[WIENER_HALFWIN] =
  ------------------
  |  |   43|  71.3k|#define WIENER_HALFWIN 3
  ------------------
                wiener_info->vfilter[WIENER_HALFWIN] = wiener_info->hfilter[WIENER_HALFWIN] =
  ------------------
  |  |   43|  71.3k|#define WIENER_HALFWIN 3
  ------------------
  307|  71.3k|      -2 *
  308|  71.3k|      (WIENER_FILT_TAP2_MIDV + WIENER_FILT_TAP1_MIDV + WIENER_FILT_TAP0_MIDV);
  ------------------
  |  |  139|  71.3k|#define WIENER_FILT_TAP2_MIDV (15)
  ------------------
                    (WIENER_FILT_TAP2_MIDV + WIENER_FILT_TAP1_MIDV + WIENER_FILT_TAP0_MIDV);
  ------------------
  |  |  138|  71.3k|#define WIENER_FILT_TAP1_MIDV (-7)
  ------------------
                    (WIENER_FILT_TAP2_MIDV + WIENER_FILT_TAP1_MIDV + WIENER_FILT_TAP0_MIDV);
  ------------------
  |  |  137|  71.3k|#define WIENER_FILT_TAP0_MIDV (3)
  ------------------
  309|  71.3k|  wiener_info->vfilter[4] = wiener_info->hfilter[4] = WIENER_FILT_TAP2_MIDV;
  ------------------
  |  |  139|  71.3k|#define WIENER_FILT_TAP2_MIDV (15)
  ------------------
  310|  71.3k|  wiener_info->vfilter[5] = wiener_info->hfilter[5] = WIENER_FILT_TAP1_MIDV;
  ------------------
  |  |  138|  71.3k|#define WIENER_FILT_TAP1_MIDV (-7)
  ------------------
  311|  71.3k|  wiener_info->vfilter[6] = wiener_info->hfilter[6] = WIENER_FILT_TAP0_MIDV;
  ------------------
  |  |  137|  71.3k|#define WIENER_FILT_TAP0_MIDV (3)
  ------------------
  312|  71.3k|}
blockd.c:set_default_sgrproj:
  297|  71.2k|static inline void set_default_sgrproj(SgrprojInfo *sgrproj_info) {
  298|  71.2k|  sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2;
  ------------------
  |  |  106|  71.2k|#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4)
  |  |  ------------------
  |  |  |  |   99|  71.2k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
                sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2;
  ------------------
  |  |  107|  71.2k|#define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1)
  |  |  ------------------
  |  |  |  |  106|  71.2k|#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4)
  |  |  |  |  ------------------
  |  |  |  |  |  |   99|  71.2k|#define SGRPROJ_PRJ_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |               #define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1)
  |  |  ------------------
  |  |  |  |   99|  71.2k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
  299|  71.2k|  sgrproj_info->xqd[1] = (SGRPROJ_PRJ_MIN1 + SGRPROJ_PRJ_MAX1) / 2;
  ------------------
  |  |  108|  71.2k|#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
  |  |  ------------------
  |  |  |  |   99|  71.2k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
                sgrproj_info->xqd[1] = (SGRPROJ_PRJ_MIN1 + SGRPROJ_PRJ_MAX1) / 2;
  ------------------
  |  |  109|  71.2k|#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
  |  |  ------------------
  |  |  |  |  108|  71.2k|#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
  |  |  |  |  ------------------
  |  |  |  |  |  |   99|  71.2k|#define SGRPROJ_PRJ_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |               #define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
  |  |  ------------------
  |  |  |  |   99|  71.2k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
  300|  71.2k|}

av1_scale_mv:
   34|     24|                  const struct scale_factors *sf) {
   35|     24|  const int x_off_q4 = av1_scaled_x(x << SUBPEL_BITS, sf);
  ------------------
  |  |   23|     24|#define SUBPEL_BITS 4
  ------------------
   36|     24|  const int y_off_q4 = av1_scaled_y(y << SUBPEL_BITS, sf);
  ------------------
  |  |   23|     24|#define SUBPEL_BITS 4
  ------------------
   37|     24|  const MV32 res = {
   38|     24|    av1_scaled_y((y << SUBPEL_BITS) + mvq4->row, sf) - y_off_q4,
  ------------------
  |  |   23|     24|#define SUBPEL_BITS 4
  ------------------
   39|     24|    av1_scaled_x((x << SUBPEL_BITS) + mvq4->col, sf) - x_off_q4
  ------------------
  |  |   23|     24|#define SUBPEL_BITS 4
  ------------------
   40|     24|  };
   41|     24|  return res;
   42|     24|}
av1_setup_scale_factors_for_frame:
   45|  57.5k|                                       int other_h, int this_w, int this_h) {
   46|  57.5k|  if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
  ------------------
  |  Branch (46:7): [True: 12, False: 57.5k]
  ------------------
   47|     12|    sf->x_scale_fp = REF_INVALID_SCALE;
  ------------------
  |  |   26|     12|#define REF_INVALID_SCALE -1
  ------------------
   48|     12|    sf->y_scale_fp = REF_INVALID_SCALE;
  ------------------
  |  |   26|     12|#define REF_INVALID_SCALE -1
  ------------------
   49|     12|    return;
   50|     12|  }
   51|       |
   52|  57.5k|  sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
   53|  57.5k|  sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
   54|       |
   55|  57.5k|  sf->x_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp);
   56|  57.5k|  sf->y_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp);
   57|  57.5k|}
scale.c:get_fixed_point_scale_factor:
   19|   115k|static int get_fixed_point_scale_factor(int other_size, int this_size) {
   20|       |  // Calculate scaling factor once for each reference frame
   21|       |  // and use fixed point scaling factors in decoding and encoding routines.
   22|       |  // Hardware implementations can calculate scale factor in device driver
   23|       |  // and use multiplication and shifting on hardware instead of division.
   24|   115k|  return ((other_size << REF_SCALE_SHIFT) + this_size / 2) / this_size;
  ------------------
  |  |   24|   115k|#define REF_SCALE_SHIFT 14
  ------------------
   25|   115k|}
scale.c:fixed_point_scale_to_coarse_point_scale:
   28|   115k|static int fixed_point_scale_to_coarse_point_scale(int scale_fp) {
   29|   115k|  return ROUND_POWER_OF_TWO(scale_fp, REF_SCALE_SHIFT - SCALE_SUBPEL_BITS);
  ------------------
  |  |   41|   115k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   30|   115k|}

decodeframe.c:valid_ref_frame_size:
   78|  1.32k|                                       int this_width, int this_height) {
   79|  1.32k|  return 2 * this_width >= ref_width && 2 * this_height >= ref_height &&
  ------------------
  |  Branch (79:10): [True: 1.26k, False: 63]
  |  Branch (79:41): [True: 1.23k, False: 21]
  ------------------
   80|  1.23k|         this_width <= 16 * ref_width && this_height <= 16 * ref_height;
  ------------------
  |  Branch (80:10): [True: 1.23k, False: 0]
  |  Branch (80:42): [True: 1.23k, False: 0]
  ------------------
   81|  1.32k|}
decodeframe.c:av1_is_valid_scale:
   64|   443k|static inline int av1_is_valid_scale(const struct scale_factors *sf) {
   65|   443k|  assert(sf != NULL);
   66|   443k|  return sf->x_scale_fp != REF_INVALID_SCALE &&
  ------------------
  |  |   26|   887k|#define REF_INVALID_SCALE -1
  ------------------
  |  Branch (66:10): [True: 443k, False: 9]
  ------------------
   67|   443k|         sf->y_scale_fp != REF_INVALID_SCALE;
  ------------------
  |  |   26|   443k|#define REF_INVALID_SCALE -1
  ------------------
  |  Branch (67:10): [True: 443k, False: 0]
  ------------------
   68|   443k|}
decodeframe.c:av1_is_scaled:
   70|   387k|static inline int av1_is_scaled(const struct scale_factors *sf) {
   71|   387k|  assert(sf != NULL);
   72|   387k|  return av1_is_valid_scale(sf) &&
  ------------------
  |  Branch (72:10): [True: 387k, False: 18.4E]
  ------------------
   73|   387k|         (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
  ------------------
  |  |   25|   775k|#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
  |  |  ------------------
  |  |  |  |   24|   387k|#define REF_SCALE_SHIFT 14
  |  |  ------------------
  ------------------
                       (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
  ------------------
  |  |   25|   387k|#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
  |  |  ------------------
  |  |  |  |   24|   387k|#define REF_SCALE_SHIFT 14
  |  |  ------------------
  ------------------
  |  Branch (73:11): [True: 48, False: 387k]
  |  Branch (73:45): [True: 0, False: 387k]
  ------------------
   74|   387k|}
decodeframe.c:av1_scaled_y:
   45|     24|static inline int av1_scaled_y(int val, const struct scale_factors *sf) {
   46|     24|  const int off =
   47|     24|      (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   24|     24|#define REF_SCALE_SHIFT 14
  ------------------
                    (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   23|     24|#define SUBPEL_BITS 4
  ------------------
   48|     24|  const int64_t tval = (int64_t)val * sf->y_scale_fp + off;
   49|     24|  return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
  ------------------
  |  |   58|     24|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|      0|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 0, False: 24]
  |  |  ------------------
  |  |   59|     24|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|     24|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
   50|     24|                                           REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
   51|     24|}
decodeframe.c:av1_scaled_x:
   36|     24|static inline int av1_scaled_x(int val, const struct scale_factors *sf) {
   37|     24|  const int off =
   38|     24|      (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   24|     24|#define REF_SCALE_SHIFT 14
  ------------------
                    (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   23|     24|#define SUBPEL_BITS 4
  ------------------
   39|     24|  const int64_t tval = (int64_t)val * sf->x_scale_fp + off;
   40|     24|  return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
  ------------------
  |  |   58|     24|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|      4|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 4, False: 20]
  |  |  ------------------
  |  |   59|     24|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|     20|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
   41|     24|                                           REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
   42|     24|}
decodemv.c:av1_is_scaled:
   70|  15.6k|static inline int av1_is_scaled(const struct scale_factors *sf) {
   71|  15.6k|  assert(sf != NULL);
   72|  15.6k|  return av1_is_valid_scale(sf) &&
  ------------------
  |  Branch (72:10): [True: 15.6k, False: 0]
  ------------------
   73|  15.6k|         (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
  ------------------
  |  |   25|  31.2k|#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
  |  |  ------------------
  |  |  |  |   24|  15.6k|#define REF_SCALE_SHIFT 14
  |  |  ------------------
  ------------------
                       (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
  ------------------
  |  |   25|  15.6k|#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
  |  |  ------------------
  |  |  |  |   24|  15.6k|#define REF_SCALE_SHIFT 14
  |  |  ------------------
  ------------------
  |  Branch (73:11): [True: 0, False: 15.6k]
  |  Branch (73:45): [True: 0, False: 15.6k]
  ------------------
   74|  15.6k|}
decodemv.c:av1_is_valid_scale:
   64|  15.6k|static inline int av1_is_valid_scale(const struct scale_factors *sf) {
   65|  15.6k|  assert(sf != NULL);
   66|  15.6k|  return sf->x_scale_fp != REF_INVALID_SCALE &&
  ------------------
  |  |   26|  31.2k|#define REF_INVALID_SCALE -1
  ------------------
  |  Branch (66:10): [True: 15.6k, False: 0]
  ------------------
   67|  15.6k|         sf->y_scale_fp != REF_INVALID_SCALE;
  ------------------
  |  |   26|  15.6k|#define REF_INVALID_SCALE -1
  ------------------
  |  Branch (67:10): [True: 15.6k, False: 0]
  ------------------
   68|  15.6k|}
reconinter.c:av1_is_scaled:
   70|   265k|static inline int av1_is_scaled(const struct scale_factors *sf) {
   71|   265k|  assert(sf != NULL);
   72|   265k|  return av1_is_valid_scale(sf) &&
  ------------------
  |  Branch (72:10): [True: 265k, False: 0]
  ------------------
   73|   265k|         (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
  ------------------
  |  |   25|   530k|#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
  |  |  ------------------
  |  |  |  |   24|   265k|#define REF_SCALE_SHIFT 14
  |  |  ------------------
  ------------------
                       (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
  ------------------
  |  |   25|   264k|#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
  |  |  ------------------
  |  |  |  |   24|   264k|#define REF_SCALE_SHIFT 14
  |  |  ------------------
  ------------------
  |  Branch (73:11): [True: 24, False: 264k]
  |  Branch (73:45): [True: 0, False: 264k]
  ------------------
   74|   265k|}
reconinter.c:av1_scaled_x:
   36|     24|static inline int av1_scaled_x(int val, const struct scale_factors *sf) {
   37|     24|  const int off =
   38|     24|      (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   24|     24|#define REF_SCALE_SHIFT 14
  ------------------
                    (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   23|     24|#define SUBPEL_BITS 4
  ------------------
   39|     24|  const int64_t tval = (int64_t)val * sf->x_scale_fp + off;
   40|     24|  return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
  ------------------
  |  |   58|     24|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|      0|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 0, False: 24]
  |  |  ------------------
  |  |   59|     24|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|     24|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
   41|     24|                                           REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
   42|     24|}
reconinter.c:av1_scaled_y:
   45|     24|static inline int av1_scaled_y(int val, const struct scale_factors *sf) {
   46|     24|  const int off =
   47|     24|      (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   24|     24|#define REF_SCALE_SHIFT 14
  ------------------
                    (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   23|     24|#define SUBPEL_BITS 4
  ------------------
   48|     24|  const int64_t tval = (int64_t)val * sf->y_scale_fp + off;
   49|     24|  return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
  ------------------
  |  |   58|     24|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|      0|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 0, False: 24]
  |  |  ------------------
  |  |   59|     24|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|     24|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
   50|     24|                                           REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
   51|     24|}
reconinter.c:av1_unscaled_value:
   54|   381k|static inline int av1_unscaled_value(int val, const struct scale_factors *sf) {
   55|   381k|  (void)sf;
   56|   381k|  return val * (1 << SCALE_EXTRA_BITS);
  ------------------
  |  |   31|   381k|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|   381k|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|   381k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
   57|   381k|}
reconinter.c:av1_is_valid_scale:
   64|   279k|static inline int av1_is_valid_scale(const struct scale_factors *sf) {
   65|   279k|  assert(sf != NULL);
   66|   279k|  return sf->x_scale_fp != REF_INVALID_SCALE &&
  ------------------
  |  |   26|   558k|#define REF_INVALID_SCALE -1
  ------------------
  |  Branch (66:10): [True: 279k, False: 0]
  ------------------
   67|   279k|         sf->y_scale_fp != REF_INVALID_SCALE;
  ------------------
  |  |   26|   279k|#define REF_INVALID_SCALE -1
  ------------------
  |  Branch (67:10): [True: 279k, False: 0]
  ------------------
   68|   279k|}
scale.c:av1_scaled_x:
   36|     48|static inline int av1_scaled_x(int val, const struct scale_factors *sf) {
   37|     48|  const int off =
   38|     48|      (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   24|     48|#define REF_SCALE_SHIFT 14
  ------------------
                    (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   23|     48|#define SUBPEL_BITS 4
  ------------------
   39|     48|  const int64_t tval = (int64_t)val * sf->x_scale_fp + off;
   40|     48|  return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
  ------------------
  |  |   58|     48|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|      4|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 4, False: 44]
  |  |  ------------------
  |  |   59|     48|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|     44|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
   41|     48|                                           REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
   42|     48|}
scale.c:av1_scaled_y:
   45|     48|static inline int av1_scaled_y(int val, const struct scale_factors *sf) {
   46|     48|  const int off =
   47|     48|      (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   24|     48|#define REF_SCALE_SHIFT 14
  ------------------
                    (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   23|     48|#define SUBPEL_BITS 4
  ------------------
   48|     48|  const int64_t tval = (int64_t)val * sf->y_scale_fp + off;
   49|     48|  return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
  ------------------
  |  |   58|     48|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|      0|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 0, False: 48]
  |  |  ------------------
  |  |   59|     48|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|     48|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
   50|     48|                                           REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
   51|     48|}
scale.c:valid_ref_frame_size:
   78|  57.5k|                                       int this_width, int this_height) {
   79|  57.5k|  return 2 * this_width >= ref_width && 2 * this_height >= ref_height &&
  ------------------
  |  Branch (79:10): [True: 57.5k, False: 7]
  |  Branch (79:41): [True: 57.5k, False: 5]
  ------------------
   80|  57.5k|         this_width <= 16 * ref_width && this_height <= 16 * ref_height;
  ------------------
  |  Branch (80:10): [True: 57.5k, False: 0]
  |  Branch (80:42): [True: 57.5k, False: 0]
  ------------------
   81|  57.5k|}

decodetxb.c:get_scan:
   46|  2.77M|static inline const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
   47|  2.77M|  return get_default_scan(tx_size, tx_type);
   48|  2.77M|}
decodetxb.c:get_default_scan:
   42|  2.77M|                                                 TX_TYPE tx_type) {
   43|  2.77M|  return &av1_scan_orders[tx_size][tx_type];
   44|  2.77M|}

av1_clearall_segfeatures:
   37|  25.7k|void av1_clearall_segfeatures(struct segmentation *seg) {
   38|  25.7k|  av1_zero(seg->feature_data);
  ------------------
  |  |   43|  25.7k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
   39|  25.7k|  av1_zero(seg->feature_mask);
  ------------------
  |  |   43|  25.7k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
   40|  25.7k|}
av1_calculate_segdata:
   42|  2.84k|void av1_calculate_segdata(struct segmentation *seg) {
   43|  2.84k|  seg->segid_preskip = 0;
   44|  2.84k|  seg->last_active_segid = 0;
   45|  25.5k|  for (int i = 0; i < MAX_SEGMENTS; i++) {
  ------------------
  |  |   21|  25.5k|#define MAX_SEGMENTS 8
  ------------------
  |  Branch (45:19): [True: 22.7k, False: 2.84k]
  ------------------
   46|   204k|    for (int j = 0; j < SEG_LVL_MAX; j++) {
  ------------------
  |  Branch (46:21): [True: 182k, False: 22.7k]
  ------------------
   47|   182k|      if (seg->feature_mask[i] & (1 << j)) {
  ------------------
  |  Branch (47:11): [True: 77.1k, False: 104k]
  ------------------
   48|  77.1k|        seg->segid_preskip |= (j >= SEG_LVL_REF_FRAME);
   49|  77.1k|        seg->last_active_segid = i;
   50|  77.1k|      }
   51|   182k|    }
   52|  22.7k|  }
   53|  2.84k|}
av1_enable_segfeature:
   56|  77.8k|                           SEG_LVL_FEATURES feature_id) {
   57|  77.8k|  seg->feature_mask[segment_id] |= 1 << feature_id;
   58|  77.8k|}
av1_seg_feature_data_max:
   60|  77.8k|int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id) {
   61|  77.8k|  return seg_feature_data_max[feature_id];
   62|  77.8k|}
av1_is_segfeature_signed:
   64|  77.8k|int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
   65|  77.8k|  return seg_feature_data_signed[feature_id];
   66|  77.8k|}
av1_set_segdata:
   80|   183k|                     SEG_LVL_FEATURES feature_id, int seg_data) {
   81|   183k|  if (seg_data < 0) {
  ------------------
  |  Branch (81:7): [True: 24.8k, False: 158k]
  ------------------
   82|  24.8k|    assert(seg_feature_data_signed[feature_id]);
   83|  24.8k|    assert(-seg_data <= seg_feature_data_max[feature_id]);
   84|   158k|  } else {
   85|   158k|    assert(seg_data <= seg_feature_data_max[feature_id]);
   86|   158k|  }
   87|       |
   88|   183k|  seg->feature_data[segment_id][feature_id] = seg_data;
   89|   183k|}

decodeframe.c:segfeatures_copy:
   68|  26.5k|                                    const struct segmentation *src) {
   69|  26.5k|  int i, j;
   70|   238k|  for (i = 0; i < MAX_SEGMENTS; i++) {
  ------------------
  |  |   21|   238k|#define MAX_SEGMENTS 8
  ------------------
  |  Branch (70:15): [True: 212k, False: 26.5k]
  ------------------
   71|   212k|    dst->feature_mask[i] = src->feature_mask[i];
   72|  1.91M|    for (j = 0; j < SEG_LVL_MAX; j++) {
  ------------------
  |  Branch (72:17): [True: 1.69M, False: 212k]
  ------------------
   73|  1.69M|      dst->feature_data[i][j] = src->feature_data[i][j];
   74|  1.69M|    }
   75|   212k|  }
   76|  26.5k|  dst->segid_preskip = src->segid_preskip;
   77|  26.5k|  dst->last_active_segid = src->last_active_segid;
   78|  26.5k|}
decodemv.c:segfeature_active:
   63|  4.19M|                                    SEG_LVL_FEATURES feature_id) {
   64|  4.19M|  return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id));
  ------------------
  |  Branch (64:10): [True: 677k, False: 3.52M]
  |  Branch (64:26): [True: 194k, False: 482k]
  ------------------
   65|  4.19M|}
decodemv.c:set_segment_id:
  101|   444k|                                  uint8_t segment_id) {
  102|   444k|  segment_ids += mi_offset;
  103|  1.73M|  for (int y = 0; y < y_mis; ++y) {
  ------------------
  |  Branch (103:19): [True: 1.29M, False: 444k]
  ------------------
  104|  1.29M|    memset(&segment_ids[y * mi_stride], segment_id,
  105|  1.29M|           x_mis * sizeof(segment_ids[0]));
  106|  1.29M|  }
  107|   444k|}
decodemv.c:get_segdata:
   95|    172|                              SEG_LVL_FEATURES feature_id) {
   96|    172|  return seg->feature_data[segment_id][feature_id];
   97|    172|}
av1_loopfilter.c:segfeature_active:
   63|   923k|                                    SEG_LVL_FEATURES feature_id) {
   64|   923k|  return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id));
  ------------------
  |  Branch (64:10): [True: 782k, False: 141k]
  |  Branch (64:26): [True: 559k, False: 223k]
  ------------------
   65|   923k|}
av1_loopfilter.c:get_segdata:
   95|   553k|                              SEG_LVL_FEATURES feature_id) {
   96|   553k|  return seg->feature_data[segment_id][feature_id];
   97|   553k|}
quant_common.c:segfeature_active:
   63|  6.76M|                                    SEG_LVL_FEATURES feature_id) {
   64|  6.76M|  return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id));
  ------------------
  |  Branch (64:10): [True: 2.21M, False: 4.54M]
  |  Branch (64:26): [True: 1.37M, False: 839k]
  ------------------
   65|  6.76M|}
quant_common.c:get_segdata:
   95|  1.37M|                              SEG_LVL_FEATURES feature_id) {
   96|  1.37M|  return seg->feature_data[segment_id][feature_id];
   97|  1.37M|}

av1_loop_filter_alloc:
   67|  1.42k|                           int width, int num_workers) {
   68|  1.42k|  lf_sync->rows = rows;
   69|  1.42k|#if CONFIG_MULTITHREAD
   70|  1.42k|  {
   71|  1.42k|    int i, j;
   72|       |
   73|  5.68k|    for (j = 0; j < MAX_MB_PLANE; j++) {
  ------------------
  |  |   36|  5.68k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (73:17): [True: 4.26k, False: 1.42k]
  ------------------
   74|  4.26k|      CHECK_MEM_ERROR(cm, lf_sync->mutex_[j],
  ------------------
  |  |   51|  4.26k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  4.26k|  do {                                                    \
  |  |  |  |   69|  4.26k|    lval = (expr);                                        \
  |  |  |  |   70|  4.26k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 4.26k]
  |  |  |  |  ------------------
  |  |  |  |   71|  4.26k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  4.26k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 4.26k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   75|  4.26k|                      aom_malloc(sizeof(*(lf_sync->mutex_[j])) * rows));
   76|  4.26k|      if (lf_sync->mutex_[j]) {
  ------------------
  |  Branch (76:11): [True: 4.26k, False: 0]
  ------------------
   77|  10.5k|        for (i = 0; i < rows; ++i) {
  ------------------
  |  Branch (77:21): [True: 6.28k, False: 4.26k]
  ------------------
   78|  6.28k|          pthread_mutex_init(&lf_sync->mutex_[j][i], NULL);
   79|  6.28k|        }
   80|  4.26k|      }
   81|       |
   82|  4.26k|      CHECK_MEM_ERROR(cm, lf_sync->cond_[j],
  ------------------
  |  |   51|  4.26k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  4.26k|  do {                                                    \
  |  |  |  |   69|  4.26k|    lval = (expr);                                        \
  |  |  |  |   70|  4.26k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 4.26k]
  |  |  |  |  ------------------
  |  |  |  |   71|  4.26k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  4.26k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 4.26k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   83|  4.26k|                      aom_malloc(sizeof(*(lf_sync->cond_[j])) * rows));
   84|  4.26k|      if (lf_sync->cond_[j]) {
  ------------------
  |  Branch (84:11): [True: 4.26k, False: 0]
  ------------------
   85|  10.5k|        for (i = 0; i < rows; ++i) {
  ------------------
  |  Branch (85:21): [True: 6.28k, False: 4.26k]
  ------------------
   86|  6.28k|          pthread_cond_init(&lf_sync->cond_[j][i], NULL);
   87|  6.28k|        }
   88|  4.26k|      }
   89|  4.26k|    }
   90|       |
   91|  1.42k|    CHECK_MEM_ERROR(cm, lf_sync->job_mutex,
  ------------------
  |  |   51|  1.42k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  1.42k|  do {                                                    \
  |  |  |  |   69|  1.42k|    lval = (expr);                                        \
  |  |  |  |   70|  1.42k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 1.42k]
  |  |  |  |  ------------------
  |  |  |  |   71|  1.42k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  1.42k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 1.42k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   92|  1.42k|                    aom_malloc(sizeof(*(lf_sync->job_mutex))));
   93|  1.42k|    if (lf_sync->job_mutex) {
  ------------------
  |  Branch (93:9): [True: 1.42k, False: 0]
  ------------------
   94|  1.42k|      pthread_mutex_init(lf_sync->job_mutex, NULL);
   95|  1.42k|    }
   96|  1.42k|  }
   97|  1.42k|#endif  // CONFIG_MULTITHREAD
   98|  1.42k|  CHECK_MEM_ERROR(cm, lf_sync->lfdata,
  ------------------
  |  |   51|  1.42k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  1.42k|  do {                                                    \
  |  |  |  |   69|  1.42k|    lval = (expr);                                        \
  |  |  |  |   70|  1.42k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 1.42k]
  |  |  |  |  ------------------
  |  |  |  |   71|  1.42k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  1.42k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 1.42k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   99|  1.42k|                  aom_malloc(num_workers * sizeof(*(lf_sync->lfdata))));
  100|  1.42k|  lf_sync->num_workers = num_workers;
  101|       |
  102|  5.68k|  for (int j = 0; j < MAX_MB_PLANE; j++) {
  ------------------
  |  |   36|  5.68k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (102:19): [True: 4.26k, False: 1.42k]
  ------------------
  103|  4.26k|    CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col[j],
  ------------------
  |  |   51|  4.26k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  4.26k|  do {                                                    \
  |  |  |  |   69|  4.26k|    lval = (expr);                                        \
  |  |  |  |   70|  4.26k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 4.26k]
  |  |  |  |  ------------------
  |  |  |  |   71|  4.26k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  4.26k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 4.26k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  104|  4.26k|                    aom_malloc(sizeof(*(lf_sync->cur_sb_col[j])) * rows));
  105|  4.26k|  }
  106|  1.42k|  CHECK_MEM_ERROR(
  ------------------
  |  |   51|  1.42k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  1.42k|  do {                                                    \
  |  |  |  |   69|  1.42k|    lval = (expr);                                        \
  |  |  |  |   70|  1.42k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 1.42k]
  |  |  |  |  ------------------
  |  |  |  |   71|  1.42k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  1.42k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 1.42k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  107|  1.42k|      cm, lf_sync->job_queue,
  108|  1.42k|      aom_malloc(sizeof(*(lf_sync->job_queue)) * rows * MAX_MB_PLANE * 2));
  109|       |  // Set up nsync.
  110|  1.42k|  lf_sync->sync_range = get_sync_range(width);
  111|  1.42k|}
av1_loop_filter_dealloc:
  114|  10.5k|void av1_loop_filter_dealloc(AV1LfSync *lf_sync) {
  115|  10.5k|  if (lf_sync != NULL) {
  ------------------
  |  Branch (115:7): [True: 10.5k, False: 0]
  ------------------
  116|  10.5k|    int j;
  117|  10.5k|#if CONFIG_MULTITHREAD
  118|  10.5k|    int i;
  119|  42.3k|    for (j = 0; j < MAX_MB_PLANE; j++) {
  ------------------
  |  |   36|  42.3k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (119:17): [True: 31.7k, False: 10.5k]
  ------------------
  120|  31.7k|      if (lf_sync->mutex_[j] != NULL) {
  ------------------
  |  Branch (120:11): [True: 4.26k, False: 27.4k]
  ------------------
  121|  10.5k|        for (i = 0; i < lf_sync->rows; ++i) {
  ------------------
  |  Branch (121:21): [True: 6.28k, False: 4.26k]
  ------------------
  122|  6.28k|          pthread_mutex_destroy(&lf_sync->mutex_[j][i]);
  123|  6.28k|        }
  124|  4.26k|        aom_free(lf_sync->mutex_[j]);
  125|  4.26k|      }
  126|  31.7k|      if (lf_sync->cond_[j] != NULL) {
  ------------------
  |  Branch (126:11): [True: 4.26k, False: 27.4k]
  ------------------
  127|  10.5k|        for (i = 0; i < lf_sync->rows; ++i) {
  ------------------
  |  Branch (127:21): [True: 6.28k, False: 4.26k]
  ------------------
  128|  6.28k|          pthread_cond_destroy(&lf_sync->cond_[j][i]);
  129|  6.28k|        }
  130|  4.26k|        aom_free(lf_sync->cond_[j]);
  131|  4.26k|      }
  132|  31.7k|    }
  133|  10.5k|    if (lf_sync->job_mutex != NULL) {
  ------------------
  |  Branch (133:9): [True: 1.42k, False: 9.15k]
  ------------------
  134|  1.42k|      pthread_mutex_destroy(lf_sync->job_mutex);
  135|  1.42k|      aom_free(lf_sync->job_mutex);
  136|  1.42k|    }
  137|  10.5k|#endif  // CONFIG_MULTITHREAD
  138|  10.5k|    aom_free(lf_sync->lfdata);
  139|  42.3k|    for (j = 0; j < MAX_MB_PLANE; j++) {
  ------------------
  |  |   36|  42.3k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (139:17): [True: 31.7k, False: 10.5k]
  ------------------
  140|  31.7k|      aom_free(lf_sync->cur_sb_col[j]);
  141|  31.7k|    }
  142|       |
  143|  10.5k|    aom_free(lf_sync->job_queue);
  144|       |    // clear the structure as the source of this call may be a resize in which
  145|       |    // case this call will be followed by an _alloc() which may fail.
  146|  10.5k|    av1_zero(*lf_sync);
  ------------------
  |  |   43|  10.5k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
  147|  10.5k|  }
  148|  10.5k|}
av1_alloc_cdef_sync:
  151|  12.5k|                         int num_workers) {
  152|  12.5k|  if (num_workers < 1) return;
  ------------------
  |  Branch (152:7): [True: 5.52k, False: 6.98k]
  ------------------
  153|  6.98k|#if CONFIG_MULTITHREAD
  154|  6.98k|  if (cdef_sync->mutex_ == NULL) {
  ------------------
  |  Branch (154:7): [True: 4.55k, False: 2.43k]
  ------------------
  155|  4.55k|    CHECK_MEM_ERROR(cm, cdef_sync->mutex_,
  ------------------
  |  |   51|  4.55k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  4.55k|  do {                                                    \
  |  |  |  |   69|  4.55k|    lval = (expr);                                        \
  |  |  |  |   70|  4.55k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 4.55k]
  |  |  |  |  ------------------
  |  |  |  |   71|  4.55k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  4.55k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 4.55k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  156|  4.55k|                    aom_malloc(sizeof(*(cdef_sync->mutex_))));
  157|  4.55k|    if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
  ------------------
  |  Branch (157:9): [True: 4.55k, False: 0]
  ------------------
  158|  4.55k|  }
  159|       |#else
  160|       |  (void)cm;
  161|       |  (void)cdef_sync;
  162|       |#endif  // CONFIG_MULTITHREAD
  163|  6.98k|}
av1_free_cdef_sync:
  165|  17.9k|void av1_free_cdef_sync(AV1CdefSync *cdef_sync) {
  166|  17.9k|  if (cdef_sync == NULL) return;
  ------------------
  |  Branch (166:7): [True: 0, False: 17.9k]
  ------------------
  167|  17.9k|#if CONFIG_MULTITHREAD
  168|  17.9k|  if (cdef_sync->mutex_ != NULL) {
  ------------------
  |  Branch (168:7): [True: 4.55k, False: 13.3k]
  ------------------
  169|  4.55k|    pthread_mutex_destroy(cdef_sync->mutex_);
  170|  4.55k|    aom_free(cdef_sync->mutex_);
  171|  4.55k|  }
  172|  17.9k|#endif  // CONFIG_MULTITHREAD
  173|  17.9k|}
av1_thread_loop_filter_rows:
  271|  14.3k|    int num_mis_in_lpf_unit_height_log2) {
  272|       |  // TODO(aomedia:3276): Pass error_info to the low-level functions as required
  273|       |  // in future to handle error propagation.
  274|  14.3k|  (void)error_info;
  275|  14.3k|  const int sb_cols =
  276|  14.3k|      CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2);
  ------------------
  |  |   62|  14.3k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
  277|  14.3k|  const int r = mi_row >> num_mis_in_lpf_unit_height_log2;
  278|  14.3k|  int mi_col, c;
  279|       |
  280|  14.3k|  const bool joint_filter_chroma = (lpf_opt_level == 2) && plane > AOM_PLANE_Y;
  ------------------
  |  |  210|  14.3k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (280:36): [True: 0, False: 14.3k]
  |  Branch (280:60): [True: 0, False: 0]
  ------------------
  281|  14.3k|  const int num_planes = joint_filter_chroma ? 2 : 1;
  ------------------
  |  Branch (281:26): [True: 0, False: 14.3k]
  ------------------
  282|  14.3k|  assert(IMPLIES(joint_filter_chroma, plane == AOM_PLANE_U));
  283|       |
  284|  14.3k|  if (dir == 0) {
  ------------------
  |  Branch (284:7): [True: 7.16k, False: 7.15k]
  ------------------
  285|  16.3k|    for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) {
  ------------------
  |  |   44|  9.21k|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   43|  9.21k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  9.21k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   39|  9.21k|#define MI_SIZE_LOG2 2
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (285:22): [True: 9.21k, False: 7.16k]
  ------------------
  286|  9.21k|      c = mi_col >> MAX_MIB_SIZE_LOG2;
  ------------------
  |  |   43|  9.21k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  9.21k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  9.21k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  287|       |
  288|  9.21k|      av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer,
  289|  9.21k|                           mi_row, mi_col, plane, plane + num_planes);
  290|  9.21k|      if (lpf_opt_level) {
  ------------------
  |  Branch (290:11): [True: 0, False: 9.21k]
  ------------------
  291|      0|        if (plane == AOM_PLANE_Y) {
  ------------------
  |  |  210|      0|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (291:13): [True: 0, False: 0]
  ------------------
  292|      0|          av1_filter_block_plane_vert_opt(cm, xd, &planes[plane], mi_row,
  293|      0|                                          mi_col, params_buf, tx_buf,
  294|      0|                                          num_mis_in_lpf_unit_height_log2);
  295|      0|        } else {
  296|      0|          av1_filter_block_plane_vert_opt_chroma(
  297|      0|              cm, xd, &planes[plane], mi_row, mi_col, params_buf, tx_buf, plane,
  298|      0|              joint_filter_chroma, num_mis_in_lpf_unit_height_log2);
  299|      0|        }
  300|  9.21k|      } else {
  301|  9.21k|        av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
  302|  9.21k|                                    mi_col);
  303|  9.21k|      }
  304|  9.21k|      if (lf_sync != NULL) {
  ------------------
  |  Branch (304:11): [True: 7.74k, False: 1.46k]
  ------------------
  305|  7.74k|        sync_write(lf_sync, r, c, sb_cols, plane);
  306|  7.74k|      }
  307|  9.21k|    }
  308|  7.16k|  } else if (dir == 1) {
  ------------------
  |  Branch (308:14): [True: 7.16k, False: 18.4E]
  ------------------
  309|  16.3k|    for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) {
  ------------------
  |  |   44|  9.21k|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   43|  9.21k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  9.21k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   39|  9.21k|#define MI_SIZE_LOG2 2
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (309:22): [True: 9.21k, False: 7.16k]
  ------------------
  310|  9.21k|      c = mi_col >> MAX_MIB_SIZE_LOG2;
  ------------------
  |  |   43|  9.21k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  9.21k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  9.21k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  311|       |
  312|  9.21k|      if (lf_sync != NULL) {
  ------------------
  |  Branch (312:11): [True: 7.74k, False: 1.46k]
  ------------------
  313|       |        // Wait for vertical edge filtering of the top-right block to be
  314|       |        // completed
  315|  7.74k|        sync_read(lf_sync, r, c, plane);
  316|       |
  317|       |        // Wait for vertical edge filtering of the right block to be completed
  318|  7.74k|        sync_read(lf_sync, r + 1, c, plane);
  319|  7.74k|      }
  320|       |
  321|  9.21k|#if CONFIG_MULTITHREAD
  322|  9.21k|      if (lf_sync && lf_sync->num_workers > 1) {
  ------------------
  |  Branch (322:11): [True: 7.75k, False: 1.46k]
  |  Branch (322:22): [True: 7.75k, False: 1]
  ------------------
  323|  7.75k|        pthread_mutex_lock(lf_sync->job_mutex);
  324|  7.75k|        const bool lf_mt_exit = lf_sync->lf_mt_exit;
  325|  7.75k|        pthread_mutex_unlock(lf_sync->job_mutex);
  326|       |        // Exit in case any worker has encountered an error.
  327|  7.75k|        if (lf_mt_exit) return;
  ------------------
  |  Branch (327:13): [True: 0, False: 7.75k]
  ------------------
  328|  7.75k|      }
  329|  9.21k|#endif
  330|       |
  331|  9.21k|      av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer,
  332|  9.21k|                           mi_row, mi_col, plane, plane + num_planes);
  333|  9.21k|      if (lpf_opt_level) {
  ------------------
  |  Branch (333:11): [True: 0, False: 9.21k]
  ------------------
  334|      0|        if (plane == AOM_PLANE_Y) {
  ------------------
  |  |  210|      0|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (334:13): [True: 0, False: 0]
  ------------------
  335|      0|          av1_filter_block_plane_horz_opt(cm, xd, &planes[plane], mi_row,
  336|      0|                                          mi_col, params_buf, tx_buf,
  337|      0|                                          num_mis_in_lpf_unit_height_log2);
  338|      0|        } else {
  339|      0|          av1_filter_block_plane_horz_opt_chroma(
  340|      0|              cm, xd, &planes[plane], mi_row, mi_col, params_buf, tx_buf, plane,
  341|      0|              joint_filter_chroma, num_mis_in_lpf_unit_height_log2);
  342|      0|        }
  343|  9.21k|      } else {
  344|  9.21k|        av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
  345|  9.21k|                                    mi_col);
  346|  9.21k|      }
  347|  9.21k|    }
  348|  7.16k|  }
  349|  14.3k|}
av1_loop_filter_frame_mt:
  495|  1.86k|                              int lpf_opt_level) {
  496|  1.86k|  int start_mi_row, end_mi_row, mi_rows_to_filter;
  497|  1.86k|  int planes_to_lf[MAX_MB_PLANE];
  498|       |
  499|  1.86k|  if (!check_planes_to_loop_filter(&cm->lf, planes_to_lf, plane_start,
  ------------------
  |  Branch (499:7): [True: 0, False: 1.86k]
  ------------------
  500|  1.86k|                                   plane_end))
  501|      0|    return;
  502|       |
  503|  1.86k|  start_mi_row = 0;
  504|  1.86k|  mi_rows_to_filter = cm->mi_params.mi_rows;
  505|  1.86k|  if (partial_frame && cm->mi_params.mi_rows > 8) {
  ------------------
  |  Branch (505:7): [True: 0, False: 1.86k]
  |  Branch (505:24): [True: 0, False: 0]
  ------------------
  506|      0|    start_mi_row = cm->mi_params.mi_rows >> 1;
  507|      0|    start_mi_row &= 0xfffffff8;
  508|      0|    mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8);
  ------------------
  |  |   35|      0|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  509|      0|  }
  510|  1.86k|  end_mi_row = start_mi_row + mi_rows_to_filter;
  511|  1.86k|  av1_loop_filter_frame_init(cm, plane_start, plane_end);
  512|       |
  513|  1.86k|  if (num_workers > 1) {
  ------------------
  |  Branch (513:7): [True: 1.51k, False: 347]
  ------------------
  514|       |    // Enqueue and execute loopfiltering jobs.
  515|  1.51k|    loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, planes_to_lf,
  516|  1.51k|                        workers, num_workers, lf_sync, lpf_opt_level);
  517|  1.51k|  } else {
  518|       |    // Directly filter in the main thread.
  519|    347|    loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, planes_to_lf,
  520|    347|                     lpf_opt_level);
  521|    347|  }
  522|  1.86k|}
av1_loop_restoration_alloc:
  589|    718|                                int num_planes, int width) {
  590|    718|  lr_sync->rows = num_rows_lr;
  591|    718|  lr_sync->num_planes = num_planes;
  592|    718|#if CONFIG_MULTITHREAD
  593|    718|  {
  594|    718|    int i, j;
  595|       |
  596|  2.66k|    for (j = 0; j < num_planes; j++) {
  ------------------
  |  Branch (596:17): [True: 1.95k, False: 718]
  ------------------
  597|  1.95k|      CHECK_MEM_ERROR(cm, lr_sync->mutex_[j],
  ------------------
  |  |   51|  1.95k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  1.95k|  do {                                                    \
  |  |  |  |   69|  1.95k|    lval = (expr);                                        \
  |  |  |  |   70|  1.95k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 1.95k]
  |  |  |  |  ------------------
  |  |  |  |   71|  1.95k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  1.95k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 1.95k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  598|  1.95k|                      aom_malloc(sizeof(*(lr_sync->mutex_[j])) * num_rows_lr));
  599|  1.95k|      if (lr_sync->mutex_[j]) {
  ------------------
  |  Branch (599:11): [True: 1.95k, False: 0]
  ------------------
  600|  7.04k|        for (i = 0; i < num_rows_lr; ++i) {
  ------------------
  |  Branch (600:21): [True: 5.09k, False: 1.95k]
  ------------------
  601|  5.09k|          pthread_mutex_init(&lr_sync->mutex_[j][i], NULL);
  602|  5.09k|        }
  603|  1.95k|      }
  604|       |
  605|  1.95k|      CHECK_MEM_ERROR(cm, lr_sync->cond_[j],
  ------------------
  |  |   51|  1.95k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  1.95k|  do {                                                    \
  |  |  |  |   69|  1.95k|    lval = (expr);                                        \
  |  |  |  |   70|  1.95k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 1.95k]
  |  |  |  |  ------------------
  |  |  |  |   71|  1.95k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  1.95k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 1.95k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  606|  1.95k|                      aom_malloc(sizeof(*(lr_sync->cond_[j])) * num_rows_lr));
  607|  1.95k|      if (lr_sync->cond_[j]) {
  ------------------
  |  Branch (607:11): [True: 1.95k, False: 0]
  ------------------
  608|  7.04k|        for (i = 0; i < num_rows_lr; ++i) {
  ------------------
  |  Branch (608:21): [True: 5.09k, False: 1.95k]
  ------------------
  609|  5.09k|          pthread_cond_init(&lr_sync->cond_[j][i], NULL);
  610|  5.09k|        }
  611|  1.95k|      }
  612|  1.95k|    }
  613|       |
  614|    718|    CHECK_MEM_ERROR(cm, lr_sync->job_mutex,
  ------------------
  |  |   51|    718|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|    718|  do {                                                    \
  |  |  |  |   69|    718|    lval = (expr);                                        \
  |  |  |  |   70|    718|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 718]
  |  |  |  |  ------------------
  |  |  |  |   71|    718|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|    718|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 718]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  615|    718|                    aom_malloc(sizeof(*(lr_sync->job_mutex))));
  616|    718|    if (lr_sync->job_mutex) {
  ------------------
  |  Branch (616:9): [True: 718, False: 0]
  ------------------
  617|    718|      pthread_mutex_init(lr_sync->job_mutex, NULL);
  618|    718|    }
  619|    718|  }
  620|    718|#endif  // CONFIG_MULTITHREAD
  621|    718|  CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata,
  ------------------
  |  |   51|    718|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|    718|  do {                                                    \
  |  |  |  |   69|    718|    lval = (expr);                                        \
  |  |  |  |   70|    718|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 718]
  |  |  |  |  ------------------
  |  |  |  |   71|    718|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|    718|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 718]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  622|    718|                  aom_calloc(num_workers, sizeof(*(lr_sync->lrworkerdata))));
  623|    718|  lr_sync->num_workers = num_workers;
  624|       |
  625|  22.7k|  for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
  ------------------
  |  Branch (625:28): [True: 21.9k, False: 718]
  ------------------
  626|  21.9k|    if (worker_idx < num_workers - 1) {
  ------------------
  |  Branch (626:9): [True: 21.2k, False: 718]
  ------------------
  627|  21.2k|      CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rst_tmpbuf,
  ------------------
  |  |   51|  21.2k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  21.2k|  do {                                                    \
  |  |  |  |   69|  21.2k|    lval = (expr);                                        \
  |  |  |  |   70|  21.2k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 21.2k]
  |  |  |  |  ------------------
  |  |  |  |   71|  21.2k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  21.2k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 21.2k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  628|  21.2k|                      (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
  629|  21.2k|      CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rlbs,
  ------------------
  |  |   51|  21.2k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  21.2k|  do {                                                    \
  |  |  |  |   69|  21.2k|    lval = (expr);                                        \
  |  |  |  |   70|  21.2k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 21.2k]
  |  |  |  |  ------------------
  |  |  |  |   71|  21.2k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  21.2k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 21.2k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  630|  21.2k|                      aom_malloc(sizeof(RestorationLineBuffers)));
  631|       |
  632|  21.2k|    } else {
  633|    718|      lr_sync->lrworkerdata[worker_idx].rst_tmpbuf = cm->rst_tmpbuf;
  634|    718|      lr_sync->lrworkerdata[worker_idx].rlbs = cm->rlbs;
  635|    718|    }
  636|  21.9k|  }
  637|       |
  638|  2.66k|  for (int j = 0; j < num_planes; j++) {
  ------------------
  |  Branch (638:19): [True: 1.95k, False: 718]
  ------------------
  639|  1.95k|    CHECK_MEM_ERROR(
  ------------------
  |  |   51|  1.95k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  1.95k|  do {                                                    \
  |  |  |  |   69|  1.95k|    lval = (expr);                                        \
  |  |  |  |   70|  1.95k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 1.95k]
  |  |  |  |  ------------------
  |  |  |  |   71|  1.95k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  1.95k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 1.95k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  640|  1.95k|        cm, lr_sync->cur_sb_col[j],
  641|  1.95k|        aom_malloc(sizeof(*(lr_sync->cur_sb_col[j])) * num_rows_lr));
  642|  1.95k|  }
  643|    718|  CHECK_MEM_ERROR(
  ------------------
  |  |   51|    718|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|    718|  do {                                                    \
  |  |  |  |   69|    718|    lval = (expr);                                        \
  |  |  |  |   70|    718|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 718]
  |  |  |  |  ------------------
  |  |  |  |   71|    718|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|    718|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 718]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  644|    718|      cm, lr_sync->job_queue,
  645|    718|      aom_malloc(sizeof(*(lr_sync->job_queue)) * num_rows_lr * num_planes));
  646|       |  // Set up nsync.
  647|    718|  lr_sync->sync_range = get_lr_sync_range(width);
  648|    718|}
av1_loop_restoration_dealloc:
  651|  9.87k|void av1_loop_restoration_dealloc(AV1LrSync *lr_sync) {
  652|  9.87k|  if (lr_sync != NULL) {
  ------------------
  |  Branch (652:7): [True: 9.87k, False: 0]
  ------------------
  653|  9.87k|    int j;
  654|  9.87k|#if CONFIG_MULTITHREAD
  655|  9.87k|    int i;
  656|  39.5k|    for (j = 0; j < MAX_MB_PLANE; j++) {
  ------------------
  |  |   36|  39.5k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (656:17): [True: 29.6k, False: 9.87k]
  ------------------
  657|  29.6k|      if (lr_sync->mutex_[j] != NULL) {
  ------------------
  |  Branch (657:11): [True: 1.95k, False: 27.6k]
  ------------------
  658|  7.04k|        for (i = 0; i < lr_sync->rows; ++i) {
  ------------------
  |  Branch (658:21): [True: 5.09k, False: 1.95k]
  ------------------
  659|  5.09k|          pthread_mutex_destroy(&lr_sync->mutex_[j][i]);
  660|  5.09k|        }
  661|  1.95k|        aom_free(lr_sync->mutex_[j]);
  662|  1.95k|      }
  663|  29.6k|      if (lr_sync->cond_[j] != NULL) {
  ------------------
  |  Branch (663:11): [True: 1.95k, False: 27.6k]
  ------------------
  664|  7.04k|        for (i = 0; i < lr_sync->rows; ++i) {
  ------------------
  |  Branch (664:21): [True: 5.09k, False: 1.95k]
  ------------------
  665|  5.09k|          pthread_cond_destroy(&lr_sync->cond_[j][i]);
  666|  5.09k|        }
  667|  1.95k|        aom_free(lr_sync->cond_[j]);
  668|  1.95k|      }
  669|  29.6k|    }
  670|  9.87k|    if (lr_sync->job_mutex != NULL) {
  ------------------
  |  Branch (670:9): [True: 718, False: 9.15k]
  ------------------
  671|    718|      pthread_mutex_destroy(lr_sync->job_mutex);
  672|    718|      aom_free(lr_sync->job_mutex);
  673|    718|    }
  674|  9.87k|#endif  // CONFIG_MULTITHREAD
  675|  39.5k|    for (j = 0; j < MAX_MB_PLANE; j++) {
  ------------------
  |  |   36|  39.5k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (675:17): [True: 29.6k, False: 9.87k]
  ------------------
  676|  29.6k|      aom_free(lr_sync->cur_sb_col[j]);
  677|  29.6k|    }
  678|       |
  679|  9.87k|    aom_free(lr_sync->job_queue);
  680|       |
  681|  9.87k|    if (lr_sync->lrworkerdata) {
  ------------------
  |  Branch (681:9): [True: 718, False: 9.15k]
  ------------------
  682|  21.9k|      for (int worker_idx = 0; worker_idx < lr_sync->num_workers - 1;
  ------------------
  |  Branch (682:32): [True: 21.2k, False: 718]
  ------------------
  683|  21.2k|           worker_idx++) {
  684|  21.2k|        LRWorkerData *const workerdata_data =
  685|  21.2k|            lr_sync->lrworkerdata + worker_idx;
  686|       |
  687|  21.2k|        aom_free(workerdata_data->rst_tmpbuf);
  688|  21.2k|        aom_free(workerdata_data->rlbs);
  689|  21.2k|      }
  690|    718|      aom_free(lr_sync->lrworkerdata);
  691|    718|    }
  692|       |
  693|       |    // clear the structure as the source of this call may be a resize in which
  694|       |    // case this call will be followed by an _alloc() which may fail.
  695|  9.87k|    av1_zero(*lr_sync);
  ------------------
  |  |   43|  9.87k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
  696|  9.87k|  }
  697|  9.87k|}
av1_loop_restoration_filter_frame_mt:
  984|    718|                                          int do_extend_border) {
  985|    718|  assert(!cm->features.all_lossless);
  986|       |
  987|    718|  const int num_planes = av1_num_planes(cm);
  988|       |
  989|    718|  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
  990|       |
  991|    718|  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
  992|    718|                                         optimized_lr, num_planes);
  993|       |
  994|    718|  foreach_rest_unit_in_planes_mt(loop_rest_ctxt, workers, num_workers, lr_sync,
  995|    718|                                 cm, do_extend_border);
  996|    718|}
av1_cdef_init_fb_row_mt:
 1165|  3.77k|                             struct AV1CdefSyncData *const cdef_sync, int fbr) {
 1166|  3.77k|  const int num_planes = av1_num_planes(cm);
 1167|  3.77k|  const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  3.77k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  3.77k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  3.77k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  3.77k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1168|  3.77k|  const int luma_stride =
 1169|  3.77k|      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4);
  ------------------
  |  |   69|  3.77k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
 1170|       |
 1171|       |  // for the current filter block, it's top left corner mi structure (mi_tl)
 1172|       |  // is first accessed to check whether the top and left boundaries are
 1173|       |  // frame boundaries. Then bottom-left and top-right mi structures are
 1174|       |  // accessed to check whether the bottom and right boundaries
 1175|       |  // (respectively) are frame boundaries.
 1176|       |  //
 1177|       |  // Note that we can't just check the bottom-right mi structure - eg. if
 1178|       |  // we're at the right-hand edge of the frame but not the bottom, then
 1179|       |  // the bottom-right mi is NULL but the bottom-left is not.
 1180|  3.77k|  fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0;
  ------------------
  |  |   58|  3.77k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  3.77k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  |  Branch (1180:34): [True: 1.67k, False: 2.10k]
  ------------------
 1181|  3.77k|  if (fbr != nvfb - 1)
  ------------------
  |  Branch (1181:7): [True: 2.10k, False: 1.67k]
  ------------------
 1182|  2.10k|    fb_info->frame_boundary[BOTTOM] =
 1183|  2.10k|        (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 1 : 0;
  ------------------
  |  |   58|  2.10k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  2.10k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  |  Branch (1183:9): [True: 0, False: 2.10k]
  ------------------
 1184|  1.67k|  else
 1185|  1.67k|    fb_info->frame_boundary[BOTTOM] = 1;
 1186|       |
 1187|  3.77k|  fb_info->src = src;
 1188|  3.77k|  fb_info->damping = cm->cdef_info.cdef_damping;
 1189|  3.77k|  fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0);
  ------------------
  |  |   35|  3.77k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 1.03k, False: 2.74k]
  |  |  ------------------
  ------------------
 1190|  3.77k|  av1_zero(fb_info->dir);
  ------------------
  |  |   43|  3.77k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 1191|  3.77k|  av1_zero(fb_info->var);
  ------------------
  |  |   43|  3.77k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 1192|       |
 1193|  14.2k|  for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (1193:23): [True: 10.4k, False: 3.77k]
  ------------------
 1194|  10.4k|    const int stride = luma_stride >> xd->plane[plane].subsampling_x;
 1195|  10.4k|    uint16_t *top_linebuf = &linebuf[plane][0];
 1196|  10.4k|    uint16_t *bot_linebuf = &linebuf[plane][nvfb * CDEF_VBORDER * stride];
  ------------------
  |  |   23|  10.4k|#define CDEF_VBORDER (2)
  ------------------
 1197|  10.4k|    {
 1198|  10.4k|      const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
  ------------------
  |  |   39|  10.4k|#define MI_SIZE_LOG2 2
  ------------------
 1199|  10.4k|      const int top_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2;
  ------------------
  |  |   58|  10.4k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  10.4k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1200|  10.4k|      const int bot_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2;
  ------------------
  |  |   58|  10.4k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  10.4k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1201|       |
 1202|  10.4k|      if (fbr != nvfb - 1)  // if (fbr != 0)  // top line buffer copy
  ------------------
  |  Branch (1202:11): [True: 5.97k, False: 4.46k]
  ------------------
 1203|  5.97k|        av1_cdef_copy_sb8_16(
 1204|  5.97k|            cm, &top_linebuf[(fbr + 1) * CDEF_VBORDER * stride], stride,
  ------------------
  |  |   23|  5.97k|#define CDEF_VBORDER (2)
  ------------------
 1205|  5.97k|            xd->plane[plane].dst.buf, top_offset - CDEF_VBORDER, 0,
  ------------------
  |  |   23|  5.97k|#define CDEF_VBORDER (2)
  ------------------
 1206|  5.97k|            xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
  ------------------
  |  |   23|  5.97k|#define CDEF_VBORDER (2)
  ------------------
 1207|  10.4k|      if (fbr != nvfb - 1)  // bottom line buffer copy
  ------------------
  |  Branch (1207:11): [True: 5.96k, False: 4.48k]
  ------------------
 1208|  5.96k|        av1_cdef_copy_sb8_16(cm, &bot_linebuf[fbr * CDEF_VBORDER * stride],
  ------------------
  |  |   23|  5.96k|#define CDEF_VBORDER (2)
  ------------------
 1209|  5.96k|                             stride, xd->plane[plane].dst.buf, bot_offset, 0,
 1210|  5.96k|                             xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
  ------------------
  |  |   23|  5.96k|#define CDEF_VBORDER (2)
  ------------------
 1211|  10.4k|    }
 1212|       |
 1213|  10.4k|    fb_info->top_linebuf[plane] = &linebuf[plane][fbr * CDEF_VBORDER * stride];
  ------------------
  |  |   23|  10.4k|#define CDEF_VBORDER (2)
  ------------------
 1214|  10.4k|    fb_info->bot_linebuf[plane] =
 1215|  10.4k|        &linebuf[plane]
 1216|  10.4k|                [nvfb * CDEF_VBORDER * stride + (fbr * CDEF_VBORDER * stride)];
  ------------------
  |  |   23|  10.4k|#define CDEF_VBORDER (2)
  ------------------
                              [nvfb * CDEF_VBORDER * stride + (fbr * CDEF_VBORDER * stride)];
  ------------------
  |  |   23|  10.4k|#define CDEF_VBORDER (2)
  ------------------
 1217|  10.4k|  }
 1218|       |
 1219|  3.77k|  cdef_row_mt_sync_write(cdef_sync, fbr);
 1220|  3.77k|  cdef_row_mt_sync_read(cdef_sync, fbr);
 1221|  3.77k|}
av1_cdef_frame_mt:
 1235|  1.67k|                       int do_extend_border) {
 1236|  1.67k|  YV12_BUFFER_CONFIG *frame = &cm->cur_frame->buf;
 1237|  1.67k|  const int num_planes = av1_num_planes(cm);
 1238|       |
 1239|  1.67k|  av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0,
 1240|  1.67k|                       num_planes);
 1241|       |
 1242|  1.67k|  reset_cdef_job_info(cdef_sync);
 1243|  1.67k|  prepare_cdef_frame_workers(cm, xd, cdef_worker, cdef_sb_row_worker_hook,
 1244|  1.67k|                             workers, cdef_sync, num_workers,
 1245|  1.67k|                             cdef_init_fb_row_fn, do_extend_border);
 1246|  1.67k|  launch_cdef_workers(workers, num_workers);
 1247|  1.67k|  sync_cdef_workers(workers, cm, num_workers);
 1248|  1.67k|}
av1_get_intrabc_extra_top_right_sb_delay:
 1250|  14.6k|int av1_get_intrabc_extra_top_right_sb_delay(const AV1_COMMON *cm) {
 1251|       |  // No additional top-right delay when intraBC tool is not enabled.
 1252|  14.6k|  if (!av1_allow_intrabc(cm)) return 0;
  ------------------
  |  Branch (1252:7): [True: 13.0k, False: 1.57k]
  ------------------
 1253|       |  // Due to the hardware constraints on processing the intraBC tool with row
 1254|       |  // multithreading, a top-right delay of 3 superblocks of size 128x128 or 5
 1255|       |  // superblocks of size 64x64 is mandated. However, a minimum top-right delay
 1256|       |  // of 1 superblock is assured with 'sync_range'. Hence return only the
 1257|       |  // additional superblock delay when the intraBC tool is enabled.
 1258|  1.57k|  return cm->seq_params->sb_size == BLOCK_128X128 ? 2 : 4;
  ------------------
  |  Branch (1258:10): [True: 242, False: 1.33k]
  ------------------
 1259|  14.6k|}
thread_common.c:get_sync_range:
   32|  1.42k|static inline int get_sync_range(int width) {
   33|       |  // nsync numbers are picked by testing. For example, for 4k
   34|       |  // video, using 4 gives best performance.
   35|  1.42k|  if (width < 640)
  ------------------
  |  Branch (35:7): [True: 1.42k, False: 0]
  ------------------
   36|  1.42k|    return 1;
   37|      0|  else if (width <= 1280)
  ------------------
  |  Branch (37:12): [True: 0, False: 0]
  ------------------
   38|      0|    return 2;
   39|      0|  else if (width <= 4096)
  ------------------
  |  Branch (39:12): [True: 0, False: 0]
  ------------------
   40|      0|    return 4;
   41|      0|  else
   42|      0|    return 8;
   43|  1.42k|}
thread_common.c:sync_write:
  228|  7.73k|                              const int sb_cols, int plane) {
  229|  7.73k|#if CONFIG_MULTITHREAD
  230|  7.73k|  const int nsync = lf_sync->sync_range;
  231|  7.73k|  int cur;
  232|       |  // Only signal when there are enough filtered SB for next row to run.
  233|  7.73k|  int sig = 1;
  234|       |
  235|  7.73k|  if (c < sb_cols - 1) {
  ------------------
  |  Branch (235:7): [True: 1.73k, False: 6.00k]
  ------------------
  236|  1.73k|    cur = c;
  237|  1.73k|    if (c % nsync) sig = 0;
  ------------------
  |  Branch (237:9): [True: 0, False: 1.73k]
  ------------------
  238|  6.00k|  } else {
  239|  6.00k|    cur = sb_cols + nsync;
  240|  6.00k|  }
  241|       |
  242|  7.73k|  if (sig) {
  ------------------
  |  Branch (242:7): [True: 7.73k, False: 1]
  ------------------
  243|  7.73k|    pthread_mutex_lock(&lf_sync->mutex_[plane][r]);
  244|       |
  245|       |    // When a thread encounters an error, cur_sb_col[plane][r] is set to maximum
  246|       |    // column number. In this case, the AOMMAX operation here ensures that
  247|       |    // cur_sb_col[plane][r] is not overwritten with a smaller value thus
  248|       |    // preventing the infinite waiting of threads in the relevant sync_read()
  249|       |    // function.
  250|  7.73k|    lf_sync->cur_sb_col[plane][r] = AOMMAX(lf_sync->cur_sb_col[plane][r], cur);
  ------------------
  |  |   35|  7.73k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 7.73k]
  |  |  ------------------
  ------------------
  251|       |
  252|  7.73k|    pthread_cond_broadcast(&lf_sync->cond_[plane][r]);
  253|  7.73k|    pthread_mutex_unlock(&lf_sync->mutex_[plane][r]);
  254|  7.73k|  }
  255|       |#else
  256|       |  (void)lf_sync;
  257|       |  (void)r;
  258|       |  (void)c;
  259|       |  (void)sb_cols;
  260|       |  (void)plane;
  261|       |#endif  // CONFIG_MULTITHREAD
  262|  7.73k|}
thread_common.c:sync_read:
  206|  15.4k|                             int plane) {
  207|  15.4k|#if CONFIG_MULTITHREAD
  208|  15.4k|  const int nsync = lf_sync->sync_range;
  209|       |
  210|  15.4k|  if (r && !(c & (nsync - 1))) {
  ------------------
  |  Branch (210:7): [True: 10.6k, False: 4.85k]
  |  Branch (210:12): [True: 10.6k, False: 2]
  ------------------
  211|  10.6k|    pthread_mutex_t *const mutex = &lf_sync->mutex_[plane][r - 1];
  212|  10.6k|    pthread_mutex_lock(mutex);
  213|       |
  214|  13.2k|    while (c > lf_sync->cur_sb_col[plane][r - 1] - nsync) {
  ------------------
  |  Branch (214:12): [True: 2.59k, False: 10.6k]
  ------------------
  215|  2.59k|      pthread_cond_wait(&lf_sync->cond_[plane][r - 1], mutex);
  216|  2.59k|    }
  217|  10.6k|    pthread_mutex_unlock(mutex);
  218|  10.6k|  }
  219|       |#else
  220|       |  (void)lf_sync;
  221|       |  (void)r;
  222|       |  (void)c;
  223|       |  (void)plane;
  224|       |#endif  // CONFIG_MULTITHREAD
  225|  15.4k|}
thread_common.c:loop_filter_rows_mt:
  435|  1.51k|                                AV1LfSync *lf_sync, int lpf_opt_level) {
  436|  1.51k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  437|  1.51k|  int i;
  438|  1.51k|  loop_filter_frame_mt_init(cm, start, stop, planes_to_lf, num_workers, lf_sync,
  439|  1.51k|                            lpf_opt_level, MAX_MIB_SIZE_LOG2);
  ------------------
  |  |   43|  1.51k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  1.51k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.51k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  440|       |
  441|       |  // Set up loopfilter thread data.
  442|  46.6k|  for (i = num_workers - 1; i >= 0; --i) {
  ------------------
  |  Branch (442:29): [True: 45.1k, False: 1.51k]
  ------------------
  443|  45.1k|    AVxWorker *const worker = &workers[i];
  444|  45.1k|    LFWorkerData *const lf_data = &lf_sync->lfdata[i];
  445|       |
  446|  45.1k|    worker->hook = loop_filter_row_worker;
  447|  45.1k|    worker->data1 = lf_sync;
  448|  45.1k|    worker->data2 = lf_data;
  449|       |
  450|       |    // Loopfilter data
  451|  45.1k|    loop_filter_data_reset(lf_data, frame, cm, xd);
  452|       |
  453|       |    // Start loopfiltering
  454|  45.1k|    worker->had_error = 0;
  455|  45.1k|    if (i == 0) {
  ------------------
  |  Branch (455:9): [True: 1.51k, False: 43.6k]
  ------------------
  456|  1.51k|      winterface->execute(worker);
  457|  43.6k|    } else {
  458|  43.6k|      winterface->launch(worker);
  459|  43.6k|    }
  460|  45.1k|  }
  461|       |
  462|  1.51k|  sync_lf_workers(workers, cm, num_workers);
  463|  1.51k|}
thread_common.c:loop_filter_row_worker:
  393|  45.1k|static int loop_filter_row_worker(void *arg1, void *arg2) {
  394|  45.1k|  AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
  395|  45.1k|  LFWorkerData *const lf_data = (LFWorkerData *)arg2;
  396|  45.1k|  AV1LfMTInfo *cur_job_info;
  397|       |
  398|  45.1k|#if CONFIG_MULTITHREAD
  399|  45.1k|  pthread_mutex_t *job_mutex_ = lf_sync->job_mutex;
  400|  45.1k|#endif
  401|       |
  402|  45.1k|  struct aom_internal_error_info *const error_info = &lf_data->error_info;
  403|       |
  404|       |  // The jmp_buf is valid only for the duration of the function that calls
  405|       |  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  406|       |  // before it returns.
  407|  45.1k|  if (setjmp(error_info->jmp)) {
  ------------------
  |  Branch (407:7): [True: 0, False: 45.1k]
  ------------------
  408|      0|    error_info->setjmp = 0;
  409|      0|#if CONFIG_MULTITHREAD
  410|      0|    pthread_mutex_lock(job_mutex_);
  411|      0|    lf_sync->lf_mt_exit = true;
  412|      0|    pthread_mutex_unlock(job_mutex_);
  413|      0|#endif
  414|      0|    av1_set_vert_loop_filter_done(lf_data->cm, lf_sync, MAX_MIB_SIZE_LOG2);
  ------------------
  |  |   43|      0|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|      0|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  415|      0|    return 0;
  416|      0|  }
  417|  45.1k|  error_info->setjmp = 1;
  418|       |
  419|  57.1k|  while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) {
  ------------------
  |  Branch (419:10): [True: 12.0k, False: 45.1k]
  ------------------
  420|  12.0k|    const int lpf_opt_level = cur_job_info->lpf_opt_level;
  421|  12.0k|    av1_thread_loop_filter_rows(
  422|  12.0k|        lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd,
  423|  12.0k|        cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir,
  424|  12.0k|        lpf_opt_level, lf_sync, error_info, lf_data->params_buf,
  425|  12.0k|        lf_data->tx_buf, MAX_MIB_SIZE_LOG2);
  ------------------
  |  |   43|  12.0k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  12.0k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  12.0k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  426|  12.0k|  }
  427|  45.1k|  error_info->setjmp = 0;
  428|  45.1k|  return 1;
  429|  45.1k|}
thread_common.c:sync_lf_workers:
  370|  1.51k|                                   AV1_COMMON *const cm, int num_workers) {
  371|  1.51k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  372|  1.51k|  int had_error = workers[0].had_error;
  373|  1.51k|  struct aom_internal_error_info error_info;
  374|       |
  375|       |  // Read the error_info of main thread.
  376|  1.51k|  if (had_error) {
  ------------------
  |  Branch (376:7): [True: 0, False: 1.51k]
  ------------------
  377|      0|    AVxWorker *const worker = &workers[0];
  378|      0|    error_info = ((LFWorkerData *)worker->data2)->error_info;
  379|      0|  }
  380|       |
  381|       |  // Wait till all rows are finished.
  382|  45.1k|  for (int i = num_workers - 1; i > 0; --i) {
  ------------------
  |  Branch (382:33): [True: 43.6k, False: 1.51k]
  ------------------
  383|  43.6k|    AVxWorker *const worker = &workers[i];
  384|  43.6k|    if (!winterface->sync(worker)) {
  ------------------
  |  Branch (384:9): [True: 0, False: 43.6k]
  ------------------
  385|      0|      had_error = 1;
  386|      0|      error_info = ((LFWorkerData *)worker->data2)->error_info;
  387|      0|    }
  388|  43.6k|  }
  389|  1.51k|  if (had_error) aom_internal_error_copy(cm->error, &error_info);
  ------------------
  |  Branch (389:7): [True: 0, False: 1.51k]
  ------------------
  390|  1.51k|}
thread_common.c:loop_filter_rows:
  468|    347|                             int lpf_opt_level) {
  469|       |  // Filter top rows of all planes first, in case the output can be partially
  470|       |  // reconstructed row by row.
  471|    347|  int mi_row, plane, dir;
  472|       |
  473|    347|  AV1_DEBLOCKING_PARAMETERS params_buf[MAX_MIB_SIZE];
  474|    347|  TX_SIZE tx_buf[MAX_MIB_SIZE];
  475|    770|  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
  ------------------
  |  |   44|    423|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   43|    423|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|    423|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   39|    423|#define MI_SIZE_LOG2 2
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (475:24): [True: 423, False: 347]
  ------------------
  476|  1.69k|    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
  ------------------
  |  |   36|  1.69k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (476:21): [True: 1.26k, False: 423]
  ------------------
  477|  1.26k|      if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) {
  ------------------
  |  Branch (477:11): [True: 119, False: 1.15k]
  ------------------
  478|    119|        continue;
  479|    119|      }
  480|       |
  481|  3.45k|      for (dir = 0; dir < 2; ++dir) {
  ------------------
  |  Branch (481:21): [True: 2.30k, False: 1.15k]
  ------------------
  482|  2.30k|        av1_thread_loop_filter_rows(frame, cm, xd->plane, xd, mi_row, plane,
  483|  2.30k|                                    dir, lpf_opt_level, /*lf_sync=*/NULL,
  484|  2.30k|                                    xd->error_info, params_buf, tx_buf,
  485|  2.30k|                                    MAX_MIB_SIZE_LOG2);
  ------------------
  |  |   43|  2.30k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  2.30k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  2.30k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  486|  2.30k|      }
  487|  1.15k|    }
  488|    423|  }
  489|    347|}
thread_common.c:get_lr_sync_range:
   46|    718|static inline int get_lr_sync_range(int width) {
   47|       |#if 0
   48|       |  // nsync numbers are picked by testing. For example, for 4k
   49|       |  // video, using 4 gives best performance.
   50|       |  if (width < 640)
   51|       |    return 1;
   52|       |  else if (width <= 1280)
   53|       |    return 2;
   54|       |  else if (width <= 4096)
   55|       |    return 4;
   56|       |  else
   57|       |    return 8;
   58|       |#else
   59|    718|  (void)width;
   60|    718|  return 1;
   61|    718|#endif
   62|    718|}
thread_common.c:foreach_rest_unit_in_planes_mt:
  923|    718|                                           int do_extend_border) {
  924|    718|  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
  925|       |
  926|    718|  const int num_planes = av1_num_planes(cm);
  927|       |
  928|    718|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  929|    718|  int num_rows_lr = 0;
  930|       |
  931|  2.66k|  for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (931:23): [True: 1.95k, False: 718]
  ------------------
  932|  1.95k|    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
  ------------------
  |  Branch (932:9): [True: 456, False: 1.49k]
  ------------------
  933|       |
  934|  1.49k|    const int plane_h = ctxt[plane].plane_h;
  935|  1.49k|    const int unit_size = cm->rst_info[plane].restoration_unit_size;
  936|       |
  937|  1.49k|    num_rows_lr = AOMMAX(num_rows_lr, av1_lr_count_units(unit_size, plane_h));
  ------------------
  |  |   35|  1.49k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 1.49k]
  |  |  ------------------
  ------------------
  938|  1.49k|  }
  939|       |
  940|    718|  int i;
  941|    718|  assert(MAX_MB_PLANE == 3);
  942|       |
  943|    718|  if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows ||
  ------------------
  |  Branch (943:7): [True: 718, False: 0]
  |  Branch (943:31): [True: 0, False: 0]
  ------------------
  944|    718|      num_workers > lr_sync->num_workers || num_planes > lr_sync->num_planes) {
  ------------------
  |  Branch (944:7): [True: 0, False: 0]
  |  Branch (944:45): [True: 0, False: 0]
  ------------------
  945|    718|    av1_loop_restoration_dealloc(lr_sync);
  946|    718|    av1_loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr,
  947|    718|                               num_planes, cm->width);
  948|    718|  }
  949|    718|  lr_sync->lr_mt_exit = false;
  950|       |
  951|       |  // Initialize cur_sb_col to -1 for all SB rows.
  952|  2.66k|  for (i = 0; i < num_planes; i++) {
  ------------------
  |  Branch (952:15): [True: 1.95k, False: 718]
  ------------------
  953|  1.95k|    memset(lr_sync->cur_sb_col[i], -1,
  954|  1.95k|           sizeof(*(lr_sync->cur_sb_col[i])) * num_rows_lr);
  955|  1.95k|  }
  956|       |
  957|    718|  enqueue_lr_jobs(lr_sync, lr_ctxt, cm);
  958|       |
  959|       |  // Set up looprestoration thread data.
  960|  22.7k|  for (i = num_workers - 1; i >= 0; --i) {
  ------------------
  |  Branch (960:29): [True: 21.9k, False: 718]
  ------------------
  961|  21.9k|    AVxWorker *const worker = &workers[i];
  962|  21.9k|    lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt;
  963|  21.9k|    lr_sync->lrworkerdata[i].do_extend_border = do_extend_border;
  964|  21.9k|    worker->hook = loop_restoration_row_worker;
  965|  21.9k|    worker->data1 = lr_sync;
  966|  21.9k|    worker->data2 = &lr_sync->lrworkerdata[i];
  967|       |
  968|       |    // Start loop restoration
  969|  21.9k|    worker->had_error = 0;
  970|  21.9k|    if (i == 0) {
  ------------------
  |  Branch (970:9): [True: 718, False: 21.2k]
  ------------------
  971|    718|      winterface->execute(worker);
  972|  21.2k|    } else {
  973|  21.2k|      winterface->launch(worker);
  974|  21.2k|    }
  975|  21.9k|  }
  976|       |
  977|    718|  sync_lr_workers(workers, cm, num_workers);
  978|    718|}
thread_common.c:enqueue_lr_jobs:
  700|    718|                            AV1_COMMON *cm) {
  701|    718|  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
  702|       |
  703|    718|  const int num_planes = av1_num_planes(cm);
  704|    718|  AV1LrMTInfo *lr_job_queue = lr_sync->job_queue;
  705|    718|  int32_t lr_job_counter[2], num_even_lr_jobs = 0;
  706|    718|  lr_sync->jobs_enqueued = 0;
  707|    718|  lr_sync->jobs_dequeued = 0;
  708|       |
  709|  2.66k|  for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (709:23): [True: 1.95k, False: 718]
  ------------------
  710|  1.95k|    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
  ------------------
  |  Branch (710:9): [True: 456, False: 1.49k]
  ------------------
  711|  1.49k|    num_even_lr_jobs =
  712|  1.49k|        num_even_lr_jobs + ((ctxt[plane].rsi->vert_units + 1) >> 1);
  713|  1.49k|  }
  714|    718|  lr_job_counter[0] = 0;
  715|    718|  lr_job_counter[1] = num_even_lr_jobs;
  716|       |
  717|  2.66k|  for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (717:23): [True: 1.95k, False: 718]
  ------------------
  718|  1.95k|    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
  ------------------
  |  Branch (718:9): [True: 456, False: 1.49k]
  ------------------
  719|  1.49k|    const int is_uv = plane > 0;
  720|  1.49k|    const int ss_y = is_uv && cm->seq_params->subsampling_y;
  ------------------
  |  Branch (720:22): [True: 936, False: 558]
  |  Branch (720:31): [True: 0, False: 936]
  ------------------
  721|  1.49k|    const int unit_size = ctxt[plane].rsi->restoration_unit_size;
  722|  1.49k|    const int plane_h = ctxt[plane].plane_h;
  723|  1.49k|    const int ext_size = unit_size * 3 / 2;
  724|       |
  725|  1.49k|    int y0 = 0, i = 0;
  726|  5.35k|    while (y0 < plane_h) {
  ------------------
  |  Branch (726:12): [True: 3.86k, False: 1.49k]
  ------------------
  727|  3.86k|      int remaining_h = plane_h - y0;
  728|  3.86k|      int h = (remaining_h < ext_size) ? remaining_h : unit_size;
  ------------------
  |  Branch (728:15): [True: 1.49k, False: 2.37k]
  ------------------
  729|       |
  730|  3.86k|      RestorationTileLimits limits;
  731|  3.86k|      limits.v_start = y0;
  732|  3.86k|      limits.v_end = y0 + h;
  733|  3.86k|      assert(limits.v_end <= plane_h);
  734|       |      // Offset upwards to align with the restoration processing stripe
  735|  3.86k|      const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
  ------------------
  |  |   37|  3.86k|#define RESTORATION_UNIT_OFFSET 8
  ------------------
  736|  3.86k|      limits.v_start = AOMMAX(0, limits.v_start - voffset);
  ------------------
  |  |   35|  3.86k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 1.49k, False: 2.37k]
  |  |  ------------------
  ------------------
  737|  3.86k|      if (limits.v_end < plane_h) limits.v_end -= voffset;
  ------------------
  |  Branch (737:11): [True: 2.37k, False: 1.49k]
  ------------------
  738|       |
  739|  3.86k|      assert(lr_job_counter[0] <= num_even_lr_jobs);
  740|       |
  741|  3.86k|      lr_job_queue[lr_job_counter[i & 1]].lr_unit_row = i;
  742|  3.86k|      lr_job_queue[lr_job_counter[i & 1]].plane = plane;
  743|  3.86k|      lr_job_queue[lr_job_counter[i & 1]].v_start = limits.v_start;
  744|  3.86k|      lr_job_queue[lr_job_counter[i & 1]].v_end = limits.v_end;
  745|  3.86k|      lr_job_queue[lr_job_counter[i & 1]].sync_mode = i & 1;
  746|  3.86k|      if ((i & 1) == 0) {
  ------------------
  |  Branch (746:11): [True: 2.55k, False: 1.30k]
  ------------------
  747|  2.55k|        lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
  748|  2.55k|            limits.v_start + RESTORATION_BORDER;
  ------------------
  |  |   62|  2.55k|#define RESTORATION_BORDER 3
  ------------------
  749|  2.55k|        lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
  750|  2.55k|            limits.v_end - RESTORATION_BORDER;
  ------------------
  |  |   62|  2.55k|#define RESTORATION_BORDER 3
  ------------------
  751|  2.55k|        if (i == 0) {
  ------------------
  |  Branch (751:13): [True: 1.49k, False: 1.06k]
  ------------------
  752|  1.49k|          assert(limits.v_start == 0);
  753|  1.49k|          lr_job_queue[lr_job_counter[i & 1]].v_copy_start = 0;
  754|  1.49k|        }
  755|  2.55k|        if (i == (ctxt[plane].rsi->vert_units - 1)) {
  ------------------
  |  Branch (755:13): [True: 1.24k, False: 1.30k]
  ------------------
  756|  1.24k|          assert(limits.v_end == plane_h);
  757|  1.24k|          lr_job_queue[lr_job_counter[i & 1]].v_copy_end = plane_h;
  758|  1.24k|        }
  759|  2.55k|      } else {
  760|  1.30k|        lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
  761|  1.30k|            AOMMAX(limits.v_start - RESTORATION_BORDER, 0);
  ------------------
  |  |   35|  1.30k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 1.30k, False: 0]
  |  |  ------------------
  ------------------
  762|  1.30k|        lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
  763|  1.30k|            AOMMIN(limits.v_end + RESTORATION_BORDER, plane_h);
  ------------------
  |  |   34|  1.30k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.06k, False: 246]
  |  |  ------------------
  ------------------
  764|  1.30k|      }
  765|  3.86k|      lr_job_counter[i & 1]++;
  766|  3.86k|      lr_sync->jobs_enqueued++;
  767|       |
  768|  3.86k|      y0 += h;
  769|  3.86k|      ++i;
  770|  3.86k|    }
  771|  1.49k|  }
  772|    718|}
thread_common.c:loop_restoration_row_worker:
  814|  21.9k|static int loop_restoration_row_worker(void *arg1, void *arg2) {
  815|  21.9k|  AV1LrSync *const lr_sync = (AV1LrSync *)arg1;
  816|  21.9k|  LRWorkerData *lrworkerdata = (LRWorkerData *)arg2;
  817|  21.9k|  AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt;
  818|  21.9k|  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
  819|  21.9k|  int lr_unit_row;
  820|  21.9k|  int plane;
  821|  21.9k|  int plane_w;
  822|  21.9k|#if CONFIG_MULTITHREAD
  823|  21.9k|  pthread_mutex_t *job_mutex_ = lr_sync->job_mutex;
  824|  21.9k|#endif
  825|  21.9k|  struct aom_internal_error_info *const error_info = &lrworkerdata->error_info;
  826|       |
  827|       |  // The jmp_buf is valid only for the duration of the function that calls
  828|       |  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  829|       |  // before it returns.
  830|  21.9k|  if (setjmp(error_info->jmp)) {
  ------------------
  |  Branch (830:7): [True: 0, False: 21.9k]
  ------------------
  831|      0|    error_info->setjmp = 0;
  832|      0|#if CONFIG_MULTITHREAD
  833|      0|    pthread_mutex_lock(job_mutex_);
  834|      0|    lr_sync->lr_mt_exit = true;
  835|      0|    pthread_mutex_unlock(job_mutex_);
  836|      0|#endif
  837|       |    // In case of loop restoration multithreading, the worker on an even lr
  838|       |    // block row waits for the completion of the filtering of the top-right and
  839|       |    // bottom-right blocks. Hence, in case a thread (main/worker) encounters an
  840|       |    // error, update that filtering of every row in the frame is complete in
  841|       |    // order to avoid the dependent workers from waiting indefinitely.
  842|      0|    set_loop_restoration_done(lr_sync, lr_ctxt->ctxt);
  843|      0|    return 0;
  844|      0|  }
  845|  21.9k|  error_info->setjmp = 1;
  846|       |
  847|  21.9k|  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
  848|  21.9k|                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
  849|  21.9k|                           int vstart, int vend);
  850|  21.9k|  static const copy_fun copy_funs[MAX_MB_PLANE] = {
  851|  21.9k|    aom_yv12_partial_coloc_copy_y, aom_yv12_partial_coloc_copy_u,
  ------------------
  |  |   58|  21.9k|#define aom_yv12_partial_coloc_copy_y aom_yv12_partial_coloc_copy_y_c
  ------------------
                  aom_yv12_partial_coloc_copy_y, aom_yv12_partial_coloc_copy_u,
  ------------------
  |  |   52|  21.9k|#define aom_yv12_partial_coloc_copy_u aom_yv12_partial_coloc_copy_u_c
  ------------------
  852|  21.9k|    aom_yv12_partial_coloc_copy_v
  ------------------
  |  |   55|  21.9k|#define aom_yv12_partial_coloc_copy_v aom_yv12_partial_coloc_copy_v_c
  ------------------
  853|  21.9k|  };
  854|       |
  855|  25.8k|  while (1) {
  ------------------
  |  Branch (855:10): [True: 25.7k, Folded]
  ------------------
  856|  25.7k|    AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync);
  857|  25.7k|    if (cur_job_info != NULL) {
  ------------------
  |  Branch (857:9): [True: 3.86k, False: 21.9k]
  ------------------
  858|  3.86k|      RestorationTileLimits limits;
  859|  3.86k|      sync_read_fn_t on_sync_read;
  860|  3.86k|      sync_write_fn_t on_sync_write;
  861|  3.86k|      limits.v_start = cur_job_info->v_start;
  862|  3.86k|      limits.v_end = cur_job_info->v_end;
  863|  3.86k|      lr_unit_row = cur_job_info->lr_unit_row;
  864|  3.86k|      plane = cur_job_info->plane;
  865|  3.86k|      plane_w = ctxt[plane].plane_w;
  866|       |
  867|       |      // sync_mode == 1 implies only sync read is required in LR Multi-threading
  868|       |      // sync_mode == 0 implies only sync write is required.
  869|  3.86k|      on_sync_read =
  870|  3.86k|          cur_job_info->sync_mode == 1 ? lr_sync_read : av1_lr_sync_read_dummy;
  ------------------
  |  Branch (870:11): [True: 1.30k, False: 2.55k]
  ------------------
  871|  3.86k|      on_sync_write = cur_job_info->sync_mode == 0 ? lr_sync_write
  ------------------
  |  Branch (871:23): [True: 2.55k, False: 1.31k]
  ------------------
  872|  3.86k|                                                   : av1_lr_sync_write_dummy;
  873|       |
  874|  3.86k|      av1_foreach_rest_unit_in_row(
  875|  3.86k|          &limits, plane_w, lr_ctxt->on_rest_unit, lr_unit_row,
  876|  3.86k|          ctxt[plane].rsi->restoration_unit_size, ctxt[plane].rsi->horz_units,
  877|  3.86k|          ctxt[plane].rsi->vert_units, plane, &ctxt[plane],
  878|  3.86k|          lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read,
  879|  3.86k|          on_sync_write, lr_sync, error_info);
  880|       |
  881|  3.86k|      copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, 0, plane_w,
  882|  3.86k|                       cur_job_info->v_copy_start, cur_job_info->v_copy_end);
  883|       |
  884|  3.86k|      if (lrworkerdata->do_extend_border) {
  ------------------
  |  Branch (884:11): [True: 0, False: 3.86k]
  ------------------
  885|      0|        aom_extend_frame_borders_plane_row(lr_ctxt->frame, plane,
  ------------------
  |  |   34|      0|#define aom_extend_frame_borders_plane_row aom_extend_frame_borders_plane_row_c
  ------------------
  886|      0|                                           cur_job_info->v_copy_start,
  887|      0|                                           cur_job_info->v_copy_end);
  888|      0|      }
  889|  21.9k|    } else {
  890|  21.9k|      break;
  891|  21.9k|    }
  892|  25.7k|  }
  893|  21.9k|  error_info->setjmp = 0;
  894|  21.9k|  return 1;
  895|  21.9k|}
thread_common.c:get_lr_job_info:
  774|  25.7k|static AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) {
  775|  25.7k|  AV1LrMTInfo *cur_job_info = NULL;
  776|       |
  777|  25.7k|#if CONFIG_MULTITHREAD
  778|  25.7k|  pthread_mutex_lock(lr_sync->job_mutex);
  779|       |
  780|  25.8k|  if (!lr_sync->lr_mt_exit && lr_sync->jobs_dequeued < lr_sync->jobs_enqueued) {
  ------------------
  |  Branch (780:7): [True: 25.8k, False: 18.4E]
  |  Branch (780:31): [True: 3.86k, False: 21.9k]
  ------------------
  781|  3.86k|    cur_job_info = lr_sync->job_queue + lr_sync->jobs_dequeued;
  782|  3.86k|    lr_sync->jobs_dequeued++;
  783|  3.86k|  }
  784|       |
  785|  25.7k|  pthread_mutex_unlock(lr_sync->job_mutex);
  786|       |#else
  787|       |  (void)lr_sync;
  788|       |#endif
  789|       |
  790|  25.7k|  return cur_job_info;
  791|  25.7k|}
thread_common.c:lr_sync_read:
  525|  3.39k|static inline void lr_sync_read(void *const lr_sync, int r, int c, int plane) {
  526|  3.39k|#if CONFIG_MULTITHREAD
  527|  3.39k|  AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync;
  528|  3.39k|  const int nsync = loop_res_sync->sync_range;
  529|       |
  530|  3.39k|  if (r && !(c & (nsync - 1))) {
  ------------------
  |  Branch (530:7): [True: 3.39k, False: 0]
  |  Branch (530:12): [True: 3.39k, False: 1]
  ------------------
  531|  3.39k|    pthread_mutex_t *const mutex = &loop_res_sync->mutex_[plane][r - 1];
  532|  3.39k|    pthread_mutex_lock(mutex);
  533|       |
  534|  4.09k|    while (c > loop_res_sync->cur_sb_col[plane][r - 1] - nsync) {
  ------------------
  |  Branch (534:12): [True: 701, False: 3.39k]
  ------------------
  535|    701|      pthread_cond_wait(&loop_res_sync->cond_[plane][r - 1], mutex);
  536|    701|    }
  537|  3.39k|    pthread_mutex_unlock(mutex);
  538|  3.39k|  }
  539|       |#else
  540|       |  (void)lr_sync;
  541|       |  (void)r;
  542|       |  (void)c;
  543|       |  (void)plane;
  544|       |#endif  // CONFIG_MULTITHREAD
  545|  3.39k|}
thread_common.c:lr_sync_write:
  548|  3.54k|                                 const int sb_cols, int plane) {
  549|  3.54k|#if CONFIG_MULTITHREAD
  550|  3.54k|  AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync;
  551|  3.54k|  const int nsync = loop_res_sync->sync_range;
  552|  3.54k|  int cur;
  553|       |  // Only signal when there are enough filtered SB for next row to run.
  554|  3.54k|  int sig = 1;
  555|       |
  556|  3.54k|  if (c < sb_cols - 1) {
  ------------------
  |  Branch (556:7): [True: 996, False: 2.55k]
  ------------------
  557|    996|    cur = c;
  558|    996|    if (c % nsync) sig = 0;
  ------------------
  |  Branch (558:9): [True: 0, False: 996]
  ------------------
  559|  2.55k|  } else {
  560|  2.55k|    cur = sb_cols + nsync;
  561|  2.55k|  }
  562|       |
  563|  3.54k|  if (sig) {
  ------------------
  |  Branch (563:7): [True: 3.54k, False: 18.4E]
  ------------------
  564|  3.54k|    pthread_mutex_lock(&loop_res_sync->mutex_[plane][r]);
  565|       |
  566|       |    // When a thread encounters an error, cur_sb_col[plane][r] is set to maximum
  567|       |    // column number. In this case, the AOMMAX operation here ensures that
  568|       |    // cur_sb_col[plane][r] is not overwritten with a smaller value thus
  569|       |    // preventing the infinite waiting of threads in the relevant sync_read()
  570|       |    // function.
  571|  3.54k|    loop_res_sync->cur_sb_col[plane][r] =
  572|  3.54k|        AOMMAX(loop_res_sync->cur_sb_col[plane][r], cur);
  ------------------
  |  |   35|  3.54k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 3.54k]
  |  |  ------------------
  ------------------
  573|       |
  574|  3.54k|    pthread_cond_broadcast(&loop_res_sync->cond_[plane][r]);
  575|  3.54k|    pthread_mutex_unlock(&loop_res_sync->mutex_[plane][r]);
  576|  3.54k|  }
  577|       |#else
  578|       |  (void)lr_sync;
  579|       |  (void)r;
  580|       |  (void)c;
  581|       |  (void)sb_cols;
  582|       |  (void)plane;
  583|       |#endif  // CONFIG_MULTITHREAD
  584|  3.54k|}
thread_common.c:sync_lr_workers:
  898|    718|                                   AV1_COMMON *const cm, int num_workers) {
  899|    718|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  900|    718|  int had_error = workers[0].had_error;
  901|    718|  struct aom_internal_error_info error_info;
  902|       |
  903|       |  // Read the error_info of main thread.
  904|    718|  if (had_error) {
  ------------------
  |  Branch (904:7): [True: 0, False: 718]
  ------------------
  905|      0|    AVxWorker *const worker = &workers[0];
  906|      0|    error_info = ((LRWorkerData *)worker->data2)->error_info;
  907|      0|  }
  908|       |
  909|       |  // Wait till all rows are finished.
  910|  21.9k|  for (int i = num_workers - 1; i > 0; --i) {
  ------------------
  |  Branch (910:33): [True: 21.2k, False: 718]
  ------------------
  911|  21.2k|    AVxWorker *const worker = &workers[i];
  912|  21.2k|    if (!winterface->sync(worker)) {
  ------------------
  |  Branch (912:9): [True: 0, False: 21.2k]
  ------------------
  913|      0|      had_error = 1;
  914|      0|      error_info = ((LRWorkerData *)worker->data2)->error_info;
  915|      0|    }
  916|  21.2k|  }
  917|    718|  if (had_error) aom_internal_error_copy(cm->error, &error_info);
  ------------------
  |  Branch (917:7): [True: 0, False: 718]
  ------------------
  918|    718|}
thread_common.c:cdef_row_mt_sync_write:
  192|  3.76k|                                          int row) {
  193|  3.76k|#if CONFIG_MULTITHREAD
  194|  3.76k|  AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt;
  195|  3.76k|  pthread_mutex_lock(cdef_row_mt[row].row_mutex_);
  196|  3.76k|  pthread_cond_signal(cdef_row_mt[row].row_cond_);
  197|  3.76k|  cdef_row_mt[row].is_row_done = 1;
  198|  3.76k|  pthread_mutex_unlock(cdef_row_mt[row].row_mutex_);
  199|       |#else
  200|       |  (void)cdef_sync;
  201|       |  (void)row;
  202|       |#endif  // CONFIG_MULTITHREAD
  203|  3.76k|}
thread_common.c:cdef_row_mt_sync_read:
  176|  3.78k|                                         int row) {
  177|  3.78k|  if (!row) return;
  ------------------
  |  Branch (177:7): [True: 1.67k, False: 2.10k]
  ------------------
  178|  2.10k|#if CONFIG_MULTITHREAD
  179|  2.10k|  AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt;
  180|  2.10k|  pthread_mutex_lock(cdef_row_mt[row - 1].row_mutex_);
  181|  2.29k|  while (cdef_row_mt[row - 1].is_row_done != 1)
  ------------------
  |  Branch (181:10): [True: 187, False: 2.10k]
  ------------------
  182|    187|    pthread_cond_wait(cdef_row_mt[row - 1].row_cond_,
  183|    187|                      cdef_row_mt[row - 1].row_mutex_);
  184|  2.10k|  cdef_row_mt[row - 1].is_row_done = 0;
  185|  2.10k|  pthread_mutex_unlock(cdef_row_mt[row - 1].row_mutex_);
  186|       |#else
  187|       |  (void)cdef_sync;
  188|       |#endif  // CONFIG_MULTITHREAD
  189|  2.10k|}
thread_common.c:reset_cdef_job_info:
 1000|  1.67k|static inline void reset_cdef_job_info(AV1CdefSync *const cdef_sync) {
 1001|  1.67k|  cdef_sync->end_of_frame = 0;
 1002|  1.67k|  cdef_sync->fbr = 0;
 1003|  1.67k|  cdef_sync->fbc = 0;
 1004|       |  cdef_sync->cdef_mt_exit = false;
 1005|  1.67k|}
thread_common.c:prepare_cdef_frame_workers:
 1139|  1.67k|    int do_extend_border) {
 1140|  1.67k|  const int num_planes = av1_num_planes(cm);
 1141|       |
 1142|  1.67k|  cdef_worker[0].srcbuf = cm->cdef_info.srcbuf;
 1143|  6.26k|  for (int plane = 0; plane < num_planes; plane++)
  ------------------
  |  Branch (1143:23): [True: 4.58k, False: 1.67k]
  ------------------
 1144|  4.58k|    cdef_worker[0].colbuf[plane] = cm->cdef_info.colbuf[plane];
 1145|  48.8k|  for (int i = num_workers - 1; i >= 0; i--) {
  ------------------
  |  Branch (1145:33): [True: 47.1k, False: 1.67k]
  ------------------
 1146|  47.1k|    AVxWorker *const worker = &workers[i];
 1147|  47.1k|    cdef_worker[i].cm = cm;
 1148|  47.1k|    cdef_worker[i].xd = xd;
 1149|  47.1k|    cdef_worker[i].cdef_init_fb_row_fn = cdef_init_fb_row_fn;
 1150|  47.1k|    cdef_worker[i].do_extend_border = do_extend_border;
 1151|   175k|    for (int plane = 0; plane < num_planes; plane++)
  ------------------
  |  Branch (1151:25): [True: 128k, False: 47.1k]
  ------------------
 1152|   128k|      cdef_worker[i].linebuf[plane] = cm->cdef_info.linebuf[plane];
 1153|       |
 1154|  47.1k|    worker->hook = hook;
 1155|  47.1k|    worker->data1 = cdef_sync;
 1156|  47.1k|    worker->data2 = &cdef_worker[i];
 1157|  47.1k|  }
 1158|  1.67k|}
thread_common.c:cdef_sb_row_worker_hook:
 1079|  47.1k|static int cdef_sb_row_worker_hook(void *arg1, void *arg2) {
 1080|  47.1k|  AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1;
 1081|  47.1k|  AV1CdefWorkerData *const cdef_worker = (AV1CdefWorkerData *)arg2;
 1082|  47.1k|  AV1_COMMON *cm = cdef_worker->cm;
 1083|  47.1k|  const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  47.1k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  47.1k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  47.1k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  47.1k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1084|       |
 1085|  47.1k|#if CONFIG_MULTITHREAD
 1086|  47.1k|  pthread_mutex_t *job_mutex_ = cdef_sync->mutex_;
 1087|  47.1k|#endif
 1088|  47.1k|  struct aom_internal_error_info *const error_info = &cdef_worker->error_info;
 1089|       |
 1090|       |  // The jmp_buf is valid only for the duration of the function that calls
 1091|       |  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
 1092|       |  // before it returns.
 1093|  47.1k|  if (setjmp(error_info->jmp)) {
  ------------------
  |  Branch (1093:7): [True: 0, False: 47.1k]
  ------------------
 1094|      0|    error_info->setjmp = 0;
 1095|      0|#if CONFIG_MULTITHREAD
 1096|      0|    pthread_mutex_lock(job_mutex_);
 1097|      0|    cdef_sync->cdef_mt_exit = true;
 1098|      0|    pthread_mutex_unlock(job_mutex_);
 1099|      0|#endif
 1100|       |    // In case of cdef row-multithreading, the worker on a filter block row
 1101|       |    // (fbr) waits for the line buffers (top and bottom) copy of the above row.
 1102|       |    // Hence, in case a thread (main/worker) encounters an error before copying
 1103|       |    // of the line buffers, update that line buffer copy is complete in order to
 1104|       |    // avoid dependent workers waiting indefinitely.
 1105|      0|    set_cdef_init_fb_row_done(cdef_sync, nvfb);
 1106|      0|    return 0;
 1107|      0|  }
 1108|  47.1k|  error_info->setjmp = 1;
 1109|       |
 1110|  47.1k|  volatile int cur_fbr;
 1111|  47.1k|  const int num_planes = av1_num_planes(cm);
 1112|  50.9k|  while (get_cdef_row_next_job(cdef_sync, &cur_fbr, nvfb)) {
  ------------------
  |  Branch (1112:10): [True: 3.78k, False: 47.1k]
  ------------------
 1113|  3.78k|    MACROBLOCKD *xd = cdef_worker->xd;
 1114|  3.78k|    av1_cdef_fb_row(cm, xd, cdef_worker->linebuf, cdef_worker->colbuf,
 1115|  3.78k|                    cdef_worker->srcbuf, cur_fbr,
 1116|  3.78k|                    cdef_worker->cdef_init_fb_row_fn, cdef_sync, error_info);
 1117|  3.78k|    if (cdef_worker->do_extend_border) {
  ------------------
  |  Branch (1117:9): [True: 0, False: 3.78k]
  ------------------
 1118|      0|      for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (1118:27): [True: 0, False: 0]
  ------------------
 1119|      0|        const YV12_BUFFER_CONFIG *ybf = &cm->cur_frame->buf;
 1120|      0|        const int is_uv = plane > 0;
 1121|      0|        const int mi_high = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
  ------------------
  |  |   39|      0|#define MI_SIZE_LOG2 2
  ------------------
 1122|      0|        const int unit_height = MI_SIZE_64X64 << mi_high;
  ------------------
  |  |   58|      0|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1123|      0|        const int v_start = cur_fbr * unit_height;
 1124|      0|        const int v_end =
 1125|      0|            AOMMIN(v_start + unit_height, ybf->crop_heights[is_uv]);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1126|      0|        aom_extend_frame_borders_plane_row(ybf, plane, v_start, v_end);
  ------------------
  |  |   34|      0|#define aom_extend_frame_borders_plane_row aom_extend_frame_borders_plane_row_c
  ------------------
 1127|      0|      }
 1128|      0|    }
 1129|  3.78k|  }
 1130|  47.1k|  error_info->setjmp = 0;
 1131|  47.1k|  return 1;
 1132|  47.1k|}
thread_common.c:get_cdef_row_next_job:
 1056|  50.7k|                                        volatile int *cur_fbr, const int nvfb) {
 1057|  50.7k|#if CONFIG_MULTITHREAD
 1058|  50.7k|  pthread_mutex_lock(cdef_sync->mutex_);
 1059|  50.7k|#endif  // CONFIG_MULTITHREAD
 1060|  50.7k|  int do_next_row = 0;
 1061|       |  // Populates information needed for current job and update the row
 1062|       |  // index of the next row to be processed.
 1063|  50.9k|  if (!cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0) {
  ------------------
  |  Branch (1063:7): [True: 50.9k, False: 18.4E]
  |  Branch (1063:35): [True: 3.78k, False: 47.1k]
  ------------------
 1064|  3.78k|    do_next_row = 1;
 1065|  3.78k|    *cur_fbr = cdef_sync->fbr;
 1066|  3.78k|    update_cdef_row_next_job_info(cdef_sync, nvfb);
 1067|  3.78k|  }
 1068|  50.7k|#if CONFIG_MULTITHREAD
 1069|  50.7k|  pthread_mutex_unlock(cdef_sync->mutex_);
 1070|  50.7k|#endif  // CONFIG_MULTITHREAD
 1071|  50.7k|  return do_next_row;
 1072|  50.7k|}
thread_common.c:update_cdef_row_next_job_info:
 1046|  3.78k|                                          const int nvfb) {
 1047|  3.78k|  cdef_sync->fbr++;
 1048|  3.78k|  if (cdef_sync->fbr == nvfb) {
  ------------------
  |  Branch (1048:7): [True: 1.67k, False: 2.10k]
  ------------------
 1049|  1.67k|    cdef_sync->end_of_frame = 1;
 1050|  1.67k|  }
 1051|  3.78k|}
thread_common.c:launch_cdef_workers:
 1008|  1.67k|                                       int num_workers) {
 1009|  1.67k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 1010|  48.8k|  for (int i = num_workers - 1; i >= 0; i--) {
  ------------------
  |  Branch (1010:33): [True: 47.1k, False: 1.67k]
  ------------------
 1011|  47.1k|    AVxWorker *const worker = &workers[i];
 1012|  47.1k|    worker->had_error = 0;
 1013|  47.1k|    if (i == 0)
  ------------------
  |  Branch (1013:9): [True: 1.67k, False: 45.5k]
  ------------------
 1014|  1.67k|      winterface->execute(worker);
 1015|  45.5k|    else
 1016|  45.5k|      winterface->launch(worker);
 1017|  47.1k|  }
 1018|  1.67k|}
thread_common.c:sync_cdef_workers:
 1021|  1.67k|                                     AV1_COMMON *const cm, int num_workers) {
 1022|  1.67k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 1023|  1.67k|  int had_error = workers[0].had_error;
 1024|  1.67k|  struct aom_internal_error_info error_info;
 1025|       |
 1026|       |  // Read the error_info of main thread.
 1027|  1.67k|  if (had_error) {
  ------------------
  |  Branch (1027:7): [True: 0, False: 1.67k]
  ------------------
 1028|      0|    AVxWorker *const worker = &workers[0];
 1029|      0|    error_info = ((AV1CdefWorkerData *)worker->data2)->error_info;
 1030|      0|  }
 1031|       |
 1032|       |  // Wait till all rows are finished.
 1033|  47.1k|  for (int i = num_workers - 1; i > 0; --i) {
  ------------------
  |  Branch (1033:33): [True: 45.5k, False: 1.67k]
  ------------------
 1034|  45.5k|    AVxWorker *const worker = &workers[i];
 1035|  45.5k|    if (!winterface->sync(worker)) {
  ------------------
  |  Branch (1035:9): [True: 0, False: 45.5k]
  ------------------
 1036|      0|      had_error = 1;
 1037|      0|      error_info = ((AV1CdefWorkerData *)worker->data2)->error_info;
 1038|      0|    }
 1039|  45.5k|  }
 1040|  1.67k|  if (had_error) aom_internal_error_copy(cm->error, &error_info);
  ------------------
  |  Branch (1040:7): [True: 0, False: 1.67k]
  ------------------
 1041|  1.67k|}

thread_common.c:check_planes_to_loop_filter:
  336|  1.86k|                                              int plane_start, int plane_end) {
  337|  1.86k|  set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end);
  338|       |  // If the luma plane is purposely not filtered, neither are the chroma
  339|       |  // planes.
  340|  1.86k|  if (!planes_to_lf[0] && plane_start <= 0 && 0 < plane_end) return 0;
  ------------------
  |  Branch (340:7): [True: 0, False: 1.86k]
  |  Branch (340:27): [True: 0, False: 0]
  |  Branch (340:47): [True: 0, False: 0]
  ------------------
  341|       |  // Early exit.
  342|  1.86k|  if (!planes_to_lf[0] && !planes_to_lf[1] && !planes_to_lf[2]) return 0;
  ------------------
  |  Branch (342:7): [True: 0, False: 1.86k]
  |  Branch (342:27): [True: 0, False: 0]
  |  Branch (342:47): [True: 0, False: 0]
  ------------------
  343|  1.86k|  return 1;
  344|  1.86k|}
thread_common.c:set_planes_to_loop_filter:
  326|  1.86k|                                             int plane_start, int plane_end) {
  327|       |  // For each luma and chroma plane, whether to filter it or not.
  328|  1.86k|  planes_to_lf[0] = (lf->filter_level[0] || lf->filter_level[1]) &&
  ------------------
  |  Branch (328:22): [True: 1.47k, False: 387]
  |  Branch (328:45): [True: 387, False: 0]
  ------------------
  329|  1.86k|                    plane_start <= 0 && 0 < plane_end;
  ------------------
  |  Branch (329:21): [True: 1.86k, False: 0]
  |  Branch (329:41): [True: 1.86k, False: 0]
  ------------------
  330|  1.86k|  planes_to_lf[1] = lf->filter_level_u && plane_start <= 1 && 1 < plane_end;
  ------------------
  |  Branch (330:21): [True: 1.62k, False: 244]
  |  Branch (330:43): [True: 1.62k, False: 0]
  |  Branch (330:63): [True: 1.62k, False: 0]
  ------------------
  331|  1.86k|  planes_to_lf[2] = lf->filter_level_v && plane_start <= 2 && 2 < plane_end;
  ------------------
  |  Branch (331:21): [True: 1.59k, False: 269]
  |  Branch (331:43): [True: 1.59k, False: 0]
  |  Branch (331:63): [True: 1.59k, False: 0]
  ------------------
  332|  1.86k|}
thread_common.c:loop_filter_frame_mt_init:
  268|  1.51k|    int lpf_opt_level, int num_mis_in_lpf_unit_height_log2) {
  269|       |  // Number of superblock rows
  270|  1.51k|  const int sb_rows =
  271|  1.51k|      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, num_mis_in_lpf_unit_height_log2);
  ------------------
  |  |   62|  1.51k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
  272|       |
  273|  1.51k|  if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
  ------------------
  |  Branch (273:7): [True: 1.41k, False: 100]
  |  Branch (273:31): [True: 4, False: 96]
  ------------------
  274|  1.42k|      num_workers > lf_sync->num_workers) {
  ------------------
  |  Branch (274:7): [True: 0, False: 96]
  ------------------
  275|  1.42k|    av1_loop_filter_dealloc(lf_sync);
  276|  1.42k|    av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
  277|  1.42k|  }
  278|  1.51k|  lf_sync->lf_mt_exit = false;
  279|       |
  280|       |  // Initialize cur_sb_col to -1 for all SB rows.
  281|  6.07k|  for (int i = 0; i < MAX_MB_PLANE; i++) {
  ------------------
  |  |   36|  6.07k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (281:19): [True: 4.55k, False: 1.51k]
  ------------------
  282|  4.55k|    memset(lf_sync->cur_sb_col[i], -1,
  283|  4.55k|           sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows);
  284|  4.55k|  }
  285|       |
  286|  1.51k|  enqueue_lf_jobs(lf_sync, start_mi_row, end_mi_row, planes_to_lf,
  287|  1.51k|                  lpf_opt_level, (1 << num_mis_in_lpf_unit_height_log2));
  288|  1.51k|}
thread_common.c:enqueue_lf_jobs:
  238|  1.51k|                                   int num_mis_in_lpf_unit_height) {
  239|  1.51k|  int mi_row, plane, dir;
  240|  1.51k|  AV1LfMTInfo *lf_job_queue = lf_sync->job_queue;
  241|  1.51k|  lf_sync->jobs_enqueued = 0;
  242|  1.51k|  lf_sync->jobs_dequeued = 0;
  243|       |
  244|       |  // Launch all vertical jobs first, as they are blocking the horizontal ones.
  245|       |  // Launch top row jobs for all planes first, in case the output can be
  246|       |  // partially reconstructed row by row.
  247|  4.55k|  for (dir = 0; dir < 2; ++dir) {
  ------------------
  |  Branch (247:17): [True: 3.03k, False: 1.51k]
  ------------------
  248|  7.44k|    for (mi_row = start; mi_row < stop; mi_row += num_mis_in_lpf_unit_height) {
  ------------------
  |  Branch (248:26): [True: 4.40k, False: 3.03k]
  ------------------
  249|  17.6k|      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
  ------------------
  |  |   36|  17.6k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (249:23): [True: 13.2k, False: 4.40k]
  ------------------
  250|  13.2k|        if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) {
  ------------------
  |  Branch (250:13): [True: 1.17k, False: 12.0k]
  ------------------
  251|  1.17k|          continue;
  252|  1.17k|        }
  253|  12.0k|        if (!planes_to_lf[plane]) continue;
  ------------------
  |  Branch (253:13): [True: 0, False: 12.0k]
  ------------------
  254|  12.0k|        lf_job_queue->mi_row = mi_row;
  255|  12.0k|        lf_job_queue->plane = plane;
  256|  12.0k|        lf_job_queue->dir = dir;
  257|  12.0k|        lf_job_queue->lpf_opt_level = lpf_opt_level;
  258|  12.0k|        lf_job_queue++;
  259|  12.0k|        lf_sync->jobs_enqueued++;
  260|  12.0k|      }
  261|  4.40k|    }
  262|  3.03k|  }
  263|  1.51k|}
thread_common.c:skip_loop_filter_plane:
  213|  14.4k|    const int planes_to_lf[MAX_MB_PLANE], int plane, int lpf_opt_level) {
  214|       |  // If LPF_PICK_METHOD is LPF_PICK_FROM_Q, we have the option to filter both
  215|       |  // chroma planes together
  216|  14.4k|  if (lpf_opt_level == 2) {
  ------------------
  |  Branch (216:7): [True: 0, False: 14.4k]
  ------------------
  217|      0|    if (plane == AOM_PLANE_Y) {
  ------------------
  |  |  210|      0|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (217:9): [True: 0, False: 0]
  ------------------
  218|      0|      return !planes_to_lf[plane];
  219|      0|    }
  220|      0|    if (plane == AOM_PLANE_U) {
  ------------------
  |  |  211|      0|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
  |  Branch (220:9): [True: 0, False: 0]
  ------------------
  221|       |      // U and V are handled together
  222|      0|      return !planes_to_lf[1] && !planes_to_lf[2];
  ------------------
  |  Branch (222:14): [True: 0, False: 0]
  |  Branch (222:34): [True: 0, False: 0]
  ------------------
  223|      0|    }
  224|      0|    assert(plane == AOM_PLANE_V);
  225|      0|    if (plane == AOM_PLANE_V) {
  ------------------
  |  |  212|      0|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
  |  Branch (225:9): [True: 0, False: 0]
  ------------------
  226|       |      // V is handled when u is filtered
  227|      0|      return true;
  228|      0|    }
  229|      0|  }
  230|       |
  231|       |  // Normal operation mode
  232|  14.4k|  return !planes_to_lf[plane];
  233|  14.4k|}
thread_common.c:get_lf_job_info:
  290|  56.7k|static inline AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) {
  291|  56.7k|  AV1LfMTInfo *cur_job_info = NULL;
  292|       |
  293|  56.7k|#if CONFIG_MULTITHREAD
  294|  56.7k|  pthread_mutex_lock(lf_sync->job_mutex);
  295|       |
  296|  57.1k|  if (!lf_sync->lf_mt_exit && lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) {
  ------------------
  |  Branch (296:7): [True: 57.1k, False: 18.4E]
  |  Branch (296:31): [True: 12.0k, False: 45.1k]
  ------------------
  297|  12.0k|    cur_job_info = lf_sync->job_queue + lf_sync->jobs_dequeued;
  298|  12.0k|    lf_sync->jobs_dequeued++;
  299|  12.0k|  }
  300|       |
  301|  56.7k|  pthread_mutex_unlock(lf_sync->job_mutex);
  302|       |#else
  303|       |  (void)lf_sync;
  304|       |#endif
  305|       |
  306|  56.7k|  return cur_job_info;
  307|  56.7k|}
thread_common.c:loop_filter_data_reset:
  312|  45.1k|                                          MACROBLOCKD *xd) {
  313|  45.1k|  struct macroblockd_plane *pd = xd->plane;
  314|  45.1k|  lf_data->frame_buffer = frame_buffer;
  315|  45.1k|  lf_data->cm = cm;
  316|  45.1k|  lf_data->xd = xd;
  317|   180k|  for (int i = 0; i < MAX_MB_PLANE; i++) {
  ------------------
  |  |   36|   180k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (317:19): [True: 135k, False: 45.1k]
  ------------------
  318|   135k|    lf_data->planes[i].dst = pd[i].dst;
  319|   135k|    lf_data->planes[i].subsampling_x = pd[i].subsampling_x;
  320|   135k|    lf_data->planes[i].subsampling_y = pd[i].subsampling_y;
  321|   135k|  }
  322|  45.1k|}

av1_tile_init:
   19|  63.3k|void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) {
   20|  63.3k|  av1_tile_set_row(tile, cm, row);
   21|  63.3k|  av1_tile_set_col(tile, cm, col);
   22|  63.3k|}
av1_get_tile_limits:
   32|  26.5k|void av1_get_tile_limits(AV1_COMMON *const cm) {
   33|  26.5k|  const SequenceHeader *const seq_params = cm->seq_params;
   34|  26.5k|  CommonTileParams *const tiles = &cm->tiles;
   35|  26.5k|  const int sb_cols =
   36|  26.5k|      CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2);
  ------------------
  |  |   62|  26.5k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
   37|  26.5k|  const int sb_rows =
   38|  26.5k|      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2);
  ------------------
  |  |   62|  26.5k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
   39|       |
   40|  26.5k|  const int sb_size_log2 = seq_params->mib_size_log2 + MI_SIZE_LOG2;
  ------------------
  |  |   39|  26.5k|#define MI_SIZE_LOG2 2
  ------------------
   41|  26.5k|  tiles->max_width_sb = MAX_TILE_WIDTH >> sb_size_log2;
  ------------------
  |  |   50|  26.5k|#define MAX_TILE_WIDTH (4096)        // Max Tile width in pixels
  ------------------
   42|       |
   43|       |#if CONFIG_CWG_C013
   44|       |  bool use_level_7_above = false;
   45|       |  for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
   46|       |    if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_7_0 &&
   47|       |        seq_params->seq_level_idx[i] <= SEQ_LEVEL_8_3) {
   48|       |      // Currently it is assumed that levels 7.x and 8.x are either used for all
   49|       |      // operating points, or none of them.
   50|       |      if (i != 0 && !use_level_7_above) {
   51|       |        aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
   52|       |                           "Either all the operating points are levels 7.x or "
   53|       |                           "8.x, or none of them are.");
   54|       |      }
   55|       |      use_level_7_above = true;
   56|       |    }
   57|       |  }
   58|       |  const int max_tile_area_sb =
   59|       |      (use_level_7_above ? MAX_TILE_AREA_LEVEL_7_AND_ABOVE : MAX_TILE_AREA) >>
   60|       |      (2 * sb_size_log2);
   61|       |#else
   62|  26.5k|  const int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2);
  ------------------
  |  |   51|  26.5k|#define MAX_TILE_AREA (4096 * 2304)  // Maximum tile area in pixels
  ------------------
   63|  26.5k|#endif
   64|       |
   65|  26.5k|  tiles->min_log2_cols = tile_log2(tiles->max_width_sb, sb_cols);
   66|  26.5k|  tiles->max_log2_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS));
  ------------------
  |  |   34|  26.5k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 26.5k, False: 0]
  |  |  ------------------
  ------------------
   67|  26.5k|  tiles->max_log2_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS));
  ------------------
  |  |   34|  26.5k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 26.5k, False: 0]
  |  |  ------------------
  ------------------
   68|  26.5k|  tiles->min_log2 = tile_log2(max_tile_area_sb, sb_cols * sb_rows);
   69|  26.5k|  tiles->min_log2 = AOMMAX(tiles->min_log2, tiles->min_log2_cols);
  ------------------
  |  |   35|  26.5k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 26.5k]
  |  |  ------------------
  ------------------
   70|  26.5k|}
av1_calculate_tile_cols:
   74|  26.5k|                             CommonTileParams *const tiles) {
   75|  26.5k|  int sb_cols = CEIL_POWER_OF_TWO(cm_mi_cols, seq_params->mib_size_log2);
  ------------------
  |  |   62|  26.5k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
   76|  26.5k|  int sb_rows = CEIL_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2);
  ------------------
  |  |   62|  26.5k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
   77|  26.5k|  int i;
   78|       |
   79|       |  // This will be overridden if there is at least two columns of tiles
   80|       |  // (otherwise there is no inner tile width)
   81|  26.5k|  tiles->min_inner_width = -1;
   82|       |
   83|  26.5k|  if (tiles->uniform_spacing) {
  ------------------
  |  Branch (83:7): [True: 23.9k, False: 2.53k]
  ------------------
   84|  23.9k|    int start_sb;
   85|  23.9k|    int size_sb = CEIL_POWER_OF_TWO(sb_cols, tiles->log2_cols);
  ------------------
  |  |   62|  23.9k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
   86|  23.9k|    assert(size_sb > 0);
   87|  48.2k|    for (i = 0, start_sb = 0; start_sb < sb_cols; i++) {
  ------------------
  |  Branch (87:31): [True: 24.3k, False: 23.9k]
  ------------------
   88|  24.3k|      tiles->col_start_sb[i] = start_sb;
   89|  24.3k|      start_sb += size_sb;
   90|  24.3k|    }
   91|  23.9k|    tiles->cols = i;
   92|  23.9k|    tiles->col_start_sb[i] = sb_cols;
   93|  23.9k|    tiles->min_log2_rows = AOMMAX(tiles->min_log2 - tiles->log2_cols, 0);
  ------------------
  |  |   35|  23.9k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 23.9k]
  |  |  ------------------
  ------------------
   94|  23.9k|    tiles->max_height_sb = sb_rows >> tiles->min_log2_rows;
   95|       |
   96|  23.9k|    tiles->width = size_sb << seq_params->mib_size_log2;
   97|  23.9k|    tiles->width = AOMMIN(tiles->width, cm_mi_cols);
  ------------------
  |  |   34|  23.9k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 239, False: 23.7k]
  |  |  ------------------
  ------------------
   98|  23.9k|    if (tiles->cols > 1) {
  ------------------
  |  Branch (98:9): [True: 239, False: 23.7k]
  ------------------
   99|    239|      tiles->min_inner_width = tiles->width;
  100|    239|    }
  101|  23.9k|  } else {
  102|  2.53k|    int max_tile_area_sb = (sb_rows * sb_cols);
  103|  2.53k|    int widest_tile_sb = 1;
  104|  2.53k|    int narrowest_inner_tile_sb = 65536;
  105|  2.53k|    tiles->log2_cols = tile_log2(1, tiles->cols);
  106|  5.41k|    for (i = 0; i < tiles->cols; i++) {
  ------------------
  |  Branch (106:17): [True: 2.88k, False: 2.53k]
  ------------------
  107|  2.88k|      int size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
  108|  2.88k|      widest_tile_sb = AOMMAX(widest_tile_sb, size_sb);
  ------------------
  |  |   35|  2.88k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 72, False: 2.81k]
  |  |  ------------------
  ------------------
  109|       |      // ignore the rightmost tile in frame for determining the narrowest
  110|  2.88k|      if (i < tiles->cols - 1)
  ------------------
  |  Branch (110:11): [True: 351, False: 2.53k]
  ------------------
  111|    351|        narrowest_inner_tile_sb = AOMMIN(narrowest_inner_tile_sb, size_sb);
  ------------------
  |  |   34|    351|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 18, False: 333]
  |  |  ------------------
  ------------------
  112|  2.88k|    }
  113|  2.53k|    if (tiles->min_log2) {
  ------------------
  |  Branch (113:9): [True: 0, False: 2.53k]
  ------------------
  114|      0|      max_tile_area_sb >>= (tiles->min_log2 + 1);
  115|      0|    }
  116|  2.53k|    tiles->max_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1);
  ------------------
  |  |   35|  2.53k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 561, False: 1.97k]
  |  |  ------------------
  ------------------
  117|  2.53k|    if (tiles->cols > 1) {
  ------------------
  |  Branch (117:9): [True: 207, False: 2.32k]
  ------------------
  118|    207|      tiles->min_inner_width = narrowest_inner_tile_sb
  119|    207|                               << seq_params->mib_size_log2;
  120|    207|    }
  121|  2.53k|  }
  122|  26.5k|}
av1_calculate_tile_rows:
  125|  26.5k|                             int cm_mi_rows, CommonTileParams *const tiles) {
  126|  26.5k|  int sb_rows = CEIL_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2);
  ------------------
  |  |   62|  26.5k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
  127|  26.5k|  int start_sb, size_sb, i;
  128|       |
  129|  26.5k|  if (tiles->uniform_spacing) {
  ------------------
  |  Branch (129:7): [True: 23.9k, False: 2.53k]
  ------------------
  130|  23.9k|    size_sb = CEIL_POWER_OF_TWO(sb_rows, tiles->log2_rows);
  ------------------
  |  |   62|  23.9k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
  131|  23.9k|    assert(size_sb > 0);
  132|  48.8k|    for (i = 0, start_sb = 0; start_sb < sb_rows; i++) {
  ------------------
  |  Branch (132:31): [True: 24.9k, False: 23.9k]
  ------------------
  133|  24.9k|      tiles->row_start_sb[i] = start_sb;
  134|  24.9k|      start_sb += size_sb;
  135|  24.9k|    }
  136|  23.9k|    tiles->rows = i;
  137|  23.9k|    tiles->row_start_sb[i] = sb_rows;
  138|       |
  139|  23.9k|    tiles->height = size_sb << seq_params->mib_size_log2;
  140|  23.9k|    tiles->height = AOMMIN(tiles->height, cm_mi_rows);
  ------------------
  |  |   34|  23.9k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 870, False: 23.1k]
  |  |  ------------------
  ------------------
  141|  23.9k|  } else {
  142|  2.53k|    tiles->log2_rows = tile_log2(1, tiles->rows);
  143|  2.53k|  }
  144|  26.5k|}
av1_tile_set_row:
  146|  76.0k|void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) {
  147|  76.0k|  assert(row < cm->tiles.rows);
  148|  76.0k|  int mi_row_start = cm->tiles.row_start_sb[row]
  149|  76.0k|                     << cm->seq_params->mib_size_log2;
  150|  76.0k|  int mi_row_end = cm->tiles.row_start_sb[row + 1]
  151|  76.0k|                   << cm->seq_params->mib_size_log2;
  152|  76.0k|  tile->tile_row = row;
  153|  76.0k|  tile->mi_row_start = mi_row_start;
  154|  76.0k|  tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_params.mi_rows);
  ------------------
  |  |   34|  76.0k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.47k, False: 72.5k]
  |  |  ------------------
  ------------------
  155|       |  assert(tile->mi_row_end > tile->mi_row_start);
  156|  76.0k|}
av1_tile_set_col:
  158|  76.6k|void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
  159|  76.6k|  assert(col < cm->tiles.cols);
  160|  76.6k|  int mi_col_start = cm->tiles.col_start_sb[col]
  161|  76.6k|                     << cm->seq_params->mib_size_log2;
  162|  76.6k|  int mi_col_end = cm->tiles.col_start_sb[col + 1]
  163|  76.6k|                   << cm->seq_params->mib_size_log2;
  164|  76.6k|  tile->tile_col = col;
  165|  76.6k|  tile->mi_col_start = mi_col_start;
  166|  76.6k|  tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_params.mi_cols);
  ------------------
  |  |   34|  76.6k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.11k, False: 74.4k]
  |  |  ------------------
  ------------------
  167|       |  assert(tile->mi_col_end > tile->mi_col_start);
  168|  76.6k|}
av1_get_sb_rows_in_tile:
  170|  64.6k|int av1_get_sb_rows_in_tile(const AV1_COMMON *cm, const TileInfo *tile) {
  171|  64.6k|  return CEIL_POWER_OF_TWO(tile->mi_row_end - tile->mi_row_start,
  ------------------
  |  |   62|  64.6k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
  172|  64.6k|                           cm->seq_params->mib_size_log2);
  173|  64.6k|}
av1_get_sb_cols_in_tile:
  175|  21.9k|int av1_get_sb_cols_in_tile(const AV1_COMMON *cm, const TileInfo *tile) {
  176|  21.9k|  return CEIL_POWER_OF_TWO(tile->mi_col_end - tile->mi_col_start,
  ------------------
  |  |   62|  21.9k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
  177|  21.9k|                           cm->seq_params->mib_size_log2);
  178|  21.9k|}
av1_is_min_tile_width_satisfied:
  220|  26.4k|int av1_is_min_tile_width_satisfied(const AV1_COMMON *cm) {
  221|       |  // Disable check if there is a single tile col in the frame
  222|  26.4k|  if (cm->tiles.cols == 1) return 1;
  ------------------
  |  Branch (222:7): [True: 26.0k, False: 437]
  ------------------
  223|       |
  224|    437|  return ((cm->tiles.min_inner_width << MI_SIZE_LOG2) >=
  ------------------
  |  |   39|    437|#define MI_SIZE_LOG2 2
  ------------------
  225|    437|          (64 << av1_superres_scaled(cm)));
  226|  26.4k|}
tile_common.c:tile_log2:
   25|   111k|static int tile_log2(int blk_size, int target) {
   26|   111k|  int k;
   27|   166k|  for (k = 0; (blk_size << k) < target; k++) {
  ------------------
  |  Branch (27:15): [True: 55.7k, False: 111k]
  ------------------
   28|  55.7k|  }
   29|   111k|  return k;
   30|   111k|}

av1_max_level_bitrate:
   54|     73|                              int seq_tier) {
   55|     73|  int64_t bitrate;
   56|       |
   57|     73|  if (seq_tier) {
  ------------------
  |  Branch (57:7): [True: 37, False: 36]
  ------------------
   58|     37|    bitrate = high_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile];
   59|     37|  } else {
   60|     36|    bitrate = main_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile];
   61|     36|  }
   62|       |
   63|     73|  return bitrate * 1000;
   64|     73|}

decodetxb.c:get_txb_ctx:
  450|  4.71M|                               TXB_CTX *const txb_ctx) {
  451|  4.71M|  switch (tx_size) {
  452|  1.64M|    case TX_4X4: get_txb_ctx_4x4(plane_bsize, plane, a, l, txb_ctx); break;
  ------------------
  |  Branch (452:5): [True: 1.64M, False: 3.06M]
  ------------------
  453|  1.07M|    case TX_8X8: get_txb_ctx_8x8(plane_bsize, plane, a, l, txb_ctx); break;
  ------------------
  |  Branch (453:5): [True: 1.07M, False: 3.64M]
  ------------------
  454|   380k|    case TX_16X16: get_txb_ctx_16x16(plane_bsize, plane, a, l, txb_ctx); break;
  ------------------
  |  Branch (454:5): [True: 380k, False: 4.33M]
  ------------------
  455|   235k|    case TX_32X32: get_txb_ctx_32x32(plane_bsize, plane, a, l, txb_ctx); break;
  ------------------
  |  Branch (455:5): [True: 235k, False: 4.48M]
  ------------------
  456|  1.37M|    default:
  ------------------
  |  Branch (456:5): [True: 1.37M, False: 3.34M]
  ------------------
  457|  1.37M|      get_txb_ctx_general(plane_bsize, tx_size, plane, a, l, txb_ctx);
  458|  1.37M|      break;
  459|  4.71M|  }
  460|  4.71M|}
decodetxb.c:get_txb_ctx_general:
  285|  1.37M|                                TXB_CTX *const txb_ctx) {
  286|  1.37M|#define MAX_TX_SIZE_UNIT 16
  287|  1.37M|  static const int8_t signs[3] = { 0, -1, 1 };
  288|  1.37M|  static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = {
  289|  1.37M|    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  290|  1.37M|    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  291|  1.37M|    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
  292|  1.37M|  };
  293|  1.37M|  const int txb_w_unit = tx_size_wide_unit[tx_size];
  294|  1.37M|  const int txb_h_unit = tx_size_high_unit[tx_size];
  295|  1.37M|  int dc_sign = 0;
  296|  1.37M|  int k = 0;
  297|       |
  298|  5.08M|  do {
  299|  5.08M|    const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS;
  ------------------
  |  |   51|  5.08M|#define COEFF_CONTEXT_BITS 3
  ------------------
  300|  5.08M|    assert(sign <= 2);
  301|  5.08M|    dc_sign += signs[sign];
  302|  5.08M|  } while (++k < txb_w_unit);
  ------------------
  |  Branch (302:12): [True: 3.70M, False: 1.37M]
  ------------------
  303|       |
  304|  1.37M|  k = 0;
  305|  4.35M|  do {
  306|  4.35M|    const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS;
  ------------------
  |  |   51|  4.35M|#define COEFF_CONTEXT_BITS 3
  ------------------
  307|  4.35M|    assert(sign <= 2);
  308|  4.35M|    dc_sign += signs[sign];
  309|  4.35M|  } while (++k < txb_h_unit);
  ------------------
  |  Branch (309:12): [True: 2.98M, False: 1.37M]
  ------------------
  310|       |
  311|  1.37M|  txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT];
  ------------------
  |  |  286|  1.37M|#define MAX_TX_SIZE_UNIT 16
  ------------------
  312|       |
  313|  1.37M|  if (plane == 0) {
  ------------------
  |  Branch (313:7): [True: 632k, False: 744k]
  ------------------
  314|   632k|    if (plane_bsize == txsize_to_bsize[tx_size]) {
  ------------------
  |  Branch (314:9): [True: 562k, False: 70.5k]
  ------------------
  315|   562k|      txb_ctx->txb_skip_ctx = 0;
  316|   562k|    } else {
  317|       |      // This is the algorithm to generate table skip_contexts[top][left].
  318|       |      //    const int max = AOMMIN(top | left, 4);
  319|       |      //    const int min = AOMMIN(AOMMIN(top, left), 4);
  320|       |      //    if (!max)
  321|       |      //      txb_skip_ctx = 1;
  322|       |      //    else if (!min)
  323|       |      //      txb_skip_ctx = 2 + (max > 3);
  324|       |      //    else if (max <= 3)
  325|       |      //      txb_skip_ctx = 4;
  326|       |      //    else if (min <= 3)
  327|       |      //      txb_skip_ctx = 5;
  328|       |      //    else
  329|       |      //      txb_skip_ctx = 6;
  330|  70.5k|      static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 },
  331|  70.5k|                                                   { 2, 4, 4, 4, 5 },
  332|  70.5k|                                                   { 2, 4, 4, 4, 5 },
  333|  70.5k|                                                   { 2, 4, 4, 4, 5 },
  334|  70.5k|                                                   { 3, 5, 5, 5, 6 } };
  335|       |      // For top and left, we only care about which of the following three
  336|       |      // categories they belong to: { 0 }, { 1, 2, 3 }, or { 4, 5, ... }. The
  337|       |      // spec calculates top and left with the Max() function. We can calculate
  338|       |      // an approximate max with bitwise OR because the real max and the
  339|       |      // approximate max belong to the same category.
  340|  70.5k|      int top = 0;
  341|  70.5k|      int left = 0;
  342|       |
  343|  70.5k|      k = 0;
  344|   210k|      do {
  345|   210k|        top |= a[k];
  346|   210k|      } while (++k < txb_w_unit);
  ------------------
  |  Branch (346:16): [True: 140k, False: 70.5k]
  ------------------
  347|  70.5k|      top &= COEFF_CONTEXT_MASK;
  ------------------
  |  |   52|  70.5k|#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
  |  |  ------------------
  |  |  |  |   51|  70.5k|#define COEFF_CONTEXT_BITS 3
  |  |  ------------------
  ------------------
  348|  70.5k|      top = AOMMIN(top, 4);
  ------------------
  |  |   34|  70.5k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 31.7k, False: 38.7k]
  |  |  ------------------
  ------------------
  349|       |
  350|  70.5k|      k = 0;
  351|   186k|      do {
  352|   186k|        left |= l[k];
  353|   186k|      } while (++k < txb_h_unit);
  ------------------
  |  Branch (353:16): [True: 115k, False: 70.5k]
  ------------------
  354|  70.5k|      left &= COEFF_CONTEXT_MASK;
  ------------------
  |  |   52|  70.5k|#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
  |  |  ------------------
  |  |  |  |   51|  70.5k|#define COEFF_CONTEXT_BITS 3
  |  |  ------------------
  ------------------
  355|  70.5k|      left = AOMMIN(left, 4);
  ------------------
  |  |   34|  70.5k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 31.8k, False: 38.6k]
  |  |  ------------------
  ------------------
  356|       |
  357|  70.5k|      txb_ctx->txb_skip_ctx = skip_contexts[top][left];
  358|  70.5k|    }
  359|   744k|  } else {
  360|   744k|    const int ctx_base = get_entropy_context(tx_size, a, l);
  361|   744k|    const int ctx_offset = (num_pels_log2_lookup[plane_bsize] >
  ------------------
  |  Branch (361:28): [True: 30.6k, False: 713k]
  ------------------
  362|   744k|                            num_pels_log2_lookup[txsize_to_bsize[tx_size]])
  363|   744k|                               ? 10
  364|   744k|                               : 7;
  365|   744k|    txb_ctx->txb_skip_ctx = ctx_base + ctx_offset;
  366|   744k|  }
  367|  1.37M|}
decodetxb.c:get_txb_bhl:
   50|  4.71M|static inline int get_txb_bhl(TX_SIZE tx_size) {
   51|  4.71M|  tx_size = av1_get_adjusted_tx_size(tx_size);
   52|  4.71M|  return tx_size_high_log2[tx_size];
   53|  4.71M|}
decodetxb.c:get_txb_wide:
   55|  4.72M|static inline int get_txb_wide(TX_SIZE tx_size) {
   56|  4.72M|  tx_size = av1_get_adjusted_tx_size(tx_size);
   57|  4.72M|  return tx_size_wide[tx_size];
   58|  4.72M|}
decodetxb.c:get_txb_high:
   60|  4.72M|static inline int get_txb_high(TX_SIZE tx_size) {
   61|  4.72M|  tx_size = av1_get_adjusted_tx_size(tx_size);
   62|  4.72M|  return tx_size_high[tx_size];
   63|  4.72M|}
decodetxb.c:set_levels:
   65|  4.71M|static inline uint8_t *set_levels(uint8_t *const levels_buf, const int height) {
   66|  4.71M|  return levels_buf + TX_PAD_TOP * (height + TX_PAD_HOR);
  ------------------
  |  |  193|  4.71M|#define TX_PAD_TOP 0
  ------------------
                return levels_buf + TX_PAD_TOP * (height + TX_PAD_HOR);
  ------------------
  |  |  190|  4.71M|#define TX_PAD_HOR 4
  ------------------
   67|  4.71M|}
decodetxb.c:get_lower_levels_ctx_eob:
  229|  2.77M|static inline int get_lower_levels_ctx_eob(int bhl, int width, int scan_idx) {
  230|  2.77M|  if (scan_idx == 0) return 0;
  ------------------
  |  Branch (230:7): [True: 473k, False: 2.30M]
  ------------------
  231|  2.30M|  if (scan_idx <= (width << bhl) / 8) return 1;
  ------------------
  |  Branch (231:7): [True: 662k, False: 1.64M]
  ------------------
  232|  1.64M|  if (scan_idx <= (width << bhl) / 4) return 2;
  ------------------
  |  Branch (232:7): [True: 282k, False: 1.35M]
  ------------------
  233|  1.35M|  return 3;
  234|  1.64M|}
decodetxb.c:get_br_ctx_eob:
   92|   207k|                                           const TX_CLASS tx_class) {
   93|   207k|  const int col = c >> bhl;
   94|   207k|  const int row = c - (col << bhl);
   95|   207k|  if (c == 0) return 0;
  ------------------
  |  Branch (95:7): [True: 64.5k, False: 142k]
  ------------------
   96|   142k|  if ((tx_class == TX_CLASS_2D && row < 2 && col < 2) ||
  ------------------
  |  Branch (96:8): [True: 135k, False: 7.79k]
  |  Branch (96:35): [True: 46.6k, False: 88.4k]
  |  Branch (96:46): [True: 25.9k, False: 20.7k]
  ------------------
   97|   116k|      (tx_class == TX_CLASS_HORIZ && col == 0) ||
  ------------------
  |  Branch (97:8): [True: 5.77k, False: 111k]
  |  Branch (97:38): [True: 155, False: 5.61k]
  ------------------
   98|   116k|      (tx_class == TX_CLASS_VERT && row == 0))
  ------------------
  |  Branch (98:8): [True: 2.02k, False: 114k]
  |  Branch (98:37): [True: 135, False: 1.89k]
  ------------------
   99|  26.2k|    return 7;
  100|   116k|  return 14;
  101|   142k|}
decodetxb.c:get_padded_idx:
   69|   177M|static inline int get_padded_idx(const int idx, const int bhl) {
   70|   177M|  return idx + ((idx >> bhl) << TX_PAD_HOR_LOG2);
  ------------------
  |  |  189|   177M|#define TX_PAD_HOR_LOG2 2
  ------------------
   71|   177M|}
decodetxb.c:get_lower_levels_ctx_2d:
  237|  53.4M|                                          int bhl, TX_SIZE tx_size) {
  238|  53.4M|  assert(coeff_idx > 0);
  239|  53.4M|  int mag;
  240|       |  // Note: AOMMIN(level, 3) is useless for decoder since level < 3.
  241|  53.4M|  levels = levels + get_padded_idx(coeff_idx, bhl);
  242|  53.4M|  mag = AOMMIN(levels[(1 << bhl) + TX_PAD_HOR], 3);               // { 0, 1 }
  ------------------
  |  |   34|  53.4M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 49.6M, False: 3.81M]
  |  |  ------------------
  ------------------
  243|  53.4M|  mag += AOMMIN(levels[1], 3);                                    // { 1, 0 }
  ------------------
  |  |   34|  53.4M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 49.6M, False: 3.80M]
  |  |  ------------------
  ------------------
  244|  53.4M|  mag += AOMMIN(levels[(1 << bhl) + TX_PAD_HOR + 1], 3);          // { 1, 1 }
  ------------------
  |  |   34|  53.4M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 50.5M, False: 2.88M]
  |  |  ------------------
  ------------------
  245|  53.4M|  mag += AOMMIN(levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)], 3);  // { 0, 2 }
  ------------------
  |  |   34|  53.4M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 50.6M, False: 2.76M]
  |  |  ------------------
  ------------------
  246|  53.4M|  mag += AOMMIN(levels[2], 3);                                    // { 2, 0 }
  ------------------
  |  |   34|  53.4M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 50.7M, False: 2.73M]
  |  |  ------------------
  ------------------
  247|       |
  248|  53.4M|  const int ctx = AOMMIN((mag + 1) >> 1, 4);
  ------------------
  |  |   34|  53.4M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 48.4M, False: 5.01M]
  |  |  ------------------
  ------------------
  249|  53.4M|  return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx];
  250|  53.4M|}
decodetxb.c:get_br_ctx_2d:
   75|  5.18M|                                const int bhl) {
   76|  5.18M|  assert(c > 0);
   77|  5.18M|  const int col = c >> bhl;
   78|  5.18M|  const int row = c - (col << bhl);
   79|  5.18M|  const int stride = (1 << bhl) + TX_PAD_HOR;
  ------------------
  |  |  190|  5.18M|#define TX_PAD_HOR 4
  ------------------
   80|  5.18M|  const int pos = col * stride + row;
   81|  5.18M|  int mag = AOMMIN(levels[pos + 1], MAX_BASE_BR_RANGE) +
  ------------------
  |  |   34|  5.18M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.00M, False: 183k]
  |  |  ------------------
  ------------------
   82|  5.18M|            AOMMIN(levels[pos + stride], MAX_BASE_BR_RANGE) +
  ------------------
  |  |   34|  5.18M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.01M, False: 175k]
  |  |  ------------------
  ------------------
   83|  5.18M|            AOMMIN(levels[pos + 1 + stride], MAX_BASE_BR_RANGE);
  ------------------
  |  |   34|  5.18M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.06M, False: 123k]
  |  |  ------------------
  ------------------
   84|  5.18M|  mag = AOMMIN((mag + 1) >> 1, 6);
  ------------------
  |  |   34|  5.18M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.80M, False: 1.38M]
  |  |  ------------------
  ------------------
   85|       |  //((row | col) < 2) is equivalent to ((row < 2) && (col < 2))
   86|  5.18M|  if ((row | col) < 2) return mag + 7;
  ------------------
  |  Branch (86:7): [True: 1.13M, False: 4.04M]
  ------------------
   87|  4.04M|  return mag + 14;
   88|  5.18M|}
decodetxb.c:get_lower_levels_ctx:
  254|  5.29M|                                                 TX_CLASS tx_class) {
  255|  5.29M|  const int stats =
  256|  5.29M|      get_nz_mag(levels + get_padded_idx(coeff_idx, bhl), bhl, tx_class);
  257|  5.29M|  return get_nz_map_ctx_from_stats(stats, coeff_idx, bhl, tx_size, tx_class);
  258|  5.29M|}
decodetxb.c:get_nz_mag:
  151|  5.28M|                                       const int bhl, const TX_CLASS tx_class) {
  152|  5.28M|  int mag;
  153|       |
  154|       |  // Note: AOMMIN(level, 3) is useless for decoder since level < 3.
  155|  5.28M|  mag = clip_max3[levels[(1 << bhl) + TX_PAD_HOR]];  // { 0, 1 }
  ------------------
  |  |  190|  5.28M|#define TX_PAD_HOR 4
  ------------------
  156|  5.28M|  mag += clip_max3[levels[1]];                       // { 1, 0 }
  157|       |
  158|  5.28M|  if (tx_class == TX_CLASS_2D) {
  ------------------
  |  Branch (158:7): [True: 2.17M, False: 3.11M]
  ------------------
  159|  2.17M|    mag += clip_max3[levels[(1 << bhl) + TX_PAD_HOR + 1]];          // { 1, 1 }
  ------------------
  |  |  190|  2.17M|#define TX_PAD_HOR 4
  ------------------
  160|  2.17M|    mag += clip_max3[levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)]];  // { 0, 2 }
  ------------------
  |  |  189|  2.17M|#define TX_PAD_HOR_LOG2 2
  ------------------
  161|  2.17M|    mag += clip_max3[levels[2]];                                    // { 2, 0 }
  162|  3.11M|  } else if (tx_class == TX_CLASS_VERT) {
  ------------------
  |  Branch (162:14): [True: 1.03M, False: 2.08M]
  ------------------
  163|  1.03M|    mag += clip_max3[levels[2]];  // { 2, 0 }
  164|  1.03M|    mag += clip_max3[levels[3]];  // { 3, 0 }
  165|  1.03M|    mag += clip_max3[levels[4]];  // { 4, 0 }
  166|  2.08M|  } else {
  167|  2.08M|    mag += clip_max3[levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)]];  // { 0, 2 }
  ------------------
  |  |  189|  2.08M|#define TX_PAD_HOR_LOG2 2
  ------------------
  168|  2.08M|    mag += clip_max3[levels[(3 << bhl) + (3 << TX_PAD_HOR_LOG2)]];  // { 0, 3 }
  ------------------
  |  |  189|  2.08M|#define TX_PAD_HOR_LOG2 2
  ------------------
  169|  2.08M|    mag += clip_max3[levels[(4 << bhl) + (4 << TX_PAD_HOR_LOG2)]];  // { 0, 4 }
  ------------------
  |  |  189|  2.08M|#define TX_PAD_HOR_LOG2 2
  ------------------
  170|  2.08M|  }
  171|       |
  172|  5.28M|  return mag;
  173|  5.28M|}
decodetxb.c:get_nz_map_ctx_from_stats:
  192|  5.28M|    const int bhl, const TX_SIZE tx_size, const TX_CLASS tx_class) {
  193|       |  // tx_class == 0(TX_CLASS_2D)
  194|  5.28M|  if ((tx_class | coeff_idx) == 0) return 0;
  ------------------
  |  Branch (194:7): [True: 2.17M, False: 3.11M]
  ------------------
  195|  3.11M|  int ctx = (stats + 1) >> 1;
  196|  3.11M|  ctx = AOMMIN(ctx, 4);
  ------------------
  |  |   34|  3.11M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.00M, False: 109k]
  |  |  ------------------
  ------------------
  197|  3.11M|  switch (tx_class) {
  198|      0|    case TX_CLASS_2D: {
  ------------------
  |  Branch (198:5): [True: 0, False: 3.11M]
  ------------------
  199|       |      // This is the algorithm to generate av1_nz_map_ctx_offset[][]
  200|       |      //   const int width = tx_size_wide[tx_size];
  201|       |      //   const int height = tx_size_high[tx_size];
  202|       |      //   if (width < height) {
  203|       |      //     if (row < 2) return 11 + ctx;
  204|       |      //   } else if (width > height) {
  205|       |      //     if (col < 2) return 16 + ctx;
  206|       |      //   }
  207|       |      //   if (row + col < 2) return ctx + 1;
  208|       |      //   if (row + col < 4) return 5 + ctx + 1;
  209|       |      //   return 21 + ctx;
  210|      0|      return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx];
  211|      0|    }
  212|  2.09M|    case TX_CLASS_HORIZ: {
  ------------------
  |  Branch (212:5): [True: 2.09M, False: 1.02M]
  ------------------
  213|  2.09M|      const int col = coeff_idx >> bhl;
  214|  2.09M|      return ctx + nz_map_ctx_offset_1d[col];
  215|      0|    }
  216|  1.03M|    case TX_CLASS_VERT: {
  ------------------
  |  Branch (216:5): [True: 1.03M, False: 2.08M]
  ------------------
  217|  1.03M|      const int col = coeff_idx >> bhl;
  218|  1.03M|      const int row = coeff_idx - (col << bhl);
  219|  1.03M|      return ctx + nz_map_ctx_offset_1d[row];
  220|      0|    }
  221|      0|    default: break;
  ------------------
  |  Branch (221:5): [True: 0, False: 3.11M]
  ------------------
  222|  3.11M|  }
  223|      0|  return 0;
  224|  3.11M|}
decodetxb.c:get_br_ctx:
  105|  1.11M|                                       const int bhl, const TX_CLASS tx_class) {
  106|  1.11M|  const int col = c >> bhl;
  107|  1.11M|  const int row = c - (col << bhl);
  108|  1.11M|  const int stride = (1 << bhl) + TX_PAD_HOR;
  ------------------
  |  |  190|  1.11M|#define TX_PAD_HOR 4
  ------------------
  109|  1.11M|  const int pos = col * stride + row;
  110|  1.11M|  int mag = levels[pos + 1];
  111|  1.11M|  mag += levels[pos + stride];
  112|  1.11M|  switch (tx_class) {
  113|   900k|    case TX_CLASS_2D:
  ------------------
  |  Branch (113:5): [True: 900k, False: 210k]
  ------------------
  114|   900k|      mag += levels[pos + stride + 1];
  115|   900k|      mag = AOMMIN((mag + 1) >> 1, 6);
  ------------------
  |  |   34|   900k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 734k, False: 165k]
  |  |  ------------------
  ------------------
  116|   900k|      if (c == 0) return mag;
  ------------------
  |  Branch (116:11): [True: 900k, False: 18.4E]
  ------------------
  117|  18.4E|      if ((row < 2) && (col < 2)) return mag + 7;
  ------------------
  |  Branch (117:11): [True: 0, False: 18.4E]
  |  Branch (117:24): [True: 0, False: 0]
  ------------------
  118|  18.4E|      break;
  119|  18.4E|    case TX_CLASS_HORIZ:
  ------------------
  |  Branch (119:5): [True: 156k, False: 954k]
  ------------------
  120|   156k|      mag += levels[pos + (stride << 1)];
  121|   156k|      mag = AOMMIN((mag + 1) >> 1, 6);
  ------------------
  |  |   34|   156k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 121k, False: 34.7k]
  |  |  ------------------
  ------------------
  122|   156k|      if (c == 0) return mag;
  ------------------
  |  Branch (122:11): [True: 15.6k, False: 140k]
  ------------------
  123|   140k|      if (col == 0) return mag + 7;
  ------------------
  |  Branch (123:11): [True: 44.8k, False: 95.6k]
  ------------------
  124|  95.6k|      break;
  125|  95.6k|    case TX_CLASS_VERT:
  ------------------
  |  Branch (125:5): [True: 54.9k, False: 1.05M]
  ------------------
  126|  54.9k|      mag += levels[pos + 2];
  127|  54.9k|      mag = AOMMIN((mag + 1) >> 1, 6);
  ------------------
  |  |   34|  54.9k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 45.7k, False: 9.20k]
  |  |  ------------------
  ------------------
  128|  54.9k|      if (c == 0) return mag;
  ------------------
  |  Branch (128:11): [True: 5.65k, False: 49.2k]
  ------------------
  129|  49.2k|      if (row == 0) return mag + 7;
  ------------------
  |  Branch (129:11): [True: 18.2k, False: 30.9k]
  ------------------
  130|  30.9k|      break;
  131|  30.9k|    default: break;
  ------------------
  |  Branch (131:5): [True: 0, False: 1.11M]
  ------------------
  132|  1.11M|  }
  133|       |
  134|   126k|  return mag + 14;
  135|  1.11M|}
decodetxb.c:set_dc_sign:
  274|  2.77M|static inline void set_dc_sign(int *cul_level, int dc_val) {
  275|  2.77M|  if (dc_val < 0)
  ------------------
  |  Branch (275:7): [True: 1.17M, False: 1.59M]
  ------------------
  276|  1.17M|    *cul_level |= 1 << COEFF_CONTEXT_BITS;
  ------------------
  |  |   51|  1.17M|#define COEFF_CONTEXT_BITS 3
  ------------------
  277|  1.59M|  else if (dc_val > 0)
  ------------------
  |  Branch (277:12): [True: 1.08M, False: 515k]
  ------------------
  278|  1.08M|    *cul_level += 2 << COEFF_CONTEXT_BITS;
  ------------------
  |  |   51|  1.08M|#define COEFF_CONTEXT_BITS 3
  ------------------
  279|  2.77M|}

av1_get_shear_params:
  243|  58.4k|int av1_get_shear_params(WarpedMotionParams *wm) {
  244|       |#ifndef NDEBUG
  245|       |  // Check that models have been constructed sensibly
  246|       |  // This is a good place to check, because this function does not need to
  247|       |  // be called until after model construction is complete, but must be called
  248|       |  // before the model can be used for prediction.
  249|       |  check_model_consistency(wm);
  250|       |#endif  // NDEBUG
  251|       |
  252|  58.4k|  const int32_t *mat = wm->wmmat;
  253|  58.4k|  if (!is_affine_valid(wm)) return 0;
  ------------------
  |  Branch (253:7): [True: 0, False: 58.4k]
  ------------------
  254|       |
  255|  58.4k|  wm->alpha =
  256|  58.4k|      clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
  ------------------
  |  |   96|  58.4k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  257|  58.4k|  wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
  258|  58.4k|  int16_t shift;
  259|  58.4k|  int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
  ------------------
  |  Branch (259:58): [True: 0, False: 58.4k]
  ------------------
  260|  58.4k|  int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
  ------------------
  |  |   96|  58.4k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  261|  58.4k|  wm->gamma =
  262|  58.4k|      clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
  ------------------
  |  |   58|  58.4k|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|  1.41k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 1.41k, False: 57.0k]
  |  |  ------------------
  |  |   59|  58.4k|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|  57.0k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  263|  58.4k|  v = ((int64_t)mat[3] * mat[4]) * y;
  264|  58.4k|  wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
  ------------------
  |  |   58|  58.4k|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|  1.78k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 1.78k, False: 56.6k]
  |  |  ------------------
  |  |   59|  58.4k|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|  56.6k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  265|  58.4k|                        (1 << WARPEDMODEL_PREC_BITS),
  ------------------
  |  |   96|  58.4k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  266|  58.4k|                    INT16_MIN, INT16_MAX);
  267|       |
  268|  58.4k|  wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
  ------------------
  |  |   45|  58.4k|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|  2.39k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 2.39k, False: 56.0k]
  |  |  ------------------
  |  |   46|  58.4k|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|  56.0k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  269|  58.4k|              (1 << WARP_PARAM_REDUCE_BITS);
  ------------------
  |  |  105|  58.4k|#define WARP_PARAM_REDUCE_BITS 6
  ------------------
  270|  58.4k|  wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
  ------------------
  |  |   45|  58.4k|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|  1.43k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 1.43k, False: 56.9k]
  |  |  ------------------
  |  |   46|  58.4k|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|  56.9k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  271|  58.4k|             (1 << WARP_PARAM_REDUCE_BITS);
  ------------------
  |  |  105|  58.4k|#define WARP_PARAM_REDUCE_BITS 6
  ------------------
  272|  58.4k|  wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
  ------------------
  |  |   45|  58.4k|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|  1.41k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 1.41k, False: 57.0k]
  |  |  ------------------
  |  |   46|  58.4k|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|  57.0k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  273|  58.4k|              (1 << WARP_PARAM_REDUCE_BITS);
  ------------------
  |  |  105|  58.4k|#define WARP_PARAM_REDUCE_BITS 6
  ------------------
  274|  58.4k|  wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
  ------------------
  |  |   45|  58.4k|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|  2.37k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 2.37k, False: 56.0k]
  |  |  ------------------
  |  |   46|  58.4k|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|  56.0k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  275|  58.4k|              (1 << WARP_PARAM_REDUCE_BITS);
  ------------------
  |  |  105|  58.4k|#define WARP_PARAM_REDUCE_BITS 6
  ------------------
  276|       |
  277|  58.4k|  if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
  ------------------
  |  Branch (277:7): [True: 191, False: 58.2k]
  ------------------
  278|    191|    return 0;
  279|       |
  280|  58.2k|  return 1;
  281|  58.4k|}
highbd_warp_plane:
  419|    956|                       int bd, ConvolveParams *conv_params) {
  420|    956|  const int32_t *const mat = wm->wmmat;
  421|    956|  const int16_t alpha = wm->alpha;
  422|    956|  const int16_t beta = wm->beta;
  423|    956|  const int16_t gamma = wm->gamma;
  424|    956|  const int16_t delta = wm->delta;
  425|       |
  426|    956|  av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
  427|    956|                         p_width, p_height, p_stride, subsampling_x,
  428|    956|                         subsampling_y, bd, conv_params, alpha, beta, gamma,
  429|    956|                         delta);
  430|    956|}
warp_plane:
  651|  3.22k|                int subsampling_y, ConvolveParams *conv_params) {
  652|  3.22k|  const int32_t *const mat = wm->wmmat;
  653|  3.22k|  const int16_t alpha = wm->alpha;
  654|  3.22k|  const int16_t beta = wm->beta;
  655|  3.22k|  const int16_t gamma = wm->gamma;
  656|  3.22k|  const int16_t delta = wm->delta;
  657|  3.22k|  av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
  658|  3.22k|                  p_height, p_stride, subsampling_x, subsampling_y, conv_params,
  659|  3.22k|                  alpha, beta, gamma, delta);
  660|  3.22k|}
av1_warp_plane:
  666|  4.17k|                    int subsampling_y, ConvolveParams *conv_params) {
  667|  4.17k|#if CONFIG_AV1_HIGHBITDEPTH
  668|  4.17k|  if (use_hbd)
  ------------------
  |  Branch (668:7): [True: 956, False: 3.22k]
  ------------------
  669|    956|    highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
  ------------------
  |  |   75|    956|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  670|    956|                      CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
  ------------------
  |  |   75|    956|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  671|    956|                      p_height, p_stride, subsampling_x, subsampling_y, bd,
  672|    956|                      conv_params);
  673|  3.22k|  else
  674|  3.22k|    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
  675|  3.22k|               p_height, p_stride, subsampling_x, subsampling_y, conv_params);
  676|       |#else
  677|       |  (void)use_hbd;
  678|       |  (void)bd;
  679|       |  warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
  680|       |             p_height, p_stride, subsampling_x, subsampling_y, conv_params);
  681|       |#endif
  682|  4.17k|}
av1_find_projection:
  908|  3.45k|                        WarpedMotionParams *wm_params, int mi_row, int mi_col) {
  909|  3.45k|  assert(wm_params->wmtype == AFFINE);
  910|       |
  911|  3.45k|  if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
  ------------------
  |  Branch (911:7): [True: 39, False: 3.41k]
  ------------------
  912|  3.45k|                      mi_col))
  913|     39|    return 1;
  914|       |
  915|       |  // check compatibility with the fast warp filter
  916|  3.41k|  if (!av1_get_shear_params(wm_params)) return 1;
  ------------------
  |  Branch (916:7): [True: 179, False: 3.23k]
  ------------------
  917|       |
  918|  3.23k|  return 0;
  919|  3.41k|}
warped_motion.c:is_affine_valid:
  205|  58.4k|static int is_affine_valid(const WarpedMotionParams *const wm) {
  206|  58.4k|  const int32_t *mat = wm->wmmat;
  207|  58.4k|  return (mat[2] > 0);
  208|  58.4k|}
warped_motion.c:resolve_divisor_32:
  189|  58.4k|static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
  190|  58.4k|  int32_t f;
  191|  58.4k|  *shift = get_msb(D);
  192|       |  // e is obtained from D after resetting the most significant 1 bit.
  193|  58.4k|  const int32_t e = D - ((uint32_t)1 << *shift);
  194|       |  // Get the most significant DIV_LUT_BITS (8) bits of e into f
  195|  58.4k|  if (*shift > DIV_LUT_BITS)
  ------------------
  |  |  140|  58.4k|#define DIV_LUT_BITS 8
  ------------------
  |  Branch (195:7): [True: 58.4k, False: 0]
  ------------------
  196|  58.4k|    f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
  ------------------
  |  |   41|  58.4k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  197|      0|  else
  198|      0|    f = e << (DIV_LUT_BITS - *shift);
  ------------------
  |  |  140|      0|#define DIV_LUT_BITS 8
  ------------------
  199|  58.4k|  assert(f <= DIV_LUT_NUM);
  200|  58.4k|  *shift += DIV_LUT_PREC_BITS;
  ------------------
  |  |  139|  58.4k|#define DIV_LUT_PREC_BITS 14
  ------------------
  201|       |  // Use f as lookup into the precomputed table of multipliers
  202|  58.4k|  return div_lut[f];
  203|  58.4k|}
warped_motion.c:is_affine_shear_allowed:
  211|  58.4k|                                   int16_t delta) {
  212|  58.4k|  if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
  ------------------
  |  |   96|  58.4k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  |  Branch (212:7): [True: 163, False: 58.2k]
  ------------------
  213|  58.2k|      (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
  ------------------
  |  |   96|  58.2k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  |  Branch (213:7): [True: 28, False: 58.2k]
  ------------------
  214|    191|    return 0;
  215|  58.2k|  else
  216|  58.2k|    return 1;
  217|  58.4k|}
warped_motion.c:find_affine_int:
  798|  3.45k|                           WarpedMotionParams *wm, int mi_row, int mi_col) {
  799|  3.45k|  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
  800|  3.45k|  int32_t Bx[2] = { 0, 0 };
  801|  3.45k|  int32_t By[2] = { 0, 0 };
  802|       |
  803|  3.45k|  const int bw = block_size_wide[bsize];
  804|  3.45k|  const int bh = block_size_high[bsize];
  805|  3.45k|  const int rsuy = bh / 2 - 1;
  806|  3.45k|  const int rsux = bw / 2 - 1;
  807|  3.45k|  const int suy = rsuy * 8;
  808|  3.45k|  const int sux = rsux * 8;
  809|  3.45k|  const int duy = suy + mvy;
  810|  3.45k|  const int dux = sux + mvx;
  811|       |
  812|       |  // Assume the center pixel of the block has exactly the same motion vector
  813|       |  // as transmitted for the block. First shift the origin of the source
  814|       |  // points to the block center, and the origin of the destination points to
  815|       |  // the block center added to the motion vector transmitted.
  816|       |  // Let (xi, yi) denote the source points and (xi', yi') denote destination
  817|       |  // points after origin shfifting, for i = 0, 1, 2, .... n-1.
  818|       |  // Then if  P = [x0, y0,
  819|       |  //               x1, y1
  820|       |  //               x2, y1,
  821|       |  //                ....
  822|       |  //              ]
  823|       |  //          q = [x0', x1', x2', ... ]'
  824|       |  //          r = [y0', y1', y2', ... ]'
  825|       |  // the least squares problems that need to be solved are:
  826|       |  //          [h1, h2]' = inv(P'P)P'q and
  827|       |  //          [h3, h4]' = inv(P'P)P'r
  828|       |  // where the affine transformation is given by:
  829|       |  //          x' = h1.x + h2.y
  830|       |  //          y' = h3.x + h4.y
  831|       |  //
  832|       |  // The loop below computes: A = P'P, Bx = P'q, By = P'r
  833|       |  // We need to just compute inv(A).Bx and inv(A).By for the solutions.
  834|       |  // Contribution from neighbor block
  835|  9.67k|  for (int i = 0; i < np; i++) {
  ------------------
  |  Branch (835:19): [True: 6.22k, False: 3.45k]
  ------------------
  836|  6.22k|    const int dx = pts2[i * 2] - dux;
  837|  6.22k|    const int dy = pts2[i * 2 + 1] - duy;
  838|  6.22k|    const int sx = pts1[i * 2] - sux;
  839|  6.22k|    const int sy = pts1[i * 2 + 1] - suy;
  840|       |    // (TODO)yunqing: This comparison wouldn't be necessary if the sample
  841|       |    // selection is done in find_samples(). Also, global offset can be removed
  842|       |    // while collecting samples.
  843|  6.22k|    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
  ------------------
  |  |  684|  12.4k|#define LS_MV_MAX 256  // max mv in 1/8-pel
  ------------------
                  if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
  ------------------
  |  |  684|  6.19k|#define LS_MV_MAX 256  // max mv in 1/8-pel
  ------------------
  |  Branch (843:9): [True: 6.19k, False: 29]
  |  Branch (843:37): [True: 6.18k, False: 10]
  ------------------
  844|  6.18k|      A[0][0] += LS_SQUARE(sx);
  ------------------
  |  |  710|  6.18k|  (((a) * (a) * 4 + (a) * 4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (a) * 4 + (a) * 4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (a) * 4 + (a) * 4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |  711|  6.18k|   (2 + LS_MAT_DOWN_BITS))
  |  |  ------------------
  |  |  |  |  700|  6.18k|#define LS_MAT_DOWN_BITS 2
  |  |  ------------------
  ------------------
  845|  6.18k|      A[0][1] += LS_PRODUCT1(sx, sy);
  ------------------
  |  |  713|  6.18k|  (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |  714|  6.18k|   (2 + LS_MAT_DOWN_BITS))
  |  |  ------------------
  |  |  |  |  700|  6.18k|#define LS_MAT_DOWN_BITS 2
  |  |  ------------------
  ------------------
  846|  6.18k|      A[1][1] += LS_SQUARE(sy);
  ------------------
  |  |  710|  6.18k|  (((a) * (a) * 4 + (a) * 4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (a) * 4 + (a) * 4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (a) * 4 + (a) * 4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |  711|  6.18k|   (2 + LS_MAT_DOWN_BITS))
  |  |  ------------------
  |  |  |  |  700|  6.18k|#define LS_MAT_DOWN_BITS 2
  |  |  ------------------
  ------------------
  847|  6.18k|      Bx[0] += LS_PRODUCT2(sx, dx);
  ------------------
  |  |  716|  6.18k|  (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |  717|  6.18k|   (2 + LS_MAT_DOWN_BITS))
  |  |  ------------------
  |  |  |  |  700|  6.18k|#define LS_MAT_DOWN_BITS 2
  |  |  ------------------
  ------------------
  848|  6.18k|      Bx[1] += LS_PRODUCT1(sy, dx);
  ------------------
  |  |  713|  6.18k|  (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |  714|  6.18k|   (2 + LS_MAT_DOWN_BITS))
  |  |  ------------------
  |  |  |  |  700|  6.18k|#define LS_MAT_DOWN_BITS 2
  |  |  ------------------
  ------------------
  849|  6.18k|      By[0] += LS_PRODUCT1(sx, dy);
  ------------------
  |  |  713|  6.18k|  (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |  714|  6.18k|   (2 + LS_MAT_DOWN_BITS))
  |  |  ------------------
  |  |  |  |  700|  6.18k|#define LS_MAT_DOWN_BITS 2
  |  |  ------------------
  ------------------
  850|  6.18k|      By[1] += LS_PRODUCT2(sy, dy);
  ------------------
  |  |  716|  6.18k|  (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|  6.18k|#define LS_STEP 8
  |  |  ------------------
  |  |  717|  6.18k|   (2 + LS_MAT_DOWN_BITS))
  |  |  ------------------
  |  |  |  |  700|  6.18k|#define LS_MAT_DOWN_BITS 2
  |  |  ------------------
  ------------------
  851|  6.18k|    }
  852|  6.22k|  }
  853|       |
  854|       |  // Just for debugging, and can be removed later.
  855|  3.45k|  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
  856|  3.45k|  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
  857|  3.45k|  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
  858|  3.45k|  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
  859|  3.45k|  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
  860|  3.45k|  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
  861|  3.45k|  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);
  862|       |
  863|       |  // Compute Determinant of A
  864|  3.45k|  const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
  865|  3.45k|  if (Det == 0) return 1;
  ------------------
  |  Branch (865:7): [True: 39, False: 3.41k]
  ------------------
  866|       |
  867|  3.41k|  int16_t shift;
  868|  3.41k|  int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
  ------------------
  |  Branch (868:60): [True: 0, False: 3.41k]
  ------------------
  869|  3.41k|  shift -= WARPEDMODEL_PREC_BITS;
  ------------------
  |  |   96|  3.41k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  870|  3.41k|  if (shift < 0) {
  ------------------
  |  Branch (870:7): [True: 0, False: 3.41k]
  ------------------
  871|      0|    iDet <<= (-shift);
  872|      0|    shift = 0;
  873|      0|  }
  874|       |
  875|  3.41k|  int64_t Px[2], Py[2];
  876|       |  // These divided by the Det, are the least squares solutions
  877|  3.41k|  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
  878|  3.41k|  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
  879|  3.41k|  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
  880|  3.41k|  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];
  881|       |
  882|  3.41k|  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
  883|  3.41k|  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
  884|  3.41k|  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
  885|  3.41k|  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);
  886|       |
  887|  3.41k|  const int isuy = (mi_row * MI_SIZE + rsuy);
  ------------------
  |  |   40|  3.41k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  3.41k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  888|  3.41k|  const int isux = (mi_col * MI_SIZE + rsux);
  ------------------
  |  |   40|  3.41k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  3.41k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  889|       |  // Note: In the vx, vy expressions below, the max value of each of the
  890|       |  // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
  891|       |  // for the first term so that the overall sum in the worst case fits
  892|       |  // within 32 bits overall.
  893|  3.41k|  const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
  ------------------
  |  |   96|  3.41k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  894|  3.41k|                     (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
  ------------------
  |  |   96|  3.41k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  895|  3.41k|                      isuy * wm->wmmat[3]);
  896|  3.41k|  const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
  ------------------
  |  |   96|  3.41k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  897|  3.41k|                     (isux * wm->wmmat[4] +
  898|  3.41k|                      isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
  ------------------
  |  |   96|  3.41k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  899|  3.41k|  wm->wmmat[0] =
  900|  3.41k|      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  ------------------
  |  |   98|  3.41k|#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.41k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
                    clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  ------------------
  |  |   98|  3.41k|#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.41k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  901|  3.41k|  wm->wmmat[1] =
  902|  3.41k|      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  ------------------
  |  |   98|  3.41k|#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.41k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
                    clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  ------------------
  |  |   98|  3.41k|#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.41k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  903|  3.41k|  return 0;
  904|  3.45k|}
warped_motion.c:resolve_divisor_64:
  172|  3.41k|static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
  173|  3.41k|  int64_t f;
  174|  3.41k|  *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
  ------------------
  |  Branch (174:22): [True: 20, False: 3.39k]
  ------------------
  175|  3.41k|                               : get_msb((unsigned int)D));
  176|       |  // e is obtained from D after resetting the most significant 1 bit.
  177|  3.41k|  const int64_t e = D - ((uint64_t)1 << *shift);
  178|       |  // Get the most significant DIV_LUT_BITS (8) bits of e into f
  179|  3.41k|  if (*shift > DIV_LUT_BITS)
  ------------------
  |  |  140|  3.41k|#define DIV_LUT_BITS 8
  ------------------
  |  Branch (179:7): [True: 3.41k, False: 0]
  ------------------
  180|  3.41k|    f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
  ------------------
  |  |   53|  3.41k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  ------------------
  181|      0|  else
  182|      0|    f = e << (DIV_LUT_BITS - *shift);
  ------------------
  |  |  140|      0|#define DIV_LUT_BITS 8
  ------------------
  183|  3.41k|  assert(f <= DIV_LUT_NUM);
  184|  3.41k|  *shift += DIV_LUT_PREC_BITS;
  ------------------
  |  |  139|  3.41k|#define DIV_LUT_PREC_BITS 14
  ------------------
  185|       |  // Use f as lookup into the precomputed table of multipliers
  186|  3.41k|  return div_lut[f];
  187|  3.41k|}
warped_motion.c:get_mult_shift_diag:
  787|  6.83k|static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
  788|  6.83k|  int64_t v = Px * (int64_t)iDet;
  789|  6.83k|  return (int32_t)clamp64(
  790|  6.83k|      ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
  ------------------
  |  |   58|  6.83k|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|     41|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 41, False: 6.79k]
  |  |  ------------------
  |  |   59|  6.83k|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|  6.79k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  791|  6.83k|      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
  ------------------
  |  |   96|  6.83k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
                    (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
  ------------------
  |  |   99|  6.83k|#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3))
  |  |  ------------------
  |  |  |  |   96|  6.83k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  792|  6.83k|      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  ------------------
  |  |   96|  6.83k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
                    (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  ------------------
  |  |   99|  6.83k|#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3))
  |  |  ------------------
  |  |  |  |   96|  6.83k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  793|  6.83k|}
warped_motion.c:get_mult_shift_ndiag:
  780|  6.83k|static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
  781|  6.83k|  int64_t v = Px * (int64_t)iDet;
  782|  6.83k|  return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
  ------------------
  |  |   58|  6.83k|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|  1.50k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 1.50k, False: 5.33k]
  |  |  ------------------
  |  |   59|  6.83k|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|  5.33k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  783|  6.83k|                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
  ------------------
  |  |   99|  6.83k|#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3))
  |  |  ------------------
  |  |  |  |   96|  6.83k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  784|  6.83k|                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  ------------------
  |  |   99|  6.83k|#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3))
  |  |  ------------------
  |  |  |  |   96|  6.83k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  785|  6.83k|}

av1_convolve_horiz_rs_sse4_1:
   28|    600|                                  int x_step_qn) {
   29|    600|  assert(UPSCALE_NORMATIVE_TAPS == 8);
   30|       |
   31|    600|  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
  ------------------
  |  |  101|    600|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
   32|       |
   33|    600|  const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
  ------------------
  |  |   21|    600|#define FILTER_BITS 7
  ------------------
   34|    600|  const __m128i zero = _mm_setzero_si128();
   35|       |
   36|    600|  const uint8_t *src_y;
   37|    600|  uint8_t *dst_y;
   38|    600|  int x_qn = x0_qn;
   39|  19.5k|  for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
  ------------------
  |  Branch (39:19): [True: 18.9k, False: 600]
  ------------------
   40|  18.9k|    const int x_filter_idx0 =
   41|  18.9k|        ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   37|  18.9k|#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   36|  18.9k|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  ------------------
                      ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   38|  18.9k|#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   36|  18.9k|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  |  |               #define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   34|  18.9k|#define RS_SUBPEL_BITS 6
  |  |  ------------------
  ------------------
   42|  18.9k|    const int x_filter_idx1 =
   43|  18.9k|        ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   37|  18.9k|#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   36|  18.9k|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  ------------------
                      ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   38|  18.9k|#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   36|  18.9k|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  |  |               #define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   34|  18.9k|#define RS_SUBPEL_BITS 6
  |  |  ------------------
  ------------------
   44|  18.9k|    const int x_filter_idx2 =
   45|  18.9k|        ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   37|  18.9k|#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   36|  18.9k|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  ------------------
                      ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   38|  18.9k|#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   36|  18.9k|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  |  |               #define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   34|  18.9k|#define RS_SUBPEL_BITS 6
  |  |  ------------------
  ------------------
   46|  18.9k|    const int x_filter_idx3 =
   47|  18.9k|        ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   37|  18.9k|#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   36|  18.9k|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  ------------------
                      ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   38|  18.9k|#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   36|  18.9k|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  |  |               #define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   34|  18.9k|#define RS_SUBPEL_BITS 6
  |  |  ------------------
  ------------------
   48|       |
   49|  18.9k|    assert(x_filter_idx0 <= RS_SUBPEL_MASK);
   50|  18.9k|    assert(x_filter_idx1 <= RS_SUBPEL_MASK);
   51|  18.9k|    assert(x_filter_idx2 <= RS_SUBPEL_MASK);
   52|  18.9k|    assert(x_filter_idx3 <= RS_SUBPEL_MASK);
   53|       |
   54|  18.9k|    const int16_t *const x_filter0 =
   55|  18.9k|        &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
  ------------------
  |  |  101|  18.9k|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
   56|  18.9k|    const int16_t *const x_filter1 =
   57|  18.9k|        &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
  ------------------
  |  |  101|  18.9k|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
   58|  18.9k|    const int16_t *const x_filter2 =
   59|  18.9k|        &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
  ------------------
  |  |  101|  18.9k|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
   60|  18.9k|    const int16_t *const x_filter3 =
   61|  18.9k|        &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
  ------------------
  |  |  101|  18.9k|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
   62|       |
   63|  18.9k|    const __m128i fil0_16 = xx_loadu_128(x_filter0);
   64|  18.9k|    const __m128i fil1_16 = xx_loadu_128(x_filter1);
   65|  18.9k|    const __m128i fil2_16 = xx_loadu_128(x_filter2);
   66|  18.9k|    const __m128i fil3_16 = xx_loadu_128(x_filter3);
   67|       |
   68|  18.9k|    src_y = src;
   69|  18.9k|    dst_y = dst;
   70|   574k|    for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
  ------------------
  |  Branch (70:21): [True: 555k, False: 18.9k]
  ------------------
   71|   555k|      const uint8_t *const src_x0 =
   72|   555k|          &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
  ------------------
  |  |   36|   555k|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
   73|   555k|      const uint8_t *const src_x1 =
   74|   555k|          &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
  ------------------
  |  |   36|   555k|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
   75|   555k|      const uint8_t *const src_x2 =
   76|   555k|          &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
  ------------------
  |  |   36|   555k|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
   77|   555k|      const uint8_t *const src_x3 =
   78|   555k|          &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
  ------------------
  |  |   36|   555k|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
   79|       |
   80|       |      // Load up the source data. This is 8-bit input data, so each load
   81|       |      // gets 8 pixels.
   82|   555k|      const __m128i src0_8 = xx_loadl_64(src_x0);
   83|   555k|      const __m128i src1_8 = xx_loadl_64(src_x1);
   84|   555k|      const __m128i src2_8 = xx_loadl_64(src_x2);
   85|   555k|      const __m128i src3_8 = xx_loadl_64(src_x3);
   86|       |
   87|       |      // Now zero-extend up to 16-bit precision, i.e.
   88|       |      // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ]
   89|   555k|      const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8);
   90|   555k|      const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8);
   91|   555k|      const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8);
   92|   555k|      const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8);
   93|       |
   94|       |      // Multiply by filter coefficients (results in a 32-bit value),
   95|       |      // and add adjacent pairs, i.e.
   96|       |      // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
   97|       |      // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
   98|   555k|      const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
   99|   555k|      const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
  100|   555k|      const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
  101|   555k|      const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
  102|       |
  103|       |      // Reduce horizontally and add, i.e.
  104|       |      // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
  105|   555k|      const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
  106|   555k|      const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
  107|       |
  108|   555k|      const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
  109|       |
  110|       |      // Divide down by (1 << FILTER_BITS), rounding to nearest.
  111|   555k|      const __m128i shifted_32 =
  112|   555k|          _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
  ------------------
  |  |   21|   555k|#define FILTER_BITS 7
  ------------------
  113|       |
  114|       |      // Pack 32-bit values into 16-bit values, i.e.
  115|       |      // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
  116|   555k|      const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
  117|       |
  118|       |      // Pack 16-bit values into 8-bit values, i.e.
  119|       |      // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ])
  120|       |      // -> [ 0 0 0 0 0 0 DC BA ]
  121|   555k|      const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero);
  122|       |
  123|       |      // Write to the output
  124|   555k|      xx_storel_32(&dst_y[x], shifted_8);
  125|   555k|    }
  126|  18.9k|  }
  127|    600|}

av1_convolve_2d_scale_sse4_1:
  238|     24|                                  ConvolveParams *conv_params) {
  239|     24|  int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  240|     24|  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
  ------------------
  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  ------------------
  241|     24|             filter_params_y->taps;
  242|       |
  243|     24|  const int xtaps = filter_params_x->taps;
  244|     24|  const int ytaps = filter_params_y->taps;
  245|     24|  const int fo_vert = ytaps / 2 - 1;
  246|     24|  assert((xtaps == 8) && (ytaps == 8));
  247|     24|  (void)xtaps;
  248|       |
  249|       |  // horizontal filter
  250|     24|  hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn,
  251|     24|           x_step_qn, filter_params_x, conv_params->round_0);
  252|       |
  253|       |  // vertical filter (input is transposed)
  254|     24|  vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn,
  255|     24|           filter_params_y, conv_params, 8);
  256|     24|}
av1_convolve_scale_sse4.c:hfilter8:
   25|     24|                     const InterpFilterParams *filter_params, int round) {
   26|     24|  const int bd = 8;
   27|     24|  const int ntaps = 8;
   28|       |
   29|     24|  src -= ntaps / 2 - 1;
   30|       |
   31|     24|  int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
  ------------------
  |  |   21|     24|#define FILTER_BITS 7
  ------------------
   32|     24|  const __m128i round_add = _mm_set1_epi32(round_add32);
   33|     24|  const __m128i round_shift = _mm_cvtsi32_si128(round);
   34|       |
   35|     24|  int x_qn = subpel_x_qn;
   36|    408|  for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
  ------------------
  |  Branch (36:19): [True: 384, False: 24]
  ------------------
   37|    384|    const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
  ------------------
  |  |   28|    384|#define SCALE_SUBPEL_BITS 10
  ------------------
   38|    384|    const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   30|    384|#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
  |  |  ------------------
  |  |  |  |   29|    384|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|    384|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
                  const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   31|    384|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|    384|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|    384|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
   39|    384|    assert(filter_idx < SUBPEL_SHIFTS);
   40|    384|    const int16_t *filter =
   41|    384|        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
   42|       |
   43|       |    // Load the filter coefficients
   44|    384|    const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
   45|    384|    const __m128i zero = _mm_castps_si128(_mm_setzero_ps());
   46|       |
   47|    384|    int y;
   48|  2.04k|    for (y = 0; y <= h - 4; y += 4) {
  ------------------
  |  Branch (48:17): [True: 1.66k, False: 384]
  ------------------
   49|  1.66k|      const uint8_t *const src0 = src_col + y * src_stride;
   50|  1.66k|      const uint8_t *const src1 = src0 + 1 * src_stride;
   51|  1.66k|      const uint8_t *const src2 = src0 + 2 * src_stride;
   52|  1.66k|      const uint8_t *const src3 = src0 + 3 * src_stride;
   53|       |
   54|       |      // Load up source data. This is 8-bit input data; each load is just
   55|       |      // loading the lower half of the register and gets 8 pixels
   56|  1.66k|      const __m128i data08 = _mm_loadl_epi64((__m128i *)src0);
   57|  1.66k|      const __m128i data18 = _mm_loadl_epi64((__m128i *)src1);
   58|  1.66k|      const __m128i data28 = _mm_loadl_epi64((__m128i *)src2);
   59|  1.66k|      const __m128i data38 = _mm_loadl_epi64((__m128i *)src3);
   60|       |
   61|       |      // Now zero-extend up to 16-bit precision by interleaving with
   62|       |      // zeros. Drop the upper half of each register (which just had zeros)
   63|  1.66k|      const __m128i data0lo = _mm_unpacklo_epi8(data08, zero);
   64|  1.66k|      const __m128i data1lo = _mm_unpacklo_epi8(data18, zero);
   65|  1.66k|      const __m128i data2lo = _mm_unpacklo_epi8(data28, zero);
   66|  1.66k|      const __m128i data3lo = _mm_unpacklo_epi8(data38, zero);
   67|       |
   68|       |      // Multiply by coefficients
   69|  1.66k|      const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
   70|  1.66k|      const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
   71|  1.66k|      const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
   72|  1.66k|      const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
   73|       |
   74|       |      // Reduce horizontally and add
   75|  1.66k|      const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
   76|  1.66k|      const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
   77|  1.66k|      const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
   78|       |
   79|       |      // Divide down by (1 << round), rounding to nearest.
   80|  1.66k|      __m128i shifted =
   81|  1.66k|          _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
   82|       |
   83|  1.66k|      shifted = _mm_packus_epi32(shifted, shifted);
   84|       |      // Write transposed to the output
   85|  1.66k|      _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
   86|  1.66k|    }
   87|  1.53k|    for (; y < h; ++y) {
  ------------------
  |  Branch (87:12): [True: 1.15k, False: 384]
  ------------------
   88|  1.15k|      const uint8_t *const src_row = src_col + y * src_stride;
   89|       |
   90|  1.15k|      int32_t sum = (1 << (bd + FILTER_BITS - 1));
  ------------------
  |  |   21|  1.15k|#define FILTER_BITS 7
  ------------------
   91|  10.3k|      for (int k = 0; k < ntaps; ++k) {
  ------------------
  |  Branch (91:23): [True: 9.21k, False: 1.15k]
  ------------------
   92|  9.21k|        sum += filter[k] * src_row[k];
   93|  9.21k|      }
   94|       |
   95|  1.15k|      dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
  ------------------
  |  |   41|  1.15k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   96|  1.15k|    }
   97|    384|  }
   98|     24|}
av1_convolve_scale_sse4.c:vfilter8:
  110|     24|                     const ConvolveParams *conv_params, int bd) {
  111|     24|  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  ------------------
  |  |   21|     24|#define FILTER_BITS 7
  ------------------
  112|     24|  const int ntaps = 8;
  113|       |
  114|     24|  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
  115|       |
  116|     24|  const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
  117|     24|                         (1 << (offset_bits - conv_params->round_1 - 1)));
  118|     24|  const __m128i sub = _mm_set1_epi16(sub32);
  119|       |
  120|     24|  CONV_BUF_TYPE *dst16 = conv_params->dst;
  121|     24|  const int dst16_stride = conv_params->dst_stride;
  122|     24|  const int bits =
  123|     24|      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|     24|#define FILTER_BITS 7
  ------------------
  124|     24|  const __m128i bits_shift = _mm_cvtsi32_si128(bits);
  125|     24|  const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1));
  126|     24|  const __m128i round_shift_add =
  127|     24|      _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
  128|     24|  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
  129|       |
  130|     24|  const int w0 = conv_params->fwd_offset;
  131|     24|  const int w1 = conv_params->bck_offset;
  132|     24|  const __m128i wt0 = _mm_set1_epi16((short)w0);
  133|     24|  const __m128i wt1 = _mm_set1_epi16((short)w1);
  134|     24|  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
  135|       |
  136|     24|  int y_qn = subpel_y_qn;
  137|    328|  for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
  ------------------
  |  Branch (137:19): [True: 304, False: 24]
  ------------------
  138|    304|    const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
  ------------------
  |  |   28|    304|#define SCALE_SUBPEL_BITS 10
  ------------------
  139|    304|    const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   30|    304|#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
  |  |  ------------------
  |  |  |  |   29|    304|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|    304|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
                  const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   31|    304|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|    304|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|    304|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  140|    304|    assert(filter_idx < SUBPEL_SHIFTS);
  141|    304|    const int16_t *filter =
  142|    304|        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
  143|       |
  144|    304|    const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
  145|    304|    int x;
  146|  1.58k|    for (x = 0; x <= w - 4; x += 4) {
  ------------------
  |  Branch (146:17): [True: 1.28k, False: 304]
  ------------------
  147|  1.28k|      const int16_t *const src0 = src_y + x * src_stride;
  148|  1.28k|      const int16_t *const src1 = src0 + 1 * src_stride;
  149|  1.28k|      const int16_t *const src2 = src0 + 2 * src_stride;
  150|  1.28k|      const int16_t *const src3 = src0 + 3 * src_stride;
  151|       |
  152|       |      // Load the source data for the three rows, adding the three registers of
  153|       |      // convolved products to one as we go (conv0..conv3) to avoid the
  154|       |      // register pressure getting too high.
  155|  1.28k|      const __m128i conv0 = convolve_16_8(src0, coeff0716);
  156|  1.28k|      const __m128i conv1 = convolve_16_8(src1, coeff0716);
  157|  1.28k|      const __m128i conv2 = convolve_16_8(src2, coeff0716);
  158|  1.28k|      const __m128i conv3 = convolve_16_8(src3, coeff0716);
  159|       |
  160|       |      // Now reduce horizontally to get one lane for each result
  161|  1.28k|      const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
  162|  1.28k|      const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
  163|  1.28k|      __m128i conv = _mm_hadd_epi32(conv01, conv23);
  164|       |
  165|  1.28k|      conv = _mm_add_epi32(conv, res_add_const);
  166|       |      // Divide down by (1 << round_1), rounding to nearest and subtract sub32.
  167|  1.28k|      __m128i shifted =
  168|  1.28k|          _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
  169|       |
  170|  1.28k|      uint8_t *dst_x = dst + y * dst_stride + x;
  171|  1.28k|      __m128i result;
  172|  1.28k|      __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
  173|       |
  174|  1.28k|      if (conv_params->is_compound) {
  ------------------
  |  Branch (174:11): [True: 0, False: 1.28k]
  ------------------
  175|      0|        CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
  176|      0|        if (conv_params->do_average) {
  ------------------
  |  Branch (176:13): [True: 0, False: 0]
  ------------------
  177|      0|          const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x);
  178|      0|          if (conv_params->use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (178:15): [True: 0, False: 0]
  ------------------
  179|      0|            const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16);
  180|      0|            const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
  181|      0|            const __m128i shifted_32 =
  182|      0|                _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
  ------------------
  |  |   76|      0|#define DIST_PRECISION_BITS 4
  ------------------
  183|      0|            shifted_16 = _mm_packus_epi32(shifted_32, shifted_32);
  184|      0|          } else {
  185|      0|            shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1);
  186|      0|          }
  187|      0|          const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
  188|      0|          result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
  189|      0|          const __m128i result_8 = _mm_packus_epi16(result, result);
  190|      0|          *(int *)dst_x = _mm_cvtsi128_si32(result_8);
  191|      0|        } else {
  192|      0|          _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
  193|      0|        }
  194|  1.28k|      } else {
  195|  1.28k|        const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
  196|  1.28k|        result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
  197|  1.28k|        const __m128i result_8 = _mm_packus_epi16(result, result);
  198|  1.28k|        *(int *)dst_x = _mm_cvtsi128_si32(result_8);
  199|  1.28k|      }
  200|  1.28k|    }
  201|    304|    for (; x < w; ++x) {
  ------------------
  |  Branch (201:12): [True: 0, False: 304]
  ------------------
  202|      0|      const int16_t *src_x = src_y + x * src_stride;
  203|      0|      int32_t sum = 1 << offset_bits;
  204|      0|      for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
  ------------------
  |  Branch (204:23): [True: 0, False: 0]
  ------------------
  205|      0|      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  206|       |
  207|      0|      if (conv_params->is_compound) {
  ------------------
  |  Branch (207:11): [True: 0, False: 0]
  ------------------
  208|      0|        if (conv_params->do_average) {
  ------------------
  |  Branch (208:13): [True: 0, False: 0]
  ------------------
  209|      0|          int32_t tmp = dst16[y * dst16_stride + x];
  210|      0|          if (conv_params->use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (210:15): [True: 0, False: 0]
  ------------------
  211|      0|            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
  212|      0|            tmp = tmp >> DIST_PRECISION_BITS;
  ------------------
  |  |   76|      0|#define DIST_PRECISION_BITS 4
  ------------------
  213|      0|          } else {
  214|      0|            tmp += res;
  215|      0|            tmp = tmp >> 1;
  216|      0|          }
  217|       |          /* Subtract round offset and convolve round */
  218|      0|          tmp = tmp - sub32;
  219|      0|          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  220|      0|        } else {
  221|      0|          dst16[y * dst16_stride + x] = res;
  222|      0|        }
  223|      0|      } else {
  224|       |        /* Subtract round offset and convolve round */
  225|      0|        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
  226|      0|                             (1 << (offset_bits - conv_params->round_1 - 1)));
  227|      0|        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  228|      0|      }
  229|      0|    }
  230|    304|  }
  231|     24|}
av1_convolve_scale_sse4.c:convolve_16_8:
  100|  5.12k|static __m128i convolve_16_8(const int16_t *src, __m128i coeff) {
  101|  5.12k|  __m128i data = _mm_loadu_si128((__m128i *)src);
  102|  5.12k|  return _mm_madd_epi16(data, coeff);
  103|  5.12k|}

av1_lowbd_inv_txfm2d_add_avx2:
 2211|   753k|                                   int eob) {
 2212|   753k|  switch (tx_size) {
 2213|   109k|    case TX_4X4:
  ------------------
  |  Branch (2213:5): [True: 109k, False: 643k]
  ------------------
 2214|   142k|    case TX_4X8:
  ------------------
  |  Branch (2214:5): [True: 32.8k, False: 720k]
  ------------------
 2215|   192k|    case TX_8X4:
  ------------------
  |  Branch (2215:5): [True: 49.8k, False: 703k]
  ------------------
 2216|   225k|    case TX_8X16:
  ------------------
  |  Branch (2216:5): [True: 33.0k, False: 720k]
  ------------------
 2217|   273k|    case TX_16X8:
  ------------------
  |  Branch (2217:5): [True: 48.6k, False: 704k]
  ------------------
 2218|   292k|    case TX_4X16:
  ------------------
  |  Branch (2218:5): [True: 18.7k, False: 734k]
  ------------------
 2219|   330k|    case TX_16X4:
  ------------------
  |  Branch (2219:5): [True: 38.3k, False: 714k]
  ------------------
 2220|   342k|    case TX_8X32:
  ------------------
  |  Branch (2220:5): [True: 11.1k, False: 742k]
  ------------------
 2221|   356k|    case TX_32X8:
  ------------------
  |  Branch (2221:5): [True: 14.7k, False: 738k]
  ------------------
 2222|   356k|      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
 2223|   356k|                                     eob);
 2224|   356k|      break;
 2225|   243k|    case TX_8X8:
  ------------------
  |  Branch (2225:5): [True: 243k, False: 509k]
  ------------------
 2226|   243k|      lowbd_inv_txfm2d_add_8x8_avx2(input, output, stride, tx_type, tx_size,
 2227|   243k|                                    eob);
 2228|   243k|      break;
 2229|  89.6k|    case TX_16X16:
  ------------------
  |  Branch (2229:5): [True: 89.6k, False: 663k]
  ------------------
 2230|   119k|    case TX_32X32:
  ------------------
  |  Branch (2230:5): [True: 29.7k, False: 723k]
  ------------------
 2231|   126k|    case TX_64X64:
  ------------------
  |  Branch (2231:5): [True: 7.46k, False: 745k]
  ------------------
 2232|   135k|    case TX_16X32:
  ------------------
  |  Branch (2232:5): [True: 8.88k, False: 744k]
  ------------------
 2233|   145k|    case TX_32X16:
  ------------------
  |  Branch (2233:5): [True: 10.2k, False: 742k]
  ------------------
 2234|   147k|    case TX_32X64:
  ------------------
  |  Branch (2234:5): [True: 1.53k, False: 751k]
  ------------------
 2235|   148k|    case TX_64X32:
  ------------------
  |  Branch (2235:5): [True: 1.08k, False: 752k]
  ------------------
 2236|   150k|    case TX_16X64:
  ------------------
  |  Branch (2236:5): [True: 2.10k, False: 751k]
  ------------------
 2237|   152k|    case TX_64X16:
  ------------------
  |  Branch (2237:5): [True: 2.14k, False: 750k]
  ------------------
 2238|   152k|    default:
  ------------------
  |  Branch (2238:5): [True: 0, False: 753k]
  ------------------
 2239|   152k|      lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type,
 2240|   152k|                                         tx_size, eob);
 2241|   152k|      break;
 2242|   753k|  }
 2243|   753k|}
av1_inv_txfm_add_avx2:
 2246|  1.04M|                           const TxfmParam *txfm_param) {
 2247|  1.04M|  const TX_TYPE tx_type = txfm_param->tx_type;
 2248|  1.04M|  if (!txfm_param->lossless) {
  ------------------
  |  Branch (2248:7): [True: 753k, False: 295k]
  ------------------
 2249|   753k|    av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type,
 2250|   753k|                                  txfm_param->tx_size, txfm_param->eob);
 2251|   753k|  } else {
 2252|   295k|    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
 2253|   295k|  }
 2254|  1.04M|}
av1_inv_txfm_avx2.c:lowbd_inv_txfm2d_add_8x8_avx2:
 2145|   243k|                                          TX_SIZE tx_size, int eob) {
 2146|   243k|  switch (tx_type) {
 2147|  10.2k|    case IDTX:
  ------------------
  |  Branch (2147:5): [True: 10.2k, False: 233k]
  ------------------
 2148|  10.2k|      av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
 2149|       |
 2150|  10.2k|      break;
 2151|  3.25k|    case V_DCT:
  ------------------
  |  Branch (2151:5): [True: 3.25k, False: 240k]
  ------------------
 2152|  3.48k|    case V_ADST:
  ------------------
  |  Branch (2152:5): [True: 222, False: 243k]
  ------------------
 2153|  3.67k|    case V_FLIPADST:
  ------------------
  |  Branch (2153:5): [True: 196, False: 243k]
  ------------------
 2154|  3.67k|      av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
 2155|  3.67k|                                                tx_size, eob);
 2156|  3.67k|      break;
 2157|  6.58k|    case H_DCT:
  ------------------
  |  Branch (2157:5): [True: 6.58k, False: 237k]
  ------------------
 2158|  7.10k|    case H_ADST:
  ------------------
  |  Branch (2158:5): [True: 518, False: 243k]
  ------------------
 2159|  7.52k|    case H_FLIPADST:
  ------------------
  |  Branch (2159:5): [True: 424, False: 243k]
  ------------------
 2160|  7.52k|      av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
 2161|  7.52k|                                                tx_size, eob);
 2162|  7.52k|      break;
 2163|   222k|    default:
  ------------------
  |  Branch (2163:5): [True: 222k, False: 21.4k]
  ------------------
 2164|   222k|      lowbd_inv_txfm2d_8x8_no_identity_avx2(input, output, stride, tx_type,
 2165|   222k|                                            tx_size, eob);
 2166|   243k|  }
 2167|   243k|}
av1_inv_txfm_avx2.c:lowbd_inv_txfm2d_8x8_no_identity_avx2:
 2112|   222k|    TX_SIZE tx_size, int eob) {
 2113|   222k|  __m128i buf1[8];
 2114|   222k|  const int input_stride = 8;
 2115|   222k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2116|   222k|  assert(hitx_1d_tab[tx_type] < 2);
 2117|   222k|  assert(vitx_1d_tab[tx_type] < 2);
 2118|   222k|  const transform_1d_ssse3 row_txfm =
 2119|   222k|      lowbd_txfm_all_1d_zeros_8x8_arr[hitx_1d_tab[tx_type]][eob != 1];
 2120|   222k|  const transform_1d_ssse3 col_txfm =
 2121|   222k|      lowbd_txfm_all_1d_zeros_8x8_arr[vitx_1d_tab[tx_type]][eob != 1];
 2122|       |
 2123|   222k|  assert(col_txfm != NULL);
 2124|   222k|  assert(row_txfm != NULL);
 2125|   222k|  int ud_flip, lr_flip;
 2126|   222k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2127|       |
 2128|   222k|  __m128i buf0[8];
 2129|   222k|  __m128i *buf0_cur = buf0;
 2130|   222k|  load_buffer_avx2(input, input_stride, buf0_cur);
 2131|   222k|  row_txfm(buf0, buf0);
 2132|       |
 2133|   222k|  assert(shift[0] < 0);
 2134|   222k|  __m128i *_buf1 = buf1;
 2135|   222k|  round_and_transpose_avx2(buf0, _buf1, shift[0], &lr_flip);
 2136|       |  assert(shift[1] < 0);
 2137|   222k|  col_txfm(buf1, buf1);
 2138|   222k|  round_shift_lowbd_write_buffer_avx2(buf1, shift[1], output, stride, ud_flip);
 2139|   222k|}
av1_inv_txfm_avx2.c:load_buffer_avx2:
 1896|   222k|                                    __m128i *out) {
 1897|   222k|  const __m256i a = _mm256_load_si256((const __m256i *)in);
 1898|   222k|  const __m256i b = _mm256_load_si256((const __m256i *)(in + stride * 1));
 1899|   222k|  const __m256i c = _mm256_load_si256((const __m256i *)(in + stride * 2));
 1900|   222k|  const __m256i d = _mm256_load_si256((const __m256i *)(in + stride * 3));
 1901|   222k|  const __m256i e = _mm256_load_si256((const __m256i *)(in + stride * 4));
 1902|   222k|  const __m256i f = _mm256_load_si256((const __m256i *)(in + stride * 5));
 1903|   222k|  const __m256i g = _mm256_load_si256((const __m256i *)(in + stride * 6));
 1904|   222k|  const __m256i h = _mm256_load_si256((const __m256i *)(in + stride * 7));
 1905|       |
 1906|       |  // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
 1907|   222k|  const __m256i ab_16bit = _mm256_packs_epi32(a, b);
 1908|       |  // c0 c1 c2 c3 d0 d1 d2 d3 c4 c5 c6 c7 d4 d5 d6 d7
 1909|   222k|  const __m256i cd_16bit = _mm256_packs_epi32(c, d);
 1910|       |  // e0 e1 e2 e3 f0 f1 f2 f3 e4 e5 e6 e7 f4 f5 f6 f7
 1911|   222k|  const __m256i ef_16bit = _mm256_packs_epi32(e, f);
 1912|       |  // g0 g1 g2 g3 h0 h1 h2 h3 g4 g5 g6 g7 h4 h5 h6 h7
 1913|   222k|  const __m256i gh_16bit = _mm256_packs_epi32(g, h);
 1914|       |
 1915|       |  // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7
 1916|   222k|  const __m256i ab = _mm256_permute4x64_epi64(ab_16bit, 0xd8);
 1917|       |  // c0 c1 c2 c3 c4 c5 c6 c7 d0 d1 d2 d3 d4 d5 d6 d7
 1918|   222k|  const __m256i cd = _mm256_permute4x64_epi64(cd_16bit, 0xd8);
 1919|       |  // e0 e1 e2 e3 e4 e5 e6 e7 f0 f1 f2 f3 f4 f5 f6 f7
 1920|   222k|  const __m256i ef = _mm256_permute4x64_epi64(ef_16bit, 0xd8);
 1921|       |  // g0 g1 g2 g3 g4 g5 g6 g7 h0 h1 h2 h3 h4 h5 h6 h7
 1922|   222k|  const __m256i gh = _mm256_permute4x64_epi64(gh_16bit, 0xd8);
 1923|       |
 1924|   222k|  out[0] = _mm256_castsi256_si128(ab);
 1925|   222k|  out[1] = _mm256_extractf128_si256(ab, 1);
 1926|   222k|  out[2] = _mm256_castsi256_si128(cd);
 1927|   222k|  out[3] = _mm256_extractf128_si256(cd, 1);
 1928|   222k|  out[4] = _mm256_castsi256_si128(ef);
 1929|   222k|  out[5] = _mm256_extractf128_si256(ef, 1);
 1930|   222k|  out[6] = _mm256_castsi256_si128(gh);
 1931|       |  out[7] = _mm256_extractf128_si256(gh, 1);
 1932|   222k|}
av1_inv_txfm_avx2.c:round_and_transpose_avx2:
 1936|   222k|                                            int *lr_flip) {
 1937|   222k|  __m256i buf_temp[4];
 1938|   222k|  const __m256i scale = _mm256_set1_epi16(1 << (15 + bit));
 1939|   222k|  int j = *lr_flip ? 7 : 0;
  ------------------
  |  Branch (1939:11): [True: 1.47k, False: 220k]
  ------------------
 1940|   222k|  const int step = *lr_flip ? -1 : 1;
  ------------------
  |  Branch (1940:20): [True: 1.47k, False: 220k]
  ------------------
 1941|       |
 1942|       |  // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37
 1943|   222k|  buf_temp[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
 1944|   222k|                                        in[j + 4 * step], 1);
 1945|   222k|  j += step;
 1946|       |  // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27
 1947|   222k|  buf_temp[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
 1948|   222k|                                        in[j + 4 * step], 1);
 1949|   222k|  j += step;
 1950|       |  // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17
 1951|   222k|  buf_temp[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
 1952|   222k|                                        in[j + 4 * step], 1);
 1953|   222k|  j += step;
 1954|       |  // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07
 1955|   222k|  buf_temp[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
 1956|   222k|                                        in[j + 4 * step], 1);
 1957|       |
 1958|       |  // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37
 1959|   222k|  buf_temp[0] = _mm256_mulhrs_epi16(buf_temp[0], scale);
 1960|       |  // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27
 1961|   222k|  buf_temp[1] = _mm256_mulhrs_epi16(buf_temp[1], scale);
 1962|       |  // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17
 1963|   222k|  buf_temp[2] = _mm256_mulhrs_epi16(buf_temp[2], scale);
 1964|       |  // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07
 1965|   222k|  buf_temp[3] = _mm256_mulhrs_epi16(buf_temp[3], scale);
 1966|       |
 1967|       |  // 70 60 71 61 72 62 73 63 | 30 20 31 21 32 22 33 23
 1968|   222k|  const __m256i unpcklo0 = _mm256_unpacklo_epi16(buf_temp[0], buf_temp[1]);
 1969|       |  // 74 64 75 65 76 66 77 67 | 34 24 35 25 36 26 37 27
 1970|   222k|  const __m256i unpckhi0 = _mm256_unpackhi_epi16(buf_temp[0], buf_temp[1]);
 1971|       |  // 50 40 51 41 52 42 53 43 | 10 00 11 01 12 02 13 03
 1972|   222k|  const __m256i unpcklo1 = _mm256_unpacklo_epi16(buf_temp[2], buf_temp[3]);
 1973|       |  // 54 44 55 45 56 46 57 47 | 14 04 15 05 16 06 17 07
 1974|   222k|  const __m256i unpckhi1 = _mm256_unpackhi_epi16(buf_temp[2], buf_temp[3]);
 1975|       |
 1976|       |  // 70 60 50 40 71 61 51 41 | 30 20 10 00 31 21 11 01
 1977|   222k|  const __m256i unpcklo00 = _mm256_unpacklo_epi32(unpcklo0, unpcklo1);
 1978|       |  // 72 62 52 42 73 63 53 43 | 32 22 12 02 33 23 13 03
 1979|   222k|  const __m256i unpckhi00 = _mm256_unpackhi_epi32(unpcklo0, unpcklo1);
 1980|       |  // 74 64 54 44 75 65 55 45 | 34 24 14 04 35 25 15 05
 1981|   222k|  const __m256i unpcklo01 = _mm256_unpacklo_epi32(unpckhi0, unpckhi1);
 1982|       |  // 76 66 56 46 77 67 57 47 | 36 26 16 06 37 27 17 07
 1983|   222k|  const __m256i unpckhi01 = _mm256_unpackhi_epi32(unpckhi0, unpckhi1);
 1984|       |
 1985|       |  // 70 60 50 40 30 20 10 00 | 71 61 51 41 31 21 11 01
 1986|   222k|  const __m256i reg_00 = _mm256_permute4x64_epi64(unpcklo00, 0xd8);
 1987|       |  // 72 62 52 42 32 22 12 02 | 73 63 53 43 33 23 13 03
 1988|   222k|  const __m256i reg_01 = _mm256_permute4x64_epi64(unpckhi00, 0xd8);
 1989|       |  // 74 64 54 44 34 24 14 04 | 75 65 55 45 35 25 15 05
 1990|   222k|  const __m256i reg_10 = _mm256_permute4x64_epi64(unpcklo01, 0xd8);
 1991|       |  // 76 66 56 46 36 26 16 06 | 77 67 57 47 37 27 17 07
 1992|   222k|  const __m256i reg_11 = _mm256_permute4x64_epi64(unpckhi01, 0xd8);
 1993|       |
 1994|       |  // 70 60 50 40 30 20 10 00
 1995|   222k|  out[0] = _mm256_castsi256_si128(reg_00);
 1996|       |  // 71 61 51 41 31 21 11 01
 1997|   222k|  out[1] = _mm256_extracti128_si256(reg_00, 1);
 1998|       |  // 72 62 52 42 32 22 12 02
 1999|   222k|  out[2] = _mm256_castsi256_si128(reg_01);
 2000|       |  // 73 63 53 43 33 23 13 03
 2001|   222k|  out[3] = _mm256_extracti128_si256(reg_01, 1);
 2002|       |  // 74 64 54 44 34 24 14 04
 2003|   222k|  out[4] = _mm256_castsi256_si128(reg_10);
 2004|       |  // 75 65 55 45 35 25 15 05
 2005|   222k|  out[5] = _mm256_extracti128_si256(reg_10, 1);
 2006|       |  // 76 66 56 46 36 26 16 06
 2007|   222k|  out[6] = _mm256_castsi256_si128(reg_11);
 2008|       |  // 77 67 57 47 37 27 17 07
 2009|       |  out[7] = _mm256_extracti128_si256(reg_11, 1);
 2010|   222k|}
av1_inv_txfm_avx2.c:round_shift_lowbd_write_buffer_avx2:
 2014|   222k|                                                       int stride, int flipud) {
 2015|   222k|  __m256i in_256[4], v_256[4];
 2016|   222k|  int j = flipud ? 7 : 0;
  ------------------
  |  Branch (2016:11): [True: 1.54k, False: 220k]
  ------------------
 2017|   222k|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (2017:20): [True: 1.54k, False: 220k]
  ------------------
 2018|   222k|  const __m256i scale = _mm256_set1_epi16(1 << (15 + bit));
 2019|   222k|  const __m256i zero = _mm256_setzero_si256();
 2020|       |  // in[0], in[1]
 2021|   222k|  in_256[0] =
 2022|   222k|      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
 2023|   222k|  j += 2 * step;
 2024|       |  // in[2], in[3]
 2025|   222k|  in_256[1] =
 2026|   222k|      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
 2027|   222k|  j += 2 * step;
 2028|       |  // in[4], in[5]
 2029|   222k|  in_256[2] =
 2030|   222k|      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
 2031|   222k|  j += 2 * step;
 2032|       |  // in[6], in[7]
 2033|   222k|  in_256[3] =
 2034|   222k|      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
 2035|       |
 2036|       |  // i00 i01 i02 i03 i04 i05 i06 i07 i10 i11 i12 i13 i14 i15 i16 i17
 2037|   222k|  in_256[0] = _mm256_mulhrs_epi16(in_256[0], scale);
 2038|       |  // i20 i21 i22 i23 i24 i25 i26 i27 i30 i31 i32 i33 i34 i35 i36 i37
 2039|   222k|  in_256[1] = _mm256_mulhrs_epi16(in_256[1], scale);
 2040|       |  // i40 i41 i42 i43 i44 i45 i46 i47 i50 i51 i52 i53 i54 i55 i56 i57
 2041|   222k|  in_256[2] = _mm256_mulhrs_epi16(in_256[2], scale);
 2042|       |  // i60 i61 i62 i63 i64 i65 i66 i67 i70 i71 i72 i73 i74 i75 i76 i77
 2043|   222k|  in_256[3] = _mm256_mulhrs_epi16(in_256[3], scale);
 2044|       |
 2045|   222k|  const __m128i v0 = _mm_loadl_epi64((__m128i const *)(output));
 2046|   222k|  const __m128i v1 = _mm_loadl_epi64((__m128i const *)(output + stride));
 2047|   222k|  const __m128i v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
 2048|   222k|  const __m128i v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
 2049|   222k|  const __m128i v4 = _mm_loadl_epi64((__m128i const *)(output + 4 * stride));
 2050|   222k|  const __m128i v5 = _mm_loadl_epi64((__m128i const *)(output + 5 * stride));
 2051|   222k|  const __m128i v6 = _mm_loadl_epi64((__m128i const *)(output + 6 * stride));
 2052|   222k|  const __m128i v7 = _mm_loadl_epi64((__m128i const *)(output + 7 * stride));
 2053|       |
 2054|   222k|  v_256[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(v0), v1, 1);
 2055|   222k|  v_256[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(v2), v3, 1);
 2056|   222k|  v_256[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(v4), v5, 1);
 2057|   222k|  v_256[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(v6), v7, 1);
 2058|       |
 2059|   222k|  const __m256i unpcklo0 = _mm256_unpacklo_epi8(v_256[0], zero);
 2060|   222k|  const __m256i unpcklo1 = _mm256_unpacklo_epi8(v_256[1], zero);
 2061|   222k|  const __m256i unpcklo2 = _mm256_unpacklo_epi8(v_256[2], zero);
 2062|   222k|  const __m256i unpcklo3 = _mm256_unpacklo_epi8(v_256[3], zero);
 2063|       |  // 00 01 10 11
 2064|   222k|  const __m256i x0 = _mm256_adds_epi16(in_256[0], unpcklo0);
 2065|       |  // 20 21 30 31
 2066|   222k|  const __m256i x1 = _mm256_adds_epi16(in_256[1], unpcklo1);
 2067|       |  // 40 41 50 51
 2068|   222k|  const __m256i x2 = _mm256_adds_epi16(in_256[2], unpcklo2);
 2069|       |  // 60 61 70 71
 2070|   222k|  const __m256i x3 = _mm256_adds_epi16(in_256[3], unpcklo3);
 2071|       |
 2072|       |  // 00 01 20 21 10 11 30 31
 2073|   222k|  const __m256i res_0123 = _mm256_packus_epi16(x0, x1);
 2074|       |  // 40 41 60 61 50 51 70 71
 2075|   222k|  const __m256i res_4567 = _mm256_packus_epi16(x2, x3);
 2076|       |
 2077|       |  // 00 01 20 21
 2078|   222k|  const __m128i res_02 = _mm256_castsi256_si128(res_0123);
 2079|       |  // 10 11 30 31
 2080|   222k|  const __m128i res_13 = _mm256_extracti128_si256(res_0123, 1);
 2081|       |  // 40 41 60 61
 2082|   222k|  const __m128i res_46 = _mm256_castsi256_si128(res_4567);
 2083|       |  // 50 51 70 71
 2084|   222k|  const __m128i res_57 = _mm256_extracti128_si256(res_4567, 1);
 2085|       |
 2086|       |  // 00 01
 2087|   222k|  _mm_storel_epi64((__m128i *)(output), res_02);
 2088|       |  // 10 11
 2089|   222k|  _mm_storel_epi64((__m128i *)(output + stride), res_13);
 2090|       |  // 20 21
 2091|   222k|  _mm_storel_epi64((__m128i *)(output + 2 * stride),
 2092|   222k|                   _mm_unpackhi_epi64(res_02, res_02));
 2093|       |  // 30 31
 2094|   222k|  _mm_storel_epi64((__m128i *)(output + 3 * stride),
 2095|   222k|                   _mm_unpackhi_epi64(res_13, res_13));
 2096|       |  // 40 41
 2097|   222k|  _mm_storel_epi64((__m128i *)(output + 4 * stride), res_46);
 2098|       |  // 50 51
 2099|   222k|  _mm_storel_epi64((__m128i *)(output + 5 * stride), res_57);
 2100|       |  // 60 61
 2101|   222k|  _mm_storel_epi64((__m128i *)(output + 6 * stride),
 2102|   222k|                   _mm_unpackhi_epi64(res_46, res_46));
 2103|       |  // 70 71
 2104|   222k|  _mm_storel_epi64((__m128i *)(output + 7 * stride),
 2105|   222k|                   _mm_unpackhi_epi64(res_57, res_57));
 2106|   222k|}
av1_inv_txfm_avx2.c:lowbd_inv_txfm2d_add_universe_avx2:
 2172|   152k|    TX_SIZE tx_size, int eob) {
 2173|   152k|  (void)eob;
 2174|   152k|  switch (tx_type) {
 2175|   100k|    case DCT_DCT:
  ------------------
  |  Branch (2175:5): [True: 100k, False: 52.1k]
  ------------------
 2176|   114k|    case ADST_DCT:   // ADST in vertical, DCT in horizontal
  ------------------
  |  Branch (2176:5): [True: 13.4k, False: 139k]
  ------------------
 2177|   133k|    case DCT_ADST:   // DCT  in vertical, ADST in horizontal
  ------------------
  |  Branch (2177:5): [True: 19.0k, False: 133k]
  ------------------
 2178|   149k|    case ADST_ADST:  // ADST in both directions
  ------------------
  |  Branch (2178:5): [True: 15.9k, False: 136k]
  ------------------
 2179|   149k|    case FLIPADST_DCT:
  ------------------
  |  Branch (2179:5): [True: 220, False: 152k]
  ------------------
 2180|   149k|    case DCT_FLIPADST:
  ------------------
  |  Branch (2180:5): [True: 268, False: 152k]
  ------------------
 2181|   149k|    case FLIPADST_FLIPADST:
  ------------------
  |  Branch (2181:5): [True: 244, False: 152k]
  ------------------
 2182|   150k|    case ADST_FLIPADST:
  ------------------
  |  Branch (2182:5): [True: 208, False: 152k]
  ------------------
 2183|   150k|    case FLIPADST_ADST:
  ------------------
  |  Branch (2183:5): [True: 226, False: 152k]
  ------------------
 2184|   150k|      lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type,
 2185|   150k|                                            tx_size, eob);
 2186|   150k|      break;
 2187|  1.95k|    case IDTX:
  ------------------
  |  Branch (2187:5): [True: 1.95k, False: 150k]
  ------------------
 2188|  1.95k|      lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob);
 2189|  1.95k|      break;
 2190|    194|    case V_DCT:
  ------------------
  |  Branch (2190:5): [True: 194, False: 152k]
  ------------------
 2191|    194|    case V_ADST:
  ------------------
  |  Branch (2191:5): [True: 0, False: 152k]
  ------------------
 2192|    194|    case V_FLIPADST:
  ------------------
  |  Branch (2192:5): [True: 0, False: 152k]
  ------------------
 2193|    194|      lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type,
 2194|    194|                                           tx_size, eob);
 2195|    194|      break;
 2196|    308|    case H_DCT:
  ------------------
  |  Branch (2196:5): [True: 308, False: 152k]
  ------------------
 2197|    308|    case H_ADST:
  ------------------
  |  Branch (2197:5): [True: 0, False: 152k]
  ------------------
 2198|    308|    case H_FLIPADST:
  ------------------
  |  Branch (2198:5): [True: 0, False: 152k]
  ------------------
 2199|    308|      lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type,
 2200|    308|                                           tx_size, eob);
 2201|    308|      break;
 2202|      0|    default:
  ------------------
  |  Branch (2202:5): [True: 0, False: 152k]
  ------------------
 2203|      0|      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
 2204|      0|                                     eob);
 2205|      0|      break;
 2206|   152k|  }
 2207|   152k|}
av1_inv_txfm_avx2.c:lowbd_inv_txfm2d_add_no_identity_avx2:
 1634|   150k|    TX_SIZE tx_size, int eob) {
 1635|   150k|  __m256i buf1[64 * 16];
 1636|   150k|  int eobx, eoby;
 1637|   150k|  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
 1638|   150k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 1639|   150k|  const int txw_idx = get_txw_idx(tx_size);
 1640|   150k|  const int txh_idx = get_txh_idx(tx_size);
 1641|   150k|  const int txfm_size_col = tx_size_wide[tx_size];
 1642|   150k|  const int txfm_size_row = tx_size_high[tx_size];
 1643|   150k|  const int buf_size_w_div16 = txfm_size_col >> 4;
 1644|   150k|  const int buf_size_nonzero_w = ((eobx + 16) >> 4) << 4;
 1645|   150k|  const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
 1646|   150k|  const int input_stride = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|   150k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 11.1k, False: 139k]
  |  |  ------------------
  ------------------
 1647|   150k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 1648|       |
 1649|   150k|  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
 1650|   150k|  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
 1651|   150k|  const transform_1d_avx2 row_txfm =
 1652|   150k|      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
 1653|   150k|  const transform_1d_avx2 col_txfm =
 1654|   150k|      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
 1655|       |
 1656|   150k|  assert(col_txfm != NULL);
 1657|   150k|  assert(row_txfm != NULL);
 1658|   150k|  int ud_flip, lr_flip;
 1659|   150k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 1660|   150k|  const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0]));
 1661|   307k|  for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
  ------------------
  |  Branch (1661:19): [True: 157k, False: 150k]
  ------------------
 1662|   157k|    __m256i buf0[64];
 1663|   157k|    load_buffer_32bit_to_16bit_w16_avx2(input + 16 * i, input_stride, buf0,
 1664|   157k|                                        buf_size_nonzero_w);
 1665|   157k|    if (rect_type == 1 || rect_type == -1) {
  ------------------
  |  Branch (1665:9): [True: 11.4k, False: 146k]
  |  Branch (1665:27): [True: 11.7k, False: 134k]
  ------------------
 1666|  23.2k|      round_shift_avx2(buf0, buf0, buf_size_nonzero_w);  // rect special code
 1667|  23.2k|    }
 1668|   157k|    row_txfm(buf0, buf0);
 1669|  3.97M|    for (int j = 0; j < txfm_size_col; ++j) {
  ------------------
  |  Branch (1669:21): [True: 3.81M, False: 157k]
  ------------------
 1670|  3.81M|      buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0);
 1671|  3.81M|    }
 1672|       |
 1673|   157k|    __m256i *buf1_cur = buf1 + (i << 4);
 1674|   157k|    if (lr_flip) {
  ------------------
  |  Branch (1674:9): [True: 720, False: 156k]
  ------------------
 1675|  1.44k|      for (int j = 0; j < buf_size_w_div16; ++j) {
  ------------------
  |  Branch (1675:23): [True: 720, False: 720]
  ------------------
 1676|    720|        __m256i temp[16];
 1677|    720|        flip_buf_avx2(buf0 + 16 * j, temp, 16);
 1678|    720|        int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
 1679|    720|        transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
 1680|    720|      }
 1681|   156k|    } else {
 1682|   394k|      for (int j = 0; j < buf_size_w_div16; ++j) {
  ------------------
  |  Branch (1682:23): [True: 237k, False: 156k]
  ------------------
 1683|   237k|        transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j);
 1684|   237k|      }
 1685|   156k|    }
 1686|   157k|  }
 1687|   150k|  const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1]));
 1688|   374k|  for (int i = 0; i < buf_size_w_div16; i++) {
  ------------------
  |  Branch (1688:19): [True: 223k, False: 150k]
  ------------------
 1689|   223k|    __m256i *buf1_cur = buf1 + i * txfm_size_row;
 1690|   223k|    col_txfm(buf1_cur, buf1_cur);
 1691|  6.64M|    for (int j = 0; j < txfm_size_row; ++j) {
  ------------------
  |  Branch (1691:21): [True: 6.42M, False: 223k]
  ------------------
 1692|  6.42M|      buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1);
 1693|  6.42M|    }
 1694|   223k|  }
 1695|   374k|  for (int i = 0; i < buf_size_w_div16; i++) {
  ------------------
  |  Branch (1695:19): [True: 223k, False: 150k]
  ------------------
 1696|   223k|    lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
 1697|   223k|                                 stride, ud_flip, txfm_size_row);
 1698|   223k|  }
 1699|   150k|}
av1_inv_txfm_avx2.c:idct16_low1_avx2:
  192|  23.7k|static void idct16_low1_avx2(const __m256i *input, __m256i *output) {
  193|  23.7k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  23.7k|#define INV_COS_BIT 12
  ------------------
  194|       |
  195|       |  // stage 1
  196|  23.7k|  __m256i x1[2];
  197|  23.7k|  x1[0] = input[0];
  198|       |
  199|       |  // stage 2
  200|       |  // stage 3
  201|       |  // stage 4
  202|  23.7k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
  ------------------
  |  |   30|  23.7k|  do {                                             \
  |  |   31|  23.7k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  23.7k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  23.7k|    const __m256i _in = in;                        \
  |  |   34|  23.7k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  23.7k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  23.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 23.7k]
  |  |  ------------------
  ------------------
  203|       |
  204|       |  // stage 5
  205|       |  // stage 6
  206|  23.7k|  output[0] = x1[0];
  207|  23.7k|  output[1] = x1[1];
  208|  23.7k|  output[2] = x1[1];
  209|  23.7k|  output[3] = x1[0];
  210|  23.7k|  output[4] = x1[0];
  211|  23.7k|  output[5] = x1[1];
  212|  23.7k|  output[6] = x1[1];
  213|  23.7k|  output[7] = x1[0];
  214|  23.7k|  output[8] = x1[0];
  215|  23.7k|  output[9] = x1[1];
  216|  23.7k|  output[10] = x1[1];
  217|  23.7k|  output[11] = x1[0];
  218|  23.7k|  output[12] = x1[0];
  219|  23.7k|  output[13] = x1[1];
  220|  23.7k|  output[14] = x1[1];
  221|  23.7k|  output[15] = x1[0];
  222|  23.7k|}
av1_inv_txfm_avx2.c:idct16_low8_avx2:
  144|  74.2k|static void idct16_low8_avx2(const __m256i *input, __m256i *output) {
  145|  74.2k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  74.2k|#define INV_COS_BIT 12
  ------------------
  146|  74.2k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  74.2k|#define INV_COS_BIT 12
  ------------------
  147|       |
  148|  74.2k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  149|  74.2k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  150|  74.2k|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  151|       |
  152|       |  // stage 1
  153|  74.2k|  __m256i x1[16];
  154|  74.2k|  x1[0] = input[0];
  155|  74.2k|  x1[2] = input[4];
  156|  74.2k|  x1[4] = input[2];
  157|  74.2k|  x1[6] = input[6];
  158|  74.2k|  x1[8] = input[1];
  159|  74.2k|  x1[10] = input[5];
  160|  74.2k|  x1[12] = input[3];
  161|  74.2k|  x1[14] = input[7];
  162|       |
  163|       |  // stage 2
  164|  74.2k|  btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]);
  ------------------
  |  |   30|  74.2k|  do {                                             \
  |  |   31|  74.2k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  74.2k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  74.2k|    const __m256i _in = in;                        \
  |  |   34|  74.2k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  74.2k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  74.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 74.2k]
  |  |  ------------------
  ------------------
  165|  74.2k|  btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]);
  ------------------
  |  |   30|  74.2k|  do {                                             \
  |  |   31|  74.2k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  74.2k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  74.2k|    const __m256i _in = in;                        \
  |  |   34|  74.2k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  74.2k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  74.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 74.2k]
  |  |  ------------------
  ------------------
  166|  74.2k|  btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]);
  ------------------
  |  |   30|  74.2k|  do {                                             \
  |  |   31|  74.2k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  74.2k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  74.2k|    const __m256i _in = in;                        \
  |  |   34|  74.2k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  74.2k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  74.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 74.2k]
  |  |  ------------------
  ------------------
  167|  74.2k|  btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]);
  ------------------
  |  |   30|  74.2k|  do {                                             \
  |  |   31|  74.2k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  74.2k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  74.2k|    const __m256i _in = in;                        \
  |  |   34|  74.2k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  74.2k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  74.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 74.2k]
  |  |  ------------------
  ------------------
  168|       |
  169|       |  // stage 3
  170|  74.2k|  btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
  ------------------
  |  |   30|  74.2k|  do {                                             \
  |  |   31|  74.2k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  74.2k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  74.2k|    const __m256i _in = in;                        \
  |  |   34|  74.2k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  74.2k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  74.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 74.2k]
  |  |  ------------------
  ------------------
  171|  74.2k|  btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
  ------------------
  |  |   30|  74.2k|  do {                                             \
  |  |   31|  74.2k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  74.2k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  74.2k|    const __m256i _in = in;                        \
  |  |   34|  74.2k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  74.2k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  74.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 74.2k]
  |  |  ------------------
  ------------------
  172|  74.2k|  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  173|  74.2k|  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  174|  74.2k|  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  175|  74.2k|  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
  176|       |
  177|       |  // stage 4
  178|  74.2k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
  ------------------
  |  |   30|  74.2k|  do {                                             \
  |  |   31|  74.2k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  74.2k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  74.2k|    const __m256i _in = in;                        \
  |  |   34|  74.2k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  74.2k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  74.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 74.2k]
  |  |  ------------------
  ------------------
  179|  74.2k|  btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
  ------------------
  |  |   30|  74.2k|  do {                                             \
  |  |   31|  74.2k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  74.2k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  74.2k|    const __m256i _in = in;                        \
  |  |   34|  74.2k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  74.2k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  74.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 74.2k]
  |  |  ------------------
  ------------------
  180|  74.2k|  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  181|  74.2k|  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  182|  74.2k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
  183|  74.2k|                  INV_COS_BIT);
  ------------------
  |  |   43|  74.2k|#define INV_COS_BIT 12
  ------------------
  184|  74.2k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
  185|  74.2k|                  INV_COS_BIT);
  ------------------
  |  |   43|  74.2k|#define INV_COS_BIT 12
  ------------------
  186|       |
  187|  74.2k|  idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  74.2k|#define INV_COS_BIT 12
  ------------------
  188|  74.2k|  idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  74.2k|#define INV_COS_BIT 12
  ------------------
  189|  74.2k|  idct16_stage7_avx2(output, x1);
  190|  74.2k|}
av1_inv_txfm_avx2.c:idct16_stage5_avx2:
   28|   126k|                                      const __m256i _r, int8_t cos_bit) {
   29|   126k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
   30|   126k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
   31|   126k|  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
   32|   126k|  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
   33|   126k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
   34|       |
   35|   126k|  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
   36|   126k|  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
   37|   126k|  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
   38|   126k|  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
   39|   126k|}
av1_inv_txfm_avx2.c:idct16_stage6_avx2:
   42|   126k|                                      const __m256i _r, int8_t cos_bit) {
   43|   126k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
   44|   126k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
   45|   126k|  btf_16_adds_subs_avx2(&x[0], &x[7]);
   46|   126k|  btf_16_adds_subs_avx2(&x[1], &x[6]);
   47|   126k|  btf_16_adds_subs_avx2(&x[2], &x[5]);
   48|   126k|  btf_16_adds_subs_avx2(&x[3], &x[4]);
   49|   126k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
   50|   126k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
   51|   126k|}
av1_inv_txfm_avx2.c:idct16_stage7_avx2:
   53|   126k|static inline void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
   54|   126k|  btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]);
   55|   126k|  btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]);
   56|   126k|  btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]);
   57|   126k|  btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]);
   58|   126k|  btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]);
   59|   126k|  btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]);
   60|   126k|  btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]);
   61|   126k|  btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
   62|   126k|}
av1_inv_txfm_avx2.c:idct16_avx2:
   64|  51.8k|static void idct16_avx2(const __m256i *input, __m256i *output) {
   65|  51.8k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  51.8k|#define INV_COS_BIT 12
  ------------------
   66|  51.8k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  51.8k|#define INV_COS_BIT 12
  ------------------
   67|       |
   68|  51.8k|  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
   69|  51.8k|  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
   70|  51.8k|  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
   71|  51.8k|  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
   72|  51.8k|  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
   73|  51.8k|  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
   74|  51.8k|  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
   75|  51.8k|  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
   76|  51.8k|  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
   77|  51.8k|  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
   78|  51.8k|  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
   79|  51.8k|  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
   80|  51.8k|  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
   81|  51.8k|  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
   82|  51.8k|  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
   83|  51.8k|  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
   84|  51.8k|  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
   85|  51.8k|  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
   86|  51.8k|  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
   87|       |
   88|       |  // stage 1
   89|  51.8k|  __m256i x1[16];
   90|  51.8k|  x1[0] = input[0];
   91|  51.8k|  x1[1] = input[8];
   92|  51.8k|  x1[2] = input[4];
   93|  51.8k|  x1[3] = input[12];
   94|  51.8k|  x1[4] = input[2];
   95|  51.8k|  x1[5] = input[10];
   96|  51.8k|  x1[6] = input[6];
   97|  51.8k|  x1[7] = input[14];
   98|  51.8k|  x1[8] = input[1];
   99|  51.8k|  x1[9] = input[9];
  100|  51.8k|  x1[10] = input[5];
  101|  51.8k|  x1[11] = input[13];
  102|  51.8k|  x1[12] = input[3];
  103|  51.8k|  x1[13] = input[11];
  104|  51.8k|  x1[14] = input[7];
  105|  51.8k|  x1[15] = input[15];
  106|       |
  107|       |  // stage 2
  108|  51.8k|  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
  109|  51.8k|                  INV_COS_BIT);
  ------------------
  |  |   43|  51.8k|#define INV_COS_BIT 12
  ------------------
  110|  51.8k|  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
  111|  51.8k|                  INV_COS_BIT);
  ------------------
  |  |   43|  51.8k|#define INV_COS_BIT 12
  ------------------
  112|  51.8k|  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
  113|  51.8k|                  INV_COS_BIT);
  ------------------
  |  |   43|  51.8k|#define INV_COS_BIT 12
  ------------------
  114|  51.8k|  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
  115|  51.8k|                  INV_COS_BIT);
  ------------------
  |  |   43|  51.8k|#define INV_COS_BIT 12
  ------------------
  116|       |
  117|       |  // stage 3
  118|  51.8k|  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
  119|  51.8k|                  INV_COS_BIT);
  ------------------
  |  |   43|  51.8k|#define INV_COS_BIT 12
  ------------------
  120|  51.8k|  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
  121|  51.8k|                  INV_COS_BIT);
  ------------------
  |  |   43|  51.8k|#define INV_COS_BIT 12
  ------------------
  122|  51.8k|  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  123|  51.8k|  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  124|  51.8k|  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  125|  51.8k|  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
  126|       |
  127|       |  // stage 4
  128|  51.8k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
  129|  51.8k|                  INV_COS_BIT);
  ------------------
  |  |   43|  51.8k|#define INV_COS_BIT 12
  ------------------
  130|  51.8k|  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
  131|  51.8k|                  INV_COS_BIT);
  ------------------
  |  |   43|  51.8k|#define INV_COS_BIT 12
  ------------------
  132|  51.8k|  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  133|  51.8k|  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  134|  51.8k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
  135|  51.8k|                  INV_COS_BIT);
  ------------------
  |  |   43|  51.8k|#define INV_COS_BIT 12
  ------------------
  136|  51.8k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
  137|  51.8k|                  INV_COS_BIT);
  ------------------
  |  |   43|  51.8k|#define INV_COS_BIT 12
  ------------------
  138|       |
  139|  51.8k|  idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  51.8k|#define INV_COS_BIT 12
  ------------------
  140|  51.8k|  idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  51.8k|#define INV_COS_BIT 12
  ------------------
  141|  51.8k|  idct16_stage7_avx2(output, x1);
  142|  51.8k|}
av1_inv_txfm_avx2.c:iadst16_low1_avx2:
  414|  8.03k|static void iadst16_low1_avx2(const __m256i *input, __m256i *output) {
  415|  8.03k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  8.03k|#define INV_COS_BIT 12
  ------------------
  416|  8.03k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  8.03k|#define INV_COS_BIT 12
  ------------------
  417|       |
  418|  8.03k|  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  419|  8.03k|  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  420|  8.03k|  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  421|  8.03k|  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  422|       |
  423|       |  // stage 1
  424|  8.03k|  __m256i x1[16];
  425|  8.03k|  x1[1] = input[0];
  426|       |
  427|       |  // stage 2
  428|  8.03k|  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
  ------------------
  |  |   30|  8.03k|  do {                                             \
  |  |   31|  8.03k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  8.03k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  8.03k|    const __m256i _in = in;                        \
  |  |   34|  8.03k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  8.03k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  8.03k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 8.03k]
  |  |  ------------------
  ------------------
  429|       |
  430|       |  // stage 3
  431|  8.03k|  x1[8] = x1[0];
  432|  8.03k|  x1[9] = x1[1];
  433|       |
  434|       |  // stage 4
  435|  8.03k|  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r,
  436|  8.03k|                  INV_COS_BIT);
  ------------------
  |  |   43|  8.03k|#define INV_COS_BIT 12
  ------------------
  437|       |
  438|       |  // stage 5
  439|  8.03k|  x1[4] = x1[0];
  440|  8.03k|  x1[5] = x1[1];
  441|       |
  442|  8.03k|  x1[12] = x1[8];
  443|  8.03k|  x1[13] = x1[9];
  444|       |
  445|       |  // stage 6
  446|  8.03k|  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r,
  447|  8.03k|                  INV_COS_BIT);
  ------------------
  |  |   43|  8.03k|#define INV_COS_BIT 12
  ------------------
  448|  8.03k|  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r,
  449|  8.03k|                  INV_COS_BIT);
  ------------------
  |  |   43|  8.03k|#define INV_COS_BIT 12
  ------------------
  450|       |
  451|       |  // stage 7
  452|  8.03k|  x1[2] = x1[0];
  453|  8.03k|  x1[3] = x1[1];
  454|  8.03k|  x1[6] = x1[4];
  455|  8.03k|  x1[7] = x1[5];
  456|  8.03k|  x1[10] = x1[8];
  457|  8.03k|  x1[11] = x1[9];
  458|  8.03k|  x1[14] = x1[12];
  459|  8.03k|  x1[15] = x1[13];
  460|       |
  461|  8.03k|  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  8.03k|#define INV_COS_BIT 12
  ------------------
  462|  8.03k|  iadst16_stage9_avx2(output, x1);
  463|  8.03k|}
av1_inv_txfm_avx2.c:iadst16_stage8_avx2:
  283|  66.3k|                                       const __m256i _r, int8_t cos_bit) {
  284|  66.3k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  285|  66.3k|  const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  286|  66.3k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
  287|  66.3k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
  288|  66.3k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
  289|  66.3k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
  290|  66.3k|}
av1_inv_txfm_avx2.c:iadst16_stage9_avx2:
  292|  66.3k|static inline void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
  293|  66.3k|  const __m256i __zero = _mm256_setzero_si256();
  294|  66.3k|  output[0] = x1[0];
  295|  66.3k|  output[1] = _mm256_subs_epi16(__zero, x1[8]);
  296|  66.3k|  output[2] = x1[12];
  297|  66.3k|  output[3] = _mm256_subs_epi16(__zero, x1[4]);
  298|  66.3k|  output[4] = x1[6];
  299|  66.3k|  output[5] = _mm256_subs_epi16(__zero, x1[14]);
  300|  66.3k|  output[6] = x1[10];
  301|  66.3k|  output[7] = _mm256_subs_epi16(__zero, x1[2]);
  302|  66.3k|  output[8] = x1[3];
  303|  66.3k|  output[9] = _mm256_subs_epi16(__zero, x1[11]);
  304|  66.3k|  output[10] = x1[15];
  305|  66.3k|  output[11] = _mm256_subs_epi16(__zero, x1[7]);
  306|  66.3k|  output[12] = x1[5];
  307|  66.3k|  output[13] = _mm256_subs_epi16(__zero, x1[13]);
  308|  66.3k|  output[14] = x1[9];
  309|  66.3k|  output[15] = _mm256_subs_epi16(__zero, x1[1]);
  310|  66.3k|}
av1_inv_txfm_avx2.c:iadst16_low8_avx2:
  380|  31.6k|static void iadst16_low8_avx2(const __m256i *input, __m256i *output) {
  381|  31.6k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  31.6k|#define INV_COS_BIT 12
  ------------------
  382|  31.6k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  31.6k|#define INV_COS_BIT 12
  ------------------
  383|       |
  384|       |  // stage 1
  385|  31.6k|  __m256i x1[16];
  386|  31.6k|  x1[1] = input[0];
  387|  31.6k|  x1[3] = input[2];
  388|  31.6k|  x1[5] = input[4];
  389|  31.6k|  x1[7] = input[6];
  390|  31.6k|  x1[8] = input[7];
  391|  31.6k|  x1[10] = input[5];
  392|  31.6k|  x1[12] = input[3];
  393|  31.6k|  x1[14] = input[1];
  394|       |
  395|       |  // stage 2
  396|  31.6k|  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
  ------------------
  |  |   30|  31.6k|  do {                                             \
  |  |   31|  31.6k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  31.6k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  31.6k|    const __m256i _in = in;                        \
  |  |   34|  31.6k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  31.6k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  31.6k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 31.6k]
  |  |  ------------------
  ------------------
  397|  31.6k|  btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
  ------------------
  |  |   30|  31.6k|  do {                                             \
  |  |   31|  31.6k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  31.6k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  31.6k|    const __m256i _in = in;                        \
  |  |   34|  31.6k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  31.6k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  31.6k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 31.6k]
  |  |  ------------------
  ------------------
  398|  31.6k|  btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
  ------------------
  |  |   30|  31.6k|  do {                                             \
  |  |   31|  31.6k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  31.6k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  31.6k|    const __m256i _in = in;                        \
  |  |   34|  31.6k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  31.6k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  31.6k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 31.6k]
  |  |  ------------------
  ------------------
  399|  31.6k|  btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
  ------------------
  |  |   30|  31.6k|  do {                                             \
  |  |   31|  31.6k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  31.6k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  31.6k|    const __m256i _in = in;                        \
  |  |   34|  31.6k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  31.6k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  31.6k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 31.6k]
  |  |  ------------------
  ------------------
  400|  31.6k|  btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
  ------------------
  |  |   30|  31.6k|  do {                                             \
  |  |   31|  31.6k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  31.6k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  31.6k|    const __m256i _in = in;                        \
  |  |   34|  31.6k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  31.6k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  31.6k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 31.6k]
  |  |  ------------------
  ------------------
  401|  31.6k|  btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
  ------------------
  |  |   30|  31.6k|  do {                                             \
  |  |   31|  31.6k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  31.6k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  31.6k|    const __m256i _in = in;                        \
  |  |   34|  31.6k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  31.6k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  31.6k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 31.6k]
  |  |  ------------------
  ------------------
  402|  31.6k|  btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
  ------------------
  |  |   30|  31.6k|  do {                                             \
  |  |   31|  31.6k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  31.6k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  31.6k|    const __m256i _in = in;                        \
  |  |   34|  31.6k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  31.6k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  31.6k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 31.6k]
  |  |  ------------------
  ------------------
  403|  31.6k|  btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]);
  ------------------
  |  |   30|  31.6k|  do {                                             \
  |  |   31|  31.6k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  31.6k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  31.6k|    const __m256i _in = in;                        \
  |  |   34|  31.6k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  31.6k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  31.6k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 31.6k]
  |  |  ------------------
  ------------------
  404|       |
  405|  31.6k|  iadst16_stage3_avx2(x1);
  406|  31.6k|  iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  31.6k|#define INV_COS_BIT 12
  ------------------
  407|  31.6k|  iadst16_stage5_avx2(x1);
  408|  31.6k|  iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  31.6k|#define INV_COS_BIT 12
  ------------------
  409|  31.6k|  iadst16_stage7_avx2(x1);
  410|  31.6k|  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  31.6k|#define INV_COS_BIT 12
  ------------------
  411|  31.6k|  iadst16_stage9_avx2(output, x1);
  412|  31.6k|}
av1_inv_txfm_avx2.c:iadst16_stage3_avx2:
  224|  58.3k|static inline void iadst16_stage3_avx2(__m256i *x) {
  225|  58.3k|  btf_16_adds_subs_avx2(&x[0], &x[8]);
  226|  58.3k|  btf_16_adds_subs_avx2(&x[1], &x[9]);
  227|  58.3k|  btf_16_adds_subs_avx2(&x[2], &x[10]);
  228|  58.3k|  btf_16_adds_subs_avx2(&x[3], &x[11]);
  229|  58.3k|  btf_16_adds_subs_avx2(&x[4], &x[12]);
  230|  58.3k|  btf_16_adds_subs_avx2(&x[5], &x[13]);
  231|  58.3k|  btf_16_adds_subs_avx2(&x[6], &x[14]);
  232|  58.3k|  btf_16_adds_subs_avx2(&x[7], &x[15]);
  233|  58.3k|}
av1_inv_txfm_avx2.c:iadst16_stage4_avx2:
  236|  58.3k|                                       const __m256i _r, int8_t cos_bit) {
  237|  58.3k|  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  238|  58.3k|  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  239|  58.3k|  const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  240|  58.3k|  const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  241|  58.3k|  const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
  242|  58.3k|  const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
  243|  58.3k|  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit);
  244|  58.3k|  btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit);
  245|  58.3k|  btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit);
  246|  58.3k|  btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit);
  247|  58.3k|}
av1_inv_txfm_avx2.c:iadst16_stage5_avx2:
  249|  58.3k|static inline void iadst16_stage5_avx2(__m256i *x) {
  250|  58.3k|  btf_16_adds_subs_avx2(&x[0], &x[4]);
  251|  58.3k|  btf_16_adds_subs_avx2(&x[1], &x[5]);
  252|  58.3k|  btf_16_adds_subs_avx2(&x[2], &x[6]);
  253|  58.3k|  btf_16_adds_subs_avx2(&x[3], &x[7]);
  254|  58.3k|  btf_16_adds_subs_avx2(&x[8], &x[12]);
  255|  58.3k|  btf_16_adds_subs_avx2(&x[9], &x[13]);
  256|  58.3k|  btf_16_adds_subs_avx2(&x[10], &x[14]);
  257|  58.3k|  btf_16_adds_subs_avx2(&x[11], &x[15]);
  258|  58.3k|}
av1_inv_txfm_avx2.c:iadst16_stage6_avx2:
  261|  58.3k|                                       const __m256i _r, int8_t cos_bit) {
  262|  58.3k|  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  263|  58.3k|  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  264|  58.3k|  const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
  265|  58.3k|  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit);
  266|  58.3k|  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit);
  267|  58.3k|  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit);
  268|  58.3k|  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit);
  269|  58.3k|}
av1_inv_txfm_avx2.c:iadst16_stage7_avx2:
  271|  58.3k|static inline void iadst16_stage7_avx2(__m256i *x) {
  272|  58.3k|  btf_16_adds_subs_avx2(&x[0], &x[2]);
  273|  58.3k|  btf_16_adds_subs_avx2(&x[1], &x[3]);
  274|  58.3k|  btf_16_adds_subs_avx2(&x[4], &x[6]);
  275|  58.3k|  btf_16_adds_subs_avx2(&x[5], &x[7]);
  276|  58.3k|  btf_16_adds_subs_avx2(&x[8], &x[10]);
  277|  58.3k|  btf_16_adds_subs_avx2(&x[9], &x[11]);
  278|  58.3k|  btf_16_adds_subs_avx2(&x[12], &x[14]);
  279|  58.3k|  btf_16_adds_subs_avx2(&x[13], &x[15]);
  280|  58.3k|}
av1_inv_txfm_avx2.c:iadst16_avx2:
  312|  26.6k|static void iadst16_avx2(const __m256i *input, __m256i *output) {
  313|  26.6k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  26.6k|#define INV_COS_BIT 12
  ------------------
  314|       |
  315|  26.6k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  26.6k|#define INV_COS_BIT 12
  ------------------
  316|       |
  317|  26.6k|  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
  318|  26.6k|  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
  319|  26.6k|  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
  320|  26.6k|  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
  321|  26.6k|  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
  322|  26.6k|  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
  323|  26.6k|  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
  324|  26.6k|  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
  325|  26.6k|  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
  326|  26.6k|  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
  327|  26.6k|  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
  328|  26.6k|  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
  329|  26.6k|  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
  330|  26.6k|  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
  331|  26.6k|  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
  332|  26.6k|  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
  333|       |
  334|       |  // stage 1
  335|  26.6k|  __m256i x1[16];
  336|  26.6k|  x1[0] = input[15];
  337|  26.6k|  x1[1] = input[0];
  338|  26.6k|  x1[2] = input[13];
  339|  26.6k|  x1[3] = input[2];
  340|  26.6k|  x1[4] = input[11];
  341|  26.6k|  x1[5] = input[4];
  342|  26.6k|  x1[6] = input[9];
  343|  26.6k|  x1[7] = input[6];
  344|  26.6k|  x1[8] = input[7];
  345|  26.6k|  x1[9] = input[8];
  346|  26.6k|  x1[10] = input[5];
  347|  26.6k|  x1[11] = input[10];
  348|  26.6k|  x1[12] = input[3];
  349|  26.6k|  x1[13] = input[12];
  350|  26.6k|  x1[14] = input[1];
  351|  26.6k|  x1[15] = input[14];
  352|       |
  353|       |  // stage 2
  354|  26.6k|  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r,
  355|  26.6k|                  INV_COS_BIT);
  ------------------
  |  |   43|  26.6k|#define INV_COS_BIT 12
  ------------------
  356|  26.6k|  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r,
  357|  26.6k|                  INV_COS_BIT);
  ------------------
  |  |   43|  26.6k|#define INV_COS_BIT 12
  ------------------
  358|  26.6k|  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r,
  359|  26.6k|                  INV_COS_BIT);
  ------------------
  |  |   43|  26.6k|#define INV_COS_BIT 12
  ------------------
  360|  26.6k|  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r,
  361|  26.6k|                  INV_COS_BIT);
  ------------------
  |  |   43|  26.6k|#define INV_COS_BIT 12
  ------------------
  362|  26.6k|  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r,
  363|  26.6k|                  INV_COS_BIT);
  ------------------
  |  |   43|  26.6k|#define INV_COS_BIT 12
  ------------------
  364|  26.6k|  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r,
  365|  26.6k|                  INV_COS_BIT);
  ------------------
  |  |   43|  26.6k|#define INV_COS_BIT 12
  ------------------
  366|  26.6k|  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r,
  367|  26.6k|                  INV_COS_BIT);
  ------------------
  |  |   43|  26.6k|#define INV_COS_BIT 12
  ------------------
  368|  26.6k|  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r,
  369|  26.6k|                  INV_COS_BIT);
  ------------------
  |  |   43|  26.6k|#define INV_COS_BIT 12
  ------------------
  370|       |
  371|  26.6k|  iadst16_stage3_avx2(x1);
  372|  26.6k|  iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  26.6k|#define INV_COS_BIT 12
  ------------------
  373|  26.6k|  iadst16_stage5_avx2(x1);
  374|  26.6k|  iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  26.6k|#define INV_COS_BIT 12
  ------------------
  375|  26.6k|  iadst16_stage7_avx2(x1);
  376|  26.6k|  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  26.6k|#define INV_COS_BIT 12
  ------------------
  377|  26.6k|  iadst16_stage9_avx2(output, x1);
  378|  26.6k|}
av1_inv_txfm_avx2.c:idct32_low1_avx2:
  582|  36.2k|static void idct32_low1_avx2(const __m256i *input, __m256i *output) {
  583|  36.2k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  36.2k|#define INV_COS_BIT 12
  ------------------
  584|       |
  585|       |  // stage 1
  586|  36.2k|  __m256i x[2];
  587|  36.2k|  x[0] = input[0];
  588|       |
  589|       |  // stage 2
  590|       |  // stage 3
  591|       |  // stage 4
  592|       |  // stage 5
  593|  36.2k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   30|  36.2k|  do {                                             \
  |  |   31|  36.2k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  36.2k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  36.2k|    const __m256i _in = in;                        \
  |  |   34|  36.2k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  36.2k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  36.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 36.2k]
  |  |  ------------------
  ------------------
  594|       |
  595|       |  // stage 6
  596|       |  // stage 7
  597|       |  // stage 8
  598|       |  // stage 9
  599|  36.2k|  output[0] = x[0];
  600|  36.2k|  output[31] = x[0];
  601|  36.2k|  output[1] = x[1];
  602|  36.2k|  output[30] = x[1];
  603|  36.2k|  output[2] = x[1];
  604|  36.2k|  output[29] = x[1];
  605|  36.2k|  output[3] = x[0];
  606|  36.2k|  output[28] = x[0];
  607|  36.2k|  output[4] = x[0];
  608|  36.2k|  output[27] = x[0];
  609|  36.2k|  output[5] = x[1];
  610|  36.2k|  output[26] = x[1];
  611|  36.2k|  output[6] = x[1];
  612|  36.2k|  output[25] = x[1];
  613|  36.2k|  output[7] = x[0];
  614|  36.2k|  output[24] = x[0];
  615|  36.2k|  output[8] = x[0];
  616|  36.2k|  output[23] = x[0];
  617|  36.2k|  output[9] = x[1];
  618|  36.2k|  output[22] = x[1];
  619|  36.2k|  output[10] = x[1];
  620|  36.2k|  output[21] = x[1];
  621|  36.2k|  output[11] = x[0];
  622|  36.2k|  output[20] = x[0];
  623|  36.2k|  output[12] = x[0];
  624|  36.2k|  output[19] = x[0];
  625|  36.2k|  output[13] = x[1];
  626|  36.2k|  output[18] = x[1];
  627|  36.2k|  output[14] = x[1];
  628|  36.2k|  output[17] = x[1];
  629|  36.2k|  output[15] = x[0];
  630|  36.2k|  output[16] = x[0];
  631|  36.2k|}
av1_inv_txfm_avx2.c:idct32_low8_avx2:
  633|  42.3k|static void idct32_low8_avx2(const __m256i *input, __m256i *output) {
  634|  42.3k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  42.3k|#define INV_COS_BIT 12
  ------------------
  635|  42.3k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  42.3k|#define INV_COS_BIT 12
  ------------------
  636|       |
  637|       |  // stage 1
  638|  42.3k|  __m256i x[32];
  639|  42.3k|  x[0] = input[0];
  640|  42.3k|  x[4] = input[4];
  641|  42.3k|  x[8] = input[2];
  642|  42.3k|  x[12] = input[6];
  643|  42.3k|  x[16] = input[1];
  644|  42.3k|  x[20] = input[5];
  645|  42.3k|  x[24] = input[3];
  646|  42.3k|  x[28] = input[7];
  647|       |
  648|       |  // stage 2
  649|  42.3k|  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  ------------------
  |  |   30|  42.3k|  do {                                             \
  |  |   31|  42.3k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  42.3k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  42.3k|    const __m256i _in = in;                        \
  |  |   34|  42.3k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  42.3k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  42.3k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 42.3k]
  |  |  ------------------
  ------------------
  650|  42.3k|  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  ------------------
  |  |   30|  42.3k|  do {                                             \
  |  |   31|  42.3k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  42.3k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  42.3k|    const __m256i _in = in;                        \
  |  |   34|  42.3k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  42.3k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  42.3k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 42.3k]
  |  |  ------------------
  ------------------
  651|  42.3k|  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  ------------------
  |  |   30|  42.3k|  do {                                             \
  |  |   31|  42.3k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  42.3k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  42.3k|    const __m256i _in = in;                        \
  |  |   34|  42.3k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  42.3k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  42.3k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 42.3k]
  |  |  ------------------
  ------------------
  652|  42.3k|  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  ------------------
  |  |   30|  42.3k|  do {                                             \
  |  |   31|  42.3k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  42.3k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  42.3k|    const __m256i _in = in;                        \
  |  |   34|  42.3k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  42.3k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  42.3k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 42.3k]
  |  |  ------------------
  ------------------
  653|       |
  654|       |  // stage 3
  655|  42.3k|  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  ------------------
  |  |   30|  42.3k|  do {                                             \
  |  |   31|  42.3k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  42.3k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  42.3k|    const __m256i _in = in;                        \
  |  |   34|  42.3k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  42.3k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  42.3k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 42.3k]
  |  |  ------------------
  ------------------
  656|  42.3k|  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  ------------------
  |  |   30|  42.3k|  do {                                             \
  |  |   31|  42.3k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  42.3k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  42.3k|    const __m256i _in = in;                        \
  |  |   34|  42.3k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  42.3k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  42.3k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 42.3k]
  |  |  ------------------
  ------------------
  657|  42.3k|  x[17] = x[16];
  658|  42.3k|  x[18] = x[19];
  659|  42.3k|  x[21] = x[20];
  660|  42.3k|  x[22] = x[23];
  661|  42.3k|  x[25] = x[24];
  662|  42.3k|  x[26] = x[27];
  663|  42.3k|  x[29] = x[28];
  664|  42.3k|  x[30] = x[31];
  665|       |
  666|       |  // stage 4
  667|  42.3k|  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  ------------------
  |  |   30|  42.3k|  do {                                             \
  |  |   31|  42.3k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  42.3k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  42.3k|    const __m256i _in = in;                        \
  |  |   34|  42.3k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  42.3k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  42.3k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 42.3k]
  |  |  ------------------
  ------------------
  668|  42.3k|  x[9] = x[8];
  669|  42.3k|  x[10] = x[11];
  670|  42.3k|  x[13] = x[12];
  671|  42.3k|  x[14] = x[15];
  672|  42.3k|  idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  42.3k|#define INV_COS_BIT 12
  ------------------
  673|       |
  674|       |  // stage 5
  675|  42.3k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   30|  42.3k|  do {                                             \
  |  |   31|  42.3k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  42.3k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  42.3k|    const __m256i _in = in;                        \
  |  |   34|  42.3k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  42.3k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  42.3k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 42.3k]
  |  |  ------------------
  ------------------
  676|  42.3k|  x[5] = x[4];
  677|  42.3k|  x[6] = x[7];
  678|  42.3k|  idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  42.3k|#define INV_COS_BIT 12
  ------------------
  679|       |  // stage 6
  680|  42.3k|  x[3] = x[0];
  681|  42.3k|  x[2] = x[1];
  682|  42.3k|  idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  42.3k|#define INV_COS_BIT 12
  ------------------
  683|       |
  684|  42.3k|  idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  42.3k|#define INV_COS_BIT 12
  ------------------
  685|  42.3k|  idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  42.3k|#define INV_COS_BIT 12
  ------------------
  686|  42.3k|  idct32_stage9_avx2(output, x);
  687|  42.3k|}
av1_inv_txfm_avx2.c:idct32_high16_stage4_avx2:
  477|  82.8k|                                             const __m256i _r, int8_t cos_bit) {
  478|  82.8k|  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  479|  82.8k|  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  480|  82.8k|  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  481|  82.8k|  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  482|  82.8k|  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  483|  82.8k|  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  484|  82.8k|  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
  485|  82.8k|  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
  486|  82.8k|  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
  487|  82.8k|  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
  488|  82.8k|}
av1_inv_txfm_avx2.c:idct32_high24_stage5_avx2:
  491|  82.8k|                                             const __m256i _r, int8_t cos_bit) {
  492|  82.8k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  493|  82.8k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  494|  82.8k|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  495|  82.8k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
  496|  82.8k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
  497|  82.8k|  btf_16_adds_subs_avx2(&x[16], &x[19]);
  498|  82.8k|  btf_16_adds_subs_avx2(&x[17], &x[18]);
  499|  82.8k|  btf_16_adds_subs_avx2(&x[23], &x[20]);
  500|  82.8k|  btf_16_adds_subs_avx2(&x[22], &x[21]);
  501|  82.8k|  btf_16_adds_subs_avx2(&x[24], &x[27]);
  502|  82.8k|  btf_16_adds_subs_avx2(&x[25], &x[26]);
  503|  82.8k|  btf_16_adds_subs_avx2(&x[31], &x[28]);
  504|  82.8k|  btf_16_adds_subs_avx2(&x[30], &x[29]);
  505|  82.8k|}
av1_inv_txfm_avx2.c:idct32_high28_stage6_avx2:
  508|  82.8k|                                             const __m256i _r, int8_t cos_bit) {
  509|  82.8k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  510|  82.8k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  511|  82.8k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  512|  82.8k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  513|  82.8k|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  514|  82.8k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
  515|  82.8k|  btf_16_adds_subs_avx2(&x[8], &x[11]);
  516|  82.8k|  btf_16_adds_subs_avx2(&x[9], &x[10]);
  517|  82.8k|  btf_16_adds_subs_avx2(&x[15], &x[12]);
  518|  82.8k|  btf_16_adds_subs_avx2(&x[14], &x[13]);
  519|  82.8k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
  520|  82.8k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
  521|  82.8k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
  522|  82.8k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
  523|  82.8k|}
av1_inv_txfm_avx2.c:idct32_stage7_avx2:
  526|  82.8k|                                      const __m256i _r, int8_t cos_bit) {
  527|  82.8k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  528|  82.8k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  529|  82.8k|  btf_16_adds_subs_avx2(&x[0], &x[7]);
  530|  82.8k|  btf_16_adds_subs_avx2(&x[1], &x[6]);
  531|  82.8k|  btf_16_adds_subs_avx2(&x[2], &x[5]);
  532|  82.8k|  btf_16_adds_subs_avx2(&x[3], &x[4]);
  533|  82.8k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  534|  82.8k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
  535|  82.8k|  btf_16_adds_subs_avx2(&x[16], &x[23]);
  536|  82.8k|  btf_16_adds_subs_avx2(&x[17], &x[22]);
  537|  82.8k|  btf_16_adds_subs_avx2(&x[18], &x[21]);
  538|  82.8k|  btf_16_adds_subs_avx2(&x[19], &x[20]);
  539|  82.8k|  btf_16_adds_subs_avx2(&x[31], &x[24]);
  540|  82.8k|  btf_16_adds_subs_avx2(&x[30], &x[25]);
  541|  82.8k|  btf_16_adds_subs_avx2(&x[29], &x[26]);
  542|  82.8k|  btf_16_adds_subs_avx2(&x[28], &x[27]);
  543|  82.8k|}
av1_inv_txfm_avx2.c:idct32_stage8_avx2:
  546|  82.8k|                                      const __m256i _r, int8_t cos_bit) {
  547|  82.8k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  548|  82.8k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  549|  82.8k|  btf_16_adds_subs_avx2(&x[0], &x[15]);
  550|  82.8k|  btf_16_adds_subs_avx2(&x[1], &x[14]);
  551|  82.8k|  btf_16_adds_subs_avx2(&x[2], &x[13]);
  552|  82.8k|  btf_16_adds_subs_avx2(&x[3], &x[12]);
  553|  82.8k|  btf_16_adds_subs_avx2(&x[4], &x[11]);
  554|  82.8k|  btf_16_adds_subs_avx2(&x[5], &x[10]);
  555|  82.8k|  btf_16_adds_subs_avx2(&x[6], &x[9]);
  556|  82.8k|  btf_16_adds_subs_avx2(&x[7], &x[8]);
  557|  82.8k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
  558|  82.8k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
  559|  82.8k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
  560|  82.8k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
  561|  82.8k|}
av1_inv_txfm_avx2.c:idct32_stage9_avx2:
  563|  82.8k|static inline void idct32_stage9_avx2(__m256i *output, __m256i *x) {
  564|  82.8k|  btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]);
  565|  82.8k|  btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]);
  566|  82.8k|  btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]);
  567|  82.8k|  btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]);
  568|  82.8k|  btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]);
  569|  82.8k|  btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]);
  570|  82.8k|  btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]);
  571|  82.8k|  btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]);
  572|  82.8k|  btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]);
  573|  82.8k|  btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]);
  574|  82.8k|  btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]);
  575|  82.8k|  btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]);
  576|  82.8k|  btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]);
  577|  82.8k|  btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]);
  578|  82.8k|  btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]);
  579|  82.8k|  btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
  580|  82.8k|}
av1_inv_txfm_avx2.c:idct32_low16_avx2:
  689|  17.9k|static void idct32_low16_avx2(const __m256i *input, __m256i *output) {
  690|  17.9k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  17.9k|#define INV_COS_BIT 12
  ------------------
  691|  17.9k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  17.9k|#define INV_COS_BIT 12
  ------------------
  692|       |
  693|       |  // stage 1
  694|  17.9k|  __m256i x[32];
  695|  17.9k|  x[0] = input[0];
  696|  17.9k|  x[2] = input[8];
  697|  17.9k|  x[4] = input[4];
  698|  17.9k|  x[6] = input[12];
  699|  17.9k|  x[8] = input[2];
  700|  17.9k|  x[10] = input[10];
  701|  17.9k|  x[12] = input[6];
  702|  17.9k|  x[14] = input[14];
  703|  17.9k|  x[16] = input[1];
  704|  17.9k|  x[18] = input[9];
  705|  17.9k|  x[20] = input[5];
  706|  17.9k|  x[22] = input[13];
  707|  17.9k|  x[24] = input[3];
  708|  17.9k|  x[26] = input[11];
  709|  17.9k|  x[28] = input[7];
  710|  17.9k|  x[30] = input[15];
  711|       |
  712|       |  // stage 2
  713|  17.9k|  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  ------------------
  |  |   30|  17.9k|  do {                                             \
  |  |   31|  17.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  17.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  17.9k|    const __m256i _in = in;                        \
  |  |   34|  17.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  17.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  17.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 17.9k]
  |  |  ------------------
  ------------------
  714|  17.9k|  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  ------------------
  |  |   30|  17.9k|  do {                                             \
  |  |   31|  17.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  17.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  17.9k|    const __m256i _in = in;                        \
  |  |   34|  17.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  17.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  17.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 17.9k]
  |  |  ------------------
  ------------------
  715|  17.9k|  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  ------------------
  |  |   30|  17.9k|  do {                                             \
  |  |   31|  17.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  17.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  17.9k|    const __m256i _in = in;                        \
  |  |   34|  17.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  17.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  17.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 17.9k]
  |  |  ------------------
  ------------------
  716|  17.9k|  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  ------------------
  |  |   30|  17.9k|  do {                                             \
  |  |   31|  17.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  17.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  17.9k|    const __m256i _in = in;                        \
  |  |   34|  17.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  17.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  17.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 17.9k]
  |  |  ------------------
  ------------------
  717|  17.9k|  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  ------------------
  |  |   30|  17.9k|  do {                                             \
  |  |   31|  17.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  17.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  17.9k|    const __m256i _in = in;                        \
  |  |   34|  17.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  17.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  17.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 17.9k]
  |  |  ------------------
  ------------------
  718|  17.9k|  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  ------------------
  |  |   30|  17.9k|  do {                                             \
  |  |   31|  17.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  17.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  17.9k|    const __m256i _in = in;                        \
  |  |   34|  17.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  17.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  17.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 17.9k]
  |  |  ------------------
  ------------------
  719|  17.9k|  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  ------------------
  |  |   30|  17.9k|  do {                                             \
  |  |   31|  17.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  17.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  17.9k|    const __m256i _in = in;                        \
  |  |   34|  17.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  17.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  17.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 17.9k]
  |  |  ------------------
  ------------------
  720|  17.9k|  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  ------------------
  |  |   30|  17.9k|  do {                                             \
  |  |   31|  17.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  17.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  17.9k|    const __m256i _in = in;                        \
  |  |   34|  17.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  17.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  17.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 17.9k]
  |  |  ------------------
  ------------------
  721|       |
  722|       |  // stage 3
  723|  17.9k|  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  ------------------
  |  |   30|  17.9k|  do {                                             \
  |  |   31|  17.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  17.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  17.9k|    const __m256i _in = in;                        \
  |  |   34|  17.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  17.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  17.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 17.9k]
  |  |  ------------------
  ------------------
  724|  17.9k|  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  ------------------
  |  |   30|  17.9k|  do {                                             \
  |  |   31|  17.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  17.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  17.9k|    const __m256i _in = in;                        \
  |  |   34|  17.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  17.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  17.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 17.9k]
  |  |  ------------------
  ------------------
  725|  17.9k|  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
  ------------------
  |  |   30|  17.9k|  do {                                             \
  |  |   31|  17.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  17.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  17.9k|    const __m256i _in = in;                        \
  |  |   34|  17.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  17.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  17.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 17.9k]
  |  |  ------------------
  ------------------
  726|  17.9k|  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  ------------------
  |  |   30|  17.9k|  do {                                             \
  |  |   31|  17.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  17.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  17.9k|    const __m256i _in = in;                        \
  |  |   34|  17.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  17.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  17.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 17.9k]
  |  |  ------------------
  ------------------
  727|  17.9k|  idct32_high16_stage3_avx2(x);
  728|       |
  729|       |  // stage 4
  730|  17.9k|  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  ------------------
  |  |   30|  17.9k|  do {                                             \
  |  |   31|  17.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  17.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  17.9k|    const __m256i _in = in;                        \
  |  |   34|  17.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  17.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  17.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 17.9k]
  |  |  ------------------
  ------------------
  731|  17.9k|  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
  ------------------
  |  |   30|  17.9k|  do {                                             \
  |  |   31|  17.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  17.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  17.9k|    const __m256i _in = in;                        \
  |  |   34|  17.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  17.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  17.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 17.9k]
  |  |  ------------------
  ------------------
  732|  17.9k|  btf_16_adds_subs_avx2(&x[8], &x[9]);
  733|  17.9k|  btf_16_adds_subs_avx2(&x[11], &x[10]);
  734|  17.9k|  btf_16_adds_subs_avx2(&x[12], &x[13]);
  735|  17.9k|  btf_16_adds_subs_avx2(&x[15], &x[14]);
  736|  17.9k|  idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  17.9k|#define INV_COS_BIT 12
  ------------------
  737|       |
  738|       |  // stage 5
  739|  17.9k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   30|  17.9k|  do {                                             \
  |  |   31|  17.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  17.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  17.9k|    const __m256i _in = in;                        \
  |  |   34|  17.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  17.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  17.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 17.9k]
  |  |  ------------------
  ------------------
  740|  17.9k|  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
  ------------------
  |  |   30|  17.9k|  do {                                             \
  |  |   31|  17.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  17.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  17.9k|    const __m256i _in = in;                        \
  |  |   34|  17.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  17.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  17.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 17.9k]
  |  |  ------------------
  ------------------
  741|  17.9k|  btf_16_adds_subs_avx2(&x[4], &x[5]);
  742|  17.9k|  btf_16_adds_subs_avx2(&x[7], &x[6]);
  743|  17.9k|  idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  17.9k|#define INV_COS_BIT 12
  ------------------
  744|       |
  745|  17.9k|  btf_16_adds_subs_avx2(&x[0], &x[3]);
  746|  17.9k|  btf_16_adds_subs_avx2(&x[1], &x[2]);
  747|  17.9k|  idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  17.9k|#define INV_COS_BIT 12
  ------------------
  748|       |
  749|  17.9k|  idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  17.9k|#define INV_COS_BIT 12
  ------------------
  750|  17.9k|  idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  17.9k|#define INV_COS_BIT 12
  ------------------
  751|  17.9k|  idct32_stage9_avx2(output, x);
  752|  17.9k|}
av1_inv_txfm_avx2.c:idct32_high16_stage3_avx2:
  465|  40.4k|static inline void idct32_high16_stage3_avx2(__m256i *x) {
  466|  40.4k|  btf_16_adds_subs_avx2(&x[16], &x[17]);
  467|  40.4k|  btf_16_adds_subs_avx2(&x[19], &x[18]);
  468|  40.4k|  btf_16_adds_subs_avx2(&x[20], &x[21]);
  469|  40.4k|  btf_16_adds_subs_avx2(&x[23], &x[22]);
  470|  40.4k|  btf_16_adds_subs_avx2(&x[24], &x[25]);
  471|  40.4k|  btf_16_adds_subs_avx2(&x[27], &x[26]);
  472|  40.4k|  btf_16_adds_subs_avx2(&x[28], &x[29]);
  473|  40.4k|  btf_16_adds_subs_avx2(&x[31], &x[30]);
  474|  40.4k|}
av1_inv_txfm_avx2.c:idct32_avx2:
  754|  22.5k|static void idct32_avx2(const __m256i *input, __m256i *output) {
  755|  22.5k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  756|  22.5k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  757|       |
  758|  22.5k|  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
  759|  22.5k|  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
  760|  22.5k|  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
  761|  22.5k|  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
  762|  22.5k|  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
  763|  22.5k|  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
  764|  22.5k|  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
  765|  22.5k|  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
  766|  22.5k|  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
  767|  22.5k|  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
  768|  22.5k|  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
  769|  22.5k|  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
  770|  22.5k|  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
  771|  22.5k|  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
  772|  22.5k|  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
  773|  22.5k|  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
  774|  22.5k|  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
  775|  22.5k|  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
  776|  22.5k|  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
  777|  22.5k|  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
  778|  22.5k|  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
  779|  22.5k|  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
  780|  22.5k|  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
  781|  22.5k|  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
  782|  22.5k|  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  783|  22.5k|  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  784|  22.5k|  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  785|  22.5k|  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  786|  22.5k|  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  787|  22.5k|  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  788|  22.5k|  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  789|  22.5k|  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  790|       |
  791|       |  // stage 1
  792|  22.5k|  __m256i x1[32];
  793|  22.5k|  x1[0] = input[0];
  794|  22.5k|  x1[1] = input[16];
  795|  22.5k|  x1[2] = input[8];
  796|  22.5k|  x1[3] = input[24];
  797|  22.5k|  x1[4] = input[4];
  798|  22.5k|  x1[5] = input[20];
  799|  22.5k|  x1[6] = input[12];
  800|  22.5k|  x1[7] = input[28];
  801|  22.5k|  x1[8] = input[2];
  802|  22.5k|  x1[9] = input[18];
  803|  22.5k|  x1[10] = input[10];
  804|  22.5k|  x1[11] = input[26];
  805|  22.5k|  x1[12] = input[6];
  806|  22.5k|  x1[13] = input[22];
  807|  22.5k|  x1[14] = input[14];
  808|  22.5k|  x1[15] = input[30];
  809|  22.5k|  x1[16] = input[1];
  810|  22.5k|  x1[17] = input[17];
  811|  22.5k|  x1[18] = input[9];
  812|  22.5k|  x1[19] = input[25];
  813|  22.5k|  x1[20] = input[5];
  814|  22.5k|  x1[21] = input[21];
  815|  22.5k|  x1[22] = input[13];
  816|  22.5k|  x1[23] = input[29];
  817|  22.5k|  x1[24] = input[3];
  818|  22.5k|  x1[25] = input[19];
  819|  22.5k|  x1[26] = input[11];
  820|  22.5k|  x1[27] = input[27];
  821|  22.5k|  x1[28] = input[7];
  822|  22.5k|  x1[29] = input[23];
  823|  22.5k|  x1[30] = input[15];
  824|  22.5k|  x1[31] = input[31];
  825|       |
  826|       |  // stage 2
  827|  22.5k|  btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r,
  828|  22.5k|                  INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  829|  22.5k|  btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r,
  830|  22.5k|                  INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  831|  22.5k|  btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r,
  832|  22.5k|                  INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  833|  22.5k|  btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r,
  834|  22.5k|                  INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  835|  22.5k|  btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r,
  836|  22.5k|                  INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  837|  22.5k|  btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r,
  838|  22.5k|                  INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  839|  22.5k|  btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r,
  840|  22.5k|                  INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  841|  22.5k|  btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r,
  842|  22.5k|                  INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  843|       |
  844|       |  // stage 3
  845|  22.5k|  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
  846|  22.5k|                  INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  847|  22.5k|  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
  848|  22.5k|                  INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  849|  22.5k|  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
  850|  22.5k|                  INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  851|  22.5k|  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
  852|  22.5k|                  INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  853|  22.5k|  idct32_high16_stage3_avx2(x1);
  854|       |
  855|       |  // stage 4
  856|  22.5k|  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
  857|  22.5k|                  INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  858|  22.5k|  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
  859|  22.5k|                  INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  860|  22.5k|  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  861|  22.5k|  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  862|  22.5k|  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  863|  22.5k|  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
  864|  22.5k|  idct32_high16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  865|       |
  866|       |  // stage 5
  867|  22.5k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
  868|  22.5k|                  INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  869|  22.5k|  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
  870|  22.5k|                  INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  871|  22.5k|  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  872|  22.5k|  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  873|  22.5k|  idct32_high24_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  874|       |
  875|       |  // stage 6
  876|  22.5k|  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  877|  22.5k|  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  878|  22.5k|  idct32_high28_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  879|       |
  880|  22.5k|  idct32_stage7_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  881|  22.5k|  idct32_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  22.5k|#define INV_COS_BIT 12
  ------------------
  882|  22.5k|  idct32_stage9_avx2(output, x1);
  883|  22.5k|}
av1_inv_txfm_avx2.c:idct64_low1_avx2:
 1126|  23.9k|static void idct64_low1_avx2(const __m256i *input, __m256i *output) {
 1127|  23.9k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  23.9k|#define INV_COS_BIT 12
  ------------------
 1128|       |
 1129|       |  // stage 1
 1130|  23.9k|  __m256i x[32];
 1131|  23.9k|  x[0] = input[0];
 1132|       |
 1133|       |  // stage 2
 1134|       |  // stage 3
 1135|       |  // stage 4
 1136|       |  // stage 5
 1137|       |  // stage 6
 1138|  23.9k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   30|  23.9k|  do {                                             \
  |  |   31|  23.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  23.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  23.9k|    const __m256i _in = in;                        \
  |  |   34|  23.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  23.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  23.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 23.9k]
  |  |  ------------------
  ------------------
 1139|       |
 1140|       |  // stage 7
 1141|       |  // stage 8
 1142|       |  // stage 9
 1143|       |  // stage 10
 1144|       |  // stage 11
 1145|  23.9k|  output[0] = x[0];
 1146|  23.9k|  output[63] = x[0];
 1147|  23.9k|  output[1] = x[1];
 1148|  23.9k|  output[62] = x[1];
 1149|  23.9k|  output[2] = x[1];
 1150|  23.9k|  output[61] = x[1];
 1151|  23.9k|  output[3] = x[0];
 1152|  23.9k|  output[60] = x[0];
 1153|  23.9k|  output[4] = x[0];
 1154|  23.9k|  output[59] = x[0];
 1155|  23.9k|  output[5] = x[1];
 1156|  23.9k|  output[58] = x[1];
 1157|  23.9k|  output[6] = x[1];
 1158|  23.9k|  output[57] = x[1];
 1159|  23.9k|  output[7] = x[0];
 1160|  23.9k|  output[56] = x[0];
 1161|  23.9k|  output[8] = x[0];
 1162|  23.9k|  output[55] = x[0];
 1163|  23.9k|  output[9] = x[1];
 1164|  23.9k|  output[54] = x[1];
 1165|  23.9k|  output[10] = x[1];
 1166|  23.9k|  output[53] = x[1];
 1167|  23.9k|  output[11] = x[0];
 1168|  23.9k|  output[52] = x[0];
 1169|  23.9k|  output[12] = x[0];
 1170|  23.9k|  output[51] = x[0];
 1171|  23.9k|  output[13] = x[1];
 1172|  23.9k|  output[50] = x[1];
 1173|  23.9k|  output[14] = x[1];
 1174|  23.9k|  output[49] = x[1];
 1175|  23.9k|  output[15] = x[0];
 1176|  23.9k|  output[48] = x[0];
 1177|  23.9k|  output[16] = x[0];
 1178|  23.9k|  output[47] = x[0];
 1179|  23.9k|  output[17] = x[1];
 1180|  23.9k|  output[46] = x[1];
 1181|  23.9k|  output[18] = x[1];
 1182|  23.9k|  output[45] = x[1];
 1183|  23.9k|  output[19] = x[0];
 1184|  23.9k|  output[44] = x[0];
 1185|  23.9k|  output[20] = x[0];
 1186|  23.9k|  output[43] = x[0];
 1187|  23.9k|  output[21] = x[1];
 1188|  23.9k|  output[42] = x[1];
 1189|  23.9k|  output[22] = x[1];
 1190|  23.9k|  output[41] = x[1];
 1191|  23.9k|  output[23] = x[0];
 1192|  23.9k|  output[40] = x[0];
 1193|  23.9k|  output[24] = x[0];
 1194|  23.9k|  output[39] = x[0];
 1195|  23.9k|  output[25] = x[1];
 1196|  23.9k|  output[38] = x[1];
 1197|  23.9k|  output[26] = x[1];
 1198|  23.9k|  output[37] = x[1];
 1199|  23.9k|  output[27] = x[0];
 1200|  23.9k|  output[36] = x[0];
 1201|  23.9k|  output[28] = x[0];
 1202|  23.9k|  output[35] = x[0];
 1203|  23.9k|  output[29] = x[1];
 1204|  23.9k|  output[34] = x[1];
 1205|  23.9k|  output[30] = x[1];
 1206|  23.9k|  output[33] = x[1];
 1207|  23.9k|  output[31] = x[0];
 1208|  23.9k|  output[32] = x[0];
 1209|  23.9k|}
av1_inv_txfm_avx2.c:idct64_low8_avx2:
 1211|  10.9k|static void idct64_low8_avx2(const __m256i *input, __m256i *output) {
 1212|  10.9k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  10.9k|#define INV_COS_BIT 12
  ------------------
 1213|  10.9k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  10.9k|#define INV_COS_BIT 12
  ------------------
 1214|  10.9k|  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
 1215|  10.9k|  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
 1216|  10.9k|  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
 1217|  10.9k|  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
 1218|  10.9k|  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
 1219|  10.9k|  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
 1220|  10.9k|  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
 1221|  10.9k|  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
 1222|  10.9k|  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
 1223|  10.9k|  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
 1224|  10.9k|  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
 1225|  10.9k|  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
 1226|  10.9k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 1227|  10.9k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
 1228|  10.9k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
 1229|  10.9k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
 1230|       |
 1231|       |  // stage 1
 1232|  10.9k|  __m256i x[64];
 1233|  10.9k|  x[0] = input[0];
 1234|  10.9k|  x[8] = input[4];
 1235|  10.9k|  x[16] = input[2];
 1236|  10.9k|  x[24] = input[6];
 1237|  10.9k|  x[32] = input[1];
 1238|  10.9k|  x[40] = input[5];
 1239|  10.9k|  x[48] = input[3];
 1240|  10.9k|  x[56] = input[7];
 1241|       |
 1242|       |  // stage 2
 1243|  10.9k|  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  ------------------
  |  |   30|  10.9k|  do {                                             \
  |  |   31|  10.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  10.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  10.9k|    const __m256i _in = in;                        \
  |  |   34|  10.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  10.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  10.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 10.9k]
  |  |  ------------------
  ------------------
 1244|  10.9k|  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  ------------------
  |  |   30|  10.9k|  do {                                             \
  |  |   31|  10.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  10.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  10.9k|    const __m256i _in = in;                        \
  |  |   34|  10.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  10.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  10.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 10.9k]
  |  |  ------------------
  ------------------
 1245|  10.9k|  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  ------------------
  |  |   30|  10.9k|  do {                                             \
  |  |   31|  10.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  10.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  10.9k|    const __m256i _in = in;                        \
  |  |   34|  10.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  10.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  10.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 10.9k]
  |  |  ------------------
  ------------------
 1246|  10.9k|  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
  ------------------
  |  |   30|  10.9k|  do {                                             \
  |  |   31|  10.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  10.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  10.9k|    const __m256i _in = in;                        \
  |  |   34|  10.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  10.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  10.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 10.9k]
  |  |  ------------------
  ------------------
 1247|       |
 1248|       |  // stage 3
 1249|  10.9k|  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  ------------------
  |  |   30|  10.9k|  do {                                             \
  |  |   31|  10.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  10.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  10.9k|    const __m256i _in = in;                        \
  |  |   34|  10.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  10.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  10.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 10.9k]
  |  |  ------------------
  ------------------
 1250|  10.9k|  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  ------------------
  |  |   30|  10.9k|  do {                                             \
  |  |   31|  10.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  10.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  10.9k|    const __m256i _in = in;                        \
  |  |   34|  10.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  10.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  10.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 10.9k]
  |  |  ------------------
  ------------------
 1251|  10.9k|  x[33] = x[32];
 1252|  10.9k|  x[38] = x[39];
 1253|  10.9k|  x[41] = x[40];
 1254|  10.9k|  x[46] = x[47];
 1255|  10.9k|  x[49] = x[48];
 1256|  10.9k|  x[54] = x[55];
 1257|  10.9k|  x[57] = x[56];
 1258|  10.9k|  x[62] = x[63];
 1259|       |
 1260|       |  // stage 4
 1261|  10.9k|  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  ------------------
  |  |   30|  10.9k|  do {                                             \
  |  |   31|  10.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  10.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  10.9k|    const __m256i _in = in;                        \
  |  |   34|  10.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  10.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  10.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 10.9k]
  |  |  ------------------
  ------------------
 1262|  10.9k|  x[17] = x[16];
 1263|  10.9k|  x[22] = x[23];
 1264|  10.9k|  x[25] = x[24];
 1265|  10.9k|  x[30] = x[31];
 1266|  10.9k|  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r,
 1267|  10.9k|                  INV_COS_BIT);
  ------------------
  |  |   43|  10.9k|#define INV_COS_BIT 12
  ------------------
 1268|  10.9k|  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r,
 1269|  10.9k|                  INV_COS_BIT);
  ------------------
  |  |   43|  10.9k|#define INV_COS_BIT 12
  ------------------
 1270|  10.9k|  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r,
 1271|  10.9k|                  INV_COS_BIT);
  ------------------
  |  |   43|  10.9k|#define INV_COS_BIT 12
  ------------------
 1272|  10.9k|  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r,
 1273|  10.9k|                  INV_COS_BIT);
  ------------------
  |  |   43|  10.9k|#define INV_COS_BIT 12
  ------------------
 1274|       |
 1275|       |  // stage 5
 1276|  10.9k|  x[9] = x[8];
 1277|  10.9k|  x[14] = x[15];
 1278|  10.9k|  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r,
 1279|  10.9k|                  INV_COS_BIT);
  ------------------
  |  |   43|  10.9k|#define INV_COS_BIT 12
  ------------------
 1280|  10.9k|  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r,
 1281|  10.9k|                  INV_COS_BIT);
  ------------------
  |  |   43|  10.9k|#define INV_COS_BIT 12
  ------------------
 1282|  10.9k|  x[35] = x[32];
 1283|  10.9k|  x[34] = x[33];
 1284|  10.9k|  x[36] = x[39];
 1285|  10.9k|  x[37] = x[38];
 1286|  10.9k|  x[43] = x[40];
 1287|  10.9k|  x[42] = x[41];
 1288|  10.9k|  x[44] = x[47];
 1289|  10.9k|  x[45] = x[46];
 1290|  10.9k|  x[51] = x[48];
 1291|  10.9k|  x[50] = x[49];
 1292|  10.9k|  x[52] = x[55];
 1293|  10.9k|  x[53] = x[54];
 1294|  10.9k|  x[59] = x[56];
 1295|  10.9k|  x[58] = x[57];
 1296|  10.9k|  x[60] = x[63];
 1297|  10.9k|  x[61] = x[62];
 1298|       |
 1299|       |  // stage 6
 1300|  10.9k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   30|  10.9k|  do {                                             \
  |  |   31|  10.9k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  10.9k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  10.9k|    const __m256i _in = in;                        \
  |  |   34|  10.9k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  10.9k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  10.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 10.9k]
  |  |  ------------------
  ------------------
 1301|  10.9k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
  ------------------
  |  |   43|  10.9k|#define INV_COS_BIT 12
  ------------------
 1302|  10.9k|  x[19] = x[16];
 1303|  10.9k|  x[18] = x[17];
 1304|  10.9k|  x[20] = x[23];
 1305|  10.9k|  x[21] = x[22];
 1306|  10.9k|  x[27] = x[24];
 1307|  10.9k|  x[26] = x[25];
 1308|  10.9k|  x[28] = x[31];
 1309|  10.9k|  x[29] = x[30];
 1310|  10.9k|  idct64_stage6_high32_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  10.9k|#define INV_COS_BIT 12
  ------------------
 1311|       |
 1312|       |  // stage 7
 1313|  10.9k|  x[3] = x[0];
 1314|  10.9k|  x[2] = x[1];
 1315|  10.9k|  x[11] = x[8];
 1316|  10.9k|  x[10] = x[9];
 1317|  10.9k|  x[12] = x[15];
 1318|  10.9k|  x[13] = x[14];
 1319|  10.9k|  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  10.9k|#define INV_COS_BIT 12
  ------------------
 1320|       |
 1321|       |  // stage 8
 1322|  10.9k|  x[7] = x[0];
 1323|  10.9k|  x[6] = x[1];
 1324|  10.9k|  x[5] = x[2];
 1325|  10.9k|  x[4] = x[3];
 1326|  10.9k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
 1327|  10.9k|                  INV_COS_BIT);
  ------------------
  |  |   43|  10.9k|#define INV_COS_BIT 12
  ------------------
 1328|  10.9k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
 1329|  10.9k|                  INV_COS_BIT);
  ------------------
  |  |   43|  10.9k|#define INV_COS_BIT 12
  ------------------
 1330|  10.9k|  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  10.9k|#define INV_COS_BIT 12
  ------------------
 1331|       |
 1332|  10.9k|  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  10.9k|#define INV_COS_BIT 12
  ------------------
 1333|  10.9k|  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  10.9k|#define INV_COS_BIT 12
  ------------------
 1334|  10.9k|  idct64_stage11_avx2(output, x);
 1335|  10.9k|}
av1_inv_txfm_avx2.c:idct64_stage6_high32_avx2:
  942|  22.5k|                                             const __m256i _r, int8_t cos_bit) {
  943|  22.5k|  (void)cos_bit;
  944|  22.5k|  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  945|  22.5k|  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  946|  22.5k|  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  947|  22.5k|  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  948|  22.5k|  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  949|  22.5k|  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  950|  22.5k|  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit);
  951|  22.5k|  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit);
  952|  22.5k|  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit);
  953|  22.5k|  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit);
  954|  22.5k|  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit);
  955|  22.5k|  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit);
  956|  22.5k|  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit);
  957|  22.5k|  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit);
  958|  22.5k|}
av1_inv_txfm_avx2.c:idct64_stage7_high48_avx2:
  974|  22.5k|                                             const __m256i _r, int8_t cos_bit) {
  975|  22.5k|  (void)cos_bit;
  976|  22.5k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  977|  22.5k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  978|  22.5k|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  979|  22.5k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
  980|  22.5k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
  981|  22.5k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
  982|  22.5k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
  983|  22.5k|  btf_16_adds_subs_avx2(&x[32], &x[39]);
  984|  22.5k|  btf_16_adds_subs_avx2(&x[33], &x[38]);
  985|  22.5k|  btf_16_adds_subs_avx2(&x[34], &x[37]);
  986|  22.5k|  btf_16_adds_subs_avx2(&x[35], &x[36]);
  987|  22.5k|  btf_16_adds_subs_avx2(&x[47], &x[40]);
  988|  22.5k|  btf_16_adds_subs_avx2(&x[46], &x[41]);
  989|  22.5k|  btf_16_adds_subs_avx2(&x[45], &x[42]);
  990|  22.5k|  btf_16_adds_subs_avx2(&x[44], &x[43]);
  991|  22.5k|  btf_16_adds_subs_avx2(&x[48], &x[55]);
  992|  22.5k|  btf_16_adds_subs_avx2(&x[49], &x[54]);
  993|  22.5k|  btf_16_adds_subs_avx2(&x[50], &x[53]);
  994|  22.5k|  btf_16_adds_subs_avx2(&x[51], &x[52]);
  995|  22.5k|  btf_16_adds_subs_avx2(&x[63], &x[56]);
  996|  22.5k|  btf_16_adds_subs_avx2(&x[62], &x[57]);
  997|  22.5k|  btf_16_adds_subs_avx2(&x[61], &x[58]);
  998|  22.5k|  btf_16_adds_subs_avx2(&x[60], &x[59]);
  999|  22.5k|}
av1_inv_txfm_avx2.c:idct64_stage8_high48_avx2:
 1002|  22.5k|                                             const __m256i _r, int8_t cos_bit) {
 1003|  22.5k|  (void)cos_bit;
 1004|  22.5k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
 1005|  22.5k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
 1006|  22.5k|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
 1007|  22.5k|  btf_16_adds_subs_avx2(&x[16], &x[23]);
 1008|  22.5k|  btf_16_adds_subs_avx2(&x[17], &x[22]);
 1009|  22.5k|  btf_16_adds_subs_avx2(&x[18], &x[21]);
 1010|  22.5k|  btf_16_adds_subs_avx2(&x[19], &x[20]);
 1011|  22.5k|  btf_16_adds_subs_avx2(&x[31], &x[24]);
 1012|  22.5k|  btf_16_adds_subs_avx2(&x[30], &x[25]);
 1013|  22.5k|  btf_16_adds_subs_avx2(&x[29], &x[26]);
 1014|  22.5k|  btf_16_adds_subs_avx2(&x[28], &x[27]);
 1015|  22.5k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit);
 1016|  22.5k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit);
 1017|  22.5k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit);
 1018|  22.5k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit);
 1019|  22.5k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit);
 1020|  22.5k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit);
 1021|  22.5k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit);
 1022|  22.5k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit);
 1023|  22.5k|}
av1_inv_txfm_avx2.c:idct64_stage9_avx2:
 1026|  22.5k|                                      const __m256i _r, int8_t cos_bit) {
 1027|  22.5k|  (void)cos_bit;
 1028|  22.5k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
 1029|  22.5k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 1030|  22.5k|  btf_16_adds_subs_avx2(&x[0], &x[15]);
 1031|  22.5k|  btf_16_adds_subs_avx2(&x[1], &x[14]);
 1032|  22.5k|  btf_16_adds_subs_avx2(&x[2], &x[13]);
 1033|  22.5k|  btf_16_adds_subs_avx2(&x[3], &x[12]);
 1034|  22.5k|  btf_16_adds_subs_avx2(&x[4], &x[11]);
 1035|  22.5k|  btf_16_adds_subs_avx2(&x[5], &x[10]);
 1036|  22.5k|  btf_16_adds_subs_avx2(&x[6], &x[9]);
 1037|  22.5k|  btf_16_adds_subs_avx2(&x[7], &x[8]);
 1038|  22.5k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
 1039|  22.5k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
 1040|  22.5k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
 1041|  22.5k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
 1042|  22.5k|  btf_16_adds_subs_avx2(&x[32], &x[47]);
 1043|  22.5k|  btf_16_adds_subs_avx2(&x[33], &x[46]);
 1044|  22.5k|  btf_16_adds_subs_avx2(&x[34], &x[45]);
 1045|  22.5k|  btf_16_adds_subs_avx2(&x[35], &x[44]);
 1046|  22.5k|  btf_16_adds_subs_avx2(&x[36], &x[43]);
 1047|  22.5k|  btf_16_adds_subs_avx2(&x[37], &x[42]);
 1048|  22.5k|  btf_16_adds_subs_avx2(&x[38], &x[41]);
 1049|  22.5k|  btf_16_adds_subs_avx2(&x[39], &x[40]);
 1050|  22.5k|  btf_16_adds_subs_avx2(&x[63], &x[48]);
 1051|  22.5k|  btf_16_adds_subs_avx2(&x[62], &x[49]);
 1052|  22.5k|  btf_16_adds_subs_avx2(&x[61], &x[50]);
 1053|  22.5k|  btf_16_adds_subs_avx2(&x[60], &x[51]);
 1054|  22.5k|  btf_16_adds_subs_avx2(&x[59], &x[52]);
 1055|  22.5k|  btf_16_adds_subs_avx2(&x[58], &x[53]);
 1056|  22.5k|  btf_16_adds_subs_avx2(&x[57], &x[54]);
 1057|  22.5k|  btf_16_adds_subs_avx2(&x[56], &x[55]);
 1058|  22.5k|}
av1_inv_txfm_avx2.c:idct64_stage10_avx2:
 1061|  22.5k|                                       const __m256i _r, int8_t cos_bit) {
 1062|  22.5k|  (void)cos_bit;
 1063|  22.5k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
 1064|  22.5k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 1065|  22.5k|  btf_16_adds_subs_avx2(&x[0], &x[31]);
 1066|  22.5k|  btf_16_adds_subs_avx2(&x[1], &x[30]);
 1067|  22.5k|  btf_16_adds_subs_avx2(&x[2], &x[29]);
 1068|  22.5k|  btf_16_adds_subs_avx2(&x[3], &x[28]);
 1069|  22.5k|  btf_16_adds_subs_avx2(&x[4], &x[27]);
 1070|  22.5k|  btf_16_adds_subs_avx2(&x[5], &x[26]);
 1071|  22.5k|  btf_16_adds_subs_avx2(&x[6], &x[25]);
 1072|  22.5k|  btf_16_adds_subs_avx2(&x[7], &x[24]);
 1073|  22.5k|  btf_16_adds_subs_avx2(&x[8], &x[23]);
 1074|  22.5k|  btf_16_adds_subs_avx2(&x[9], &x[22]);
 1075|  22.5k|  btf_16_adds_subs_avx2(&x[10], &x[21]);
 1076|  22.5k|  btf_16_adds_subs_avx2(&x[11], &x[20]);
 1077|  22.5k|  btf_16_adds_subs_avx2(&x[12], &x[19]);
 1078|  22.5k|  btf_16_adds_subs_avx2(&x[13], &x[18]);
 1079|  22.5k|  btf_16_adds_subs_avx2(&x[14], &x[17]);
 1080|  22.5k|  btf_16_adds_subs_avx2(&x[15], &x[16]);
 1081|  22.5k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit);
 1082|  22.5k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit);
 1083|  22.5k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit);
 1084|  22.5k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit);
 1085|  22.5k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit);
 1086|  22.5k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit);
 1087|  22.5k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit);
 1088|  22.5k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit);
 1089|  22.5k|}
av1_inv_txfm_avx2.c:idct64_stage11_avx2:
 1091|  22.5k|static inline void idct64_stage11_avx2(__m256i *output, __m256i *x) {
 1092|  22.5k|  btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]);
 1093|  22.5k|  btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]);
 1094|  22.5k|  btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]);
 1095|  22.5k|  btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]);
 1096|  22.5k|  btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]);
 1097|  22.5k|  btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]);
 1098|  22.5k|  btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]);
 1099|  22.5k|  btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]);
 1100|  22.5k|  btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]);
 1101|  22.5k|  btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]);
 1102|  22.5k|  btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]);
 1103|  22.5k|  btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]);
 1104|  22.5k|  btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]);
 1105|  22.5k|  btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]);
 1106|  22.5k|  btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]);
 1107|  22.5k|  btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]);
 1108|  22.5k|  btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]);
 1109|  22.5k|  btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]);
 1110|  22.5k|  btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]);
 1111|  22.5k|  btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]);
 1112|  22.5k|  btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]);
 1113|  22.5k|  btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]);
 1114|  22.5k|  btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]);
 1115|  22.5k|  btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]);
 1116|  22.5k|  btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]);
 1117|  22.5k|  btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]);
 1118|  22.5k|  btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]);
 1119|  22.5k|  btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]);
 1120|  22.5k|  btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]);
 1121|  22.5k|  btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]);
 1122|  22.5k|  btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]);
 1123|  22.5k|  btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
 1124|  22.5k|}
av1_inv_txfm_avx2.c:idct64_low16_avx2:
 1337|  6.11k|static void idct64_low16_avx2(const __m256i *input, __m256i *output) {
 1338|  6.11k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  6.11k|#define INV_COS_BIT 12
  ------------------
 1339|  6.11k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  6.11k|#define INV_COS_BIT 12
  ------------------
 1340|       |
 1341|  6.11k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 1342|  6.11k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
 1343|  6.11k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
 1344|  6.11k|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
 1345|  6.11k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
 1346|       |
 1347|       |  // stage 1
 1348|  6.11k|  __m256i x[64];
 1349|  6.11k|  x[0] = input[0];
 1350|  6.11k|  x[4] = input[8];
 1351|  6.11k|  x[8] = input[4];
 1352|  6.11k|  x[12] = input[12];
 1353|  6.11k|  x[16] = input[2];
 1354|  6.11k|  x[20] = input[10];
 1355|  6.11k|  x[24] = input[6];
 1356|  6.11k|  x[28] = input[14];
 1357|  6.11k|  x[32] = input[1];
 1358|  6.11k|  x[36] = input[9];
 1359|  6.11k|  x[40] = input[5];
 1360|  6.11k|  x[44] = input[13];
 1361|  6.11k|  x[48] = input[3];
 1362|  6.11k|  x[52] = input[11];
 1363|  6.11k|  x[56] = input[7];
 1364|  6.11k|  x[60] = input[15];
 1365|       |
 1366|       |  // stage 2
 1367|  6.11k|  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  ------------------
  |  |   30|  6.11k|  do {                                             \
  |  |   31|  6.11k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  6.11k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  6.11k|    const __m256i _in = in;                        \
  |  |   34|  6.11k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  6.11k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  6.11k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 6.11k]
  |  |  ------------------
  ------------------
 1368|  6.11k|  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  ------------------
  |  |   30|  6.11k|  do {                                             \
  |  |   31|  6.11k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  6.11k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  6.11k|    const __m256i _in = in;                        \
  |  |   34|  6.11k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  6.11k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  6.11k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 6.11k]
  |  |  ------------------
  ------------------
 1369|  6.11k|  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  ------------------
  |  |   30|  6.11k|  do {                                             \
  |  |   31|  6.11k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  6.11k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  6.11k|    const __m256i _in = in;                        \
  |  |   34|  6.11k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  6.11k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  6.11k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 6.11k]
  |  |  ------------------
  ------------------
 1370|  6.11k|  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  ------------------
  |  |   30|  6.11k|  do {                                             \
  |  |   31|  6.11k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  6.11k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  6.11k|    const __m256i _in = in;                        \
  |  |   34|  6.11k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  6.11k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  6.11k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 6.11k]
  |  |  ------------------
  ------------------
 1371|  6.11k|  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  ------------------
  |  |   30|  6.11k|  do {                                             \
  |  |   31|  6.11k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  6.11k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  6.11k|    const __m256i _in = in;                        \
  |  |   34|  6.11k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  6.11k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  6.11k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 6.11k]
  |  |  ------------------
  ------------------
 1372|  6.11k|  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  ------------------
  |  |   30|  6.11k|  do {                                             \
  |  |   31|  6.11k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  6.11k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  6.11k|    const __m256i _in = in;                        \
  |  |   34|  6.11k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  6.11k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  6.11k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 6.11k]
  |  |  ------------------
  ------------------
 1373|  6.11k|  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  ------------------
  |  |   30|  6.11k|  do {                                             \
  |  |   31|  6.11k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  6.11k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  6.11k|    const __m256i _in = in;                        \
  |  |   34|  6.11k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  6.11k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  6.11k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 6.11k]
  |  |  ------------------
  ------------------
 1374|  6.11k|  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
  ------------------
  |  |   30|  6.11k|  do {                                             \
  |  |   31|  6.11k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  6.11k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  6.11k|    const __m256i _in = in;                        \
  |  |   34|  6.11k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  6.11k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  6.11k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 6.11k]
  |  |  ------------------
  ------------------
 1375|       |
 1376|       |  // stage 3
 1377|  6.11k|  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  ------------------
  |  |   30|  6.11k|  do {                                             \
  |  |   31|  6.11k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  6.11k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  6.11k|    const __m256i _in = in;                        \
  |  |   34|  6.11k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  6.11k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  6.11k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 6.11k]
  |  |  ------------------
  ------------------
 1378|  6.11k|  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  ------------------
  |  |   30|  6.11k|  do {                                             \
  |  |   31|  6.11k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  6.11k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  6.11k|    const __m256i _in = in;                        \
  |  |   34|  6.11k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  6.11k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  6.11k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 6.11k]
  |  |  ------------------
  ------------------
 1379|  6.11k|  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  ------------------
  |  |   30|  6.11k|  do {                                             \
  |  |   31|  6.11k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  6.11k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  6.11k|    const __m256i _in = in;                        \
  |  |   34|  6.11k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  6.11k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  6.11k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 6.11k]
  |  |  ------------------
  ------------------
 1380|  6.11k|  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  ------------------
  |  |   30|  6.11k|  do {                                             \
  |  |   31|  6.11k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  6.11k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  6.11k|    const __m256i _in = in;                        \
  |  |   34|  6.11k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  6.11k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  6.11k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 6.11k]
  |  |  ------------------
  ------------------
 1381|  6.11k|  x[33] = x[32];
 1382|  6.11k|  x[34] = x[35];
 1383|  6.11k|  x[37] = x[36];
 1384|  6.11k|  x[38] = x[39];
 1385|  6.11k|  x[41] = x[40];
 1386|  6.11k|  x[42] = x[43];
 1387|  6.11k|  x[45] = x[44];
 1388|  6.11k|  x[46] = x[47];
 1389|  6.11k|  x[49] = x[48];
 1390|  6.11k|  x[50] = x[51];
 1391|  6.11k|  x[53] = x[52];
 1392|  6.11k|  x[54] = x[55];
 1393|  6.11k|  x[57] = x[56];
 1394|  6.11k|  x[58] = x[59];
 1395|  6.11k|  x[61] = x[60];
 1396|  6.11k|  x[62] = x[63];
 1397|       |
 1398|       |  // stage 4
 1399|  6.11k|  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  ------------------
  |  |   30|  6.11k|  do {                                             \
  |  |   31|  6.11k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  6.11k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  6.11k|    const __m256i _in = in;                        \
  |  |   34|  6.11k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  6.11k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  6.11k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 6.11k]
  |  |  ------------------
  ------------------
 1400|  6.11k|  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  ------------------
  |  |   30|  6.11k|  do {                                             \
  |  |   31|  6.11k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  6.11k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  6.11k|    const __m256i _in = in;                        \
  |  |   34|  6.11k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  6.11k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  6.11k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 6.11k]
  |  |  ------------------
  ------------------
 1401|  6.11k|  x[17] = x[16];
 1402|  6.11k|  x[18] = x[19];
 1403|  6.11k|  x[21] = x[20];
 1404|  6.11k|  x[22] = x[23];
 1405|  6.11k|  x[25] = x[24];
 1406|  6.11k|  x[26] = x[27];
 1407|  6.11k|  x[29] = x[28];
 1408|  6.11k|  x[30] = x[31];
 1409|  6.11k|  idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  6.11k|#define INV_COS_BIT 12
  ------------------
 1410|       |
 1411|       |  // stage 5
 1412|  6.11k|  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  ------------------
  |  |   30|  6.11k|  do {                                             \
  |  |   31|  6.11k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  6.11k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  6.11k|    const __m256i _in = in;                        \
  |  |   34|  6.11k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  6.11k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  6.11k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 6.11k]
  |  |  ------------------
  ------------------
 1413|  6.11k|  x[9] = x[8];
 1414|  6.11k|  x[10] = x[11];
 1415|  6.11k|  x[13] = x[12];
 1416|  6.11k|  x[14] = x[15];
 1417|  6.11k|  idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  6.11k|#define INV_COS_BIT 12
  ------------------
 1418|       |
 1419|       |  // stage 6
 1420|  6.11k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   30|  6.11k|  do {                                             \
  |  |   31|  6.11k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  6.11k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  6.11k|    const __m256i _in = in;                        \
  |  |   34|  6.11k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  6.11k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  6.11k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 6.11k]
  |  |  ------------------
  ------------------
 1421|  6.11k|  x[5] = x[4];
 1422|  6.11k|  x[6] = x[7];
 1423|  6.11k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
  ------------------
  |  |   43|  6.11k|#define INV_COS_BIT 12
  ------------------
 1424|  6.11k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
 1425|  6.11k|                  INV_COS_BIT);
  ------------------
  |  |   43|  6.11k|#define INV_COS_BIT 12
  ------------------
 1426|  6.11k|  idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  6.11k|#define INV_COS_BIT 12
  ------------------
 1427|       |
 1428|       |  // stage 7
 1429|  6.11k|  x[3] = x[0];
 1430|  6.11k|  x[2] = x[1];
 1431|  6.11k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
  ------------------
  |  |   43|  6.11k|#define INV_COS_BIT 12
  ------------------
 1432|  6.11k|  btf_16_adds_subs_avx2(&x[8], &x[11]);
 1433|  6.11k|  btf_16_adds_subs_avx2(&x[9], &x[10]);
 1434|  6.11k|  btf_16_adds_subs_avx2(&x[15], &x[12]);
 1435|  6.11k|  btf_16_adds_subs_avx2(&x[14], &x[13]);
 1436|  6.11k|  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  6.11k|#define INV_COS_BIT 12
  ------------------
 1437|       |
 1438|       |  // stage 8
 1439|  6.11k|  btf_16_adds_subs_avx2(&x[0], &x[7]);
 1440|  6.11k|  btf_16_adds_subs_avx2(&x[1], &x[6]);
 1441|  6.11k|  btf_16_adds_subs_avx2(&x[2], &x[5]);
 1442|  6.11k|  btf_16_adds_subs_avx2(&x[3], &x[4]);
 1443|  6.11k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
 1444|  6.11k|                  INV_COS_BIT);
  ------------------
  |  |   43|  6.11k|#define INV_COS_BIT 12
  ------------------
 1445|  6.11k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
 1446|  6.11k|                  INV_COS_BIT);
  ------------------
  |  |   43|  6.11k|#define INV_COS_BIT 12
  ------------------
 1447|  6.11k|  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  6.11k|#define INV_COS_BIT 12
  ------------------
 1448|       |
 1449|  6.11k|  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  6.11k|#define INV_COS_BIT 12
  ------------------
 1450|  6.11k|  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  6.11k|#define INV_COS_BIT 12
  ------------------
 1451|  6.11k|  idct64_stage11_avx2(output, x);
 1452|  6.11k|}
av1_inv_txfm_avx2.c:idct64_stage4_high32_avx2:
  886|  11.6k|                                             const __m256i _r, int8_t cos_bit) {
  887|  11.6k|  (void)cos_bit;
  888|  11.6k|  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  889|  11.6k|  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  890|  11.6k|  const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
  891|  11.6k|  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  892|  11.6k|  const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
  893|  11.6k|  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  894|  11.6k|  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  895|  11.6k|  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  896|  11.6k|  const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
  897|  11.6k|  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  898|  11.6k|  const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
  899|  11.6k|  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  900|  11.6k|  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
  901|  11.6k|  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit);
  902|  11.6k|  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit);
  903|  11.6k|  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
  904|  11.6k|  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
  905|  11.6k|  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit);
  906|  11.6k|  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit);
  907|  11.6k|  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
  908|  11.6k|}
av1_inv_txfm_avx2.c:idct64_stage5_high48_avx2:
  911|  11.6k|                                             const __m256i _r, int8_t cos_bit) {
  912|  11.6k|  (void)cos_bit;
  913|  11.6k|  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  914|  11.6k|  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  915|  11.6k|  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  916|  11.6k|  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  917|  11.6k|  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  918|  11.6k|  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  919|  11.6k|  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
  920|  11.6k|  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
  921|  11.6k|  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
  922|  11.6k|  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
  923|  11.6k|  btf_16_adds_subs_avx2(&x[32], &x[35]);
  924|  11.6k|  btf_16_adds_subs_avx2(&x[33], &x[34]);
  925|  11.6k|  btf_16_adds_subs_avx2(&x[39], &x[36]);
  926|  11.6k|  btf_16_adds_subs_avx2(&x[38], &x[37]);
  927|  11.6k|  btf_16_adds_subs_avx2(&x[40], &x[43]);
  928|  11.6k|  btf_16_adds_subs_avx2(&x[41], &x[42]);
  929|  11.6k|  btf_16_adds_subs_avx2(&x[47], &x[44]);
  930|  11.6k|  btf_16_adds_subs_avx2(&x[46], &x[45]);
  931|  11.6k|  btf_16_adds_subs_avx2(&x[48], &x[51]);
  932|  11.6k|  btf_16_adds_subs_avx2(&x[49], &x[50]);
  933|  11.6k|  btf_16_adds_subs_avx2(&x[55], &x[52]);
  934|  11.6k|  btf_16_adds_subs_avx2(&x[54], &x[53]);
  935|  11.6k|  btf_16_adds_subs_avx2(&x[56], &x[59]);
  936|  11.6k|  btf_16_adds_subs_avx2(&x[57], &x[58]);
  937|  11.6k|  btf_16_adds_subs_avx2(&x[63], &x[60]);
  938|  11.6k|  btf_16_adds_subs_avx2(&x[62], &x[61]);
  939|  11.6k|}
av1_inv_txfm_avx2.c:idct64_stage6_high48_avx2:
  961|  11.6k|                                             const __m256i _r, int8_t cos_bit) {
  962|  11.6k|  btf_16_adds_subs_avx2(&x[16], &x[19]);
  963|  11.6k|  btf_16_adds_subs_avx2(&x[17], &x[18]);
  964|  11.6k|  btf_16_adds_subs_avx2(&x[23], &x[20]);
  965|  11.6k|  btf_16_adds_subs_avx2(&x[22], &x[21]);
  966|  11.6k|  btf_16_adds_subs_avx2(&x[24], &x[27]);
  967|  11.6k|  btf_16_adds_subs_avx2(&x[25], &x[26]);
  968|  11.6k|  btf_16_adds_subs_avx2(&x[31], &x[28]);
  969|  11.6k|  btf_16_adds_subs_avx2(&x[30], &x[29]);
  970|  11.6k|  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
  971|  11.6k|}
av1_inv_txfm_avx2.c:idct64_low32_avx2:
 1454|  5.49k|static void idct64_low32_avx2(const __m256i *input, __m256i *output) {
 1455|  5.49k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  5.49k|#define INV_COS_BIT 12
  ------------------
 1456|  5.49k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  5.49k|#define INV_COS_BIT 12
  ------------------
 1457|       |
 1458|  5.49k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 1459|  5.49k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
 1460|  5.49k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
 1461|  5.49k|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
 1462|  5.49k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
 1463|       |
 1464|       |  // stage 1
 1465|  5.49k|  __m256i x[64];
 1466|  5.49k|  x[0] = input[0];
 1467|  5.49k|  x[2] = input[16];
 1468|  5.49k|  x[4] = input[8];
 1469|  5.49k|  x[6] = input[24];
 1470|  5.49k|  x[8] = input[4];
 1471|  5.49k|  x[10] = input[20];
 1472|  5.49k|  x[12] = input[12];
 1473|  5.49k|  x[14] = input[28];
 1474|  5.49k|  x[16] = input[2];
 1475|  5.49k|  x[18] = input[18];
 1476|  5.49k|  x[20] = input[10];
 1477|  5.49k|  x[22] = input[26];
 1478|  5.49k|  x[24] = input[6];
 1479|  5.49k|  x[26] = input[22];
 1480|  5.49k|  x[28] = input[14];
 1481|  5.49k|  x[30] = input[30];
 1482|  5.49k|  x[32] = input[1];
 1483|  5.49k|  x[34] = input[17];
 1484|  5.49k|  x[36] = input[9];
 1485|  5.49k|  x[38] = input[25];
 1486|  5.49k|  x[40] = input[5];
 1487|  5.49k|  x[42] = input[21];
 1488|  5.49k|  x[44] = input[13];
 1489|  5.49k|  x[46] = input[29];
 1490|  5.49k|  x[48] = input[3];
 1491|  5.49k|  x[50] = input[19];
 1492|  5.49k|  x[52] = input[11];
 1493|  5.49k|  x[54] = input[27];
 1494|  5.49k|  x[56] = input[7];
 1495|  5.49k|  x[58] = input[23];
 1496|  5.49k|  x[60] = input[15];
 1497|  5.49k|  x[62] = input[31];
 1498|       |
 1499|       |  // stage 2
 1500|  5.49k|  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1501|  5.49k|  btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1502|  5.49k|  btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1503|  5.49k|  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1504|  5.49k|  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1505|  5.49k|  btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1506|  5.49k|  btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1507|  5.49k|  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1508|  5.49k|  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1509|  5.49k|  btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1510|  5.49k|  btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1511|  5.49k|  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1512|  5.49k|  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1513|  5.49k|  btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1514|  5.49k|  btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1515|  5.49k|  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1516|       |
 1517|       |  // stage 3
 1518|  5.49k|  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1519|  5.49k|  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1520|  5.49k|  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1521|  5.49k|  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1522|  5.49k|  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1523|  5.49k|  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1524|  5.49k|  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1525|  5.49k|  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1526|  5.49k|  btf_16_adds_subs_avx2(&x[32], &x[33]);
 1527|  5.49k|  btf_16_adds_subs_avx2(&x[35], &x[34]);
 1528|  5.49k|  btf_16_adds_subs_avx2(&x[36], &x[37]);
 1529|  5.49k|  btf_16_adds_subs_avx2(&x[39], &x[38]);
 1530|  5.49k|  btf_16_adds_subs_avx2(&x[40], &x[41]);
 1531|  5.49k|  btf_16_adds_subs_avx2(&x[43], &x[42]);
 1532|  5.49k|  btf_16_adds_subs_avx2(&x[44], &x[45]);
 1533|  5.49k|  btf_16_adds_subs_avx2(&x[47], &x[46]);
 1534|  5.49k|  btf_16_adds_subs_avx2(&x[48], &x[49]);
 1535|  5.49k|  btf_16_adds_subs_avx2(&x[51], &x[50]);
 1536|  5.49k|  btf_16_adds_subs_avx2(&x[52], &x[53]);
 1537|  5.49k|  btf_16_adds_subs_avx2(&x[55], &x[54]);
 1538|  5.49k|  btf_16_adds_subs_avx2(&x[56], &x[57]);
 1539|  5.49k|  btf_16_adds_subs_avx2(&x[59], &x[58]);
 1540|  5.49k|  btf_16_adds_subs_avx2(&x[60], &x[61]);
 1541|  5.49k|  btf_16_adds_subs_avx2(&x[63], &x[62]);
 1542|       |
 1543|       |  // stage 4
 1544|  5.49k|  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1545|  5.49k|  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1546|  5.49k|  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1547|  5.49k|  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1548|  5.49k|  btf_16_adds_subs_avx2(&x[16], &x[17]);
 1549|  5.49k|  btf_16_adds_subs_avx2(&x[19], &x[18]);
 1550|  5.49k|  btf_16_adds_subs_avx2(&x[20], &x[21]);
 1551|  5.49k|  btf_16_adds_subs_avx2(&x[23], &x[22]);
 1552|  5.49k|  btf_16_adds_subs_avx2(&x[24], &x[25]);
 1553|  5.49k|  btf_16_adds_subs_avx2(&x[27], &x[26]);
 1554|  5.49k|  btf_16_adds_subs_avx2(&x[28], &x[29]);
 1555|  5.49k|  btf_16_adds_subs_avx2(&x[31], &x[30]);
 1556|  5.49k|  idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  5.49k|#define INV_COS_BIT 12
  ------------------
 1557|       |
 1558|       |  // stage 5
 1559|  5.49k|  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1560|  5.49k|  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1561|  5.49k|  btf_16_adds_subs_avx2(&x[8], &x[9]);
 1562|  5.49k|  btf_16_adds_subs_avx2(&x[11], &x[10]);
 1563|  5.49k|  btf_16_adds_subs_avx2(&x[12], &x[13]);
 1564|  5.49k|  btf_16_adds_subs_avx2(&x[15], &x[14]);
 1565|  5.49k|  idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  5.49k|#define INV_COS_BIT 12
  ------------------
 1566|       |
 1567|       |  // stage 6
 1568|  5.49k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1569|  5.49k|  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
  ------------------
  |  |   30|  5.49k|  do {                                             \
  |  |   31|  5.49k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|  5.49k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|  5.49k|    const __m256i _in = in;                        \
  |  |   34|  5.49k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|  5.49k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|  5.49k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded, False: 5.49k]
  |  |  ------------------
  ------------------
 1570|  5.49k|  btf_16_adds_subs_avx2(&x[4], &x[5]);
 1571|  5.49k|  btf_16_adds_subs_avx2(&x[7], &x[6]);
 1572|  5.49k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
  ------------------
  |  |   43|  5.49k|#define INV_COS_BIT 12
  ------------------
 1573|  5.49k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
 1574|  5.49k|                  INV_COS_BIT);
  ------------------
  |  |   43|  5.49k|#define INV_COS_BIT 12
  ------------------
 1575|  5.49k|  idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  5.49k|#define INV_COS_BIT 12
  ------------------
 1576|       |
 1577|       |  // stage 7
 1578|  5.49k|  btf_16_adds_subs_avx2(&x[0], &x[3]);
 1579|  5.49k|  btf_16_adds_subs_avx2(&x[1], &x[2]);
 1580|  5.49k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
  ------------------
  |  |   43|  5.49k|#define INV_COS_BIT 12
  ------------------
 1581|  5.49k|  btf_16_adds_subs_avx2(&x[8], &x[11]);
 1582|  5.49k|  btf_16_adds_subs_avx2(&x[9], &x[10]);
 1583|  5.49k|  btf_16_adds_subs_avx2(&x[15], &x[12]);
 1584|  5.49k|  btf_16_adds_subs_avx2(&x[14], &x[13]);
 1585|  5.49k|  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  5.49k|#define INV_COS_BIT 12
  ------------------
 1586|       |
 1587|       |  // stage 8
 1588|  5.49k|  btf_16_adds_subs_avx2(&x[0], &x[7]);
 1589|  5.49k|  btf_16_adds_subs_avx2(&x[1], &x[6]);
 1590|  5.49k|  btf_16_adds_subs_avx2(&x[2], &x[5]);
 1591|  5.49k|  btf_16_adds_subs_avx2(&x[3], &x[4]);
 1592|  5.49k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
 1593|  5.49k|                  INV_COS_BIT);
  ------------------
  |  |   43|  5.49k|#define INV_COS_BIT 12
  ------------------
 1594|  5.49k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
 1595|  5.49k|                  INV_COS_BIT);
  ------------------
  |  |   43|  5.49k|#define INV_COS_BIT 12
  ------------------
 1596|  5.49k|  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  5.49k|#define INV_COS_BIT 12
  ------------------
 1597|       |
 1598|       |  // stage 9~11
 1599|  5.49k|  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  5.49k|#define INV_COS_BIT 12
  ------------------
 1600|  5.49k|  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  5.49k|#define INV_COS_BIT 12
  ------------------
 1601|  5.49k|  idct64_stage11_avx2(output, x);
 1602|  5.49k|}
av1_inv_txfm_avx2.c:lowbd_inv_txfm2d_add_idtx_avx2:
 1768|  1.95k|                                                  int32_t eob) {
 1769|  1.95k|  (void)eob;
 1770|  1.95k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 1771|  1.95k|  const int txw_idx = get_txw_idx(tx_size);
 1772|  1.95k|  const int txh_idx = get_txh_idx(tx_size);
 1773|  1.95k|  const int txfm_size_col = tx_size_wide[tx_size];
 1774|  1.95k|  const int txfm_size_row = tx_size_high[tx_size];
 1775|  1.95k|  const int col_max = AOMMIN(32, txfm_size_col);
  ------------------
  |  |   34|  1.95k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 1.95k]
  |  |  ------------------
  ------------------
 1776|  1.95k|  const int row_max = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  1.95k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 1.95k]
  |  |  ------------------
  ------------------
 1777|  1.95k|  const int input_stride = row_max;
 1778|  1.95k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 1779|  1.95k|  __m256i buf[32];
 1780|       |
 1781|  3.95k|  for (int i = 0; i < (col_max >> 4); ++i) {
  ------------------
  |  Branch (1781:19): [True: 1.99k, False: 1.95k]
  ------------------
 1782|  4.09k|    for (int j = 0; j < (row_max >> 4); j++) {
  ------------------
  |  Branch (1782:21): [True: 2.09k, False: 1.99k]
  ------------------
 1783|  2.09k|      iidentity_row_16xn_avx2(buf, input + j * 16 + i * 16 * input_stride,
 1784|  2.09k|                              row_max, shift[0], 16, txw_idx, rect_type);
 1785|  2.09k|      transpose_16bit_16x16_avx2(buf, buf);
 1786|  2.09k|      iidentity_col_16xn_avx2(output + i * 16 + j * 16 * stride, stride, buf,
 1787|  2.09k|                              shift[1], 16, txh_idx);
 1788|  2.09k|    }
 1789|  1.99k|  }
 1790|  1.95k|}
av1_inv_txfm_avx2.c:iidentity_row_16xn_avx2:
 1703|  2.28k|                                           int txw_idx, int rect_type) {
 1704|  2.28k|  const int32_t *input_row = input;
 1705|  2.28k|  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
 1706|  2.28k|  const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
  ------------------
  |  |   41|  2.28k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1707|  2.28k|                                       (1 << (NewSqrt2Bits - shift - 1)));
  ------------------
  |  |   41|  2.28k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1708|  2.28k|  const __m256i one = _mm256_set1_epi16(1);
 1709|  2.28k|  const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r);
 1710|  2.28k|  if (rect_type != 1 && rect_type != -1) {
  ------------------
  |  Branch (1710:7): [True: 2.23k, False: 52]
  |  Branch (1710:25): [True: 2.12k, False: 112]
  ------------------
 1711|  36.1k|    for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (1711:21): [True: 34.0k, False: 2.12k]
  ------------------
 1712|  34.0k|      const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
 1713|  34.0k|      input_row += stride;
 1714|  34.0k|      __m256i lo = _mm256_unpacklo_epi16(src, one);
 1715|  34.0k|      __m256i hi = _mm256_unpackhi_epi16(src, one);
 1716|  34.0k|      lo = _mm256_madd_epi16(lo, scale__r);
 1717|  34.0k|      hi = _mm256_madd_epi16(hi, scale__r);
 1718|  34.0k|      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
  ------------------
  |  |   41|  34.0k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1719|  34.0k|      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
  ------------------
  |  |   41|  34.0k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1720|  34.0k|      out[i] = _mm256_packs_epi32(lo, hi);
 1721|  34.0k|    }
 1722|  2.12k|  } else {
 1723|    164|    const __m256i rect_scale =
 1724|    164|        _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
  ------------------
  |  |   41|    164|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1725|  2.78k|    for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (1725:21): [True: 2.62k, False: 164]
  ------------------
 1726|  2.62k|      __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
 1727|  2.62k|      src = _mm256_mulhrs_epi16(src, rect_scale);
 1728|  2.62k|      input_row += stride;
 1729|  2.62k|      __m256i lo = _mm256_unpacklo_epi16(src, one);
 1730|  2.62k|      __m256i hi = _mm256_unpackhi_epi16(src, one);
 1731|  2.62k|      lo = _mm256_madd_epi16(lo, scale__r);
 1732|  2.62k|      hi = _mm256_madd_epi16(hi, scale__r);
 1733|  2.62k|      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
  ------------------
  |  |   41|  2.62k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1734|  2.62k|      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
  ------------------
  |  |   41|  2.62k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1735|  2.62k|      out[i] = _mm256_packs_epi32(lo, hi);
 1736|  2.62k|    }
 1737|    164|  }
 1738|  2.28k|}
av1_inv_txfm_avx2.c:iidentity_col_16xn_avx2:
 1742|  2.40k|                                           int txh_idx) {
 1743|  2.40k|  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
 1744|  2.40k|  const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
  ------------------
  |  |   41|  2.40k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1745|  2.40k|  const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1));
 1746|  2.40k|  const __m256i one = _mm256_set1_epi16(1);
 1747|  2.40k|  const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r);
 1748|  40.8k|  for (int h = 0; h < height; ++h) {
  ------------------
  |  Branch (1748:19): [True: 38.4k, False: 2.40k]
  ------------------
 1749|  38.4k|    __m256i lo = _mm256_unpacklo_epi16(buf[h], one);
 1750|  38.4k|    __m256i hi = _mm256_unpackhi_epi16(buf[h], one);
 1751|  38.4k|    lo = _mm256_madd_epi16(lo, scale_coeff);
 1752|  38.4k|    hi = _mm256_madd_epi16(hi, scale_coeff);
 1753|  38.4k|    lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
  ------------------
  |  |   41|  38.4k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1754|  38.4k|    hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
  ------------------
  |  |   41|  38.4k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1755|  38.4k|    lo = _mm256_add_epi32(lo, shift__r);
 1756|  38.4k|    hi = _mm256_add_epi32(hi, shift__r);
 1757|  38.4k|    lo = _mm256_srai_epi32(lo, -shift);
 1758|  38.4k|    hi = _mm256_srai_epi32(hi, -shift);
 1759|  38.4k|    const __m256i x = _mm256_packs_epi32(lo, hi);
 1760|  38.4k|    write_recon_w16_avx2(x, output);
 1761|  38.4k|    output += stride;
 1762|  38.4k|  }
 1763|  2.40k|}
av1_inv_txfm_avx2.c:lowbd_inv_txfm2d_add_h_identity_avx2:
 1794|    194|    TX_SIZE tx_size, int eob) {
 1795|    194|  int eobx, eoby;
 1796|    194|  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
 1797|    194|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 1798|    194|  const int txw_idx = get_txw_idx(tx_size);
 1799|    194|  const int txh_idx = get_txh_idx(tx_size);
 1800|    194|  const int txfm_size_col = tx_size_wide[tx_size];
 1801|    194|  const int txfm_size_row = tx_size_high[tx_size];
 1802|    194|  const int txfm_size_row_notzero = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|    194|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 194]
  |  |  ------------------
  ------------------
 1803|    194|  const int input_stride = txfm_size_row_notzero;
 1804|    194|  const int buf_size_w_div16 = (eobx + 16) >> 4;
 1805|    194|  const int buf_size_h_div16 = (eoby + 16) >> 4;
 1806|    194|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 1807|       |
 1808|    194|  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
 1809|    194|  const transform_1d_avx2 col_txfm =
 1810|    194|      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
 1811|       |
 1812|    194|  assert(col_txfm != NULL);
 1813|       |
 1814|    194|  int ud_flip, lr_flip;
 1815|    194|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 1816|    388|  for (int i = 0; i < buf_size_w_div16; i++) {
  ------------------
  |  Branch (1816:19): [True: 194, False: 194]
  ------------------
 1817|    194|    __m256i buf0[64];
 1818|    388|    for (int j = 0; j < buf_size_h_div16; j++) {
  ------------------
  |  Branch (1818:21): [True: 194, False: 194]
  ------------------
 1819|    194|      __m256i *buf0_cur = buf0 + j * 16;
 1820|    194|      const int32_t *input_cur = input + i * 16 * input_stride + j * 16;
 1821|    194|      iidentity_row_16xn_avx2(buf0_cur, input_cur, input_stride, shift[0], 16,
 1822|    194|                              txw_idx, rect_type);
 1823|    194|      transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
 1824|    194|    }
 1825|    194|    col_txfm(buf0, buf0);
 1826|    194|    __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
 1827|    194|    int k = ud_flip ? (txfm_size_row - 1) : 0;
  ------------------
  |  Branch (1827:13): [True: 0, False: 194]
  ------------------
 1828|    194|    const int step = ud_flip ? -1 : 1;
  ------------------
  |  Branch (1828:22): [True: 0, False: 194]
  ------------------
 1829|  3.29k|    for (int j = 0; j < txfm_size_row; ++j, k += step) {
  ------------------
  |  Branch (1829:21): [True: 3.10k, False: 194]
  ------------------
 1830|  3.10k|      __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
 1831|  3.10k|      write_recon_w16_avx2(res, output + (i << 4) + j * stride);
 1832|  3.10k|    }
 1833|    194|  }
 1834|    194|}
av1_inv_txfm_avx2.c:lowbd_inv_txfm2d_add_v_identity_avx2:
 1838|    308|    TX_SIZE tx_size, int eob) {
 1839|    308|  __m256i buf1[64];
 1840|    308|  int eobx, eoby;
 1841|    308|  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
 1842|    308|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 1843|    308|  const int txw_idx = get_txw_idx(tx_size);
 1844|    308|  const int txh_idx = get_txh_idx(tx_size);
 1845|    308|  const int txfm_size_col = tx_size_wide[tx_size];
 1846|    308|  const int txfm_size_row = tx_size_high[tx_size];
 1847|    308|  const int buf_size_w_div16 = txfm_size_col >> 4;
 1848|    308|  const int buf_size_h_div16 = (eoby + 16) >> 4;
 1849|    308|  const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
 1850|    308|  const int input_stride = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|    308|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 308]
  |  |  ------------------
  ------------------
 1851|    308|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 1852|       |
 1853|    308|  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
 1854|    308|  const transform_1d_avx2 row_txfm =
 1855|    308|      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
 1856|       |
 1857|    308|  assert(row_txfm != NULL);
 1858|       |
 1859|    308|  int ud_flip, lr_flip;
 1860|    308|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 1861|    616|  for (int i = 0; i < buf_size_h_div16; i++) {
  ------------------
  |  Branch (1861:19): [True: 308, False: 308]
  ------------------
 1862|    308|    __m256i buf0[64];
 1863|    308|    load_buffer_32bit_to_16bit_w16_avx2(input + i * 16, input_stride, buf0,
 1864|    308|                                        buf_size_nonzero_w);
 1865|    308|    if (rect_type == 1 || rect_type == -1) {
  ------------------
  |  Branch (1865:9): [True: 0, False: 308]
  |  Branch (1865:27): [True: 0, False: 308]
  ------------------
 1866|      0|      round_shift_avx2(buf0, buf0, buf_size_nonzero_w);  // rect special code
 1867|      0|    }
 1868|    308|    row_txfm(buf0, buf0);
 1869|    308|    round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
 1870|    308|    __m256i *_buf1 = buf1;
 1871|    308|    if (lr_flip) {
  ------------------
  |  Branch (1871:9): [True: 0, False: 308]
  ------------------
 1872|      0|      for (int j = 0; j < buf_size_w_div16; ++j) {
  ------------------
  |  Branch (1872:23): [True: 0, False: 0]
  ------------------
 1873|      0|        __m256i temp[16];
 1874|      0|        flip_buf_avx2(buf0 + 16 * j, temp, 16);
 1875|      0|        transpose_16bit_16x16_avx2(temp,
 1876|      0|                                   _buf1 + 16 * (buf_size_w_div16 - 1 - j));
 1877|      0|      }
 1878|    308|    } else {
 1879|    616|      for (int j = 0; j < buf_size_w_div16; ++j) {
  ------------------
  |  Branch (1879:23): [True: 308, False: 308]
  ------------------
 1880|    308|        transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j);
 1881|    308|      }
 1882|    308|    }
 1883|    616|    for (int j = 0; j < buf_size_w_div16; ++j) {
  ------------------
  |  Branch (1883:21): [True: 308, False: 308]
  ------------------
 1884|    308|      iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride,
 1885|    308|                              buf1 + j * 16, shift[1], 16, txh_idx);
 1886|    308|    }
 1887|    308|  }
 1888|    308|}

av1_inv_txfm_avx2.c:round_shift_avx2:
   39|  23.2k|                                    int size) {
   40|  23.2k|  const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8);
   41|   435k|  for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (41:19): [True: 411k, False: 23.2k]
  ------------------
   42|   411k|    output[i] = _mm256_mulhrs_epi16(input[i], scale);
   43|   411k|  }
   44|  23.2k|}
av1_inv_txfm_avx2.c:lowbd_write_buffer_16xn_avx2:
   56|   223k|                                                int height) {
   57|   223k|  int j = flipud ? (height - 1) : 0;
  ------------------
  |  Branch (57:11): [True: 690, False: 223k]
  ------------------
   58|   223k|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (58:20): [True: 690, False: 223k]
  ------------------
   59|  6.64M|  for (int i = 0; i < height; ++i, j += step) {
  ------------------
  |  Branch (59:19): [True: 6.42M, False: 223k]
  ------------------
   60|  6.42M|    write_recon_w16_avx2(in[j], output + i * stride);
   61|  6.42M|  }
   62|   223k|}
av1_inv_txfm_avx2.c:write_recon_w16_avx2:
   46|  6.46M|static inline void write_recon_w16_avx2(__m256i res, uint8_t *output) {
   47|  6.46M|  __m128i pred = _mm_loadu_si128((__m128i const *)(output));
   48|  6.46M|  __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res);
   49|  6.46M|  __m128i y = _mm256_castsi256_si128(
   50|       |      _mm256_permute4x64_epi64(_mm256_packus_epi16(u, u), 168));
   51|  6.46M|  _mm_storeu_si128((__m128i *)(output), y);
   52|  6.46M|}

av1_idct8_low1_ssse3:
   79|  92.6k|void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output) {
   80|  92.6k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  92.6k|#define INV_COS_BIT 12
  ------------------
   81|       |
   82|       |  // stage 1
   83|  92.6k|  __m128i x[2];
   84|  92.6k|  x[0] = input[0];
   85|       |
   86|       |  // stage 2
   87|       |  // stage 3
   88|  92.6k|  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   28|  92.6k|  do {                                          \
  |  |   29|  92.6k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  92.6k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  92.6k|    const __m128i _in = in;                     \
  |  |   32|  92.6k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  92.6k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  92.6k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 92.6k]
  |  |  ------------------
  ------------------
   89|       |
   90|       |  // stage 4
   91|       |  // stage 5
   92|  92.6k|  output[0] = x[0];
   93|  92.6k|  output[7] = x[0];
   94|  92.6k|  output[1] = x[1];
   95|  92.6k|  output[6] = x[1];
   96|  92.6k|  output[2] = x[1];
   97|  92.6k|  output[5] = x[1];
   98|  92.6k|  output[3] = x[0];
   99|  92.6k|  output[4] = x[0];
  100|  92.6k|}
av1_idct8_sse2:
  102|   407k|void av1_idct8_sse2(const __m128i *input, __m128i *output) {
  103|   407k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|   407k|#define INV_COS_BIT 12
  ------------------
  104|   407k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   407k|#define INV_COS_BIT 12
  ------------------
  105|   407k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   407k|#define INV_COS_BIT 12
  ------------------
  106|       |
  107|   407k|  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  ------------------
  |  |   20|   407k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  108|   407k|  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  ------------------
  |  |   20|   407k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  109|   407k|  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  ------------------
  |  |   20|   407k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  110|   407k|  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  ------------------
  |  |   20|   407k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  111|   407k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|   407k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  112|   407k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|   407k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  113|   407k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|   407k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  114|   407k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|   407k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  115|   407k|  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  ------------------
  |  |   20|   407k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  116|       |
  117|       |  // stage 1
  118|   407k|  __m128i x[8];
  119|   407k|  x[0] = input[0];
  120|   407k|  x[1] = input[4];
  121|   407k|  x[2] = input[2];
  122|   407k|  x[3] = input[6];
  123|   407k|  x[4] = input[1];
  124|   407k|  x[5] = input[5];
  125|   407k|  x[6] = input[3];
  126|   407k|  x[7] = input[7];
  127|       |
  128|       |  // stage 2
  129|   407k|  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  ------------------
  |  |   61|   407k|  do {                                            \
  |  |   62|   407k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   407k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   407k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   407k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   407k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   407k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   407k|                                                  \
  |  |   69|   407k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   407k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   407k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   407k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   407k|                                                  \
  |  |   74|   407k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   407k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   407k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   407k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   407k|                                                  \
  |  |   79|   407k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   407k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   407k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 407k]
  |  |  ------------------
  ------------------
  130|   407k|  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   61|   407k|  do {                                            \
  |  |   62|   407k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   407k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   407k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   407k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   407k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   407k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   407k|                                                  \
  |  |   69|   407k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   407k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   407k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   407k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   407k|                                                  \
  |  |   74|   407k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   407k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   407k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   407k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   407k|                                                  \
  |  |   79|   407k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   407k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   407k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 407k]
  |  |  ------------------
  ------------------
  131|       |
  132|       |  // stage 3
  133|   407k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   61|   407k|  do {                                            \
  |  |   62|   407k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   407k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   407k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   407k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   407k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   407k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   407k|                                                  \
  |  |   69|   407k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   407k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   407k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   407k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   407k|                                                  \
  |  |   74|   407k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   407k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   407k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   407k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   407k|                                                  \
  |  |   79|   407k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   407k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   407k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 407k]
  |  |  ------------------
  ------------------
  134|   407k|  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|   407k|  do {                                            \
  |  |   62|   407k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   407k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   407k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   407k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   407k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   407k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   407k|                                                  \
  |  |   69|   407k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   407k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   407k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   407k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   407k|                                                  \
  |  |   74|   407k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   407k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   407k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   407k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   407k|                                                  \
  |  |   79|   407k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   407k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   407k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 407k]
  |  |  ------------------
  ------------------
  135|   407k|  btf_16_adds_subs_sse2(x[4], x[5]);
  ------------------
  |  |   37|   407k|  do {                                  \
  |  |   38|   407k|    const __m128i _in0 = in0;           \
  |  |   39|   407k|    const __m128i _in1 = in1;           \
  |  |   40|   407k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   407k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   407k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 407k]
  |  |  ------------------
  ------------------
  136|   407k|  btf_16_subs_adds_sse2(x[7], x[6]);
  ------------------
  |  |   45|   407k|  do {                                  \
  |  |   46|   407k|    const __m128i _in0 = in0;           \
  |  |   47|   407k|    const __m128i _in1 = in1;           \
  |  |   48|   407k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   407k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   407k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 407k]
  |  |  ------------------
  ------------------
  137|       |
  138|       |  // stage 4
  139|   407k|  btf_16_adds_subs_sse2(x[0], x[3]);
  ------------------
  |  |   37|   407k|  do {                                  \
  |  |   38|   407k|    const __m128i _in0 = in0;           \
  |  |   39|   407k|    const __m128i _in1 = in1;           \
  |  |   40|   407k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   407k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   407k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 407k]
  |  |  ------------------
  ------------------
  140|   407k|  btf_16_adds_subs_sse2(x[1], x[2]);
  ------------------
  |  |   37|   407k|  do {                                  \
  |  |   38|   407k|    const __m128i _in0 = in0;           \
  |  |   39|   407k|    const __m128i _in1 = in1;           \
  |  |   40|   407k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   407k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   407k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 407k]
  |  |  ------------------
  ------------------
  141|   407k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   61|   407k|  do {                                            \
  |  |   62|   407k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   407k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   407k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   407k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   407k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   407k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   407k|                                                  \
  |  |   69|   407k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   407k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   407k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   407k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   407k|                                                  \
  |  |   74|   407k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   407k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   407k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   407k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   407k|                                                  \
  |  |   79|   407k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   407k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   407k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 407k]
  |  |  ------------------
  ------------------
  142|       |
  143|       |  // stage 5
  144|   407k|  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
  ------------------
  |  |   53|   407k|  do {                                                  \
  |  |   54|   407k|    const __m128i _in0 = in0;                           \
  |  |   55|   407k|    const __m128i _in1 = in1;                           \
  |  |   56|   407k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   407k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   407k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 407k]
  |  |  ------------------
  ------------------
  145|   407k|  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
  ------------------
  |  |   53|   407k|  do {                                                  \
  |  |   54|   407k|    const __m128i _in0 = in0;                           \
  |  |   55|   407k|    const __m128i _in1 = in1;                           \
  |  |   56|   407k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   407k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   407k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 407k]
  |  |  ------------------
  ------------------
  146|   407k|  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
  ------------------
  |  |   53|   407k|  do {                                                  \
  |  |   54|   407k|    const __m128i _in0 = in0;                           \
  |  |   55|   407k|    const __m128i _in1 = in1;                           \
  |  |   56|   407k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   407k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   407k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 407k]
  |  |  ------------------
  ------------------
  147|   407k|  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
  ------------------
  |  |   53|   407k|  do {                                                  \
  |  |   54|   407k|    const __m128i _in0 = in0;                           \
  |  |   55|   407k|    const __m128i _in1 = in1;                           \
  |  |   56|   407k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   407k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   407k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 407k]
  |  |  ------------------
  ------------------
  148|   407k|}
av1_iadst8_low1_ssse3:
 1701|  22.4k|void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output) {
 1702|  22.4k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  22.4k|#define INV_COS_BIT 12
  ------------------
 1703|  22.4k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  22.4k|#define INV_COS_BIT 12
  ------------------
 1704|  22.4k|  const __m128i __zero = _mm_setzero_si128();
 1705|  22.4k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  22.4k|#define INV_COS_BIT 12
  ------------------
 1706|       |
 1707|  22.4k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|  22.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1708|  22.4k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|  22.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1709|  22.4k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  22.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1710|  22.4k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|  22.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1711|       |
 1712|       |  // stage 1
 1713|  22.4k|  __m128i x[8];
 1714|  22.4k|  x[1] = input[0];
 1715|       |
 1716|       |  // stage 2
 1717|  22.4k|  btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);
  ------------------
  |  |   28|  22.4k|  do {                                          \
  |  |   29|  22.4k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  22.4k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  22.4k|    const __m128i _in = in;                     \
  |  |   32|  22.4k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  22.4k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  22.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.4k]
  |  |  ------------------
  ------------------
 1718|       |
 1719|       |  // stage 3
 1720|  22.4k|  x[4] = x[0];
 1721|  22.4k|  x[5] = x[1];
 1722|       |
 1723|       |  // stage 4
 1724|  22.4k|  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   61|  22.4k|  do {                                            \
  |  |   62|  22.4k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  22.4k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  22.4k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  22.4k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  22.4k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  22.4k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  22.4k|                                                  \
  |  |   69|  22.4k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  22.4k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  22.4k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  22.4k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  22.4k|                                                  \
  |  |   74|  22.4k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  22.4k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  22.4k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  22.4k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  22.4k|                                                  \
  |  |   79|  22.4k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  22.4k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  22.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 22.4k]
  |  |  ------------------
  ------------------
 1725|       |
 1726|       |  // stage 5
 1727|  22.4k|  x[2] = x[0];
 1728|  22.4k|  x[3] = x[1];
 1729|  22.4k|  x[6] = x[4];
 1730|  22.4k|  x[7] = x[5];
 1731|       |
 1732|       |  // stage 6
 1733|  22.4k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|  22.4k|  do {                                            \
  |  |   62|  22.4k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  22.4k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  22.4k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  22.4k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  22.4k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  22.4k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  22.4k|                                                  \
  |  |   69|  22.4k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  22.4k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  22.4k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  22.4k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  22.4k|                                                  \
  |  |   74|  22.4k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  22.4k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  22.4k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  22.4k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  22.4k|                                                  \
  |  |   79|  22.4k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  22.4k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  22.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 22.4k]
  |  |  ------------------
  ------------------
 1734|  22.4k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   61|  22.4k|  do {                                            \
  |  |   62|  22.4k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  22.4k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  22.4k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  22.4k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  22.4k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  22.4k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  22.4k|                                                  \
  |  |   69|  22.4k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  22.4k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  22.4k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  22.4k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  22.4k|                                                  \
  |  |   74|  22.4k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  22.4k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  22.4k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  22.4k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  22.4k|                                                  \
  |  |   79|  22.4k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  22.4k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  22.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 22.4k]
  |  |  ------------------
  ------------------
 1735|       |
 1736|       |  // stage 7
 1737|  22.4k|  output[0] = x[0];
 1738|  22.4k|  output[1] = _mm_subs_epi16(__zero, x[4]);
 1739|  22.4k|  output[2] = x[6];
 1740|  22.4k|  output[3] = _mm_subs_epi16(__zero, x[2]);
 1741|  22.4k|  output[4] = x[3];
 1742|  22.4k|  output[5] = _mm_subs_epi16(__zero, x[7]);
 1743|  22.4k|  output[6] = x[5];
 1744|  22.4k|  output[7] = _mm_subs_epi16(__zero, x[1]);
 1745|  22.4k|}
av1_iadst8_sse2:
 1747|   138k|void av1_iadst8_sse2(const __m128i *input, __m128i *output) {
 1748|   138k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|   138k|#define INV_COS_BIT 12
  ------------------
 1749|   138k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   138k|#define INV_COS_BIT 12
  ------------------
 1750|   138k|  const __m128i __zero = _mm_setzero_si128();
 1751|   138k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   138k|#define INV_COS_BIT 12
  ------------------
 1752|       |
 1753|   138k|  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  ------------------
  |  |   20|   138k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1754|   138k|  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  ------------------
  |  |   20|   138k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1755|   138k|  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  ------------------
  |  |   20|   138k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1756|   138k|  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  ------------------
  |  |   20|   138k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1757|   138k|  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  ------------------
  |  |   20|   138k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1758|   138k|  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  ------------------
  |  |   20|   138k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1759|   138k|  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  ------------------
  |  |   20|   138k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1760|   138k|  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  ------------------
  |  |   20|   138k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1761|   138k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|   138k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1762|   138k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|   138k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1763|   138k|  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  ------------------
  |  |   20|   138k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1764|   138k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|   138k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1765|   138k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|   138k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1766|       |
 1767|       |  // stage 1
 1768|   138k|  __m128i x[8];
 1769|   138k|  x[0] = input[7];
 1770|   138k|  x[1] = input[0];
 1771|   138k|  x[2] = input[5];
 1772|   138k|  x[3] = input[2];
 1773|   138k|  x[4] = input[3];
 1774|   138k|  x[5] = input[4];
 1775|   138k|  x[6] = input[1];
 1776|   138k|  x[7] = input[6];
 1777|       |
 1778|       |  // stage 2
 1779|   138k|  btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   61|   138k|  do {                                            \
  |  |   62|   138k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   138k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   138k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   138k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   138k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   138k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   138k|                                                  \
  |  |   69|   138k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   138k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   138k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   138k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   138k|                                                  \
  |  |   74|   138k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   138k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   138k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   138k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   138k|                                                  \
  |  |   79|   138k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   138k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   138k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 138k]
  |  |  ------------------
  ------------------
 1780|   138k|  btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|   138k|  do {                                            \
  |  |   62|   138k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   138k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   138k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   138k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   138k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   138k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   138k|                                                  \
  |  |   69|   138k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   138k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   138k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   138k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   138k|                                                  \
  |  |   74|   138k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   138k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   138k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   138k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   138k|                                                  \
  |  |   79|   138k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   138k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   138k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 138k]
  |  |  ------------------
  ------------------
 1781|   138k|  btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   61|   138k|  do {                                            \
  |  |   62|   138k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   138k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   138k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   138k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   138k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   138k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   138k|                                                  \
  |  |   69|   138k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   138k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   138k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   138k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   138k|                                                  \
  |  |   74|   138k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   138k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   138k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   138k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   138k|                                                  \
  |  |   79|   138k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   138k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   138k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 138k]
  |  |  ------------------
  ------------------
 1782|   138k|  btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   61|   138k|  do {                                            \
  |  |   62|   138k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   138k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   138k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   138k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   138k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   138k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   138k|                                                  \
  |  |   69|   138k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   138k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   138k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   138k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   138k|                                                  \
  |  |   74|   138k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   138k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   138k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   138k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   138k|                                                  \
  |  |   79|   138k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   138k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   138k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 138k]
  |  |  ------------------
  ------------------
 1783|       |
 1784|       |  // stage 3
 1785|   138k|  btf_16_adds_subs_sse2(x[0], x[4]);
  ------------------
  |  |   37|   138k|  do {                                  \
  |  |   38|   138k|    const __m128i _in0 = in0;           \
  |  |   39|   138k|    const __m128i _in1 = in1;           \
  |  |   40|   138k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   138k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   138k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 138k]
  |  |  ------------------
  ------------------
 1786|   138k|  btf_16_adds_subs_sse2(x[1], x[5]);
  ------------------
  |  |   37|   138k|  do {                                  \
  |  |   38|   138k|    const __m128i _in0 = in0;           \
  |  |   39|   138k|    const __m128i _in1 = in1;           \
  |  |   40|   138k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   138k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   138k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 138k]
  |  |  ------------------
  ------------------
 1787|   138k|  btf_16_adds_subs_sse2(x[2], x[6]);
  ------------------
  |  |   37|   138k|  do {                                  \
  |  |   38|   138k|    const __m128i _in0 = in0;           \
  |  |   39|   138k|    const __m128i _in1 = in1;           \
  |  |   40|   138k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   138k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   138k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 138k]
  |  |  ------------------
  ------------------
 1788|   138k|  btf_16_adds_subs_sse2(x[3], x[7]);
  ------------------
  |  |   37|   138k|  do {                                  \
  |  |   38|   138k|    const __m128i _in0 = in0;           \
  |  |   39|   138k|    const __m128i _in1 = in1;           \
  |  |   40|   138k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   138k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   138k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 138k]
  |  |  ------------------
  ------------------
 1789|       |
 1790|       |  // stage 4
 1791|   138k|  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   61|   138k|  do {                                            \
  |  |   62|   138k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   138k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   138k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   138k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   138k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   138k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   138k|                                                  \
  |  |   69|   138k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   138k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   138k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   138k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   138k|                                                  \
  |  |   74|   138k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   138k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   138k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   138k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   138k|                                                  \
  |  |   79|   138k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   138k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   138k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 138k]
  |  |  ------------------
  ------------------
 1792|   138k|  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   61|   138k|  do {                                            \
  |  |   62|   138k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   138k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   138k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   138k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   138k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   138k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   138k|                                                  \
  |  |   69|   138k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   138k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   138k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   138k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   138k|                                                  \
  |  |   74|   138k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   138k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   138k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   138k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   138k|                                                  \
  |  |   79|   138k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   138k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   138k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 138k]
  |  |  ------------------
  ------------------
 1793|       |
 1794|       |  // stage 5
 1795|   138k|  btf_16_adds_subs_sse2(x[0], x[2]);
  ------------------
  |  |   37|   138k|  do {                                  \
  |  |   38|   138k|    const __m128i _in0 = in0;           \
  |  |   39|   138k|    const __m128i _in1 = in1;           \
  |  |   40|   138k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   138k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   138k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 138k]
  |  |  ------------------
  ------------------
 1796|   138k|  btf_16_adds_subs_sse2(x[1], x[3]);
  ------------------
  |  |   37|   138k|  do {                                  \
  |  |   38|   138k|    const __m128i _in0 = in0;           \
  |  |   39|   138k|    const __m128i _in1 = in1;           \
  |  |   40|   138k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   138k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   138k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 138k]
  |  |  ------------------
  ------------------
 1797|   138k|  btf_16_adds_subs_sse2(x[4], x[6]);
  ------------------
  |  |   37|   138k|  do {                                  \
  |  |   38|   138k|    const __m128i _in0 = in0;           \
  |  |   39|   138k|    const __m128i _in1 = in1;           \
  |  |   40|   138k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   138k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   138k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 138k]
  |  |  ------------------
  ------------------
 1798|   138k|  btf_16_adds_subs_sse2(x[5], x[7]);
  ------------------
  |  |   37|   138k|  do {                                  \
  |  |   38|   138k|    const __m128i _in0 = in0;           \
  |  |   39|   138k|    const __m128i _in1 = in1;           \
  |  |   40|   138k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   138k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   138k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 138k]
  |  |  ------------------
  ------------------
 1799|       |
 1800|       |  // stage 6
 1801|   138k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|   138k|  do {                                            \
  |  |   62|   138k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   138k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   138k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   138k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   138k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   138k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   138k|                                                  \
  |  |   69|   138k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   138k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   138k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   138k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   138k|                                                  \
  |  |   74|   138k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   138k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   138k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   138k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   138k|                                                  \
  |  |   79|   138k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   138k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   138k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 138k]
  |  |  ------------------
  ------------------
 1802|   138k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   61|   138k|  do {                                            \
  |  |   62|   138k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   138k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   138k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   138k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   138k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   138k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   138k|                                                  \
  |  |   69|   138k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   138k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   138k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   138k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   138k|                                                  \
  |  |   74|   138k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   138k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   138k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   138k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   138k|                                                  \
  |  |   79|   138k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   138k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   138k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 138k]
  |  |  ------------------
  ------------------
 1803|       |
 1804|       |  // stage 7
 1805|   138k|  output[0] = x[0];
 1806|   138k|  output[1] = _mm_subs_epi16(__zero, x[4]);
 1807|   138k|  output[2] = x[6];
 1808|   138k|  output[3] = _mm_subs_epi16(__zero, x[2]);
 1809|   138k|  output[4] = x[3];
 1810|   138k|  output[5] = _mm_subs_epi16(__zero, x[7]);
 1811|   138k|  output[6] = x[5];
 1812|   138k|  output[7] = _mm_subs_epi16(__zero, x[1]);
 1813|   138k|}
av1_lowbd_inv_txfm2d_add_idtx_ssse3:
 2386|  14.2k|                                         int stride, TX_SIZE tx_size) {
 2387|  14.2k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2388|  14.2k|  const int txw_idx = get_txw_idx(tx_size);
 2389|  14.2k|  const int txh_idx = get_txh_idx(tx_size);
 2390|  14.2k|  const int txfm_size_col = tx_size_wide[tx_size];
 2391|  14.2k|  const int txfm_size_row = tx_size_high[tx_size];
 2392|  14.2k|  const int col_max = AOMMIN(32, txfm_size_col);
  ------------------
  |  |   34|  14.2k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 14.2k]
  |  |  ------------------
  ------------------
 2393|  14.2k|  const int row_max = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  14.2k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 14.2k]
  |  |  ------------------
  ------------------
 2394|  14.2k|  const int input_stride = row_max;
 2395|  14.2k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 2396|       |
 2397|  31.0k|  for (int i = 0; i < (col_max >> 3); ++i) {
  ------------------
  |  Branch (2397:19): [True: 16.7k, False: 14.2k]
  ------------------
 2398|  35.3k|    for (int j = 0; j < (row_max >> 3); j++) {
  ------------------
  |  Branch (2398:21): [True: 18.5k, False: 16.7k]
  ------------------
 2399|  18.5k|      __m128i buf[8];
 2400|  18.5k|      iidentity_row_8xn_ssse3(buf, input + j * 8 + i * 8 * input_stride,
 2401|  18.5k|                              row_max, shift[0], 8, txw_idx, rect_type);
 2402|  18.5k|      transpose_16bit_8x8(buf, buf);
 2403|  18.5k|      iidentity_col_8xn_ssse3(output + i * 8 + j * 8 * stride, stride, buf,
 2404|  18.5k|                              shift[1], 8, txh_idx);
 2405|  18.5k|    }
 2406|  16.7k|  }
 2407|  14.2k|}
av1_lowbd_inv_txfm2d_add_h_identity_ssse3:
 2544|  5.85k|                                               int eob) {
 2545|  5.85k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2546|  5.85k|  int eobx, eoby;
 2547|  5.85k|  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
 2548|  5.85k|  const int txw_idx = get_txw_idx(tx_size);
 2549|  5.85k|  const int txh_idx = get_txh_idx(tx_size);
 2550|  5.85k|  const int txfm_size_col = tx_size_wide[tx_size];
 2551|  5.85k|  const int txfm_size_row = tx_size_high[tx_size];
 2552|  5.85k|  const int buf_size_w_div8 = (eobx + 8) >> 3;
 2553|  5.85k|  const int buf_size_h_div8 = (eoby + 8) >> 3;
 2554|  5.85k|  const int input_stride = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  5.85k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 5.85k]
  |  |  ------------------
  ------------------
 2555|  5.85k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 2556|       |
 2557|  5.85k|  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
 2558|  5.85k|  assert(fun_idx < 5);
 2559|  5.85k|  const transform_1d_ssse3 col_txfm =
 2560|  5.85k|      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
 2561|       |
 2562|  5.85k|  assert(col_txfm != NULL);
 2563|       |
 2564|  5.85k|  int ud_flip, lr_flip;
 2565|  5.85k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2566|  12.8k|  for (int i = 0; i < buf_size_w_div8; i++) {
  ------------------
  |  Branch (2566:19): [True: 7.01k, False: 5.85k]
  ------------------
 2567|  7.01k|    __m128i buf0[64];
 2568|  14.3k|    for (int j = 0; j < buf_size_h_div8; j++) {
  ------------------
  |  Branch (2568:21): [True: 7.31k, False: 7.01k]
  ------------------
 2569|  7.31k|      __m128i *buf0_cur = buf0 + j * 8;
 2570|  7.31k|      const int32_t *input_cur = input + i * 8 * input_stride + j * 8;
 2571|  7.31k|      iidentity_row_8xn_ssse3(buf0_cur, input_cur, input_stride, shift[0], 8,
 2572|  7.31k|                              txw_idx, rect_type);
 2573|  7.31k|      transpose_16bit_8x8(buf0_cur, buf0_cur);
 2574|  7.31k|    }
 2575|  7.01k|    col_txfm(buf0, buf0);
 2576|  7.01k|    __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
 2577|  7.01k|    int k = ud_flip ? (txfm_size_row - 1) : 0;
  ------------------
  |  Branch (2577:13): [True: 360, False: 6.65k]
  ------------------
 2578|  7.01k|    const int step = ud_flip ? -1 : 1;
  ------------------
  |  Branch (2578:22): [True: 360, False: 6.65k]
  ------------------
 2579|  7.01k|    uint8_t *out = output + 8 * i;
 2580|  70.3k|    for (int j = 0; j < txfm_size_row; ++j, k += step) {
  ------------------
  |  Branch (2580:21): [True: 63.3k, False: 7.01k]
  ------------------
 2581|  63.3k|      const __m128i v = _mm_loadl_epi64((__m128i const *)(out));
 2582|  63.3k|      __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
 2583|  63.3k|      const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
 2584|  63.3k|      _mm_storel_epi64((__m128i *)(out), u);
 2585|  63.3k|      out += stride;
 2586|  63.3k|    }
 2587|  7.01k|  }
 2588|  5.85k|}
av1_lowbd_inv_txfm2d_add_v_identity_ssse3:
 2593|  11.8k|                                               int eob) {
 2594|  11.8k|  __m128i buf1[64];
 2595|  11.8k|  int eobx, eoby;
 2596|  11.8k|  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
 2597|  11.8k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2598|  11.8k|  const int txw_idx = get_txw_idx(tx_size);
 2599|  11.8k|  const int txh_idx = get_txh_idx(tx_size);
 2600|  11.8k|  const int txfm_size_col = tx_size_wide[tx_size];
 2601|  11.8k|  const int txfm_size_row = tx_size_high[tx_size];
 2602|  11.8k|  const int buf_size_w_div8 = txfm_size_col >> 3;
 2603|  11.8k|  const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
 2604|  11.8k|  const int buf_size_h_div8 = (eoby + 8) >> 3;
 2605|  11.8k|  const int input_stride = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  11.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 11.8k]
  |  |  ------------------
  ------------------
 2606|  11.8k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 2607|       |
 2608|  11.8k|  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
 2609|  11.8k|  const transform_1d_ssse3 row_txfm =
 2610|  11.8k|      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
 2611|       |
 2612|  11.8k|  assert(row_txfm != NULL);
 2613|  11.8k|  int ud_flip, lr_flip;
 2614|  11.8k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2615|  25.3k|  for (int i = 0; i < buf_size_h_div8; i++) {
  ------------------
  |  Branch (2615:19): [True: 13.4k, False: 11.8k]
  ------------------
 2616|  13.4k|    __m128i buf0[64];
 2617|  13.4k|    load_buffer_32bit_to_16bit(input + i * 8, input_stride, buf0,
 2618|  13.4k|                               buf_size_nonzero_w);
 2619|  13.4k|    if (rect_type == 1 || rect_type == -1) {
  ------------------
  |  Branch (2619:9): [True: 2.57k, False: 10.8k]
  |  Branch (2619:27): [True: 3.37k, False: 7.52k]
  ------------------
 2620|  5.95k|      round_shift_ssse3(buf0, buf0, buf_size_nonzero_w);  // rect special code
 2621|  5.95k|    }
 2622|  13.4k|    row_txfm(buf0, buf0);
 2623|  13.4k|    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
 2624|  13.4k|    __m128i *_buf1 = buf1;
 2625|  13.4k|    if (lr_flip) {
  ------------------
  |  Branch (2625:9): [True: 758, False: 12.7k]
  ------------------
 2626|  1.67k|      for (int j = 0; j < buf_size_w_div8; ++j) {
  ------------------
  |  Branch (2626:23): [True: 916, False: 758]
  ------------------
 2627|    916|        __m128i temp[8];
 2628|    916|        flip_buf_sse2(buf0 + 8 * j, temp, 8);
 2629|    916|        transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
 2630|    916|      }
 2631|  12.7k|    } else {
 2632|  27.8k|      for (int j = 0; j < buf_size_w_div8; ++j) {
  ------------------
  |  Branch (2632:23): [True: 15.1k, False: 12.7k]
  ------------------
 2633|  15.1k|        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
 2634|  15.1k|      }
 2635|  12.7k|    }
 2636|       |
 2637|  29.5k|    for (int j = 0; j < buf_size_w_div8; ++j) {
  ------------------
  |  Branch (2637:21): [True: 16.0k, False: 13.4k]
  ------------------
 2638|  16.0k|      iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride,
 2639|  16.0k|                              buf1 + j * 8, shift[1], 8, txh_idx);
 2640|  16.0k|    }
 2641|  13.4k|  }
 2642|  11.8k|}
av1_lowbd_inv_txfm2d_add_ssse3:
 2865|   356k|                                    TX_SIZE tx_size, int eob) {
 2866|   356k|  switch (tx_size) {
 2867|   109k|    case TX_4X4:
  ------------------
  |  Branch (2867:5): [True: 109k, False: 247k]
  ------------------
 2868|   109k|      lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size,
 2869|   109k|                                     eob);
 2870|   109k|      break;
 2871|  32.8k|    case TX_4X8:
  ------------------
  |  Branch (2871:5): [True: 32.8k, False: 323k]
  ------------------
 2872|  32.8k|      lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size,
 2873|  32.8k|                                     eob);
 2874|  32.8k|      break;
 2875|  49.8k|    case TX_8X4:
  ------------------
  |  Branch (2875:5): [True: 49.8k, False: 306k]
  ------------------
 2876|  49.8k|      lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size,
 2877|  49.8k|                                     eob);
 2878|  49.8k|      break;
 2879|  18.7k|    case TX_4X16:
  ------------------
  |  Branch (2879:5): [True: 18.7k, False: 338k]
  ------------------
 2880|  18.7k|      lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size,
 2881|  18.7k|                                      eob);
 2882|  18.7k|      break;
 2883|  38.3k|    case TX_16X4:
  ------------------
  |  Branch (2883:5): [True: 38.3k, False: 318k]
  ------------------
 2884|  38.3k|      lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size,
 2885|  38.3k|                                      eob);
 2886|  38.3k|      break;
 2887|   107k|    default:
  ------------------
  |  Branch (2887:5): [True: 107k, False: 249k]
  ------------------
 2888|   107k|      lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type,
 2889|   107k|                                          tx_size, eob);
 2890|   107k|      break;
 2891|   356k|  }
 2892|   356k|}
av1_inv_txfm_ssse3.c:iidentity_row_8xn_ssse3:
 2318|  25.8k|                                           int txw_idx, int rect_type) {
 2319|  25.8k|  const int32_t *input_row = input;
 2320|  25.8k|  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
 2321|  25.8k|  const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
  ------------------
  |  |   41|  25.8k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2322|  25.8k|                                          (1 << (NewSqrt2Bits - shift - 1)));
  ------------------
  |  |   41|  25.8k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2323|  25.8k|  const __m128i one = _mm_set1_epi16(1);
 2324|  25.8k|  const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
 2325|  25.8k|  if (rect_type != 1 && rect_type != -1) {
  ------------------
  |  Branch (2325:7): [True: 18.7k, False: 7.04k]
  |  Branch (2325:25): [True: 14.5k, False: 4.25k]
  ------------------
 2326|   130k|    for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (2326:21): [True: 116k, False: 14.5k]
  ------------------
 2327|   116k|      const __m128i src = load_32bit_to_16bit(input_row);
 2328|   116k|      input_row += stride;
 2329|   116k|      __m128i lo = _mm_unpacklo_epi16(src, one);
 2330|   116k|      __m128i hi = _mm_unpackhi_epi16(src, one);
 2331|   116k|      lo = _mm_madd_epi16(lo, scale_rounding);
 2332|   116k|      hi = _mm_madd_epi16(hi, scale_rounding);
 2333|   116k|      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
  ------------------
  |  |   41|   116k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2334|   116k|      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
  ------------------
  |  |   41|   116k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2335|   116k|      out[i] = _mm_packs_epi32(lo, hi);
 2336|   116k|    }
 2337|  14.5k|  } else {
 2338|  11.2k|    const __m128i rect_scale =
 2339|  11.2k|        _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
  ------------------
  |  |   41|  11.2k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2340|   101k|    for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (2340:21): [True: 90.3k, False: 11.2k]
  ------------------
 2341|  90.3k|      __m128i src = load_32bit_to_16bit(input_row);
 2342|  90.3k|      src = _mm_mulhrs_epi16(src, rect_scale);
 2343|  90.3k|      input_row += stride;
 2344|  90.3k|      __m128i lo = _mm_unpacklo_epi16(src, one);
 2345|  90.3k|      __m128i hi = _mm_unpackhi_epi16(src, one);
 2346|  90.3k|      lo = _mm_madd_epi16(lo, scale_rounding);
 2347|  90.3k|      hi = _mm_madd_epi16(hi, scale_rounding);
 2348|  90.3k|      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
  ------------------
  |  |   41|  90.3k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2349|  90.3k|      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
  ------------------
  |  |   41|  90.3k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2350|  90.3k|      out[i] = _mm_packs_epi32(lo, hi);
 2351|  90.3k|    }
 2352|  11.2k|  }
 2353|  25.8k|}
av1_inv_txfm_ssse3.c:iidentity_col_8xn_ssse3:
 2357|  34.5k|                                           int txh_idx) {
 2358|  34.5k|  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
 2359|  34.5k|  const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
  ------------------
  |  |   41|  34.5k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2360|  34.5k|  const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
 2361|  34.5k|  const __m128i one = _mm_set1_epi16(1);
 2362|  34.5k|  const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
 2363|  34.5k|  const __m128i zero = _mm_setzero_si128();
 2364|   311k|  for (int h = 0; h < height; ++h) {
  ------------------
  |  Branch (2364:19): [True: 276k, False: 34.5k]
  ------------------
 2365|   276k|    __m128i lo = _mm_unpacklo_epi16(buf[h], one);
 2366|   276k|    __m128i hi = _mm_unpackhi_epi16(buf[h], one);
 2367|   276k|    lo = _mm_madd_epi16(lo, scale_coeff);
 2368|   276k|    hi = _mm_madd_epi16(hi, scale_coeff);
 2369|   276k|    lo = _mm_srai_epi32(lo, NewSqrt2Bits);
  ------------------
  |  |   41|   276k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2370|   276k|    hi = _mm_srai_epi32(hi, NewSqrt2Bits);
  ------------------
  |  |   41|   276k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2371|   276k|    lo = _mm_add_epi32(lo, shift_rounding);
 2372|   276k|    hi = _mm_add_epi32(hi, shift_rounding);
 2373|   276k|    lo = _mm_srai_epi32(lo, -shift);
 2374|   276k|    hi = _mm_srai_epi32(hi, -shift);
 2375|   276k|    __m128i x = _mm_packs_epi32(lo, hi);
 2376|       |
 2377|   276k|    const __m128i pred = _mm_loadl_epi64((__m128i const *)(output));
 2378|   276k|    x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
 2379|   276k|    const __m128i u = _mm_packus_epi16(x, x);
 2380|   276k|    _mm_storel_epi64((__m128i *)(output), u);
 2381|   276k|    output += stride;
 2382|   276k|  }
 2383|  34.5k|}
av1_inv_txfm_ssse3.c:idct4_sse2:
   27|  97.6k|static void idct4_sse2(const __m128i *input, __m128i *output) {
   28|  97.6k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  97.6k|#define INV_COS_BIT 12
  ------------------
   29|  97.6k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  97.6k|#define INV_COS_BIT 12
  ------------------
   30|  97.6k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  97.6k|#define INV_COS_BIT 12
  ------------------
   31|       |
   32|  97.6k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  97.6k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
   33|  97.6k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|  97.6k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
   34|  97.6k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|  97.6k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
   35|  97.6k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|  97.6k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
   36|       |
   37|       |  // stage 1
   38|  97.6k|  __m128i x[4];
   39|  97.6k|  x[0] = input[0];
   40|  97.6k|  x[1] = input[2];
   41|  97.6k|  x[2] = input[1];
   42|  97.6k|  x[3] = input[3];
   43|       |
   44|       |  // stage 2
   45|  97.6k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   61|  97.6k|  do {                                            \
  |  |   62|  97.6k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  97.6k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  97.6k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  97.6k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  97.6k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  97.6k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  97.6k|                                                  \
  |  |   69|  97.6k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  97.6k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  97.6k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  97.6k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  97.6k|                                                  \
  |  |   74|  97.6k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  97.6k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  97.6k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  97.6k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  97.6k|                                                  \
  |  |   79|  97.6k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  97.6k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  97.6k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 97.6k]
  |  |  ------------------
  ------------------
   46|  97.6k|  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|  97.6k|  do {                                            \
  |  |   62|  97.6k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  97.6k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  97.6k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  97.6k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  97.6k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  97.6k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  97.6k|                                                  \
  |  |   69|  97.6k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  97.6k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  97.6k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  97.6k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  97.6k|                                                  \
  |  |   74|  97.6k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  97.6k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  97.6k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  97.6k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  97.6k|                                                  \
  |  |   79|  97.6k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  97.6k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  97.6k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 97.6k]
  |  |  ------------------
  ------------------
   47|       |
   48|       |  // stage 3
   49|  97.6k|  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
  ------------------
  |  |   53|  97.6k|  do {                                                  \
  |  |   54|  97.6k|    const __m128i _in0 = in0;                           \
  |  |   55|  97.6k|    const __m128i _in1 = in1;                           \
  |  |   56|  97.6k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  97.6k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  97.6k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 97.6k]
  |  |  ------------------
  ------------------
   50|  97.6k|  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
  ------------------
  |  |   53|  97.6k|  do {                                                  \
  |  |   54|  97.6k|    const __m128i _in0 = in0;                           \
  |  |   55|  97.6k|    const __m128i _in1 = in1;                           \
  |  |   56|  97.6k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  97.6k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  97.6k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 97.6k]
  |  |  ------------------
  ------------------
   51|  97.6k|}
av1_inv_txfm_ssse3.c:iadst4_sse2:
 1597|  76.7k|static void iadst4_sse2(const __m128i *input, __m128i *output) {
 1598|  76.7k|  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  76.7k|#define INV_COS_BIT 12
  ------------------
 1599|  76.7k|  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
  ------------------
  |  |   20|  76.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1600|  76.7k|  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
  ------------------
  |  |   20|  76.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1601|  76.7k|  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
  ------------------
  |  |   20|  76.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1602|  76.7k|  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
  ------------------
  |  |   20|  76.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1603|  76.7k|  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
  ------------------
  |  |   20|  76.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1604|  76.7k|  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
  ------------------
  |  |   20|  76.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1605|  76.7k|  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
  ------------------
  |  |   20|  76.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1606|  76.7k|  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
  ------------------
  |  |   20|  76.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1607|  76.7k|  __m128i x0[4];
 1608|  76.7k|  x0[0] = input[0];
 1609|  76.7k|  x0[1] = input[1];
 1610|  76.7k|  x0[2] = input[2];
 1611|  76.7k|  x0[3] = input[3];
 1612|       |
 1613|  76.7k|  __m128i u[4];
 1614|  76.7k|  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
 1615|  76.7k|  u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
 1616|  76.7k|  u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
 1617|  76.7k|  u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);
 1618|       |
 1619|  76.7k|  __m128i x1[16];
 1620|  76.7k|  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
 1621|  76.7k|  x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
 1622|  76.7k|  x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
 1623|  76.7k|  x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
 1624|  76.7k|  x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02);  // x1*sin3 + x3*sin2
 1625|  76.7k|  x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
 1626|  76.7k|  x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04);  // x1*sin3 - x3*sin4
 1627|  76.7k|  x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
 1628|  76.7k|  x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
 1629|  76.7k|  x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
 1630|  76.7k|  x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03);  // x2*sin3
 1631|  76.7k|  x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
 1632|  76.7k|  x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
 1633|  76.7k|  x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02);
 1634|  76.7k|  x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
 1635|  76.7k|  x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01);
 1636|       |
 1637|  76.7k|  __m128i x2[8];
 1638|  76.7k|  x2[0] = _mm_add_epi32(x1[0], x1[4]);  // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2
 1639|  76.7k|  x2[1] = _mm_add_epi32(x1[1], x1[5]);
 1640|  76.7k|  x2[2] = _mm_add_epi32(x1[2], x1[6]);  // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4
 1641|  76.7k|  x2[3] = _mm_add_epi32(x1[3], x1[7]);
 1642|  76.7k|  x2[4] = _mm_add_epi32(x1[8], x1[10]);  // x0*sin3 -x2*sin3 +x3*sin3
 1643|  76.7k|  x2[5] = _mm_add_epi32(x1[9], x1[11]);
 1644|  76.7k|  x2[6] = _mm_add_epi32(x1[12], x1[14]);  // x0*sin1 +x2*sin4 +x0*sin2 -x2*sin1
 1645|  76.7k|  x2[7] = _mm_add_epi32(x1[13], x1[15]);
 1646|       |
 1647|  76.7k|  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  76.7k|#define INV_COS_BIT 12
  ------------------
 1648|   383k|  for (int i = 0; i < 4; ++i) {
  ------------------
  |  Branch (1648:19): [True: 307k, False: 76.7k]
  ------------------
 1649|   307k|    __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
 1650|   307k|    __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
 1651|   307k|    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
  ------------------
  |  |   43|   307k|#define INV_COS_BIT 12
  ------------------
 1652|   307k|    out1 = _mm_srai_epi32(out1, INV_COS_BIT);
  ------------------
  |  |   43|   307k|#define INV_COS_BIT 12
  ------------------
 1653|   307k|    output[i] = _mm_packs_epi32(out0, out1);
 1654|   307k|  }
 1655|  76.7k|}
av1_inv_txfm_ssse3.c:iidentity4_ssse3:
 2210|  51.4k|static void iidentity4_ssse3(const __m128i *input, __m128i *output) {
 2211|  51.4k|  const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
  ------------------
  |  |   41|  51.4k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2212|  51.4k|  const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
  ------------------
  |  |   41|  51.4k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2213|   257k|  for (int i = 0; i < 4; ++i) {
  ------------------
  |  Branch (2213:19): [True: 205k, False: 51.4k]
  ------------------
 2214|   205k|    __m128i x = _mm_mulhrs_epi16(input[i], scale);
 2215|   205k|    output[i] = _mm_adds_epi16(x, input[i]);
 2216|   205k|  }
 2217|  51.4k|}
av1_inv_txfm_ssse3.c:iidentity8_sse2:
 2219|  9.45k|static void iidentity8_sse2(const __m128i *input, __m128i *output) {
 2220|  85.1k|  for (int i = 0; i < 8; ++i) {
  ------------------
  |  Branch (2220:19): [True: 75.6k, False: 9.45k]
  ------------------
 2221|  75.6k|    output[i] = _mm_adds_epi16(input[i], input[i]);
 2222|  75.6k|  }
 2223|  9.45k|}
av1_inv_txfm_ssse3.c:idct16_low1_ssse3:
  236|  8.35k|static void idct16_low1_ssse3(const __m128i *input, __m128i *output) {
  237|  8.35k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  8.35k|#define INV_COS_BIT 12
  ------------------
  238|       |
  239|       |  // stage 1
  240|  8.35k|  __m128i x[2];
  241|  8.35k|  x[0] = input[0];
  242|       |
  243|       |  // stage 2
  244|       |  // stage 3
  245|       |  // stage 4
  246|  8.35k|  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   28|  8.35k|  do {                                          \
  |  |   29|  8.35k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  8.35k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  8.35k|    const __m128i _in = in;                     \
  |  |   32|  8.35k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  8.35k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  8.35k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 8.35k]
  |  |  ------------------
  ------------------
  247|       |
  248|       |  // stage 5
  249|       |  // stage 6
  250|       |  // stage 7
  251|  8.35k|  output[0] = x[0];
  252|  8.35k|  output[15] = x[0];
  253|  8.35k|  output[1] = x[1];
  254|  8.35k|  output[14] = x[1];
  255|  8.35k|  output[2] = x[1];
  256|  8.35k|  output[13] = x[1];
  257|  8.35k|  output[3] = x[0];
  258|  8.35k|  output[12] = x[0];
  259|  8.35k|  output[4] = x[0];
  260|  8.35k|  output[11] = x[0];
  261|  8.35k|  output[5] = x[1];
  262|  8.35k|  output[10] = x[1];
  263|  8.35k|  output[6] = x[1];
  264|  8.35k|  output[9] = x[1];
  265|  8.35k|  output[7] = x[0];
  266|  8.35k|  output[8] = x[0];
  267|  8.35k|}
av1_inv_txfm_ssse3.c:idct16_low8_ssse3:
  269|  22.2k|static void idct16_low8_ssse3(const __m128i *input, __m128i *output) {
  270|  22.2k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  22.2k|#define INV_COS_BIT 12
  ------------------
  271|  22.2k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  22.2k|#define INV_COS_BIT 12
  ------------------
  272|  22.2k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  22.2k|#define INV_COS_BIT 12
  ------------------
  273|  22.2k|  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  ------------------
  |  |   20|  22.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  274|  22.2k|  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  ------------------
  |  |   20|  22.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  275|  22.2k|  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  ------------------
  |  |   20|  22.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  276|       |
  277|       |  // stage 1
  278|  22.2k|  __m128i x[16];
  279|  22.2k|  x[0] = input[0];
  280|  22.2k|  x[2] = input[4];
  281|  22.2k|  x[4] = input[2];
  282|  22.2k|  x[6] = input[6];
  283|  22.2k|  x[8] = input[1];
  284|  22.2k|  x[10] = input[5];
  285|  22.2k|  x[12] = input[3];
  286|  22.2k|  x[14] = input[7];
  287|       |
  288|       |  // stage 2
  289|  22.2k|  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  ------------------
  |  |   28|  22.2k|  do {                                          \
  |  |   29|  22.2k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  22.2k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  22.2k|    const __m128i _in = in;                     \
  |  |   32|  22.2k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  22.2k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  22.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.2k]
  |  |  ------------------
  ------------------
  290|  22.2k|  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  ------------------
  |  |   28|  22.2k|  do {                                          \
  |  |   29|  22.2k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  22.2k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  22.2k|    const __m128i _in = in;                     \
  |  |   32|  22.2k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  22.2k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  22.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.2k]
  |  |  ------------------
  ------------------
  291|  22.2k|  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  ------------------
  |  |   28|  22.2k|  do {                                          \
  |  |   29|  22.2k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  22.2k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  22.2k|    const __m128i _in = in;                     \
  |  |   32|  22.2k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  22.2k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  22.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.2k]
  |  |  ------------------
  ------------------
  292|  22.2k|  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  ------------------
  |  |   28|  22.2k|  do {                                          \
  |  |   29|  22.2k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  22.2k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  22.2k|    const __m128i _in = in;                     \
  |  |   32|  22.2k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  22.2k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  22.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.2k]
  |  |  ------------------
  ------------------
  293|       |
  294|       |  // stage 3
  295|  22.2k|  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  ------------------
  |  |   28|  22.2k|  do {                                          \
  |  |   29|  22.2k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  22.2k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  22.2k|    const __m128i _in = in;                     \
  |  |   32|  22.2k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  22.2k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  22.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.2k]
  |  |  ------------------
  ------------------
  296|  22.2k|  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  ------------------
  |  |   28|  22.2k|  do {                                          \
  |  |   29|  22.2k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  22.2k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  22.2k|    const __m128i _in = in;                     \
  |  |   32|  22.2k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  22.2k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  22.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.2k]
  |  |  ------------------
  ------------------
  297|  22.2k|  btf_16_adds_subs_sse2(x[8], x[9]);
  ------------------
  |  |   37|  22.2k|  do {                                  \
  |  |   38|  22.2k|    const __m128i _in0 = in0;           \
  |  |   39|  22.2k|    const __m128i _in1 = in1;           \
  |  |   40|  22.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  22.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  22.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 22.2k]
  |  |  ------------------
  ------------------
  298|  22.2k|  btf_16_subs_adds_sse2(x[11], x[10]);
  ------------------
  |  |   45|  22.2k|  do {                                  \
  |  |   46|  22.2k|    const __m128i _in0 = in0;           \
  |  |   47|  22.2k|    const __m128i _in1 = in1;           \
  |  |   48|  22.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  22.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  22.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 22.2k]
  |  |  ------------------
  ------------------
  299|  22.2k|  btf_16_adds_subs_sse2(x[12], x[13]);
  ------------------
  |  |   37|  22.2k|  do {                                  \
  |  |   38|  22.2k|    const __m128i _in0 = in0;           \
  |  |   39|  22.2k|    const __m128i _in1 = in1;           \
  |  |   40|  22.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  22.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  22.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 22.2k]
  |  |  ------------------
  ------------------
  300|  22.2k|  btf_16_subs_adds_sse2(x[15], x[14]);
  ------------------
  |  |   45|  22.2k|  do {                                  \
  |  |   46|  22.2k|    const __m128i _in0 = in0;           \
  |  |   47|  22.2k|    const __m128i _in1 = in1;           \
  |  |   48|  22.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  22.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  22.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 22.2k]
  |  |  ------------------
  ------------------
  301|       |
  302|       |  // stage 4
  303|  22.2k|  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   28|  22.2k|  do {                                          \
  |  |   29|  22.2k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  22.2k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  22.2k|    const __m128i _in = in;                     \
  |  |   32|  22.2k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  22.2k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  22.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.2k]
  |  |  ------------------
  ------------------
  304|  22.2k|  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  ------------------
  |  |   28|  22.2k|  do {                                          \
  |  |   29|  22.2k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  22.2k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  22.2k|    const __m128i _in = in;                     \
  |  |   32|  22.2k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  22.2k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  22.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 22.2k]
  |  |  ------------------
  ------------------
  305|  22.2k|  btf_16_adds_subs_sse2(x[4], x[5]);
  ------------------
  |  |   37|  22.2k|  do {                                  \
  |  |   38|  22.2k|    const __m128i _in0 = in0;           \
  |  |   39|  22.2k|    const __m128i _in1 = in1;           \
  |  |   40|  22.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  22.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  22.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 22.2k]
  |  |  ------------------
  ------------------
  306|  22.2k|  btf_16_subs_adds_sse2(x[7], x[6]);
  ------------------
  |  |   45|  22.2k|  do {                                  \
  |  |   46|  22.2k|    const __m128i _in0 = in0;           \
  |  |   47|  22.2k|    const __m128i _in1 = in1;           \
  |  |   48|  22.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  22.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  22.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 22.2k]
  |  |  ------------------
  ------------------
  307|  22.2k|  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  ------------------
  |  |   61|  22.2k|  do {                                            \
  |  |   62|  22.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  22.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  22.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  22.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  22.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  22.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  22.2k|                                                  \
  |  |   69|  22.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  22.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  22.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  22.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  22.2k|                                                  \
  |  |   74|  22.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  22.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  22.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  22.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  22.2k|                                                  \
  |  |   79|  22.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  22.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  22.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 22.2k]
  |  |  ------------------
  ------------------
  308|  22.2k|  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   61|  22.2k|  do {                                            \
  |  |   62|  22.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  22.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  22.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  22.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  22.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  22.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  22.2k|                                                  \
  |  |   69|  22.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  22.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  22.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  22.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  22.2k|                                                  \
  |  |   74|  22.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  22.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  22.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  22.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  22.2k|                                                  \
  |  |   79|  22.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  22.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  22.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 22.2k]
  |  |  ------------------
  ------------------
  309|       |
  310|  22.2k|  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
  311|  22.2k|  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
  312|  22.2k|  idct16_stage7_sse2(output, x);
  313|  22.2k|}
av1_inv_txfm_ssse3.c:idct16_stage5_sse2:
  200|  38.2k|                                      int8_t cos_bit) {
  201|  38.2k|  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  ------------------
  |  |   20|  38.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  202|  38.2k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  38.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  203|  38.2k|  btf_16_adds_subs_sse2(x[0], x[3]);
  ------------------
  |  |   37|  38.2k|  do {                                  \
  |  |   38|  38.2k|    const __m128i _in0 = in0;           \
  |  |   39|  38.2k|    const __m128i _in1 = in1;           \
  |  |   40|  38.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  38.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  38.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 38.2k]
  |  |  ------------------
  ------------------
  204|  38.2k|  btf_16_adds_subs_sse2(x[1], x[2]);
  ------------------
  |  |   37|  38.2k|  do {                                  \
  |  |   38|  38.2k|    const __m128i _in0 = in0;           \
  |  |   39|  38.2k|    const __m128i _in1 = in1;           \
  |  |   40|  38.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  38.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  38.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 38.2k]
  |  |  ------------------
  ------------------
  205|  38.2k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   61|  38.2k|  do {                                            \
  |  |   62|  38.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  38.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  38.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  38.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  38.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  38.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  38.2k|                                                  \
  |  |   69|  38.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  38.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  38.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  38.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  38.2k|                                                  \
  |  |   74|  38.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  38.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  38.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  38.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  38.2k|                                                  \
  |  |   79|  38.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  38.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  38.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 38.2k]
  |  |  ------------------
  ------------------
  206|  38.2k|  btf_16_adds_subs_sse2(x[8], x[11]);
  ------------------
  |  |   37|  38.2k|  do {                                  \
  |  |   38|  38.2k|    const __m128i _in0 = in0;           \
  |  |   39|  38.2k|    const __m128i _in1 = in1;           \
  |  |   40|  38.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  38.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  38.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 38.2k]
  |  |  ------------------
  ------------------
  207|  38.2k|  btf_16_adds_subs_sse2(x[9], x[10]);
  ------------------
  |  |   37|  38.2k|  do {                                  \
  |  |   38|  38.2k|    const __m128i _in0 = in0;           \
  |  |   39|  38.2k|    const __m128i _in1 = in1;           \
  |  |   40|  38.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  38.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  38.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 38.2k]
  |  |  ------------------
  ------------------
  208|  38.2k|  btf_16_subs_adds_sse2(x[15], x[12]);
  ------------------
  |  |   45|  38.2k|  do {                                  \
  |  |   46|  38.2k|    const __m128i _in0 = in0;           \
  |  |   47|  38.2k|    const __m128i _in1 = in1;           \
  |  |   48|  38.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  38.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  38.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 38.2k]
  |  |  ------------------
  ------------------
  209|  38.2k|  btf_16_subs_adds_sse2(x[14], x[13]);
  ------------------
  |  |   45|  38.2k|  do {                                  \
  |  |   46|  38.2k|    const __m128i _in0 = in0;           \
  |  |   47|  38.2k|    const __m128i _in1 = in1;           \
  |  |   48|  38.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  38.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  38.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 38.2k]
  |  |  ------------------
  ------------------
  210|  38.2k|}
av1_inv_txfm_ssse3.c:idct16_stage6_sse2:
  214|  38.2k|                                      int8_t cos_bit) {
  215|  38.2k|  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  ------------------
  |  |   20|  38.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  216|  38.2k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  38.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  217|  38.2k|  btf_16_adds_subs_sse2(x[0], x[7]);
  ------------------
  |  |   37|  38.2k|  do {                                  \
  |  |   38|  38.2k|    const __m128i _in0 = in0;           \
  |  |   39|  38.2k|    const __m128i _in1 = in1;           \
  |  |   40|  38.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  38.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  38.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 38.2k]
  |  |  ------------------
  ------------------
  218|  38.2k|  btf_16_adds_subs_sse2(x[1], x[6]);
  ------------------
  |  |   37|  38.2k|  do {                                  \
  |  |   38|  38.2k|    const __m128i _in0 = in0;           \
  |  |   39|  38.2k|    const __m128i _in1 = in1;           \
  |  |   40|  38.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  38.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  38.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 38.2k]
  |  |  ------------------
  ------------------
  219|  38.2k|  btf_16_adds_subs_sse2(x[2], x[5]);
  ------------------
  |  |   37|  38.2k|  do {                                  \
  |  |   38|  38.2k|    const __m128i _in0 = in0;           \
  |  |   39|  38.2k|    const __m128i _in1 = in1;           \
  |  |   40|  38.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  38.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  38.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 38.2k]
  |  |  ------------------
  ------------------
  220|  38.2k|  btf_16_adds_subs_sse2(x[3], x[4]);
  ------------------
  |  |   37|  38.2k|  do {                                  \
  |  |   38|  38.2k|    const __m128i _in0 = in0;           \
  |  |   39|  38.2k|    const __m128i _in1 = in1;           \
  |  |   40|  38.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  38.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  38.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 38.2k]
  |  |  ------------------
  ------------------
  221|  38.2k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   61|  38.2k|  do {                                            \
  |  |   62|  38.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  38.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  38.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  38.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  38.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  38.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  38.2k|                                                  \
  |  |   69|  38.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  38.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  38.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  38.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  38.2k|                                                  \
  |  |   74|  38.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  38.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  38.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  38.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  38.2k|                                                  \
  |  |   79|  38.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  38.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  38.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 38.2k]
  |  |  ------------------
  ------------------
  222|  38.2k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  ------------------
  |  |   61|  38.2k|  do {                                            \
  |  |   62|  38.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  38.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  38.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  38.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  38.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  38.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  38.2k|                                                  \
  |  |   69|  38.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  38.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  38.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  38.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  38.2k|                                                  \
  |  |   74|  38.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  38.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  38.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  38.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  38.2k|                                                  \
  |  |   79|  38.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  38.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  38.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 38.2k]
  |  |  ------------------
  ------------------
  223|  38.2k|}
av1_inv_txfm_ssse3.c:idct16_stage7_sse2:
  225|  66.7k|static inline void idct16_stage7_sse2(__m128i *output, __m128i *x) {
  226|  66.7k|  btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
  ------------------
  |  |   53|  66.7k|  do {                                                  \
  |  |   54|  66.7k|    const __m128i _in0 = in0;                           \
  |  |   55|  66.7k|    const __m128i _in1 = in1;                           \
  |  |   56|  66.7k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  66.7k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  66.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 66.7k]
  |  |  ------------------
  ------------------
  227|  66.7k|  btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
  ------------------
  |  |   53|  66.7k|  do {                                                  \
  |  |   54|  66.7k|    const __m128i _in0 = in0;                           \
  |  |   55|  66.7k|    const __m128i _in1 = in1;                           \
  |  |   56|  66.7k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  66.7k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  66.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 66.7k]
  |  |  ------------------
  ------------------
  228|  66.7k|  btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
  ------------------
  |  |   53|  66.7k|  do {                                                  \
  |  |   54|  66.7k|    const __m128i _in0 = in0;                           \
  |  |   55|  66.7k|    const __m128i _in1 = in1;                           \
  |  |   56|  66.7k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  66.7k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  66.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 66.7k]
  |  |  ------------------
  ------------------
  229|  66.7k|  btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
  ------------------
  |  |   53|  66.7k|  do {                                                  \
  |  |   54|  66.7k|    const __m128i _in0 = in0;                           \
  |  |   55|  66.7k|    const __m128i _in1 = in1;                           \
  |  |   56|  66.7k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  66.7k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  66.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 66.7k]
  |  |  ------------------
  ------------------
  230|  66.7k|  btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
  ------------------
  |  |   53|  66.7k|  do {                                                  \
  |  |   54|  66.7k|    const __m128i _in0 = in0;                           \
  |  |   55|  66.7k|    const __m128i _in1 = in1;                           \
  |  |   56|  66.7k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  66.7k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  66.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 66.7k]
  |  |  ------------------
  ------------------
  231|  66.7k|  btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
  ------------------
  |  |   53|  66.7k|  do {                                                  \
  |  |   54|  66.7k|    const __m128i _in0 = in0;                           \
  |  |   55|  66.7k|    const __m128i _in1 = in1;                           \
  |  |   56|  66.7k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  66.7k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  66.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 66.7k]
  |  |  ------------------
  ------------------
  232|  66.7k|  btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
  ------------------
  |  |   53|  66.7k|  do {                                                  \
  |  |   54|  66.7k|    const __m128i _in0 = in0;                           \
  |  |   55|  66.7k|    const __m128i _in1 = in1;                           \
  |  |   56|  66.7k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  66.7k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  66.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 66.7k]
  |  |  ------------------
  ------------------
  233|  66.7k|  btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
  ------------------
  |  |   53|  66.7k|  do {                                                  \
  |  |   54|  66.7k|    const __m128i _in0 = in0;                           \
  |  |   55|  66.7k|    const __m128i _in1 = in1;                           \
  |  |   56|  66.7k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  66.7k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  66.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 66.7k]
  |  |  ------------------
  ------------------
  234|  66.7k|}
av1_inv_txfm_ssse3.c:idct16_sse2:
  315|  15.9k|static void idct16_sse2(const __m128i *input, __m128i *output) {
  316|  15.9k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  15.9k|#define INV_COS_BIT 12
  ------------------
  317|  15.9k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  15.9k|#define INV_COS_BIT 12
  ------------------
  318|  15.9k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  15.9k|#define INV_COS_BIT 12
  ------------------
  319|       |
  320|  15.9k|  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  321|  15.9k|  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  322|  15.9k|  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  323|  15.9k|  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  324|  15.9k|  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  325|  15.9k|  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  326|  15.9k|  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  327|  15.9k|  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  328|  15.9k|  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  329|  15.9k|  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  330|  15.9k|  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  331|  15.9k|  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  332|  15.9k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  333|  15.9k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  334|  15.9k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  335|  15.9k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  336|  15.9k|  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  337|  15.9k|  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  338|  15.9k|  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  ------------------
  |  |   20|  15.9k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  339|       |
  340|       |  // stage 1
  341|  15.9k|  __m128i x[16];
  342|  15.9k|  x[0] = input[0];
  343|  15.9k|  x[1] = input[8];
  344|  15.9k|  x[2] = input[4];
  345|  15.9k|  x[3] = input[12];
  346|  15.9k|  x[4] = input[2];
  347|  15.9k|  x[5] = input[10];
  348|  15.9k|  x[6] = input[6];
  349|  15.9k|  x[7] = input[14];
  350|  15.9k|  x[8] = input[1];
  351|  15.9k|  x[9] = input[9];
  352|  15.9k|  x[10] = input[5];
  353|  15.9k|  x[11] = input[13];
  354|  15.9k|  x[12] = input[3];
  355|  15.9k|  x[13] = input[11];
  356|  15.9k|  x[14] = input[7];
  357|  15.9k|  x[15] = input[15];
  358|       |
  359|       |  // stage 2
  360|  15.9k|  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  ------------------
  |  |   61|  15.9k|  do {                                            \
  |  |   62|  15.9k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  15.9k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  15.9k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  15.9k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  15.9k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  15.9k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  15.9k|                                                  \
  |  |   69|  15.9k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  15.9k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  15.9k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  15.9k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  15.9k|                                                  \
  |  |   74|  15.9k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  15.9k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  15.9k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  15.9k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  15.9k|                                                  \
  |  |   79|  15.9k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  15.9k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  15.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 15.9k]
  |  |  ------------------
  ------------------
  361|  15.9k|  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  ------------------
  |  |   61|  15.9k|  do {                                            \
  |  |   62|  15.9k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  15.9k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  15.9k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  15.9k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  15.9k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  15.9k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  15.9k|                                                  \
  |  |   69|  15.9k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  15.9k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  15.9k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  15.9k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  15.9k|                                                  \
  |  |   74|  15.9k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  15.9k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  15.9k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  15.9k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  15.9k|                                                  \
  |  |   79|  15.9k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  15.9k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  15.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 15.9k]
  |  |  ------------------
  ------------------
  362|  15.9k|  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   61|  15.9k|  do {                                            \
  |  |   62|  15.9k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  15.9k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  15.9k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  15.9k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  15.9k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  15.9k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  15.9k|                                                  \
  |  |   69|  15.9k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  15.9k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  15.9k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  15.9k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  15.9k|                                                  \
  |  |   74|  15.9k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  15.9k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  15.9k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  15.9k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  15.9k|                                                  \
  |  |   79|  15.9k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  15.9k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  15.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 15.9k]
  |  |  ------------------
  ------------------
  363|  15.9k|  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
  ------------------
  |  |   61|  15.9k|  do {                                            \
  |  |   62|  15.9k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  15.9k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  15.9k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  15.9k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  15.9k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  15.9k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  15.9k|                                                  \
  |  |   69|  15.9k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  15.9k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  15.9k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  15.9k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  15.9k|                                                  \
  |  |   74|  15.9k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  15.9k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  15.9k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  15.9k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  15.9k|                                                  \
  |  |   79|  15.9k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  15.9k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  15.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 15.9k]
  |  |  ------------------
  ------------------
  364|       |
  365|       |  // stage 3
  366|  15.9k|  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  ------------------
  |  |   61|  15.9k|  do {                                            \
  |  |   62|  15.9k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  15.9k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  15.9k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  15.9k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  15.9k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  15.9k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  15.9k|                                                  \
  |  |   69|  15.9k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  15.9k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  15.9k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  15.9k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  15.9k|                                                  \
  |  |   74|  15.9k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  15.9k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  15.9k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  15.9k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  15.9k|                                                  \
  |  |   79|  15.9k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  15.9k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  15.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 15.9k]
  |  |  ------------------
  ------------------
  367|  15.9k|  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   61|  15.9k|  do {                                            \
  |  |   62|  15.9k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  15.9k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  15.9k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  15.9k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  15.9k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  15.9k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  15.9k|                                                  \
  |  |   69|  15.9k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  15.9k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  15.9k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  15.9k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  15.9k|                                                  \
  |  |   74|  15.9k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  15.9k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  15.9k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  15.9k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  15.9k|                                                  \
  |  |   79|  15.9k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  15.9k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  15.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 15.9k]
  |  |  ------------------
  ------------------
  368|  15.9k|  btf_16_adds_subs_sse2(x[8], x[9]);
  ------------------
  |  |   37|  15.9k|  do {                                  \
  |  |   38|  15.9k|    const __m128i _in0 = in0;           \
  |  |   39|  15.9k|    const __m128i _in1 = in1;           \
  |  |   40|  15.9k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  15.9k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  15.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 15.9k]
  |  |  ------------------
  ------------------
  369|  15.9k|  btf_16_subs_adds_sse2(x[11], x[10]);
  ------------------
  |  |   45|  15.9k|  do {                                  \
  |  |   46|  15.9k|    const __m128i _in0 = in0;           \
  |  |   47|  15.9k|    const __m128i _in1 = in1;           \
  |  |   48|  15.9k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  15.9k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  15.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 15.9k]
  |  |  ------------------
  ------------------
  370|  15.9k|  btf_16_adds_subs_sse2(x[12], x[13]);
  ------------------
  |  |   37|  15.9k|  do {                                  \
  |  |   38|  15.9k|    const __m128i _in0 = in0;           \
  |  |   39|  15.9k|    const __m128i _in1 = in1;           \
  |  |   40|  15.9k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  15.9k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  15.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 15.9k]
  |  |  ------------------
  ------------------
  371|  15.9k|  btf_16_subs_adds_sse2(x[15], x[14]);
  ------------------
  |  |   45|  15.9k|  do {                                  \
  |  |   46|  15.9k|    const __m128i _in0 = in0;           \
  |  |   47|  15.9k|    const __m128i _in1 = in1;           \
  |  |   48|  15.9k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  15.9k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  15.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 15.9k]
  |  |  ------------------
  ------------------
  372|       |
  373|       |  // stage 4
  374|  15.9k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   61|  15.9k|  do {                                            \
  |  |   62|  15.9k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  15.9k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  15.9k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  15.9k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  15.9k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  15.9k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  15.9k|                                                  \
  |  |   69|  15.9k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  15.9k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  15.9k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  15.9k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  15.9k|                                                  \
  |  |   74|  15.9k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  15.9k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  15.9k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  15.9k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  15.9k|                                                  \
  |  |   79|  15.9k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  15.9k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  15.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 15.9k]
  |  |  ------------------
  ------------------
  375|  15.9k|  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|  15.9k|  do {                                            \
  |  |   62|  15.9k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  15.9k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  15.9k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  15.9k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  15.9k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  15.9k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  15.9k|                                                  \
  |  |   69|  15.9k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  15.9k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  15.9k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  15.9k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  15.9k|                                                  \
  |  |   74|  15.9k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  15.9k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  15.9k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  15.9k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  15.9k|                                                  \
  |  |   79|  15.9k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  15.9k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  15.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 15.9k]
  |  |  ------------------
  ------------------
  376|  15.9k|  btf_16_adds_subs_sse2(x[4], x[5]);
  ------------------
  |  |   37|  15.9k|  do {                                  \
  |  |   38|  15.9k|    const __m128i _in0 = in0;           \
  |  |   39|  15.9k|    const __m128i _in1 = in1;           \
  |  |   40|  15.9k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  15.9k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  15.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 15.9k]
  |  |  ------------------
  ------------------
  377|  15.9k|  btf_16_subs_adds_sse2(x[7], x[6]);
  ------------------
  |  |   45|  15.9k|  do {                                  \
  |  |   46|  15.9k|    const __m128i _in0 = in0;           \
  |  |   47|  15.9k|    const __m128i _in1 = in1;           \
  |  |   48|  15.9k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  15.9k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  15.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 15.9k]
  |  |  ------------------
  ------------------
  378|  15.9k|  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  ------------------
  |  |   61|  15.9k|  do {                                            \
  |  |   62|  15.9k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  15.9k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  15.9k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  15.9k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  15.9k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  15.9k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  15.9k|                                                  \
  |  |   69|  15.9k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  15.9k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  15.9k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  15.9k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  15.9k|                                                  \
  |  |   74|  15.9k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  15.9k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  15.9k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  15.9k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  15.9k|                                                  \
  |  |   79|  15.9k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  15.9k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  15.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 15.9k]
  |  |  ------------------
  ------------------
  379|  15.9k|  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   61|  15.9k|  do {                                            \
  |  |   62|  15.9k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  15.9k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  15.9k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  15.9k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  15.9k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  15.9k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  15.9k|                                                  \
  |  |   69|  15.9k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  15.9k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  15.9k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  15.9k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  15.9k|                                                  \
  |  |   74|  15.9k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  15.9k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  15.9k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  15.9k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  15.9k|                                                  \
  |  |   79|  15.9k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  15.9k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  15.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 15.9k]
  |  |  ------------------
  ------------------
  380|       |
  381|       |  // stage 5~7
  382|  15.9k|  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
  383|  15.9k|  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
  384|  15.9k|  idct16_stage7_sse2(output, x);
  385|  15.9k|}
av1_inv_txfm_ssse3.c:iadst16_low1_ssse3:
 1974|  4.22k|static void iadst16_low1_ssse3(const __m128i *input, __m128i *output) {
 1975|  4.22k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  4.22k|#define INV_COS_BIT 12
  ------------------
 1976|  4.22k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  4.22k|#define INV_COS_BIT 12
  ------------------
 1977|  4.22k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  4.22k|#define INV_COS_BIT 12
  ------------------
 1978|       |
 1979|  4.22k|  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  ------------------
  |  |   20|  4.22k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1980|  4.22k|  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  ------------------
  |  |   20|  4.22k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1981|  4.22k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|  4.22k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1982|  4.22k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|  4.22k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1983|       |
 1984|       |  // stage 1
 1985|  4.22k|  __m128i x[16];
 1986|  4.22k|  x[1] = input[0];
 1987|       |
 1988|       |  // stage 2
 1989|  4.22k|  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
  ------------------
  |  |   28|  4.22k|  do {                                          \
  |  |   29|  4.22k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  4.22k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  4.22k|    const __m128i _in = in;                     \
  |  |   32|  4.22k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  4.22k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  4.22k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 4.22k]
  |  |  ------------------
  ------------------
 1990|       |
 1991|       |  // stage 3
 1992|  4.22k|  x[8] = x[0];
 1993|  4.22k|  x[9] = x[1];
 1994|       |
 1995|       |  // stage 4
 1996|  4.22k|  btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
  ------------------
  |  |   61|  4.22k|  do {                                            \
  |  |   62|  4.22k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.22k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.22k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.22k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.22k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.22k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.22k|                                                  \
  |  |   69|  4.22k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.22k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.22k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.22k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.22k|                                                  \
  |  |   74|  4.22k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.22k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.22k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.22k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.22k|                                                  \
  |  |   79|  4.22k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.22k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.22k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.22k]
  |  |  ------------------
  ------------------
 1997|       |
 1998|       |  // stage 5
 1999|  4.22k|  x[4] = x[0];
 2000|  4.22k|  x[5] = x[1];
 2001|  4.22k|  x[12] = x[8];
 2002|  4.22k|  x[13] = x[9];
 2003|       |
 2004|       |  // stage 6
 2005|  4.22k|  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   61|  4.22k|  do {                                            \
  |  |   62|  4.22k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.22k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.22k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.22k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.22k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.22k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.22k|                                                  \
  |  |   69|  4.22k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.22k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.22k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.22k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.22k|                                                  \
  |  |   74|  4.22k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.22k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.22k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.22k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.22k|                                                  \
  |  |   79|  4.22k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.22k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.22k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.22k]
  |  |  ------------------
  ------------------
 2006|  4.22k|  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
  ------------------
  |  |   61|  4.22k|  do {                                            \
  |  |   62|  4.22k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.22k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.22k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.22k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.22k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.22k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.22k|                                                  \
  |  |   69|  4.22k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.22k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.22k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.22k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.22k|                                                  \
  |  |   74|  4.22k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.22k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.22k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.22k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.22k|                                                  \
  |  |   79|  4.22k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.22k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.22k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.22k]
  |  |  ------------------
  ------------------
 2007|       |
 2008|       |  // stage 7
 2009|  4.22k|  x[2] = x[0];
 2010|  4.22k|  x[3] = x[1];
 2011|  4.22k|  x[6] = x[4];
 2012|  4.22k|  x[7] = x[5];
 2013|  4.22k|  x[10] = x[8];
 2014|  4.22k|  x[11] = x[9];
 2015|  4.22k|  x[14] = x[12];
 2016|  4.22k|  x[15] = x[13];
 2017|       |
 2018|  4.22k|  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
 2019|  4.22k|  iadst16_stage9_ssse3(output, x);
 2020|  4.22k|}
av1_inv_txfm_ssse3.c:iadst16_stage8_ssse3:
 1945|  28.2k|                                        int8_t cos_bit) {
 1946|  28.2k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  28.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1947|  28.2k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|  28.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1948|  28.2k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|  28.2k|  do {                                            \
  |  |   62|  28.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  28.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  28.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  28.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  28.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  28.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  28.2k|                                                  \
  |  |   69|  28.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  28.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  28.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  28.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  28.2k|                                                  \
  |  |   74|  28.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  28.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  28.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  28.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  28.2k|                                                  \
  |  |   79|  28.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  28.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  28.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 28.2k]
  |  |  ------------------
  ------------------
 1949|  28.2k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   61|  28.2k|  do {                                            \
  |  |   62|  28.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  28.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  28.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  28.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  28.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  28.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  28.2k|                                                  \
  |  |   69|  28.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  28.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  28.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  28.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  28.2k|                                                  \
  |  |   74|  28.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  28.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  28.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  28.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  28.2k|                                                  \
  |  |   79|  28.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  28.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  28.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 28.2k]
  |  |  ------------------
  ------------------
 1950|  28.2k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
  ------------------
  |  |   61|  28.2k|  do {                                            \
  |  |   62|  28.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  28.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  28.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  28.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  28.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  28.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  28.2k|                                                  \
  |  |   69|  28.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  28.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  28.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  28.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  28.2k|                                                  \
  |  |   74|  28.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  28.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  28.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  28.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  28.2k|                                                  \
  |  |   79|  28.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  28.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  28.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 28.2k]
  |  |  ------------------
  ------------------
 1951|  28.2k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
  ------------------
  |  |   61|  28.2k|  do {                                            \
  |  |   62|  28.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  28.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  28.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  28.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  28.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  28.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  28.2k|                                                  \
  |  |   69|  28.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  28.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  28.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  28.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  28.2k|                                                  \
  |  |   74|  28.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  28.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  28.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  28.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  28.2k|                                                  \
  |  |   79|  28.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  28.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  28.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 28.2k]
  |  |  ------------------
  ------------------
 1952|  28.2k|}
av1_inv_txfm_ssse3.c:iadst16_stage9_ssse3:
 1954|  51.1k|static inline void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
 1955|  51.1k|  const __m128i __zero = _mm_setzero_si128();
 1956|  51.1k|  output[0] = x[0];
 1957|  51.1k|  output[1] = _mm_subs_epi16(__zero, x[8]);
 1958|  51.1k|  output[2] = x[12];
 1959|  51.1k|  output[3] = _mm_subs_epi16(__zero, x[4]);
 1960|  51.1k|  output[4] = x[6];
 1961|  51.1k|  output[5] = _mm_subs_epi16(__zero, x[14]);
 1962|  51.1k|  output[6] = x[10];
 1963|  51.1k|  output[7] = _mm_subs_epi16(__zero, x[2]);
 1964|  51.1k|  output[8] = x[3];
 1965|  51.1k|  output[9] = _mm_subs_epi16(__zero, x[11]);
 1966|  51.1k|  output[10] = x[15];
 1967|  51.1k|  output[11] = _mm_subs_epi16(__zero, x[7]);
 1968|  51.1k|  output[12] = x[5];
 1969|  51.1k|  output[13] = _mm_subs_epi16(__zero, x[13]);
 1970|  51.1k|  output[14] = x[9];
 1971|  51.1k|  output[15] = _mm_subs_epi16(__zero, x[1]);
 1972|  51.1k|}
av1_inv_txfm_ssse3.c:iadst16_low8_ssse3:
 2022|  12.7k|static void iadst16_low8_ssse3(const __m128i *input, __m128i *output) {
 2023|  12.7k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  12.7k|#define INV_COS_BIT 12
  ------------------
 2024|  12.7k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  12.7k|#define INV_COS_BIT 12
  ------------------
 2025|  12.7k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  12.7k|#define INV_COS_BIT 12
  ------------------
 2026|       |
 2027|       |  // stage 1
 2028|  12.7k|  __m128i x[16];
 2029|  12.7k|  x[1] = input[0];
 2030|  12.7k|  x[3] = input[2];
 2031|  12.7k|  x[5] = input[4];
 2032|  12.7k|  x[7] = input[6];
 2033|  12.7k|  x[8] = input[7];
 2034|  12.7k|  x[10] = input[5];
 2035|  12.7k|  x[12] = input[3];
 2036|  12.7k|  x[14] = input[1];
 2037|       |
 2038|       |  // stage 2
 2039|  12.7k|  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
  ------------------
  |  |   28|  12.7k|  do {                                          \
  |  |   29|  12.7k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  12.7k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  12.7k|    const __m128i _in = in;                     \
  |  |   32|  12.7k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  12.7k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  12.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 12.7k]
  |  |  ------------------
  ------------------
 2040|  12.7k|  btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
  ------------------
  |  |   28|  12.7k|  do {                                          \
  |  |   29|  12.7k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  12.7k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  12.7k|    const __m128i _in = in;                     \
  |  |   32|  12.7k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  12.7k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  12.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 12.7k]
  |  |  ------------------
  ------------------
 2041|  12.7k|  btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
  ------------------
  |  |   28|  12.7k|  do {                                          \
  |  |   29|  12.7k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  12.7k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  12.7k|    const __m128i _in = in;                     \
  |  |   32|  12.7k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  12.7k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  12.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 12.7k]
  |  |  ------------------
  ------------------
 2042|  12.7k|  btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
  ------------------
  |  |   28|  12.7k|  do {                                          \
  |  |   29|  12.7k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  12.7k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  12.7k|    const __m128i _in = in;                     \
  |  |   32|  12.7k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  12.7k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  12.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 12.7k]
  |  |  ------------------
  ------------------
 2043|  12.7k|  btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
  ------------------
  |  |   28|  12.7k|  do {                                          \
  |  |   29|  12.7k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  12.7k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  12.7k|    const __m128i _in = in;                     \
  |  |   32|  12.7k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  12.7k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  12.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 12.7k]
  |  |  ------------------
  ------------------
 2044|  12.7k|  btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
  ------------------
  |  |   28|  12.7k|  do {                                          \
  |  |   29|  12.7k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  12.7k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  12.7k|    const __m128i _in = in;                     \
  |  |   32|  12.7k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  12.7k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  12.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 12.7k]
  |  |  ------------------
  ------------------
 2045|  12.7k|  btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
  ------------------
  |  |   28|  12.7k|  do {                                          \
  |  |   29|  12.7k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  12.7k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  12.7k|    const __m128i _in = in;                     \
  |  |   32|  12.7k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  12.7k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  12.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 12.7k]
  |  |  ------------------
  ------------------
 2046|  12.7k|  btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);
  ------------------
  |  |   28|  12.7k|  do {                                          \
  |  |   29|  12.7k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  12.7k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  12.7k|    const __m128i _in = in;                     \
  |  |   32|  12.7k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  12.7k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  12.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 12.7k]
  |  |  ------------------
  ------------------
 2047|       |
 2048|       |  // stage 3
 2049|  12.7k|  iadst16_stage3_ssse3(x);
 2050|  12.7k|  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
 2051|  12.7k|  iadst16_stage5_ssse3(x);
 2052|  12.7k|  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
 2053|  12.7k|  iadst16_stage7_ssse3(x);
 2054|  12.7k|  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
 2055|  12.7k|  iadst16_stage9_ssse3(output, x);
 2056|  12.7k|}
av1_inv_txfm_ssse3.c:iadst16_stage3_ssse3:
 1883|  46.8k|static inline void iadst16_stage3_ssse3(__m128i *x) {
 1884|  46.8k|  btf_16_adds_subs_sse2(x[0], x[8]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1885|  46.8k|  btf_16_adds_subs_sse2(x[1], x[9]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1886|  46.8k|  btf_16_adds_subs_sse2(x[2], x[10]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1887|  46.8k|  btf_16_adds_subs_sse2(x[3], x[11]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1888|  46.8k|  btf_16_adds_subs_sse2(x[4], x[12]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1889|  46.8k|  btf_16_adds_subs_sse2(x[5], x[13]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1890|  46.8k|  btf_16_adds_subs_sse2(x[6], x[14]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1891|  46.8k|  btf_16_adds_subs_sse2(x[7], x[15]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1892|  46.8k|}
av1_inv_txfm_ssse3.c:iadst16_stage4_ssse3:
 1896|  24.0k|                                        int8_t cos_bit) {
 1897|  24.0k|  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  ------------------
  |  |   20|  24.0k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1898|  24.0k|  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  ------------------
  |  |   20|  24.0k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1899|  24.0k|  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  ------------------
  |  |   20|  24.0k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1900|  24.0k|  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  ------------------
  |  |   20|  24.0k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1901|  24.0k|  const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
  ------------------
  |  |   20|  24.0k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1902|  24.0k|  const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
  ------------------
  |  |   20|  24.0k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1903|  24.0k|  btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
  ------------------
  |  |   61|  24.0k|  do {                                            \
  |  |   62|  24.0k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  24.0k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  24.0k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  24.0k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  24.0k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  24.0k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  24.0k|                                                  \
  |  |   69|  24.0k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  24.0k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  24.0k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  24.0k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  24.0k|                                                  \
  |  |   74|  24.0k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  24.0k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  24.0k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  24.0k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  24.0k|                                                  \
  |  |   79|  24.0k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  24.0k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  24.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 24.0k]
  |  |  ------------------
  ------------------
 1904|  24.0k|  btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
  ------------------
  |  |   61|  24.0k|  do {                                            \
  |  |   62|  24.0k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  24.0k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  24.0k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  24.0k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  24.0k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  24.0k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  24.0k|                                                  \
  |  |   69|  24.0k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  24.0k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  24.0k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  24.0k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  24.0k|                                                  \
  |  |   74|  24.0k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  24.0k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  24.0k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  24.0k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  24.0k|                                                  \
  |  |   79|  24.0k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  24.0k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  24.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 24.0k]
  |  |  ------------------
  ------------------
 1905|  24.0k|  btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
  ------------------
  |  |   61|  24.0k|  do {                                            \
  |  |   62|  24.0k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  24.0k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  24.0k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  24.0k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  24.0k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  24.0k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  24.0k|                                                  \
  |  |   69|  24.0k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  24.0k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  24.0k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  24.0k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  24.0k|                                                  \
  |  |   74|  24.0k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  24.0k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  24.0k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  24.0k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  24.0k|                                                  \
  |  |   79|  24.0k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  24.0k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  24.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 24.0k]
  |  |  ------------------
  ------------------
 1906|  24.0k|  btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
  ------------------
  |  |   61|  24.0k|  do {                                            \
  |  |   62|  24.0k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  24.0k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  24.0k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  24.0k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  24.0k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  24.0k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  24.0k|                                                  \
  |  |   69|  24.0k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  24.0k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  24.0k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  24.0k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  24.0k|                                                  \
  |  |   74|  24.0k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  24.0k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  24.0k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  24.0k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  24.0k|                                                  \
  |  |   79|  24.0k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  24.0k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  24.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 24.0k]
  |  |  ------------------
  ------------------
 1907|  24.0k|}
av1_inv_txfm_ssse3.c:iadst16_stage5_ssse3:
 1909|  46.8k|static inline void iadst16_stage5_ssse3(__m128i *x) {
 1910|  46.8k|  btf_16_adds_subs_sse2(x[0], x[4]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1911|  46.8k|  btf_16_adds_subs_sse2(x[1], x[5]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1912|  46.8k|  btf_16_adds_subs_sse2(x[2], x[6]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1913|  46.8k|  btf_16_adds_subs_sse2(x[3], x[7]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1914|  46.8k|  btf_16_adds_subs_sse2(x[8], x[12]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1915|  46.8k|  btf_16_adds_subs_sse2(x[9], x[13]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1916|  46.8k|  btf_16_adds_subs_sse2(x[10], x[14]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1917|  46.8k|  btf_16_adds_subs_sse2(x[11], x[15]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1918|  46.8k|}
av1_inv_txfm_ssse3.c:iadst16_stage6_ssse3:
 1922|  24.0k|                                        int8_t cos_bit) {
 1923|  24.0k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|  24.0k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1924|  24.0k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|  24.0k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1925|  24.0k|  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  ------------------
  |  |   20|  24.0k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1926|  24.0k|  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   61|  24.0k|  do {                                            \
  |  |   62|  24.0k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  24.0k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  24.0k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  24.0k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  24.0k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  24.0k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  24.0k|                                                  \
  |  |   69|  24.0k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  24.0k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  24.0k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  24.0k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  24.0k|                                                  \
  |  |   74|  24.0k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  24.0k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  24.0k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  24.0k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  24.0k|                                                  \
  |  |   79|  24.0k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  24.0k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  24.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 24.0k]
  |  |  ------------------
  ------------------
 1927|  24.0k|  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   61|  24.0k|  do {                                            \
  |  |   62|  24.0k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  24.0k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  24.0k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  24.0k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  24.0k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  24.0k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  24.0k|                                                  \
  |  |   69|  24.0k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  24.0k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  24.0k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  24.0k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  24.0k|                                                  \
  |  |   74|  24.0k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  24.0k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  24.0k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  24.0k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  24.0k|                                                  \
  |  |   79|  24.0k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  24.0k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  24.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 24.0k]
  |  |  ------------------
  ------------------
 1928|  24.0k|  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
  ------------------
  |  |   61|  24.0k|  do {                                            \
  |  |   62|  24.0k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  24.0k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  24.0k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  24.0k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  24.0k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  24.0k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  24.0k|                                                  \
  |  |   69|  24.0k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  24.0k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  24.0k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  24.0k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  24.0k|                                                  \
  |  |   74|  24.0k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  24.0k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  24.0k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  24.0k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  24.0k|                                                  \
  |  |   79|  24.0k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  24.0k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  24.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 24.0k]
  |  |  ------------------
  ------------------
 1929|  24.0k|  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
  ------------------
  |  |   61|  24.0k|  do {                                            \
  |  |   62|  24.0k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  24.0k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  24.0k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  24.0k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  24.0k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  24.0k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  24.0k|                                                  \
  |  |   69|  24.0k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  24.0k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  24.0k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  24.0k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  24.0k|                                                  \
  |  |   74|  24.0k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  24.0k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  24.0k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  24.0k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  24.0k|                                                  \
  |  |   79|  24.0k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  24.0k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  24.0k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 24.0k]
  |  |  ------------------
  ------------------
 1930|  24.0k|}
av1_inv_txfm_ssse3.c:iadst16_stage7_ssse3:
 1932|  46.8k|static inline void iadst16_stage7_ssse3(__m128i *x) {
 1933|  46.8k|  btf_16_adds_subs_sse2(x[0], x[2]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1934|  46.8k|  btf_16_adds_subs_sse2(x[1], x[3]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1935|  46.8k|  btf_16_adds_subs_sse2(x[4], x[6]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1936|  46.8k|  btf_16_adds_subs_sse2(x[5], x[7]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1937|  46.8k|  btf_16_adds_subs_sse2(x[8], x[10]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1938|  46.8k|  btf_16_adds_subs_sse2(x[9], x[11]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1939|  46.8k|  btf_16_adds_subs_sse2(x[12], x[14]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1940|  46.8k|  btf_16_adds_subs_sse2(x[13], x[15]);
  ------------------
  |  |   37|  46.8k|  do {                                  \
  |  |   38|  46.8k|    const __m128i _in0 = in0;           \
  |  |   39|  46.8k|    const __m128i _in1 = in1;           \
  |  |   40|  46.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  46.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  46.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 46.8k]
  |  |  ------------------
  ------------------
 1941|  46.8k|}
av1_inv_txfm_ssse3.c:iadst16_sse2:
 2057|  11.2k|static void iadst16_sse2(const __m128i *input, __m128i *output) {
 2058|  11.2k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  11.2k|#define INV_COS_BIT 12
  ------------------
 2059|  11.2k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  11.2k|#define INV_COS_BIT 12
  ------------------
 2060|  11.2k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  11.2k|#define INV_COS_BIT 12
  ------------------
 2061|  11.2k|  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  ------------------
  |  |   20|  11.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2062|  11.2k|  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  ------------------
  |  |   20|  11.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2063|  11.2k|  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  ------------------
  |  |   20|  11.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2064|  11.2k|  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  ------------------
  |  |   20|  11.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2065|  11.2k|  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  ------------------
  |  |   20|  11.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2066|  11.2k|  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  ------------------
  |  |   20|  11.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2067|  11.2k|  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  ------------------
  |  |   20|  11.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2068|  11.2k|  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  ------------------
  |  |   20|  11.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2069|  11.2k|  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  ------------------
  |  |   20|  11.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2070|  11.2k|  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  ------------------
  |  |   20|  11.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2071|  11.2k|  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  ------------------
  |  |   20|  11.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2072|  11.2k|  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  ------------------
  |  |   20|  11.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2073|  11.2k|  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  ------------------
  |  |   20|  11.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2074|  11.2k|  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  ------------------
  |  |   20|  11.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2075|  11.2k|  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  ------------------
  |  |   20|  11.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2076|  11.2k|  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
  ------------------
  |  |   20|  11.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2077|       |
 2078|       |  // stage 1
 2079|  11.2k|  __m128i x[16];
 2080|  11.2k|  x[0] = input[15];
 2081|  11.2k|  x[1] = input[0];
 2082|  11.2k|  x[2] = input[13];
 2083|  11.2k|  x[3] = input[2];
 2084|  11.2k|  x[4] = input[11];
 2085|  11.2k|  x[5] = input[4];
 2086|  11.2k|  x[6] = input[9];
 2087|  11.2k|  x[7] = input[6];
 2088|  11.2k|  x[8] = input[7];
 2089|  11.2k|  x[9] = input[8];
 2090|  11.2k|  x[10] = input[5];
 2091|  11.2k|  x[11] = input[10];
 2092|  11.2k|  x[12] = input[3];
 2093|  11.2k|  x[13] = input[12];
 2094|  11.2k|  x[14] = input[1];
 2095|  11.2k|  x[15] = input[14];
 2096|       |
 2097|       |  // stage 2
 2098|  11.2k|  btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   61|  11.2k|  do {                                            \
  |  |   62|  11.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  11.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  11.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  11.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  11.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  11.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  11.2k|                                                  \
  |  |   69|  11.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  11.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  11.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  11.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  11.2k|                                                  \
  |  |   74|  11.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  11.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  11.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  11.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  11.2k|                                                  \
  |  |   79|  11.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  11.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  11.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 11.2k]
  |  |  ------------------
  ------------------
 2099|  11.2k|  btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|  11.2k|  do {                                            \
  |  |   62|  11.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  11.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  11.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  11.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  11.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  11.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  11.2k|                                                  \
  |  |   69|  11.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  11.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  11.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  11.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  11.2k|                                                  \
  |  |   74|  11.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  11.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  11.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  11.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  11.2k|                                                  \
  |  |   79|  11.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  11.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  11.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 11.2k]
  |  |  ------------------
  ------------------
 2100|  11.2k|  btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   61|  11.2k|  do {                                            \
  |  |   62|  11.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  11.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  11.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  11.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  11.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  11.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  11.2k|                                                  \
  |  |   69|  11.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  11.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  11.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  11.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  11.2k|                                                  \
  |  |   74|  11.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  11.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  11.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  11.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  11.2k|                                                  \
  |  |   79|  11.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  11.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  11.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 11.2k]
  |  |  ------------------
  ------------------
 2101|  11.2k|  btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   61|  11.2k|  do {                                            \
  |  |   62|  11.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  11.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  11.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  11.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  11.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  11.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  11.2k|                                                  \
  |  |   69|  11.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  11.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  11.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  11.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  11.2k|                                                  \
  |  |   74|  11.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  11.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  11.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  11.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  11.2k|                                                  \
  |  |   79|  11.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  11.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  11.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 11.2k]
  |  |  ------------------
  ------------------
 2102|  11.2k|  btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
  ------------------
  |  |   61|  11.2k|  do {                                            \
  |  |   62|  11.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  11.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  11.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  11.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  11.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  11.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  11.2k|                                                  \
  |  |   69|  11.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  11.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  11.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  11.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  11.2k|                                                  \
  |  |   74|  11.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  11.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  11.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  11.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  11.2k|                                                  \
  |  |   79|  11.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  11.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  11.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 11.2k]
  |  |  ------------------
  ------------------
 2103|  11.2k|  btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
  ------------------
  |  |   61|  11.2k|  do {                                            \
  |  |   62|  11.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  11.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  11.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  11.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  11.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  11.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  11.2k|                                                  \
  |  |   69|  11.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  11.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  11.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  11.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  11.2k|                                                  \
  |  |   74|  11.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  11.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  11.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  11.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  11.2k|                                                  \
  |  |   79|  11.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  11.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  11.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 11.2k]
  |  |  ------------------
  ------------------
 2104|  11.2k|  btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
  ------------------
  |  |   61|  11.2k|  do {                                            \
  |  |   62|  11.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  11.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  11.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  11.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  11.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  11.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  11.2k|                                                  \
  |  |   69|  11.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  11.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  11.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  11.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  11.2k|                                                  \
  |  |   74|  11.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  11.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  11.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  11.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  11.2k|                                                  \
  |  |   79|  11.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  11.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  11.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 11.2k]
  |  |  ------------------
  ------------------
 2105|  11.2k|  btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
  ------------------
  |  |   61|  11.2k|  do {                                            \
  |  |   62|  11.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  11.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  11.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  11.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  11.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  11.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  11.2k|                                                  \
  |  |   69|  11.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  11.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  11.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  11.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  11.2k|                                                  \
  |  |   74|  11.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  11.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  11.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  11.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  11.2k|                                                  \
  |  |   79|  11.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  11.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  11.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 11.2k]
  |  |  ------------------
  ------------------
 2106|       |
 2107|       |  // stage 3~9
 2108|  11.2k|  iadst16_stage3_ssse3(x);
 2109|  11.2k|  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
 2110|  11.2k|  iadst16_stage5_ssse3(x);
 2111|  11.2k|  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
 2112|  11.2k|  iadst16_stage7_ssse3(x);
 2113|  11.2k|  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
 2114|  11.2k|  iadst16_stage9_ssse3(output, x);
 2115|  11.2k|}
av1_inv_txfm_ssse3.c:idct32_low1_ssse3:
  597|  4.46k|static void idct32_low1_ssse3(const __m128i *input, __m128i *output) {
  598|  4.46k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  4.46k|#define INV_COS_BIT 12
  ------------------
  599|       |
  600|       |  // stage 1
  601|  4.46k|  __m128i x[2];
  602|  4.46k|  x[0] = input[0];
  603|       |
  604|       |  // stage 2
  605|       |  // stage 3
  606|       |  // stage 4
  607|       |  // stage 5
  608|  4.46k|  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   28|  4.46k|  do {                                          \
  |  |   29|  4.46k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  4.46k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  4.46k|    const __m128i _in = in;                     \
  |  |   32|  4.46k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  4.46k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  4.46k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 4.46k]
  |  |  ------------------
  ------------------
  609|       |
  610|       |  // stage 6
  611|       |  // stage 7
  612|       |  // stage 8
  613|       |  // stage 9
  614|  4.46k|  output[0] = x[0];
  615|  4.46k|  output[31] = x[0];
  616|  4.46k|  output[1] = x[1];
  617|  4.46k|  output[30] = x[1];
  618|  4.46k|  output[2] = x[1];
  619|  4.46k|  output[29] = x[1];
  620|  4.46k|  output[3] = x[0];
  621|  4.46k|  output[28] = x[0];
  622|  4.46k|  output[4] = x[0];
  623|  4.46k|  output[27] = x[0];
  624|  4.46k|  output[5] = x[1];
  625|  4.46k|  output[26] = x[1];
  626|  4.46k|  output[6] = x[1];
  627|  4.46k|  output[25] = x[1];
  628|  4.46k|  output[7] = x[0];
  629|  4.46k|  output[24] = x[0];
  630|  4.46k|  output[8] = x[0];
  631|  4.46k|  output[23] = x[0];
  632|  4.46k|  output[9] = x[1];
  633|  4.46k|  output[22] = x[1];
  634|  4.46k|  output[10] = x[1];
  635|  4.46k|  output[21] = x[1];
  636|  4.46k|  output[11] = x[0];
  637|  4.46k|  output[20] = x[0];
  638|  4.46k|  output[12] = x[0];
  639|  4.46k|  output[19] = x[0];
  640|  4.46k|  output[13] = x[1];
  641|  4.46k|  output[18] = x[1];
  642|  4.46k|  output[14] = x[1];
  643|  4.46k|  output[17] = x[1];
  644|  4.46k|  output[15] = x[0];
  645|  4.46k|  output[16] = x[0];
  646|  4.46k|}
av1_inv_txfm_ssse3.c:idct32_low8_ssse3:
  648|  11.8k|static void idct32_low8_ssse3(const __m128i *input, __m128i *output) {
  649|  11.8k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  11.8k|#define INV_COS_BIT 12
  ------------------
  650|  11.8k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  11.8k|#define INV_COS_BIT 12
  ------------------
  651|  11.8k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  11.8k|#define INV_COS_BIT 12
  ------------------
  652|       |
  653|       |  // stage 1
  654|  11.8k|  __m128i x[32];
  655|  11.8k|  x[0] = input[0];
  656|  11.8k|  x[4] = input[4];
  657|  11.8k|  x[8] = input[2];
  658|  11.8k|  x[12] = input[6];
  659|  11.8k|  x[16] = input[1];
  660|  11.8k|  x[20] = input[5];
  661|  11.8k|  x[24] = input[3];
  662|  11.8k|  x[28] = input[7];
  663|       |
  664|       |  // stage 2
  665|  11.8k|  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  ------------------
  |  |   28|  11.8k|  do {                                          \
  |  |   29|  11.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  11.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  11.8k|    const __m128i _in = in;                     \
  |  |   32|  11.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  11.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  11.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 11.8k]
  |  |  ------------------
  ------------------
  666|  11.8k|  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  ------------------
  |  |   28|  11.8k|  do {                                          \
  |  |   29|  11.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  11.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  11.8k|    const __m128i _in = in;                     \
  |  |   32|  11.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  11.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  11.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 11.8k]
  |  |  ------------------
  ------------------
  667|  11.8k|  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  ------------------
  |  |   28|  11.8k|  do {                                          \
  |  |   29|  11.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  11.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  11.8k|    const __m128i _in = in;                     \
  |  |   32|  11.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  11.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  11.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 11.8k]
  |  |  ------------------
  ------------------
  668|  11.8k|  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  ------------------
  |  |   28|  11.8k|  do {                                          \
  |  |   29|  11.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  11.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  11.8k|    const __m128i _in = in;                     \
  |  |   32|  11.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  11.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  11.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 11.8k]
  |  |  ------------------
  ------------------
  669|       |
  670|       |  // stage 3
  671|  11.8k|  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  ------------------
  |  |   28|  11.8k|  do {                                          \
  |  |   29|  11.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  11.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  11.8k|    const __m128i _in = in;                     \
  |  |   32|  11.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  11.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  11.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 11.8k]
  |  |  ------------------
  ------------------
  672|  11.8k|  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  ------------------
  |  |   28|  11.8k|  do {                                          \
  |  |   29|  11.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  11.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  11.8k|    const __m128i _in = in;                     \
  |  |   32|  11.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  11.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  11.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 11.8k]
  |  |  ------------------
  ------------------
  673|  11.8k|  x[17] = x[16];
  674|  11.8k|  x[18] = x[19];
  675|  11.8k|  x[21] = x[20];
  676|  11.8k|  x[22] = x[23];
  677|  11.8k|  x[25] = x[24];
  678|  11.8k|  x[26] = x[27];
  679|  11.8k|  x[29] = x[28];
  680|  11.8k|  x[30] = x[31];
  681|       |
  682|       |  // stage 4
  683|  11.8k|  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  ------------------
  |  |   28|  11.8k|  do {                                          \
  |  |   29|  11.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  11.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  11.8k|    const __m128i _in = in;                     \
  |  |   32|  11.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  11.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  11.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 11.8k]
  |  |  ------------------
  ------------------
  684|  11.8k|  x[9] = x[8];
  685|  11.8k|  x[10] = x[11];
  686|  11.8k|  x[13] = x[12];
  687|  11.8k|  x[14] = x[15];
  688|  11.8k|  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
  689|       |
  690|       |  // stage 5
  691|  11.8k|  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   28|  11.8k|  do {                                          \
  |  |   29|  11.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  11.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  11.8k|    const __m128i _in = in;                     \
  |  |   32|  11.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  11.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  11.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 11.8k]
  |  |  ------------------
  ------------------
  692|  11.8k|  x[5] = x[4];
  693|  11.8k|  x[6] = x[7];
  694|  11.8k|  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
  695|       |  // stage 6
  696|  11.8k|  x[3] = x[0];
  697|  11.8k|  x[2] = x[1];
  698|  11.8k|  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
  699|       |
  700|  11.8k|  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  701|  11.8k|  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  702|  11.8k|  idct32_stage9_sse2(output, x);
  703|  11.8k|}
av1_inv_txfm_ssse3.c:idct32_high16_stage4_sse2:
  488|  21.2k|                                             int8_t cos_bit) {
  489|  21.2k|  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  490|  21.2k|  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  491|  21.2k|  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  492|  21.2k|  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  493|  21.2k|  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  494|  21.2k|  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  495|  21.2k|  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  496|  21.2k|  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  497|  21.2k|  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  498|  21.2k|  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  499|  21.2k|}
av1_inv_txfm_ssse3.c:idct32_high24_stage5_sse2:
  503|  21.2k|                                             int8_t cos_bit) {
  504|  21.2k|  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  505|  21.2k|  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  506|  21.2k|  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  507|  21.2k|  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  508|  21.2k|  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  509|  21.2k|  btf_16_adds_subs_sse2(x[16], x[19]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  510|  21.2k|  btf_16_adds_subs_sse2(x[17], x[18]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  511|  21.2k|  btf_16_subs_adds_sse2(x[23], x[20]);
  ------------------
  |  |   45|  21.2k|  do {                                  \
  |  |   46|  21.2k|    const __m128i _in0 = in0;           \
  |  |   47|  21.2k|    const __m128i _in1 = in1;           \
  |  |   48|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  512|  21.2k|  btf_16_subs_adds_sse2(x[22], x[21]);
  ------------------
  |  |   45|  21.2k|  do {                                  \
  |  |   46|  21.2k|    const __m128i _in0 = in0;           \
  |  |   47|  21.2k|    const __m128i _in1 = in1;           \
  |  |   48|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  513|  21.2k|  btf_16_adds_subs_sse2(x[24], x[27]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  514|  21.2k|  btf_16_adds_subs_sse2(x[25], x[26]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  515|  21.2k|  btf_16_subs_adds_sse2(x[31], x[28]);
  ------------------
  |  |   45|  21.2k|  do {                                  \
  |  |   46|  21.2k|    const __m128i _in0 = in0;           \
  |  |   47|  21.2k|    const __m128i _in1 = in1;           \
  |  |   48|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  516|  21.2k|  btf_16_subs_adds_sse2(x[30], x[29]);
  ------------------
  |  |   45|  21.2k|  do {                                  \
  |  |   46|  21.2k|    const __m128i _in0 = in0;           \
  |  |   47|  21.2k|    const __m128i _in1 = in1;           \
  |  |   48|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  517|  21.2k|}
av1_inv_txfm_ssse3.c:idct32_high28_stage6_sse2:
  521|  21.2k|                                             int8_t cos_bit) {
  522|  21.2k|  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  523|  21.2k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  524|  21.2k|  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  525|  21.2k|  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  526|  21.2k|  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  527|  21.2k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  528|  21.2k|  btf_16_adds_subs_sse2(x[8], x[11]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  529|  21.2k|  btf_16_adds_subs_sse2(x[9], x[10]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  530|  21.2k|  btf_16_subs_adds_sse2(x[15], x[12]);
  ------------------
  |  |   45|  21.2k|  do {                                  \
  |  |   46|  21.2k|    const __m128i _in0 = in0;           \
  |  |   47|  21.2k|    const __m128i _in1 = in1;           \
  |  |   48|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  531|  21.2k|  btf_16_subs_adds_sse2(x[14], x[13]);
  ------------------
  |  |   45|  21.2k|  do {                                  \
  |  |   46|  21.2k|    const __m128i _in0 = in0;           \
  |  |   47|  21.2k|    const __m128i _in1 = in1;           \
  |  |   48|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  532|  21.2k|  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  533|  21.2k|  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  534|  21.2k|  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  535|  21.2k|  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  536|  21.2k|}
av1_inv_txfm_ssse3.c:idct32_stage7_sse2:
  540|  21.2k|                                      int8_t cos_bit) {
  541|  21.2k|  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  542|  21.2k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  543|  21.2k|  btf_16_adds_subs_sse2(x[0], x[7]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  544|  21.2k|  btf_16_adds_subs_sse2(x[1], x[6]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  545|  21.2k|  btf_16_adds_subs_sse2(x[2], x[5]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  546|  21.2k|  btf_16_adds_subs_sse2(x[3], x[4]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  547|  21.2k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  548|  21.2k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  549|  21.2k|  btf_16_adds_subs_sse2(x[16], x[23]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  550|  21.2k|  btf_16_adds_subs_sse2(x[17], x[22]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  551|  21.2k|  btf_16_adds_subs_sse2(x[18], x[21]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  552|  21.2k|  btf_16_adds_subs_sse2(x[19], x[20]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  553|  21.2k|  btf_16_subs_adds_sse2(x[31], x[24]);
  ------------------
  |  |   45|  21.2k|  do {                                  \
  |  |   46|  21.2k|    const __m128i _in0 = in0;           \
  |  |   47|  21.2k|    const __m128i _in1 = in1;           \
  |  |   48|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  554|  21.2k|  btf_16_subs_adds_sse2(x[30], x[25]);
  ------------------
  |  |   45|  21.2k|  do {                                  \
  |  |   46|  21.2k|    const __m128i _in0 = in0;           \
  |  |   47|  21.2k|    const __m128i _in1 = in1;           \
  |  |   48|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  555|  21.2k|  btf_16_subs_adds_sse2(x[29], x[26]);
  ------------------
  |  |   45|  21.2k|  do {                                  \
  |  |   46|  21.2k|    const __m128i _in0 = in0;           \
  |  |   47|  21.2k|    const __m128i _in1 = in1;           \
  |  |   48|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  556|  21.2k|  btf_16_subs_adds_sse2(x[28], x[27]);
  ------------------
  |  |   45|  21.2k|  do {                                  \
  |  |   46|  21.2k|    const __m128i _in0 = in0;           \
  |  |   47|  21.2k|    const __m128i _in1 = in1;           \
  |  |   48|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  557|  21.2k|}
av1_inv_txfm_ssse3.c:idct32_stage8_sse2:
  561|  21.2k|                                      int8_t cos_bit) {
  562|  21.2k|  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  563|  21.2k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  21.2k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  564|  21.2k|  btf_16_adds_subs_sse2(x[0], x[15]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  565|  21.2k|  btf_16_adds_subs_sse2(x[1], x[14]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  566|  21.2k|  btf_16_adds_subs_sse2(x[2], x[13]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  567|  21.2k|  btf_16_adds_subs_sse2(x[3], x[12]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  568|  21.2k|  btf_16_adds_subs_sse2(x[4], x[11]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  569|  21.2k|  btf_16_adds_subs_sse2(x[5], x[10]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  570|  21.2k|  btf_16_adds_subs_sse2(x[6], x[9]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  571|  21.2k|  btf_16_adds_subs_sse2(x[7], x[8]);
  ------------------
  |  |   37|  21.2k|  do {                                  \
  |  |   38|  21.2k|    const __m128i _in0 = in0;           \
  |  |   39|  21.2k|    const __m128i _in1 = in1;           \
  |  |   40|  21.2k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  21.2k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  572|  21.2k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  573|  21.2k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  574|  21.2k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  575|  21.2k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
  ------------------
  |  |   61|  21.2k|  do {                                            \
  |  |   62|  21.2k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  21.2k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  21.2k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  21.2k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  21.2k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  21.2k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  21.2k|                                                  \
  |  |   69|  21.2k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  21.2k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  21.2k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  21.2k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  21.2k|                                                  \
  |  |   74|  21.2k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  21.2k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  21.2k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  21.2k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  21.2k|                                                  \
  |  |   79|  21.2k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  21.2k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  576|  21.2k|}
av1_inv_txfm_ssse3.c:idct32_stage9_sse2:
  578|  21.2k|static inline void idct32_stage9_sse2(__m128i *output, __m128i *x) {
  579|  21.2k|  btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
  ------------------
  |  |   53|  21.2k|  do {                                                  \
  |  |   54|  21.2k|    const __m128i _in0 = in0;                           \
  |  |   55|  21.2k|    const __m128i _in1 = in1;                           \
  |  |   56|  21.2k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  21.2k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  580|  21.2k|  btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
  ------------------
  |  |   53|  21.2k|  do {                                                  \
  |  |   54|  21.2k|    const __m128i _in0 = in0;                           \
  |  |   55|  21.2k|    const __m128i _in1 = in1;                           \
  |  |   56|  21.2k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  21.2k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  581|  21.2k|  btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
  ------------------
  |  |   53|  21.2k|  do {                                                  \
  |  |   54|  21.2k|    const __m128i _in0 = in0;                           \
  |  |   55|  21.2k|    const __m128i _in1 = in1;                           \
  |  |   56|  21.2k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  21.2k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  582|  21.2k|  btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
  ------------------
  |  |   53|  21.2k|  do {                                                  \
  |  |   54|  21.2k|    const __m128i _in0 = in0;                           \
  |  |   55|  21.2k|    const __m128i _in1 = in1;                           \
  |  |   56|  21.2k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  21.2k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  583|  21.2k|  btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
  ------------------
  |  |   53|  21.2k|  do {                                                  \
  |  |   54|  21.2k|    const __m128i _in0 = in0;                           \
  |  |   55|  21.2k|    const __m128i _in1 = in1;                           \
  |  |   56|  21.2k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  21.2k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  584|  21.2k|  btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
  ------------------
  |  |   53|  21.2k|  do {                                                  \
  |  |   54|  21.2k|    const __m128i _in0 = in0;                           \
  |  |   55|  21.2k|    const __m128i _in1 = in1;                           \
  |  |   56|  21.2k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  21.2k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  585|  21.2k|  btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
  ------------------
  |  |   53|  21.2k|  do {                                                  \
  |  |   54|  21.2k|    const __m128i _in0 = in0;                           \
  |  |   55|  21.2k|    const __m128i _in1 = in1;                           \
  |  |   56|  21.2k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  21.2k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  586|  21.2k|  btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
  ------------------
  |  |   53|  21.2k|  do {                                                  \
  |  |   54|  21.2k|    const __m128i _in0 = in0;                           \
  |  |   55|  21.2k|    const __m128i _in1 = in1;                           \
  |  |   56|  21.2k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  21.2k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  587|  21.2k|  btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
  ------------------
  |  |   53|  21.2k|  do {                                                  \
  |  |   54|  21.2k|    const __m128i _in0 = in0;                           \
  |  |   55|  21.2k|    const __m128i _in1 = in1;                           \
  |  |   56|  21.2k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  21.2k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  588|  21.2k|  btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
  ------------------
  |  |   53|  21.2k|  do {                                                  \
  |  |   54|  21.2k|    const __m128i _in0 = in0;                           \
  |  |   55|  21.2k|    const __m128i _in1 = in1;                           \
  |  |   56|  21.2k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  21.2k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  589|  21.2k|  btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
  ------------------
  |  |   53|  21.2k|  do {                                                  \
  |  |   54|  21.2k|    const __m128i _in0 = in0;                           \
  |  |   55|  21.2k|    const __m128i _in1 = in1;                           \
  |  |   56|  21.2k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  21.2k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  590|  21.2k|  btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
  ------------------
  |  |   53|  21.2k|  do {                                                  \
  |  |   54|  21.2k|    const __m128i _in0 = in0;                           \
  |  |   55|  21.2k|    const __m128i _in1 = in1;                           \
  |  |   56|  21.2k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  21.2k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  591|  21.2k|  btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
  ------------------
  |  |   53|  21.2k|  do {                                                  \
  |  |   54|  21.2k|    const __m128i _in0 = in0;                           \
  |  |   55|  21.2k|    const __m128i _in1 = in1;                           \
  |  |   56|  21.2k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  21.2k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  592|  21.2k|  btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
  ------------------
  |  |   53|  21.2k|  do {                                                  \
  |  |   54|  21.2k|    const __m128i _in0 = in0;                           \
  |  |   55|  21.2k|    const __m128i _in1 = in1;                           \
  |  |   56|  21.2k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  21.2k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  593|  21.2k|  btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
  ------------------
  |  |   53|  21.2k|  do {                                                  \
  |  |   54|  21.2k|    const __m128i _in0 = in0;                           \
  |  |   55|  21.2k|    const __m128i _in1 = in1;                           \
  |  |   56|  21.2k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  21.2k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  594|  21.2k|  btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
  ------------------
  |  |   53|  21.2k|  do {                                                  \
  |  |   54|  21.2k|    const __m128i _in0 = in0;                           \
  |  |   55|  21.2k|    const __m128i _in1 = in1;                           \
  |  |   56|  21.2k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  21.2k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  21.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 21.2k]
  |  |  ------------------
  ------------------
  595|  21.2k|}
av1_inv_txfm_ssse3.c:idct32_low16_ssse3:
  705|  5.28k|static void idct32_low16_ssse3(const __m128i *input, __m128i *output) {
  706|  5.28k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  5.28k|#define INV_COS_BIT 12
  ------------------
  707|  5.28k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  5.28k|#define INV_COS_BIT 12
  ------------------
  708|  5.28k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  5.28k|#define INV_COS_BIT 12
  ------------------
  709|       |
  710|       |  // stage 1
  711|  5.28k|  __m128i x[32];
  712|  5.28k|  x[0] = input[0];
  713|  5.28k|  x[2] = input[8];
  714|  5.28k|  x[4] = input[4];
  715|  5.28k|  x[6] = input[12];
  716|  5.28k|  x[8] = input[2];
  717|  5.28k|  x[10] = input[10];
  718|  5.28k|  x[12] = input[6];
  719|  5.28k|  x[14] = input[14];
  720|  5.28k|  x[16] = input[1];
  721|  5.28k|  x[18] = input[9];
  722|  5.28k|  x[20] = input[5];
  723|  5.28k|  x[22] = input[13];
  724|  5.28k|  x[24] = input[3];
  725|  5.28k|  x[26] = input[11];
  726|  5.28k|  x[28] = input[7];
  727|  5.28k|  x[30] = input[15];
  728|       |
  729|       |  // stage 2
  730|  5.28k|  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  ------------------
  |  |   28|  5.28k|  do {                                          \
  |  |   29|  5.28k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  5.28k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  5.28k|    const __m128i _in = in;                     \
  |  |   32|  5.28k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  5.28k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  731|  5.28k|  btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
  ------------------
  |  |   28|  5.28k|  do {                                          \
  |  |   29|  5.28k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  5.28k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  5.28k|    const __m128i _in = in;                     \
  |  |   32|  5.28k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  5.28k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  732|  5.28k|  btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
  ------------------
  |  |   28|  5.28k|  do {                                          \
  |  |   29|  5.28k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  5.28k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  5.28k|    const __m128i _in = in;                     \
  |  |   32|  5.28k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  5.28k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  733|  5.28k|  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  ------------------
  |  |   28|  5.28k|  do {                                          \
  |  |   29|  5.28k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  5.28k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  5.28k|    const __m128i _in = in;                     \
  |  |   32|  5.28k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  5.28k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  734|  5.28k|  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  ------------------
  |  |   28|  5.28k|  do {                                          \
  |  |   29|  5.28k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  5.28k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  5.28k|    const __m128i _in = in;                     \
  |  |   32|  5.28k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  5.28k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  735|  5.28k|  btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
  ------------------
  |  |   28|  5.28k|  do {                                          \
  |  |   29|  5.28k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  5.28k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  5.28k|    const __m128i _in = in;                     \
  |  |   32|  5.28k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  5.28k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  736|  5.28k|  btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
  ------------------
  |  |   28|  5.28k|  do {                                          \
  |  |   29|  5.28k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  5.28k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  5.28k|    const __m128i _in = in;                     \
  |  |   32|  5.28k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  5.28k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  737|  5.28k|  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  ------------------
  |  |   28|  5.28k|  do {                                          \
  |  |   29|  5.28k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  5.28k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  5.28k|    const __m128i _in = in;                     \
  |  |   32|  5.28k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  5.28k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  738|       |
  739|       |  // stage 3
  740|  5.28k|  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  ------------------
  |  |   28|  5.28k|  do {                                          \
  |  |   29|  5.28k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  5.28k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  5.28k|    const __m128i _in = in;                     \
  |  |   32|  5.28k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  5.28k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  741|  5.28k|  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  ------------------
  |  |   28|  5.28k|  do {                                          \
  |  |   29|  5.28k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  5.28k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  5.28k|    const __m128i _in = in;                     \
  |  |   32|  5.28k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  5.28k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  742|  5.28k|  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  ------------------
  |  |   28|  5.28k|  do {                                          \
  |  |   29|  5.28k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  5.28k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  5.28k|    const __m128i _in = in;                     \
  |  |   32|  5.28k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  5.28k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  743|  5.28k|  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  ------------------
  |  |   28|  5.28k|  do {                                          \
  |  |   29|  5.28k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  5.28k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  5.28k|    const __m128i _in = in;                     \
  |  |   32|  5.28k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  5.28k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  744|  5.28k|  idct32_high16_stage3_sse2(x);
  745|       |
  746|       |  // stage 4
  747|  5.28k|  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  ------------------
  |  |   28|  5.28k|  do {                                          \
  |  |   29|  5.28k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  5.28k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  5.28k|    const __m128i _in = in;                     \
  |  |   32|  5.28k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  5.28k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  748|  5.28k|  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  ------------------
  |  |   28|  5.28k|  do {                                          \
  |  |   29|  5.28k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  5.28k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  5.28k|    const __m128i _in = in;                     \
  |  |   32|  5.28k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  5.28k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  749|  5.28k|  btf_16_adds_subs_sse2(x[8], x[9]);
  ------------------
  |  |   37|  5.28k|  do {                                  \
  |  |   38|  5.28k|    const __m128i _in0 = in0;           \
  |  |   39|  5.28k|    const __m128i _in1 = in1;           \
  |  |   40|  5.28k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  5.28k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  750|  5.28k|  btf_16_subs_adds_sse2(x[11], x[10]);
  ------------------
  |  |   45|  5.28k|  do {                                  \
  |  |   46|  5.28k|    const __m128i _in0 = in0;           \
  |  |   47|  5.28k|    const __m128i _in1 = in1;           \
  |  |   48|  5.28k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  5.28k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  751|  5.28k|  btf_16_adds_subs_sse2(x[12], x[13]);
  ------------------
  |  |   37|  5.28k|  do {                                  \
  |  |   38|  5.28k|    const __m128i _in0 = in0;           \
  |  |   39|  5.28k|    const __m128i _in1 = in1;           \
  |  |   40|  5.28k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  5.28k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  752|  5.28k|  btf_16_subs_adds_sse2(x[15], x[14]);
  ------------------
  |  |   45|  5.28k|  do {                                  \
  |  |   46|  5.28k|    const __m128i _in0 = in0;           \
  |  |   47|  5.28k|    const __m128i _in1 = in1;           \
  |  |   48|  5.28k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  5.28k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  753|  5.28k|  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
  754|       |
  755|       |  // stage 5
  756|  5.28k|  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   28|  5.28k|  do {                                          \
  |  |   29|  5.28k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  5.28k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  5.28k|    const __m128i _in = in;                     \
  |  |   32|  5.28k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  5.28k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  757|  5.28k|  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  ------------------
  |  |   28|  5.28k|  do {                                          \
  |  |   29|  5.28k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  5.28k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  5.28k|    const __m128i _in = in;                     \
  |  |   32|  5.28k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  5.28k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  758|  5.28k|  btf_16_adds_subs_sse2(x[4], x[5]);
  ------------------
  |  |   37|  5.28k|  do {                                  \
  |  |   38|  5.28k|    const __m128i _in0 = in0;           \
  |  |   39|  5.28k|    const __m128i _in1 = in1;           \
  |  |   40|  5.28k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  5.28k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  759|  5.28k|  btf_16_subs_adds_sse2(x[7], x[6]);
  ------------------
  |  |   45|  5.28k|  do {                                  \
  |  |   46|  5.28k|    const __m128i _in0 = in0;           \
  |  |   47|  5.28k|    const __m128i _in1 = in1;           \
  |  |   48|  5.28k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  5.28k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  760|  5.28k|  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
  761|       |
  762|  5.28k|  btf_16_adds_subs_sse2(x[0], x[3]);
  ------------------
  |  |   37|  5.28k|  do {                                  \
  |  |   38|  5.28k|    const __m128i _in0 = in0;           \
  |  |   39|  5.28k|    const __m128i _in1 = in1;           \
  |  |   40|  5.28k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  5.28k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  763|  5.28k|  btf_16_adds_subs_sse2(x[1], x[2]);
  ------------------
  |  |   37|  5.28k|  do {                                  \
  |  |   38|  5.28k|    const __m128i _in0 = in0;           \
  |  |   39|  5.28k|    const __m128i _in1 = in1;           \
  |  |   40|  5.28k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  5.28k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  5.28k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 5.28k]
  |  |  ------------------
  ------------------
  764|  5.28k|  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
  765|       |
  766|  5.28k|  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  767|  5.28k|  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  768|  5.28k|  idct32_stage9_sse2(output, x);
  769|  5.28k|}
av1_inv_txfm_ssse3.c:idct32_high16_stage3_sse2:
  475|  9.35k|static inline void idct32_high16_stage3_sse2(__m128i *x) {
  476|  9.35k|  btf_16_adds_subs_sse2(x[16], x[17]);
  ------------------
  |  |   37|  9.35k|  do {                                  \
  |  |   38|  9.35k|    const __m128i _in0 = in0;           \
  |  |   39|  9.35k|    const __m128i _in1 = in1;           \
  |  |   40|  9.35k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  9.35k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  9.35k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 9.35k]
  |  |  ------------------
  ------------------
  477|  9.35k|  btf_16_subs_adds_sse2(x[19], x[18]);
  ------------------
  |  |   45|  9.35k|  do {                                  \
  |  |   46|  9.35k|    const __m128i _in0 = in0;           \
  |  |   47|  9.35k|    const __m128i _in1 = in1;           \
  |  |   48|  9.35k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  9.35k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  9.35k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 9.35k]
  |  |  ------------------
  ------------------
  478|  9.35k|  btf_16_adds_subs_sse2(x[20], x[21]);
  ------------------
  |  |   37|  9.35k|  do {                                  \
  |  |   38|  9.35k|    const __m128i _in0 = in0;           \
  |  |   39|  9.35k|    const __m128i _in1 = in1;           \
  |  |   40|  9.35k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  9.35k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  9.35k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 9.35k]
  |  |  ------------------
  ------------------
  479|  9.35k|  btf_16_subs_adds_sse2(x[23], x[22]);
  ------------------
  |  |   45|  9.35k|  do {                                  \
  |  |   46|  9.35k|    const __m128i _in0 = in0;           \
  |  |   47|  9.35k|    const __m128i _in1 = in1;           \
  |  |   48|  9.35k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  9.35k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  9.35k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 9.35k]
  |  |  ------------------
  ------------------
  480|  9.35k|  btf_16_adds_subs_sse2(x[24], x[25]);
  ------------------
  |  |   37|  9.35k|  do {                                  \
  |  |   38|  9.35k|    const __m128i _in0 = in0;           \
  |  |   39|  9.35k|    const __m128i _in1 = in1;           \
  |  |   40|  9.35k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  9.35k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  9.35k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 9.35k]
  |  |  ------------------
  ------------------
  481|  9.35k|  btf_16_subs_adds_sse2(x[27], x[26]);
  ------------------
  |  |   45|  9.35k|  do {                                  \
  |  |   46|  9.35k|    const __m128i _in0 = in0;           \
  |  |   47|  9.35k|    const __m128i _in1 = in1;           \
  |  |   48|  9.35k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  9.35k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  9.35k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 9.35k]
  |  |  ------------------
  ------------------
  482|  9.35k|  btf_16_adds_subs_sse2(x[28], x[29]);
  ------------------
  |  |   37|  9.35k|  do {                                  \
  |  |   38|  9.35k|    const __m128i _in0 = in0;           \
  |  |   39|  9.35k|    const __m128i _in1 = in1;           \
  |  |   40|  9.35k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  9.35k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  9.35k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 9.35k]
  |  |  ------------------
  ------------------
  483|  9.35k|  btf_16_subs_adds_sse2(x[31], x[30]);
  ------------------
  |  |   45|  9.35k|  do {                                  \
  |  |   46|  9.35k|    const __m128i _in0 = in0;           \
  |  |   47|  9.35k|    const __m128i _in1 = in1;           \
  |  |   48|  9.35k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  9.35k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  9.35k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 9.35k]
  |  |  ------------------
  ------------------
  484|  9.35k|}
av1_inv_txfm_ssse3.c:idct32_sse2:
  771|  4.07k|static void idct32_sse2(const __m128i *input, __m128i *output) {
  772|  4.07k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  4.07k|#define INV_COS_BIT 12
  ------------------
  773|  4.07k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  4.07k|#define INV_COS_BIT 12
  ------------------
  774|  4.07k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  4.07k|#define INV_COS_BIT 12
  ------------------
  775|       |
  776|  4.07k|  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  777|  4.07k|  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  778|  4.07k|  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  779|  4.07k|  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  780|  4.07k|  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  781|  4.07k|  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  782|  4.07k|  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  783|  4.07k|  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  784|  4.07k|  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  785|  4.07k|  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  786|  4.07k|  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  787|  4.07k|  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  788|  4.07k|  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  789|  4.07k|  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  790|  4.07k|  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  791|  4.07k|  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  792|  4.07k|  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  793|  4.07k|  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  794|  4.07k|  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  795|  4.07k|  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  796|  4.07k|  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  797|  4.07k|  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  798|  4.07k|  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  799|  4.07k|  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  800|  4.07k|  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  801|  4.07k|  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  802|  4.07k|  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  803|  4.07k|  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  804|  4.07k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  805|  4.07k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  806|  4.07k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  807|  4.07k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|  4.07k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  808|       |
  809|       |  // stage 1
  810|  4.07k|  __m128i x[32];
  811|  4.07k|  x[0] = input[0];
  812|  4.07k|  x[1] = input[16];
  813|  4.07k|  x[2] = input[8];
  814|  4.07k|  x[3] = input[24];
  815|  4.07k|  x[4] = input[4];
  816|  4.07k|  x[5] = input[20];
  817|  4.07k|  x[6] = input[12];
  818|  4.07k|  x[7] = input[28];
  819|  4.07k|  x[8] = input[2];
  820|  4.07k|  x[9] = input[18];
  821|  4.07k|  x[10] = input[10];
  822|  4.07k|  x[11] = input[26];
  823|  4.07k|  x[12] = input[6];
  824|  4.07k|  x[13] = input[22];
  825|  4.07k|  x[14] = input[14];
  826|  4.07k|  x[15] = input[30];
  827|  4.07k|  x[16] = input[1];
  828|  4.07k|  x[17] = input[17];
  829|  4.07k|  x[18] = input[9];
  830|  4.07k|  x[19] = input[25];
  831|  4.07k|  x[20] = input[5];
  832|  4.07k|  x[21] = input[21];
  833|  4.07k|  x[22] = input[13];
  834|  4.07k|  x[23] = input[29];
  835|  4.07k|  x[24] = input[3];
  836|  4.07k|  x[25] = input[19];
  837|  4.07k|  x[26] = input[11];
  838|  4.07k|  x[27] = input[27];
  839|  4.07k|  x[28] = input[7];
  840|  4.07k|  x[29] = input[23];
  841|  4.07k|  x[30] = input[15];
  842|  4.07k|  x[31] = input[31];
  843|       |
  844|       |  // stage 2
  845|  4.07k|  btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
  ------------------
  |  |   61|  4.07k|  do {                                            \
  |  |   62|  4.07k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.07k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.07k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.07k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.07k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.07k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.07k|                                                  \
  |  |   69|  4.07k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.07k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.07k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.07k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.07k|                                                  \
  |  |   74|  4.07k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.07k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.07k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.07k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.07k|                                                  \
  |  |   79|  4.07k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.07k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  846|  4.07k|  btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
  ------------------
  |  |   61|  4.07k|  do {                                            \
  |  |   62|  4.07k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.07k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.07k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.07k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.07k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.07k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.07k|                                                  \
  |  |   69|  4.07k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.07k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.07k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.07k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.07k|                                                  \
  |  |   74|  4.07k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.07k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.07k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.07k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.07k|                                                  \
  |  |   79|  4.07k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.07k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  847|  4.07k|  btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
  ------------------
  |  |   61|  4.07k|  do {                                            \
  |  |   62|  4.07k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.07k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.07k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.07k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.07k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.07k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.07k|                                                  \
  |  |   69|  4.07k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.07k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.07k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.07k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.07k|                                                  \
  |  |   74|  4.07k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.07k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.07k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.07k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.07k|                                                  \
  |  |   79|  4.07k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.07k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  848|  4.07k|  btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
  ------------------
  |  |   61|  4.07k|  do {                                            \
  |  |   62|  4.07k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.07k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.07k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.07k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.07k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.07k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.07k|                                                  \
  |  |   69|  4.07k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.07k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.07k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.07k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.07k|                                                  \
  |  |   74|  4.07k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.07k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.07k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.07k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.07k|                                                  \
  |  |   79|  4.07k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.07k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  849|  4.07k|  btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
  ------------------
  |  |   61|  4.07k|  do {                                            \
  |  |   62|  4.07k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.07k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.07k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.07k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.07k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.07k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.07k|                                                  \
  |  |   69|  4.07k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.07k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.07k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.07k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.07k|                                                  \
  |  |   74|  4.07k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.07k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.07k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.07k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.07k|                                                  \
  |  |   79|  4.07k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.07k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  850|  4.07k|  btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
  ------------------
  |  |   61|  4.07k|  do {                                            \
  |  |   62|  4.07k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.07k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.07k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.07k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.07k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.07k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.07k|                                                  \
  |  |   69|  4.07k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.07k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.07k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.07k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.07k|                                                  \
  |  |   74|  4.07k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.07k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.07k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.07k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.07k|                                                  \
  |  |   79|  4.07k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.07k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  851|  4.07k|  btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
  ------------------
  |  |   61|  4.07k|  do {                                            \
  |  |   62|  4.07k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.07k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.07k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.07k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.07k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.07k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.07k|                                                  \
  |  |   69|  4.07k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.07k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.07k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.07k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.07k|                                                  \
  |  |   74|  4.07k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.07k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.07k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.07k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.07k|                                                  \
  |  |   79|  4.07k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.07k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  852|  4.07k|  btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);
  ------------------
  |  |   61|  4.07k|  do {                                            \
  |  |   62|  4.07k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.07k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.07k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.07k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.07k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.07k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.07k|                                                  \
  |  |   69|  4.07k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.07k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.07k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.07k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.07k|                                                  \
  |  |   74|  4.07k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.07k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.07k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.07k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.07k|                                                  \
  |  |   79|  4.07k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.07k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  853|       |
  854|       |  // stage 3
  855|  4.07k|  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  ------------------
  |  |   61|  4.07k|  do {                                            \
  |  |   62|  4.07k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.07k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.07k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.07k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.07k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.07k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.07k|                                                  \
  |  |   69|  4.07k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.07k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.07k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.07k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.07k|                                                  \
  |  |   74|  4.07k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.07k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.07k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.07k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.07k|                                                  \
  |  |   79|  4.07k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.07k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  856|  4.07k|  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  ------------------
  |  |   61|  4.07k|  do {                                            \
  |  |   62|  4.07k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.07k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.07k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.07k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.07k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.07k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.07k|                                                  \
  |  |   69|  4.07k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.07k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.07k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.07k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.07k|                                                  \
  |  |   74|  4.07k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.07k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.07k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.07k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.07k|                                                  \
  |  |   79|  4.07k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.07k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  857|  4.07k|  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   61|  4.07k|  do {                                            \
  |  |   62|  4.07k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.07k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.07k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.07k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.07k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.07k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.07k|                                                  \
  |  |   69|  4.07k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.07k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.07k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.07k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.07k|                                                  \
  |  |   74|  4.07k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.07k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.07k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.07k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.07k|                                                  \
  |  |   79|  4.07k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.07k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  858|  4.07k|  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
  ------------------
  |  |   61|  4.07k|  do {                                            \
  |  |   62|  4.07k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.07k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.07k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.07k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.07k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.07k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.07k|                                                  \
  |  |   69|  4.07k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.07k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.07k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.07k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.07k|                                                  \
  |  |   74|  4.07k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.07k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.07k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.07k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.07k|                                                  \
  |  |   79|  4.07k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.07k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  859|  4.07k|  idct32_high16_stage3_sse2(x);
  860|       |
  861|       |  // stage 4
  862|  4.07k|  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  ------------------
  |  |   61|  4.07k|  do {                                            \
  |  |   62|  4.07k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.07k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.07k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.07k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.07k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.07k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.07k|                                                  \
  |  |   69|  4.07k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.07k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.07k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.07k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.07k|                                                  \
  |  |   74|  4.07k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.07k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.07k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.07k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.07k|                                                  \
  |  |   79|  4.07k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.07k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  863|  4.07k|  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   61|  4.07k|  do {                                            \
  |  |   62|  4.07k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.07k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.07k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.07k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.07k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.07k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.07k|                                                  \
  |  |   69|  4.07k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.07k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.07k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.07k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.07k|                                                  \
  |  |   74|  4.07k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.07k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.07k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.07k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.07k|                                                  \
  |  |   79|  4.07k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.07k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  864|  4.07k|  btf_16_adds_subs_sse2(x[8], x[9]);
  ------------------
  |  |   37|  4.07k|  do {                                  \
  |  |   38|  4.07k|    const __m128i _in0 = in0;           \
  |  |   39|  4.07k|    const __m128i _in1 = in1;           \
  |  |   40|  4.07k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  4.07k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  865|  4.07k|  btf_16_subs_adds_sse2(x[11], x[10]);
  ------------------
  |  |   45|  4.07k|  do {                                  \
  |  |   46|  4.07k|    const __m128i _in0 = in0;           \
  |  |   47|  4.07k|    const __m128i _in1 = in1;           \
  |  |   48|  4.07k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  4.07k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  866|  4.07k|  btf_16_adds_subs_sse2(x[12], x[13]);
  ------------------
  |  |   37|  4.07k|  do {                                  \
  |  |   38|  4.07k|    const __m128i _in0 = in0;           \
  |  |   39|  4.07k|    const __m128i _in1 = in1;           \
  |  |   40|  4.07k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  4.07k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  867|  4.07k|  btf_16_subs_adds_sse2(x[15], x[14]);
  ------------------
  |  |   45|  4.07k|  do {                                  \
  |  |   46|  4.07k|    const __m128i _in0 = in0;           \
  |  |   47|  4.07k|    const __m128i _in1 = in1;           \
  |  |   48|  4.07k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  4.07k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  868|  4.07k|  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
  869|       |
  870|       |  // stage 5
  871|  4.07k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   61|  4.07k|  do {                                            \
  |  |   62|  4.07k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.07k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.07k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.07k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.07k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.07k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.07k|                                                  \
  |  |   69|  4.07k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.07k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.07k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.07k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.07k|                                                  \
  |  |   74|  4.07k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.07k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.07k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.07k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.07k|                                                  \
  |  |   79|  4.07k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.07k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  872|  4.07k|  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|  4.07k|  do {                                            \
  |  |   62|  4.07k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  4.07k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  4.07k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  4.07k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  4.07k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  4.07k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  4.07k|                                                  \
  |  |   69|  4.07k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  4.07k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  4.07k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  4.07k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  4.07k|                                                  \
  |  |   74|  4.07k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  4.07k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  4.07k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  4.07k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  4.07k|                                                  \
  |  |   79|  4.07k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  4.07k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  873|  4.07k|  btf_16_adds_subs_sse2(x[4], x[5]);
  ------------------
  |  |   37|  4.07k|  do {                                  \
  |  |   38|  4.07k|    const __m128i _in0 = in0;           \
  |  |   39|  4.07k|    const __m128i _in1 = in1;           \
  |  |   40|  4.07k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  4.07k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  874|  4.07k|  btf_16_adds_subs_sse2(x[7], x[6]);
  ------------------
  |  |   37|  4.07k|  do {                                  \
  |  |   38|  4.07k|    const __m128i _in0 = in0;           \
  |  |   39|  4.07k|    const __m128i _in1 = in1;           \
  |  |   40|  4.07k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  4.07k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  875|  4.07k|  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
  876|       |
  877|       |  // stage 6
  878|  4.07k|  btf_16_adds_subs_sse2(x[0], x[3]);
  ------------------
  |  |   37|  4.07k|  do {                                  \
  |  |   38|  4.07k|    const __m128i _in0 = in0;           \
  |  |   39|  4.07k|    const __m128i _in1 = in1;           \
  |  |   40|  4.07k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  4.07k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  879|  4.07k|  btf_16_adds_subs_sse2(x[1], x[2]);
  ------------------
  |  |   37|  4.07k|  do {                                  \
  |  |   38|  4.07k|    const __m128i _in0 = in0;           \
  |  |   39|  4.07k|    const __m128i _in1 = in1;           \
  |  |   40|  4.07k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  4.07k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  4.07k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 4.07k]
  |  |  ------------------
  ------------------
  880|  4.07k|  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
  881|       |
  882|       |  // stage 7~8
  883|  4.07k|  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  884|  4.07k|  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  885|  4.07k|  idct32_stage9_sse2(output, x);
  886|  4.07k|}
av1_inv_txfm_ssse3.c:lowbd_get_recon_8x8_sse2:
 2236|  1.38M|                                               __m128i res) {
 2237|  1.38M|  const __m128i zero = _mm_setzero_si128();
 2238|  1.38M|  __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
 2239|  1.38M|  return _mm_packus_epi16(x0, x0);
 2240|  1.38M|}
av1_inv_txfm_ssse3.c:round_shift_ssse3:
 2467|   169k|                                     int size) {
 2468|   169k|  const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8);
 2469|  1.53M|  for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (2469:19): [True: 1.36M, False: 169k]
  ------------------
 2470|  1.36M|    output[i] = _mm_mulhrs_epi16(input[i], scale);
 2471|  1.36M|  }
 2472|   169k|}
av1_inv_txfm_ssse3.c:lowbd_inv_txfm2d_add_4x4_ssse3:
 2412|   109k|                                           int eob) {
 2413|   109k|  (void)tx_size_;
 2414|   109k|  (void)eob;
 2415|   109k|  __m128i buf[4];
 2416|   109k|  const TX_SIZE tx_size = TX_4X4;
 2417|   109k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2418|   109k|  const int txw_idx = get_txw_idx(tx_size);
 2419|   109k|  const int txh_idx = get_txh_idx(tx_size);
 2420|   109k|  const int txfm_size_col = tx_size_wide[tx_size];
 2421|   109k|  const int txfm_size_row = tx_size_high[tx_size];
 2422|       |
 2423|   109k|  const transform_1d_ssse3 row_txfm =
 2424|   109k|      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
 2425|   109k|  const transform_1d_ssse3 col_txfm =
 2426|   109k|      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
 2427|       |
 2428|   109k|  int ud_flip, lr_flip;
 2429|   109k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2430|   109k|  load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
 2431|   109k|  row_txfm(buf, buf);
 2432|   109k|  if (lr_flip) {
  ------------------
  |  Branch (2432:7): [True: 2.26k, False: 107k]
  ------------------
 2433|  2.26k|    __m128i temp[4];
 2434|  2.26k|    flip_buf_sse2(buf, temp, txfm_size_col);
 2435|  2.26k|    transpose_16bit_4x4(temp, buf);
 2436|   107k|  } else {
 2437|   107k|    transpose_16bit_4x4(buf, buf);
 2438|   107k|  }
 2439|   109k|  col_txfm(buf, buf);
 2440|   109k|  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
 2441|   109k|  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
 2442|   109k|}
av1_inv_txfm_ssse3.c:idct4_w4_sse2:
   53|   101k|static void idct4_w4_sse2(const __m128i *input, __m128i *output) {
   54|   101k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|   101k|#define INV_COS_BIT 12
  ------------------
   55|   101k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   101k|#define INV_COS_BIT 12
  ------------------
   56|   101k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   101k|#define INV_COS_BIT 12
  ------------------
   57|       |
   58|   101k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|   101k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
   59|   101k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|   101k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
   60|   101k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|   101k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
   61|   101k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|   101k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
   62|       |
   63|       |  // stage 1
   64|   101k|  __m128i x[4];
   65|   101k|  x[0] = input[0];
   66|   101k|  x[1] = input[2];
   67|   101k|  x[2] = input[1];
   68|   101k|  x[3] = input[3];
   69|       |
   70|       |  // stage 2
   71|   101k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   45|   101k|  do {                                               \
  |  |   46|   101k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   101k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   101k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   101k|                                                     \
  |  |   50|   101k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   101k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   101k|                                                     \
  |  |   53|   101k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   101k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   101k|                                                     \
  |  |   56|   101k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   101k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   101k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 101k]
  |  |  ------------------
  ------------------
   72|   101k|  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   45|   101k|  do {                                               \
  |  |   46|   101k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   101k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   101k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   101k|                                                     \
  |  |   50|   101k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   101k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   101k|                                                     \
  |  |   53|   101k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   101k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   101k|                                                     \
  |  |   56|   101k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   101k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   101k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 101k]
  |  |  ------------------
  ------------------
   73|       |
   74|       |  // stage 3
   75|   101k|  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
  ------------------
  |  |   53|   101k|  do {                                                  \
  |  |   54|   101k|    const __m128i _in0 = in0;                           \
  |  |   55|   101k|    const __m128i _in1 = in1;                           \
  |  |   56|   101k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   101k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   101k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 101k]
  |  |  ------------------
  ------------------
   76|   101k|  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
  ------------------
  |  |   53|   101k|  do {                                                  \
  |  |   54|   101k|    const __m128i _in0 = in0;                           \
  |  |   55|   101k|    const __m128i _in1 = in1;                           \
  |  |   56|   101k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   101k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   101k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 101k]
  |  |  ------------------
  ------------------
   77|   101k|}
av1_inv_txfm_ssse3.c:iadst4_w4_sse2:
 1657|  84.8k|static void iadst4_w4_sse2(const __m128i *input, __m128i *output) {
 1658|  84.8k|  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  84.8k|#define INV_COS_BIT 12
  ------------------
 1659|  84.8k|  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
  ------------------
  |  |   20|  84.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1660|  84.8k|  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
  ------------------
  |  |   20|  84.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1661|  84.8k|  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
  ------------------
  |  |   20|  84.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1662|  84.8k|  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
  ------------------
  |  |   20|  84.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1663|  84.8k|  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
  ------------------
  |  |   20|  84.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1664|  84.8k|  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
  ------------------
  |  |   20|  84.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1665|  84.8k|  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
  ------------------
  |  |   20|  84.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1666|  84.8k|  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
  ------------------
  |  |   20|  84.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1667|  84.8k|  __m128i x0[4];
 1668|  84.8k|  x0[0] = input[0];
 1669|  84.8k|  x0[1] = input[1];
 1670|  84.8k|  x0[2] = input[2];
 1671|  84.8k|  x0[3] = input[3];
 1672|       |
 1673|  84.8k|  __m128i u[2];
 1674|  84.8k|  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
 1675|  84.8k|  u[1] = _mm_unpacklo_epi16(x0[1], x0[3]);
 1676|       |
 1677|  84.8k|  __m128i x1[8];
 1678|  84.8k|  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
 1679|  84.8k|  x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
 1680|  84.8k|  x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02);  // x1*sin3 + x3*sin2
 1681|  84.8k|  x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04);  // x1*sin3 - x3*sin4
 1682|  84.8k|  x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
 1683|  84.8k|  x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03);    // x2*sin3
 1684|  84.8k|  x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
 1685|  84.8k|  x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
 1686|       |
 1687|  84.8k|  __m128i x2[4];
 1688|  84.8k|  x2[0] = _mm_add_epi32(x1[0], x1[2]);  // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
 1689|  84.8k|  x2[1] = _mm_add_epi32(x1[1], x1[3]);  // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
 1690|  84.8k|  x2[2] = _mm_add_epi32(x1[4], x1[5]);  // x0*sin3 - x2*sin3 + x3*sin3
 1691|  84.8k|  x2[3] = _mm_add_epi32(x1[6], x1[7]);  // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1
 1692|       |
 1693|  84.8k|  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  84.8k|#define INV_COS_BIT 12
  ------------------
 1694|   424k|  for (int i = 0; i < 4; ++i) {
  ------------------
  |  Branch (1694:19): [True: 339k, False: 84.8k]
  ------------------
 1695|   339k|    __m128i out0 = _mm_add_epi32(x2[i], rounding);
 1696|   339k|    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
  ------------------
  |  |   43|   339k|#define INV_COS_BIT 12
  ------------------
 1697|   339k|    output[i] = _mm_packs_epi32(out0, out0);
 1698|   339k|  }
 1699|  84.8k|}
av1_inv_txfm_ssse3.c:idct8_w4_sse2:
  150|  39.4k|static void idct8_w4_sse2(const __m128i *input, __m128i *output) {
  151|  39.4k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  39.4k|#define INV_COS_BIT 12
  ------------------
  152|  39.4k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  39.4k|#define INV_COS_BIT 12
  ------------------
  153|  39.4k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  39.4k|#define INV_COS_BIT 12
  ------------------
  154|       |
  155|  39.4k|  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  ------------------
  |  |   20|  39.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  156|  39.4k|  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  ------------------
  |  |   20|  39.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  157|  39.4k|  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  ------------------
  |  |   20|  39.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  158|  39.4k|  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  ------------------
  |  |   20|  39.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  159|  39.4k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  39.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  160|  39.4k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|  39.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  161|  39.4k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|  39.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  162|  39.4k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|  39.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  163|  39.4k|  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  ------------------
  |  |   20|  39.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  164|       |
  165|       |  // stage 1
  166|  39.4k|  __m128i x[8];
  167|  39.4k|  x[0] = input[0];
  168|  39.4k|  x[1] = input[4];
  169|  39.4k|  x[2] = input[2];
  170|  39.4k|  x[3] = input[6];
  171|  39.4k|  x[4] = input[1];
  172|  39.4k|  x[5] = input[5];
  173|  39.4k|  x[6] = input[3];
  174|  39.4k|  x[7] = input[7];
  175|       |
  176|       |  // stage 2
  177|  39.4k|  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  ------------------
  |  |   45|  39.4k|  do {                                               \
  |  |   46|  39.4k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  39.4k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  39.4k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  39.4k|                                                     \
  |  |   50|  39.4k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  39.4k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  39.4k|                                                     \
  |  |   53|  39.4k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  39.4k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  39.4k|                                                     \
  |  |   56|  39.4k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  39.4k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  39.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 39.4k]
  |  |  ------------------
  ------------------
  178|  39.4k|  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   45|  39.4k|  do {                                               \
  |  |   46|  39.4k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  39.4k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  39.4k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  39.4k|                                                     \
  |  |   50|  39.4k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  39.4k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  39.4k|                                                     \
  |  |   53|  39.4k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  39.4k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  39.4k|                                                     \
  |  |   56|  39.4k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  39.4k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  39.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 39.4k]
  |  |  ------------------
  ------------------
  179|       |
  180|       |  // stage 3
  181|  39.4k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   45|  39.4k|  do {                                               \
  |  |   46|  39.4k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  39.4k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  39.4k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  39.4k|                                                     \
  |  |   50|  39.4k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  39.4k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  39.4k|                                                     \
  |  |   53|  39.4k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  39.4k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  39.4k|                                                     \
  |  |   56|  39.4k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  39.4k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  39.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 39.4k]
  |  |  ------------------
  ------------------
  182|  39.4k|  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   45|  39.4k|  do {                                               \
  |  |   46|  39.4k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  39.4k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  39.4k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  39.4k|                                                     \
  |  |   50|  39.4k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  39.4k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  39.4k|                                                     \
  |  |   53|  39.4k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  39.4k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  39.4k|                                                     \
  |  |   56|  39.4k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  39.4k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  39.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 39.4k]
  |  |  ------------------
  ------------------
  183|  39.4k|  btf_16_adds_subs_sse2(x[4], x[5]);
  ------------------
  |  |   37|  39.4k|  do {                                  \
  |  |   38|  39.4k|    const __m128i _in0 = in0;           \
  |  |   39|  39.4k|    const __m128i _in1 = in1;           \
  |  |   40|  39.4k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  39.4k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  39.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 39.4k]
  |  |  ------------------
  ------------------
  184|  39.4k|  btf_16_subs_adds_sse2(x[7], x[6]);
  ------------------
  |  |   45|  39.4k|  do {                                  \
  |  |   46|  39.4k|    const __m128i _in0 = in0;           \
  |  |   47|  39.4k|    const __m128i _in1 = in1;           \
  |  |   48|  39.4k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  39.4k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  39.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 39.4k]
  |  |  ------------------
  ------------------
  185|       |
  186|       |  // stage 4
  187|  39.4k|  btf_16_adds_subs_sse2(x[0], x[3]);
  ------------------
  |  |   37|  39.4k|  do {                                  \
  |  |   38|  39.4k|    const __m128i _in0 = in0;           \
  |  |   39|  39.4k|    const __m128i _in1 = in1;           \
  |  |   40|  39.4k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  39.4k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  39.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 39.4k]
  |  |  ------------------
  ------------------
  188|  39.4k|  btf_16_adds_subs_sse2(x[1], x[2]);
  ------------------
  |  |   37|  39.4k|  do {                                  \
  |  |   38|  39.4k|    const __m128i _in0 = in0;           \
  |  |   39|  39.4k|    const __m128i _in1 = in1;           \
  |  |   40|  39.4k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  39.4k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  39.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 39.4k]
  |  |  ------------------
  ------------------
  189|  39.4k|  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   45|  39.4k|  do {                                               \
  |  |   46|  39.4k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  39.4k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  39.4k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  39.4k|                                                     \
  |  |   50|  39.4k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  39.4k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  39.4k|                                                     \
  |  |   53|  39.4k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  39.4k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  39.4k|                                                     \
  |  |   56|  39.4k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  39.4k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  39.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 39.4k]
  |  |  ------------------
  ------------------
  190|       |
  191|       |  // stage 5
  192|  39.4k|  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
  ------------------
  |  |   53|  39.4k|  do {                                                  \
  |  |   54|  39.4k|    const __m128i _in0 = in0;                           \
  |  |   55|  39.4k|    const __m128i _in1 = in1;                           \
  |  |   56|  39.4k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  39.4k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  39.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 39.4k]
  |  |  ------------------
  ------------------
  193|  39.4k|  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
  ------------------
  |  |   53|  39.4k|  do {                                                  \
  |  |   54|  39.4k|    const __m128i _in0 = in0;                           \
  |  |   55|  39.4k|    const __m128i _in1 = in1;                           \
  |  |   56|  39.4k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  39.4k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  39.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 39.4k]
  |  |  ------------------
  ------------------
  194|  39.4k|  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
  ------------------
  |  |   53|  39.4k|  do {                                                  \
  |  |   54|  39.4k|    const __m128i _in0 = in0;                           \
  |  |   55|  39.4k|    const __m128i _in1 = in1;                           \
  |  |   56|  39.4k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  39.4k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  39.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 39.4k]
  |  |  ------------------
  ------------------
  195|  39.4k|  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
  ------------------
  |  |   53|  39.4k|  do {                                                  \
  |  |   54|  39.4k|    const __m128i _in0 = in0;                           \
  |  |   55|  39.4k|    const __m128i _in1 = in1;                           \
  |  |   56|  39.4k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  39.4k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  39.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 39.4k]
  |  |  ------------------
  ------------------
  196|  39.4k|}
av1_inv_txfm_ssse3.c:iadst8_w4_sse2:
 1815|  33.7k|static void iadst8_w4_sse2(const __m128i *input, __m128i *output) {
 1816|  33.7k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  33.7k|#define INV_COS_BIT 12
  ------------------
 1817|  33.7k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  33.7k|#define INV_COS_BIT 12
  ------------------
 1818|  33.7k|  const __m128i __zero = _mm_setzero_si128();
 1819|  33.7k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  33.7k|#define INV_COS_BIT 12
  ------------------
 1820|       |
 1821|  33.7k|  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  ------------------
  |  |   20|  33.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1822|  33.7k|  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  ------------------
  |  |   20|  33.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1823|  33.7k|  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  ------------------
  |  |   20|  33.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1824|  33.7k|  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  ------------------
  |  |   20|  33.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1825|  33.7k|  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  ------------------
  |  |   20|  33.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1826|  33.7k|  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  ------------------
  |  |   20|  33.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1827|  33.7k|  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  ------------------
  |  |   20|  33.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1828|  33.7k|  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  ------------------
  |  |   20|  33.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1829|  33.7k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|  33.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1830|  33.7k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|  33.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1831|  33.7k|  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  ------------------
  |  |   20|  33.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1832|  33.7k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  33.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1833|  33.7k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|  33.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1834|       |
 1835|       |  // stage 1
 1836|  33.7k|  __m128i x[8];
 1837|  33.7k|  x[0] = input[7];
 1838|  33.7k|  x[1] = input[0];
 1839|  33.7k|  x[2] = input[5];
 1840|  33.7k|  x[3] = input[2];
 1841|  33.7k|  x[4] = input[3];
 1842|  33.7k|  x[5] = input[4];
 1843|  33.7k|  x[6] = input[1];
 1844|  33.7k|  x[7] = input[6];
 1845|       |
 1846|       |  // stage 2
 1847|  33.7k|  btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   45|  33.7k|  do {                                               \
  |  |   46|  33.7k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  33.7k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  33.7k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  33.7k|                                                     \
  |  |   50|  33.7k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  33.7k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  33.7k|                                                     \
  |  |   53|  33.7k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  33.7k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  33.7k|                                                     \
  |  |   56|  33.7k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  33.7k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  33.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 1848|  33.7k|  btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   45|  33.7k|  do {                                               \
  |  |   46|  33.7k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  33.7k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  33.7k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  33.7k|                                                     \
  |  |   50|  33.7k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  33.7k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  33.7k|                                                     \
  |  |   53|  33.7k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  33.7k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  33.7k|                                                     \
  |  |   56|  33.7k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  33.7k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  33.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 1849|  33.7k|  btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   45|  33.7k|  do {                                               \
  |  |   46|  33.7k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  33.7k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  33.7k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  33.7k|                                                     \
  |  |   50|  33.7k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  33.7k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  33.7k|                                                     \
  |  |   53|  33.7k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  33.7k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  33.7k|                                                     \
  |  |   56|  33.7k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  33.7k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  33.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 1850|  33.7k|  btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   45|  33.7k|  do {                                               \
  |  |   46|  33.7k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  33.7k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  33.7k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  33.7k|                                                     \
  |  |   50|  33.7k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  33.7k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  33.7k|                                                     \
  |  |   53|  33.7k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  33.7k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  33.7k|                                                     \
  |  |   56|  33.7k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  33.7k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  33.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 1851|       |
 1852|       |  // stage 3
 1853|  33.7k|  btf_16_adds_subs_sse2(x[0], x[4]);
  ------------------
  |  |   37|  33.7k|  do {                                  \
  |  |   38|  33.7k|    const __m128i _in0 = in0;           \
  |  |   39|  33.7k|    const __m128i _in1 = in1;           \
  |  |   40|  33.7k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  33.7k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  33.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 1854|  33.7k|  btf_16_adds_subs_sse2(x[1], x[5]);
  ------------------
  |  |   37|  33.7k|  do {                                  \
  |  |   38|  33.7k|    const __m128i _in0 = in0;           \
  |  |   39|  33.7k|    const __m128i _in1 = in1;           \
  |  |   40|  33.7k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  33.7k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  33.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 1855|  33.7k|  btf_16_adds_subs_sse2(x[2], x[6]);
  ------------------
  |  |   37|  33.7k|  do {                                  \
  |  |   38|  33.7k|    const __m128i _in0 = in0;           \
  |  |   39|  33.7k|    const __m128i _in1 = in1;           \
  |  |   40|  33.7k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  33.7k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  33.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 1856|  33.7k|  btf_16_adds_subs_sse2(x[3], x[7]);
  ------------------
  |  |   37|  33.7k|  do {                                  \
  |  |   38|  33.7k|    const __m128i _in0 = in0;           \
  |  |   39|  33.7k|    const __m128i _in1 = in1;           \
  |  |   40|  33.7k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  33.7k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  33.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 1857|       |
 1858|       |  // stage 4
 1859|  33.7k|  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   45|  33.7k|  do {                                               \
  |  |   46|  33.7k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  33.7k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  33.7k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  33.7k|                                                     \
  |  |   50|  33.7k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  33.7k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  33.7k|                                                     \
  |  |   53|  33.7k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  33.7k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  33.7k|                                                     \
  |  |   56|  33.7k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  33.7k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  33.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 1860|  33.7k|  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   45|  33.7k|  do {                                               \
  |  |   46|  33.7k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  33.7k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  33.7k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  33.7k|                                                     \
  |  |   50|  33.7k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  33.7k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  33.7k|                                                     \
  |  |   53|  33.7k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  33.7k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  33.7k|                                                     \
  |  |   56|  33.7k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  33.7k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  33.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 1861|       |
 1862|       |  // stage 5
 1863|  33.7k|  btf_16_adds_subs_sse2(x[0], x[2]);
  ------------------
  |  |   37|  33.7k|  do {                                  \
  |  |   38|  33.7k|    const __m128i _in0 = in0;           \
  |  |   39|  33.7k|    const __m128i _in1 = in1;           \
  |  |   40|  33.7k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  33.7k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  33.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 1864|  33.7k|  btf_16_adds_subs_sse2(x[1], x[3]);
  ------------------
  |  |   37|  33.7k|  do {                                  \
  |  |   38|  33.7k|    const __m128i _in0 = in0;           \
  |  |   39|  33.7k|    const __m128i _in1 = in1;           \
  |  |   40|  33.7k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  33.7k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  33.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 1865|  33.7k|  btf_16_adds_subs_sse2(x[4], x[6]);
  ------------------
  |  |   37|  33.7k|  do {                                  \
  |  |   38|  33.7k|    const __m128i _in0 = in0;           \
  |  |   39|  33.7k|    const __m128i _in1 = in1;           \
  |  |   40|  33.7k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  33.7k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  33.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 1866|  33.7k|  btf_16_adds_subs_sse2(x[5], x[7]);
  ------------------
  |  |   37|  33.7k|  do {                                  \
  |  |   38|  33.7k|    const __m128i _in0 = in0;           \
  |  |   39|  33.7k|    const __m128i _in1 = in1;           \
  |  |   40|  33.7k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  33.7k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  33.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 1867|       |
 1868|       |  // stage 6
 1869|  33.7k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   45|  33.7k|  do {                                               \
  |  |   46|  33.7k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  33.7k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  33.7k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  33.7k|                                                     \
  |  |   50|  33.7k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  33.7k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  33.7k|                                                     \
  |  |   53|  33.7k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  33.7k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  33.7k|                                                     \
  |  |   56|  33.7k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  33.7k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  33.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 1870|  33.7k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   45|  33.7k|  do {                                               \
  |  |   46|  33.7k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  33.7k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  33.7k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  33.7k|                                                     \
  |  |   50|  33.7k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  33.7k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  33.7k|                                                     \
  |  |   53|  33.7k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  33.7k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  33.7k|                                                     \
  |  |   56|  33.7k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  33.7k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  33.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 1871|       |
 1872|       |  // stage 7
 1873|  33.7k|  output[0] = x[0];
 1874|  33.7k|  output[1] = _mm_subs_epi16(__zero, x[4]);
 1875|  33.7k|  output[2] = x[6];
 1876|  33.7k|  output[3] = _mm_subs_epi16(__zero, x[2]);
 1877|  33.7k|  output[4] = x[3];
 1878|  33.7k|  output[5] = _mm_subs_epi16(__zero, x[7]);
 1879|  33.7k|  output[6] = x[5];
 1880|  33.7k|  output[7] = _mm_subs_epi16(__zero, x[1]);
 1881|  33.7k|}
av1_inv_txfm_ssse3.c:idct16_w4_sse2:
  387|  28.5k|static void idct16_w4_sse2(const __m128i *input, __m128i *output) {
  388|  28.5k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  28.5k|#define INV_COS_BIT 12
  ------------------
  389|  28.5k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  28.5k|#define INV_COS_BIT 12
  ------------------
  390|  28.5k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  28.5k|#define INV_COS_BIT 12
  ------------------
  391|       |
  392|  28.5k|  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  393|  28.5k|  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  394|  28.5k|  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  395|  28.5k|  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  396|  28.5k|  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  397|  28.5k|  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  398|  28.5k|  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  399|  28.5k|  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  400|  28.5k|  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  401|  28.5k|  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  402|  28.5k|  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  403|  28.5k|  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  404|  28.5k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  405|  28.5k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  406|  28.5k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  407|  28.5k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  408|  28.5k|  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  409|  28.5k|  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  410|  28.5k|  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  411|  28.5k|  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  ------------------
  |  |   20|  28.5k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  412|       |
  413|       |  // stage 1
  414|  28.5k|  __m128i x[16];
  415|  28.5k|  x[0] = input[0];
  416|  28.5k|  x[1] = input[8];
  417|  28.5k|  x[2] = input[4];
  418|  28.5k|  x[3] = input[12];
  419|  28.5k|  x[4] = input[2];
  420|  28.5k|  x[5] = input[10];
  421|  28.5k|  x[6] = input[6];
  422|  28.5k|  x[7] = input[14];
  423|  28.5k|  x[8] = input[1];
  424|  28.5k|  x[9] = input[9];
  425|  28.5k|  x[10] = input[5];
  426|  28.5k|  x[11] = input[13];
  427|  28.5k|  x[12] = input[3];
  428|  28.5k|  x[13] = input[11];
  429|  28.5k|  x[14] = input[7];
  430|  28.5k|  x[15] = input[15];
  431|       |
  432|       |  // stage 2
  433|  28.5k|  btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  ------------------
  |  |   45|  28.5k|  do {                                               \
  |  |   46|  28.5k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  28.5k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  28.5k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  28.5k|                                                     \
  |  |   50|  28.5k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  28.5k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  28.5k|                                                     \
  |  |   53|  28.5k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  28.5k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  28.5k|                                                     \
  |  |   56|  28.5k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  28.5k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  434|  28.5k|  btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  ------------------
  |  |   45|  28.5k|  do {                                               \
  |  |   46|  28.5k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  28.5k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  28.5k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  28.5k|                                                     \
  |  |   50|  28.5k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  28.5k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  28.5k|                                                     \
  |  |   53|  28.5k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  28.5k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  28.5k|                                                     \
  |  |   56|  28.5k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  28.5k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  435|  28.5k|  btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   45|  28.5k|  do {                                               \
  |  |   46|  28.5k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  28.5k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  28.5k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  28.5k|                                                     \
  |  |   50|  28.5k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  28.5k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  28.5k|                                                     \
  |  |   53|  28.5k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  28.5k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  28.5k|                                                     \
  |  |   56|  28.5k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  28.5k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  436|  28.5k|  btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
  ------------------
  |  |   45|  28.5k|  do {                                               \
  |  |   46|  28.5k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  28.5k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  28.5k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  28.5k|                                                     \
  |  |   50|  28.5k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  28.5k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  28.5k|                                                     \
  |  |   53|  28.5k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  28.5k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  28.5k|                                                     \
  |  |   56|  28.5k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  28.5k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  437|       |
  438|       |  // stage 3
  439|  28.5k|  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  ------------------
  |  |   45|  28.5k|  do {                                               \
  |  |   46|  28.5k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  28.5k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  28.5k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  28.5k|                                                     \
  |  |   50|  28.5k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  28.5k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  28.5k|                                                     \
  |  |   53|  28.5k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  28.5k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  28.5k|                                                     \
  |  |   56|  28.5k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  28.5k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  440|  28.5k|  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   45|  28.5k|  do {                                               \
  |  |   46|  28.5k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  28.5k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  28.5k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  28.5k|                                                     \
  |  |   50|  28.5k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  28.5k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  28.5k|                                                     \
  |  |   53|  28.5k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  28.5k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  28.5k|                                                     \
  |  |   56|  28.5k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  28.5k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  441|  28.5k|  btf_16_adds_subs_sse2(x[8], x[9]);
  ------------------
  |  |   37|  28.5k|  do {                                  \
  |  |   38|  28.5k|    const __m128i _in0 = in0;           \
  |  |   39|  28.5k|    const __m128i _in1 = in1;           \
  |  |   40|  28.5k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  28.5k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  442|  28.5k|  btf_16_subs_adds_sse2(x[11], x[10]);
  ------------------
  |  |   45|  28.5k|  do {                                  \
  |  |   46|  28.5k|    const __m128i _in0 = in0;           \
  |  |   47|  28.5k|    const __m128i _in1 = in1;           \
  |  |   48|  28.5k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  28.5k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  443|  28.5k|  btf_16_adds_subs_sse2(x[12], x[13]);
  ------------------
  |  |   37|  28.5k|  do {                                  \
  |  |   38|  28.5k|    const __m128i _in0 = in0;           \
  |  |   39|  28.5k|    const __m128i _in1 = in1;           \
  |  |   40|  28.5k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  28.5k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  444|  28.5k|  btf_16_subs_adds_sse2(x[15], x[14]);
  ------------------
  |  |   45|  28.5k|  do {                                  \
  |  |   46|  28.5k|    const __m128i _in0 = in0;           \
  |  |   47|  28.5k|    const __m128i _in1 = in1;           \
  |  |   48|  28.5k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  28.5k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  445|       |
  446|       |  // stage 4
  447|  28.5k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   45|  28.5k|  do {                                               \
  |  |   46|  28.5k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  28.5k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  28.5k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  28.5k|                                                     \
  |  |   50|  28.5k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  28.5k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  28.5k|                                                     \
  |  |   53|  28.5k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  28.5k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  28.5k|                                                     \
  |  |   56|  28.5k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  28.5k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  448|  28.5k|  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   45|  28.5k|  do {                                               \
  |  |   46|  28.5k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  28.5k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  28.5k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  28.5k|                                                     \
  |  |   50|  28.5k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  28.5k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  28.5k|                                                     \
  |  |   53|  28.5k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  28.5k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  28.5k|                                                     \
  |  |   56|  28.5k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  28.5k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  449|  28.5k|  btf_16_adds_subs_sse2(x[4], x[5]);
  ------------------
  |  |   37|  28.5k|  do {                                  \
  |  |   38|  28.5k|    const __m128i _in0 = in0;           \
  |  |   39|  28.5k|    const __m128i _in1 = in1;           \
  |  |   40|  28.5k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  28.5k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  450|  28.5k|  btf_16_subs_adds_sse2(x[7], x[6]);
  ------------------
  |  |   45|  28.5k|  do {                                  \
  |  |   46|  28.5k|    const __m128i _in0 = in0;           \
  |  |   47|  28.5k|    const __m128i _in1 = in1;           \
  |  |   48|  28.5k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  28.5k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  451|  28.5k|  btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  ------------------
  |  |   45|  28.5k|  do {                                               \
  |  |   46|  28.5k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  28.5k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  28.5k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  28.5k|                                                     \
  |  |   50|  28.5k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  28.5k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  28.5k|                                                     \
  |  |   53|  28.5k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  28.5k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  28.5k|                                                     \
  |  |   56|  28.5k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  28.5k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  452|  28.5k|  btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   45|  28.5k|  do {                                               \
  |  |   46|  28.5k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  28.5k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  28.5k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  28.5k|                                                     \
  |  |   50|  28.5k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  28.5k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  28.5k|                                                     \
  |  |   53|  28.5k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  28.5k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  28.5k|                                                     \
  |  |   56|  28.5k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  28.5k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  453|       |
  454|       |  // stage 5
  455|  28.5k|  btf_16_adds_subs_sse2(x[0], x[3]);
  ------------------
  |  |   37|  28.5k|  do {                                  \
  |  |   38|  28.5k|    const __m128i _in0 = in0;           \
  |  |   39|  28.5k|    const __m128i _in1 = in1;           \
  |  |   40|  28.5k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  28.5k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  456|  28.5k|  btf_16_adds_subs_sse2(x[1], x[2]);
  ------------------
  |  |   37|  28.5k|  do {                                  \
  |  |   38|  28.5k|    const __m128i _in0 = in0;           \
  |  |   39|  28.5k|    const __m128i _in1 = in1;           \
  |  |   40|  28.5k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  28.5k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  457|  28.5k|  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   45|  28.5k|  do {                                               \
  |  |   46|  28.5k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  28.5k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  28.5k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  28.5k|                                                     \
  |  |   50|  28.5k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  28.5k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  28.5k|                                                     \
  |  |   53|  28.5k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  28.5k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  28.5k|                                                     \
  |  |   56|  28.5k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  28.5k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  458|  28.5k|  btf_16_adds_subs_sse2(x[8], x[11]);
  ------------------
  |  |   37|  28.5k|  do {                                  \
  |  |   38|  28.5k|    const __m128i _in0 = in0;           \
  |  |   39|  28.5k|    const __m128i _in1 = in1;           \
  |  |   40|  28.5k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  28.5k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  459|  28.5k|  btf_16_adds_subs_sse2(x[9], x[10]);
  ------------------
  |  |   37|  28.5k|  do {                                  \
  |  |   38|  28.5k|    const __m128i _in0 = in0;           \
  |  |   39|  28.5k|    const __m128i _in1 = in1;           \
  |  |   40|  28.5k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  28.5k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  460|  28.5k|  btf_16_subs_adds_sse2(x[15], x[12]);
  ------------------
  |  |   45|  28.5k|  do {                                  \
  |  |   46|  28.5k|    const __m128i _in0 = in0;           \
  |  |   47|  28.5k|    const __m128i _in1 = in1;           \
  |  |   48|  28.5k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  28.5k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  461|  28.5k|  btf_16_subs_adds_sse2(x[14], x[13]);
  ------------------
  |  |   45|  28.5k|  do {                                  \
  |  |   46|  28.5k|    const __m128i _in0 = in0;           \
  |  |   47|  28.5k|    const __m128i _in1 = in1;           \
  |  |   48|  28.5k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  28.5k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  462|       |
  463|       |  // stage 6
  464|  28.5k|  btf_16_adds_subs_sse2(x[0], x[7]);
  ------------------
  |  |   37|  28.5k|  do {                                  \
  |  |   38|  28.5k|    const __m128i _in0 = in0;           \
  |  |   39|  28.5k|    const __m128i _in1 = in1;           \
  |  |   40|  28.5k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  28.5k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  465|  28.5k|  btf_16_adds_subs_sse2(x[1], x[6]);
  ------------------
  |  |   37|  28.5k|  do {                                  \
  |  |   38|  28.5k|    const __m128i _in0 = in0;           \
  |  |   39|  28.5k|    const __m128i _in1 = in1;           \
  |  |   40|  28.5k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  28.5k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  466|  28.5k|  btf_16_adds_subs_sse2(x[2], x[5]);
  ------------------
  |  |   37|  28.5k|  do {                                  \
  |  |   38|  28.5k|    const __m128i _in0 = in0;           \
  |  |   39|  28.5k|    const __m128i _in1 = in1;           \
  |  |   40|  28.5k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  28.5k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  467|  28.5k|  btf_16_adds_subs_sse2(x[3], x[4]);
  ------------------
  |  |   37|  28.5k|  do {                                  \
  |  |   38|  28.5k|    const __m128i _in0 = in0;           \
  |  |   39|  28.5k|    const __m128i _in1 = in1;           \
  |  |   40|  28.5k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  28.5k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  468|  28.5k|  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   45|  28.5k|  do {                                               \
  |  |   46|  28.5k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  28.5k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  28.5k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  28.5k|                                                     \
  |  |   50|  28.5k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  28.5k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  28.5k|                                                     \
  |  |   53|  28.5k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  28.5k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  28.5k|                                                     \
  |  |   56|  28.5k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  28.5k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  469|  28.5k|  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  ------------------
  |  |   45|  28.5k|  do {                                               \
  |  |   46|  28.5k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  28.5k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  28.5k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  28.5k|                                                     \
  |  |   50|  28.5k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  28.5k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  28.5k|                                                     \
  |  |   53|  28.5k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  28.5k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  28.5k|                                                     \
  |  |   56|  28.5k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  28.5k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  28.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 28.5k]
  |  |  ------------------
  ------------------
  470|       |
  471|       |  // stage 7
  472|  28.5k|  idct16_stage7_sse2(output, x);
  473|  28.5k|}
av1_inv_txfm_ssse3.c:iadst16_w4_sse2:
 2117|  22.8k|static void iadst16_w4_sse2(const __m128i *input, __m128i *output) {
 2118|  22.8k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  22.8k|#define INV_COS_BIT 12
  ------------------
 2119|  22.8k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  22.8k|#define INV_COS_BIT 12
  ------------------
 2120|  22.8k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  22.8k|#define INV_COS_BIT 12
  ------------------
 2121|       |
 2122|  22.8k|  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2123|  22.8k|  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2124|  22.8k|  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2125|  22.8k|  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2126|  22.8k|  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2127|  22.8k|  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2128|  22.8k|  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2129|  22.8k|  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2130|  22.8k|  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2131|  22.8k|  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2132|  22.8k|  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2133|  22.8k|  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2134|  22.8k|  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2135|  22.8k|  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2136|  22.8k|  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2137|  22.8k|  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2138|  22.8k|  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2139|  22.8k|  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2140|  22.8k|  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2141|  22.8k|  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2142|  22.8k|  const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2143|  22.8k|  const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2144|  22.8k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2145|  22.8k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2146|  22.8k|  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2147|  22.8k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2148|  22.8k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|  22.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2149|       |
 2150|       |  // stage 1
 2151|  22.8k|  __m128i x[16];
 2152|  22.8k|  x[0] = input[15];
 2153|  22.8k|  x[1] = input[0];
 2154|  22.8k|  x[2] = input[13];
 2155|  22.8k|  x[3] = input[2];
 2156|  22.8k|  x[4] = input[11];
 2157|  22.8k|  x[5] = input[4];
 2158|  22.8k|  x[6] = input[9];
 2159|  22.8k|  x[7] = input[6];
 2160|  22.8k|  x[8] = input[7];
 2161|  22.8k|  x[9] = input[8];
 2162|  22.8k|  x[10] = input[5];
 2163|  22.8k|  x[11] = input[10];
 2164|  22.8k|  x[12] = input[3];
 2165|  22.8k|  x[13] = input[12];
 2166|  22.8k|  x[14] = input[1];
 2167|  22.8k|  x[15] = input[14];
 2168|       |
 2169|       |  // stage 2
 2170|  22.8k|  btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2171|  22.8k|  btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2172|  22.8k|  btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2173|  22.8k|  btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2174|  22.8k|  btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2175|  22.8k|  btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2176|  22.8k|  btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2177|  22.8k|  btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2178|       |
 2179|       |  // stage 3
 2180|  22.8k|  iadst16_stage3_ssse3(x);
 2181|       |
 2182|       |  // stage 4
 2183|  22.8k|  btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2184|  22.8k|  btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2185|  22.8k|  btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2186|  22.8k|  btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2187|       |
 2188|       |  // stage 5
 2189|  22.8k|  iadst16_stage5_ssse3(x);
 2190|       |
 2191|       |  // stage 6
 2192|  22.8k|  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2193|  22.8k|  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2194|  22.8k|  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2195|  22.8k|  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2196|       |
 2197|       |  // stage 7
 2198|  22.8k|  iadst16_stage7_ssse3(x);
 2199|       |
 2200|       |  // stage 8
 2201|  22.8k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2202|  22.8k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2203|  22.8k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2204|  22.8k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
  ------------------
  |  |   45|  22.8k|  do {                                               \
  |  |   46|  22.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|  22.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|  22.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|  22.8k|                                                     \
  |  |   50|  22.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|  22.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|  22.8k|                                                     \
  |  |   53|  22.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|  22.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|  22.8k|                                                     \
  |  |   56|  22.8k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|  22.8k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|  22.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2205|       |
 2206|       |  // stage 9
 2207|  22.8k|  iadst16_stage9_ssse3(output, x);
 2208|  22.8k|}
av1_inv_txfm_ssse3.c:iidentity16_ssse3:
 2225|  2.34k|static void iidentity16_ssse3(const __m128i *input, __m128i *output) {
 2226|  2.34k|  const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
  ------------------
  |  |   41|  2.34k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2227|  2.34k|  const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
  ------------------
  |  |   41|  2.34k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2228|  39.9k|  for (int i = 0; i < 16; ++i) {
  ------------------
  |  Branch (2228:19): [True: 37.5k, False: 2.34k]
  ------------------
 2229|  37.5k|    __m128i x = _mm_mulhrs_epi16(input[i], scale);
 2230|  37.5k|    __m128i srcx2 = _mm_adds_epi16(input[i], input[i]);
 2231|  37.5k|    output[i] = _mm_adds_epi16(x, srcx2);
 2232|  37.5k|  }
 2233|  2.34k|}
av1_inv_txfm_ssse3.c:lowbd_write_buffer_4xn_sse2:
 2244|   161k|                                               const int height) {
 2245|   161k|  int j = flipud ? (height - 1) : 0;
  ------------------
  |  Branch (2245:11): [True: 2.50k, False: 158k]
  ------------------
 2246|   161k|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (2246:20): [True: 2.50k, False: 158k]
  ------------------
 2247|   161k|  const __m128i zero = _mm_setzero_si128();
 2248|  1.16M|  for (int i = 0; i < height; ++i, j += step) {
  ------------------
  |  Branch (2248:19): [True: 1.00M, False: 161k]
  ------------------
 2249|  1.00M|    const __m128i v = _mm_cvtsi32_si128(*((int *)(output + i * stride)));
 2250|  1.00M|    __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
 2251|  1.00M|    u = _mm_packus_epi16(u, zero);
 2252|  1.00M|    *((int *)(output + i * stride)) = _mm_cvtsi128_si32(u);
 2253|  1.00M|  }
 2254|   161k|}
av1_inv_txfm_ssse3.c:lowbd_inv_txfm2d_add_4x8_ssse3:
 2678|  32.8k|                                           int eob) {
 2679|  32.8k|  (void)tx_size_;
 2680|  32.8k|  (void)eob;
 2681|  32.8k|  __m128i buf[8];
 2682|  32.8k|  const TX_SIZE tx_size = TX_4X8;
 2683|  32.8k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2684|  32.8k|  const int txw_idx = get_txw_idx(tx_size);
 2685|  32.8k|  const int txh_idx = get_txh_idx(tx_size);
 2686|  32.8k|  const int txfm_size_col = tx_size_wide[tx_size];
 2687|  32.8k|  const int txfm_size_row = tx_size_high[tx_size];
 2688|       |
 2689|  32.8k|  const transform_1d_ssse3 row_txfm =
 2690|  32.8k|      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
 2691|  32.8k|  const transform_1d_ssse3 col_txfm =
 2692|  32.8k|      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
 2693|       |
 2694|  32.8k|  int ud_flip, lr_flip;
 2695|  32.8k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2696|  32.8k|  load_buffer_32bit_to_16bit(input, txfm_size_row, buf, txfm_size_col);
 2697|  32.8k|  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
 2698|  32.8k|  row_txfm(buf, buf);
 2699|       |  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
 2700|  32.8k|  if (lr_flip) {
  ------------------
  |  Branch (2700:7): [True: 592, False: 32.2k]
  ------------------
 2701|    592|    __m128i temp[4];
 2702|    592|    flip_buf_sse2(buf, temp, txfm_size_col);
 2703|    592|    transpose_16bit_8x4(temp, buf);
 2704|  32.2k|  } else {
 2705|  32.2k|    transpose_16bit_8x4(buf, buf);
 2706|  32.2k|  }
 2707|  32.8k|  col_txfm(buf, buf);
 2708|  32.8k|  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
 2709|  32.8k|  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
 2710|  32.8k|}
av1_inv_txfm_ssse3.c:lowbd_inv_txfm2d_add_8x4_ssse3:
 2715|  49.8k|                                           int eob) {
 2716|  49.8k|  (void)tx_size_;
 2717|  49.8k|  (void)eob;
 2718|  49.8k|  __m128i buf[8];
 2719|  49.8k|  const TX_SIZE tx_size = TX_8X4;
 2720|  49.8k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2721|  49.8k|  const int txw_idx = get_txw_idx(tx_size);
 2722|  49.8k|  const int txh_idx = get_txh_idx(tx_size);
 2723|  49.8k|  const int txfm_size_col = tx_size_wide[tx_size];
 2724|  49.8k|  const int txfm_size_row = tx_size_high[tx_size];
 2725|       |
 2726|  49.8k|  const transform_1d_ssse3 row_txfm =
 2727|  49.8k|      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
 2728|  49.8k|  const transform_1d_ssse3 col_txfm =
 2729|  49.8k|      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
 2730|       |
 2731|  49.8k|  int ud_flip, lr_flip;
 2732|  49.8k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2733|  49.8k|  load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
 2734|  49.8k|  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
 2735|  49.8k|  row_txfm(buf, buf);
 2736|       |  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
 2737|  49.8k|  if (lr_flip) {
  ------------------
  |  Branch (2737:7): [True: 898, False: 48.9k]
  ------------------
 2738|    898|    __m128i temp[8];
 2739|    898|    flip_buf_sse2(buf, temp, txfm_size_col);
 2740|    898|    transpose_16bit_4x8(temp, buf);
 2741|  48.9k|  } else {
 2742|  48.9k|    transpose_16bit_4x8(buf, buf);
 2743|  48.9k|  }
 2744|  49.8k|  col_txfm(buf, buf);
 2745|  49.8k|  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
 2746|  49.8k|  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
 2747|  49.8k|}
av1_inv_txfm_ssse3.c:lowbd_write_buffer_8xn_sse2:
 2258|   166k|                                               const int height) {
 2259|   166k|  int j = flipud ? (height - 1) : 0;
  ------------------
  |  Branch (2259:11): [True: 1.63k, False: 164k]
  ------------------
 2260|   166k|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (2260:20): [True: 1.63k, False: 164k]
  ------------------
 2261|  1.48M|  for (int i = 0; i < height; ++i, j += step) {
  ------------------
  |  Branch (2261:19): [True: 1.32M, False: 166k]
  ------------------
 2262|  1.32M|    const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
 2263|  1.32M|    const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
 2264|  1.32M|    _mm_storel_epi64((__m128i *)(output + i * stride), u);
 2265|  1.32M|  }
 2266|   166k|}
av1_inv_txfm_ssse3.c:lowbd_inv_txfm2d_add_4x16_ssse3:
 2752|  18.7k|                                            int eob) {
 2753|  18.7k|  (void)tx_size_;
 2754|  18.7k|  (void)eob;
 2755|  18.7k|  __m128i buf[16];
 2756|  18.7k|  const TX_SIZE tx_size = TX_4X16;
 2757|  18.7k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2758|  18.7k|  const int txw_idx = get_txw_idx(tx_size);
 2759|  18.7k|  const int txh_idx = get_txh_idx(tx_size);
 2760|  18.7k|  const int txfm_size_col = tx_size_wide[tx_size];
 2761|  18.7k|  const int txfm_size_row = tx_size_high[tx_size];
 2762|       |
 2763|  18.7k|  const transform_1d_ssse3 row_txfm =
 2764|  18.7k|      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
 2765|  18.7k|  const transform_1d_ssse3 col_txfm =
 2766|  18.7k|      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
 2767|       |
 2768|  18.7k|  int ud_flip, lr_flip;
 2769|  18.7k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2770|       |
 2771|  18.7k|  const int row_one_loop = 8;
 2772|  56.1k|  for (int i = 0; i < 2; ++i) {
  ------------------
  |  Branch (2772:19): [True: 37.4k, False: 18.7k]
  ------------------
 2773|  37.4k|    const int32_t *input_cur = input + i * row_one_loop;
 2774|  37.4k|    __m128i *buf_cur = buf + i * row_one_loop;
 2775|  37.4k|    load_buffer_32bit_to_16bit(input_cur, txfm_size_row, buf_cur,
 2776|  37.4k|                               txfm_size_col);
 2777|  37.4k|    if (row_txfm == iidentity4_ssse3) {
  ------------------
  |  Branch (2777:9): [True: 3.34k, False: 34.1k]
  ------------------
 2778|  3.34k|      const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1));
  ------------------
  |  |   20|  3.34k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2779|  3.34k|      const __m128i ones = _mm_set1_epi16(1);
 2780|  16.7k|      for (int j = 0; j < 4; ++j) {
  ------------------
  |  Branch (2780:23): [True: 13.3k, False: 3.34k]
  ------------------
 2781|  13.3k|        const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones);
 2782|  13.3k|        const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones);
 2783|  13.3k|        const __m128i buf_32_lo =
 2784|  13.3k|            _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
  ------------------
  |  |   41|  13.3k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2785|  13.3k|        const __m128i buf_32_hi =
 2786|  13.3k|            _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
  ------------------
  |  |   41|  13.3k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2787|  13.3k|        buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
 2788|  13.3k|      }
 2789|  34.1k|    } else {
 2790|  34.1k|      row_txfm(buf_cur, buf_cur);
 2791|  34.1k|      round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
 2792|  34.1k|    }
 2793|  37.4k|    if (lr_flip) {
  ------------------
  |  Branch (2793:9): [True: 476, False: 36.9k]
  ------------------
 2794|    476|      __m128i temp[8];
 2795|    476|      flip_buf_sse2(buf_cur, temp, txfm_size_col);
 2796|    476|      transpose_16bit_8x4(temp, buf_cur);
 2797|  36.9k|    } else {
 2798|  36.9k|      transpose_16bit_8x4(buf_cur, buf_cur);
 2799|  36.9k|    }
 2800|  37.4k|  }
 2801|  18.7k|  col_txfm(buf, buf);
 2802|  18.7k|  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
 2803|  18.7k|  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
 2804|  18.7k|}
av1_inv_txfm_ssse3.c:lowbd_inv_txfm2d_add_16x4_ssse3:
 2809|  38.3k|                                            int eob) {
 2810|  38.3k|  (void)tx_size_;
 2811|  38.3k|  (void)eob;
 2812|  38.3k|  __m128i buf[16];
 2813|  38.3k|  const TX_SIZE tx_size = TX_16X4;
 2814|  38.3k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2815|  38.3k|  const int txw_idx = get_txw_idx(tx_size);
 2816|  38.3k|  const int txh_idx = get_txh_idx(tx_size);
 2817|  38.3k|  const int txfm_size_col = tx_size_wide[tx_size];
 2818|  38.3k|  const int txfm_size_row = tx_size_high[tx_size];
 2819|  38.3k|  const int buf_size_w_div8 = txfm_size_col >> 3;
 2820|       |
 2821|  38.3k|  const transform_1d_ssse3 row_txfm =
 2822|  38.3k|      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
 2823|  38.3k|  const transform_1d_ssse3 col_txfm =
 2824|  38.3k|      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
 2825|       |
 2826|  38.3k|  int ud_flip, lr_flip;
 2827|  38.3k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2828|  38.3k|  const int row_one_loop = 8;
 2829|  38.3k|  load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
 2830|  38.3k|  if (row_txfm == iidentity16_ssse3) {
  ------------------
  |  Branch (2830:7): [True: 3.32k, False: 35.0k]
  ------------------
 2831|  3.32k|    const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1));
  ------------------
  |  |   20|  3.32k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2832|  3.32k|    const __m128i ones = _mm_set1_epi16(1);
 2833|  56.4k|    for (int j = 0; j < 16; ++j) {
  ------------------
  |  Branch (2833:21): [True: 53.1k, False: 3.32k]
  ------------------
 2834|  53.1k|      const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones);
 2835|  53.1k|      const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones);
 2836|  53.1k|      const __m128i buf_32_lo =
 2837|  53.1k|          _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
  ------------------
  |  |   41|  53.1k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2838|  53.1k|      const __m128i buf_32_hi =
 2839|  53.1k|          _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
  ------------------
  |  |   41|  53.1k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2840|  53.1k|      buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
 2841|  53.1k|    }
 2842|  35.0k|  } else {
 2843|  35.0k|    row_txfm(buf, buf);
 2844|  35.0k|    round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
 2845|  35.0k|  }
 2846|  38.3k|  if (lr_flip) {
  ------------------
  |  Branch (2846:7): [True: 366, False: 38.0k]
  ------------------
 2847|    366|    __m128i temp[16];
 2848|    366|    flip_buf_sse2(buf, temp, 16);
 2849|    366|    transpose_16bit_4x8(temp, buf);
 2850|    366|    transpose_16bit_4x8(temp + 8, buf + 8);
 2851|  38.0k|  } else {
 2852|  38.0k|    transpose_16bit_4x8(buf, buf);
 2853|  38.0k|    transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
 2854|  38.0k|  }
 2855|   115k|  for (int i = 0; i < buf_size_w_div8; i++) {
  ------------------
  |  Branch (2855:19): [True: 76.7k, False: 38.3k]
  ------------------
 2856|  76.7k|    col_txfm(buf + i * row_one_loop, buf + i * row_one_loop);
 2857|  76.7k|    round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
 2858|  76.7k|  }
 2859|  38.3k|  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
 2860|  38.3k|  lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
 2861|  38.3k|}
av1_inv_txfm_ssse3.c:lowbd_inv_txfm2d_add_universe_ssse3:
 2647|   107k|    TX_SIZE tx_size, int eob) {
 2648|   107k|  switch (tx_type) {
 2649|  55.3k|    case DCT_DCT:
  ------------------
  |  Branch (2649:5): [True: 55.3k, False: 52.1k]
  ------------------
 2650|  55.3k|      lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
 2651|  55.3k|                                             tx_size, eob);
 2652|  55.3k|      break;
 2653|  3.98k|    case IDTX:
  ------------------
  |  Branch (2653:5): [True: 3.98k, False: 103k]
  ------------------
 2654|  3.98k|      av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
 2655|  3.98k|      break;
 2656|  1.96k|    case V_DCT:
  ------------------
  |  Branch (2656:5): [True: 1.96k, False: 105k]
  ------------------
 2657|  2.07k|    case V_ADST:
  ------------------
  |  Branch (2657:5): [True: 114, False: 107k]
  ------------------
 2658|  2.18k|    case V_FLIPADST:
  ------------------
  |  Branch (2658:5): [True: 108, False: 107k]
  ------------------
 2659|  2.18k|      av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
 2660|  2.18k|                                                tx_size, eob);
 2661|  2.18k|      break;
 2662|  3.85k|    case H_DCT:
  ------------------
  |  Branch (2662:5): [True: 3.85k, False: 103k]
  ------------------
 2663|  4.11k|    case H_ADST:
  ------------------
  |  Branch (2663:5): [True: 265, False: 107k]
  ------------------
 2664|  4.37k|    case H_FLIPADST:
  ------------------
  |  Branch (2664:5): [True: 254, False: 107k]
  ------------------
 2665|  4.37k|      av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
 2666|  4.37k|                                                tx_size, eob);
 2667|  4.37k|      break;
 2668|  41.6k|    default:
  ------------------
  |  Branch (2668:5): [True: 41.6k, False: 65.9k]
  ------------------
 2669|  41.6k|      lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
 2670|  41.6k|                                             tx_size, eob);
 2671|  41.6k|      break;
 2672|   107k|  }
 2673|   107k|}
av1_inv_txfm_ssse3.c:lowbd_inv_txfm2d_add_no_identity_ssse3:
 2476|  97.0k|    TX_SIZE tx_size, int eob) {
 2477|  97.0k|  __m128i buf1[64 * 8];
 2478|  97.0k|  int eobx, eoby;
 2479|  97.0k|  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
 2480|  97.0k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2481|  97.0k|  const int txw_idx = get_txw_idx(tx_size);
 2482|  97.0k|  const int txh_idx = get_txh_idx(tx_size);
 2483|  97.0k|  const int txfm_size_col = tx_size_wide[tx_size];
 2484|  97.0k|  const int txfm_size_row = tx_size_high[tx_size];
 2485|  97.0k|  const int buf_size_w_div8 = txfm_size_col >> 3;
 2486|  97.0k|  const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
 2487|  97.0k|  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
 2488|  97.0k|  const int input_stride = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  97.0k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 97.0k]
  |  |  ------------------
  ------------------
 2489|  97.0k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 2490|       |
 2491|  97.0k|  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
 2492|  97.0k|  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
 2493|  97.0k|  const transform_1d_ssse3 row_txfm =
 2494|  97.0k|      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
 2495|  97.0k|  const transform_1d_ssse3 col_txfm =
 2496|  97.0k|      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
 2497|       |
 2498|  97.0k|  assert(col_txfm != NULL);
 2499|  97.0k|  assert(row_txfm != NULL);
 2500|  97.0k|  int ud_flip, lr_flip;
 2501|  97.0k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2502|   211k|  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
  ------------------
  |  Branch (2502:19): [True: 114k, False: 97.0k]
  ------------------
 2503|   114k|    __m128i buf0[64];
 2504|   114k|    load_buffer_32bit_to_16bit(input + 8 * i, input_stride, buf0,
 2505|   114k|                               buf_size_nonzero_w);
 2506|   114k|    if (rect_type == 1 || rect_type == -1) {
  ------------------
  |  Branch (2506:9): [True: 42.4k, False: 71.6k]
  |  Branch (2506:27): [True: 38.2k, False: 33.3k]
  ------------------
 2507|  80.7k|      round_shift_ssse3(buf0, buf0, buf_size_nonzero_w);  // rect special code
 2508|  80.7k|    }
 2509|   114k|    row_txfm(buf0, buf0);
 2510|   114k|    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
 2511|   114k|    __m128i *_buf1 = buf1 + i * 8;
 2512|   114k|    if (lr_flip) {
  ------------------
  |  Branch (2512:9): [True: 892, False: 113k]
  ------------------
 2513|  2.22k|      for (int j = 0; j < buf_size_w_div8; ++j) {
  ------------------
  |  Branch (2513:23): [True: 1.33k, False: 892]
  ------------------
 2514|  1.33k|        __m128i temp[8];
 2515|  1.33k|        flip_buf_sse2(buf0 + 8 * j, temp, 8);
 2516|  1.33k|        transpose_16bit_8x8(temp,
 2517|  1.33k|                            _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
 2518|  1.33k|      }
 2519|   113k|    } else {
 2520|   312k|      for (int j = 0; j < buf_size_w_div8; ++j) {
  ------------------
  |  Branch (2520:23): [True: 199k, False: 113k]
  ------------------
 2521|   199k|        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
 2522|   199k|      }
 2523|   113k|    }
 2524|   114k|  }
 2525|   280k|  for (int i = 0; i < buf_size_w_div8; i++) {
  ------------------
  |  Branch (2525:19): [True: 183k, False: 97.0k]
  ------------------
 2526|   183k|    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row);
 2527|   183k|    round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
 2528|   183k|  }
 2529|       |
 2530|  97.0k|  if (txfm_size_col >= 16) {
  ------------------
  |  Branch (2530:7): [True: 57.1k, False: 39.9k]
  ------------------
 2531|   128k|    for (int i = 0; i < (txfm_size_col >> 4); i++) {
  ------------------
  |  Branch (2531:21): [True: 71.7k, False: 57.1k]
  ------------------
 2532|  71.7k|      lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
 2533|  71.7k|                                   output + 16 * i, stride, ud_flip,
 2534|  71.7k|                                   txfm_size_row);
 2535|  71.7k|    }
 2536|  57.1k|  } else if (txfm_size_col == 8) {
  ------------------
  |  Branch (2536:14): [True: 39.9k, False: 1]
  ------------------
 2537|  39.9k|    lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row);
 2538|  39.9k|  }
 2539|  97.0k|}
av1_inv_txfm_ssse3.c:lowbd_write_buffer_16xn_sse2:
 2456|  71.7k|                                                int height) {
 2457|  71.7k|  int j = flipud ? (height - 1) : 0;
  ------------------
  |  Branch (2457:11): [True: 416, False: 71.3k]
  ------------------
 2458|  71.7k|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (2458:20): [True: 416, False: 71.3k]
  ------------------
 2459|   645k|  for (int i = 0; i < height; ++i, j += step) {
  ------------------
  |  Branch (2459:19): [True: 573k, False: 71.7k]
  ------------------
 2460|   573k|    __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
 2461|   573k|    __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]);
 2462|   573k|    _mm_storeu_si128((__m128i *)(output + i * stride), u);
 2463|   573k|  }
 2464|  71.7k|}
av1_inv_txfm_ssse3.c:lowbd_get_recon_16x16_sse2:
 2445|   573k|                                                 __m128i res0, __m128i res1) {
 2446|   573k|  const __m128i zero = _mm_setzero_si128();
 2447|   573k|  __m128i x0 = _mm_unpacklo_epi8(pred, zero);
 2448|   573k|  __m128i x1 = _mm_unpackhi_epi8(pred, zero);
 2449|   573k|  x0 = _mm_adds_epi16(res0, x0);
 2450|   573k|  x1 = _mm_adds_epi16(res1, x1);
 2451|   573k|  return _mm_packus_epi16(x0, x1);
 2452|   573k|}

av1_inv_txfm_ssse3.c:get_eobx_eoby_scan_h_identity:
  202|  5.85k|                                                 TX_SIZE tx_size, int eob) {
  203|  5.85k|  eob -= 1;
  204|  5.85k|  const int txfm_size_col = tx_size_wide[tx_size];
  205|  5.85k|  const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
  ------------------
  |  |   34|  5.85k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 5.85k]
  |  |  ------------------
  ------------------
  206|  5.85k|  *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
  ------------------
  |  Branch (206:11): [True: 5.04k, False: 817]
  ------------------
  207|  5.85k|  const int temp_eoby = eob / (eobx_max + 1);
  208|       |  assert(temp_eoby < 32);
  209|  5.85k|  *eoby = eob_fill[temp_eoby];
  210|  5.85k|}
av1_inv_txfm_ssse3.c:get_eobx_eoby_scan_v_identity:
  213|  11.8k|                                                 TX_SIZE tx_size, int eob) {
  214|  11.8k|  eob -= 1;
  215|  11.8k|  const int txfm_size_row = tx_size_high[tx_size];
  216|  11.8k|  const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
  ------------------
  |  |   34|  11.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 11.8k]
  |  |  ------------------
  ------------------
  217|  11.8k|  *eobx = eob_fill[eob / (eoby_max + 1)];
  218|  11.8k|  *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
  ------------------
  |  Branch (218:11): [True: 10.5k, False: 1.34k]
  ------------------
  219|  11.8k|}
av1_inv_txfm_ssse3.c:round_shift_16bit_ssse3:
   60|   667k|static inline void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
   61|   667k|  if (bit < 0) {
  ------------------
  |  Branch (61:7): [True: 667k, False: 2]
  ------------------
   62|   667k|    const __m128i scale = _mm_set1_epi16(1 << (15 + bit));
   63|  6.70M|    for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (63:21): [True: 6.03M, False: 667k]
  ------------------
   64|  6.03M|      in[i] = _mm_mulhrs_epi16(in[i], scale);
   65|  6.03M|    }
   66|   667k|  } else if (bit > 0) {
  ------------------
  |  Branch (66:14): [True: 0, False: 2]
  ------------------
   67|      0|    for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (67:21): [True: 0, False: 0]
  ------------------
   68|      0|      in[i] = _mm_slli_epi16(in[i], bit);
   69|      0|    }
   70|      0|  }
   71|   667k|}
av1_inv_txfm_ssse3.c:get_eobx_eoby_scan_default:
  182|  97.0k|                                              TX_SIZE tx_size, int eob) {
  183|  97.0k|  if (eob == 1) {
  ------------------
  |  Branch (183:7): [True: 16.7k, False: 80.2k]
  ------------------
  184|  16.7k|    *eobx = 0;
  185|  16.7k|    *eoby = 0;
  186|  16.7k|    return;
  187|  16.7k|  }
  188|       |
  189|  80.2k|  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
  190|  80.2k|  const int eob_row = (eob - 1) >> tx_w_log2;
  191|  80.2k|  const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
  192|  80.2k|  *eobx = eobxy & 0xFF;
  193|  80.2k|  *eoby = eobxy >> 8;
  194|  80.2k|}
highbd_inv_txfm_sse4.c:get_eobx_eoby_scan_v_identity:
  213|  5.90k|                                                 TX_SIZE tx_size, int eob) {
  214|  5.90k|  eob -= 1;
  215|  5.90k|  const int txfm_size_row = tx_size_high[tx_size];
  216|  5.90k|  const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
  ------------------
  |  |   34|  5.90k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 5.90k]
  |  |  ------------------
  ------------------
  217|  5.90k|  *eobx = eob_fill[eob / (eoby_max + 1)];
  218|  5.90k|  *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
  ------------------
  |  Branch (218:11): [True: 4.89k, False: 1.01k]
  ------------------
  219|  5.90k|}
highbd_inv_txfm_sse4.c:get_eobx_eoby_scan_h_identity:
  202|  11.0k|                                                 TX_SIZE tx_size, int eob) {
  203|  11.0k|  eob -= 1;
  204|  11.0k|  const int txfm_size_col = tx_size_wide[tx_size];
  205|  11.0k|  const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
  ------------------
  |  |   34|  11.0k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 11.0k]
  |  |  ------------------
  ------------------
  206|  11.0k|  *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
  ------------------
  |  Branch (206:11): [True: 9.04k, False: 2.04k]
  ------------------
  207|  11.0k|  const int temp_eoby = eob / (eobx_max + 1);
  208|       |  assert(temp_eoby < 32);
  209|  11.0k|  *eoby = eob_fill[temp_eoby];
  210|  11.0k|}
av1_inv_txfm_avx2.c:get_eobx_eoby_scan_default:
  182|   150k|                                              TX_SIZE tx_size, int eob) {
  183|   150k|  if (eob == 1) {
  ------------------
  |  Branch (183:7): [True: 32.1k, False: 118k]
  ------------------
  184|  32.1k|    *eobx = 0;
  185|  32.1k|    *eoby = 0;
  186|  32.1k|    return;
  187|  32.1k|  }
  188|       |
  189|   118k|  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
  190|   118k|  const int eob_row = (eob - 1) >> tx_w_log2;
  191|   118k|  const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
  192|   118k|  *eobx = eobxy & 0xFF;
  193|   118k|  *eoby = eobxy >> 8;
  194|   118k|}
av1_inv_txfm_avx2.c:get_eobx_eoby_scan_h_identity:
  202|    194|                                                 TX_SIZE tx_size, int eob) {
  203|    194|  eob -= 1;
  204|    194|  const int txfm_size_col = tx_size_wide[tx_size];
  205|    194|  const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
  ------------------
  |  |   34|    194|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 194]
  |  |  ------------------
  ------------------
  206|    194|  *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
  ------------------
  |  Branch (206:11): [True: 150, False: 44]
  ------------------
  207|    194|  const int temp_eoby = eob / (eobx_max + 1);
  208|       |  assert(temp_eoby < 32);
  209|    194|  *eoby = eob_fill[temp_eoby];
  210|    194|}
av1_inv_txfm_avx2.c:get_eobx_eoby_scan_v_identity:
  213|    308|                                                 TX_SIZE tx_size, int eob) {
  214|    308|  eob -= 1;
  215|    308|  const int txfm_size_row = tx_size_high[tx_size];
  216|    308|  const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
  ------------------
  |  |   34|    308|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 308]
  |  |  ------------------
  ------------------
  217|    308|  *eobx = eob_fill[eob / (eoby_max + 1)];
  218|    308|  *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
  ------------------
  |  Branch (218:11): [True: 206, False: 102]
  ------------------
  219|    308|}
highbd_inv_txfm_avx2.c:get_eobx_eoby_scan_default:
  182|   465k|                                              TX_SIZE tx_size, int eob) {
  183|   465k|  if (eob == 1) {
  ------------------
  |  Branch (183:7): [True: 81.3k, False: 383k]
  ------------------
  184|  81.3k|    *eobx = 0;
  185|  81.3k|    *eoby = 0;
  186|  81.3k|    return;
  187|  81.3k|  }
  188|       |
  189|   383k|  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
  190|   383k|  const int eob_row = (eob - 1) >> tx_w_log2;
  191|   383k|  const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
  192|   383k|  *eobx = eobxy & 0xFF;
  193|   383k|  *eoby = eobxy >> 8;
  194|   383k|}

av1_inv_txfm_ssse3.c:load_32bit_to_16bit:
   87|  1.72M|static inline __m128i load_32bit_to_16bit(const int32_t *a) {
   88|  1.72M|  const __m128i a_low = _mm_load_si128((const __m128i *)a);
   89|  1.72M|  return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
   90|  1.72M|}
av1_inv_txfm_ssse3.c:load_buffer_32bit_to_16bit:
  173|   197k|                                              __m128i *out, int out_size) {
  174|  1.71M|  for (int i = 0; i < out_size; ++i) {
  ------------------
  |  Branch (174:19): [True: 1.52M, False: 197k]
  ------------------
  175|  1.52M|    out[i] = load_32bit_to_16bit(in + i * stride);
  176|  1.52M|  }
  177|   197k|}
av1_inv_txfm_ssse3.c:flip_buf_sse2:
  253|  6.85k|static inline void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
  254|  51.2k|  for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (254:19): [True: 44.3k, False: 6.85k]
  ------------------
  255|  44.3k|    out[size - i - 1] = in[i];
  256|  44.3k|  }
  257|  6.85k|}
av1_inv_txfm_ssse3.c:load_buffer_32bit_to_16bit_w4:
  180|   197k|                                                 __m128i *out, int out_size) {
  181|  1.64M|  for (int i = 0; i < out_size; ++i) {
  ------------------
  |  Branch (181:19): [True: 1.45M, False: 197k]
  ------------------
  182|  1.45M|    out[i] = load_32bit_to_16bit_w4(in + i * stride);
  183|  1.45M|  }
  184|   197k|}
av1_inv_txfm_ssse3.c:load_32bit_to_16bit_w4:
   92|  1.45M|static inline __m128i load_32bit_to_16bit_w4(const int32_t *a) {
   93|  1.45M|  const __m128i a_low = _mm_load_si128((const __m128i *)a);
   94|  1.45M|  return _mm_packs_epi32(a_low, a_low);
   95|  1.45M|}
highbd_inv_txfm_sse4.c:flip_buf_sse2:
  253|    501|static inline void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
  254|  6.22k|  for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (254:19): [True: 5.72k, False: 501]
  ------------------
  255|  5.72k|    out[size - i - 1] = in[i];
  256|  5.72k|  }
  257|    501|}

highbd_inv_txfm_sse4.c:av1_round_shift_rect_array_32_sse4_1:
   49|   143k|                                                        const int val) {
   50|   143k|  const __m128i sqrt2 = _mm_set1_epi32(val);
   51|   143k|  if (bit > 0) {
  ------------------
  |  Branch (51:7): [True: 0, False: 143k]
  ------------------
   52|      0|    int i;
   53|      0|    for (i = 0; i < size; i++) {
  ------------------
  |  Branch (53:17): [True: 0, False: 0]
  ------------------
   54|      0|      const __m128i r0 = av1_round_shift_32_sse4_1(input[i], bit);
   55|      0|      const __m128i r1 = _mm_mullo_epi32(sqrt2, r0);
   56|      0|      output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits);
  ------------------
  |  |   41|      0|#define NewSqrt2Bits ((int32_t)12)
  ------------------
   57|      0|    }
   58|   143k|  } else {
   59|   143k|    int i;
   60|  1.38M|    for (i = 0; i < size; i++) {
  ------------------
  |  Branch (60:17): [True: 1.24M, False: 143k]
  ------------------
   61|  1.24M|      const __m128i r0 = _mm_slli_epi32(input[i], -bit);
   62|  1.24M|      const __m128i r1 = _mm_mullo_epi32(sqrt2, r0);
   63|  1.24M|      output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits);
  ------------------
  |  |   41|  1.24M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
   64|  1.24M|    }
   65|   143k|  }
   66|   143k|}
highbd_inv_txfm_sse4.c:av1_round_shift_32_sse4_1:
   21|  4.06M|static inline __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) {
   22|  4.06M|  __m128i tmp, round;
   23|  4.06M|  round = _mm_set1_epi32(1 << (bit - 1));
   24|  4.06M|  tmp = _mm_add_epi32(vec, round);
   25|  4.06M|  return _mm_srai_epi32(tmp, bit);
   26|  4.06M|}
highbd_inv_txfm_sse4.c:av1_round_shift_array_32_sse4_1:
   31|   269k|                                                   const int bit) {
   32|   269k|  if (bit > 0) {
  ------------------
  |  Branch (32:7): [True: 269k, False: 0]
  ------------------
   33|   269k|    int i;
   34|  3.09M|    for (i = 0; i < size; i++) {
  ------------------
  |  Branch (34:17): [True: 2.82M, False: 269k]
  ------------------
   35|  2.82M|      output[i] = av1_round_shift_32_sse4_1(input[i], bit);
   36|  2.82M|    }
   37|   269k|  } else {
   38|      0|    int i;
   39|      0|    for (i = 0; i < size; i++) {
  ------------------
  |  Branch (39:17): [True: 0, False: 0]
  ------------------
   40|      0|      output[i] = _mm_slli_epi32(input[i], -bit);
   41|      0|    }
   42|      0|  }
   43|   269k|}

cdef_find_dir_dual_avx2:
  182|   274k|                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
  183|   274k|  int32_t cost_first_8x8[8];
  184|   274k|  int32_t cost_second_8x8[8];
  185|       |  // Used to store the best cost for 2 8x8's.
  186|   274k|  int32_t best_cost[2] = { 0 };
  187|       |  // Best direction for 2 8x8's.
  188|   274k|  int best_dir[2] = { 0 };
  189|       |
  190|   274k|  const __m128i const_coeff_shift_reg = _mm_cvtsi32_si128(coeff_shift);
  191|   274k|  const __m256i const_128_reg = _mm256_set1_epi16(128);
  192|   274k|  __m256i lines[8];
  193|  2.46M|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (193:19): [True: 2.19M, False: 274k]
  ------------------
  194|  2.19M|    const __m128i src_1 = _mm_loadu_si128((const __m128i *)&img1[i * stride]);
  195|  2.19M|    const __m128i src_2 = _mm_loadu_si128((const __m128i *)&img2[i * stride]);
  196|       |
  197|  2.19M|    lines[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(src_1), src_2, 1);
  198|  2.19M|    lines[i] = _mm256_sub_epi16(
  199|  2.19M|        _mm256_sra_epi16(lines[i], const_coeff_shift_reg), const_128_reg);
  200|  2.19M|  }
  201|       |
  202|       |  /* Compute "mostly vertical" directions. */
  203|   274k|  const __m256i dir47 =
  204|   274k|      compute_directions_avx2(lines, cost_first_8x8 + 4, cost_second_8x8 + 4);
  205|       |
  206|       |  /* Transpose and reverse the order of the lines. */
  207|   274k|  array_reverse_transpose_8x8_avx2(lines, lines);
  208|       |
  209|       |  /* Compute "mostly horizontal" directions. */
  210|   274k|  const __m256i dir03 =
  211|   274k|      compute_directions_avx2(lines, cost_first_8x8, cost_second_8x8);
  212|       |
  213|   274k|  __m256i max = _mm256_max_epi32(dir03, dir47);
  214|   274k|  max =
  215|   274k|      _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 8),
  216|   274k|                                            _mm256_slli_si256(max, 16 - (8))));
  217|   274k|  max =
  218|   274k|      _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 4),
  219|   274k|                                            _mm256_slli_si256(max, 16 - (4))));
  220|       |
  221|   274k|  const __m128i first_8x8_output = _mm256_castsi256_si128(max);
  222|   274k|  const __m128i second_8x8_output = _mm256_extractf128_si256(max, 1);
  223|   274k|  const __m128i cmpeg_res_00 =
  224|   274k|      _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir47));
  225|   274k|  const __m128i cmpeg_res_01 =
  226|   274k|      _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir03));
  227|   274k|  const __m128i cmpeg_res_10 =
  228|   274k|      _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir47, 1));
  229|   274k|  const __m128i cmpeg_res_11 =
  230|   274k|      _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir03, 1));
  231|   274k|  const __m128i t_first_8x8 = _mm_packs_epi32(cmpeg_res_01, cmpeg_res_00);
  232|   274k|  const __m128i t_second_8x8 = _mm_packs_epi32(cmpeg_res_11, cmpeg_res_10);
  233|       |
  234|   274k|  best_cost[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(max));
  235|   274k|  best_cost[1] = _mm_cvtsi128_si32(second_8x8_output);
  236|   274k|  best_dir[0] = _mm_movemask_epi8(_mm_packs_epi16(t_first_8x8, t_first_8x8));
  237|   274k|  best_dir[0] =
  238|   274k|      get_msb(best_dir[0] ^ (best_dir[0] - 1));  // Count trailing zeros
  239|   274k|  best_dir[1] = _mm_movemask_epi8(_mm_packs_epi16(t_second_8x8, t_second_8x8));
  240|   274k|  best_dir[1] =
  241|   274k|      get_msb(best_dir[1] ^ (best_dir[1] - 1));  // Count trailing zeros
  242|       |
  243|       |  /* Difference between the optimal variance and the variance along the
  244|       |     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  245|   274k|  *var_out_1st = best_cost[0] - cost_first_8x8[(best_dir[0] + 4) & 7];
  246|   274k|  *var_out_2nd = best_cost[1] - cost_second_8x8[(best_dir[1] + 4) & 7];
  247|       |
  248|       |  /* We'd normally divide by 840, but dividing by 1024 is close enough
  249|       |  for what we're going to do with this. */
  250|   274k|  *var_out_1st >>= 10;
  251|   274k|  *var_out_2nd >>= 10;
  252|   274k|  *out_dir_1st_8x8 = best_dir[0];
  253|   274k|  *out_dir_2nd_8x8 = best_dir[1];
  254|   274k|}
cdef_copy_rect8_8bit_to_16bit_avx2:
  258|  36.0k|                                        int width, int height) {
  259|  36.0k|  int j = 0;
  260|  36.0k|  int remaining_width = width;
  261|  36.0k|  assert(height % 2 == 0);
  262|  36.0k|  assert(height > 0);
  263|  36.0k|  assert(width > 0);
  264|       |
  265|       |  // Process multiple 32 pixels at a time.
  266|  36.0k|  if (remaining_width > 31) {
  ------------------
  |  Branch (266:7): [True: 29.9k, False: 6.09k]
  ------------------
  267|  29.9k|    int i = 0;
  268|   357k|    do {
  269|   357k|      j = 0;
  270|   649k|      do {
  271|   649k|        __m128i row00 =
  272|   649k|            _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + (j + 0)]);
  273|   649k|        __m128i row01 = _mm_loadu_si128(
  274|   649k|            (const __m128i *)&src[(i + 0) * sstride + (j + 16)]);
  275|   649k|        __m128i row10 =
  276|   649k|            _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + (j + 0)]);
  277|   649k|        __m128i row11 = _mm_loadu_si128(
  278|   649k|            (const __m128i *)&src[(i + 1) * sstride + (j + 16)]);
  279|   649k|        _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 0)],
  280|   649k|                            _mm256_cvtepu8_epi16(row00));
  281|   649k|        _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 16)],
  282|   649k|                            _mm256_cvtepu8_epi16(row01));
  283|   649k|        _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 0)],
  284|   649k|                            _mm256_cvtepu8_epi16(row10));
  285|   649k|        _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 16)],
  286|   649k|                            _mm256_cvtepu8_epi16(row11));
  287|   649k|        j += 32;
  288|   649k|      } while (j <= width - 32);
  ------------------
  |  Branch (288:16): [True: 292k, False: 357k]
  ------------------
  289|   357k|      i += 2;
  290|   357k|    } while (i < height);
  ------------------
  |  Branch (290:14): [True: 327k, False: 29.9k]
  ------------------
  291|  29.9k|    remaining_width = width & 31;
  292|  29.9k|  }
  293|       |
  294|       |  // Process 16 pixels at a time.
  295|  36.0k|  if (remaining_width > 15) {
  ------------------
  |  Branch (295:7): [True: 9.70k, False: 26.3k]
  ------------------
  296|  9.70k|    int i = 0;
  297|  85.0k|    do {
  298|  85.0k|      __m128i row0 =
  299|  85.0k|          _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + j]);
  300|  85.0k|      __m128i row1 =
  301|  85.0k|          _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + j]);
  302|  85.0k|      _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + j],
  303|  85.0k|                          _mm256_cvtepu8_epi16(row0));
  304|  85.0k|      _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + j],
  305|  85.0k|                          _mm256_cvtepu8_epi16(row1));
  306|  85.0k|      i += 2;
  307|  85.0k|    } while (i < height);
  ------------------
  |  Branch (307:14): [True: 75.3k, False: 9.70k]
  ------------------
  308|  9.70k|    remaining_width = width & 15;
  309|  9.70k|    j += 16;
  310|  9.70k|  }
  311|       |
  312|       |  // Process 8 pixels at a time.
  313|  36.0k|  if (remaining_width > 7) {
  ------------------
  |  Branch (313:7): [True: 20.4k, False: 15.6k]
  ------------------
  314|  20.4k|    int i = 0;
  315|   407k|    do {
  316|   407k|      __m128i row0 =
  317|   407k|          _mm_loadl_epi64((const __m128i *)&src[(i + 0) * sstride + j]);
  318|   407k|      __m128i row1 =
  319|   407k|          _mm_loadl_epi64((const __m128i *)&src[(i + 1) * sstride + j]);
  320|   407k|      _mm_storeu_si128((__m128i *)&dst[(i + 0) * dstride + j],
  321|   407k|                       _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
  322|   407k|      _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride + j],
  323|   407k|                       _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
  324|   407k|      i += 2;
  325|   407k|    } while (i < height);
  ------------------
  |  Branch (325:14): [True: 386k, False: 20.4k]
  ------------------
  326|  20.4k|    remaining_width = width & 7;
  327|  20.4k|    j += 8;
  328|  20.4k|  }
  329|       |
  330|       |  // Process 4 pixels at a time.
  331|  36.0k|  if (remaining_width > 3) {
  ------------------
  |  Branch (331:7): [True: 2.73k, False: 33.3k]
  ------------------
  332|  2.73k|    int i = 0;
  333|  34.3k|    do {
  334|  34.3k|      __m128i row0 =
  335|  34.3k|          _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 0) * sstride + j]));
  336|  34.3k|      __m128i row1 =
  337|  34.3k|          _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 1) * sstride + j]));
  338|  34.3k|      _mm_storel_epi64((__m128i *)&dst[(i + 0) * dstride + j],
  339|  34.3k|                       _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
  340|  34.3k|      _mm_storel_epi64((__m128i *)&dst[(i + 1) * dstride + j],
  341|  34.3k|                       _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
  342|  34.3k|      i += 2;
  343|  34.3k|    } while (i < height);
  ------------------
  |  Branch (343:14): [True: 31.6k, False: 2.73k]
  ------------------
  344|  2.73k|    remaining_width = width & 3;
  345|  2.73k|    j += 4;
  346|  2.73k|  }
  347|       |
  348|       |  // Process the remaining pixels.
  349|  36.0k|  if (remaining_width) {
  ------------------
  |  Branch (349:7): [True: 0, False: 36.0k]
  ------------------
  350|      0|    for (int i = 0; i < height; i++) {
  ------------------
  |  Branch (350:21): [True: 0, False: 0]
  ------------------
  351|      0|      for (int k = j; k < width; k++) {
  ------------------
  |  Branch (351:23): [True: 0, False: 0]
  ------------------
  352|      0|        dst[i * dstride + k] = src[i * sstride + k];
  353|      0|      }
  354|      0|    }
  355|      0|  }
  356|  36.0k|}
cdef_block_avx2.c:compute_directions_avx2:
   70|   549k|                                              int32_t cost_second_8x8[4]) {
   71|   549k|  __m256i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
   72|   549k|  __m256i partial6;
   73|   549k|  __m256i tmp;
   74|       |  /* Partial sums for lines 0 and 1. */
   75|   549k|  partial4a = _mm256_slli_si256(lines[0], 14);
   76|   549k|  partial4b = _mm256_srli_si256(lines[0], 2);
   77|   549k|  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[1], 12));
   78|   549k|  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[1], 4));
   79|   549k|  tmp = _mm256_add_epi16(lines[0], lines[1]);
   80|   549k|  partial5a = _mm256_slli_si256(tmp, 10);
   81|   549k|  partial5b = _mm256_srli_si256(tmp, 6);
   82|   549k|  partial7a = _mm256_slli_si256(tmp, 4);
   83|   549k|  partial7b = _mm256_srli_si256(tmp, 12);
   84|   549k|  partial6 = tmp;
   85|       |
   86|       |  /* Partial sums for lines 2 and 3. */
   87|   549k|  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[2], 10));
   88|   549k|  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[2], 6));
   89|   549k|  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[3], 8));
   90|   549k|  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[3], 8));
   91|   549k|  tmp = _mm256_add_epi16(lines[2], lines[3]);
   92|   549k|  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 8));
   93|   549k|  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 8));
   94|   549k|  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 6));
   95|   549k|  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 10));
   96|   549k|  partial6 = _mm256_add_epi16(partial6, tmp);
   97|       |
   98|       |  /* Partial sums for lines 4 and 5. */
   99|   549k|  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[4], 6));
  100|   549k|  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[4], 10));
  101|   549k|  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[5], 4));
  102|   549k|  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[5], 12));
  103|   549k|  tmp = _mm256_add_epi16(lines[4], lines[5]);
  104|   549k|  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 6));
  105|   549k|  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 10));
  106|   549k|  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 8));
  107|   549k|  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 8));
  108|   549k|  partial6 = _mm256_add_epi16(partial6, tmp);
  109|       |
  110|       |  /* Partial sums for lines 6 and 7. */
  111|   549k|  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[6], 2));
  112|   549k|  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[6], 14));
  113|   549k|  partial4a = _mm256_add_epi16(partial4a, lines[7]);
  114|   549k|  tmp = _mm256_add_epi16(lines[6], lines[7]);
  115|   549k|  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 4));
  116|   549k|  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 12));
  117|   549k|  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 10));
  118|   549k|  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 6));
  119|   549k|  partial6 = _mm256_add_epi16(partial6, tmp);
  120|       |
  121|   549k|  const __m256i const_reg_1 =
  122|   549k|      _mm256_set_epi32(210, 280, 420, 840, 210, 280, 420, 840);
  123|   549k|  const __m256i const_reg_2 =
  124|   549k|      _mm256_set_epi32(105, 120, 140, 168, 105, 120, 140, 168);
  125|   549k|  const __m256i const_reg_3 = _mm256_set_epi32(210, 420, 0, 0, 210, 420, 0, 0);
  126|   549k|  const __m256i const_reg_4 =
  127|   549k|      _mm256_set_epi32(105, 105, 105, 140, 105, 105, 105, 140);
  128|       |
  129|       |  /* Compute costs in terms of partial sums. */
  130|   549k|  partial4a =
  131|   549k|      fold_mul_and_sum_avx2(&partial4a, &partial4b, &const_reg_1, &const_reg_2);
  132|   549k|  partial7a =
  133|   549k|      fold_mul_and_sum_avx2(&partial7a, &partial7b, &const_reg_3, &const_reg_4);
  134|   549k|  partial5a =
  135|   549k|      fold_mul_and_sum_avx2(&partial5a, &partial5b, &const_reg_3, &const_reg_4);
  136|   549k|  partial6 = _mm256_madd_epi16(partial6, partial6);
  137|   549k|  partial6 = _mm256_mullo_epi32(partial6, _mm256_set1_epi32(105));
  138|       |
  139|   549k|  partial4a = hsum4_avx2(&partial4a, &partial5a, &partial6, &partial7a);
  140|   549k|  _mm_storeu_si128((__m128i *)cost_frist_8x8,
  141|   549k|                   _mm256_castsi256_si128(partial4a));
  142|   549k|  _mm_storeu_si128((__m128i *)cost_second_8x8,
  143|   549k|                   _mm256_extractf128_si256(partial4a, 1));
  144|       |
  145|   549k|  return partial4a;
  146|   549k|}
cdef_block_avx2.c:fold_mul_and_sum_avx2:
   25|  1.64M|                                            const __m256i *const2) {
   26|       |  // Mask used to shuffle the elements present in 256bit register.
   27|  1.64M|  static const int shuffle_reg_256bit[8] = { 0x0b0a0d0c, 0x07060908, 0x03020504,
   28|  1.64M|                                             0x0f0e0100, 0x0b0a0d0c, 0x07060908,
   29|  1.64M|                                             0x03020504, 0x0f0e0100 };
   30|  1.64M|  __m256i tmp;
   31|       |  /* Reverse partial B. */
   32|  1.64M|  *partialb = _mm256_shuffle_epi8(
   33|  1.64M|      *partialb, _mm256_loadu_si256((const __m256i *)shuffle_reg_256bit));
   34|       |
   35|       |  /* Interleave the x and y values of identical indices and pair x8 with 0. */
   36|  1.64M|  tmp = *partiala;
   37|  1.64M|  *partiala = _mm256_unpacklo_epi16(*partiala, *partialb);
   38|  1.64M|  *partialb = _mm256_unpackhi_epi16(tmp, *partialb);
   39|       |
   40|       |  /* Square and add the corresponding x and y values. */
   41|  1.64M|  *partiala = _mm256_madd_epi16(*partiala, *partiala);
   42|  1.64M|  *partialb = _mm256_madd_epi16(*partialb, *partialb);
   43|       |  /* Multiply by constant. */
   44|  1.64M|  *partiala = _mm256_mullo_epi32(*partiala, *const1);
   45|  1.64M|  *partialb = _mm256_mullo_epi32(*partialb, *const2);
   46|       |  /* Sum all results. */
   47|  1.64M|  *partiala = _mm256_add_epi32(*partiala, *partialb);
   48|  1.64M|  return *partiala;
   49|  1.64M|}
cdef_block_avx2.c:hsum4_avx2:
   52|   549k|                                 __m256i *x3) {
   53|   549k|  const __m256i t0 = _mm256_unpacklo_epi32(*x0, *x1);
   54|   549k|  const __m256i t1 = _mm256_unpacklo_epi32(*x2, *x3);
   55|   549k|  const __m256i t2 = _mm256_unpackhi_epi32(*x0, *x1);
   56|   549k|  const __m256i t3 = _mm256_unpackhi_epi32(*x2, *x3);
   57|       |
   58|   549k|  *x0 = _mm256_unpacklo_epi64(t0, t1);
   59|   549k|  *x1 = _mm256_unpackhi_epi64(t0, t1);
   60|   549k|  *x2 = _mm256_unpacklo_epi64(t2, t3);
   61|   549k|  *x3 = _mm256_unpackhi_epi64(t2, t3);
   62|   549k|  return _mm256_add_epi32(_mm256_add_epi32(*x0, *x1),
   63|   549k|                          _mm256_add_epi32(*x2, *x3));
   64|   549k|}
cdef_block_avx2.c:array_reverse_transpose_8x8_avx2:
  150|   274k|static inline void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) {
  151|   274k|  const __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
  152|   274k|  const __m256i tr0_1 = _mm256_unpacklo_epi16(in[2], in[3]);
  153|   274k|  const __m256i tr0_2 = _mm256_unpackhi_epi16(in[0], in[1]);
  154|   274k|  const __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
  155|   274k|  const __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
  156|   274k|  const __m256i tr0_5 = _mm256_unpacklo_epi16(in[6], in[7]);
  157|   274k|  const __m256i tr0_6 = _mm256_unpackhi_epi16(in[4], in[5]);
  158|   274k|  const __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
  159|       |
  160|   274k|  const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
  161|   274k|  const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
  162|   274k|  const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
  163|   274k|  const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
  164|   274k|  const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
  165|   274k|  const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
  166|   274k|  const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
  167|   274k|  const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
  168|       |
  169|   274k|  res[7] = _mm256_unpacklo_epi64(tr1_0, tr1_1);
  170|   274k|  res[6] = _mm256_unpackhi_epi64(tr1_0, tr1_1);
  171|   274k|  res[5] = _mm256_unpacklo_epi64(tr1_2, tr1_3);
  172|   274k|  res[4] = _mm256_unpackhi_epi64(tr1_2, tr1_3);
  173|   274k|  res[3] = _mm256_unpacklo_epi64(tr1_4, tr1_5);
  174|   274k|  res[2] = _mm256_unpackhi_epi64(tr1_4, tr1_5);
  175|   274k|  res[1] = _mm256_unpacklo_epi64(tr1_6, tr1_7);
  176|   274k|  res[0] = _mm256_unpackhi_epi64(tr1_6, tr1_7);
  177|   274k|}

cfl_get_predict_lbd_fn_avx2:
  278|   146k|cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
  279|   146k|  static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = {
  280|   146k|    cfl_predict_lbd_4x4_ssse3,   /* 4x4 */
  281|   146k|    cfl_predict_lbd_8x8_ssse3,   /* 8x8 */
  282|   146k|    cfl_predict_lbd_16x16_ssse3, /* 16x16 */
  283|   146k|    cfl_predict_lbd_32x32_avx2,  /* 32x32 */
  284|   146k|    NULL,                        /* 64x64 (invalid CFL size) */
  285|   146k|    cfl_predict_lbd_4x8_ssse3,   /* 4x8 */
  286|   146k|    cfl_predict_lbd_8x4_ssse3,   /* 8x4 */
  287|   146k|    cfl_predict_lbd_8x16_ssse3,  /* 8x16 */
  288|   146k|    cfl_predict_lbd_16x8_ssse3,  /* 16x8 */
  289|   146k|    cfl_predict_lbd_16x32_ssse3, /* 16x32 */
  290|   146k|    cfl_predict_lbd_32x16_avx2,  /* 32x16 */
  291|   146k|    NULL,                        /* 32x64 (invalid CFL size) */
  292|   146k|    NULL,                        /* 64x32 (invalid CFL size) */
  293|   146k|    cfl_predict_lbd_4x16_ssse3,  /* 4x16  */
  294|   146k|    cfl_predict_lbd_16x4_ssse3,  /* 16x4  */
  295|   146k|    cfl_predict_lbd_8x32_ssse3,  /* 8x32  */
  296|   146k|    cfl_predict_lbd_32x8_avx2,   /* 32x8  */
  297|   146k|    NULL,                        /* 16x64 (invalid CFL size) */
  298|   146k|    NULL,                        /* 64x16 (invalid CFL size) */
  299|   146k|  };
  300|       |  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
  301|       |  // function pointer array out of bounds.
  302|   146k|  return pred[tx_size % TX_SIZES_ALL];
  303|   146k|}
cfl_get_predict_hbd_fn_avx2:
  352|   126k|cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
  353|   126k|  static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = {
  354|   126k|    cfl_predict_hbd_4x4_ssse3,  /* 4x4 */
  355|   126k|    cfl_predict_hbd_8x8_ssse3,  /* 8x8 */
  356|   126k|    cfl_predict_hbd_16x16_avx2, /* 16x16 */
  357|   126k|    cfl_predict_hbd_32x32_avx2, /* 32x32 */
  358|   126k|    NULL,                       /* 64x64 (invalid CFL size) */
  359|   126k|    cfl_predict_hbd_4x8_ssse3,  /* 4x8 */
  360|   126k|    cfl_predict_hbd_8x4_ssse3,  /* 8x4 */
  361|   126k|    cfl_predict_hbd_8x16_ssse3, /* 8x16 */
  362|   126k|    cfl_predict_hbd_16x8_avx2,  /* 16x8 */
  363|   126k|    cfl_predict_hbd_16x32_avx2, /* 16x32 */
  364|   126k|    cfl_predict_hbd_32x16_avx2, /* 32x16 */
  365|   126k|    NULL,                       /* 32x64 (invalid CFL size) */
  366|   126k|    NULL,                       /* 64x32 (invalid CFL size) */
  367|   126k|    cfl_predict_hbd_4x16_ssse3, /* 4x16  */
  368|   126k|    cfl_predict_hbd_16x4_avx2,  /* 16x4  */
  369|   126k|    cfl_predict_hbd_8x32_ssse3, /* 8x32  */
  370|   126k|    cfl_predict_hbd_32x8_avx2,  /* 32x8  */
  371|   126k|    NULL,                       /* 16x64 (invalid CFL size) */
  372|   126k|    NULL,                       /* 64x16 (invalid CFL size) */
  373|   126k|  };
  374|       |  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
  375|       |  // function pointer array out of bounds.
  376|   126k|  return pred[tx_size % TX_SIZES_ALL];
  377|   126k|}
cfl_get_subtract_average_fn_avx2:
  470|   136k|cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) {
  471|   136k|  static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
  472|   136k|    cfl_subtract_average_4x4_sse2,   /* 4x4 */
  473|   136k|    cfl_subtract_average_8x8_sse2,   /* 8x8 */
  474|   136k|    cfl_subtract_average_16x16_avx2, /* 16x16 */
  475|   136k|    cfl_subtract_average_32x32_avx2, /* 32x32 */
  476|   136k|    NULL,                            /* 64x64 (invalid CFL size) */
  477|   136k|    cfl_subtract_average_4x8_sse2,   /* 4x8 */
  478|   136k|    cfl_subtract_average_8x4_sse2,   /* 8x4 */
  479|   136k|    cfl_subtract_average_8x16_sse2,  /* 8x16 */
  480|   136k|    cfl_subtract_average_16x8_avx2,  /* 16x8 */
  481|   136k|    cfl_subtract_average_16x32_avx2, /* 16x32 */
  482|   136k|    cfl_subtract_average_32x16_avx2, /* 32x16 */
  483|   136k|    NULL,                            /* 32x64 (invalid CFL size) */
  484|   136k|    NULL,                            /* 64x32 (invalid CFL size) */
  485|   136k|    cfl_subtract_average_4x16_sse2,  /* 4x16 */
  486|   136k|    cfl_subtract_average_16x4_avx2,  /* 16x4 */
  487|   136k|    cfl_subtract_average_8x32_sse2,  /* 8x32 */
  488|   136k|    cfl_subtract_average_32x8_avx2,  /* 32x8 */
  489|   136k|    NULL,                            /* 16x64 (invalid CFL size) */
  490|   136k|    NULL,                            /* 64x16 (invalid CFL size) */
  491|   136k|  };
  492|       |  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
  493|       |  // index the function pointer array out of bounds.
  494|   136k|  return sub_avg[tx_size % TX_SIZES_ALL];
  495|   136k|}
cfl_avx2.c:cfl_luma_subsampling_420_lbd_avx2:
   64|     94|                                              int height) {
   65|     94|  (void)width;                               // Forever 32
   66|     94|  const __m256i twos = _mm256_set1_epi8(2);  // Thirty two twos
   67|     94|  const int luma_stride = input_stride << 1;
   68|     94|  __m256i *row = (__m256i *)pred_buf_q3;
   69|     94|  const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|     94|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|     94|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   70|    776|  do {
   71|    776|    __m256i top = _mm256_loadu_si256((__m256i *)input);
   72|    776|    __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
   73|       |
   74|    776|    __m256i top_16x16 = _mm256_maddubs_epi16(top, twos);
   75|    776|    __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos);
   76|    776|    __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16);
   77|       |
   78|    776|    _mm256_storeu_si256(row, sum_16x16);
   79|       |
   80|    776|    input += luma_stride;
   81|    776|  } while ((row += CFL_BUF_LINE_I256) < row_end);
  ------------------
  |  |  524|    776|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|    776|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (81:12): [True: 682, False: 94]
  ------------------
   82|     94|}
cfl_avx2.c:cfl_luma_subsampling_422_lbd_avx2:
   99|     18|                                              int height) {
  100|     18|  (void)width;                                // Forever 32
  101|     18|  const __m256i fours = _mm256_set1_epi8(4);  // Thirty two fours
  102|     18|  __m256i *row = (__m256i *)pred_buf_q3;
  103|     18|  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|     18|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|     18|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  104|    336|  do {
  105|    336|    __m256i top = _mm256_loadu_si256((__m256i *)input);
  106|    336|    __m256i top_16x16 = _mm256_maddubs_epi16(top, fours);
  107|    336|    _mm256_storeu_si256(row, top_16x16);
  108|    336|    input += input_stride;
  109|    336|  } while ((row += CFL_BUF_LINE_I256) < row_end);
  ------------------
  |  |  524|    336|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|    336|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (109:12): [True: 318, False: 18]
  ------------------
  110|     18|}
cfl_avx2.c:cfl_luma_subsampling_444_lbd_avx2:
  127|  3.87k|                                              int height) {
  128|  3.87k|  (void)width;  // Forever 32
  129|  3.87k|  __m256i *row = (__m256i *)pred_buf_q3;
  130|  3.87k|  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|  3.87k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  3.87k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  131|  3.87k|  const __m256i zeros = _mm256_setzero_si256();
  132|  71.6k|  do {
  133|  71.6k|    __m256i top = _mm256_loadu_si256((__m256i *)input);
  134|  71.6k|    top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0));
  135|       |
  136|  71.6k|    __m256i row_lo = _mm256_unpacklo_epi8(top, zeros);
  137|  71.6k|    row_lo = _mm256_slli_epi16(row_lo, 3);
  138|  71.6k|    __m256i row_hi = _mm256_unpackhi_epi8(top, zeros);
  139|  71.6k|    row_hi = _mm256_slli_epi16(row_hi, 3);
  140|       |
  141|  71.6k|    _mm256_storeu_si256(row, row_lo);
  142|  71.6k|    _mm256_storeu_si256(row + 1, row_hi);
  143|       |
  144|  71.6k|    input += input_stride;
  145|  71.6k|  } while ((row += CFL_BUF_LINE_I256) < row_end);
  ------------------
  |  |  524|  71.6k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  71.6k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (145:12): [True: 67.8k, False: 3.87k]
  ------------------
  146|  3.87k|}
cfl_avx2.c:cfl_luma_subsampling_420_hbd_avx2:
  166|     34|                                              int height) {
  167|     34|  (void)width;  // Forever 32
  168|     34|  const int luma_stride = input_stride << 1;
  169|     34|  __m256i *row = (__m256i *)pred_buf_q3;
  170|     34|  const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|     34|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|     34|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  171|    336|  do {
  172|    336|    __m256i top = _mm256_loadu_si256((__m256i *)input);
  173|    336|    __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
  174|    336|    __m256i sum = _mm256_add_epi16(top, bot);
  175|       |
  176|    336|    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
  177|    336|    __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride));
  178|    336|    __m256i sum_1 = _mm256_add_epi16(top_1, bot_1);
  179|       |
  180|    336|    __m256i hsum = _mm256_hadd_epi16(sum, sum_1);
  181|    336|    hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
  182|    336|    hsum = _mm256_add_epi16(hsum, hsum);
  183|       |
  184|    336|    _mm256_storeu_si256(row, hsum);
  185|       |
  186|    336|    input += luma_stride;
  187|    336|  } while ((row += CFL_BUF_LINE_I256) < row_end);
  ------------------
  |  |  524|    336|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|    336|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (187:12): [True: 302, False: 34]
  ------------------
  188|     34|}
cfl_avx2.c:cfl_luma_subsampling_422_hbd_avx2:
  206|    194|                                              int height) {
  207|    194|  (void)width;  // Forever 32
  208|    194|  __m256i *row = (__m256i *)pred_buf_q3;
  209|    194|  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|    194|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|    194|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  210|  4.25k|  do {
  211|  4.25k|    __m256i top = _mm256_loadu_si256((__m256i *)input);
  212|  4.25k|    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
  213|  4.25k|    __m256i hsum = _mm256_hadd_epi16(top, top_1);
  214|  4.25k|    hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
  215|  4.25k|    hsum = _mm256_slli_epi16(hsum, 2);
  216|       |
  217|  4.25k|    _mm256_storeu_si256(row, hsum);
  218|       |
  219|  4.25k|    input += input_stride;
  220|  4.25k|  } while ((row += CFL_BUF_LINE_I256) < row_end);
  ------------------
  |  |  524|  4.25k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  4.25k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (220:12): [True: 4.06k, False: 194]
  ------------------
  221|    194|}
cfl_avx2.c:cfl_luma_subsampling_444_hbd_avx2:
  228|  3.58k|                                              int height) {
  229|  3.58k|  (void)width;  // Forever 32
  230|  3.58k|  __m256i *row = (__m256i *)pred_buf_q3;
  231|  3.58k|  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|  3.58k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  3.58k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  232|  62.7k|  do {
  233|  62.7k|    __m256i top = _mm256_loadu_si256((__m256i *)input);
  234|  62.7k|    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
  235|  62.7k|    _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3));
  236|  62.7k|    _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3));
  237|  62.7k|    input += input_stride;
  238|  62.7k|  } while ((row += CFL_BUF_LINE_I256) < row_end);
  ------------------
  |  |  524|  62.7k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  62.7k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (238:12): [True: 59.1k, False: 3.58k]
  ------------------
  239|  3.58k|}
cfl_avx2.c:cfl_predict_lbd_avx2:
  256|  10.1k|                                        int alpha_q3, int width, int height) {
  257|  10.1k|  (void)width;
  258|  10.1k|  const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
  259|  10.1k|  const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
  260|  10.1k|  const __m256i dc_q0 = _mm256_set1_epi16(*dst);
  261|  10.1k|  __m256i *row = (__m256i *)pred_buf_q3;
  262|  10.1k|  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|  10.1k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  10.1k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  263|       |
  264|   188k|  do {
  265|   188k|    __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
  266|   188k|    __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
  267|   188k|    res = _mm256_packus_epi16(res, next);
  268|   188k|    res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0));
  269|   188k|    _mm256_storeu_si256((__m256i *)dst, res);
  270|   188k|    dst += dst_stride;
  271|   188k|  } while ((row += CFL_BUF_LINE_I256) < row_end);
  ------------------
  |  |  524|   188k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|   188k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (271:12): [True: 178k, False: 10.1k]
  ------------------
  272|  10.1k|}
cfl_avx2.c:predict_unclipped:
  245|  1.14M|                                        __m256i alpha_sign, __m256i dc_q0) {
  246|  1.14M|  __m256i ac_q3 = _mm256_loadu_si256(input);
  247|  1.14M|  __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3);
  248|  1.14M|  __m256i scaled_luma_q0 =
  249|  1.14M|      _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12);
  250|  1.14M|  scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign);
  251|  1.14M|  return _mm256_add_epi16(scaled_luma_q0, dc_q0);
  252|  1.14M|}
cfl_avx2.c:cfl_predict_hbd_avx2:
  319|  50.1k|                                        int height) {
  320|       |  // Use SSSE3 version for smaller widths
  321|  50.1k|  assert(width == 16 || width == 32);
  322|  50.1k|  const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
  323|  50.1k|  const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
  324|  50.1k|  const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst);
  325|  50.1k|  const __m256i max = highbd_max_epi16(bd);
  326|       |
  327|  50.1k|  __m256i *row = (__m256i *)pred_buf_q3;
  328|  50.1k|  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|  50.1k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  50.1k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  329|   599k|  do {
  330|   599k|    const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
  331|   599k|    _mm256_storeu_si256((__m256i *)dst,
  332|   599k|                        highbd_clamp_epi16(res, _mm256_setzero_si256(), max));
  333|   599k|    if (width == 32) {
  ------------------
  |  Branch (333:9): [True: 168k, False: 431k]
  ------------------
  334|   168k|      const __m256i res_1 =
  335|   168k|          predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
  336|   168k|      _mm256_storeu_si256(
  337|   168k|          (__m256i *)(dst + 16),
  338|   168k|          highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max));
  339|   168k|    }
  340|   599k|    dst += dst_stride;
  341|   599k|  } while ((row += CFL_BUF_LINE_I256) < row_end);
  ------------------
  |  |  524|   599k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|   599k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (341:12): [True: 549k, False: 50.1k]
  ------------------
  342|  50.1k|}
cfl_avx2.c:highbd_max_epi16:
  306|  50.1k|static __m256i highbd_max_epi16(int bd) {
  307|  50.1k|  const __m256i neg_one = _mm256_set1_epi16(-1);
  308|       |  // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
  309|  50.1k|  return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one);
  310|  50.1k|}
cfl_avx2.c:highbd_clamp_epi16:
  312|   768k|static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) {
  313|   768k|  return _mm256_max_epi16(_mm256_min_epi16(u, max), zero);
  314|   768k|}
cfl_avx2.c:subtract_average_avx2:
  405|  53.0k|                                         int num_pel_log2) {
  406|       |  // Use SSE2 version for smaller widths
  407|  53.0k|  assert(width == 16 || width == 32);
  408|       |
  409|  53.0k|  const __m256i *src = (__m256i *)src_ptr;
  410|  53.0k|  const __m256i *const end = src + height * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|  53.0k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  53.0k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  411|       |  // To maximize usage of the AVX2 registers, we sum two rows per loop
  412|       |  // iteration
  413|  53.0k|  const int step = 2 * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|  53.0k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  53.0k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  414|       |
  415|  53.0k|  __m256i sum = _mm256_setzero_si256();
  416|       |  // For width 32, we use a second sum accumulator to reduce accumulator
  417|       |  // dependencies in the loop.
  418|  53.0k|  __m256i sum2;
  419|  53.0k|  if (width == 32) sum2 = _mm256_setzero_si256();
  ------------------
  |  Branch (419:7): [True: 9.77k, False: 43.3k]
  ------------------
  420|       |
  421|   313k|  do {
  422|       |    // Add top row to the bottom row
  423|   313k|    __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src),
  424|   313k|                                  _mm256_loadu_si256(src + CFL_BUF_LINE_I256));
  ------------------
  |  |  524|   313k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|   313k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  425|   313k|    sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0));
  426|   313k|    if (width == 32) { /* Don't worry, this if it gets optimized out. */
  ------------------
  |  Branch (426:9): [True: 89.2k, False: 224k]
  ------------------
  427|       |      // Add the second part of the top row to the second part of the bottom row
  428|  89.2k|      __m256i l1 =
  429|  89.2k|          _mm256_add_epi16(_mm256_loadu_si256(src + 1),
  430|  89.2k|                           _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256));
  ------------------
  |  |  524|  89.2k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  89.2k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  431|  89.2k|      sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1));
  432|  89.2k|    }
  433|   313k|    src += step;
  434|   313k|  } while (src < end);
  ------------------
  |  Branch (434:12): [True: 260k, False: 53.0k]
  ------------------
  435|       |  // Combine both sum accumulators
  436|  53.0k|  if (width == 32) sum = _mm256_add_epi32(sum, sum2);
  ------------------
  |  Branch (436:7): [True: 9.77k, False: 43.3k]
  ------------------
  437|       |
  438|  53.0k|  __m256i fill = fill_sum_epi32(sum);
  439|       |
  440|  53.0k|  __m256i avg_epi16 = _mm256_srli_epi32(
  441|  53.0k|      _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2);
  442|  53.0k|  avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16);
  443|       |
  444|       |  // Store and subtract loop
  445|  53.0k|  src = (__m256i *)src_ptr;
  446|  53.0k|  __m256i *dst = (__m256i *)dst_ptr;
  447|   626k|  do {
  448|   626k|    _mm256_storeu_si256(dst,
  449|   626k|                        _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16));
  450|   626k|    if (width == 32) {
  ------------------
  |  Branch (450:9): [True: 178k, False: 448k]
  ------------------
  451|   178k|      _mm256_storeu_si256(
  452|   178k|          dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16));
  453|   178k|    }
  454|   626k|    src += CFL_BUF_LINE_I256;
  ------------------
  |  |  524|   626k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|   626k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  455|   626k|    dst += CFL_BUF_LINE_I256;
  ------------------
  |  |  524|   626k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|   626k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  456|   626k|  } while (src < end);
  ------------------
  |  Branch (456:12): [True: 573k, False: 53.0k]
  ------------------
  457|  53.0k|}
cfl_avx2.c:_mm256_addl_epi16:
  397|   402k|static inline __m256i _mm256_addl_epi16(__m256i a) {
  398|   402k|  return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()),
  399|   402k|                          _mm256_unpackhi_epi16(a, _mm256_setzero_si256()));
  400|   402k|}
cfl_avx2.c:fill_sum_epi32:
  382|  53.0k|static inline __m256i fill_sum_epi32(__m256i a) {
  383|       |  // Given that a == [A, B, C, D, E, F, G, H]
  384|  53.0k|  a = _mm256_hadd_epi32(a, a);
  385|       |  // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H
  386|       |  // a == [A', C', A', C', E', G', E', G']
  387|  53.0k|  a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
  388|       |  // a == [A', C', E', G', A', C', E', G']
  389|  53.0k|  a = _mm256_hadd_epi32(a, a);
  390|       |  // Given that A'' == A' + C' and E'' == E' + G'
  391|       |  // a == [A'', E'', A'', E'', A'', E'', A'', E'']
  392|  53.0k|  return _mm256_hadd_epi32(a, a);
  393|       |  // Given that A''' == A'' + E''
  394|       |  // a == [A''', A''', A''', A''', A''', A''', A''', A''']
  395|  53.0k|}

cfl_sse2.c:subtract_average_sse2:
   25|  83.3k|                                         int num_pel_log2) {
   26|  83.3k|  const __m128i zeros = _mm_setzero_si128();
   27|  83.3k|  const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset);
   28|  83.3k|  const __m128i *src = (__m128i *)src_ptr;
   29|  83.3k|  const __m128i *const end = src + height * CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  83.3k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  83.3k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   30|  83.3k|  const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4));
  ------------------
  |  |  523|  83.3k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  83.3k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   31|       |
   32|  83.3k|  __m128i sum = zeros;
   33|   327k|  do {
   34|   327k|    __m128i l0;
   35|   327k|    if (width == 4) {
  ------------------
  |  Branch (35:9): [True: 58.6k, False: 268k]
  ------------------
   36|  58.6k|      l0 = _mm_add_epi16(_mm_loadl_epi64(src),
   37|  58.6k|                         _mm_loadl_epi64(src + CFL_BUF_LINE_I128));
  ------------------
  |  |  523|  58.6k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  58.6k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   38|  58.6k|      __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128),
  ------------------
  |  |  523|  58.6k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  58.6k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   39|  58.6k|                                 _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128));
  ------------------
  |  |  523|  58.6k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  58.6k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   40|  58.6k|      sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
   41|  58.6k|                                             _mm_unpacklo_epi16(l1, zeros)));
   42|   268k|    } else {
   43|   268k|      if (width == 8) {
  ------------------
  |  Branch (43:11): [True: 268k, False: 0]
  ------------------
   44|   268k|        l0 = _mm_add_epi16(_mm_loadu_si128(src),
   45|   268k|                           _mm_loadu_si128(src + CFL_BUF_LINE_I128));
  ------------------
  |  |  523|   268k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|   268k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   46|   268k|      } else {
   47|      0|        l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1));
   48|      0|      }
   49|   268k|      sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
   50|   268k|                                             _mm_unpackhi_epi16(l0, zeros)));
   51|   268k|      if (width == 32) {
  ------------------
  |  Branch (51:11): [True: 0, False: 268k]
  ------------------
   52|      0|        l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3));
   53|      0|        sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
   54|      0|                                               _mm_unpackhi_epi16(l0, zeros)));
   55|      0|      }
   56|   268k|    }
   57|   327k|    src += step;
   58|   327k|  } while (src < end);
  ------------------
  |  Branch (58:12): [True: 244k, False: 83.3k]
  ------------------
   59|       |
   60|  83.3k|  sum = fill_sum_epi32(sum);
   61|       |
   62|  83.3k|  __m128i avg_epi16 =
   63|  83.3k|      _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2);
   64|  83.3k|  avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16);
   65|       |
   66|  83.3k|  src = (__m128i *)src_ptr;
   67|  83.3k|  __m128i *dst = (__m128i *)dst_ptr;
   68|   771k|  do {
   69|   771k|    if (width == 4) {
  ------------------
  |  Branch (69:9): [True: 234k, False: 537k]
  ------------------
   70|   234k|      _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16));
   71|   537k|    } else {
   72|   537k|      _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16));
   73|   537k|      if (width > 8) {
  ------------------
  |  Branch (73:11): [True: 0, False: 537k]
  ------------------
   74|      0|        _mm_storeu_si128(dst + 1,
   75|      0|                         _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16));
   76|      0|        if (width == 32) {
  ------------------
  |  Branch (76:13): [True: 0, False: 0]
  ------------------
   77|      0|          _mm_storeu_si128(dst + 2,
   78|      0|                           _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16));
   79|      0|          _mm_storeu_si128(dst + 3,
   80|      0|                           _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16));
   81|      0|        }
   82|      0|      }
   83|   537k|    }
   84|   771k|    src += CFL_BUF_LINE_I128;
  ------------------
  |  |  523|   771k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|   771k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   85|   771k|    dst += CFL_BUF_LINE_I128;
  ------------------
  |  |  523|   771k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|   771k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   86|   771k|  } while (src < end);
  ------------------
  |  Branch (86:12): [True: 688k, False: 83.3k]
  ------------------
   87|  83.3k|}
cfl_sse2.c:fill_sum_epi32:
   17|  83.3k|static inline __m128i fill_sum_epi32(__m128i l0) {
   18|  83.3k|  l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2)));
   19|       |  return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1)));
   20|  83.3k|}

cfl_ssse3.c:cfl_luma_subsampling_420_lbd_ssse3:
   43|  8.40k|                                                      int width, int height) {
   44|  8.40k|  const __m128i twos = _mm_set1_epi8(2);
   45|  8.40k|  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
   46|  8.40k|  const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  8.40k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  8.40k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   47|  8.40k|  const int luma_stride = input_stride << 1;
   48|  27.8k|  do {
   49|  27.8k|    if (width == 4) {
  ------------------
  |  Branch (49:9): [True: 16.5k, False: 11.2k]
  ------------------
   50|  16.5k|      __m128i top = _mm_loadh_epi32((__m128i *)input);
   51|  16.5k|      top = _mm_maddubs_epi16(top, twos);
   52|  16.5k|      __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride));
   53|  16.5k|      bot = _mm_maddubs_epi16(bot, twos);
   54|  16.5k|      const __m128i sum = _mm_add_epi16(top, bot);
   55|  16.5k|      _mm_storeh_epi32(pred_buf_m128i, sum);
   56|  16.5k|    } else if (width == 8) {
  ------------------
  |  Branch (56:16): [True: 6.30k, False: 4.98k]
  ------------------
   57|  6.30k|      __m128i top = _mm_loadl_epi64((__m128i *)input);
   58|  6.30k|      top = _mm_maddubs_epi16(top, twos);
   59|  6.30k|      __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
   60|  6.30k|      bot = _mm_maddubs_epi16(bot, twos);
   61|  6.30k|      const __m128i sum = _mm_add_epi16(top, bot);
   62|  6.30k|      _mm_storel_epi64(pred_buf_m128i, sum);
   63|  6.30k|    } else {
   64|  4.98k|      __m128i top = _mm_loadu_si128((__m128i *)input);
   65|  4.98k|      top = _mm_maddubs_epi16(top, twos);
   66|  4.98k|      __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
   67|  4.98k|      bot = _mm_maddubs_epi16(bot, twos);
   68|  4.98k|      const __m128i sum = _mm_add_epi16(top, bot);
   69|  4.98k|      _mm_storeu_si128(pred_buf_m128i, sum);
   70|  4.98k|      if (width == 32) {
  ------------------
  |  Branch (70:11): [True: 0, False: 4.98k]
  ------------------
   71|      0|        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
   72|      0|        __m128i bot_1 =
   73|      0|            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
   74|      0|        top_1 = _mm_maddubs_epi16(top_1, twos);
   75|      0|        bot_1 = _mm_maddubs_epi16(bot_1, twos);
   76|      0|        __m128i sum_1 = _mm_add_epi16(top_1, bot_1);
   77|      0|        _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
   78|      0|      }
   79|  4.98k|    }
   80|  27.8k|    input += luma_stride;
   81|  27.8k|    pred_buf_m128i += CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  27.8k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  27.8k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   82|  27.8k|  } while (pred_buf_m128i < end);
  ------------------
  |  Branch (82:12): [True: 19.4k, False: 8.40k]
  ------------------
   83|  8.40k|}
cfl_ssse3.c:_mm_loadh_epi32:
   21|   244k|static inline __m128i _mm_loadh_epi32(__m128i const *mem_addr) {
   22|   244k|  return _mm_cvtsi32_si128(*((int *)mem_addr));
   23|   244k|}
cfl_ssse3.c:_mm_storeh_epi32:
   26|   278k|static inline void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) {
   27|   278k|  *((int *)mem_addr) = _mm_cvtsi128_si32(a);
   28|   278k|}
cfl_ssse3.c:cfl_luma_subsampling_422_lbd_ssse3:
   98|    358|                                                      int width, int height) {
   99|    358|  const __m128i fours = _mm_set1_epi8(4);
  100|    358|  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  101|    358|  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
  ------------------
  |  |  523|    358|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|    358|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  102|  2.32k|  do {
  103|  2.32k|    if (width == 4) {
  ------------------
  |  Branch (103:9): [True: 480, False: 1.84k]
  ------------------
  104|    480|      __m128i top = _mm_loadh_epi32((__m128i *)input);
  105|    480|      top = _mm_maddubs_epi16(top, fours);
  106|    480|      _mm_storeh_epi32(pred_buf_m128i, top);
  107|  1.84k|    } else if (width == 8) {
  ------------------
  |  Branch (107:16): [True: 1.01k, False: 824]
  ------------------
  108|  1.01k|      __m128i top = _mm_loadl_epi64((__m128i *)input);
  109|  1.01k|      top = _mm_maddubs_epi16(top, fours);
  110|  1.01k|      _mm_storel_epi64(pred_buf_m128i, top);
  111|  1.01k|    } else {
  112|    824|      __m128i top = _mm_loadu_si128((__m128i *)input);
  113|    824|      top = _mm_maddubs_epi16(top, fours);
  114|    824|      _mm_storeu_si128(pred_buf_m128i, top);
  115|    824|      if (width == 32) {
  ------------------
  |  Branch (115:11): [True: 0, False: 824]
  ------------------
  116|      0|        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
  117|      0|        top_1 = _mm_maddubs_epi16(top_1, fours);
  118|      0|        _mm_storeu_si128(pred_buf_m128i + 1, top_1);
  119|      0|      }
  120|    824|    }
  121|  2.32k|    input += input_stride;
  122|  2.32k|    pred_buf_m128i += CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  2.32k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  2.32k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  123|  2.32k|  } while (pred_buf_m128i < end);
  ------------------
  |  Branch (123:12): [True: 1.96k, False: 358]
  ------------------
  124|    358|}
cfl_ssse3.c:cfl_luma_subsampling_444_lbd_ssse3:
  138|   105k|                                                      int width, int height) {
  139|   105k|  const __m128i zeros = _mm_setzero_si128();
  140|   105k|  const int luma_stride = input_stride;
  141|   105k|  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  142|   105k|  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
  ------------------
  |  |  523|   105k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|   105k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  143|   817k|  do {
  144|   817k|    if (width == 4) {
  ------------------
  |  Branch (144:9): [True: 210k, False: 606k]
  ------------------
  145|   210k|      __m128i row = _mm_loadh_epi32((__m128i *)input);
  146|   210k|      row = _mm_unpacklo_epi8(row, zeros);
  147|   210k|      _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3));
  148|   606k|    } else if (width == 8) {
  ------------------
  |  Branch (148:16): [True: 423k, False: 183k]
  ------------------
  149|   423k|      __m128i row = _mm_loadl_epi64((__m128i *)input);
  150|   423k|      row = _mm_unpacklo_epi8(row, zeros);
  151|   423k|      _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3));
  152|   423k|    } else {
  153|   183k|      __m128i row = _mm_loadu_si128((__m128i *)input);
  154|   183k|      const __m128i row_lo = _mm_unpacklo_epi8(row, zeros);
  155|   183k|      const __m128i row_hi = _mm_unpackhi_epi8(row, zeros);
  156|   183k|      _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3));
  157|   183k|      _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3));
  158|   183k|      if (width == 32) {
  ------------------
  |  Branch (158:11): [True: 0, False: 183k]
  ------------------
  159|      0|        __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);
  160|      0|        const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros);
  161|      0|        const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros);
  162|      0|        _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3));
  163|      0|        _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3));
  164|      0|      }
  165|   183k|    }
  166|   817k|    input += luma_stride;
  167|   817k|    pred_buf_m128i += CFL_BUF_LINE_I128;
  ------------------
  |  |  523|   817k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|   817k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  168|   817k|  } while (pred_buf_m128i < end);
  ------------------
  |  Branch (168:12): [True: 711k, False: 105k]
  ------------------
  169|   105k|}
cfl_ssse3.c:cfl_luma_subsampling_420_hbd_ssse3:
  185|  1.45k|                                                      int width, int height) {
  186|  1.45k|  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
  ------------------
  |  |  522|  1.45k|#define CFL_BUF_LINE (32)
  ------------------
  187|  1.45k|  const int luma_stride = input_stride << 1;
  188|  5.00k|  do {
  189|  5.00k|    if (width == 4) {
  ------------------
  |  Branch (189:9): [True: 2.88k, False: 2.12k]
  ------------------
  190|  2.88k|      const __m128i top = _mm_loadl_epi64((__m128i *)input);
  191|  2.88k|      const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
  192|  2.88k|      __m128i sum = _mm_add_epi16(top, bot);
  193|  2.88k|      sum = _mm_hadd_epi16(sum, sum);
  194|  2.88k|      *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum));
  195|  2.88k|    } else {
  196|  2.12k|      const __m128i top = _mm_loadu_si128((__m128i *)input);
  197|  2.12k|      const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
  198|  2.12k|      __m128i sum = _mm_add_epi16(top, bot);
  199|  2.12k|      if (width == 8) {
  ------------------
  |  Branch (199:11): [True: 1.20k, False: 920]
  ------------------
  200|  1.20k|        sum = _mm_hadd_epi16(sum, sum);
  201|  1.20k|        _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
  202|  1.20k|      } else {
  203|    920|        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
  204|    920|        const __m128i bot_1 =
  205|    920|            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
  206|    920|        sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1));
  207|    920|        _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
  208|    920|        if (width == 32) {
  ------------------
  |  Branch (208:13): [True: 0, False: 920]
  ------------------
  209|      0|          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
  210|      0|          const __m128i bot_2 =
  211|      0|              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2);
  212|      0|          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
  213|      0|          const __m128i bot_3 =
  214|      0|              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3);
  215|      0|          const __m128i sum_2 = _mm_add_epi16(top_2, bot_2);
  216|      0|          const __m128i sum_3 = _mm_add_epi16(top_3, bot_3);
  217|      0|          __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3);
  218|      0|          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1,
  219|      0|                           _mm_add_epi16(next_sum, next_sum));
  220|      0|        }
  221|    920|      }
  222|  2.12k|    }
  223|  5.00k|    input += luma_stride;
  224|  5.00k|  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
  ------------------
  |  |  522|  5.00k|#define CFL_BUF_LINE (32)
  ------------------
  |  Branch (224:12): [True: 3.55k, False: 1.45k]
  ------------------
  225|  1.45k|}
cfl_ssse3.c:cfl_luma_subsampling_422_hbd_ssse3:
  240|    448|                                                      int width, int height) {
  241|    448|  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  242|    448|  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
  ------------------
  |  |  523|    448|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|    448|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  243|  3.30k|  do {
  244|  3.30k|    if (width == 4) {
  ------------------
  |  Branch (244:9): [True: 392, False: 2.91k]
  ------------------
  245|    392|      const __m128i top = _mm_loadl_epi64((__m128i *)input);
  246|    392|      const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
  247|    392|      _mm_storeh_epi32(pred_buf_m128i, sum);
  248|  2.91k|    } else {
  249|  2.91k|      const __m128i top = _mm_loadu_si128((__m128i *)input);
  250|  2.91k|      if (width == 8) {
  ------------------
  |  Branch (250:11): [True: 1.39k, False: 1.52k]
  ------------------
  251|  1.39k|        const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
  252|  1.39k|        _mm_storel_epi64(pred_buf_m128i, sum);
  253|  1.52k|      } else {
  254|  1.52k|        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
  255|  1.52k|        const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2);
  256|  1.52k|        _mm_storeu_si128(pred_buf_m128i, sum);
  257|  1.52k|        if (width == 32) {
  ------------------
  |  Branch (257:13): [True: 0, False: 1.52k]
  ------------------
  258|      0|          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
  259|      0|          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
  260|      0|          const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2);
  261|      0|          _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
  262|      0|        }
  263|  1.52k|      }
  264|  2.91k|    }
  265|  3.30k|    pred_buf_m128i += CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  3.30k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  3.30k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  266|  3.30k|    input += input_stride;
  267|  3.30k|  } while (pred_buf_m128i < end);
  ------------------
  |  Branch (267:12): [True: 2.85k, False: 448]
  ------------------
  268|    448|}
cfl_ssse3.c:cfl_luma_subsampling_444_hbd_ssse3:
  273|  95.5k|                                                      int width, int height) {
  274|  95.5k|  const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
  ------------------
  |  |  522|  95.5k|#define CFL_BUF_LINE (32)
  ------------------
  275|   732k|  do {
  276|   732k|    if (width == 4) {
  ------------------
  |  Branch (276:9): [True: 184k, False: 548k]
  ------------------
  277|   184k|      const __m128i row = _mm_slli_epi16(_mm_loadl_epi64((__m128i *)input), 3);
  278|   184k|      _mm_storel_epi64((__m128i *)pred_buf_q3, row);
  279|   548k|    } else {
  280|   548k|      const __m128i row = _mm_slli_epi16(_mm_loadu_si128((__m128i *)input), 3);
  281|   548k|      _mm_storeu_si128((__m128i *)pred_buf_q3, row);
  282|   548k|      if (width >= 16) {
  ------------------
  |  Branch (282:11): [True: 163k, False: 385k]
  ------------------
  283|   163k|        __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);
  284|   163k|        row_1 = _mm_slli_epi16(row_1, 3);
  285|   163k|        _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, row_1);
  286|   163k|        if (width == 32) {
  ------------------
  |  Branch (286:13): [True: 0, False: 163k]
  ------------------
  287|      0|          __m128i row_2 = _mm_loadu_si128(((__m128i *)input) + 2);
  288|      0|          row_2 = _mm_slli_epi16(row_2, 3);
  289|      0|          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 2, row_2);
  290|      0|          __m128i row_3 = _mm_loadu_si128(((__m128i *)input) + 3);
  291|      0|          row_3 = _mm_slli_epi16(row_3, 3);
  292|      0|          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 3, row_3);
  293|      0|        }
  294|   163k|      }
  295|   548k|    }
  296|   732k|    input += input_stride;
  297|   732k|    pred_buf_q3 += CFL_BUF_LINE;
  ------------------
  |  |  522|   732k|#define CFL_BUF_LINE (32)
  ------------------
  298|   732k|  } while (pred_buf_q3 < end);
  ------------------
  |  Branch (298:12): [True: 637k, False: 95.5k]
  ------------------
  299|  95.5k|}
cfl_ssse3.c:cfl_predict_lbd_ssse3:
  315|   135k|                                         int alpha_q3, int width, int height) {
  316|   135k|  const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
  317|   135k|  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
  318|   135k|  const __m128i dc_q0 = _mm_set1_epi16(*dst);
  319|   135k|  __m128i *row = (__m128i *)pred_buf_q3;
  320|   135k|  const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
  ------------------
  |  |  523|   135k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|   135k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  321|  1.29M|  do {
  322|  1.29M|    __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
  323|  1.29M|    if (width < 16) {
  ------------------
  |  Branch (323:9): [True: 832k, False: 465k]
  ------------------
  324|   832k|      res = _mm_packus_epi16(res, res);
  325|   832k|      if (width == 4)
  ------------------
  |  Branch (325:11): [True: 260k, False: 571k]
  ------------------
  326|   260k|        _mm_storeh_epi32((__m128i *)dst, res);
  327|   571k|      else
  328|   571k|        _mm_storel_epi64((__m128i *)dst, res);
  329|   832k|    } else {
  330|   465k|      __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
  331|   465k|      res = _mm_packus_epi16(res, next);
  332|   465k|      _mm_storeu_si128((__m128i *)dst, res);
  333|   465k|      if (width == 32) {
  ------------------
  |  Branch (333:11): [True: 0, False: 465k]
  ------------------
  334|      0|        res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
  335|      0|        next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
  336|      0|        res = _mm_packus_epi16(res, next);
  337|      0|        _mm_storeu_si128((__m128i *)(dst + 16), res);
  338|      0|      }
  339|   465k|    }
  340|  1.29M|    dst += dst_stride;
  341|  1.29M|  } while ((row += CFL_BUF_LINE_I128) < row_end);
  ------------------
  |  |  523|  1.29M|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  1.29M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (341:12): [True: 1.16M, False: 135k]
  ------------------
  342|   135k|}
cfl_ssse3.c:predict_unclipped:
  305|  2.47M|                                        __m128i alpha_sign, __m128i dc_q0) {
  306|  2.47M|  __m128i ac_q3 = _mm_loadu_si128(input);
  307|  2.47M|  __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
  308|  2.47M|  __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
  309|  2.47M|  scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
  310|  2.47M|  return _mm_add_epi16(scaled_luma_q0, dc_q0);
  311|  2.47M|}
cfl_ssse3.c:cfl_predict_hbd_ssse3:
  360|  76.6k|                                         int height) {
  361|  76.6k|  const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
  362|  76.6k|  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
  363|  76.6k|  const __m128i dc_q0 = _mm_set1_epi16(*dst);
  364|  76.6k|  const __m128i max = highbd_max_epi16(bd);
  365|  76.6k|  const __m128i zeros = _mm_setzero_si128();
  366|  76.6k|  __m128i *row = (__m128i *)pred_buf_q3;
  367|  76.6k|  const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  76.6k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  76.6k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  368|   711k|  do {
  369|   711k|    __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
  370|   711k|    res = highbd_clamp_epi16(res, zeros, max);
  371|   711k|    if (width == 4) {
  ------------------
  |  Branch (371:9): [True: 208k, False: 503k]
  ------------------
  372|   208k|      _mm_storel_epi64((__m128i *)dst, res);
  373|   503k|    } else {
  374|   503k|      _mm_storeu_si128((__m128i *)dst, res);
  375|   503k|    }
  376|   711k|    if (width >= 16) {
  ------------------
  |  Branch (376:9): [True: 0, False: 711k]
  ------------------
  377|      0|      const __m128i res_1 =
  378|      0|          predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
  379|      0|      _mm_storeu_si128(((__m128i *)dst) + 1,
  380|      0|                       highbd_clamp_epi16(res_1, zeros, max));
  381|      0|    }
  382|   711k|    if (width == 32) {
  ------------------
  |  Branch (382:9): [True: 0, False: 711k]
  ------------------
  383|      0|      const __m128i res_2 =
  384|      0|          predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
  385|      0|      _mm_storeu_si128((__m128i *)(dst + 16),
  386|      0|                       highbd_clamp_epi16(res_2, zeros, max));
  387|      0|      const __m128i res_3 =
  388|      0|          predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
  389|      0|      _mm_storeu_si128((__m128i *)(dst + 24),
  390|      0|                       highbd_clamp_epi16(res_3, zeros, max));
  391|      0|    }
  392|   711k|    dst += dst_stride;
  393|   711k|  } while ((row += CFL_BUF_LINE_I128) < row_end);
  ------------------
  |  |  523|   711k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|   711k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (393:12): [True: 634k, False: 76.6k]
  ------------------
  394|  76.6k|}
cfl_ssse3.c:highbd_max_epi16:
  347|  76.6k|static inline __m128i highbd_max_epi16(int bd) {
  348|  76.6k|  const __m128i neg_one = _mm_set1_epi16(-1);
  349|       |  // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
  350|  76.6k|  return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one);
  351|  76.6k|}
cfl_ssse3.c:highbd_clamp_epi16:
  353|   711k|static inline __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) {
  354|   711k|  return _mm_max_epi16(_mm_min_epi16(u, max), zero);
  355|   711k|}

av1_convolve_2d_sr_avx2:
  147|  49.0k|    const int32_t subpel_y_qn, ConvolveParams *conv_params) {
  148|  49.0k|#if CONFIG_SVT_AV1
  149|  49.0k|  const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_qn);
  150|  49.0k|  const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_qn);
  151|       |
  152|  49.0k|  const bool use_general = (tap_x == 12 || tap_y == 12);
  ------------------
  |  Branch (152:29): [True: 0, False: 49.0k]
  |  Branch (152:44): [True: 0, False: 49.0k]
  ------------------
  153|  49.0k|  if (use_general) {
  ------------------
  |  Branch (153:7): [True: 0, False: 49.0k]
  ------------------
  154|      0|    convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
  155|      0|                                filter_params_x, filter_params_y, subpel_x_qn,
  156|      0|                                subpel_y_qn, conv_params);
  157|  49.0k|  } else {
  158|  49.0k|    av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
  159|  49.0k|                                        filter_params_x, filter_params_y,
  160|  49.0k|                                        subpel_x_qn, subpel_y_qn, conv_params);
  161|  49.0k|  }
  162|       |#else
  163|       |  convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
  164|       |                              filter_params_x, filter_params_y, subpel_x_qn,
  165|       |                              subpel_y_qn, conv_params);
  166|       |#endif
  167|  49.0k|}

av1_convolve_y_sr_avx2:
  517|  14.4k|                            const int32_t subpel_y_qn) {
  518|  14.4k|#if CONFIG_SVT_AV1
  519|  14.4k|  const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
  520|       |
  521|  14.4k|  if (vert_tap == 12) {
  ------------------
  |  Branch (521:7): [True: 0, False: 14.4k]
  ------------------
  522|      0|    av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
  523|      0|                                   filter_params_y, subpel_y_qn);
  524|  14.4k|  } else {
  525|  14.4k|    av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
  526|  14.4k|                                       filter_params_y, subpel_y_qn);
  527|  14.4k|  }
  528|       |#else
  529|       |  av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
  530|       |                                 filter_params_y, subpel_y_qn);
  531|       |#endif
  532|  14.4k|}
av1_convolve_x_sr_avx2:
  912|  14.2k|                            ConvolveParams *conv_params) {
  913|  14.2k|#if CONFIG_SVT_AV1
  914|  14.2k|  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
  915|       |
  916|  14.2k|  if (horz_tap == 12) {
  ------------------
  |  Branch (916:7): [True: 0, False: 14.2k]
  ------------------
  917|      0|    av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
  918|      0|                                   filter_params_x, subpel_x_qn, conv_params);
  919|  14.2k|  } else {
  920|  14.2k|    av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
  921|  14.2k|                                       filter_params_x, subpel_x_qn,
  922|  14.2k|                                       conv_params);
  923|  14.2k|  }
  924|       |#else
  925|       |  av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
  926|       |                                 filter_params_x, subpel_x_qn, conv_params);
  927|       |#endif
  928|  14.2k|}

av1_filter_intra_predictor_sse4_1:
  346|   195k|                                       const uint8_t *left, int mode) {
  347|   195k|  const int bw = tx_size_wide[tx_size];
  348|   195k|  const int bh = tx_size_high[tx_size];
  349|   195k|  filter_intra_predictor_sse4_1(dst, stride, above, left, mode, bw, bh);
  350|   195k|}
filterintra_sse4.c:filter_intra_predictor_sse4_1:
  216|   195k|                                                 const int height) {
  217|   195k|  const uint8_t *const top_ptr = (const uint8_t *)top_row;
  218|   195k|  const uint8_t *const left_ptr = (const uint8_t *)left_column;
  219|   195k|  uint8_t *dst = (uint8_t *)dest;
  220|   195k|  if (width == 4) {
  ------------------
  |  Branch (220:7): [True: 78.8k, False: 116k]
  ------------------
  221|  78.8k|    filter_4xh(dst, stride, top_ptr, left_ptr, mode, height);
  222|  78.8k|    return;
  223|  78.8k|  }
  224|       |
  225|       |  // There is one set of 7 taps for each of the 4x2 output pixels.
  226|   116k|  const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]);
  227|   116k|  const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]);
  228|   116k|  const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]);
  229|   116k|  const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]);
  230|       |
  231|       |  // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
  232|       |  // the end is an unused value, which shall be multiplied by 0 when we apply
  233|       |  // the filter.
  234|   116k|  const int64_t kCondenseLeftMask = 0x0F09080403020100;
  235|       |
  236|       |  // Takes the "left section" and puts it right after p0-p4.
  237|   116k|  const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
  238|       |
  239|       |  // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
  240|       |  // byte is unused as above.
  241|   116k|  const int64_t kInsertTopLeftMask = 0x0F0A090302010008;
  242|       |
  243|       |  // Shuffles the "top left" from the left section, to the front. Used when
  244|       |  // grabbing data from left_column and not top_row.
  245|   116k|  const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
  246|       |
  247|       |  // This first pass takes care of the cases where the top left pixel comes from
  248|       |  // top_row.
  249|   116k|  __m128i pixels = xx_loadl_64(top_ptr - 1);
  250|   116k|  __m128i left = _mm_slli_si128(xx_loadl_32(left_column), 8);
  251|   116k|  pixels = _mm_or_si128(pixels, left);
  252|       |
  253|       |  // Two sets of the same pixels to multiply with two sets of taps.
  254|   116k|  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
  255|   116k|  filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  256|   116k|                    &taps_6_7);
  257|   116k|  left = _mm_srli_si128(left, 1);
  258|       |
  259|       |  // Load
  260|   116k|  pixels = xx_loadl_32(dst + stride);
  261|       |
  262|       |  // Because of the above shift, this OR 'invades' the final of the first 8
  263|       |  // bytes of |pixels|. This is acceptable because the 8th filter tap is always
  264|       |  // a padded 0.
  265|   116k|  pixels = _mm_or_si128(pixels, left);
  266|   116k|  pixels = _mm_shuffle_epi8(pixels, pixel_order2);
  267|   116k|  const ptrdiff_t stride2 = stride << 1;
  268|   116k|  const ptrdiff_t stride4 = stride << 2;
  269|   116k|  filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
  270|   116k|                    &taps_4_5, &taps_6_7);
  271|   116k|  dst += 4;
  272|   332k|  for (int x = 3; x < width - 4; x += 4) {
  ------------------
  |  Branch (272:19): [True: 216k, False: 116k]
  ------------------
  273|   216k|    pixels = xx_loadl_32(top_ptr + x);
  274|   216k|    pixels = _mm_insert_epi8(pixels, (int8_t)top_ptr[x + 4], 4);
  275|   216k|    pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5);
  276|   216k|    pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6);
  277|       |
  278|       |    // Duplicate bottom half into upper half.
  279|   216k|    pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
  280|   216k|    filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  281|   216k|                      &taps_6_7);
  282|   216k|    pixels = xx_loadl_32(dst + stride - 1);
  283|   216k|    pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4);
  284|   216k|    pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5);
  285|   216k|    pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + stride2 - 1], 6);
  286|       |
  287|       |    // Duplicate bottom half into upper half.
  288|   216k|    pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
  289|   216k|    filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
  290|   216k|                      &taps_4_5, &taps_6_7);
  291|   216k|    dst += 4;
  292|   216k|  }
  293|       |
  294|       |  // Now we handle heights that reference previous blocks rather than top_row.
  295|   268k|  for (int y = 4; y < height; y += 4) {
  ------------------
  |  Branch (295:19): [True: 151k, False: 116k]
  ------------------
  296|       |    // Leftmost 4x4 block for this height.
  297|   151k|    dst -= width;
  298|   151k|    dst += stride4;
  299|       |
  300|       |    // Top Left is not available by offset in these leftmost blocks.
  301|   151k|    pixels = xx_loadl_32(dst - stride);
  302|   151k|    left = _mm_slli_si128(xx_loadl_32(left_ptr + y - 1), 8);
  303|   151k|    left = _mm_insert_epi8(left, (int8_t)left_ptr[y + 3], 12);
  304|   151k|    pixels = _mm_or_si128(pixels, left);
  305|   151k|    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
  306|   151k|    filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  307|   151k|                      &taps_6_7);
  308|       |
  309|       |    // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
  310|   151k|    left = _mm_srli_si128(left, 2);
  311|   151k|    pixels = xx_loadl_32(dst + stride);
  312|   151k|    pixels = _mm_or_si128(pixels, left);
  313|   151k|    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
  314|   151k|    filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
  315|   151k|                      &taps_4_5, &taps_6_7);
  316|       |
  317|   151k|    dst += 4;
  318|       |
  319|       |    // Remaining 4x4 blocks for this height.
  320|   512k|    for (int x = 4; x < width; x += 4) {
  ------------------
  |  Branch (320:21): [True: 361k, False: 151k]
  ------------------
  321|   361k|      pixels = xx_loadl_32(dst - stride - 1);
  322|   361k|      pixels = _mm_insert_epi8(pixels, (int8_t)dst[-stride + 3], 4);
  323|   361k|      pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5);
  324|   361k|      pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6);
  325|       |
  326|       |      // Duplicate bottom half into upper half.
  327|   361k|      pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
  328|   361k|      filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  329|   361k|                        &taps_6_7);
  330|   361k|      pixels = xx_loadl_32(dst + stride - 1);
  331|   361k|      pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4);
  332|   361k|      pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5);
  333|   361k|      pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 + stride - 1], 6);
  334|       |
  335|       |      // Duplicate bottom half into upper half.
  336|       |      pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
  337|   361k|      filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
  338|   361k|                        &taps_4_5, &taps_6_7);
  339|   361k|      dst += 4;
  340|   361k|    }
  341|   151k|  }
  342|   116k|}
filterintra_sse4.c:filter_4xh:
   62|  78.8k|                              const int height) {
   63|  78.8k|  const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]);
   64|  78.8k|  const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]);
   65|  78.8k|  const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]);
   66|  78.8k|  const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]);
   67|  78.8k|  __m128i top = xx_loadl_32(top_ptr - 1);
   68|  78.8k|  __m128i pixels = _mm_insert_epi8(top, (int8_t)top_ptr[3], 4);
   69|  78.8k|  __m128i left = (height == 4 ? xx_loadl_32(left_ptr) : xx_loadl_64(left_ptr));
  ------------------
  |  Branch (69:19): [True: 63.1k, False: 15.7k]
  ------------------
   70|  78.8k|  left = _mm_slli_si128(left, 5);
   71|       |
   72|       |  // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
   73|       |  // left[2], left[3], left[4], left[5], left[6], left[7]
   74|  78.8k|  pixels = _mm_or_si128(left, pixels);
   75|       |
   76|       |  // Duplicate first 8 bytes.
   77|  78.8k|  pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
   78|  78.8k|  filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
   79|  78.8k|                    &taps_6_7);
   80|  78.8k|  dest += stride;  // Move to y = 1.
   81|  78.8k|  pixels = xx_loadl_32(dest);
   82|       |
   83|       |  // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
   84|       |  // left[0], left[1], ...
   85|  78.8k|  pixels = _mm_or_si128(left, pixels);
   86|       |
   87|       |  // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
   88|       |  // byte is an unused value, which shall be multiplied by 0 when we apply the
   89|       |  // filter.
   90|  78.8k|  const int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
   91|       |
   92|       |  // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
   93|  78.8k|  const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
   94|  78.8k|  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
   95|  78.8k|  dest += stride;  // Move to y = 2.
   96|  78.8k|  filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
   97|  78.8k|                    &taps_6_7);
   98|  78.8k|  dest += stride;  // Move to y = 3.
   99|       |
  100|       |  // Compute the middle 8 rows before using common code for the final 4 rows.
  101|       |  // Because the common code below this block assumes that
  102|  78.8k|  if (height == 16) {
  ------------------
  |  Branch (102:7): [True: 4.69k, False: 74.1k]
  ------------------
  103|       |    // This shift allows us to use pixel_order2 twice after shifting by 2 later.
  104|  4.69k|    left = _mm_slli_si128(left, 1);
  105|  4.69k|    pixels = xx_loadl_32(dest);
  106|       |
  107|       |    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
  108|       |    // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
  109|  4.69k|    pixels = _mm_or_si128(left, pixels);
  110|       |
  111|       |    // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The
  112|       |    // last byte is an unused value, as above. The top-left was shifted to
  113|       |    // position nine to keep two empty spaces after the top pixels.
  114|  4.69k|    const int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
  115|       |
  116|       |    // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
  117|       |    // the end.
  118|  4.69k|    const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
  119|  4.69k|    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
  120|  4.69k|    dest += stride;  // Move to y = 4.
  121|       |
  122|       |    // First 4x2 in the if body.
  123|  4.69k|    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  124|  4.69k|                      &taps_6_7);
  125|       |
  126|       |    // Clear all but final pixel in the first 8 of left column.
  127|  4.69k|    __m128i keep_top_left = _mm_srli_si128(left, 13);
  128|  4.69k|    dest += stride;  // Move to y = 5.
  129|  4.69k|    pixels = xx_loadl_32(dest);
  130|  4.69k|    left = _mm_srli_si128(left, 2);
  131|       |
  132|       |    // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
  133|       |    // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
  134|  4.69k|    pixels = _mm_or_si128(left, pixels);
  135|  4.69k|    left = xx_loadl_64(left_ptr + 8);
  136|       |
  137|  4.69k|    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
  138|  4.69k|    dest += stride;  // Move to y = 6.
  139|       |
  140|       |    // Second 4x2 in the if body.
  141|  4.69k|    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  142|  4.69k|                      &taps_6_7);
  143|       |
  144|       |    // Position TL value so we can use pixel_order1.
  145|  4.69k|    keep_top_left = _mm_slli_si128(keep_top_left, 6);
  146|  4.69k|    dest += stride;  // Move to y = 7.
  147|  4.69k|    pixels = xx_loadl_32(dest);
  148|  4.69k|    left = _mm_slli_si128(left, 7);
  149|  4.69k|    left = _mm_or_si128(left, keep_top_left);
  150|       |
  151|       |    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
  152|       |    // left[-1], left[0], left[1], left[2], left[3], ...
  153|  4.69k|    pixels = _mm_or_si128(left, pixels);
  154|  4.69k|    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
  155|  4.69k|    dest += stride;  // Move to y = 8.
  156|       |
  157|       |    // Third 4x2 in the if body.
  158|  4.69k|    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  159|  4.69k|                      &taps_6_7);
  160|  4.69k|    dest += stride;  // Move to y = 9.
  161|       |
  162|       |    // Prepare final inputs.
  163|  4.69k|    pixels = xx_loadl_32(dest);
  164|  4.69k|    left = _mm_srli_si128(left, 2);
  165|       |
  166|       |    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
  167|       |    // left[-1], left[0], left[1], left[2], left[3], ...
  168|  4.69k|    pixels = _mm_or_si128(left, pixels);
  169|  4.69k|    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
  170|  4.69k|    dest += stride;  // Move to y = 10.
  171|       |
  172|       |    // Fourth 4x2 in the if body.
  173|  4.69k|    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  174|  4.69k|                      &taps_6_7);
  175|  4.69k|    dest += stride;  // Move to y = 11.
  176|  4.69k|  }
  177|       |
  178|       |  // In both the 8 and 16 case, we assume that the left vector has the next TL
  179|       |  // at position 8.
  180|  78.8k|  if (height > 4) {
  ------------------
  |  Branch (180:7): [True: 15.7k, False: 63.1k]
  ------------------
  181|       |    // Erase prior left pixels by shifting TL to position 0.
  182|  15.7k|    left = _mm_srli_si128(left, 8);
  183|  15.7k|    left = _mm_slli_si128(left, 6);
  184|  15.7k|    pixels = xx_loadl_32(dest);
  185|       |
  186|       |    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
  187|       |    // left[-1], left[0], left[1], left[2], left[3], ...
  188|  15.7k|    pixels = _mm_or_si128(left, pixels);
  189|  15.7k|    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
  190|  15.7k|    dest += stride;  // Move to y = 12 or 4.
  191|       |
  192|       |    // First of final two 4x2 blocks.
  193|  15.7k|    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  194|  15.7k|                      &taps_6_7);
  195|  15.7k|    dest += stride;  // Move to y = 13 or 5.
  196|  15.7k|    pixels = xx_loadl_32(dest);
  197|  15.7k|    left = _mm_srli_si128(left, 2);
  198|       |
  199|       |    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
  200|       |    // left[-1], left[0], left[1], left[2], left[3], ...
  201|  15.7k|    pixels = _mm_or_si128(left, pixels);
  202|  15.7k|    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
  203|  15.7k|    dest += stride;  // Move to y = 14 or 6.
  204|       |
  205|       |    // Last of final two 4x2 blocks.
  206|  15.7k|    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  207|  15.7k|                      &taps_6_7);
  208|  15.7k|  }
  209|  78.8k|}
filterintra_sse4.c:filter_4x2_sse4_1:
   36|  1.89M|                                     const __m128i *taps_6_7) {
   37|  1.89M|  const __m128i mul_0_01 = _mm_maddubs_epi16(*pixels, *taps_0_1);
   38|  1.89M|  const __m128i mul_0_23 = _mm_maddubs_epi16(*pixels, *taps_2_3);
   39|       |  // |output_half| contains 8 partial sums.
   40|  1.89M|  __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
   41|  1.89M|  __m128i output = _mm_hadd_epi16(output_half, output_half);
   42|  1.89M|  const __m128i output_row0 =
   43|  1.89M|      _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4),
   44|  1.89M|                       /* arbitrary pack arg */ output);
   45|  1.89M|  xx_storel_32(dst, output_row0);
   46|  1.89M|  const __m128i mul_1_01 = _mm_maddubs_epi16(*pixels, *taps_4_5);
   47|  1.89M|  const __m128i mul_1_23 = _mm_maddubs_epi16(*pixels, *taps_6_7);
   48|  1.89M|  output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
   49|  1.89M|  output = _mm_hadd_epi16(output_half, output_half);
   50|  1.89M|  const __m128i output_row1 =
   51|  1.89M|      _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4),
   52|  1.89M|                       /* arbitrary pack arg */ output);
   53|  1.89M|  xx_storel_32(dst + stride, output_row1);
   54|  1.89M|}

av1_highbd_convolve_2d_sr_avx2:
   35|  9.37k|                                    ConvolveParams *conv_params, int bd) {
   36|  9.37k|  if (filter_params_x->taps == 12) {
  ------------------
  |  Branch (36:7): [True: 0, False: 9.37k]
  ------------------
   37|      0|    av1_highbd_convolve_2d_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
   38|      0|                                    filter_params_x, filter_params_y,
   39|      0|                                    subpel_x_qn, subpel_y_qn, conv_params, bd);
   40|      0|    return;
   41|      0|  }
   42|       |
   43|  9.37k|  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
  ------------------
  |  |   19|  9.37k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
   44|  9.37k|  int im_h = h + filter_params_y->taps - 1;
   45|  9.37k|  int im_stride = 8;
   46|  9.37k|  int i, j;
   47|  9.37k|  const int fo_vert = filter_params_y->taps / 2 - 1;
   48|  9.37k|  const int fo_horiz = filter_params_x->taps / 2 - 1;
   49|  9.37k|  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
   50|       |
   51|       |  // Check that, even with 12-bit input, the intermediate values will fit
   52|       |  // into an unsigned 16-bit intermediate array.
   53|  9.37k|  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
   54|       |
   55|  9.37k|  __m256i s[8], coeffs_y[4], coeffs_x[4];
   56|       |
   57|  9.37k|  const __m256i round_const_x = _mm256_set1_epi32(
   58|  9.37k|      ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
  ------------------
  |  |   21|  9.37k|#define FILTER_BITS 7
  ------------------
   59|  9.37k|  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
   60|       |
   61|  9.37k|  const __m256i round_const_y = _mm256_set1_epi32(
   62|  9.37k|      ((1 << conv_params->round_1) >> 1) -
   63|  9.37k|      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
  ------------------
  |  |   21|  9.37k|#define FILTER_BITS 7
  ------------------
   64|  9.37k|  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
   65|       |
   66|  9.37k|  const int bits =
   67|  9.37k|      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  9.37k|#define FILTER_BITS 7
  ------------------
   68|  9.37k|  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
   69|  9.37k|  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
   70|  9.37k|  const __m256i clip_pixel =
   71|  9.37k|      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (71:25): [True: 7.71k, False: 1.65k]
  |  Branch (71:44): [True: 1.65k, False: 0]
  ------------------
   72|  9.37k|  const __m256i zero = _mm256_setzero_si256();
   73|       |
   74|  9.37k|  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
   75|  9.37k|  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
   76|       |
   77|  21.5k|  for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (77:15): [True: 12.2k, False: 9.37k]
  ------------------
   78|       |    /* Horizontal filter */
   79|  12.2k|    {
   80|   131k|      for (i = 0; i < im_h; i += 2) {
  ------------------
  |  Branch (80:19): [True: 119k, False: 12.2k]
  ------------------
   81|   119k|        const __m256i row0 =
   82|   119k|            _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
   83|   119k|        __m256i row1 = _mm256_setzero_si256();
   84|   119k|        if (i + 1 < im_h)
  ------------------
  |  Branch (84:13): [True: 106k, False: 12.2k]
  ------------------
   85|   106k|          row1 =
   86|   106k|              _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
   87|       |
   88|   119k|        const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
   89|   119k|        const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
   90|       |
   91|       |        // even pixels
   92|   119k|        s[0] = _mm256_alignr_epi8(r1, r0, 0);
   93|   119k|        s[1] = _mm256_alignr_epi8(r1, r0, 4);
   94|   119k|        s[2] = _mm256_alignr_epi8(r1, r0, 8);
   95|   119k|        s[3] = _mm256_alignr_epi8(r1, r0, 12);
   96|       |
   97|   119k|        __m256i res_even = convolve(s, coeffs_x);
   98|   119k|        res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
   99|   119k|                                    round_shift_x);
  100|       |
  101|       |        // odd pixels
  102|   119k|        s[0] = _mm256_alignr_epi8(r1, r0, 2);
  103|   119k|        s[1] = _mm256_alignr_epi8(r1, r0, 6);
  104|   119k|        s[2] = _mm256_alignr_epi8(r1, r0, 10);
  105|   119k|        s[3] = _mm256_alignr_epi8(r1, r0, 14);
  106|       |
  107|   119k|        __m256i res_odd = convolve(s, coeffs_x);
  108|   119k|        res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
  109|   119k|                                   round_shift_x);
  110|       |
  111|   119k|        __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
  112|   119k|        __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
  113|   119k|        __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
  114|       |
  115|   119k|        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
  116|   119k|      }
  117|  12.2k|    }
  118|       |
  119|       |    /* Vertical filter */
  120|  12.2k|    {
  121|  12.2k|      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
  122|  12.2k|      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
  123|  12.2k|      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
  124|  12.2k|      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
  125|  12.2k|      __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
  126|  12.2k|      __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
  127|       |
  128|  12.2k|      s[0] = _mm256_unpacklo_epi16(s0, s1);
  129|  12.2k|      s[1] = _mm256_unpacklo_epi16(s2, s3);
  130|  12.2k|      s[2] = _mm256_unpacklo_epi16(s4, s5);
  131|       |
  132|  12.2k|      s[4] = _mm256_unpackhi_epi16(s0, s1);
  133|  12.2k|      s[5] = _mm256_unpackhi_epi16(s2, s3);
  134|  12.2k|      s[6] = _mm256_unpackhi_epi16(s4, s5);
  135|       |
  136|  82.4k|      for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (136:19): [True: 70.2k, False: 12.2k]
  ------------------
  137|  70.2k|        const int16_t *data = &im_block[i * im_stride];
  138|       |
  139|  70.2k|        const __m256i s6 =
  140|  70.2k|            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
  141|  70.2k|        const __m256i s7 =
  142|  70.2k|            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
  143|       |
  144|  70.2k|        s[3] = _mm256_unpacklo_epi16(s6, s7);
  145|  70.2k|        s[7] = _mm256_unpackhi_epi16(s6, s7);
  146|       |
  147|  70.2k|        const __m256i res_a = convolve(s, coeffs_y);
  148|  70.2k|        __m256i res_a_round = _mm256_sra_epi32(
  149|  70.2k|            _mm256_add_epi32(res_a, round_const_y), round_shift_y);
  150|       |
  151|  70.2k|        res_a_round = _mm256_sra_epi32(
  152|  70.2k|            _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits);
  153|       |
  154|  70.2k|        if (w - j > 4) {
  ------------------
  |  Branch (154:13): [True: 58.1k, False: 12.0k]
  ------------------
  155|  58.1k|          const __m256i res_b = convolve(s + 4, coeffs_y);
  156|  58.1k|          __m256i res_b_round = _mm256_sra_epi32(
  157|  58.1k|              _mm256_add_epi32(res_b, round_const_y), round_shift_y);
  158|  58.1k|          res_b_round =
  159|  58.1k|              _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits),
  160|  58.1k|                               round_shift_bits);
  161|       |
  162|  58.1k|          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
  163|  58.1k|          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
  164|  58.1k|          res_16bit = _mm256_max_epi16(res_16bit, zero);
  165|       |
  166|  58.1k|          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
  167|  58.1k|                           _mm256_castsi256_si128(res_16bit));
  168|  58.1k|          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
  169|  58.1k|                           _mm256_extracti128_si256(res_16bit, 1));
  170|  58.1k|        } else if (w == 4) {
  ------------------
  |  Branch (170:20): [True: 10.1k, False: 1.94k]
  ------------------
  171|  10.1k|          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
  172|  10.1k|          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
  173|  10.1k|          res_a_round = _mm256_max_epi16(res_a_round, zero);
  174|       |
  175|  10.1k|          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
  176|  10.1k|                           _mm256_castsi256_si128(res_a_round));
  177|  10.1k|          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
  178|  10.1k|                           _mm256_extracti128_si256(res_a_round, 1));
  179|  10.1k|        } else {
  180|  1.94k|          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
  181|  1.94k|          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
  182|  1.94k|          res_a_round = _mm256_max_epi16(res_a_round, zero);
  183|       |
  184|  1.94k|          xx_storel_32(&dst[i * dst_stride + j],
  185|  1.94k|                       _mm256_castsi256_si128(res_a_round));
  186|  1.94k|          xx_storel_32(&dst[i * dst_stride + j + dst_stride],
  187|  1.94k|                       _mm256_extracti128_si256(res_a_round, 1));
  188|  1.94k|        }
  189|       |
  190|  70.2k|        s[0] = s[1];
  191|  70.2k|        s[1] = s[2];
  192|  70.2k|        s[2] = s[3];
  193|       |
  194|  70.2k|        s[4] = s[5];
  195|  70.2k|        s[5] = s[6];
  196|  70.2k|        s[6] = s[7];
  197|  70.2k|      }
  198|  12.2k|    }
  199|  12.2k|  }
  200|  9.37k|}

av1_highbd_inv_txfm_add_avx2:
 4215|  1.26M|                                  int stride, const TxfmParam *txfm_param) {
 4216|  1.26M|  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
 4217|  1.26M|  const TX_SIZE tx_size = txfm_param->tx_size;
 4218|  1.26M|  switch (tx_size) {
 4219|  43.8k|    case TX_4X8:
  ------------------
  |  Branch (4219:5): [True: 43.8k, False: 1.22M]
  ------------------
 4220|   112k|    case TX_8X4:
  ------------------
  |  Branch (4220:5): [True: 68.9k, False: 1.19M]
  ------------------
 4221|   696k|    case TX_4X4:
  ------------------
  |  Branch (4221:5): [True: 583k, False: 681k]
  ------------------
 4222|   742k|    case TX_16X4:
  ------------------
  |  Branch (4222:5): [True: 46.1k, False: 1.21M]
  ------------------
 4223|   763k|    case TX_4X16:
  ------------------
  |  Branch (4223:5): [True: 20.8k, False: 1.24M]
  ------------------
 4224|   763k|      av1_highbd_inv_txfm_add_sse4_1(input, dest, stride, txfm_param);
 4225|   763k|      break;
 4226|   501k|    default:
  ------------------
  |  Branch (4226:5): [True: 501k, False: 763k]
  ------------------
 4227|   501k|      av1_highbd_inv_txfm2d_add_universe_avx2(
 4228|   501k|          input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
 4229|   501k|          txfm_param->eob, txfm_param->bd);
 4230|   501k|      break;
 4231|  1.26M|  }
 4232|  1.26M|}
highbd_inv_txfm_avx2.c:av1_highbd_inv_txfm2d_add_universe_avx2:
 4187|   501k|                                                    const int bd) {
 4188|   501k|  switch (tx_type) {
 4189|   259k|    case DCT_DCT:
  ------------------
  |  Branch (4189:5): [True: 259k, False: 242k]
  ------------------
 4190|   317k|    case ADST_DCT:
  ------------------
  |  Branch (4190:5): [True: 58.6k, False: 443k]
  ------------------
 4191|   395k|    case DCT_ADST:
  ------------------
  |  Branch (4191:5): [True: 77.9k, False: 424k]
  ------------------
 4192|   463k|    case ADST_ADST:
  ------------------
  |  Branch (4192:5): [True: 67.9k, False: 434k]
  ------------------
 4193|   464k|    case FLIPADST_DCT:
  ------------------
  |  Branch (4193:5): [True: 491, False: 501k]
  ------------------
 4194|   464k|    case DCT_FLIPADST:
  ------------------
  |  Branch (4194:5): [True: 299, False: 501k]
  ------------------
 4195|   464k|    case FLIPADST_FLIPADST:
  ------------------
  |  Branch (4195:5): [True: 267, False: 501k]
  ------------------
 4196|   464k|    case ADST_FLIPADST:
  ------------------
  |  Branch (4196:5): [True: 206, False: 501k]
  ------------------
 4197|   465k|    case FLIPADST_ADST:
  ------------------
  |  Branch (4197:5): [True: 300, False: 501k]
  ------------------
 4198|   465k|      highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
  ------------------
  |  |   75|   465k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 4199|   465k|                                             stride, tx_type, tx_size, eob, bd);
 4200|   465k|      break;
 4201|  19.8k|    case IDTX:
  ------------------
  |  Branch (4201:5): [True: 19.8k, False: 482k]
  ------------------
 4202|  30.5k|    case H_DCT:
  ------------------
  |  Branch (4202:5): [True: 10.7k, False: 491k]
  ------------------
 4203|  30.7k|    case H_ADST:
  ------------------
  |  Branch (4203:5): [True: 243, False: 501k]
  ------------------
 4204|  30.9k|    case H_FLIPADST:
  ------------------
  |  Branch (4204:5): [True: 140, False: 501k]
  ------------------
 4205|  36.6k|    case V_DCT:
  ------------------
  |  Branch (4205:5): [True: 5.72k, False: 496k]
  ------------------
 4206|  36.7k|    case V_ADST:
  ------------------
  |  Branch (4206:5): [True: 130, False: 501k]
  ------------------
 4207|  36.8k|    case V_FLIPADST:
  ------------------
  |  Branch (4207:5): [True: 51, False: 501k]
  ------------------
 4208|  36.8k|      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type,
 4209|  36.8k|                                                tx_size, eob, bd);
 4210|  36.8k|      break;
 4211|      0|    default: assert(0); break;
  ------------------
  |  Branch (4211:5): [True: 0, False: 501k]
  ------------------
 4212|   501k|  }
 4213|   501k|}
highbd_inv_txfm_avx2.c:highbd_inv_txfm2d_add_no_identity_avx2:
 4111|   465k|                                                   const int bd) {
 4112|   465k|  __m256i buf1[64 * 8];
 4113|   465k|  int eobx, eoby;
 4114|   465k|  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
 4115|   465k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 4116|   465k|  const int txw_idx = get_txw_idx(tx_size);
 4117|   465k|  const int txh_idx = get_txh_idx(tx_size);
 4118|   465k|  const int txfm_size_col = tx_size_wide[tx_size];
 4119|   465k|  const int txfm_size_row = tx_size_high[tx_size];
 4120|   465k|  const int buf_size_w_div8 = txfm_size_col >> 3;
 4121|   465k|  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
 4122|   465k|  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
 4123|   465k|  const int input_stride = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|   465k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 8.07k, False: 457k]
  |  |  ------------------
  ------------------
 4124|   465k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 4125|   465k|  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
 4126|   465k|  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
 4127|   465k|  const transform_1d_avx2 row_txfm =
 4128|   465k|      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
 4129|   465k|  const transform_1d_avx2 col_txfm =
 4130|   465k|      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
 4131|       |
 4132|   465k|  assert(col_txfm != NULL);
 4133|   465k|  assert(row_txfm != NULL);
 4134|   465k|  int ud_flip, lr_flip;
 4135|   465k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 4136|       |
 4137|       |  // 1st stage: column transform
 4138|  1.00M|  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
  ------------------
  |  Branch (4138:19): [True: 542k, False: 465k]
  ------------------
 4139|   542k|    __m256i buf0[64];
 4140|   542k|    load_buffer_32bit_input(input + i * 8, input_stride, buf0,
 4141|   542k|                            buf_size_nonzero_w);
 4142|   542k|    if (rect_type == 1 || rect_type == -1) {
  ------------------
  |  Branch (4142:9): [True: 65.5k, False: 476k]
  |  Branch (4142:27): [True: 63.6k, False: 413k]
  ------------------
 4143|   129k|      round_shift_rect_array_32_avx2(buf0, buf0, buf_size_nonzero_w, 0,
 4144|   129k|                                     NewInvSqrt2);
 4145|   129k|    }
 4146|   542k|    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|   542k|#define INV_COS_BIT 12
  ------------------
 4147|       |
 4148|   542k|    __m256i *_buf1 = buf1 + i * 8;
 4149|   542k|    if (lr_flip) {
  ------------------
  |  Branch (4149:9): [True: 859, False: 541k]
  ------------------
 4150|  2.11k|      for (int j = 0; j < buf_size_w_div8; ++j) {
  ------------------
  |  Branch (4150:23): [True: 1.25k, False: 859]
  ------------------
 4151|  1.25k|        transpose_8x8_flip_avx2(
 4152|  1.25k|            &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]);
 4153|  1.25k|      }
 4154|   541k|    } else {
 4155|  1.62M|      for (int j = 0; j < buf_size_w_div8; ++j) {
  ------------------
  |  Branch (4155:23): [True: 1.08M, False: 541k]
  ------------------
 4156|  1.08M|        transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
 4157|  1.08M|      }
 4158|   541k|    }
 4159|   542k|  }
 4160|       |  // 2nd stage: column transform
 4161|  1.34M|  for (int i = 0; i < buf_size_w_div8; i++) {
  ------------------
  |  Branch (4161:19): [True: 883k, False: 465k]
  ------------------
 4162|   883k|    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
  ------------------
  |  |   43|   883k|#define INV_COS_BIT 12
  ------------------
 4163|   883k|             bd, 0);
 4164|       |
 4165|   883k|    round_shift_array_32_avx2(buf1 + i * txfm_size_row,
 4166|   883k|                              buf1 + i * txfm_size_row, txfm_size_row,
 4167|   883k|                              -shift[1]);
 4168|   883k|  }
 4169|       |
 4170|       |  // write to buffer
 4171|   465k|  if (txfm_size_col >= 16) {
  ------------------
  |  Branch (4171:7): [True: 233k, False: 231k]
  ------------------
 4172|   559k|    for (int i = 0; i < (txfm_size_col >> 4); i++) {
  ------------------
  |  Branch (4172:21): [True: 325k, False: 233k]
  ------------------
 4173|   325k|      highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
 4174|   325k|                                    output + 16 * i, stride, ud_flip,
 4175|   325k|                                    txfm_size_row, bd);
 4176|   325k|    }
 4177|   233k|  } else if (txfm_size_col == 8) {
  ------------------
  |  Branch (4177:14): [True: 231k, False: 18.4E]
  ------------------
 4178|   231k|    highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row,
 4179|   231k|                                 bd);
 4180|   231k|  }
 4181|   465k|}
highbd_inv_txfm_avx2.c:idct8x8_low1_avx2:
 2414|  65.1k|                              int bd, int out_shift) {
 2415|  65.1k|  const int32_t *cospi = cospi_arr(bit);
 2416|  65.1k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 2417|  65.1k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 2418|  65.1k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   130k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 65.1k]
  |  |  |  Branch (35:31): [True: 41.4k, False: 23.6k]
  |  |  |  Branch (35:44): [True: 41.4k, False: 23.6k]
  |  |  ------------------
  ------------------
 2419|  65.1k|  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 2420|  65.1k|  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 2421|  65.1k|  __m256i x;
 2422|       |
 2423|       |  // stage 0
 2424|       |  // stage 1
 2425|       |  // stage 2
 2426|       |  // stage 3
 2427|  65.1k|  x = _mm256_mullo_epi32(in[0], cospi32);
 2428|  65.1k|  x = _mm256_add_epi32(x, rnding);
 2429|  65.1k|  x = _mm256_srai_epi32(x, bit);
 2430|       |
 2431|       |  // stage 4
 2432|       |  // stage 5
 2433|  65.1k|  if (!do_cols) {
  ------------------
  |  Branch (2433:7): [True: 23.6k, False: 41.4k]
  ------------------
 2434|  23.6k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  23.6k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 23.6k]
  |  |  ------------------
  ------------------
 2435|  23.6k|    __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
 2436|  23.6k|    clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 2437|  23.6k|    clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 2438|  23.6k|    x = _mm256_add_epi32(x, offset);
 2439|  23.6k|    x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
 2440|  23.6k|  }
 2441|  65.1k|  x = _mm256_max_epi32(x, clamp_lo);
 2442|  65.1k|  x = _mm256_min_epi32(x, clamp_hi);
 2443|  65.1k|  out[0] = x;
 2444|  65.1k|  out[1] = x;
 2445|  65.1k|  out[2] = x;
 2446|  65.1k|  out[3] = x;
 2447|  65.1k|  out[4] = x;
 2448|  65.1k|  out[5] = x;
 2449|  65.1k|  out[6] = x;
 2450|  65.1k|  out[7] = x;
 2451|  65.1k|}
highbd_inv_txfm_avx2.c:idct8x8_avx2:
 2453|   344k|                         int bd, int out_shift) {
 2454|   344k|  const int32_t *cospi = cospi_arr(bit);
 2455|   344k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 2456|   344k|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
 2457|   344k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
 2458|   344k|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
 2459|   344k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
 2460|   344k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 2461|   344k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 2462|   344k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 2463|   344k|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
 2464|   344k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 2465|   344k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 2466|   344k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   689k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 344k]
  |  |  |  Branch (35:31): [True: 212k, False: 132k]
  |  |  |  Branch (35:44): [True: 212k, False: 132k]
  |  |  ------------------
  ------------------
 2467|   344k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 2468|   344k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 2469|   344k|  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
 2470|   344k|  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
 2471|   344k|  __m256i x, y;
 2472|       |
 2473|       |  // stage 0
 2474|       |  // stage 1
 2475|       |  // stage 2
 2476|   344k|  u0 = in[0];
 2477|   344k|  u1 = in[4];
 2478|   344k|  u2 = in[2];
 2479|   344k|  u3 = in[6];
 2480|       |
 2481|   344k|  x = _mm256_mullo_epi32(in[1], cospi56);
 2482|   344k|  y = _mm256_mullo_epi32(in[7], cospim8);
 2483|   344k|  u4 = _mm256_add_epi32(x, y);
 2484|   344k|  u4 = _mm256_add_epi32(u4, rnding);
 2485|   344k|  u4 = _mm256_srai_epi32(u4, bit);
 2486|       |
 2487|   344k|  x = _mm256_mullo_epi32(in[1], cospi8);
 2488|   344k|  y = _mm256_mullo_epi32(in[7], cospi56);
 2489|   344k|  u7 = _mm256_add_epi32(x, y);
 2490|   344k|  u7 = _mm256_add_epi32(u7, rnding);
 2491|   344k|  u7 = _mm256_srai_epi32(u7, bit);
 2492|       |
 2493|   344k|  x = _mm256_mullo_epi32(in[5], cospi24);
 2494|   344k|  y = _mm256_mullo_epi32(in[3], cospim40);
 2495|   344k|  u5 = _mm256_add_epi32(x, y);
 2496|   344k|  u5 = _mm256_add_epi32(u5, rnding);
 2497|   344k|  u5 = _mm256_srai_epi32(u5, bit);
 2498|       |
 2499|   344k|  x = _mm256_mullo_epi32(in[5], cospi40);
 2500|   344k|  y = _mm256_mullo_epi32(in[3], cospi24);
 2501|   344k|  u6 = _mm256_add_epi32(x, y);
 2502|   344k|  u6 = _mm256_add_epi32(u6, rnding);
 2503|   344k|  u6 = _mm256_srai_epi32(u6, bit);
 2504|       |
 2505|       |  // stage 3
 2506|   344k|  x = _mm256_mullo_epi32(u0, cospi32);
 2507|   344k|  y = _mm256_mullo_epi32(u1, cospi32);
 2508|   344k|  v0 = _mm256_add_epi32(x, y);
 2509|   344k|  v0 = _mm256_add_epi32(v0, rnding);
 2510|   344k|  v0 = _mm256_srai_epi32(v0, bit);
 2511|       |
 2512|   344k|  v1 = _mm256_sub_epi32(x, y);
 2513|   344k|  v1 = _mm256_add_epi32(v1, rnding);
 2514|   344k|  v1 = _mm256_srai_epi32(v1, bit);
 2515|       |
 2516|   344k|  x = _mm256_mullo_epi32(u2, cospi48);
 2517|   344k|  y = _mm256_mullo_epi32(u3, cospim16);
 2518|   344k|  v2 = _mm256_add_epi32(x, y);
 2519|   344k|  v2 = _mm256_add_epi32(v2, rnding);
 2520|   344k|  v2 = _mm256_srai_epi32(v2, bit);
 2521|       |
 2522|   344k|  x = _mm256_mullo_epi32(u2, cospi16);
 2523|   344k|  y = _mm256_mullo_epi32(u3, cospi48);
 2524|   344k|  v3 = _mm256_add_epi32(x, y);
 2525|   344k|  v3 = _mm256_add_epi32(v3, rnding);
 2526|   344k|  v3 = _mm256_srai_epi32(v3, bit);
 2527|       |
 2528|   344k|  addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
 2529|   344k|  addsub_avx2(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
 2530|       |
 2531|       |  // stage 4
 2532|   344k|  addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
 2533|   344k|  addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
 2534|   344k|  u4 = v4;
 2535|   344k|  u7 = v7;
 2536|       |
 2537|   344k|  x = _mm256_mullo_epi32(v5, cospi32);
 2538|   344k|  y = _mm256_mullo_epi32(v6, cospi32);
 2539|   344k|  u6 = _mm256_add_epi32(y, x);
 2540|   344k|  u6 = _mm256_add_epi32(u6, rnding);
 2541|   344k|  u6 = _mm256_srai_epi32(u6, bit);
 2542|       |
 2543|   344k|  u5 = _mm256_sub_epi32(y, x);
 2544|   344k|  u5 = _mm256_add_epi32(u5, rnding);
 2545|   344k|  u5 = _mm256_srai_epi32(u5, bit);
 2546|       |
 2547|   344k|  addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
 2548|   344k|  addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
 2549|   344k|  addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
 2550|   344k|  addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
 2551|       |  // stage 5
 2552|   344k|  if (!do_cols) {
  ------------------
  |  Branch (2552:7): [True: 132k, False: 212k]
  ------------------
 2553|   132k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   132k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 132k]
  |  |  ------------------
  ------------------
 2554|   132k|    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 2555|   132k|    const __m256i clamp_hi_out =
 2556|   132k|        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 2557|       |
 2558|   132k|    round_shift_4x4_avx2(out, out_shift);
 2559|   132k|    round_shift_4x4_avx2(out + 4, out_shift);
 2560|   132k|    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8);
 2561|   132k|  }
 2562|   344k|}
highbd_inv_txfm_avx2.c:addsub_avx2:
  265|  29.5M|                        const __m256i *clamp_hi) {
  266|  29.5M|  __m256i a0 = _mm256_add_epi32(in0, in1);
  267|  29.5M|  __m256i a1 = _mm256_sub_epi32(in0, in1);
  268|       |
  269|  29.5M|  a0 = _mm256_max_epi32(a0, *clamp_lo);
  270|  29.5M|  a0 = _mm256_min_epi32(a0, *clamp_hi);
  271|  29.5M|  a1 = _mm256_max_epi32(a1, *clamp_lo);
  272|  29.5M|  a1 = _mm256_min_epi32(a1, *clamp_hi);
  273|       |
  274|  29.5M|  *out0 = a0;
  275|  29.5M|  *out1 = a1;
  276|  29.5M|}
highbd_inv_txfm_avx2.c:round_shift_4x4_avx2:
   50|  1.39M|static inline void round_shift_4x4_avx2(__m256i *in, int shift) {
   51|  1.39M|  if (shift != 0) {
  ------------------
  |  Branch (51:7): [True: 1.39M, False: 18.4E]
  ------------------
   52|  1.39M|    __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
   53|  1.39M|    in[0] = _mm256_add_epi32(in[0], rnding);
   54|  1.39M|    in[1] = _mm256_add_epi32(in[1], rnding);
   55|  1.39M|    in[2] = _mm256_add_epi32(in[2], rnding);
   56|  1.39M|    in[3] = _mm256_add_epi32(in[3], rnding);
   57|       |
   58|  1.39M|    in[0] = _mm256_srai_epi32(in[0], shift);
   59|  1.39M|    in[1] = _mm256_srai_epi32(in[1], shift);
   60|  1.39M|    in[2] = _mm256_srai_epi32(in[2], shift);
   61|  1.39M|    in[3] = _mm256_srai_epi32(in[3], shift);
   62|  1.39M|  }
   63|  1.39M|}
highbd_inv_txfm_avx2.c:highbd_clamp_epi32_avx2:
   74|   318k|                                    const __m256i *clamp_hi, int size) {
   75|   318k|  __m256i a0, a1;
   76|  1.71M|  for (int i = 0; i < size; i += 4) {
  ------------------
  |  Branch (76:19): [True: 1.39M, False: 318k]
  ------------------
   77|  1.39M|    a0 = _mm256_max_epi32(in[i], *clamp_lo);
   78|  1.39M|    out[i] = _mm256_min_epi32(a0, *clamp_hi);
   79|       |
   80|  1.39M|    a1 = _mm256_max_epi32(in[i + 1], *clamp_lo);
   81|  1.39M|    out[i + 1] = _mm256_min_epi32(a1, *clamp_hi);
   82|       |
   83|  1.39M|    a0 = _mm256_max_epi32(in[i + 2], *clamp_lo);
   84|  1.39M|    out[i + 2] = _mm256_min_epi32(a0, *clamp_hi);
   85|       |
   86|  1.39M|    a1 = _mm256_max_epi32(in[i + 3], *clamp_lo);
   87|  1.39M|    out[i + 3] = _mm256_min_epi32(a1, *clamp_hi);
   88|  1.39M|  }
   89|   318k|}
highbd_inv_txfm_avx2.c:iadst8x8_low1_avx2:
 2564|  27.5k|                               int bd, int out_shift) {
 2565|  27.5k|  const int32_t *cospi = cospi_arr(bit);
 2566|  27.5k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
 2567|  27.5k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
 2568|  27.5k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 2569|  27.5k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 2570|  27.5k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 2571|  27.5k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 2572|  27.5k|  const __m256i kZero = _mm256_setzero_si256();
 2573|  27.5k|  __m256i u[8], x;
 2574|       |
 2575|       |  // stage 0
 2576|       |  // stage 1
 2577|       |  // stage 2
 2578|       |
 2579|  27.5k|  x = _mm256_mullo_epi32(in[0], cospi60);
 2580|  27.5k|  u[0] = _mm256_add_epi32(x, rnding);
 2581|  27.5k|  u[0] = _mm256_srai_epi32(u[0], bit);
 2582|       |
 2583|  27.5k|  x = _mm256_mullo_epi32(in[0], cospi4);
 2584|  27.5k|  u[1] = _mm256_sub_epi32(kZero, x);
 2585|  27.5k|  u[1] = _mm256_add_epi32(u[1], rnding);
 2586|  27.5k|  u[1] = _mm256_srai_epi32(u[1], bit);
 2587|       |
 2588|       |  // stage 3
 2589|       |  // stage 4
 2590|  27.5k|  __m256i temp1, temp2;
 2591|  27.5k|  temp1 = _mm256_mullo_epi32(u[0], cospi16);
 2592|  27.5k|  x = _mm256_mullo_epi32(u[1], cospi48);
 2593|  27.5k|  temp1 = _mm256_add_epi32(temp1, x);
 2594|  27.5k|  temp1 = _mm256_add_epi32(temp1, rnding);
 2595|  27.5k|  temp1 = _mm256_srai_epi32(temp1, bit);
 2596|  27.5k|  u[4] = temp1;
 2597|       |
 2598|  27.5k|  temp2 = _mm256_mullo_epi32(u[0], cospi48);
 2599|  27.5k|  x = _mm256_mullo_epi32(u[1], cospi16);
 2600|  27.5k|  u[5] = _mm256_sub_epi32(temp2, x);
 2601|  27.5k|  u[5] = _mm256_add_epi32(u[5], rnding);
 2602|  27.5k|  u[5] = _mm256_srai_epi32(u[5], bit);
 2603|       |
 2604|       |  // stage 5
 2605|       |  // stage 6
 2606|  27.5k|  temp1 = _mm256_mullo_epi32(u[0], cospi32);
 2607|  27.5k|  x = _mm256_mullo_epi32(u[1], cospi32);
 2608|  27.5k|  u[2] = _mm256_add_epi32(temp1, x);
 2609|  27.5k|  u[2] = _mm256_add_epi32(u[2], rnding);
 2610|  27.5k|  u[2] = _mm256_srai_epi32(u[2], bit);
 2611|       |
 2612|  27.5k|  u[3] = _mm256_sub_epi32(temp1, x);
 2613|  27.5k|  u[3] = _mm256_add_epi32(u[3], rnding);
 2614|  27.5k|  u[3] = _mm256_srai_epi32(u[3], bit);
 2615|       |
 2616|  27.5k|  temp1 = _mm256_mullo_epi32(u[4], cospi32);
 2617|  27.5k|  x = _mm256_mullo_epi32(u[5], cospi32);
 2618|  27.5k|  u[6] = _mm256_add_epi32(temp1, x);
 2619|  27.5k|  u[6] = _mm256_add_epi32(u[6], rnding);
 2620|  27.5k|  u[6] = _mm256_srai_epi32(u[6], bit);
 2621|       |
 2622|  27.5k|  u[7] = _mm256_sub_epi32(temp1, x);
 2623|  27.5k|  u[7] = _mm256_add_epi32(u[7], rnding);
 2624|  27.5k|  u[7] = _mm256_srai_epi32(u[7], bit);
 2625|       |
 2626|       |  // stage 7
 2627|  27.5k|  if (do_cols) {
  ------------------
  |  Branch (2627:7): [True: 14.7k, False: 12.8k]
  ------------------
 2628|  14.7k|    out[0] = u[0];
 2629|  14.7k|    out[1] = _mm256_sub_epi32(kZero, u[4]);
 2630|  14.7k|    out[2] = u[6];
 2631|  14.7k|    out[3] = _mm256_sub_epi32(kZero, u[2]);
 2632|  14.7k|    out[4] = u[3];
 2633|  14.7k|    out[5] = _mm256_sub_epi32(kZero, u[7]);
 2634|  14.7k|    out[6] = u[5];
 2635|  14.7k|    out[7] = _mm256_sub_epi32(kZero, u[1]);
 2636|  14.7k|  } else {
 2637|  12.8k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  12.8k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 12.8k]
  |  |  ------------------
  ------------------
 2638|  12.8k|    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 2639|  12.8k|    const __m256i clamp_hi_out =
 2640|  12.8k|        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 2641|       |
 2642|  12.8k|    neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 2643|  12.8k|                   out_shift);
 2644|  12.8k|    neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
 2645|  12.8k|                   out_shift);
 2646|  12.8k|    neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
 2647|  12.8k|                   out_shift);
 2648|  12.8k|    neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
 2649|  12.8k|                   out_shift);
 2650|  12.8k|  }
 2651|  27.5k|}
highbd_inv_txfm_avx2.c:neg_shift_avx2:
  143|   938k|                           const __m256i *clamp_hi, int shift) {
  144|   938k|  __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
  145|   938k|  __m256i a0 = _mm256_add_epi32(offset, in0);
  146|   938k|  __m256i a1 = _mm256_sub_epi32(offset, in1);
  147|       |
  148|   938k|  a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  149|   938k|  a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
  150|       |
  151|   938k|  a0 = _mm256_max_epi32(a0, *clamp_lo);
  152|   938k|  a0 = _mm256_min_epi32(a0, *clamp_hi);
  153|   938k|  a1 = _mm256_max_epi32(a1, *clamp_lo);
  154|   938k|  a1 = _mm256_min_epi32(a1, *clamp_hi);
  155|       |
  156|   938k|  *out0 = a0;
  157|   938k|  *out1 = a1;
  158|   938k|}
highbd_inv_txfm_avx2.c:iadst8x8_avx2:
 2654|   167k|                          int bd, int out_shift) {
 2655|   167k|  const int32_t *cospi = cospi_arr(bit);
 2656|   167k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
 2657|   167k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
 2658|   167k|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
 2659|   167k|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
 2660|   167k|  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
 2661|   167k|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
 2662|   167k|  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
 2663|   167k|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
 2664|   167k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 2665|   167k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 2666|   167k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
 2667|   167k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 2668|   167k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 2669|   167k|  const __m256i kZero = _mm256_setzero_si256();
 2670|   167k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   334k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 167k]
  |  |  |  Branch (35:31): [True: 87.3k, False: 79.8k]
  |  |  |  Branch (35:44): [True: 87.3k, False: 79.8k]
  |  |  ------------------
  ------------------
 2671|   167k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 2672|   167k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 2673|   167k|  __m256i u[8], v[8], x;
 2674|       |
 2675|       |  // stage 0
 2676|       |  // stage 1
 2677|       |  // stage 2
 2678|       |
 2679|   167k|  u[0] = _mm256_mullo_epi32(in[7], cospi4);
 2680|   167k|  x = _mm256_mullo_epi32(in[0], cospi60);
 2681|   167k|  u[0] = _mm256_add_epi32(u[0], x);
 2682|   167k|  u[0] = _mm256_add_epi32(u[0], rnding);
 2683|   167k|  u[0] = _mm256_srai_epi32(u[0], bit);
 2684|       |
 2685|   167k|  u[1] = _mm256_mullo_epi32(in[7], cospi60);
 2686|   167k|  x = _mm256_mullo_epi32(in[0], cospi4);
 2687|   167k|  u[1] = _mm256_sub_epi32(u[1], x);
 2688|   167k|  u[1] = _mm256_add_epi32(u[1], rnding);
 2689|   167k|  u[1] = _mm256_srai_epi32(u[1], bit);
 2690|       |
 2691|   167k|  u[2] = _mm256_mullo_epi32(in[5], cospi20);
 2692|   167k|  x = _mm256_mullo_epi32(in[2], cospi44);
 2693|   167k|  u[2] = _mm256_add_epi32(u[2], x);
 2694|   167k|  u[2] = _mm256_add_epi32(u[2], rnding);
 2695|   167k|  u[2] = _mm256_srai_epi32(u[2], bit);
 2696|       |
 2697|   167k|  u[3] = _mm256_mullo_epi32(in[5], cospi44);
 2698|   167k|  x = _mm256_mullo_epi32(in[2], cospi20);
 2699|   167k|  u[3] = _mm256_sub_epi32(u[3], x);
 2700|   167k|  u[3] = _mm256_add_epi32(u[3], rnding);
 2701|   167k|  u[3] = _mm256_srai_epi32(u[3], bit);
 2702|       |
 2703|   167k|  u[4] = _mm256_mullo_epi32(in[3], cospi36);
 2704|   167k|  x = _mm256_mullo_epi32(in[4], cospi28);
 2705|   167k|  u[4] = _mm256_add_epi32(u[4], x);
 2706|   167k|  u[4] = _mm256_add_epi32(u[4], rnding);
 2707|   167k|  u[4] = _mm256_srai_epi32(u[4], bit);
 2708|       |
 2709|   167k|  u[5] = _mm256_mullo_epi32(in[3], cospi28);
 2710|   167k|  x = _mm256_mullo_epi32(in[4], cospi36);
 2711|   167k|  u[5] = _mm256_sub_epi32(u[5], x);
 2712|   167k|  u[5] = _mm256_add_epi32(u[5], rnding);
 2713|   167k|  u[5] = _mm256_srai_epi32(u[5], bit);
 2714|       |
 2715|   167k|  u[6] = _mm256_mullo_epi32(in[1], cospi52);
 2716|   167k|  x = _mm256_mullo_epi32(in[6], cospi12);
 2717|   167k|  u[6] = _mm256_add_epi32(u[6], x);
 2718|   167k|  u[6] = _mm256_add_epi32(u[6], rnding);
 2719|   167k|  u[6] = _mm256_srai_epi32(u[6], bit);
 2720|       |
 2721|   167k|  u[7] = _mm256_mullo_epi32(in[1], cospi12);
 2722|   167k|  x = _mm256_mullo_epi32(in[6], cospi52);
 2723|   167k|  u[7] = _mm256_sub_epi32(u[7], x);
 2724|   167k|  u[7] = _mm256_add_epi32(u[7], rnding);
 2725|   167k|  u[7] = _mm256_srai_epi32(u[7], bit);
 2726|       |
 2727|       |  // stage 3
 2728|   167k|  addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
 2729|   167k|  addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
 2730|   167k|  addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
 2731|   167k|  addsub_avx2(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
 2732|       |
 2733|       |  // stage 4
 2734|   167k|  u[0] = v[0];
 2735|   167k|  u[1] = v[1];
 2736|   167k|  u[2] = v[2];
 2737|   167k|  u[3] = v[3];
 2738|       |
 2739|   167k|  u[4] = _mm256_mullo_epi32(v[4], cospi16);
 2740|   167k|  x = _mm256_mullo_epi32(v[5], cospi48);
 2741|   167k|  u[4] = _mm256_add_epi32(u[4], x);
 2742|   167k|  u[4] = _mm256_add_epi32(u[4], rnding);
 2743|   167k|  u[4] = _mm256_srai_epi32(u[4], bit);
 2744|       |
 2745|   167k|  u[5] = _mm256_mullo_epi32(v[4], cospi48);
 2746|   167k|  x = _mm256_mullo_epi32(v[5], cospi16);
 2747|   167k|  u[5] = _mm256_sub_epi32(u[5], x);
 2748|   167k|  u[5] = _mm256_add_epi32(u[5], rnding);
 2749|   167k|  u[5] = _mm256_srai_epi32(u[5], bit);
 2750|       |
 2751|   167k|  u[6] = _mm256_mullo_epi32(v[6], cospim48);
 2752|   167k|  x = _mm256_mullo_epi32(v[7], cospi16);
 2753|   167k|  u[6] = _mm256_add_epi32(u[6], x);
 2754|   167k|  u[6] = _mm256_add_epi32(u[6], rnding);
 2755|   167k|  u[6] = _mm256_srai_epi32(u[6], bit);
 2756|       |
 2757|   167k|  u[7] = _mm256_mullo_epi32(v[6], cospi16);
 2758|   167k|  x = _mm256_mullo_epi32(v[7], cospim48);
 2759|   167k|  u[7] = _mm256_sub_epi32(u[7], x);
 2760|   167k|  u[7] = _mm256_add_epi32(u[7], rnding);
 2761|   167k|  u[7] = _mm256_srai_epi32(u[7], bit);
 2762|       |
 2763|       |  // stage 5
 2764|   167k|  addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
 2765|   167k|  addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
 2766|   167k|  addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
 2767|   167k|  addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
 2768|       |
 2769|       |  // stage 6
 2770|   167k|  u[0] = v[0];
 2771|   167k|  u[1] = v[1];
 2772|   167k|  u[4] = v[4];
 2773|   167k|  u[5] = v[5];
 2774|       |
 2775|   167k|  v[0] = _mm256_mullo_epi32(v[2], cospi32);
 2776|   167k|  x = _mm256_mullo_epi32(v[3], cospi32);
 2777|   167k|  u[2] = _mm256_add_epi32(v[0], x);
 2778|   167k|  u[2] = _mm256_add_epi32(u[2], rnding);
 2779|   167k|  u[2] = _mm256_srai_epi32(u[2], bit);
 2780|       |
 2781|   167k|  u[3] = _mm256_sub_epi32(v[0], x);
 2782|   167k|  u[3] = _mm256_add_epi32(u[3], rnding);
 2783|   167k|  u[3] = _mm256_srai_epi32(u[3], bit);
 2784|       |
 2785|   167k|  v[0] = _mm256_mullo_epi32(v[6], cospi32);
 2786|   167k|  x = _mm256_mullo_epi32(v[7], cospi32);
 2787|   167k|  u[6] = _mm256_add_epi32(v[0], x);
 2788|   167k|  u[6] = _mm256_add_epi32(u[6], rnding);
 2789|   167k|  u[6] = _mm256_srai_epi32(u[6], bit);
 2790|       |
 2791|   167k|  u[7] = _mm256_sub_epi32(v[0], x);
 2792|   167k|  u[7] = _mm256_add_epi32(u[7], rnding);
 2793|   167k|  u[7] = _mm256_srai_epi32(u[7], bit);
 2794|       |
 2795|       |  // stage 7
 2796|   167k|  if (do_cols) {
  ------------------
  |  Branch (2796:7): [True: 87.3k, False: 79.8k]
  ------------------
 2797|  87.3k|    out[0] = u[0];
 2798|  87.3k|    out[1] = _mm256_sub_epi32(kZero, u[4]);
 2799|  87.3k|    out[2] = u[6];
 2800|  87.3k|    out[3] = _mm256_sub_epi32(kZero, u[2]);
 2801|  87.3k|    out[4] = u[3];
 2802|  87.3k|    out[5] = _mm256_sub_epi32(kZero, u[7]);
 2803|  87.3k|    out[6] = u[5];
 2804|  87.3k|    out[7] = _mm256_sub_epi32(kZero, u[1]);
 2805|  87.3k|  } else {
 2806|  79.8k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  79.8k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 79.8k]
  |  |  ------------------
  ------------------
 2807|  79.8k|    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 2808|  79.8k|    const __m256i clamp_hi_out =
 2809|  79.8k|        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 2810|       |
 2811|  79.8k|    neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 2812|  79.8k|                   out_shift);
 2813|  79.8k|    neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
 2814|  79.8k|                   out_shift);
 2815|  79.8k|    neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
 2816|  79.8k|                   out_shift);
 2817|  79.8k|    neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
 2818|  79.8k|                   out_shift);
 2819|  79.8k|  }
 2820|   167k|}
highbd_inv_txfm_avx2.c:idct16_low1_avx2:
 1155|  52.2k|                             int bd, int out_shift) {
 1156|  52.2k|  const int32_t *cospi = cospi_arr(bit);
 1157|  52.2k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 1158|  52.2k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 1159|  52.2k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   104k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 52.2k]
  |  |  |  Branch (35:31): [True: 35.6k, False: 16.6k]
  |  |  |  Branch (35:44): [True: 35.6k, False: 16.6k]
  |  |  ------------------
  ------------------
 1160|  52.2k|  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 1161|  52.2k|  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 1162|       |
 1163|  52.2k|  {
 1164|       |    // stage 0
 1165|       |    // stage 1
 1166|       |    // stage 2
 1167|       |    // stage 3
 1168|       |    // stage 4
 1169|  52.2k|    in[0] = _mm256_mullo_epi32(in[0], cospi32);
 1170|  52.2k|    in[0] = _mm256_add_epi32(in[0], rnding);
 1171|  52.2k|    in[0] = _mm256_srai_epi32(in[0], bit);
 1172|       |
 1173|       |    // stage 5
 1174|       |    // stage 6
 1175|       |    // stage 7
 1176|  52.2k|    if (!do_cols) {
  ------------------
  |  Branch (1176:9): [True: 16.6k, False: 35.6k]
  ------------------
 1177|  16.6k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  16.6k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 16.6k]
  |  |  ------------------
  ------------------
 1178|  16.6k|      clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 1179|  16.6k|      clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 1180|  16.6k|      __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
 1181|  16.6k|      in[0] = _mm256_add_epi32(in[0], offset);
 1182|  16.6k|      in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
 1183|  16.6k|    }
 1184|  52.2k|    in[0] = _mm256_max_epi32(in[0], clamp_lo);
 1185|  52.2k|    in[0] = _mm256_min_epi32(in[0], clamp_hi);
 1186|  52.2k|    out[0] = in[0];
 1187|  52.2k|    out[1] = in[0];
 1188|  52.2k|    out[2] = in[0];
 1189|  52.2k|    out[3] = in[0];
 1190|  52.2k|    out[4] = in[0];
 1191|  52.2k|    out[5] = in[0];
 1192|  52.2k|    out[6] = in[0];
 1193|  52.2k|    out[7] = in[0];
 1194|  52.2k|    out[8] = in[0];
 1195|  52.2k|    out[9] = in[0];
 1196|  52.2k|    out[10] = in[0];
 1197|  52.2k|    out[11] = in[0];
 1198|  52.2k|    out[12] = in[0];
 1199|  52.2k|    out[13] = in[0];
 1200|  52.2k|    out[14] = in[0];
 1201|  52.2k|    out[15] = in[0];
 1202|  52.2k|  }
 1203|  52.2k|}
highbd_inv_txfm_avx2.c:idct16_low8_avx2:
 1206|   173k|                             int bd, int out_shift) {
 1207|   173k|  const int32_t *cospi = cospi_arr(bit);
 1208|   173k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
 1209|   173k|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
 1210|   173k|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
 1211|   173k|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
 1212|   173k|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
 1213|   173k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
 1214|   173k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 1215|   173k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
 1216|   173k|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
 1217|   173k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 1218|   173k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 1219|   173k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 1220|   173k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 1221|   173k|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
 1222|   173k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
 1223|   173k|  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
 1224|   173k|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
 1225|   173k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 1226|   173k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   346k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 173k]
  |  |  |  Branch (35:31): [True: 117k, False: 55.7k]
  |  |  |  Branch (35:44): [True: 117k, False: 55.7k]
  |  |  ------------------
  ------------------
 1227|   173k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 1228|   173k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 1229|   173k|  __m256i u[16], x, y;
 1230|       |
 1231|   173k|  {
 1232|       |    // stage 0
 1233|       |    // stage 1
 1234|   173k|    u[0] = in[0];
 1235|   173k|    u[2] = in[4];
 1236|   173k|    u[4] = in[2];
 1237|   173k|    u[6] = in[6];
 1238|   173k|    u[8] = in[1];
 1239|   173k|    u[10] = in[5];
 1240|   173k|    u[12] = in[3];
 1241|   173k|    u[14] = in[7];
 1242|       |
 1243|       |    // stage 2
 1244|   173k|    u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
 1245|   173k|    u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
 1246|       |
 1247|   173k|    u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
 1248|   173k|    u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
 1249|       |
 1250|   173k|    u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
 1251|   173k|    u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
 1252|       |
 1253|   173k|    u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
 1254|   173k|    u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
 1255|       |
 1256|       |    // stage 3
 1257|   173k|    u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
 1258|   173k|    u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
 1259|   173k|    u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit);
 1260|   173k|    u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit);
 1261|       |
 1262|   173k|    addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
 1263|   173k|    addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
 1264|   173k|    addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
 1265|   173k|    addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
 1266|       |
 1267|       |    // stage 4
 1268|   173k|    x = _mm256_mullo_epi32(u[0], cospi32);
 1269|   173k|    u[0] = _mm256_add_epi32(x, rnding);
 1270|   173k|    u[0] = _mm256_srai_epi32(u[0], bit);
 1271|   173k|    u[1] = u[0];
 1272|       |
 1273|   173k|    u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
 1274|   173k|    u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
 1275|       |
 1276|   173k|    addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
 1277|   173k|    addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
 1278|       |
 1279|   173k|    x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
 1280|   173k|    u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
 1281|   173k|    u[9] = x;
 1282|   173k|    y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
 1283|   173k|    u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
 1284|   173k|    u[10] = y;
 1285|       |
 1286|       |    // stage 5
 1287|   173k|    addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
 1288|   173k|    addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
 1289|       |
 1290|   173k|    x = _mm256_mullo_epi32(u[5], cospi32);
 1291|   173k|    y = _mm256_mullo_epi32(u[6], cospi32);
 1292|   173k|    u[5] = _mm256_sub_epi32(y, x);
 1293|   173k|    u[5] = _mm256_add_epi32(u[5], rnding);
 1294|   173k|    u[5] = _mm256_srai_epi32(u[5], bit);
 1295|       |
 1296|   173k|    u[6] = _mm256_add_epi32(y, x);
 1297|   173k|    u[6] = _mm256_add_epi32(u[6], rnding);
 1298|   173k|    u[6] = _mm256_srai_epi32(u[6], bit);
 1299|       |
 1300|   173k|    addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
 1301|   173k|    addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
 1302|   173k|    addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
 1303|   173k|    addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
 1304|       |
 1305|       |    // stage 6
 1306|   173k|    addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
 1307|   173k|    addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
 1308|   173k|    addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
 1309|   173k|    addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
 1310|       |
 1311|   173k|    x = _mm256_mullo_epi32(u[10], cospi32);
 1312|   173k|    y = _mm256_mullo_epi32(u[13], cospi32);
 1313|   173k|    u[10] = _mm256_sub_epi32(y, x);
 1314|   173k|    u[10] = _mm256_add_epi32(u[10], rnding);
 1315|   173k|    u[10] = _mm256_srai_epi32(u[10], bit);
 1316|       |
 1317|   173k|    u[13] = _mm256_add_epi32(x, y);
 1318|   173k|    u[13] = _mm256_add_epi32(u[13], rnding);
 1319|   173k|    u[13] = _mm256_srai_epi32(u[13], bit);
 1320|       |
 1321|   173k|    x = _mm256_mullo_epi32(u[11], cospi32);
 1322|   173k|    y = _mm256_mullo_epi32(u[12], cospi32);
 1323|   173k|    u[11] = _mm256_sub_epi32(y, x);
 1324|   173k|    u[11] = _mm256_add_epi32(u[11], rnding);
 1325|   173k|    u[11] = _mm256_srai_epi32(u[11], bit);
 1326|       |
 1327|   173k|    u[12] = _mm256_add_epi32(x, y);
 1328|   173k|    u[12] = _mm256_add_epi32(u[12], rnding);
 1329|   173k|    u[12] = _mm256_srai_epi32(u[12], bit);
 1330|       |    // stage 7
 1331|   173k|    addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
 1332|   173k|    addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
 1333|   173k|    addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
 1334|   173k|    addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
 1335|   173k|    addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
 1336|   173k|    addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
 1337|   173k|    addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
 1338|   173k|    addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
 1339|       |
 1340|   173k|    if (!do_cols) {
  ------------------
  |  Branch (1340:9): [True: 55.7k, False: 117k]
  ------------------
 1341|  55.7k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  55.7k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 55.7k]
  |  |  ------------------
  ------------------
 1342|  55.7k|      const __m256i clamp_lo_out =
 1343|  55.7k|          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 1344|  55.7k|      const __m256i clamp_hi_out =
 1345|  55.7k|          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 1346|  55.7k|      round_shift_8x8_avx2(out, out_shift);
 1347|  55.7k|      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
 1348|  55.7k|    }
 1349|   173k|  }
 1350|   173k|}
highbd_inv_txfm_avx2.c:half_btf_0_avx2:
  242|  6.71M|                                      const __m256i *rounding, int bit) {
  243|  6.71M|  __m256i x;
  244|  6.71M|  x = _mm256_mullo_epi32(*w0, *n0);
  245|  6.71M|  x = _mm256_add_epi32(x, *rounding);
  246|  6.71M|  x = _mm256_srai_epi32(x, bit);
  247|  6.71M|  return x;
  248|  6.71M|}
highbd_inv_txfm_avx2.c:half_btf_avx2:
  252|  14.3M|                                    const __m256i *rounding, int bit) {
  253|  14.3M|  __m256i x, y;
  254|       |
  255|  14.3M|  x = _mm256_mullo_epi32(*w0, *n0);
  256|  14.3M|  y = _mm256_mullo_epi32(*w1, *n1);
  257|  14.3M|  x = _mm256_add_epi32(x, y);
  258|  14.3M|  x = _mm256_add_epi32(x, *rounding);
  259|  14.3M|  x = _mm256_srai_epi32(x, bit);
  260|  14.3M|  return x;
  261|  14.3M|}
highbd_inv_txfm_avx2.c:round_shift_8x8_avx2:
   65|   283k|static inline void round_shift_8x8_avx2(__m256i *in, int shift) {
   66|   283k|  round_shift_4x4_avx2(in, shift);
   67|   283k|  round_shift_4x4_avx2(in + 4, shift);
   68|   283k|  round_shift_4x4_avx2(in + 8, shift);
   69|   283k|  round_shift_4x4_avx2(in + 12, shift);
   70|   283k|}
highbd_inv_txfm_avx2.c:idct16_avx2:
 1353|   106k|                        int out_shift) {
 1354|   106k|  const int32_t *cospi = cospi_arr(bit);
 1355|   106k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
 1356|   106k|  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
 1357|   106k|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
 1358|   106k|  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
 1359|   106k|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
 1360|   106k|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
 1361|   106k|  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
 1362|   106k|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
 1363|   106k|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
 1364|   106k|  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
 1365|   106k|  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
 1366|   106k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
 1367|   106k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 1368|   106k|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
 1369|   106k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
 1370|   106k|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
 1371|   106k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
 1372|   106k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 1373|   106k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 1374|   106k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 1375|   106k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 1376|   106k|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
 1377|   106k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
 1378|   106k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 1379|   106k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   212k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 106k]
  |  |  |  Branch (35:31): [True: 57.1k, False: 49.0k]
  |  |  |  Branch (35:44): [True: 57.1k, False: 49.0k]
  |  |  ------------------
  ------------------
 1380|   106k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 1381|   106k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 1382|   106k|  __m256i u[16], v[16], x, y;
 1383|       |
 1384|   106k|  {
 1385|       |    // stage 0
 1386|       |    // stage 1
 1387|   106k|    u[0] = in[0];
 1388|   106k|    u[1] = in[8];
 1389|   106k|    u[2] = in[4];
 1390|   106k|    u[3] = in[12];
 1391|   106k|    u[4] = in[2];
 1392|   106k|    u[5] = in[10];
 1393|   106k|    u[6] = in[6];
 1394|   106k|    u[7] = in[14];
 1395|   106k|    u[8] = in[1];
 1396|   106k|    u[9] = in[9];
 1397|   106k|    u[10] = in[5];
 1398|   106k|    u[11] = in[13];
 1399|   106k|    u[12] = in[3];
 1400|   106k|    u[13] = in[11];
 1401|   106k|    u[14] = in[7];
 1402|   106k|    u[15] = in[15];
 1403|       |
 1404|       |    // stage 2
 1405|   106k|    v[0] = u[0];
 1406|   106k|    v[1] = u[1];
 1407|   106k|    v[2] = u[2];
 1408|   106k|    v[3] = u[3];
 1409|   106k|    v[4] = u[4];
 1410|   106k|    v[5] = u[5];
 1411|   106k|    v[6] = u[6];
 1412|   106k|    v[7] = u[7];
 1413|       |
 1414|   106k|    v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
 1415|   106k|    v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
 1416|   106k|    v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
 1417|   106k|    v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
 1418|   106k|    v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
 1419|   106k|    v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
 1420|   106k|    v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
 1421|   106k|    v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
 1422|       |
 1423|       |    // stage 3
 1424|   106k|    u[0] = v[0];
 1425|   106k|    u[1] = v[1];
 1426|   106k|    u[2] = v[2];
 1427|   106k|    u[3] = v[3];
 1428|   106k|    u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
 1429|   106k|    u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
 1430|   106k|    u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
 1431|   106k|    u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
 1432|   106k|    addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
 1433|   106k|    addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
 1434|   106k|    addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
 1435|   106k|    addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
 1436|       |
 1437|       |    // stage 4
 1438|   106k|    x = _mm256_mullo_epi32(u[0], cospi32);
 1439|   106k|    y = _mm256_mullo_epi32(u[1], cospi32);
 1440|   106k|    v[0] = _mm256_add_epi32(x, y);
 1441|   106k|    v[0] = _mm256_add_epi32(v[0], rnding);
 1442|   106k|    v[0] = _mm256_srai_epi32(v[0], bit);
 1443|       |
 1444|   106k|    v[1] = _mm256_sub_epi32(x, y);
 1445|   106k|    v[1] = _mm256_add_epi32(v[1], rnding);
 1446|   106k|    v[1] = _mm256_srai_epi32(v[1], bit);
 1447|       |
 1448|   106k|    v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
 1449|   106k|    v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
 1450|   106k|    addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
 1451|   106k|    addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
 1452|   106k|    v[8] = u[8];
 1453|   106k|    v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
 1454|   106k|    v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
 1455|   106k|    v[11] = u[11];
 1456|   106k|    v[12] = u[12];
 1457|   106k|    v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
 1458|   106k|    v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
 1459|   106k|    v[15] = u[15];
 1460|       |
 1461|       |    // stage 5
 1462|   106k|    addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
 1463|   106k|    addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
 1464|   106k|    u[4] = v[4];
 1465|       |
 1466|   106k|    x = _mm256_mullo_epi32(v[5], cospi32);
 1467|   106k|    y = _mm256_mullo_epi32(v[6], cospi32);
 1468|   106k|    u[5] = _mm256_sub_epi32(y, x);
 1469|   106k|    u[5] = _mm256_add_epi32(u[5], rnding);
 1470|   106k|    u[5] = _mm256_srai_epi32(u[5], bit);
 1471|       |
 1472|   106k|    u[6] = _mm256_add_epi32(y, x);
 1473|   106k|    u[6] = _mm256_add_epi32(u[6], rnding);
 1474|   106k|    u[6] = _mm256_srai_epi32(u[6], bit);
 1475|       |
 1476|   106k|    u[7] = v[7];
 1477|   106k|    addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
 1478|   106k|    addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
 1479|   106k|    addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
 1480|   106k|    addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
 1481|       |
 1482|       |    // stage 6
 1483|   106k|    addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
 1484|   106k|    addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
 1485|   106k|    addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
 1486|   106k|    addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
 1487|   106k|    v[8] = u[8];
 1488|   106k|    v[9] = u[9];
 1489|       |
 1490|   106k|    x = _mm256_mullo_epi32(u[10], cospi32);
 1491|   106k|    y = _mm256_mullo_epi32(u[13], cospi32);
 1492|   106k|    v[10] = _mm256_sub_epi32(y, x);
 1493|   106k|    v[10] = _mm256_add_epi32(v[10], rnding);
 1494|   106k|    v[10] = _mm256_srai_epi32(v[10], bit);
 1495|       |
 1496|   106k|    v[13] = _mm256_add_epi32(x, y);
 1497|   106k|    v[13] = _mm256_add_epi32(v[13], rnding);
 1498|   106k|    v[13] = _mm256_srai_epi32(v[13], bit);
 1499|       |
 1500|   106k|    x = _mm256_mullo_epi32(u[11], cospi32);
 1501|   106k|    y = _mm256_mullo_epi32(u[12], cospi32);
 1502|   106k|    v[11] = _mm256_sub_epi32(y, x);
 1503|   106k|    v[11] = _mm256_add_epi32(v[11], rnding);
 1504|   106k|    v[11] = _mm256_srai_epi32(v[11], bit);
 1505|       |
 1506|   106k|    v[12] = _mm256_add_epi32(x, y);
 1507|   106k|    v[12] = _mm256_add_epi32(v[12], rnding);
 1508|   106k|    v[12] = _mm256_srai_epi32(v[12], bit);
 1509|       |
 1510|   106k|    v[14] = u[14];
 1511|   106k|    v[15] = u[15];
 1512|       |
 1513|       |    // stage 7
 1514|   106k|    addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
 1515|   106k|    addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
 1516|   106k|    addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
 1517|   106k|    addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
 1518|   106k|    addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
 1519|   106k|    addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
 1520|   106k|    addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
 1521|   106k|    addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
 1522|       |
 1523|   106k|    if (!do_cols) {
  ------------------
  |  Branch (1523:9): [True: 49.0k, False: 57.1k]
  ------------------
 1524|  49.0k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  49.0k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 49.0k]
  |  |  ------------------
  ------------------
 1525|  49.0k|      const __m256i clamp_lo_out =
 1526|  49.0k|          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 1527|  49.0k|      const __m256i clamp_hi_out =
 1528|  49.0k|          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 1529|  49.0k|      round_shift_8x8_avx2(out, out_shift);
 1530|  49.0k|      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
 1531|  49.0k|    }
 1532|   106k|  }
 1533|   106k|}
highbd_inv_txfm_avx2.c:iadst16_low1_avx2:
 1536|  18.7k|                              int bd, int out_shift) {
 1537|  18.7k|  const int32_t *cospi = cospi_arr(bit);
 1538|  18.7k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
 1539|  18.7k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
 1540|  18.7k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 1541|  18.7k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 1542|  18.7k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 1543|  18.7k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 1544|  18.7k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 1545|  18.7k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 1546|  18.7k|  const __m256i zero = _mm256_setzero_si256();
 1547|  18.7k|  __m256i v[16], x, y, temp1, temp2;
 1548|       |
 1549|       |  // Calculate the column 0, 1, 2, 3
 1550|  18.7k|  {
 1551|       |    // stage 0
 1552|       |    // stage 1
 1553|       |    // stage 2
 1554|  18.7k|    x = _mm256_mullo_epi32(in[0], cospi62);
 1555|  18.7k|    v[0] = _mm256_add_epi32(x, rnding);
 1556|  18.7k|    v[0] = _mm256_srai_epi32(v[0], bit);
 1557|       |
 1558|  18.7k|    x = _mm256_mullo_epi32(in[0], cospi2);
 1559|  18.7k|    v[1] = _mm256_sub_epi32(zero, x);
 1560|  18.7k|    v[1] = _mm256_add_epi32(v[1], rnding);
 1561|  18.7k|    v[1] = _mm256_srai_epi32(v[1], bit);
 1562|       |
 1563|       |    // stage 3
 1564|  18.7k|    v[8] = v[0];
 1565|  18.7k|    v[9] = v[1];
 1566|       |
 1567|       |    // stage 4
 1568|  18.7k|    temp1 = _mm256_mullo_epi32(v[8], cospi8);
 1569|  18.7k|    x = _mm256_mullo_epi32(v[9], cospi56);
 1570|  18.7k|    temp1 = _mm256_add_epi32(temp1, x);
 1571|  18.7k|    temp1 = _mm256_add_epi32(temp1, rnding);
 1572|  18.7k|    temp1 = _mm256_srai_epi32(temp1, bit);
 1573|       |
 1574|  18.7k|    temp2 = _mm256_mullo_epi32(v[8], cospi56);
 1575|  18.7k|    x = _mm256_mullo_epi32(v[9], cospi8);
 1576|  18.7k|    temp2 = _mm256_sub_epi32(temp2, x);
 1577|  18.7k|    temp2 = _mm256_add_epi32(temp2, rnding);
 1578|  18.7k|    temp2 = _mm256_srai_epi32(temp2, bit);
 1579|  18.7k|    v[8] = temp1;
 1580|  18.7k|    v[9] = temp2;
 1581|       |
 1582|       |    // stage 5
 1583|  18.7k|    v[4] = v[0];
 1584|  18.7k|    v[5] = v[1];
 1585|  18.7k|    v[12] = v[8];
 1586|  18.7k|    v[13] = v[9];
 1587|       |
 1588|       |    // stage 6
 1589|  18.7k|    temp1 = _mm256_mullo_epi32(v[4], cospi16);
 1590|  18.7k|    x = _mm256_mullo_epi32(v[5], cospi48);
 1591|  18.7k|    temp1 = _mm256_add_epi32(temp1, x);
 1592|  18.7k|    temp1 = _mm256_add_epi32(temp1, rnding);
 1593|  18.7k|    temp1 = _mm256_srai_epi32(temp1, bit);
 1594|       |
 1595|  18.7k|    temp2 = _mm256_mullo_epi32(v[4], cospi48);
 1596|  18.7k|    x = _mm256_mullo_epi32(v[5], cospi16);
 1597|  18.7k|    temp2 = _mm256_sub_epi32(temp2, x);
 1598|  18.7k|    temp2 = _mm256_add_epi32(temp2, rnding);
 1599|  18.7k|    temp2 = _mm256_srai_epi32(temp2, bit);
 1600|  18.7k|    v[4] = temp1;
 1601|  18.7k|    v[5] = temp2;
 1602|       |
 1603|  18.7k|    temp1 = _mm256_mullo_epi32(v[12], cospi16);
 1604|  18.7k|    x = _mm256_mullo_epi32(v[13], cospi48);
 1605|  18.7k|    temp1 = _mm256_add_epi32(temp1, x);
 1606|  18.7k|    temp1 = _mm256_add_epi32(temp1, rnding);
 1607|  18.7k|    temp1 = _mm256_srai_epi32(temp1, bit);
 1608|       |
 1609|  18.7k|    temp2 = _mm256_mullo_epi32(v[12], cospi48);
 1610|  18.7k|    x = _mm256_mullo_epi32(v[13], cospi16);
 1611|  18.7k|    temp2 = _mm256_sub_epi32(temp2, x);
 1612|  18.7k|    temp2 = _mm256_add_epi32(temp2, rnding);
 1613|  18.7k|    temp2 = _mm256_srai_epi32(temp2, bit);
 1614|  18.7k|    v[12] = temp1;
 1615|  18.7k|    v[13] = temp2;
 1616|       |
 1617|       |    // stage 7
 1618|  18.7k|    v[2] = v[0];
 1619|  18.7k|    v[3] = v[1];
 1620|  18.7k|    v[6] = v[4];
 1621|  18.7k|    v[7] = v[5];
 1622|  18.7k|    v[10] = v[8];
 1623|  18.7k|    v[11] = v[9];
 1624|  18.7k|    v[14] = v[12];
 1625|  18.7k|    v[15] = v[13];
 1626|       |
 1627|       |    // stage 8
 1628|  18.7k|    y = _mm256_mullo_epi32(v[2], cospi32);
 1629|  18.7k|    x = _mm256_mullo_epi32(v[3], cospi32);
 1630|  18.7k|    v[2] = _mm256_add_epi32(y, x);
 1631|  18.7k|    v[2] = _mm256_add_epi32(v[2], rnding);
 1632|  18.7k|    v[2] = _mm256_srai_epi32(v[2], bit);
 1633|       |
 1634|  18.7k|    v[3] = _mm256_sub_epi32(y, x);
 1635|  18.7k|    v[3] = _mm256_add_epi32(v[3], rnding);
 1636|  18.7k|    v[3] = _mm256_srai_epi32(v[3], bit);
 1637|       |
 1638|  18.7k|    y = _mm256_mullo_epi32(v[6], cospi32);
 1639|  18.7k|    x = _mm256_mullo_epi32(v[7], cospi32);
 1640|  18.7k|    v[6] = _mm256_add_epi32(y, x);
 1641|  18.7k|    v[6] = _mm256_add_epi32(v[6], rnding);
 1642|  18.7k|    v[6] = _mm256_srai_epi32(v[6], bit);
 1643|       |
 1644|  18.7k|    v[7] = _mm256_sub_epi32(y, x);
 1645|  18.7k|    v[7] = _mm256_add_epi32(v[7], rnding);
 1646|  18.7k|    v[7] = _mm256_srai_epi32(v[7], bit);
 1647|       |
 1648|  18.7k|    y = _mm256_mullo_epi32(v[10], cospi32);
 1649|  18.7k|    x = _mm256_mullo_epi32(v[11], cospi32);
 1650|  18.7k|    v[10] = _mm256_add_epi32(y, x);
 1651|  18.7k|    v[10] = _mm256_add_epi32(v[10], rnding);
 1652|  18.7k|    v[10] = _mm256_srai_epi32(v[10], bit);
 1653|       |
 1654|  18.7k|    v[11] = _mm256_sub_epi32(y, x);
 1655|  18.7k|    v[11] = _mm256_add_epi32(v[11], rnding);
 1656|  18.7k|    v[11] = _mm256_srai_epi32(v[11], bit);
 1657|       |
 1658|  18.7k|    y = _mm256_mullo_epi32(v[14], cospi32);
 1659|  18.7k|    x = _mm256_mullo_epi32(v[15], cospi32);
 1660|  18.7k|    v[14] = _mm256_add_epi32(y, x);
 1661|  18.7k|    v[14] = _mm256_add_epi32(v[14], rnding);
 1662|  18.7k|    v[14] = _mm256_srai_epi32(v[14], bit);
 1663|       |
 1664|  18.7k|    v[15] = _mm256_sub_epi32(y, x);
 1665|  18.7k|    v[15] = _mm256_add_epi32(v[15], rnding);
 1666|  18.7k|    v[15] = _mm256_srai_epi32(v[15], bit);
 1667|       |
 1668|       |    // stage 9
 1669|  18.7k|    if (do_cols) {
  ------------------
  |  Branch (1669:9): [True: 10.5k, False: 8.22k]
  ------------------
 1670|  10.5k|      out[0] = v[0];
 1671|  10.5k|      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
 1672|  10.5k|      out[2] = v[12];
 1673|  10.5k|      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
 1674|  10.5k|      out[4] = v[6];
 1675|  10.5k|      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
 1676|  10.5k|      out[6] = v[10];
 1677|  10.5k|      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
 1678|  10.5k|      out[8] = v[3];
 1679|  10.5k|      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
 1680|  10.5k|      out[10] = v[15];
 1681|  10.5k|      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
 1682|  10.5k|      out[12] = v[5];
 1683|  10.5k|      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
 1684|  10.5k|      out[14] = v[9];
 1685|  10.5k|      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
 1686|  10.5k|    } else {
 1687|  8.22k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  8.22k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 8.22k]
  |  |  ------------------
  ------------------
 1688|  8.22k|      const __m256i clamp_lo_out =
 1689|  8.22k|          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 1690|  8.22k|      const __m256i clamp_hi_out =
 1691|  8.22k|          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 1692|       |
 1693|  8.22k|      neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 1694|  8.22k|                     out_shift);
 1695|  8.22k|      neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
 1696|  8.22k|                     &clamp_hi_out, out_shift);
 1697|  8.22k|      neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
 1698|  8.22k|                     &clamp_hi_out, out_shift);
 1699|  8.22k|      neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
 1700|  8.22k|                     &clamp_hi_out, out_shift);
 1701|  8.22k|      neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
 1702|  8.22k|                     &clamp_hi_out, out_shift);
 1703|  8.22k|      neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
 1704|  8.22k|                     &clamp_hi_out, out_shift);
 1705|  8.22k|      neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
 1706|  8.22k|                     &clamp_hi_out, out_shift);
 1707|  8.22k|      neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
 1708|  8.22k|                     &clamp_hi_out, out_shift);
 1709|  8.22k|    }
 1710|  18.7k|  }
 1711|  18.7k|}
highbd_inv_txfm_avx2.c:iadst16_low8_avx2:
 1714|  71.4k|                              int bd, int out_shift) {
 1715|  71.4k|  const int32_t *cospi = cospi_arr(bit);
 1716|  71.4k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
 1717|  71.4k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
 1718|  71.4k|  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
 1719|  71.4k|  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
 1720|  71.4k|  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
 1721|  71.4k|  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
 1722|  71.4k|  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
 1723|  71.4k|  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
 1724|  71.4k|  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
 1725|  71.4k|  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
 1726|  71.4k|  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
 1727|  71.4k|  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
 1728|  71.4k|  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
 1729|  71.4k|  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
 1730|  71.4k|  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
 1731|  71.4k|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
 1732|  71.4k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 1733|  71.4k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 1734|  71.4k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
 1735|  71.4k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
 1736|  71.4k|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
 1737|  71.4k|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
 1738|  71.4k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 1739|  71.4k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 1740|  71.4k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
 1741|  71.4k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 1742|  71.4k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 1743|  71.4k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   142k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 71.4k]
  |  |  |  Branch (35:31): [True: 40.5k, False: 30.8k]
  |  |  |  Branch (35:44): [True: 40.5k, False: 30.8k]
  |  |  ------------------
  ------------------
 1744|  71.4k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 1745|  71.4k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 1746|  71.4k|  __m256i u[16], x, y;
 1747|       |
 1748|  71.4k|  {
 1749|       |    // stage 0
 1750|       |    // stage 1
 1751|       |    // stage 2
 1752|  71.4k|    __m256i zero = _mm256_setzero_si256();
 1753|  71.4k|    x = _mm256_mullo_epi32(in[0], cospi62);
 1754|  71.4k|    u[0] = _mm256_add_epi32(x, rnding);
 1755|  71.4k|    u[0] = _mm256_srai_epi32(u[0], bit);
 1756|       |
 1757|  71.4k|    x = _mm256_mullo_epi32(in[0], cospi2);
 1758|  71.4k|    u[1] = _mm256_sub_epi32(zero, x);
 1759|  71.4k|    u[1] = _mm256_add_epi32(u[1], rnding);
 1760|  71.4k|    u[1] = _mm256_srai_epi32(u[1], bit);
 1761|       |
 1762|  71.4k|    x = _mm256_mullo_epi32(in[2], cospi54);
 1763|  71.4k|    u[2] = _mm256_add_epi32(x, rnding);
 1764|  71.4k|    u[2] = _mm256_srai_epi32(u[2], bit);
 1765|       |
 1766|  71.4k|    x = _mm256_mullo_epi32(in[2], cospi10);
 1767|  71.4k|    u[3] = _mm256_sub_epi32(zero, x);
 1768|  71.4k|    u[3] = _mm256_add_epi32(u[3], rnding);
 1769|  71.4k|    u[3] = _mm256_srai_epi32(u[3], bit);
 1770|       |
 1771|  71.4k|    x = _mm256_mullo_epi32(in[4], cospi46);
 1772|  71.4k|    u[4] = _mm256_add_epi32(x, rnding);
 1773|  71.4k|    u[4] = _mm256_srai_epi32(u[4], bit);
 1774|       |
 1775|  71.4k|    x = _mm256_mullo_epi32(in[4], cospi18);
 1776|  71.4k|    u[5] = _mm256_sub_epi32(zero, x);
 1777|  71.4k|    u[5] = _mm256_add_epi32(u[5], rnding);
 1778|  71.4k|    u[5] = _mm256_srai_epi32(u[5], bit);
 1779|       |
 1780|  71.4k|    x = _mm256_mullo_epi32(in[6], cospi38);
 1781|  71.4k|    u[6] = _mm256_add_epi32(x, rnding);
 1782|  71.4k|    u[6] = _mm256_srai_epi32(u[6], bit);
 1783|       |
 1784|  71.4k|    x = _mm256_mullo_epi32(in[6], cospi26);
 1785|  71.4k|    u[7] = _mm256_sub_epi32(zero, x);
 1786|  71.4k|    u[7] = _mm256_add_epi32(u[7], rnding);
 1787|  71.4k|    u[7] = _mm256_srai_epi32(u[7], bit);
 1788|       |
 1789|  71.4k|    u[8] = _mm256_mullo_epi32(in[7], cospi34);
 1790|  71.4k|    u[8] = _mm256_add_epi32(u[8], rnding);
 1791|  71.4k|    u[8] = _mm256_srai_epi32(u[8], bit);
 1792|       |
 1793|  71.4k|    u[9] = _mm256_mullo_epi32(in[7], cospi30);
 1794|  71.4k|    u[9] = _mm256_add_epi32(u[9], rnding);
 1795|  71.4k|    u[9] = _mm256_srai_epi32(u[9], bit);
 1796|       |
 1797|  71.4k|    u[10] = _mm256_mullo_epi32(in[5], cospi42);
 1798|  71.4k|    u[10] = _mm256_add_epi32(u[10], rnding);
 1799|  71.4k|    u[10] = _mm256_srai_epi32(u[10], bit);
 1800|       |
 1801|  71.4k|    u[11] = _mm256_mullo_epi32(in[5], cospi22);
 1802|  71.4k|    u[11] = _mm256_add_epi32(u[11], rnding);
 1803|  71.4k|    u[11] = _mm256_srai_epi32(u[11], bit);
 1804|       |
 1805|  71.4k|    u[12] = _mm256_mullo_epi32(in[3], cospi50);
 1806|  71.4k|    u[12] = _mm256_add_epi32(u[12], rnding);
 1807|  71.4k|    u[12] = _mm256_srai_epi32(u[12], bit);
 1808|       |
 1809|  71.4k|    u[13] = _mm256_mullo_epi32(in[3], cospi14);
 1810|  71.4k|    u[13] = _mm256_add_epi32(u[13], rnding);
 1811|  71.4k|    u[13] = _mm256_srai_epi32(u[13], bit);
 1812|       |
 1813|  71.4k|    u[14] = _mm256_mullo_epi32(in[1], cospi58);
 1814|  71.4k|    u[14] = _mm256_add_epi32(u[14], rnding);
 1815|  71.4k|    u[14] = _mm256_srai_epi32(u[14], bit);
 1816|       |
 1817|  71.4k|    u[15] = _mm256_mullo_epi32(in[1], cospi6);
 1818|  71.4k|    u[15] = _mm256_add_epi32(u[15], rnding);
 1819|  71.4k|    u[15] = _mm256_srai_epi32(u[15], bit);
 1820|       |
 1821|       |    // stage 3
 1822|  71.4k|    addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
 1823|  71.4k|    addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
 1824|  71.4k|    addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
 1825|  71.4k|    addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
 1826|  71.4k|    addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
 1827|  71.4k|    addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
 1828|  71.4k|    addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
 1829|  71.4k|    addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
 1830|       |
 1831|       |    // stage 4
 1832|  71.4k|    y = _mm256_mullo_epi32(u[8], cospi56);
 1833|  71.4k|    x = _mm256_mullo_epi32(u[9], cospi56);
 1834|  71.4k|    u[8] = _mm256_mullo_epi32(u[8], cospi8);
 1835|  71.4k|    u[8] = _mm256_add_epi32(u[8], x);
 1836|  71.4k|    u[8] = _mm256_add_epi32(u[8], rnding);
 1837|  71.4k|    u[8] = _mm256_srai_epi32(u[8], bit);
 1838|       |
 1839|  71.4k|    x = _mm256_mullo_epi32(u[9], cospi8);
 1840|  71.4k|    u[9] = _mm256_sub_epi32(y, x);
 1841|  71.4k|    u[9] = _mm256_add_epi32(u[9], rnding);
 1842|  71.4k|    u[9] = _mm256_srai_epi32(u[9], bit);
 1843|       |
 1844|  71.4k|    x = _mm256_mullo_epi32(u[11], cospi24);
 1845|  71.4k|    y = _mm256_mullo_epi32(u[10], cospi24);
 1846|  71.4k|    u[10] = _mm256_mullo_epi32(u[10], cospi40);
 1847|  71.4k|    u[10] = _mm256_add_epi32(u[10], x);
 1848|  71.4k|    u[10] = _mm256_add_epi32(u[10], rnding);
 1849|  71.4k|    u[10] = _mm256_srai_epi32(u[10], bit);
 1850|       |
 1851|  71.4k|    x = _mm256_mullo_epi32(u[11], cospi40);
 1852|  71.4k|    u[11] = _mm256_sub_epi32(y, x);
 1853|  71.4k|    u[11] = _mm256_add_epi32(u[11], rnding);
 1854|  71.4k|    u[11] = _mm256_srai_epi32(u[11], bit);
 1855|       |
 1856|  71.4k|    x = _mm256_mullo_epi32(u[13], cospi8);
 1857|  71.4k|    y = _mm256_mullo_epi32(u[12], cospi8);
 1858|  71.4k|    u[12] = _mm256_mullo_epi32(u[12], cospim56);
 1859|  71.4k|    u[12] = _mm256_add_epi32(u[12], x);
 1860|  71.4k|    u[12] = _mm256_add_epi32(u[12], rnding);
 1861|  71.4k|    u[12] = _mm256_srai_epi32(u[12], bit);
 1862|       |
 1863|  71.4k|    x = _mm256_mullo_epi32(u[13], cospim56);
 1864|  71.4k|    u[13] = _mm256_sub_epi32(y, x);
 1865|  71.4k|    u[13] = _mm256_add_epi32(u[13], rnding);
 1866|  71.4k|    u[13] = _mm256_srai_epi32(u[13], bit);
 1867|       |
 1868|  71.4k|    x = _mm256_mullo_epi32(u[15], cospi40);
 1869|  71.4k|    y = _mm256_mullo_epi32(u[14], cospi40);
 1870|  71.4k|    u[14] = _mm256_mullo_epi32(u[14], cospim24);
 1871|  71.4k|    u[14] = _mm256_add_epi32(u[14], x);
 1872|  71.4k|    u[14] = _mm256_add_epi32(u[14], rnding);
 1873|  71.4k|    u[14] = _mm256_srai_epi32(u[14], bit);
 1874|       |
 1875|  71.4k|    x = _mm256_mullo_epi32(u[15], cospim24);
 1876|  71.4k|    u[15] = _mm256_sub_epi32(y, x);
 1877|  71.4k|    u[15] = _mm256_add_epi32(u[15], rnding);
 1878|  71.4k|    u[15] = _mm256_srai_epi32(u[15], bit);
 1879|       |
 1880|       |    // stage 5
 1881|  71.4k|    addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
 1882|  71.4k|    addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
 1883|  71.4k|    addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
 1884|  71.4k|    addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
 1885|  71.4k|    addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
 1886|  71.4k|    addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
 1887|  71.4k|    addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
 1888|  71.4k|    addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
 1889|       |
 1890|       |    // stage 6
 1891|  71.4k|    x = _mm256_mullo_epi32(u[5], cospi48);
 1892|  71.4k|    y = _mm256_mullo_epi32(u[4], cospi48);
 1893|  71.4k|    u[4] = _mm256_mullo_epi32(u[4], cospi16);
 1894|  71.4k|    u[4] = _mm256_add_epi32(u[4], x);
 1895|  71.4k|    u[4] = _mm256_add_epi32(u[4], rnding);
 1896|  71.4k|    u[4] = _mm256_srai_epi32(u[4], bit);
 1897|       |
 1898|  71.4k|    x = _mm256_mullo_epi32(u[5], cospi16);
 1899|  71.4k|    u[5] = _mm256_sub_epi32(y, x);
 1900|  71.4k|    u[5] = _mm256_add_epi32(u[5], rnding);
 1901|  71.4k|    u[5] = _mm256_srai_epi32(u[5], bit);
 1902|       |
 1903|  71.4k|    x = _mm256_mullo_epi32(u[7], cospi16);
 1904|  71.4k|    y = _mm256_mullo_epi32(u[6], cospi16);
 1905|  71.4k|    u[6] = _mm256_mullo_epi32(u[6], cospim48);
 1906|  71.4k|    u[6] = _mm256_add_epi32(u[6], x);
 1907|  71.4k|    u[6] = _mm256_add_epi32(u[6], rnding);
 1908|  71.4k|    u[6] = _mm256_srai_epi32(u[6], bit);
 1909|       |
 1910|  71.4k|    x = _mm256_mullo_epi32(u[7], cospim48);
 1911|  71.4k|    u[7] = _mm256_sub_epi32(y, x);
 1912|  71.4k|    u[7] = _mm256_add_epi32(u[7], rnding);
 1913|  71.4k|    u[7] = _mm256_srai_epi32(u[7], bit);
 1914|       |
 1915|  71.4k|    x = _mm256_mullo_epi32(u[13], cospi48);
 1916|  71.4k|    y = _mm256_mullo_epi32(u[12], cospi48);
 1917|  71.4k|    u[12] = _mm256_mullo_epi32(u[12], cospi16);
 1918|  71.4k|    u[12] = _mm256_add_epi32(u[12], x);
 1919|  71.4k|    u[12] = _mm256_add_epi32(u[12], rnding);
 1920|  71.4k|    u[12] = _mm256_srai_epi32(u[12], bit);
 1921|       |
 1922|  71.4k|    x = _mm256_mullo_epi32(u[13], cospi16);
 1923|  71.4k|    u[13] = _mm256_sub_epi32(y, x);
 1924|  71.4k|    u[13] = _mm256_add_epi32(u[13], rnding);
 1925|  71.4k|    u[13] = _mm256_srai_epi32(u[13], bit);
 1926|       |
 1927|  71.4k|    x = _mm256_mullo_epi32(u[15], cospi16);
 1928|  71.4k|    y = _mm256_mullo_epi32(u[14], cospi16);
 1929|  71.4k|    u[14] = _mm256_mullo_epi32(u[14], cospim48);
 1930|  71.4k|    u[14] = _mm256_add_epi32(u[14], x);
 1931|  71.4k|    u[14] = _mm256_add_epi32(u[14], rnding);
 1932|  71.4k|    u[14] = _mm256_srai_epi32(u[14], bit);
 1933|       |
 1934|  71.4k|    x = _mm256_mullo_epi32(u[15], cospim48);
 1935|  71.4k|    u[15] = _mm256_sub_epi32(y, x);
 1936|  71.4k|    u[15] = _mm256_add_epi32(u[15], rnding);
 1937|  71.4k|    u[15] = _mm256_srai_epi32(u[15], bit);
 1938|       |
 1939|       |    // stage 7
 1940|  71.4k|    addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
 1941|  71.4k|    addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
 1942|  71.4k|    addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
 1943|  71.4k|    addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
 1944|  71.4k|    addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
 1945|  71.4k|    addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
 1946|  71.4k|    addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
 1947|  71.4k|    addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
 1948|       |
 1949|       |    // stage 8
 1950|  71.4k|    y = _mm256_mullo_epi32(u[2], cospi32);
 1951|  71.4k|    x = _mm256_mullo_epi32(u[3], cospi32);
 1952|  71.4k|    u[2] = _mm256_add_epi32(y, x);
 1953|  71.4k|    u[2] = _mm256_add_epi32(u[2], rnding);
 1954|  71.4k|    u[2] = _mm256_srai_epi32(u[2], bit);
 1955|       |
 1956|  71.4k|    u[3] = _mm256_sub_epi32(y, x);
 1957|  71.4k|    u[3] = _mm256_add_epi32(u[3], rnding);
 1958|  71.4k|    u[3] = _mm256_srai_epi32(u[3], bit);
 1959|  71.4k|    y = _mm256_mullo_epi32(u[6], cospi32);
 1960|  71.4k|    x = _mm256_mullo_epi32(u[7], cospi32);
 1961|  71.4k|    u[6] = _mm256_add_epi32(y, x);
 1962|  71.4k|    u[6] = _mm256_add_epi32(u[6], rnding);
 1963|  71.4k|    u[6] = _mm256_srai_epi32(u[6], bit);
 1964|       |
 1965|  71.4k|    u[7] = _mm256_sub_epi32(y, x);
 1966|  71.4k|    u[7] = _mm256_add_epi32(u[7], rnding);
 1967|  71.4k|    u[7] = _mm256_srai_epi32(u[7], bit);
 1968|       |
 1969|  71.4k|    y = _mm256_mullo_epi32(u[10], cospi32);
 1970|  71.4k|    x = _mm256_mullo_epi32(u[11], cospi32);
 1971|  71.4k|    u[10] = _mm256_add_epi32(y, x);
 1972|  71.4k|    u[10] = _mm256_add_epi32(u[10], rnding);
 1973|  71.4k|    u[10] = _mm256_srai_epi32(u[10], bit);
 1974|       |
 1975|  71.4k|    u[11] = _mm256_sub_epi32(y, x);
 1976|  71.4k|    u[11] = _mm256_add_epi32(u[11], rnding);
 1977|  71.4k|    u[11] = _mm256_srai_epi32(u[11], bit);
 1978|       |
 1979|  71.4k|    y = _mm256_mullo_epi32(u[14], cospi32);
 1980|  71.4k|    x = _mm256_mullo_epi32(u[15], cospi32);
 1981|  71.4k|    u[14] = _mm256_add_epi32(y, x);
 1982|  71.4k|    u[14] = _mm256_add_epi32(u[14], rnding);
 1983|  71.4k|    u[14] = _mm256_srai_epi32(u[14], bit);
 1984|       |
 1985|  71.4k|    u[15] = _mm256_sub_epi32(y, x);
 1986|  71.4k|    u[15] = _mm256_add_epi32(u[15], rnding);
 1987|  71.4k|    u[15] = _mm256_srai_epi32(u[15], bit);
 1988|       |
 1989|       |    // stage 9
 1990|  71.4k|    if (do_cols) {
  ------------------
  |  Branch (1990:9): [True: 40.5k, False: 30.8k]
  ------------------
 1991|  40.5k|      out[0] = u[0];
 1992|  40.5k|      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]);
 1993|  40.5k|      out[2] = u[12];
 1994|  40.5k|      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]);
 1995|  40.5k|      out[4] = u[6];
 1996|  40.5k|      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]);
 1997|  40.5k|      out[6] = u[10];
 1998|  40.5k|      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]);
 1999|  40.5k|      out[8] = u[3];
 2000|  40.5k|      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]);
 2001|  40.5k|      out[10] = u[15];
 2002|  40.5k|      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]);
 2003|  40.5k|      out[12] = u[5];
 2004|  40.5k|      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]);
 2005|  40.5k|      out[14] = u[9];
 2006|  40.5k|      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]);
 2007|  40.5k|    } else {
 2008|  30.8k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  30.8k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 30.8k]
  |  |  ------------------
  ------------------
 2009|  30.8k|      const __m256i clamp_lo_out =
 2010|  30.8k|          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 2011|  30.8k|      const __m256i clamp_hi_out =
 2012|  30.8k|          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 2013|       |
 2014|  30.8k|      neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 2015|  30.8k|                     out_shift);
 2016|  30.8k|      neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
 2017|  30.8k|                     &clamp_hi_out, out_shift);
 2018|  30.8k|      neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
 2019|  30.8k|                     &clamp_hi_out, out_shift);
 2020|  30.8k|      neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
 2021|  30.8k|                     &clamp_hi_out, out_shift);
 2022|  30.8k|      neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
 2023|  30.8k|                     &clamp_hi_out, out_shift);
 2024|  30.8k|      neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
 2025|  30.8k|                     &clamp_hi_out, out_shift);
 2026|  30.8k|      neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
 2027|  30.8k|                     &clamp_hi_out, out_shift);
 2028|  30.8k|      neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
 2029|  30.8k|                     &clamp_hi_out, out_shift);
 2030|  30.8k|    }
 2031|  71.4k|  }
 2032|  71.4k|}
highbd_inv_txfm_avx2.c:iadst16_avx2:
 2035|  56.8k|                         int bd, int out_shift) {
 2036|  56.8k|  const int32_t *cospi = cospi_arr(bit);
 2037|  56.8k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
 2038|  56.8k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
 2039|  56.8k|  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
 2040|  56.8k|  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
 2041|  56.8k|  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
 2042|  56.8k|  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
 2043|  56.8k|  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
 2044|  56.8k|  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
 2045|  56.8k|  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
 2046|  56.8k|  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
 2047|  56.8k|  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
 2048|  56.8k|  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
 2049|  56.8k|  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
 2050|  56.8k|  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
 2051|  56.8k|  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
 2052|  56.8k|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
 2053|  56.8k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 2054|  56.8k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 2055|  56.8k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
 2056|  56.8k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
 2057|  56.8k|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
 2058|  56.8k|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
 2059|  56.8k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 2060|  56.8k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 2061|  56.8k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
 2062|  56.8k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 2063|  56.8k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 2064|  56.8k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   113k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 56.8k]
  |  |  |  Branch (35:31): [True: 24.9k, False: 31.8k]
  |  |  |  Branch (35:44): [True: 24.9k, False: 31.8k]
  |  |  ------------------
  ------------------
 2065|  56.8k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 2066|  56.8k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 2067|  56.8k|  __m256i u[16], v[16], x, y;
 2068|       |
 2069|  56.8k|  {
 2070|       |    // stage 0
 2071|       |    // stage 1
 2072|       |    // stage 2
 2073|  56.8k|    v[0] = _mm256_mullo_epi32(in[15], cospi2);
 2074|  56.8k|    x = _mm256_mullo_epi32(in[0], cospi62);
 2075|  56.8k|    v[0] = _mm256_add_epi32(v[0], x);
 2076|  56.8k|    v[0] = _mm256_add_epi32(v[0], rnding);
 2077|  56.8k|    v[0] = _mm256_srai_epi32(v[0], bit);
 2078|       |
 2079|  56.8k|    v[1] = _mm256_mullo_epi32(in[15], cospi62);
 2080|  56.8k|    x = _mm256_mullo_epi32(in[0], cospi2);
 2081|  56.8k|    v[1] = _mm256_sub_epi32(v[1], x);
 2082|  56.8k|    v[1] = _mm256_add_epi32(v[1], rnding);
 2083|  56.8k|    v[1] = _mm256_srai_epi32(v[1], bit);
 2084|       |
 2085|  56.8k|    v[2] = _mm256_mullo_epi32(in[13], cospi10);
 2086|  56.8k|    x = _mm256_mullo_epi32(in[2], cospi54);
 2087|  56.8k|    v[2] = _mm256_add_epi32(v[2], x);
 2088|  56.8k|    v[2] = _mm256_add_epi32(v[2], rnding);
 2089|  56.8k|    v[2] = _mm256_srai_epi32(v[2], bit);
 2090|       |
 2091|  56.8k|    v[3] = _mm256_mullo_epi32(in[13], cospi54);
 2092|  56.8k|    x = _mm256_mullo_epi32(in[2], cospi10);
 2093|  56.8k|    v[3] = _mm256_sub_epi32(v[3], x);
 2094|  56.8k|    v[3] = _mm256_add_epi32(v[3], rnding);
 2095|  56.8k|    v[3] = _mm256_srai_epi32(v[3], bit);
 2096|       |
 2097|  56.8k|    v[4] = _mm256_mullo_epi32(in[11], cospi18);
 2098|  56.8k|    x = _mm256_mullo_epi32(in[4], cospi46);
 2099|  56.8k|    v[4] = _mm256_add_epi32(v[4], x);
 2100|  56.8k|    v[4] = _mm256_add_epi32(v[4], rnding);
 2101|  56.8k|    v[4] = _mm256_srai_epi32(v[4], bit);
 2102|       |
 2103|  56.8k|    v[5] = _mm256_mullo_epi32(in[11], cospi46);
 2104|  56.8k|    x = _mm256_mullo_epi32(in[4], cospi18);
 2105|  56.8k|    v[5] = _mm256_sub_epi32(v[5], x);
 2106|  56.8k|    v[5] = _mm256_add_epi32(v[5], rnding);
 2107|  56.8k|    v[5] = _mm256_srai_epi32(v[5], bit);
 2108|       |
 2109|  56.8k|    v[6] = _mm256_mullo_epi32(in[9], cospi26);
 2110|  56.8k|    x = _mm256_mullo_epi32(in[6], cospi38);
 2111|  56.8k|    v[6] = _mm256_add_epi32(v[6], x);
 2112|  56.8k|    v[6] = _mm256_add_epi32(v[6], rnding);
 2113|  56.8k|    v[6] = _mm256_srai_epi32(v[6], bit);
 2114|       |
 2115|  56.8k|    v[7] = _mm256_mullo_epi32(in[9], cospi38);
 2116|  56.8k|    x = _mm256_mullo_epi32(in[6], cospi26);
 2117|  56.8k|    v[7] = _mm256_sub_epi32(v[7], x);
 2118|  56.8k|    v[7] = _mm256_add_epi32(v[7], rnding);
 2119|  56.8k|    v[7] = _mm256_srai_epi32(v[7], bit);
 2120|       |
 2121|  56.8k|    v[8] = _mm256_mullo_epi32(in[7], cospi34);
 2122|  56.8k|    x = _mm256_mullo_epi32(in[8], cospi30);
 2123|  56.8k|    v[8] = _mm256_add_epi32(v[8], x);
 2124|  56.8k|    v[8] = _mm256_add_epi32(v[8], rnding);
 2125|  56.8k|    v[8] = _mm256_srai_epi32(v[8], bit);
 2126|       |
 2127|  56.8k|    v[9] = _mm256_mullo_epi32(in[7], cospi30);
 2128|  56.8k|    x = _mm256_mullo_epi32(in[8], cospi34);
 2129|  56.8k|    v[9] = _mm256_sub_epi32(v[9], x);
 2130|  56.8k|    v[9] = _mm256_add_epi32(v[9], rnding);
 2131|  56.8k|    v[9] = _mm256_srai_epi32(v[9], bit);
 2132|       |
 2133|  56.8k|    v[10] = _mm256_mullo_epi32(in[5], cospi42);
 2134|  56.8k|    x = _mm256_mullo_epi32(in[10], cospi22);
 2135|  56.8k|    v[10] = _mm256_add_epi32(v[10], x);
 2136|  56.8k|    v[10] = _mm256_add_epi32(v[10], rnding);
 2137|  56.8k|    v[10] = _mm256_srai_epi32(v[10], bit);
 2138|       |
 2139|  56.8k|    v[11] = _mm256_mullo_epi32(in[5], cospi22);
 2140|  56.8k|    x = _mm256_mullo_epi32(in[10], cospi42);
 2141|  56.8k|    v[11] = _mm256_sub_epi32(v[11], x);
 2142|  56.8k|    v[11] = _mm256_add_epi32(v[11], rnding);
 2143|  56.8k|    v[11] = _mm256_srai_epi32(v[11], bit);
 2144|       |
 2145|  56.8k|    v[12] = _mm256_mullo_epi32(in[3], cospi50);
 2146|  56.8k|    x = _mm256_mullo_epi32(in[12], cospi14);
 2147|  56.8k|    v[12] = _mm256_add_epi32(v[12], x);
 2148|  56.8k|    v[12] = _mm256_add_epi32(v[12], rnding);
 2149|  56.8k|    v[12] = _mm256_srai_epi32(v[12], bit);
 2150|       |
 2151|  56.8k|    v[13] = _mm256_mullo_epi32(in[3], cospi14);
 2152|  56.8k|    x = _mm256_mullo_epi32(in[12], cospi50);
 2153|  56.8k|    v[13] = _mm256_sub_epi32(v[13], x);
 2154|  56.8k|    v[13] = _mm256_add_epi32(v[13], rnding);
 2155|  56.8k|    v[13] = _mm256_srai_epi32(v[13], bit);
 2156|       |
 2157|  56.8k|    v[14] = _mm256_mullo_epi32(in[1], cospi58);
 2158|  56.8k|    x = _mm256_mullo_epi32(in[14], cospi6);
 2159|  56.8k|    v[14] = _mm256_add_epi32(v[14], x);
 2160|  56.8k|    v[14] = _mm256_add_epi32(v[14], rnding);
 2161|  56.8k|    v[14] = _mm256_srai_epi32(v[14], bit);
 2162|       |
 2163|  56.8k|    v[15] = _mm256_mullo_epi32(in[1], cospi6);
 2164|  56.8k|    x = _mm256_mullo_epi32(in[14], cospi58);
 2165|  56.8k|    v[15] = _mm256_sub_epi32(v[15], x);
 2166|  56.8k|    v[15] = _mm256_add_epi32(v[15], rnding);
 2167|  56.8k|    v[15] = _mm256_srai_epi32(v[15], bit);
 2168|       |
 2169|       |    // stage 3
 2170|  56.8k|    addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
 2171|  56.8k|    addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
 2172|  56.8k|    addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
 2173|  56.8k|    addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
 2174|  56.8k|    addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
 2175|  56.8k|    addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
 2176|  56.8k|    addsub_avx2(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
 2177|  56.8k|    addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
 2178|       |
 2179|       |    // stage 4
 2180|  56.8k|    v[0] = u[0];
 2181|  56.8k|    v[1] = u[1];
 2182|  56.8k|    v[2] = u[2];
 2183|  56.8k|    v[3] = u[3];
 2184|  56.8k|    v[4] = u[4];
 2185|  56.8k|    v[5] = u[5];
 2186|  56.8k|    v[6] = u[6];
 2187|  56.8k|    v[7] = u[7];
 2188|       |
 2189|  56.8k|    v[8] = _mm256_mullo_epi32(u[8], cospi8);
 2190|  56.8k|    x = _mm256_mullo_epi32(u[9], cospi56);
 2191|  56.8k|    v[8] = _mm256_add_epi32(v[8], x);
 2192|  56.8k|    v[8] = _mm256_add_epi32(v[8], rnding);
 2193|  56.8k|    v[8] = _mm256_srai_epi32(v[8], bit);
 2194|       |
 2195|  56.8k|    v[9] = _mm256_mullo_epi32(u[8], cospi56);
 2196|  56.8k|    x = _mm256_mullo_epi32(u[9], cospi8);
 2197|  56.8k|    v[9] = _mm256_sub_epi32(v[9], x);
 2198|  56.8k|    v[9] = _mm256_add_epi32(v[9], rnding);
 2199|  56.8k|    v[9] = _mm256_srai_epi32(v[9], bit);
 2200|       |
 2201|  56.8k|    v[10] = _mm256_mullo_epi32(u[10], cospi40);
 2202|  56.8k|    x = _mm256_mullo_epi32(u[11], cospi24);
 2203|  56.8k|    v[10] = _mm256_add_epi32(v[10], x);
 2204|  56.8k|    v[10] = _mm256_add_epi32(v[10], rnding);
 2205|  56.8k|    v[10] = _mm256_srai_epi32(v[10], bit);
 2206|       |
 2207|  56.8k|    v[11] = _mm256_mullo_epi32(u[10], cospi24);
 2208|  56.8k|    x = _mm256_mullo_epi32(u[11], cospi40);
 2209|  56.8k|    v[11] = _mm256_sub_epi32(v[11], x);
 2210|  56.8k|    v[11] = _mm256_add_epi32(v[11], rnding);
 2211|  56.8k|    v[11] = _mm256_srai_epi32(v[11], bit);
 2212|       |
 2213|  56.8k|    v[12] = _mm256_mullo_epi32(u[12], cospim56);
 2214|  56.8k|    x = _mm256_mullo_epi32(u[13], cospi8);
 2215|  56.8k|    v[12] = _mm256_add_epi32(v[12], x);
 2216|  56.8k|    v[12] = _mm256_add_epi32(v[12], rnding);
 2217|  56.8k|    v[12] = _mm256_srai_epi32(v[12], bit);
 2218|       |
 2219|  56.8k|    v[13] = _mm256_mullo_epi32(u[12], cospi8);
 2220|  56.8k|    x = _mm256_mullo_epi32(u[13], cospim56);
 2221|  56.8k|    v[13] = _mm256_sub_epi32(v[13], x);
 2222|  56.8k|    v[13] = _mm256_add_epi32(v[13], rnding);
 2223|  56.8k|    v[13] = _mm256_srai_epi32(v[13], bit);
 2224|       |
 2225|  56.8k|    v[14] = _mm256_mullo_epi32(u[14], cospim24);
 2226|  56.8k|    x = _mm256_mullo_epi32(u[15], cospi40);
 2227|  56.8k|    v[14] = _mm256_add_epi32(v[14], x);
 2228|  56.8k|    v[14] = _mm256_add_epi32(v[14], rnding);
 2229|  56.8k|    v[14] = _mm256_srai_epi32(v[14], bit);
 2230|       |
 2231|  56.8k|    v[15] = _mm256_mullo_epi32(u[14], cospi40);
 2232|  56.8k|    x = _mm256_mullo_epi32(u[15], cospim24);
 2233|  56.8k|    v[15] = _mm256_sub_epi32(v[15], x);
 2234|  56.8k|    v[15] = _mm256_add_epi32(v[15], rnding);
 2235|  56.8k|    v[15] = _mm256_srai_epi32(v[15], bit);
 2236|       |
 2237|       |    // stage 5
 2238|  56.8k|    addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
 2239|  56.8k|    addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
 2240|  56.8k|    addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
 2241|  56.8k|    addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
 2242|  56.8k|    addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
 2243|  56.8k|    addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
 2244|  56.8k|    addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
 2245|  56.8k|    addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
 2246|       |
 2247|       |    // stage 6
 2248|  56.8k|    v[0] = u[0];
 2249|  56.8k|    v[1] = u[1];
 2250|  56.8k|    v[2] = u[2];
 2251|  56.8k|    v[3] = u[3];
 2252|       |
 2253|  56.8k|    v[4] = _mm256_mullo_epi32(u[4], cospi16);
 2254|  56.8k|    x = _mm256_mullo_epi32(u[5], cospi48);
 2255|  56.8k|    v[4] = _mm256_add_epi32(v[4], x);
 2256|  56.8k|    v[4] = _mm256_add_epi32(v[4], rnding);
 2257|  56.8k|    v[4] = _mm256_srai_epi32(v[4], bit);
 2258|       |
 2259|  56.8k|    v[5] = _mm256_mullo_epi32(u[4], cospi48);
 2260|  56.8k|    x = _mm256_mullo_epi32(u[5], cospi16);
 2261|  56.8k|    v[5] = _mm256_sub_epi32(v[5], x);
 2262|  56.8k|    v[5] = _mm256_add_epi32(v[5], rnding);
 2263|  56.8k|    v[5] = _mm256_srai_epi32(v[5], bit);
 2264|       |
 2265|  56.8k|    v[6] = _mm256_mullo_epi32(u[6], cospim48);
 2266|  56.8k|    x = _mm256_mullo_epi32(u[7], cospi16);
 2267|  56.8k|    v[6] = _mm256_add_epi32(v[6], x);
 2268|  56.8k|    v[6] = _mm256_add_epi32(v[6], rnding);
 2269|  56.8k|    v[6] = _mm256_srai_epi32(v[6], bit);
 2270|       |
 2271|  56.8k|    v[7] = _mm256_mullo_epi32(u[6], cospi16);
 2272|  56.8k|    x = _mm256_mullo_epi32(u[7], cospim48);
 2273|  56.8k|    v[7] = _mm256_sub_epi32(v[7], x);
 2274|  56.8k|    v[7] = _mm256_add_epi32(v[7], rnding);
 2275|  56.8k|    v[7] = _mm256_srai_epi32(v[7], bit);
 2276|       |
 2277|  56.8k|    v[8] = u[8];
 2278|  56.8k|    v[9] = u[9];
 2279|  56.8k|    v[10] = u[10];
 2280|  56.8k|    v[11] = u[11];
 2281|       |
 2282|  56.8k|    v[12] = _mm256_mullo_epi32(u[12], cospi16);
 2283|  56.8k|    x = _mm256_mullo_epi32(u[13], cospi48);
 2284|  56.8k|    v[12] = _mm256_add_epi32(v[12], x);
 2285|  56.8k|    v[12] = _mm256_add_epi32(v[12], rnding);
 2286|  56.8k|    v[12] = _mm256_srai_epi32(v[12], bit);
 2287|       |
 2288|  56.8k|    v[13] = _mm256_mullo_epi32(u[12], cospi48);
 2289|  56.8k|    x = _mm256_mullo_epi32(u[13], cospi16);
 2290|  56.8k|    v[13] = _mm256_sub_epi32(v[13], x);
 2291|  56.8k|    v[13] = _mm256_add_epi32(v[13], rnding);
 2292|  56.8k|    v[13] = _mm256_srai_epi32(v[13], bit);
 2293|       |
 2294|  56.8k|    v[14] = _mm256_mullo_epi32(u[14], cospim48);
 2295|  56.8k|    x = _mm256_mullo_epi32(u[15], cospi16);
 2296|  56.8k|    v[14] = _mm256_add_epi32(v[14], x);
 2297|  56.8k|    v[14] = _mm256_add_epi32(v[14], rnding);
 2298|  56.8k|    v[14] = _mm256_srai_epi32(v[14], bit);
 2299|       |
 2300|  56.8k|    v[15] = _mm256_mullo_epi32(u[14], cospi16);
 2301|  56.8k|    x = _mm256_mullo_epi32(u[15], cospim48);
 2302|  56.8k|    v[15] = _mm256_sub_epi32(v[15], x);
 2303|  56.8k|    v[15] = _mm256_add_epi32(v[15], rnding);
 2304|  56.8k|    v[15] = _mm256_srai_epi32(v[15], bit);
 2305|       |
 2306|       |    // stage 7
 2307|  56.8k|    addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
 2308|  56.8k|    addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
 2309|  56.8k|    addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
 2310|  56.8k|    addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
 2311|  56.8k|    addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
 2312|  56.8k|    addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
 2313|  56.8k|    addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
 2314|  56.8k|    addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
 2315|       |
 2316|       |    // stage 8
 2317|  56.8k|    v[0] = u[0];
 2318|  56.8k|    v[1] = u[1];
 2319|       |
 2320|  56.8k|    y = _mm256_mullo_epi32(u[2], cospi32);
 2321|  56.8k|    x = _mm256_mullo_epi32(u[3], cospi32);
 2322|  56.8k|    v[2] = _mm256_add_epi32(y, x);
 2323|  56.8k|    v[2] = _mm256_add_epi32(v[2], rnding);
 2324|  56.8k|    v[2] = _mm256_srai_epi32(v[2], bit);
 2325|       |
 2326|  56.8k|    v[3] = _mm256_sub_epi32(y, x);
 2327|  56.8k|    v[3] = _mm256_add_epi32(v[3], rnding);
 2328|  56.8k|    v[3] = _mm256_srai_epi32(v[3], bit);
 2329|       |
 2330|  56.8k|    v[4] = u[4];
 2331|  56.8k|    v[5] = u[5];
 2332|       |
 2333|  56.8k|    y = _mm256_mullo_epi32(u[6], cospi32);
 2334|  56.8k|    x = _mm256_mullo_epi32(u[7], cospi32);
 2335|  56.8k|    v[6] = _mm256_add_epi32(y, x);
 2336|  56.8k|    v[6] = _mm256_add_epi32(v[6], rnding);
 2337|  56.8k|    v[6] = _mm256_srai_epi32(v[6], bit);
 2338|       |
 2339|  56.8k|    v[7] = _mm256_sub_epi32(y, x);
 2340|  56.8k|    v[7] = _mm256_add_epi32(v[7], rnding);
 2341|  56.8k|    v[7] = _mm256_srai_epi32(v[7], bit);
 2342|       |
 2343|  56.8k|    v[8] = u[8];
 2344|  56.8k|    v[9] = u[9];
 2345|       |
 2346|  56.8k|    y = _mm256_mullo_epi32(u[10], cospi32);
 2347|  56.8k|    x = _mm256_mullo_epi32(u[11], cospi32);
 2348|  56.8k|    v[10] = _mm256_add_epi32(y, x);
 2349|  56.8k|    v[10] = _mm256_add_epi32(v[10], rnding);
 2350|  56.8k|    v[10] = _mm256_srai_epi32(v[10], bit);
 2351|       |
 2352|  56.8k|    v[11] = _mm256_sub_epi32(y, x);
 2353|  56.8k|    v[11] = _mm256_add_epi32(v[11], rnding);
 2354|  56.8k|    v[11] = _mm256_srai_epi32(v[11], bit);
 2355|       |
 2356|  56.8k|    v[12] = u[12];
 2357|  56.8k|    v[13] = u[13];
 2358|       |
 2359|  56.8k|    y = _mm256_mullo_epi32(u[14], cospi32);
 2360|  56.8k|    x = _mm256_mullo_epi32(u[15], cospi32);
 2361|  56.8k|    v[14] = _mm256_add_epi32(y, x);
 2362|  56.8k|    v[14] = _mm256_add_epi32(v[14], rnding);
 2363|  56.8k|    v[14] = _mm256_srai_epi32(v[14], bit);
 2364|       |
 2365|  56.8k|    v[15] = _mm256_sub_epi32(y, x);
 2366|  56.8k|    v[15] = _mm256_add_epi32(v[15], rnding);
 2367|  56.8k|    v[15] = _mm256_srai_epi32(v[15], bit);
 2368|       |
 2369|       |    // stage 9
 2370|  56.8k|    if (do_cols) {
  ------------------
  |  Branch (2370:9): [True: 24.9k, False: 31.8k]
  ------------------
 2371|  24.9k|      out[0] = v[0];
 2372|  24.9k|      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
 2373|  24.9k|      out[2] = v[12];
 2374|  24.9k|      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
 2375|  24.9k|      out[4] = v[6];
 2376|  24.9k|      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
 2377|  24.9k|      out[6] = v[10];
 2378|  24.9k|      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
 2379|  24.9k|      out[8] = v[3];
 2380|  24.9k|      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
 2381|  24.9k|      out[10] = v[15];
 2382|  24.9k|      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
 2383|  24.9k|      out[12] = v[5];
 2384|  24.9k|      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
 2385|  24.9k|      out[14] = v[9];
 2386|  24.9k|      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
 2387|  31.8k|    } else {
 2388|  31.8k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  31.8k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 31.8k]
  |  |  ------------------
  ------------------
 2389|  31.8k|      const __m256i clamp_lo_out =
 2390|  31.8k|          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 2391|  31.8k|      const __m256i clamp_hi_out =
 2392|  31.8k|          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 2393|       |
 2394|  31.8k|      neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 2395|  31.8k|                     out_shift);
 2396|  31.8k|      neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
 2397|  31.8k|                     &clamp_hi_out, out_shift);
 2398|  31.8k|      neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
 2399|  31.8k|                     &clamp_hi_out, out_shift);
 2400|  31.8k|      neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
 2401|  31.8k|                     &clamp_hi_out, out_shift);
 2402|  31.8k|      neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
 2403|  31.8k|                     &clamp_hi_out, out_shift);
 2404|  31.8k|      neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
 2405|  31.8k|                     &clamp_hi_out, out_shift);
 2406|  31.8k|      neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
 2407|  31.8k|                     &clamp_hi_out, out_shift);
 2408|  31.8k|      neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
 2409|  31.8k|                     &clamp_hi_out, out_shift);
 2410|  31.8k|    }
 2411|  56.8k|  }
 2412|  56.8k|}
highbd_inv_txfm_avx2.c:idct32_low1_avx2:
  443|  73.3k|                             int bd, int out_shift) {
  444|  73.3k|  const int32_t *cospi = cospi_arr(bit);
  445|  73.3k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  446|  73.3k|  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
  447|  73.3k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   146k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 73.3k]
  |  |  |  Branch (35:31): [True: 56.9k, False: 16.3k]
  |  |  |  Branch (35:44): [True: 56.9k, False: 16.3k]
  |  |  ------------------
  ------------------
  448|  73.3k|  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  449|  73.3k|  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  450|  73.3k|  __m256i x;
  451|       |  // stage 0
  452|       |  // stage 1
  453|       |  // stage 2
  454|       |  // stage 3
  455|       |  // stage 4
  456|       |  // stage 5
  457|  73.3k|  x = _mm256_mullo_epi32(in[0], cospi32);
  458|  73.3k|  x = _mm256_add_epi32(x, rounding);
  459|  73.3k|  x = _mm256_srai_epi32(x, bit);
  460|       |
  461|       |  // stage 6
  462|       |  // stage 7
  463|       |  // stage 8
  464|       |  // stage 9
  465|  73.3k|  if (!do_cols) {
  ------------------
  |  Branch (465:7): [True: 16.3k, False: 56.9k]
  ------------------
  466|  16.3k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  16.3k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 16.3k]
  |  |  ------------------
  ------------------
  467|  16.3k|    __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
  468|  16.3k|    clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
  469|  16.3k|    clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
  470|  16.3k|    x = _mm256_add_epi32(offset, x);
  471|  16.3k|    x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
  472|  16.3k|  }
  473|  73.3k|  x = _mm256_max_epi32(x, clamp_lo);
  474|  73.3k|  x = _mm256_min_epi32(x, clamp_hi);
  475|  73.3k|  out[0] = x;
  476|  73.3k|  out[1] = x;
  477|  73.3k|  out[2] = x;
  478|  73.3k|  out[3] = x;
  479|  73.3k|  out[4] = x;
  480|  73.3k|  out[5] = x;
  481|  73.3k|  out[6] = x;
  482|  73.3k|  out[7] = x;
  483|  73.3k|  out[8] = x;
  484|  73.3k|  out[9] = x;
  485|  73.3k|  out[10] = x;
  486|  73.3k|  out[11] = x;
  487|  73.3k|  out[12] = x;
  488|  73.3k|  out[13] = x;
  489|  73.3k|  out[14] = x;
  490|  73.3k|  out[15] = x;
  491|  73.3k|  out[16] = x;
  492|  73.3k|  out[17] = x;
  493|  73.3k|  out[18] = x;
  494|  73.3k|  out[19] = x;
  495|  73.3k|  out[20] = x;
  496|  73.3k|  out[21] = x;
  497|  73.3k|  out[22] = x;
  498|  73.3k|  out[23] = x;
  499|  73.3k|  out[24] = x;
  500|  73.3k|  out[25] = x;
  501|  73.3k|  out[26] = x;
  502|  73.3k|  out[27] = x;
  503|  73.3k|  out[28] = x;
  504|  73.3k|  out[29] = x;
  505|  73.3k|  out[30] = x;
  506|  73.3k|  out[31] = x;
  507|  73.3k|}
highbd_inv_txfm_avx2.c:idct32_low8_avx2:
  510|   112k|                             int bd, int out_shift) {
  511|   112k|  const int32_t *cospi = cospi_arr(bit);
  512|   112k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  513|   112k|  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  514|   112k|  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  515|   112k|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  516|   112k|  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  517|   112k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  518|   112k|  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  519|   112k|  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
  520|   112k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  521|   112k|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  522|   112k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  523|   112k|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  524|   112k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  525|   112k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  526|   112k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  527|   112k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  528|   112k|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  529|   112k|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  530|   112k|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  531|   112k|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  532|   112k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  533|   112k|  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  534|   112k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  535|   112k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  536|   112k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  537|   112k|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  538|   112k|  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
  539|   112k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   224k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 112k]
  |  |  |  Branch (35:31): [True: 81.2k, False: 30.7k]
  |  |  |  Branch (35:44): [True: 81.2k, False: 30.7k]
  |  |  ------------------
  ------------------
  540|   112k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  541|   112k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  542|   112k|  __m256i bf1[32];
  543|       |
  544|   112k|  {
  545|       |    // stage 0
  546|       |    // stage 1
  547|   112k|    bf1[0] = in[0];
  548|   112k|    bf1[4] = in[4];
  549|   112k|    bf1[8] = in[2];
  550|   112k|    bf1[12] = in[6];
  551|   112k|    bf1[16] = in[1];
  552|   112k|    bf1[20] = in[5];
  553|   112k|    bf1[24] = in[3];
  554|   112k|    bf1[28] = in[7];
  555|       |
  556|       |    // stage 2
  557|   112k|    bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
  558|   112k|    bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
  559|   112k|    bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
  560|   112k|    bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
  561|   112k|    bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
  562|   112k|    bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
  563|   112k|    bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
  564|   112k|    bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
  565|       |
  566|       |    // stage 3
  567|   112k|    bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
  568|   112k|    bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
  569|       |
  570|   112k|    bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
  571|   112k|    bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
  572|   112k|    bf1[17] = bf1[16];
  573|   112k|    bf1[18] = bf1[19];
  574|   112k|    bf1[21] = bf1[20];
  575|   112k|    bf1[22] = bf1[23];
  576|   112k|    bf1[25] = bf1[24];
  577|   112k|    bf1[26] = bf1[27];
  578|   112k|    bf1[29] = bf1[28];
  579|   112k|    bf1[30] = bf1[31];
  580|       |
  581|       |    // stage 4
  582|   112k|    bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
  583|   112k|    bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
  584|       |
  585|   112k|    bf1[9] = bf1[8];
  586|   112k|    bf1[10] = bf1[11];
  587|   112k|    bf1[13] = bf1[12];
  588|   112k|    bf1[14] = bf1[15];
  589|       |
  590|   112k|    idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
  591|   112k|                       &cospi24, &cospi40, &cospim24, &rounding, bit);
  592|       |
  593|       |    // stage 5
  594|   112k|    bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
  595|   112k|    bf1[1] = bf1[0];
  596|   112k|    bf1[5] = bf1[4];
  597|   112k|    bf1[6] = bf1[7];
  598|       |
  599|   112k|    idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
  600|   112k|                       &clamp_hi, &rounding, bit);
  601|       |
  602|       |    // stage 6
  603|   112k|    bf1[3] = bf1[0];
  604|   112k|    bf1[2] = bf1[1];
  605|       |
  606|   112k|    idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
  607|   112k|                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
  608|       |
  609|       |    // stage 7
  610|   112k|    idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
  611|   112k|                       &rounding, bit);
  612|       |
  613|       |    // stage 8
  614|   112k|    idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
  615|   112k|                       &rounding, bit);
  616|       |
  617|       |    // stage 9
  618|   112k|    idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
  619|   112k|  }
  620|   112k|}
highbd_inv_txfm_avx2.c:idct32_stage4_avx2:
  282|   161k|    const __m256i *rounding, int bit) {
  283|   161k|  __m256i temp1, temp2;
  284|   161k|  temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
  285|   161k|  bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
  286|   161k|  bf1[17] = temp1;
  287|       |
  288|   161k|  temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
  289|   161k|  bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
  290|   161k|  bf1[18] = temp2;
  291|       |
  292|   161k|  temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
  293|   161k|  bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
  294|   161k|  bf1[21] = temp1;
  295|       |
  296|   161k|  temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
  297|   161k|  bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
  298|   161k|  bf1[22] = temp2;
  299|   161k|}
highbd_inv_txfm_avx2.c:idct32_stage5_avx2:
  304|   162k|    const __m256i *clamp_hi, const __m256i *rounding, int bit) {
  305|   162k|  __m256i temp1, temp2;
  306|   162k|  temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
  307|   162k|  bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
  308|   162k|  bf1[9] = temp1;
  309|       |
  310|   162k|  temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
  311|   162k|  bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
  312|   162k|  bf1[10] = temp2;
  313|       |
  314|   162k|  addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
  315|   162k|  addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
  316|   162k|  addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
  317|   162k|  addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
  318|   162k|  addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
  319|   162k|  addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
  320|   162k|  addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
  321|   162k|  addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
  322|   162k|}
highbd_inv_txfm_avx2.c:idct32_stage6_avx2:
  328|   162k|    const __m256i *rounding, int bit) {
  329|   162k|  __m256i temp1, temp2;
  330|   162k|  temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  331|   162k|  bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  332|   162k|  bf1[5] = temp1;
  333|       |
  334|   162k|  addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
  335|   162k|  addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
  336|   162k|  addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
  337|   162k|  addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
  338|       |
  339|   162k|  temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
  340|   162k|  bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
  341|   162k|  bf1[18] = temp1;
  342|   162k|  temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
  343|   162k|  bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
  344|   162k|  bf1[19] = temp2;
  345|   162k|  temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
  346|   162k|  bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
  347|   162k|  bf1[20] = temp1;
  348|   162k|  temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
  349|   162k|  bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
  350|   162k|  bf1[21] = temp2;
  351|   162k|}
highbd_inv_txfm_avx2.c:idct32_stage7_avx2:
  357|   162k|                                      const __m256i *rounding, int bit) {
  358|   162k|  __m256i temp1, temp2;
  359|   162k|  addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
  360|   162k|  addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
  361|   162k|  addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
  362|   162k|  addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
  363|       |
  364|   162k|  temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  365|   162k|  bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  366|   162k|  bf1[10] = temp1;
  367|   162k|  temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  368|   162k|  bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  369|   162k|  bf1[11] = temp2;
  370|       |
  371|   162k|  addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
  372|   162k|  addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
  373|   162k|  addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
  374|   162k|  addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
  375|   162k|  addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
  376|   162k|  addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
  377|   162k|  addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
  378|   162k|  addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
  379|   162k|}
highbd_inv_txfm_avx2.c:idct32_stage8_avx2:
  385|   162k|                                      const __m256i *rounding, int bit) {
  386|   162k|  __m256i temp1, temp2;
  387|   162k|  addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
  388|   162k|  addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
  389|   162k|  addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
  390|   162k|  addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
  391|   162k|  addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
  392|   162k|  addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
  393|   162k|  addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
  394|   162k|  addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
  395|       |
  396|   162k|  temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  397|   162k|  bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  398|   162k|  bf1[20] = temp1;
  399|   162k|  temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  400|   162k|  bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  401|   162k|  bf1[21] = temp2;
  402|   162k|  temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  403|   162k|  bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  404|   162k|  bf1[22] = temp1;
  405|   162k|  temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  406|   162k|  bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  407|   162k|  bf1[23] = temp2;
  408|   162k|}
highbd_inv_txfm_avx2.c:idct32_stage9_avx2:
  414|   162k|                                      const __m256i *clamp_hi) {
  415|   162k|  addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
  416|   162k|  addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
  417|   162k|  addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
  418|   162k|  addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
  419|   162k|  addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
  420|   162k|  addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
  421|   162k|  addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
  422|   162k|  addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
  423|   162k|  addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
  424|   162k|  addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
  425|   162k|  addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
  426|   162k|  addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
  427|   162k|  addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
  428|   162k|  addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
  429|   162k|  addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
  430|   162k|  addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
  431|   162k|  if (!do_cols) {
  ------------------
  |  Branch (431:7): [True: 49.6k, False: 112k]
  ------------------
  432|  49.6k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  49.6k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 49.6k]
  |  |  ------------------
  ------------------
  433|  49.6k|    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
  434|  49.6k|    const __m256i clamp_hi_out =
  435|  49.6k|        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
  436|  49.6k|    round_shift_8x8_avx2(out, out_shift);
  437|  49.6k|    round_shift_8x8_avx2(out + 16, out_shift);
  438|  49.6k|    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
  439|  49.6k|  }
  440|   162k|}
highbd_inv_txfm_avx2.c:idct32_low16_avx2:
  623|  49.9k|                              int bd, int out_shift) {
  624|  49.9k|  const int32_t *cospi = cospi_arr(bit);
  625|  49.9k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  626|  49.9k|  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
  627|  49.9k|  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
  628|  49.9k|  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  629|  49.9k|  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  630|  49.9k|  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
  631|  49.9k|  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
  632|  49.9k|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  633|  49.9k|  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
  634|  49.9k|  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  635|  49.9k|  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
  636|  49.9k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  637|  49.9k|  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  638|  49.9k|  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
  639|  49.9k|  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
  640|  49.9k|  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
  641|  49.9k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  642|  49.9k|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  643|  49.9k|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  644|  49.9k|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  645|  49.9k|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  646|  49.9k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  647|  49.9k|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  648|  49.9k|  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
  649|  49.9k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  650|  49.9k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  651|  49.9k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  652|  49.9k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  653|  49.9k|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  654|  49.9k|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  655|  49.9k|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  656|  49.9k|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  657|  49.9k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  658|  49.9k|  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  659|  49.9k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  660|  49.9k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  661|  49.9k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  662|  49.9k|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  663|  49.9k|  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
  664|  49.9k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  99.9k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 49.9k]
  |  |  |  Branch (35:31): [True: 31.0k, False: 18.9k]
  |  |  |  Branch (35:44): [True: 31.0k, False: 18.9k]
  |  |  ------------------
  ------------------
  665|  49.9k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  666|  49.9k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  667|  49.9k|  __m256i bf1[32];
  668|       |
  669|  49.9k|  {
  670|       |    // stage 0
  671|       |    // stage 1
  672|  49.9k|    bf1[0] = in[0];
  673|  49.9k|    bf1[2] = in[8];
  674|  49.9k|    bf1[4] = in[4];
  675|  49.9k|    bf1[6] = in[12];
  676|  49.9k|    bf1[8] = in[2];
  677|  49.9k|    bf1[10] = in[10];
  678|  49.9k|    bf1[12] = in[6];
  679|  49.9k|    bf1[14] = in[14];
  680|  49.9k|    bf1[16] = in[1];
  681|  49.9k|    bf1[18] = in[9];
  682|  49.9k|    bf1[20] = in[5];
  683|  49.9k|    bf1[22] = in[13];
  684|  49.9k|    bf1[24] = in[3];
  685|  49.9k|    bf1[26] = in[11];
  686|  49.9k|    bf1[28] = in[7];
  687|  49.9k|    bf1[30] = in[15];
  688|       |
  689|       |    // stage 2
  690|  49.9k|    bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
  691|  49.9k|    bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
  692|  49.9k|    bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
  693|  49.9k|    bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
  694|  49.9k|    bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
  695|  49.9k|    bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
  696|  49.9k|    bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
  697|  49.9k|    bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
  698|  49.9k|    bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
  699|  49.9k|    bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
  700|  49.9k|    bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
  701|  49.9k|    bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
  702|  49.9k|    bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
  703|  49.9k|    bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
  704|  49.9k|    bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
  705|  49.9k|    bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
  706|       |
  707|       |    // stage 3
  708|  49.9k|    bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
  709|  49.9k|    bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
  710|  49.9k|    bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
  711|  49.9k|    bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
  712|  49.9k|    bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
  713|  49.9k|    bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
  714|  49.9k|    bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
  715|  49.9k|    bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
  716|       |
  717|  49.9k|    addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
  718|  49.9k|    addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
  719|  49.9k|    addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
  720|  49.9k|    addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
  721|  49.9k|    addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
  722|  49.9k|    addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
  723|  49.9k|    addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
  724|  49.9k|    addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
  725|       |
  726|       |    // stage 4
  727|  49.9k|    bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
  728|  49.9k|    bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
  729|  49.9k|    bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
  730|  49.9k|    bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
  731|       |
  732|  49.9k|    addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
  733|  49.9k|    addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
  734|  49.9k|    addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
  735|  49.9k|    addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
  736|       |
  737|  49.9k|    idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
  738|  49.9k|                       &cospi24, &cospi40, &cospim24, &rounding, bit);
  739|       |
  740|       |    // stage 5
  741|  49.9k|    bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
  742|  49.9k|    bf1[1] = bf1[0];
  743|  49.9k|    bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
  744|  49.9k|    bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
  745|       |
  746|  49.9k|    addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
  747|  49.9k|    addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
  748|       |
  749|  49.9k|    idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
  750|  49.9k|                       &clamp_hi, &rounding, bit);
  751|       |
  752|       |    // stage 6
  753|  49.9k|    addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
  754|  49.9k|    addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
  755|       |
  756|  49.9k|    idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
  757|  49.9k|                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
  758|       |
  759|       |    // stage 7
  760|  49.9k|    idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
  761|  49.9k|                       &rounding, bit);
  762|       |
  763|       |    // stage 8
  764|  49.9k|    idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
  765|  49.9k|                       &rounding, bit);
  766|       |
  767|       |    // stage 9
  768|  49.9k|    idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
  769|  49.9k|  }
  770|  49.9k|}
highbd_inv_txfm_avx2.c:idct32_avx2:
  773|  46.7k|                        int out_shift) {
  774|  46.7k|  const int32_t *cospi = cospi_arr(bit);
  775|  46.7k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  776|  46.7k|  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
  777|  46.7k|  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
  778|  46.7k|  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  779|  46.7k|  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  780|  46.7k|  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
  781|  46.7k|  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
  782|  46.7k|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  783|  46.7k|  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
  784|  46.7k|  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
  785|  46.7k|  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
  786|  46.7k|  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  787|  46.7k|  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
  788|  46.7k|  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
  789|  46.7k|  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
  790|  46.7k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  791|  46.7k|  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  792|  46.7k|  const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
  793|  46.7k|  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
  794|  46.7k|  const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
  795|  46.7k|  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
  796|  46.7k|  const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
  797|  46.7k|  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
  798|  46.7k|  const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
  799|  46.7k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  800|  46.7k|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  801|  46.7k|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  802|  46.7k|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  803|  46.7k|  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
  804|  46.7k|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  805|  46.7k|  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
  806|  46.7k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  807|  46.7k|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  808|  46.7k|  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
  809|  46.7k|  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
  810|  46.7k|  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
  811|  46.7k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  812|  46.7k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  813|  46.7k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  814|  46.7k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  815|  46.7k|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  816|  46.7k|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  817|  46.7k|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  818|  46.7k|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  819|  46.7k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  820|  46.7k|  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  821|  46.7k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  822|  46.7k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  823|  46.7k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  824|  46.7k|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  825|  46.7k|  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
  826|  46.7k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  93.4k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 46.7k]
  |  |  |  Branch (35:31): [True: 23.4k, False: 23.2k]
  |  |  |  Branch (35:44): [True: 23.4k, False: 23.2k]
  |  |  ------------------
  ------------------
  827|  46.7k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  828|  46.7k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  829|  46.7k|  __m256i bf1[32], bf0[32];
  830|       |
  831|  46.7k|  {
  832|       |    // stage 0
  833|       |    // stage 1
  834|  46.7k|    bf1[0] = in[0];
  835|  46.7k|    bf1[1] = in[16];
  836|  46.7k|    bf1[2] = in[8];
  837|  46.7k|    bf1[3] = in[24];
  838|  46.7k|    bf1[4] = in[4];
  839|  46.7k|    bf1[5] = in[20];
  840|  46.7k|    bf1[6] = in[12];
  841|  46.7k|    bf1[7] = in[28];
  842|  46.7k|    bf1[8] = in[2];
  843|  46.7k|    bf1[9] = in[18];
  844|  46.7k|    bf1[10] = in[10];
  845|  46.7k|    bf1[11] = in[26];
  846|  46.7k|    bf1[12] = in[6];
  847|  46.7k|    bf1[13] = in[22];
  848|  46.7k|    bf1[14] = in[14];
  849|  46.7k|    bf1[15] = in[30];
  850|  46.7k|    bf1[16] = in[1];
  851|  46.7k|    bf1[17] = in[17];
  852|  46.7k|    bf1[18] = in[9];
  853|  46.7k|    bf1[19] = in[25];
  854|  46.7k|    bf1[20] = in[5];
  855|  46.7k|    bf1[21] = in[21];
  856|  46.7k|    bf1[22] = in[13];
  857|  46.7k|    bf1[23] = in[29];
  858|  46.7k|    bf1[24] = in[3];
  859|  46.7k|    bf1[25] = in[19];
  860|  46.7k|    bf1[26] = in[11];
  861|  46.7k|    bf1[27] = in[27];
  862|  46.7k|    bf1[28] = in[7];
  863|  46.7k|    bf1[29] = in[23];
  864|  46.7k|    bf1[30] = in[15];
  865|  46.7k|    bf1[31] = in[31];
  866|       |
  867|       |    // stage 2
  868|  46.7k|    bf0[0] = bf1[0];
  869|  46.7k|    bf0[1] = bf1[1];
  870|  46.7k|    bf0[2] = bf1[2];
  871|  46.7k|    bf0[3] = bf1[3];
  872|  46.7k|    bf0[4] = bf1[4];
  873|  46.7k|    bf0[5] = bf1[5];
  874|  46.7k|    bf0[6] = bf1[6];
  875|  46.7k|    bf0[7] = bf1[7];
  876|  46.7k|    bf0[8] = bf1[8];
  877|  46.7k|    bf0[9] = bf1[9];
  878|  46.7k|    bf0[10] = bf1[10];
  879|  46.7k|    bf0[11] = bf1[11];
  880|  46.7k|    bf0[12] = bf1[12];
  881|  46.7k|    bf0[13] = bf1[13];
  882|  46.7k|    bf0[14] = bf1[14];
  883|  46.7k|    bf0[15] = bf1[15];
  884|  46.7k|    bf0[16] =
  885|  46.7k|        half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
  886|  46.7k|    bf0[17] =
  887|  46.7k|        half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
  888|  46.7k|    bf0[18] =
  889|  46.7k|        half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
  890|  46.7k|    bf0[19] =
  891|  46.7k|        half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
  892|  46.7k|    bf0[20] =
  893|  46.7k|        half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
  894|  46.7k|    bf0[21] =
  895|  46.7k|        half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
  896|  46.7k|    bf0[22] =
  897|  46.7k|        half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
  898|  46.7k|    bf0[23] =
  899|  46.7k|        half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
  900|  46.7k|    bf0[24] =
  901|  46.7k|        half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
  902|  46.7k|    bf0[25] =
  903|  46.7k|        half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
  904|  46.7k|    bf0[26] =
  905|  46.7k|        half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
  906|  46.7k|    bf0[27] =
  907|  46.7k|        half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
  908|  46.7k|    bf0[28] =
  909|  46.7k|        half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
  910|  46.7k|    bf0[29] =
  911|  46.7k|        half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
  912|  46.7k|    bf0[30] =
  913|  46.7k|        half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
  914|  46.7k|    bf0[31] =
  915|  46.7k|        half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
  916|       |
  917|       |    // stage 3
  918|  46.7k|    bf1[0] = bf0[0];
  919|  46.7k|    bf1[1] = bf0[1];
  920|  46.7k|    bf1[2] = bf0[2];
  921|  46.7k|    bf1[3] = bf0[3];
  922|  46.7k|    bf1[4] = bf0[4];
  923|  46.7k|    bf1[5] = bf0[5];
  924|  46.7k|    bf1[6] = bf0[6];
  925|  46.7k|    bf1[7] = bf0[7];
  926|  46.7k|    bf1[8] =
  927|  46.7k|        half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
  928|  46.7k|    bf1[9] =
  929|  46.7k|        half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
  930|  46.7k|    bf1[10] =
  931|  46.7k|        half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
  932|  46.7k|    bf1[11] =
  933|  46.7k|        half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
  934|  46.7k|    bf1[12] =
  935|  46.7k|        half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
  936|  46.7k|    bf1[13] =
  937|  46.7k|        half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
  938|  46.7k|    bf1[14] =
  939|  46.7k|        half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
  940|  46.7k|    bf1[15] =
  941|  46.7k|        half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
  942|       |
  943|  46.7k|    addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
  944|  46.7k|    addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
  945|  46.7k|    addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
  946|  46.7k|    addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
  947|  46.7k|    addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
  948|  46.7k|    addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
  949|  46.7k|    addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
  950|  46.7k|    addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
  951|       |
  952|       |    // stage 4
  953|  46.7k|    bf0[0] = bf1[0];
  954|  46.7k|    bf0[1] = bf1[1];
  955|  46.7k|    bf0[2] = bf1[2];
  956|  46.7k|    bf0[3] = bf1[3];
  957|  46.7k|    bf0[4] =
  958|  46.7k|        half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
  959|  46.7k|    bf0[5] =
  960|  46.7k|        half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
  961|  46.7k|    bf0[6] =
  962|  46.7k|        half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
  963|  46.7k|    bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
  964|       |
  965|  46.7k|    addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
  966|  46.7k|    addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
  967|  46.7k|    addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
  968|  46.7k|    addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
  969|       |
  970|  46.7k|    bf0[16] = bf1[16];
  971|  46.7k|    bf0[17] =
  972|  46.7k|        half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
  973|  46.7k|    bf0[18] =
  974|  46.7k|        half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
  975|  46.7k|    bf0[19] = bf1[19];
  976|  46.7k|    bf0[20] = bf1[20];
  977|  46.7k|    bf0[21] =
  978|  46.7k|        half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
  979|  46.7k|    bf0[22] =
  980|  46.7k|        half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
  981|  46.7k|    bf0[23] = bf1[23];
  982|  46.7k|    bf0[24] = bf1[24];
  983|  46.7k|    bf0[25] =
  984|  46.7k|        half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
  985|  46.7k|    bf0[26] =
  986|  46.7k|        half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
  987|  46.7k|    bf0[27] = bf1[27];
  988|  46.7k|    bf0[28] = bf1[28];
  989|  46.7k|    bf0[29] =
  990|  46.7k|        half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
  991|  46.7k|    bf0[30] =
  992|  46.7k|        half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
  993|  46.7k|    bf0[31] = bf1[31];
  994|       |
  995|       |    // stage 5
  996|  46.7k|    bf1[0] =
  997|  46.7k|        half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
  998|  46.7k|    bf1[1] =
  999|  46.7k|        half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
 1000|  46.7k|    bf1[2] =
 1001|  46.7k|        half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
 1002|  46.7k|    bf1[3] =
 1003|  46.7k|        half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
 1004|  46.7k|    addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
 1005|  46.7k|    addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
 1006|  46.7k|    bf1[8] = bf0[8];
 1007|  46.7k|    bf1[9] =
 1008|  46.7k|        half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
 1009|  46.7k|    bf1[10] =
 1010|  46.7k|        half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
 1011|  46.7k|    bf1[11] = bf0[11];
 1012|  46.7k|    bf1[12] = bf0[12];
 1013|  46.7k|    bf1[13] =
 1014|  46.7k|        half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
 1015|  46.7k|    bf1[14] =
 1016|  46.7k|        half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
 1017|  46.7k|    bf1[15] = bf0[15];
 1018|  46.7k|    addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
 1019|  46.7k|    addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
 1020|  46.7k|    addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
 1021|  46.7k|    addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
 1022|  46.7k|    addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
 1023|  46.7k|    addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
 1024|  46.7k|    addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
 1025|  46.7k|    addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
 1026|       |
 1027|       |    // stage 6
 1028|  46.7k|    addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
 1029|  46.7k|    addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
 1030|  46.7k|    bf0[4] = bf1[4];
 1031|  46.7k|    bf0[5] =
 1032|  46.7k|        half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
 1033|  46.7k|    bf0[6] =
 1034|  46.7k|        half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
 1035|  46.7k|    bf0[7] = bf1[7];
 1036|  46.7k|    addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
 1037|  46.7k|    addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
 1038|  46.7k|    addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
 1039|  46.7k|    addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
 1040|  46.7k|    bf0[16] = bf1[16];
 1041|  46.7k|    bf0[17] = bf1[17];
 1042|  46.7k|    bf0[18] =
 1043|  46.7k|        half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
 1044|  46.7k|    bf0[19] =
 1045|  46.7k|        half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
 1046|  46.7k|    bf0[20] =
 1047|  46.7k|        half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
 1048|  46.7k|    bf0[21] =
 1049|  46.7k|        half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
 1050|  46.7k|    bf0[22] = bf1[22];
 1051|  46.7k|    bf0[23] = bf1[23];
 1052|  46.7k|    bf0[24] = bf1[24];
 1053|  46.7k|    bf0[25] = bf1[25];
 1054|  46.7k|    bf0[26] =
 1055|  46.7k|        half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
 1056|  46.7k|    bf0[27] =
 1057|  46.7k|        half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
 1058|  46.7k|    bf0[28] =
 1059|  46.7k|        half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
 1060|  46.7k|    bf0[29] =
 1061|  46.7k|        half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
 1062|  46.7k|    bf0[30] = bf1[30];
 1063|  46.7k|    bf0[31] = bf1[31];
 1064|       |
 1065|       |    // stage 7
 1066|  46.7k|    addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
 1067|  46.7k|    addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
 1068|  46.7k|    addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
 1069|  46.7k|    addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
 1070|  46.7k|    bf1[8] = bf0[8];
 1071|  46.7k|    bf1[9] = bf0[9];
 1072|  46.7k|    bf1[10] =
 1073|  46.7k|        half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
 1074|  46.7k|    bf1[11] =
 1075|  46.7k|        half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
 1076|  46.7k|    bf1[12] =
 1077|  46.7k|        half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
 1078|  46.7k|    bf1[13] =
 1079|  46.7k|        half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
 1080|  46.7k|    bf1[14] = bf0[14];
 1081|  46.7k|    bf1[15] = bf0[15];
 1082|  46.7k|    addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
 1083|  46.7k|    addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
 1084|  46.7k|    addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
 1085|  46.7k|    addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
 1086|  46.7k|    addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
 1087|  46.7k|    addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
 1088|  46.7k|    addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
 1089|  46.7k|    addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
 1090|       |
 1091|       |    // stage 8
 1092|  46.7k|    addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
 1093|  46.7k|    addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
 1094|  46.7k|    addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
 1095|  46.7k|    addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
 1096|  46.7k|    addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
 1097|  46.7k|    addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
 1098|  46.7k|    addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
 1099|  46.7k|    addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
 1100|  46.7k|    bf0[16] = bf1[16];
 1101|  46.7k|    bf0[17] = bf1[17];
 1102|  46.7k|    bf0[18] = bf1[18];
 1103|  46.7k|    bf0[19] = bf1[19];
 1104|  46.7k|    bf0[20] =
 1105|  46.7k|        half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
 1106|  46.7k|    bf0[21] =
 1107|  46.7k|        half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
 1108|  46.7k|    bf0[22] =
 1109|  46.7k|        half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
 1110|  46.7k|    bf0[23] =
 1111|  46.7k|        half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
 1112|  46.7k|    bf0[24] =
 1113|  46.7k|        half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
 1114|  46.7k|    bf0[25] =
 1115|  46.7k|        half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
 1116|  46.7k|    bf0[26] =
 1117|  46.7k|        half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
 1118|  46.7k|    bf0[27] =
 1119|  46.7k|        half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
 1120|  46.7k|    bf0[28] = bf1[28];
 1121|  46.7k|    bf0[29] = bf1[29];
 1122|  46.7k|    bf0[30] = bf1[30];
 1123|  46.7k|    bf0[31] = bf1[31];
 1124|       |
 1125|       |    // stage 9
 1126|  46.7k|    addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
 1127|  46.7k|    addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
 1128|  46.7k|    addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
 1129|  46.7k|    addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
 1130|  46.7k|    addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
 1131|  46.7k|    addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
 1132|  46.7k|    addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
 1133|  46.7k|    addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
 1134|  46.7k|    addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
 1135|  46.7k|    addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
 1136|  46.7k|    addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
 1137|  46.7k|    addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
 1138|  46.7k|    addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
 1139|  46.7k|    addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
 1140|  46.7k|    addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
 1141|  46.7k|    addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
 1142|  46.7k|    if (!do_cols) {
  ------------------
  |  Branch (1142:9): [True: 23.2k, False: 23.4k]
  ------------------
 1143|  23.2k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  23.2k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 23.2k]
  |  |  ------------------
  ------------------
 1144|  23.2k|      const __m256i clamp_lo_out =
 1145|  23.2k|          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 1146|  23.2k|      const __m256i clamp_hi_out =
 1147|  23.2k|          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 1148|  23.2k|      round_shift_8x8_avx2(out, out_shift);
 1149|  23.2k|      round_shift_8x8_avx2(out + 16, out_shift);
 1150|  23.2k|      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
 1151|  23.2k|    }
 1152|  46.7k|  }
 1153|  46.7k|}
highbd_inv_txfm_avx2.c:idct64_low1_avx2:
 2959|  24.9k|                             int bd, int out_shift) {
 2960|  24.9k|  const int32_t *cospi = cospi_arr(bit);
 2961|  24.9k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 2962|  24.9k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  49.9k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 24.9k]
  |  |  |  Branch (35:31): [True: 21.3k, False: 3.66k]
  |  |  |  Branch (35:44): [True: 21.3k, False: 3.66k]
  |  |  ------------------
  ------------------
 2963|  24.9k|  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 2964|  24.9k|  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 2965|       |
 2966|  24.9k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 2967|       |
 2968|  24.9k|  {
 2969|  24.9k|    __m256i x;
 2970|       |
 2971|       |    // stage 1
 2972|       |    // stage 2
 2973|       |    // stage 3
 2974|       |    // stage 4
 2975|       |    // stage 5
 2976|       |    // stage 6
 2977|  24.9k|    x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit);
 2978|       |
 2979|       |    // stage 8
 2980|       |    // stage 9
 2981|       |    // stage 10
 2982|       |    // stage 11
 2983|  24.9k|    if (!do_cols) {
  ------------------
  |  Branch (2983:9): [True: 3.66k, False: 21.3k]
  ------------------
 2984|  3.66k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  3.66k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 3.66k]
  |  |  ------------------
  ------------------
 2985|  3.66k|      clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 2986|  3.66k|      clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 2987|  3.66k|      if (out_shift != 0) {
  ------------------
  |  Branch (2987:11): [True: 3.66k, False: 0]
  ------------------
 2988|  3.66k|        __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
 2989|  3.66k|        x = _mm256_add_epi32(x, offset);
 2990|  3.66k|        x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
 2991|  3.66k|      }
 2992|  3.66k|    }
 2993|  24.9k|    x = _mm256_max_epi32(x, clamp_lo);
 2994|  24.9k|    x = _mm256_min_epi32(x, clamp_hi);
 2995|  24.9k|    out[0] = x;
 2996|  24.9k|    out[1] = x;
 2997|  24.9k|    out[2] = x;
 2998|  24.9k|    out[3] = x;
 2999|  24.9k|    out[4] = x;
 3000|  24.9k|    out[5] = x;
 3001|  24.9k|    out[6] = x;
 3002|  24.9k|    out[7] = x;
 3003|  24.9k|    out[8] = x;
 3004|  24.9k|    out[9] = x;
 3005|  24.9k|    out[10] = x;
 3006|  24.9k|    out[11] = x;
 3007|  24.9k|    out[12] = x;
 3008|  24.9k|    out[13] = x;
 3009|  24.9k|    out[14] = x;
 3010|  24.9k|    out[15] = x;
 3011|  24.9k|    out[16] = x;
 3012|  24.9k|    out[17] = x;
 3013|  24.9k|    out[18] = x;
 3014|  24.9k|    out[19] = x;
 3015|  24.9k|    out[20] = x;
 3016|  24.9k|    out[21] = x;
 3017|  24.9k|    out[22] = x;
 3018|  24.9k|    out[23] = x;
 3019|  24.9k|    out[24] = x;
 3020|  24.9k|    out[25] = x;
 3021|  24.9k|    out[26] = x;
 3022|  24.9k|    out[27] = x;
 3023|  24.9k|    out[28] = x;
 3024|  24.9k|    out[29] = x;
 3025|  24.9k|    out[30] = x;
 3026|  24.9k|    out[31] = x;
 3027|  24.9k|    out[32] = x;
 3028|  24.9k|    out[33] = x;
 3029|  24.9k|    out[34] = x;
 3030|  24.9k|    out[35] = x;
 3031|  24.9k|    out[36] = x;
 3032|  24.9k|    out[37] = x;
 3033|  24.9k|    out[38] = x;
 3034|  24.9k|    out[39] = x;
 3035|  24.9k|    out[40] = x;
 3036|  24.9k|    out[41] = x;
 3037|  24.9k|    out[42] = x;
 3038|  24.9k|    out[43] = x;
 3039|  24.9k|    out[44] = x;
 3040|  24.9k|    out[45] = x;
 3041|  24.9k|    out[46] = x;
 3042|  24.9k|    out[47] = x;
 3043|  24.9k|    out[48] = x;
 3044|  24.9k|    out[49] = x;
 3045|  24.9k|    out[50] = x;
 3046|  24.9k|    out[51] = x;
 3047|  24.9k|    out[52] = x;
 3048|  24.9k|    out[53] = x;
 3049|  24.9k|    out[54] = x;
 3050|  24.9k|    out[55] = x;
 3051|  24.9k|    out[56] = x;
 3052|  24.9k|    out[57] = x;
 3053|  24.9k|    out[58] = x;
 3054|  24.9k|    out[59] = x;
 3055|  24.9k|    out[60] = x;
 3056|  24.9k|    out[61] = x;
 3057|  24.9k|    out[62] = x;
 3058|  24.9k|    out[63] = x;
 3059|  24.9k|  }
 3060|  24.9k|}
highbd_inv_txfm_avx2.c:idct64_low8_avx2:
 3062|  19.0k|                             int bd, int out_shift) {
 3063|  19.0k|  int i, j;
 3064|  19.0k|  const int32_t *cospi = cospi_arr(bit);
 3065|  19.0k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 3066|  19.0k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  38.0k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 19.0k]
  |  |  |  Branch (35:31): [True: 16.1k, False: 2.85k]
  |  |  |  Branch (35:44): [True: 16.1k, False: 2.85k]
  |  |  ------------------
  ------------------
 3067|  19.0k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 3068|  19.0k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 3069|       |
 3070|  19.0k|  const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
 3071|  19.0k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
 3072|  19.0k|  const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
 3073|  19.0k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
 3074|  19.0k|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
 3075|  19.0k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 3076|  19.0k|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
 3077|  19.0k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 3078|  19.0k|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
 3079|  19.0k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
 3080|  19.0k|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
 3081|  19.0k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 3082|  19.0k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
 3083|  19.0k|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
 3084|  19.0k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 3085|  19.0k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 3086|  19.0k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
 3087|  19.0k|  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
 3088|  19.0k|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
 3089|  19.0k|  const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
 3090|  19.0k|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
 3091|  19.0k|  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
 3092|  19.0k|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
 3093|  19.0k|  const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
 3094|  19.0k|  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
 3095|  19.0k|  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
 3096|  19.0k|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
 3097|  19.0k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
 3098|  19.0k|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
 3099|  19.0k|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
 3100|  19.0k|  const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
 3101|  19.0k|  const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
 3102|  19.0k|  const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
 3103|  19.0k|  const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
 3104|  19.0k|  const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
 3105|  19.0k|  const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
 3106|  19.0k|  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
 3107|  19.0k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
 3108|       |
 3109|  19.0k|  {
 3110|  19.0k|    __m256i u[64];
 3111|       |
 3112|       |    // stage 1
 3113|  19.0k|    u[0] = in[0];
 3114|  19.0k|    u[8] = in[4];
 3115|  19.0k|    u[16] = in[2];
 3116|  19.0k|    u[24] = in[6];
 3117|  19.0k|    u[32] = in[1];
 3118|  19.0k|    u[40] = in[5];
 3119|  19.0k|    u[48] = in[3];
 3120|  19.0k|    u[56] = in[7];
 3121|       |
 3122|       |    // stage 2
 3123|  19.0k|    u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
 3124|  19.0k|    u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
 3125|  19.0k|    u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
 3126|  19.0k|    u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
 3127|  19.0k|    u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
 3128|  19.0k|    u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
 3129|  19.0k|    u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
 3130|  19.0k|    u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
 3131|       |
 3132|       |    // stage 3
 3133|  19.0k|    u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
 3134|  19.0k|    u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
 3135|  19.0k|    u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
 3136|  19.0k|    u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
 3137|  19.0k|    u[33] = u[32];
 3138|  19.0k|    u[38] = u[39];
 3139|  19.0k|    u[41] = u[40];
 3140|  19.0k|    u[46] = u[47];
 3141|  19.0k|    u[49] = u[48];
 3142|  19.0k|    u[54] = u[55];
 3143|  19.0k|    u[57] = u[56];
 3144|  19.0k|    u[62] = u[63];
 3145|       |
 3146|       |    // stage 4
 3147|  19.0k|    __m256i temp1, temp2;
 3148|  19.0k|    u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
 3149|  19.0k|    u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
 3150|  19.0k|    u[17] = u[16];
 3151|  19.0k|    u[22] = u[23];
 3152|  19.0k|    u[25] = u[24];
 3153|  19.0k|    u[30] = u[31];
 3154|       |
 3155|  19.0k|    temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
 3156|  19.0k|    u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
 3157|  19.0k|    u[33] = temp1;
 3158|       |
 3159|  19.0k|    temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
 3160|  19.0k|    u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
 3161|  19.0k|    u[57] = temp2;
 3162|       |
 3163|  19.0k|    temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
 3164|  19.0k|    u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
 3165|  19.0k|    u[41] = temp1;
 3166|       |
 3167|  19.0k|    temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
 3168|  19.0k|    u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
 3169|  19.0k|    u[46] = temp2;
 3170|       |
 3171|       |    // stage 5
 3172|  19.0k|    u[9] = u[8];
 3173|  19.0k|    u[14] = u[15];
 3174|       |
 3175|  19.0k|    temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
 3176|  19.0k|    u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
 3177|  19.0k|    u[17] = temp1;
 3178|       |
 3179|  19.0k|    temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
 3180|  19.0k|    u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
 3181|  19.0k|    u[22] = temp2;
 3182|       |
 3183|  19.0k|    u[35] = u[32];
 3184|  19.0k|    u[34] = u[33];
 3185|  19.0k|    u[36] = u[39];
 3186|  19.0k|    u[37] = u[38];
 3187|  19.0k|    u[43] = u[40];
 3188|  19.0k|    u[42] = u[41];
 3189|  19.0k|    u[44] = u[47];
 3190|  19.0k|    u[45] = u[46];
 3191|  19.0k|    u[51] = u[48];
 3192|  19.0k|    u[50] = u[49];
 3193|  19.0k|    u[52] = u[55];
 3194|  19.0k|    u[53] = u[54];
 3195|  19.0k|    u[59] = u[56];
 3196|  19.0k|    u[58] = u[57];
 3197|  19.0k|    u[60] = u[63];
 3198|  19.0k|    u[61] = u[62];
 3199|       |
 3200|       |    // stage 6
 3201|  19.0k|    temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
 3202|  19.0k|    u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
 3203|  19.0k|    u[0] = temp1;
 3204|       |
 3205|  19.0k|    temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
 3206|  19.0k|    u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
 3207|  19.0k|    u[9] = temp2;
 3208|  19.0k|    u[19] = u[16];
 3209|  19.0k|    u[18] = u[17];
 3210|  19.0k|    u[20] = u[23];
 3211|  19.0k|    u[21] = u[22];
 3212|  19.0k|    u[27] = u[24];
 3213|  19.0k|    u[26] = u[25];
 3214|  19.0k|    u[28] = u[31];
 3215|  19.0k|    u[29] = u[30];
 3216|       |
 3217|  19.0k|    temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
 3218|  19.0k|    u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
 3219|  19.0k|    u[34] = temp1;
 3220|  19.0k|    temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
 3221|  19.0k|    u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
 3222|  19.0k|    u[35] = temp2;
 3223|  19.0k|    temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
 3224|  19.0k|    u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
 3225|  19.0k|    u[36] = temp1;
 3226|  19.0k|    temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
 3227|  19.0k|    u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
 3228|  19.0k|    u[37] = temp2;
 3229|  19.0k|    temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
 3230|  19.0k|    u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
 3231|  19.0k|    u[42] = temp1;
 3232|  19.0k|    temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
 3233|  19.0k|    u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
 3234|  19.0k|    u[43] = temp2;
 3235|  19.0k|    temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
 3236|  19.0k|    u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
 3237|  19.0k|    u[44] = temp1;
 3238|  19.0k|    temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
 3239|  19.0k|    u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
 3240|  19.0k|    u[45] = temp2;
 3241|       |
 3242|       |    // stage 7
 3243|  19.0k|    u[3] = u[0];
 3244|  19.0k|    u[2] = u[1];
 3245|  19.0k|    u[11] = u[8];
 3246|  19.0k|    u[10] = u[9];
 3247|  19.0k|    u[12] = u[15];
 3248|  19.0k|    u[13] = u[14];
 3249|       |
 3250|  19.0k|    temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
 3251|  19.0k|    u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
 3252|  19.0k|    u[18] = temp1;
 3253|  19.0k|    temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
 3254|  19.0k|    u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
 3255|  19.0k|    u[19] = temp2;
 3256|  19.0k|    temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
 3257|  19.0k|    u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
 3258|  19.0k|    u[20] = temp1;
 3259|  19.0k|    temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
 3260|  19.0k|    u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
 3261|  19.0k|    u[21] = temp2;
 3262|  57.0k|    for (i = 32; i < 64; i += 16) {
  ------------------
  |  Branch (3262:18): [True: 38.0k, False: 19.0k]
  ------------------
 3263|   190k|      for (j = i; j < i + 4; j++) {
  ------------------
  |  Branch (3263:19): [True: 152k, False: 38.0k]
  ------------------
 3264|   152k|        addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
 3265|   152k|        addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
 3266|   152k|                    &clamp_hi);
 3267|   152k|      }
 3268|  38.0k|    }
 3269|       |
 3270|       |    // stage 8
 3271|  19.0k|    u[7] = u[0];
 3272|  19.0k|    u[6] = u[1];
 3273|  19.0k|    u[5] = u[2];
 3274|  19.0k|    u[4] = u[3];
 3275|       |
 3276|  19.0k|    idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
 3277|  19.0k|                       &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
 3278|       |
 3279|       |    // stage 9
 3280|  19.0k|    idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
 3281|  19.0k|                       bit);
 3282|       |
 3283|       |    // stage 10
 3284|  19.0k|    idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
 3285|  19.0k|                        bit);
 3286|       |
 3287|       |    // stage 11
 3288|  19.0k|    idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
 3289|  19.0k|  }
 3290|  19.0k|}
highbd_inv_txfm_avx2.c:idct64_stage8_avx2:
 2825|  27.9k|    const __m256i *rnding, int bit) {
 2826|  27.9k|  int i;
 2827|  27.9k|  __m256i temp1, temp2, temp3, temp4;
 2828|  27.9k|  temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit);
 2829|  27.9k|  u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit);
 2830|  27.9k|  u[10] = temp1;
 2831|  27.9k|  temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit);
 2832|  27.9k|  u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit);
 2833|  27.9k|  u[11] = temp2;
 2834|       |
 2835|   139k|  for (i = 16; i < 20; ++i) {
  ------------------
  |  Branch (2835:16): [True: 111k, False: 27.9k]
  ------------------
 2836|   111k|    addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
 2837|   111k|    addsub_avx2(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
 2838|   111k|  }
 2839|       |
 2840|  27.9k|  temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit);
 2841|  27.9k|  temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit);
 2842|  27.9k|  temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit);
 2843|  27.9k|  temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit);
 2844|  27.9k|  u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit);
 2845|  27.9k|  u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit);
 2846|  27.9k|  u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit);
 2847|  27.9k|  u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit);
 2848|  27.9k|  u[36] = temp1;
 2849|  27.9k|  u[37] = temp2;
 2850|  27.9k|  u[38] = temp3;
 2851|  27.9k|  u[39] = temp4;
 2852|       |
 2853|  27.9k|  temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit);
 2854|  27.9k|  temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit);
 2855|  27.9k|  temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit);
 2856|  27.9k|  temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit);
 2857|  27.9k|  u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit);
 2858|  27.9k|  u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit);
 2859|  27.9k|  u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit);
 2860|  27.9k|  u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit);
 2861|  27.9k|  u[40] = temp1;
 2862|  27.9k|  u[41] = temp2;
 2863|  27.9k|  u[42] = temp3;
 2864|  27.9k|  u[43] = temp4;
 2865|  27.9k|}
highbd_inv_txfm_avx2.c:idct64_stage9_avx2:
 2871|  27.9k|                                      const __m256i *rnding, int bit) {
 2872|  27.9k|  int i;
 2873|  27.9k|  __m256i temp1, temp2, temp3, temp4;
 2874|   251k|  for (i = 0; i < 8; ++i) {
  ------------------
  |  Branch (2874:15): [True: 223k, False: 27.9k]
  ------------------
 2875|   223k|    addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
 2876|   223k|  }
 2877|       |
 2878|  27.9k|  temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit);
 2879|  27.9k|  temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit);
 2880|  27.9k|  temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit);
 2881|  27.9k|  temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit);
 2882|  27.9k|  u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit);
 2883|  27.9k|  u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit);
 2884|  27.9k|  u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit);
 2885|  27.9k|  u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit);
 2886|  27.9k|  u[20] = temp1;
 2887|  27.9k|  u[21] = temp2;
 2888|  27.9k|  u[22] = temp3;
 2889|  27.9k|  u[23] = temp4;
 2890|   251k|  for (i = 32; i < 40; i++) {
  ------------------
  |  Branch (2890:16): [True: 223k, False: 27.9k]
  ------------------
 2891|   223k|    addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
 2892|   223k|  }
 2893|       |
 2894|   251k|  for (i = 48; i < 56; i++) {
  ------------------
  |  Branch (2894:16): [True: 223k, False: 27.9k]
  ------------------
 2895|   223k|    addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
 2896|   223k|  }
 2897|  27.9k|}
highbd_inv_txfm_avx2.c:idct64_stage10_avx2:
 2903|  27.9k|                                       const __m256i *rnding, int bit) {
 2904|  27.9k|  __m256i temp1, temp2, temp3, temp4;
 2905|   474k|  for (int i = 0; i < 16; i++) {
  ------------------
  |  Branch (2905:19): [True: 447k, False: 27.9k]
  ------------------
 2906|   447k|    addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
 2907|   447k|  }
 2908|       |
 2909|  27.9k|  temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit);
 2910|  27.9k|  temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit);
 2911|  27.9k|  temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit);
 2912|  27.9k|  temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit);
 2913|  27.9k|  u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit);
 2914|  27.9k|  u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit);
 2915|  27.9k|  u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit);
 2916|  27.9k|  u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit);
 2917|  27.9k|  u[40] = temp1;
 2918|  27.9k|  u[41] = temp2;
 2919|  27.9k|  u[42] = temp3;
 2920|  27.9k|  u[43] = temp4;
 2921|       |
 2922|  27.9k|  temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit);
 2923|  27.9k|  temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit);
 2924|  27.9k|  temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit);
 2925|  27.9k|  temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit);
 2926|  27.9k|  u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit);
 2927|  27.9k|  u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit);
 2928|  27.9k|  u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit);
 2929|  27.9k|  u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit);
 2930|  27.9k|  u[44] = temp1;
 2931|  27.9k|  u[45] = temp2;
 2932|  27.9k|  u[46] = temp3;
 2933|  27.9k|  u[47] = temp4;
 2934|  27.9k|}
highbd_inv_txfm_avx2.c:idct64_stage11_avx2:
 2939|  27.9k|                                       const __m256i *clamp_hi) {
 2940|   921k|  for (int i = 0; i < 32; i++) {
  ------------------
  |  Branch (2940:19): [True: 894k, False: 27.9k]
  ------------------
 2941|   894k|    addsub_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)], clamp_lo, clamp_hi);
 2942|   894k|  }
 2943|       |
 2944|  27.9k|  if (!do_cols) {
  ------------------
  |  Branch (2944:7): [True: 5.24k, False: 22.6k]
  ------------------
 2945|  5.24k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  5.24k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 5.24k]
  |  |  ------------------
  ------------------
 2946|  5.24k|    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 2947|  5.24k|    const __m256i clamp_hi_out =
 2948|  5.24k|        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 2949|       |
 2950|  5.24k|    round_shift_8x8_avx2(out, out_shift);
 2951|  5.24k|    round_shift_8x8_avx2(out + 16, out_shift);
 2952|  5.24k|    round_shift_8x8_avx2(out + 32, out_shift);
 2953|  5.24k|    round_shift_8x8_avx2(out + 48, out_shift);
 2954|  5.24k|    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
 2955|  5.24k|  }
 2956|  27.9k|}
highbd_inv_txfm_avx2.c:idct64_low16_avx2:
 3292|  8.92k|                              int bd, int out_shift) {
 3293|  8.92k|  int i, j;
 3294|  8.92k|  const int32_t *cospi = cospi_arr(bit);
 3295|  8.92k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 3296|  8.92k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  17.8k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 8.92k]
  |  |  |  Branch (35:31): [True: 6.52k, False: 2.39k]
  |  |  |  Branch (35:44): [True: 6.52k, False: 2.39k]
  |  |  ------------------
  ------------------
 3297|  8.92k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 3298|  8.92k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 3299|       |
 3300|  8.92k|  const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
 3301|  8.92k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
 3302|  8.92k|  const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
 3303|  8.92k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
 3304|  8.92k|  const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
 3305|  8.92k|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
 3306|  8.92k|  const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
 3307|  8.92k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 3308|  8.92k|  const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
 3309|  8.92k|  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
 3310|  8.92k|  const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
 3311|  8.92k|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
 3312|  8.92k|  const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
 3313|  8.92k|  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
 3314|  8.92k|  const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
 3315|  8.92k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 3316|  8.92k|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
 3317|  8.92k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
 3318|  8.92k|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
 3319|  8.92k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 3320|  8.92k|  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
 3321|  8.92k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
 3322|  8.92k|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
 3323|  8.92k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 3324|  8.92k|  const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
 3325|  8.92k|  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
 3326|  8.92k|  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
 3327|  8.92k|  const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
 3328|  8.92k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 3329|  8.92k|  const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
 3330|  8.92k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
 3331|  8.92k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
 3332|  8.92k|  const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
 3333|       |
 3334|  8.92k|  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
 3335|  8.92k|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
 3336|  8.92k|  const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
 3337|  8.92k|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
 3338|  8.92k|  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
 3339|  8.92k|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
 3340|  8.92k|  const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
 3341|  8.92k|  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
 3342|  8.92k|  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
 3343|  8.92k|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
 3344|  8.92k|  const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
 3345|  8.92k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
 3346|  8.92k|  const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
 3347|  8.92k|  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
 3348|  8.92k|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
 3349|  8.92k|  const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
 3350|  8.92k|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
 3351|  8.92k|  const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
 3352|  8.92k|  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
 3353|  8.92k|  const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
 3354|  8.92k|  const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
 3355|       |
 3356|  8.92k|  {
 3357|  8.92k|    __m256i u[64];
 3358|  8.92k|    __m256i tmp1, tmp2, tmp3, tmp4;
 3359|       |    // stage 1
 3360|  8.92k|    u[0] = in[0];
 3361|  8.92k|    u[32] = in[1];
 3362|  8.92k|    u[36] = in[9];
 3363|  8.92k|    u[40] = in[5];
 3364|  8.92k|    u[44] = in[13];
 3365|  8.92k|    u[48] = in[3];
 3366|  8.92k|    u[52] = in[11];
 3367|  8.92k|    u[56] = in[7];
 3368|  8.92k|    u[60] = in[15];
 3369|  8.92k|    u[16] = in[2];
 3370|  8.92k|    u[20] = in[10];
 3371|  8.92k|    u[24] = in[6];
 3372|  8.92k|    u[28] = in[14];
 3373|  8.92k|    u[4] = in[8];
 3374|  8.92k|    u[8] = in[4];
 3375|  8.92k|    u[12] = in[12];
 3376|       |
 3377|       |    // stage 2
 3378|  8.92k|    u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
 3379|  8.92k|    u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
 3380|  8.92k|    u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
 3381|  8.92k|    u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
 3382|  8.92k|    u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
 3383|  8.92k|    u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
 3384|  8.92k|    u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
 3385|  8.92k|    u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
 3386|  8.92k|    u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
 3387|  8.92k|    u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
 3388|  8.92k|    u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
 3389|  8.92k|    u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
 3390|  8.92k|    u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
 3391|  8.92k|    u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
 3392|  8.92k|    u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
 3393|  8.92k|    u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
 3394|       |
 3395|       |    // stage 3
 3396|  8.92k|    u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
 3397|  8.92k|    u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
 3398|  8.92k|    u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit);
 3399|  8.92k|    u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit);
 3400|  8.92k|    u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit);
 3401|  8.92k|    u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit);
 3402|  8.92k|    u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
 3403|  8.92k|    u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
 3404|  8.92k|    u[33] = u[32];
 3405|  8.92k|    u[34] = u[35];
 3406|  8.92k|    u[37] = u[36];
 3407|  8.92k|    u[38] = u[39];
 3408|  8.92k|    u[41] = u[40];
 3409|  8.92k|    u[42] = u[43];
 3410|  8.92k|    u[45] = u[44];
 3411|  8.92k|    u[46] = u[47];
 3412|  8.92k|    u[49] = u[48];
 3413|  8.92k|    u[50] = u[51];
 3414|  8.92k|    u[53] = u[52];
 3415|  8.92k|    u[54] = u[55];
 3416|  8.92k|    u[57] = u[56];
 3417|  8.92k|    u[58] = u[59];
 3418|  8.92k|    u[61] = u[60];
 3419|  8.92k|    u[62] = u[63];
 3420|       |
 3421|       |    // stage 4
 3422|  8.92k|    u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
 3423|  8.92k|    u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
 3424|  8.92k|    u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
 3425|  8.92k|    u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
 3426|       |
 3427|  8.92k|    u[17] = u[16];
 3428|  8.92k|    u[18] = u[19];
 3429|  8.92k|    u[21] = u[20];
 3430|  8.92k|    u[22] = u[23];
 3431|  8.92k|    u[25] = u[24];
 3432|  8.92k|    u[26] = u[27];
 3433|  8.92k|    u[29] = u[28];
 3434|  8.92k|    u[30] = u[31];
 3435|       |
 3436|  8.92k|    tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
 3437|  8.92k|    tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
 3438|  8.92k|    tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
 3439|  8.92k|    tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
 3440|  8.92k|    u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
 3441|  8.92k|    u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
 3442|  8.92k|    u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
 3443|  8.92k|    u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
 3444|  8.92k|    u[33] = tmp1;
 3445|  8.92k|    u[34] = tmp2;
 3446|  8.92k|    u[37] = tmp3;
 3447|  8.92k|    u[38] = tmp4;
 3448|       |
 3449|  8.92k|    tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
 3450|  8.92k|    tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
 3451|  8.92k|    tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
 3452|  8.92k|    tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
 3453|  8.92k|    u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
 3454|  8.92k|    u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
 3455|  8.92k|    u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
 3456|  8.92k|    u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
 3457|  8.92k|    u[41] = tmp1;
 3458|  8.92k|    u[42] = tmp2;
 3459|  8.92k|    u[45] = tmp3;
 3460|  8.92k|    u[46] = tmp4;
 3461|       |
 3462|       |    // stage 5
 3463|  8.92k|    u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
 3464|  8.92k|    u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
 3465|       |
 3466|  8.92k|    u[9] = u[8];
 3467|  8.92k|    u[10] = u[11];
 3468|  8.92k|    u[13] = u[12];
 3469|  8.92k|    u[14] = u[15];
 3470|       |
 3471|  8.92k|    tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
 3472|  8.92k|    tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
 3473|  8.92k|    tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
 3474|  8.92k|    tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
 3475|  8.92k|    u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
 3476|  8.92k|    u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
 3477|  8.92k|    u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
 3478|  8.92k|    u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
 3479|  8.92k|    u[17] = tmp1;
 3480|  8.92k|    u[18] = tmp2;
 3481|  8.92k|    u[21] = tmp3;
 3482|  8.92k|    u[22] = tmp4;
 3483|       |
 3484|  44.6k|    for (i = 32; i < 64; i += 8) {
  ------------------
  |  Branch (3484:18): [True: 35.6k, False: 8.92k]
  ------------------
 3485|  35.6k|      addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
 3486|  35.6k|                  &clamp_hi);
 3487|  35.6k|      addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
 3488|  35.6k|                  &clamp_hi);
 3489|       |
 3490|  35.6k|      addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
 3491|  35.6k|                  &clamp_hi);
 3492|  35.6k|      addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
 3493|  35.6k|                  &clamp_hi);
 3494|  35.6k|    }
 3495|       |
 3496|       |    // stage 6
 3497|  8.92k|    tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
 3498|  8.92k|    u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
 3499|  8.92k|    u[0] = tmp1;
 3500|  8.92k|    u[5] = u[4];
 3501|  8.92k|    u[6] = u[7];
 3502|       |
 3503|  8.92k|    tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
 3504|  8.92k|    u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
 3505|  8.92k|    u[9] = tmp1;
 3506|  8.92k|    tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
 3507|  8.92k|    u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
 3508|  8.92k|    u[10] = tmp2;
 3509|       |
 3510|  26.7k|    for (i = 16; i < 32; i += 8) {
  ------------------
  |  Branch (3510:18): [True: 17.8k, False: 8.92k]
  ------------------
 3511|  17.8k|      addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
 3512|  17.8k|                  &clamp_hi);
 3513|  17.8k|      addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
 3514|  17.8k|                  &clamp_hi);
 3515|       |
 3516|  17.8k|      addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
 3517|  17.8k|                  &clamp_hi);
 3518|  17.8k|      addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
 3519|  17.8k|                  &clamp_hi);
 3520|  17.8k|    }
 3521|       |
 3522|  8.92k|    tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
 3523|  8.92k|    tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
 3524|  8.92k|    tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
 3525|  8.92k|    tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
 3526|  8.92k|    u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
 3527|  8.92k|    u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
 3528|  8.92k|    u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
 3529|  8.92k|    u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
 3530|  8.92k|    u[34] = tmp1;
 3531|  8.92k|    u[35] = tmp2;
 3532|  8.92k|    u[36] = tmp3;
 3533|  8.92k|    u[37] = tmp4;
 3534|       |
 3535|  8.92k|    tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
 3536|  8.92k|    tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
 3537|  8.92k|    tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
 3538|  8.92k|    tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
 3539|  8.92k|    u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
 3540|  8.92k|    u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
 3541|  8.92k|    u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
 3542|  8.92k|    u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
 3543|  8.92k|    u[42] = tmp1;
 3544|  8.92k|    u[43] = tmp2;
 3545|  8.92k|    u[44] = tmp3;
 3546|  8.92k|    u[45] = tmp4;
 3547|       |
 3548|       |    // stage 7
 3549|  8.92k|    u[3] = u[0];
 3550|  8.92k|    u[2] = u[1];
 3551|  8.92k|    tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
 3552|  8.92k|    u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
 3553|  8.92k|    u[5] = tmp1;
 3554|  8.92k|    addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
 3555|  8.92k|    addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
 3556|  8.92k|    addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
 3557|  8.92k|    addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
 3558|       |
 3559|  8.92k|    tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
 3560|  8.92k|    tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
 3561|  8.92k|    tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
 3562|  8.92k|    tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
 3563|  8.92k|    u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
 3564|  8.92k|    u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
 3565|  8.92k|    u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
 3566|  8.92k|    u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
 3567|  8.92k|    u[18] = tmp1;
 3568|  8.92k|    u[19] = tmp2;
 3569|  8.92k|    u[20] = tmp3;
 3570|  8.92k|    u[21] = tmp4;
 3571|       |
 3572|  26.7k|    for (i = 32; i < 64; i += 16) {
  ------------------
  |  Branch (3572:18): [True: 17.8k, False: 8.92k]
  ------------------
 3573|  89.2k|      for (j = i; j < i + 4; j++) {
  ------------------
  |  Branch (3573:19): [True: 71.3k, False: 17.8k]
  ------------------
 3574|  71.3k|        addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
 3575|  71.3k|        addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
 3576|  71.3k|                    &clamp_hi);
 3577|  71.3k|      }
 3578|  17.8k|    }
 3579|       |
 3580|       |    // stage 8
 3581|  44.6k|    for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (3581:17): [True: 35.6k, False: 8.92k]
  ------------------
 3582|  35.6k|      addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
 3583|  35.6k|    }
 3584|       |
 3585|  8.92k|    idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
 3586|  8.92k|                       &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
 3587|       |
 3588|       |    // stage 9
 3589|  8.92k|    idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
 3590|  8.92k|                       bit);
 3591|       |
 3592|       |    // stage 10
 3593|  8.92k|    idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
 3594|  8.92k|                        bit);
 3595|       |
 3596|       |    // stage 11
 3597|  8.92k|    idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
 3598|  8.92k|  }
 3599|  8.92k|}
highbd_inv_txfm_avx2.c:idct64_avx2:
 3601|  6.91k|                        int out_shift) {
 3602|  6.91k|  int i, j;
 3603|  6.91k|  const int32_t *cospi = cospi_arr(bit);
 3604|  6.91k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 3605|  6.91k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  13.8k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 6.91k]
  |  |  |  Branch (35:31): [True: 4.05k, False: 2.85k]
  |  |  |  Branch (35:44): [True: 4.05k, False: 2.85k]
  |  |  ------------------
  ------------------
 3606|  6.91k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 3607|  6.91k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 3608|       |
 3609|  6.91k|  const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
 3610|  6.91k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
 3611|  6.91k|  const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
 3612|  6.91k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
 3613|  6.91k|  const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
 3614|  6.91k|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
 3615|  6.91k|  const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
 3616|  6.91k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 3617|  6.91k|  const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
 3618|  6.91k|  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
 3619|  6.91k|  const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
 3620|  6.91k|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
 3621|  6.91k|  const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
 3622|  6.91k|  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
 3623|  6.91k|  const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
 3624|  6.91k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 3625|  6.91k|  const __m256i cospi17 = _mm256_set1_epi32(cospi[17]);
 3626|  6.91k|  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
 3627|  6.91k|  const __m256i cospi19 = _mm256_set1_epi32(cospi[19]);
 3628|  6.91k|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
 3629|  6.91k|  const __m256i cospi21 = _mm256_set1_epi32(cospi[21]);
 3630|  6.91k|  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
 3631|  6.91k|  const __m256i cospi23 = _mm256_set1_epi32(cospi[23]);
 3632|  6.91k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
 3633|  6.91k|  const __m256i cospi25 = _mm256_set1_epi32(cospi[25]);
 3634|  6.91k|  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
 3635|  6.91k|  const __m256i cospi27 = _mm256_set1_epi32(cospi[27]);
 3636|  6.91k|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
 3637|  6.91k|  const __m256i cospi29 = _mm256_set1_epi32(cospi[29]);
 3638|  6.91k|  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
 3639|  6.91k|  const __m256i cospi31 = _mm256_set1_epi32(cospi[31]);
 3640|  6.91k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 3641|  6.91k|  const __m256i cospi35 = _mm256_set1_epi32(cospi[35]);
 3642|  6.91k|  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
 3643|  6.91k|  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
 3644|  6.91k|  const __m256i cospi39 = _mm256_set1_epi32(cospi[39]);
 3645|  6.91k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
 3646|  6.91k|  const __m256i cospi43 = _mm256_set1_epi32(cospi[43]);
 3647|  6.91k|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
 3648|  6.91k|  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
 3649|  6.91k|  const __m256i cospi47 = _mm256_set1_epi32(cospi[47]);
 3650|  6.91k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 3651|  6.91k|  const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
 3652|  6.91k|  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
 3653|  6.91k|  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
 3654|  6.91k|  const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
 3655|  6.91k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 3656|  6.91k|  const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
 3657|  6.91k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
 3658|  6.91k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
 3659|  6.91k|  const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
 3660|       |
 3661|  6.91k|  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
 3662|  6.91k|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
 3663|  6.91k|  const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
 3664|  6.91k|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
 3665|  6.91k|  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
 3666|  6.91k|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
 3667|  6.91k|  const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
 3668|  6.91k|  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
 3669|  6.91k|  const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]);
 3670|  6.91k|  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
 3671|  6.91k|  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
 3672|  6.91k|  const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]);
 3673|  6.91k|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
 3674|  6.91k|  const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]);
 3675|  6.91k|  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
 3676|  6.91k|  const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
 3677|  6.91k|  const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]);
 3678|  6.91k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
 3679|  6.91k|  const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
 3680|  6.91k|  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
 3681|  6.91k|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
 3682|  6.91k|  const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
 3683|  6.91k|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
 3684|  6.91k|  const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
 3685|  6.91k|  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
 3686|  6.91k|  const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
 3687|  6.91k|  const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
 3688|       |
 3689|  6.91k|  {
 3690|  6.91k|    __m256i u[64], v[64];
 3691|       |
 3692|       |    // stage 1
 3693|  6.91k|    u[32] = in[1];
 3694|  6.91k|    u[34] = in[17];
 3695|  6.91k|    u[36] = in[9];
 3696|  6.91k|    u[38] = in[25];
 3697|  6.91k|    u[40] = in[5];
 3698|  6.91k|    u[42] = in[21];
 3699|  6.91k|    u[44] = in[13];
 3700|  6.91k|    u[46] = in[29];
 3701|  6.91k|    u[48] = in[3];
 3702|  6.91k|    u[50] = in[19];
 3703|  6.91k|    u[52] = in[11];
 3704|  6.91k|    u[54] = in[27];
 3705|  6.91k|    u[56] = in[7];
 3706|  6.91k|    u[58] = in[23];
 3707|  6.91k|    u[60] = in[15];
 3708|  6.91k|    u[62] = in[31];
 3709|       |
 3710|  6.91k|    v[16] = in[2];
 3711|  6.91k|    v[18] = in[18];
 3712|  6.91k|    v[20] = in[10];
 3713|  6.91k|    v[22] = in[26];
 3714|  6.91k|    v[24] = in[6];
 3715|  6.91k|    v[26] = in[22];
 3716|  6.91k|    v[28] = in[14];
 3717|  6.91k|    v[30] = in[30];
 3718|       |
 3719|  6.91k|    u[8] = in[4];
 3720|  6.91k|    u[10] = in[20];
 3721|  6.91k|    u[12] = in[12];
 3722|  6.91k|    u[14] = in[28];
 3723|       |
 3724|  6.91k|    v[4] = in[8];
 3725|  6.91k|    v[6] = in[24];
 3726|       |
 3727|  6.91k|    u[0] = in[0];
 3728|  6.91k|    u[2] = in[16];
 3729|       |
 3730|       |    // stage 2
 3731|  6.91k|    v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
 3732|  6.91k|    v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit);
 3733|  6.91k|    v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit);
 3734|  6.91k|    v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
 3735|  6.91k|    v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
 3736|  6.91k|    v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit);
 3737|  6.91k|    v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit);
 3738|  6.91k|    v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
 3739|  6.91k|    v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
 3740|  6.91k|    v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit);
 3741|  6.91k|    v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit);
 3742|  6.91k|    v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
 3743|  6.91k|    v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
 3744|  6.91k|    v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit);
 3745|  6.91k|    v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit);
 3746|  6.91k|    v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
 3747|  6.91k|    v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
 3748|  6.91k|    v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit);
 3749|  6.91k|    v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit);
 3750|  6.91k|    v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
 3751|  6.91k|    v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
 3752|  6.91k|    v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit);
 3753|  6.91k|    v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit);
 3754|  6.91k|    v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
 3755|  6.91k|    v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
 3756|  6.91k|    v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit);
 3757|  6.91k|    v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit);
 3758|  6.91k|    v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
 3759|  6.91k|    v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
 3760|  6.91k|    v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit);
 3761|  6.91k|    v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit);
 3762|  6.91k|    v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
 3763|       |
 3764|       |    // stage 3
 3765|  6.91k|    u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit);
 3766|  6.91k|    u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit);
 3767|  6.91k|    u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit);
 3768|  6.91k|    u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit);
 3769|  6.91k|    u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit);
 3770|  6.91k|    u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit);
 3771|  6.91k|    u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit);
 3772|  6.91k|    u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit);
 3773|  6.91k|    u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit);
 3774|  6.91k|    u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit);
 3775|  6.91k|    u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit);
 3776|  6.91k|    u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit);
 3777|  6.91k|    u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit);
 3778|  6.91k|    u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit);
 3779|  6.91k|    u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit);
 3780|  6.91k|    u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit);
 3781|       |
 3782|  62.1k|    for (i = 32; i < 64; i += 4) {
  ------------------
  |  Branch (3782:18): [True: 55.2k, False: 6.91k]
  ------------------
 3783|  55.2k|      addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
 3784|  55.2k|                  &clamp_hi);
 3785|  55.2k|      addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
 3786|  55.2k|                  &clamp_hi);
 3787|  55.2k|    }
 3788|       |
 3789|       |    // stage 4
 3790|  6.91k|    v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
 3791|  6.91k|    v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
 3792|  6.91k|    v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
 3793|  6.91k|    v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
 3794|  6.91k|    v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
 3795|  6.91k|    v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
 3796|  6.91k|    v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
 3797|  6.91k|    v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
 3798|       |
 3799|  34.5k|    for (i = 16; i < 32; i += 4) {
  ------------------
  |  Branch (3799:18): [True: 27.6k, False: 6.91k]
  ------------------
 3800|  27.6k|      addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
 3801|  27.6k|                  &clamp_hi);
 3802|  27.6k|      addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
 3803|  27.6k|                  &clamp_hi);
 3804|  27.6k|    }
 3805|       |
 3806|  62.1k|    for (i = 32; i < 64; i += 4) {
  ------------------
  |  Branch (3806:18): [True: 55.2k, False: 6.91k]
  ------------------
 3807|  55.2k|      v[i + 0] = u[i + 0];
 3808|  55.2k|      v[i + 3] = u[i + 3];
 3809|  55.2k|    }
 3810|       |
 3811|  6.91k|    v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
 3812|  6.91k|    v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
 3813|  6.91k|    v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
 3814|  6.91k|    v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
 3815|  6.91k|    v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
 3816|  6.91k|    v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
 3817|  6.91k|    v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
 3818|  6.91k|    v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
 3819|  6.91k|    v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
 3820|  6.91k|    v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
 3821|  6.91k|    v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
 3822|  6.91k|    v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
 3823|  6.91k|    v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
 3824|  6.91k|    v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
 3825|  6.91k|    v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
 3826|  6.91k|    v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
 3827|       |
 3828|       |    // stage 5
 3829|  6.91k|    u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit);
 3830|  6.91k|    u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit);
 3831|  6.91k|    u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit);
 3832|  6.91k|    u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit);
 3833|       |
 3834|  20.7k|    for (i = 8; i < 16; i += 4) {
  ------------------
  |  Branch (3834:17): [True: 13.8k, False: 6.91k]
  ------------------
 3835|  13.8k|      addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
 3836|  13.8k|                  &clamp_hi);
 3837|  13.8k|      addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
 3838|  13.8k|                  &clamp_hi);
 3839|  13.8k|    }
 3840|       |
 3841|  34.5k|    for (i = 16; i < 32; i += 4) {
  ------------------
  |  Branch (3841:18): [True: 27.6k, False: 6.91k]
  ------------------
 3842|  27.6k|      u[i + 0] = v[i + 0];
 3843|  27.6k|      u[i + 3] = v[i + 3];
 3844|  27.6k|    }
 3845|       |
 3846|  6.91k|    u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
 3847|  6.91k|    u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
 3848|  6.91k|    u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
 3849|  6.91k|    u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
 3850|  6.91k|    u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
 3851|  6.91k|    u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
 3852|  6.91k|    u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
 3853|  6.91k|    u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
 3854|       |
 3855|  34.5k|    for (i = 32; i < 64; i += 8) {
  ------------------
  |  Branch (3855:18): [True: 27.6k, False: 6.91k]
  ------------------
 3856|  27.6k|      addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
 3857|  27.6k|                  &clamp_hi);
 3858|  27.6k|      addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
 3859|  27.6k|                  &clamp_hi);
 3860|       |
 3861|  27.6k|      addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
 3862|  27.6k|                  &clamp_hi);
 3863|  27.6k|      addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
 3864|  27.6k|                  &clamp_hi);
 3865|  27.6k|    }
 3866|       |
 3867|       |    // stage 6
 3868|  6.91k|    v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
 3869|  6.91k|    v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
 3870|  6.91k|    v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
 3871|  6.91k|    v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
 3872|       |
 3873|  6.91k|    addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
 3874|  6.91k|    addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
 3875|       |
 3876|  20.7k|    for (i = 8; i < 16; i += 4) {
  ------------------
  |  Branch (3876:17): [True: 13.8k, False: 6.91k]
  ------------------
 3877|  13.8k|      v[i + 0] = u[i + 0];
 3878|  13.8k|      v[i + 3] = u[i + 3];
 3879|  13.8k|    }
 3880|       |
 3881|  6.91k|    v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
 3882|  6.91k|    v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
 3883|  6.91k|    v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
 3884|  6.91k|    v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
 3885|       |
 3886|  20.7k|    for (i = 16; i < 32; i += 8) {
  ------------------
  |  Branch (3886:18): [True: 13.8k, False: 6.91k]
  ------------------
 3887|  13.8k|      addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
 3888|  13.8k|                  &clamp_hi);
 3889|  13.8k|      addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
 3890|  13.8k|                  &clamp_hi);
 3891|       |
 3892|  13.8k|      addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
 3893|  13.8k|                  &clamp_hi);
 3894|  13.8k|      addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
 3895|  13.8k|                  &clamp_hi);
 3896|  13.8k|    }
 3897|       |
 3898|  34.5k|    for (i = 32; i < 64; i += 8) {
  ------------------
  |  Branch (3898:18): [True: 27.6k, False: 6.91k]
  ------------------
 3899|  27.6k|      v[i + 0] = u[i + 0];
 3900|  27.6k|      v[i + 1] = u[i + 1];
 3901|  27.6k|      v[i + 6] = u[i + 6];
 3902|  27.6k|      v[i + 7] = u[i + 7];
 3903|  27.6k|    }
 3904|       |
 3905|  6.91k|    v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
 3906|  6.91k|    v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
 3907|  6.91k|    v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
 3908|  6.91k|    v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
 3909|  6.91k|    v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
 3910|  6.91k|    v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
 3911|  6.91k|    v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
 3912|  6.91k|    v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
 3913|  6.91k|    v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
 3914|  6.91k|    v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
 3915|  6.91k|    v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
 3916|  6.91k|    v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
 3917|  6.91k|    v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
 3918|  6.91k|    v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
 3919|  6.91k|    v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
 3920|  6.91k|    v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
 3921|       |
 3922|       |    // stage 7
 3923|  6.91k|    addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
 3924|  6.91k|    addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
 3925|       |
 3926|  6.91k|    u[4] = v[4];
 3927|  6.91k|    u[7] = v[7];
 3928|  6.91k|    u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
 3929|  6.91k|    u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
 3930|       |
 3931|  6.91k|    addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
 3932|  6.91k|    addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
 3933|  6.91k|    addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
 3934|  6.91k|    addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
 3935|       |
 3936|  20.7k|    for (i = 16; i < 32; i += 8) {
  ------------------
  |  Branch (3936:18): [True: 13.8k, False: 6.91k]
  ------------------
 3937|  13.8k|      u[i + 0] = v[i + 0];
 3938|  13.8k|      u[i + 1] = v[i + 1];
 3939|  13.8k|      u[i + 6] = v[i + 6];
 3940|  13.8k|      u[i + 7] = v[i + 7];
 3941|  13.8k|    }
 3942|       |
 3943|  6.91k|    u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
 3944|  6.91k|    u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
 3945|  6.91k|    u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
 3946|  6.91k|    u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
 3947|  6.91k|    u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
 3948|  6.91k|    u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
 3949|  6.91k|    u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
 3950|  6.91k|    u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
 3951|       |
 3952|  20.7k|    for (i = 32; i < 64; i += 16) {
  ------------------
  |  Branch (3952:18): [True: 13.8k, False: 6.91k]
  ------------------
 3953|  69.1k|      for (j = i; j < i + 4; j++) {
  ------------------
  |  Branch (3953:19): [True: 55.2k, False: 13.8k]
  ------------------
 3954|  55.2k|        addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
 3955|  55.2k|        addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
 3956|  55.2k|                    &clamp_hi);
 3957|  55.2k|      }
 3958|  13.8k|    }
 3959|       |
 3960|       |    // stage 8
 3961|  34.5k|    for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (3961:17): [True: 27.6k, False: 6.91k]
  ------------------
 3962|  27.6k|      addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
 3963|  27.6k|    }
 3964|       |
 3965|  6.91k|    v[8] = u[8];
 3966|  6.91k|    v[9] = u[9];
 3967|  6.91k|    v[14] = u[14];
 3968|  6.91k|    v[15] = u[15];
 3969|       |
 3970|  6.91k|    v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
 3971|  6.91k|    v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
 3972|  6.91k|    v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
 3973|  6.91k|    v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
 3974|       |
 3975|  34.5k|    for (i = 16; i < 20; ++i) {
  ------------------
  |  Branch (3975:18): [True: 27.6k, False: 6.91k]
  ------------------
 3976|  27.6k|      addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
 3977|  27.6k|      addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
 3978|  27.6k|                  &clamp_hi);
 3979|  27.6k|    }
 3980|       |
 3981|  34.5k|    for (i = 32; i < 36; ++i) {
  ------------------
  |  Branch (3981:18): [True: 27.6k, False: 6.91k]
  ------------------
 3982|  27.6k|      v[i] = u[i];
 3983|  27.6k|      v[i + 12] = u[i + 12];
 3984|  27.6k|      v[i + 16] = u[i + 16];
 3985|  27.6k|      v[i + 28] = u[i + 28];
 3986|  27.6k|    }
 3987|       |
 3988|  6.91k|    v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
 3989|  6.91k|    v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
 3990|  6.91k|    v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
 3991|  6.91k|    v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
 3992|  6.91k|    v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
 3993|  6.91k|    v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
 3994|  6.91k|    v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
 3995|  6.91k|    v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
 3996|  6.91k|    v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
 3997|  6.91k|    v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
 3998|  6.91k|    v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
 3999|  6.91k|    v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
 4000|  6.91k|    v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
 4001|  6.91k|    v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
 4002|  6.91k|    v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
 4003|  6.91k|    v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
 4004|       |
 4005|       |    // stage 9
 4006|  62.1k|    for (i = 0; i < 8; ++i) {
  ------------------
  |  Branch (4006:17): [True: 55.2k, False: 6.91k]
  ------------------
 4007|  55.2k|      addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
 4008|  55.2k|    }
 4009|       |
 4010|  34.5k|    for (i = 16; i < 20; ++i) {
  ------------------
  |  Branch (4010:18): [True: 27.6k, False: 6.91k]
  ------------------
 4011|  27.6k|      u[i] = v[i];
 4012|  27.6k|      u[i + 12] = v[i + 12];
 4013|  27.6k|    }
 4014|       |
 4015|  6.91k|    u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
 4016|  6.91k|    u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
 4017|  6.91k|    u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
 4018|  6.91k|    u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
 4019|  6.91k|    u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
 4020|  6.91k|    u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
 4021|  6.91k|    u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
 4022|  6.91k|    u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
 4023|       |
 4024|  62.1k|    for (i = 32; i < 40; i++) {
  ------------------
  |  Branch (4024:18): [True: 55.2k, False: 6.91k]
  ------------------
 4025|  55.2k|      addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
 4026|  55.2k|    }
 4027|       |
 4028|  62.1k|    for (i = 48; i < 56; i++) {
  ------------------
  |  Branch (4028:18): [True: 55.2k, False: 6.91k]
  ------------------
 4029|  55.2k|      addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
 4030|  55.2k|    }
 4031|       |
 4032|       |    // stage 10
 4033|   117k|    for (i = 0; i < 16; i++) {
  ------------------
  |  Branch (4033:17): [True: 110k, False: 6.91k]
  ------------------
 4034|   110k|      addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
 4035|   110k|    }
 4036|       |
 4037|  62.1k|    for (i = 32; i < 40; i++) v[i] = u[i];
  ------------------
  |  Branch (4037:18): [True: 55.2k, False: 6.91k]
  ------------------
 4038|       |
 4039|  6.91k|    v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
 4040|  6.91k|    v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
 4041|  6.91k|    v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
 4042|  6.91k|    v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
 4043|  6.91k|    v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
 4044|  6.91k|    v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
 4045|  6.91k|    v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
 4046|  6.91k|    v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
 4047|  6.91k|    v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
 4048|  6.91k|    v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
 4049|  6.91k|    v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
 4050|  6.91k|    v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
 4051|  6.91k|    v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
 4052|  6.91k|    v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
 4053|  6.91k|    v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
 4054|  6.91k|    v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
 4055|       |
 4056|  62.1k|    for (i = 56; i < 64; i++) v[i] = u[i];
  ------------------
  |  Branch (4056:18): [True: 55.2k, False: 6.91k]
  ------------------
 4057|       |
 4058|       |    // stage 11
 4059|   228k|    for (i = 0; i < 32; i++) {
  ------------------
  |  Branch (4059:17): [True: 221k, False: 6.91k]
  ------------------
 4060|   221k|      addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
 4061|   221k|                  &clamp_hi);
 4062|   221k|    }
 4063|  6.91k|    if (!do_cols) {
  ------------------
  |  Branch (4063:9): [True: 2.85k, False: 4.05k]
  ------------------
 4064|  2.85k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  2.85k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 2.85k]
  |  |  ------------------
  ------------------
 4065|  2.85k|      const __m256i clamp_lo_out =
 4066|  2.85k|          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 4067|  2.85k|      const __m256i clamp_hi_out =
 4068|  2.85k|          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 4069|       |
 4070|  2.85k|      round_shift_8x8_avx2(out, out_shift);
 4071|  2.85k|      round_shift_8x8_avx2(out + 16, out_shift);
 4072|  2.85k|      round_shift_8x8_avx2(out + 32, out_shift);
 4073|  2.85k|      round_shift_8x8_avx2(out + 48, out_shift);
 4074|  2.85k|      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
 4075|  2.85k|    }
 4076|  6.91k|  }
 4077|  6.91k|}
highbd_inv_txfm_avx2.c:load_buffer_32bit_input:
  235|   542k|                                           __m256i *out, int out_size) {
  236|  6.32M|  for (int i = 0; i < out_size; ++i) {
  ------------------
  |  Branch (236:19): [True: 5.78M, False: 542k]
  ------------------
  237|  5.78M|    out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride));
  238|  5.78M|  }
  239|   542k|}
highbd_inv_txfm_avx2.c:transpose_8x8_flip_avx2:
  197|  1.25k|static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) {
  198|  1.25k|  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  199|  1.25k|  __m256i x0, x1;
  200|       |
  201|  1.25k|  u0 = _mm256_unpacklo_epi32(in[7], in[6]);
  202|  1.25k|  u1 = _mm256_unpackhi_epi32(in[7], in[6]);
  203|       |
  204|  1.25k|  u2 = _mm256_unpacklo_epi32(in[5], in[4]);
  205|  1.25k|  u3 = _mm256_unpackhi_epi32(in[5], in[4]);
  206|       |
  207|  1.25k|  u4 = _mm256_unpacklo_epi32(in[3], in[2]);
  208|  1.25k|  u5 = _mm256_unpackhi_epi32(in[3], in[2]);
  209|       |
  210|  1.25k|  u6 = _mm256_unpacklo_epi32(in[1], in[0]);
  211|  1.25k|  u7 = _mm256_unpackhi_epi32(in[1], in[0]);
  212|       |
  213|  1.25k|  x0 = _mm256_unpacklo_epi64(u0, u2);
  214|  1.25k|  x1 = _mm256_unpacklo_epi64(u4, u6);
  215|  1.25k|  out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
  216|  1.25k|  out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
  217|       |
  218|  1.25k|  x0 = _mm256_unpackhi_epi64(u0, u2);
  219|  1.25k|  x1 = _mm256_unpackhi_epi64(u4, u6);
  220|  1.25k|  out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
  221|  1.25k|  out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
  222|       |
  223|  1.25k|  x0 = _mm256_unpacklo_epi64(u1, u3);
  224|  1.25k|  x1 = _mm256_unpacklo_epi64(u5, u7);
  225|  1.25k|  out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
  226|  1.25k|  out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
  227|       |
  228|  1.25k|  x0 = _mm256_unpackhi_epi64(u1, u3);
  229|  1.25k|  x1 = _mm256_unpackhi_epi64(u5, u7);
  230|  1.25k|  out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
  231|       |  out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
  232|  1.25k|}
highbd_inv_txfm_avx2.c:transpose_8x8_avx2:
  160|  1.08M|static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
  161|  1.08M|  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  162|  1.08M|  __m256i x0, x1;
  163|       |
  164|  1.08M|  u0 = _mm256_unpacklo_epi32(in[0], in[1]);
  165|  1.08M|  u1 = _mm256_unpackhi_epi32(in[0], in[1]);
  166|       |
  167|  1.08M|  u2 = _mm256_unpacklo_epi32(in[2], in[3]);
  168|  1.08M|  u3 = _mm256_unpackhi_epi32(in[2], in[3]);
  169|       |
  170|  1.08M|  u4 = _mm256_unpacklo_epi32(in[4], in[5]);
  171|  1.08M|  u5 = _mm256_unpackhi_epi32(in[4], in[5]);
  172|       |
  173|  1.08M|  u6 = _mm256_unpacklo_epi32(in[6], in[7]);
  174|  1.08M|  u7 = _mm256_unpackhi_epi32(in[6], in[7]);
  175|       |
  176|  1.08M|  x0 = _mm256_unpacklo_epi64(u0, u2);
  177|  1.08M|  x1 = _mm256_unpacklo_epi64(u4, u6);
  178|  1.08M|  out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
  179|  1.08M|  out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
  180|       |
  181|  1.08M|  x0 = _mm256_unpackhi_epi64(u0, u2);
  182|  1.08M|  x1 = _mm256_unpackhi_epi64(u4, u6);
  183|  1.08M|  out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
  184|  1.08M|  out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
  185|       |
  186|  1.08M|  x0 = _mm256_unpacklo_epi64(u1, u3);
  187|  1.08M|  x1 = _mm256_unpacklo_epi64(u5, u7);
  188|  1.08M|  out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
  189|  1.08M|  out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
  190|       |
  191|  1.08M|  x0 = _mm256_unpackhi_epi64(u1, u3);
  192|  1.08M|  x1 = _mm256_unpackhi_epi64(u5, u7);
  193|  1.08M|  out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
  194|       |  out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
  195|  1.08M|}
highbd_inv_txfm_avx2.c:highbd_write_buffer_16xn_avx2:
  107|   325k|                                                 int height, const int bd) {
  108|   325k|  int j = flipud ? (height - 1) : 0;
  ------------------
  |  Branch (108:11): [True: 568, False: 325k]
  ------------------
  109|   325k|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (109:20): [True: 568, False: 325k]
  ------------------
  110|  7.45M|  for (int i = 0; i < height; ++i, j += step) {
  ------------------
  |  Branch (110:19): [True: 7.13M, False: 325k]
  ------------------
  111|  7.13M|    __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride));
  112|  7.13M|    __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);
  113|       |
  114|  7.13M|    _mm256_storeu_si256((__m256i *)(output + i * stride), u);
  115|  7.13M|  }
  116|   325k|}
highbd_inv_txfm_avx2.c:highbd_get_recon_16x8_avx2:
   93|  7.13M|                                                 const int bd) {
   94|  7.13M|  __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
   95|  7.13M|  __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));
   96|       |
   97|  7.13M|  x0 = _mm256_add_epi32(res0, x0);
   98|  7.13M|  x1 = _mm256_add_epi32(res1, x1);
   99|  7.13M|  x0 = _mm256_packus_epi32(x0, x1);
  100|       |  x0 = _mm256_permute4x64_epi64(x0, 0xd8);
  101|  7.13M|  x0 = highbd_clamp_epi16_avx2(x0, bd);
  102|  7.13M|  return x0;
  103|  7.13M|}
highbd_inv_txfm_avx2.c:highbd_clamp_epi16_avx2:
   34|  9.53M|static inline __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {
   35|  9.53M|  const __m256i zero = _mm256_setzero_si256();
   36|  9.53M|  const __m256i one = _mm256_set1_epi16(1);
   37|  9.53M|  const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
   38|  9.53M|  __m256i clamped, mask;
   39|       |
   40|  9.53M|  mask = _mm256_cmpgt_epi16(u, max);
   41|  9.53M|  clamped = _mm256_andnot_si256(mask, u);
   42|  9.53M|  mask = _mm256_and_si256(mask, max);
   43|  9.53M|  clamped = _mm256_or_si256(mask, clamped);
   44|  9.53M|  mask = _mm256_cmpgt_epi16(clamped, zero);
   45|  9.53M|  clamped = _mm256_and_si256(clamped, mask);
   46|       |
   47|  9.53M|  return clamped;
   48|  9.53M|}
highbd_inv_txfm_avx2.c:highbd_write_buffer_8xn_avx2:
  129|   231k|                                                int height, const int bd) {
  130|   231k|  int j = flipud ? (height - 1) : 0;
  ------------------
  |  Branch (130:11): [True: 490, False: 231k]
  ------------------
  131|   231k|  __m128i temp;
  132|   231k|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (132:20): [True: 490, False: 231k]
  ------------------
  133|  2.64M|  for (int i = 0; i < height; ++i, j += step) {
  ------------------
  |  Branch (133:19): [True: 2.40M, False: 231k]
  ------------------
  134|  2.40M|    temp = _mm_loadu_si128((__m128i const *)(output + i * stride));
  135|  2.40M|    __m256i v = _mm256_cvtepi16_epi32(temp);
  136|  2.40M|    __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd);
  137|  2.40M|    __m128i u1 = _mm256_castsi256_si128(u);
  138|  2.40M|    _mm_storeu_si128((__m128i *)(output + i * stride), u1);
  139|  2.40M|  }
  140|   231k|}
highbd_inv_txfm_avx2.c:highbd_get_recon_8x8_avx2:
  118|  2.40M|                                                const int bd) {
  119|  2.40M|  __m256i x0 = pred;
  120|  2.40M|  x0 = _mm256_add_epi32(res, x0);
  121|  2.40M|  x0 = _mm256_packus_epi32(x0, x0);
  122|       |  x0 = _mm256_permute4x64_epi64(x0, 0xd8);
  123|  2.40M|  x0 = highbd_clamp_epi16_avx2(x0, bd);
  124|  2.40M|  return x0;
  125|  2.40M|}

av1_highbd_iwht4x4_16_add_sse4_1:
  149|   360k|                                      int stride, int bd) {
  150|       |  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
  151|       |     0.5 shifts per pixel. */
  152|   360k|  __m128i op[4];
  153|   360k|  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  ------------------
  |  |   75|   360k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  154|       |
  155|   360k|  load_buffer_4x4(input, op);
  156|       |
  157|       |  // Shift before-hand.
  158|   360k|  op[0] = _mm_srai_epi32(op[0], UNIT_QUANT_SHIFT);
  ------------------
  |  |   21|   360k|#define UNIT_QUANT_SHIFT 2
  ------------------
  159|   360k|  op[1] = _mm_srai_epi32(op[1], UNIT_QUANT_SHIFT);
  ------------------
  |  |   21|   360k|#define UNIT_QUANT_SHIFT 2
  ------------------
  160|   360k|  op[2] = _mm_srai_epi32(op[2], UNIT_QUANT_SHIFT);
  ------------------
  |  |   21|   360k|#define UNIT_QUANT_SHIFT 2
  ------------------
  161|   360k|  op[3] = _mm_srai_epi32(op[3], UNIT_QUANT_SHIFT);
  ------------------
  |  |   21|   360k|#define UNIT_QUANT_SHIFT 2
  ------------------
  162|       |
  163|  1.08M|  for (int i = 0; i < 2; ++i) {
  ------------------
  |  Branch (163:19): [True: 720k, False: 360k]
  ------------------
  164|   720k|    __m128i a1 = op[0];
  165|   720k|    __m128i c1 = op[1];
  166|   720k|    __m128i d1 = op[2];
  167|   720k|    __m128i b1 = op[3];
  168|   720k|    a1 = _mm_add_epi32(a1, c1);          // a1 += c1
  169|   720k|    d1 = _mm_sub_epi32(d1, b1);          // d1 -= b1
  170|   720k|    __m128i e1 = _mm_sub_epi32(a1, d1);  // e1 = (a1 - d1) >> 1
  171|   720k|    e1 = _mm_srai_epi32(e1, 1);
  172|   720k|    b1 = _mm_sub_epi32(e1, b1);  // b1 = e1 - b1
  173|   720k|    c1 = _mm_sub_epi32(e1, c1);  // c1 = e1 - c1
  174|   720k|    a1 = _mm_sub_epi32(a1, b1);  // a1 -= b1
  175|   720k|    d1 = _mm_add_epi32(d1, c1);  // d1 += c1
  176|       |
  177|   720k|    op[0] = a1;
  178|   720k|    op[1] = b1;
  179|   720k|    op[2] = c1;
  180|   720k|    op[3] = d1;
  181|   720k|    if (i == 0) {
  ------------------
  |  Branch (181:9): [True: 360k, False: 360k]
  ------------------
  182|   360k|      transpose_32bit_4x4(op, op);
  183|   360k|    }
  184|   720k|  }
  185|       |
  186|       |  // Convert to int16_t. The C code checks that we are in range.
  187|   360k|  op[0] = _mm_packs_epi32(op[0], op[1]);
  188|   360k|  op[1] = _mm_packs_epi32(op[2], op[3]);
  189|       |
  190|       |  // Load uint16_t.
  191|   360k|  __m128i dst[2];
  192|   360k|  __m128i tmp[4];
  193|   360k|  tmp[0] = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
  194|   360k|  tmp[1] = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
  195|   360k|  dst[0] = _mm_unpacklo_epi64(tmp[0], tmp[1]);
  196|   360k|  tmp[2] = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
  197|   360k|  tmp[3] = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
  198|   360k|  dst[1] = _mm_unpacklo_epi64(tmp[2], tmp[3]);
  199|       |
  200|       |  // Add to the previous results.
  201|   360k|  dst[0] = _mm_add_epi16(dst[0], op[0]);
  202|   360k|  dst[1] = _mm_add_epi16(dst[1], op[1]);
  203|       |
  204|       |  // Clamp.
  205|   360k|  dst[0] = highbd_clamp_epi16(dst[0], bd);
  206|   360k|  dst[1] = highbd_clamp_epi16(dst[1], bd);
  207|       |
  208|       |  // Store.
  209|   360k|  _mm_storel_epi64((__m128i *)(dest + 0 * stride), dst[0]);
  210|   360k|  dst[0] = _mm_srli_si128(dst[0], 8);
  211|   360k|  _mm_storel_epi64((__m128i *)(dest + 1 * stride), dst[0]);
  212|   360k|  _mm_storel_epi64((__m128i *)(dest + 2 * stride), dst[1]);
  213|       |  dst[1] = _mm_srli_si128(dst[1], 8);
  214|   360k|  _mm_storel_epi64((__m128i *)(dest + 3 * stride), dst[1]);
  215|   360k|}
av1_inv_txfm2d_add_4x4_sse4_1:
  722|   147k|                                   int stride, TX_TYPE tx_type, int bd) {
  723|   147k|  __m128i in[4];
  724|   147k|  const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];
  725|       |
  726|   147k|  switch (tx_type) {
  727|  34.1k|    case DCT_DCT:
  ------------------
  |  Branch (727:5): [True: 34.1k, False: 113k]
  ------------------
  728|  34.1k|      load_buffer_4x4(input, in);
  729|  34.1k|      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  34.1k|#define INV_COS_BIT 12
  ------------------
  730|  34.1k|      transpose_32bit_4x4(in, in);
  731|  34.1k|      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  34.1k|#define INV_COS_BIT 12
  ------------------
  732|  34.1k|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  733|  34.1k|      break;
  734|  22.9k|    case ADST_DCT:
  ------------------
  |  Branch (734:5): [True: 22.9k, False: 124k]
  ------------------
  735|  22.9k|      load_buffer_4x4(input, in);
  736|  22.9k|      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  22.9k|#define INV_COS_BIT 12
  ------------------
  737|  22.9k|      transpose_32bit_4x4(in, in);
  738|  22.9k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  22.9k|#define INV_COS_BIT 12
  ------------------
  739|  22.9k|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  740|  22.9k|      break;
  741|  27.6k|    case DCT_ADST:
  ------------------
  |  Branch (741:5): [True: 27.6k, False: 119k]
  ------------------
  742|  27.6k|      load_buffer_4x4(input, in);
  743|  27.6k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  27.6k|#define INV_COS_BIT 12
  ------------------
  744|  27.6k|      transpose_32bit_4x4(in, in);
  745|  27.6k|      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  27.6k|#define INV_COS_BIT 12
  ------------------
  746|  27.6k|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  747|  27.6k|      break;
  748|  34.1k|    case ADST_ADST:
  ------------------
  |  Branch (748:5): [True: 34.1k, False: 113k]
  ------------------
  749|  34.1k|      load_buffer_4x4(input, in);
  750|  34.1k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  34.1k|#define INV_COS_BIT 12
  ------------------
  751|  34.1k|      transpose_32bit_4x4(in, in);
  752|  34.1k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  34.1k|#define INV_COS_BIT 12
  ------------------
  753|  34.1k|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  754|  34.1k|      break;
  755|     58|    case FLIPADST_DCT:
  ------------------
  |  Branch (755:5): [True: 58, False: 147k]
  ------------------
  756|     58|      load_buffer_4x4(input, in);
  757|     58|      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|     58|#define INV_COS_BIT 12
  ------------------
  758|     58|      transpose_32bit_4x4(in, in);
  759|     58|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|     58|#define INV_COS_BIT 12
  ------------------
  760|     58|      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
  761|     58|      break;
  762|    142|    case DCT_FLIPADST:
  ------------------
  |  Branch (762:5): [True: 142, False: 147k]
  ------------------
  763|    142|      load_buffer_4x4(input, in);
  764|    142|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|    142|#define INV_COS_BIT 12
  ------------------
  765|    142|      transpose_32bit_4x4(in, in);
  766|    142|      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|    142|#define INV_COS_BIT 12
  ------------------
  767|    142|      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
  768|    142|      break;
  769|     64|    case FLIPADST_FLIPADST:
  ------------------
  |  Branch (769:5): [True: 64, False: 147k]
  ------------------
  770|     64|      load_buffer_4x4(input, in);
  771|     64|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|     64|#define INV_COS_BIT 12
  ------------------
  772|     64|      transpose_32bit_4x4(in, in);
  773|     64|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|     64|#define INV_COS_BIT 12
  ------------------
  774|     64|      write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
  775|     64|      break;
  776|    105|    case ADST_FLIPADST:
  ------------------
  |  Branch (776:5): [True: 105, False: 147k]
  ------------------
  777|    105|      load_buffer_4x4(input, in);
  778|    105|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|    105|#define INV_COS_BIT 12
  ------------------
  779|    105|      transpose_32bit_4x4(in, in);
  780|    105|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|    105|#define INV_COS_BIT 12
  ------------------
  781|    105|      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
  782|    105|      break;
  783|    164|    case FLIPADST_ADST:
  ------------------
  |  Branch (783:5): [True: 164, False: 147k]
  ------------------
  784|    164|      load_buffer_4x4(input, in);
  785|    164|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|    164|#define INV_COS_BIT 12
  ------------------
  786|    164|      transpose_32bit_4x4(in, in);
  787|    164|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|    164|#define INV_COS_BIT 12
  ------------------
  788|    164|      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
  789|    164|      break;
  790|  8.50k|    case IDTX:
  ------------------
  |  Branch (790:5): [True: 8.50k, False: 138k]
  ------------------
  791|  8.50k|      load_buffer_4x4(input, in);
  792|  8.50k|      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  8.50k|#define INV_COS_BIT 12
  ------------------
  793|  8.50k|      transpose_32bit_4x4(in, in);
  794|  8.50k|      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  8.50k|#define INV_COS_BIT 12
  ------------------
  795|  8.50k|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  796|  8.50k|      break;
  797|  5.64k|    case V_DCT:
  ------------------
  |  Branch (797:5): [True: 5.64k, False: 141k]
  ------------------
  798|  5.64k|      load_buffer_4x4(input, in);
  799|  5.64k|      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  5.64k|#define INV_COS_BIT 12
  ------------------
  800|  5.64k|      transpose_32bit_4x4(in, in);
  801|  5.64k|      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  5.64k|#define INV_COS_BIT 12
  ------------------
  802|  5.64k|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  803|  5.64k|      break;
  804|  13.4k|    case H_DCT:
  ------------------
  |  Branch (804:5): [True: 13.4k, False: 134k]
  ------------------
  805|  13.4k|      load_buffer_4x4(input, in);
  806|  13.4k|      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  13.4k|#define INV_COS_BIT 12
  ------------------
  807|  13.4k|      transpose_32bit_4x4(in, in);
  808|  13.4k|      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  13.4k|#define INV_COS_BIT 12
  ------------------
  809|  13.4k|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  810|  13.4k|      break;
  811|     75|    case V_ADST:
  ------------------
  |  Branch (811:5): [True: 75, False: 147k]
  ------------------
  812|     75|      load_buffer_4x4(input, in);
  813|     75|      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|     75|#define INV_COS_BIT 12
  ------------------
  814|     75|      transpose_32bit_4x4(in, in);
  815|     75|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|     75|#define INV_COS_BIT 12
  ------------------
  816|     75|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  817|     75|      break;
  818|    156|    case H_ADST:
  ------------------
  |  Branch (818:5): [True: 156, False: 147k]
  ------------------
  819|    156|      load_buffer_4x4(input, in);
  820|    156|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|    156|#define INV_COS_BIT 12
  ------------------
  821|    156|      transpose_32bit_4x4(in, in);
  822|    156|      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|    156|#define INV_COS_BIT 12
  ------------------
  823|    156|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  824|    156|      break;
  825|     54|    case V_FLIPADST:
  ------------------
  |  Branch (825:5): [True: 54, False: 147k]
  ------------------
  826|     54|      load_buffer_4x4(input, in);
  827|     54|      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|     54|#define INV_COS_BIT 12
  ------------------
  828|     54|      transpose_32bit_4x4(in, in);
  829|     54|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|     54|#define INV_COS_BIT 12
  ------------------
  830|     54|      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
  831|     54|      break;
  832|    137|    case H_FLIPADST:
  ------------------
  |  Branch (832:5): [True: 137, False: 147k]
  ------------------
  833|    137|      load_buffer_4x4(input, in);
  834|    137|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|    137|#define INV_COS_BIT 12
  ------------------
  835|    137|      transpose_32bit_4x4(in, in);
  836|    137|      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|    137|#define INV_COS_BIT 12
  ------------------
  837|    137|      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
  838|    137|      break;
  839|      0|    default: assert(0);
  ------------------
  |  Branch (839:5): [True: 0, False: 147k]
  ------------------
  840|   147k|  }
  841|   147k|}
av1_highbd_inv_txfm2d_add_universe_sse4_1:
 5720|  36.8k|                                               int eob, const int bd) {
 5721|  36.8k|  switch (tx_type) {
 5722|      0|    case DCT_DCT:
  ------------------
  |  Branch (5722:5): [True: 0, False: 36.8k]
  ------------------
 5723|      0|    case ADST_DCT:
  ------------------
  |  Branch (5723:5): [True: 0, False: 36.8k]
  ------------------
 5724|      0|    case DCT_ADST:
  ------------------
  |  Branch (5724:5): [True: 0, False: 36.8k]
  ------------------
 5725|      0|    case ADST_ADST:
  ------------------
  |  Branch (5725:5): [True: 0, False: 36.8k]
  ------------------
 5726|      0|    case FLIPADST_DCT:
  ------------------
  |  Branch (5726:5): [True: 0, False: 36.8k]
  ------------------
 5727|      0|    case DCT_FLIPADST:
  ------------------
  |  Branch (5727:5): [True: 0, False: 36.8k]
  ------------------
 5728|      0|    case FLIPADST_FLIPADST:
  ------------------
  |  Branch (5728:5): [True: 0, False: 36.8k]
  ------------------
 5729|      0|    case ADST_FLIPADST:
  ------------------
  |  Branch (5729:5): [True: 0, False: 36.8k]
  ------------------
 5730|      0|    case FLIPADST_ADST:
  ------------------
  |  Branch (5730:5): [True: 0, False: 36.8k]
  ------------------
 5731|      0|      highbd_inv_txfm2d_add_no_identity_sse41(
 5732|      0|          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
  ------------------
  |  |   75|      0|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5733|      0|          bd);
 5734|      0|      break;
 5735|  5.72k|    case V_DCT:
  ------------------
  |  Branch (5735:5): [True: 5.72k, False: 31.1k]
  ------------------
 5736|  5.85k|    case V_ADST:
  ------------------
  |  Branch (5736:5): [True: 130, False: 36.7k]
  ------------------
 5737|  5.90k|    case V_FLIPADST:
  ------------------
  |  Branch (5737:5): [True: 51, False: 36.7k]
  ------------------
 5738|  5.90k|      highbd_inv_txfm2d_add_h_identity_ssse41(
 5739|  5.90k|          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
  ------------------
  |  |   75|  5.90k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5740|  5.90k|          bd);
 5741|  5.90k|      break;
 5742|  10.7k|    case H_DCT:
  ------------------
  |  Branch (5742:5): [True: 10.7k, False: 26.1k]
  ------------------
 5743|  10.9k|    case H_ADST:
  ------------------
  |  Branch (5743:5): [True: 243, False: 36.6k]
  ------------------
 5744|  11.0k|    case H_FLIPADST:
  ------------------
  |  Branch (5744:5): [True: 140, False: 36.7k]
  ------------------
 5745|  11.0k|      highbd_inv_txfm2d_add_v_identity_ssse41(
 5746|  11.0k|          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
  ------------------
  |  |   75|  11.0k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5747|  11.0k|          bd);
 5748|  11.0k|      break;
 5749|  19.8k|    case IDTX:
  ------------------
  |  Branch (5749:5): [True: 19.8k, False: 16.9k]
  ------------------
 5750|  19.8k|      highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output),
  ------------------
  |  |   75|  19.8k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5751|  19.8k|                                        stride, tx_type, tx_size, eob, bd);
 5752|  19.8k|      break;
 5753|      0|    default: assert(0); break;
  ------------------
  |  Branch (5753:5): [True: 0, False: 36.8k]
  ------------------
 5754|  36.8k|  }
 5755|  36.8k|}
av1_highbd_inv_txfm_add_sse4_1:
 5802|   763k|                                    int stride, const TxfmParam *txfm_param) {
 5803|   763k|  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
 5804|   763k|  const TX_SIZE tx_size = txfm_param->tx_size;
 5805|   763k|  switch (tx_size) {
 5806|      0|    case TX_8X8:
  ------------------
  |  Branch (5806:5): [True: 0, False: 763k]
  ------------------
 5807|      0|      av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
 5808|      0|      break;
 5809|  43.8k|    case TX_4X8:
  ------------------
  |  Branch (5809:5): [True: 43.8k, False: 719k]
  ------------------
 5810|  43.8k|      av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
 5811|  43.8k|      break;
 5812|  68.9k|    case TX_8X4:
  ------------------
  |  Branch (5812:5): [True: 68.9k, False: 694k]
  ------------------
 5813|  68.9k|      av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
 5814|  68.9k|      break;
 5815|   583k|    case TX_4X4:
  ------------------
  |  Branch (5815:5): [True: 583k, False: 179k]
  ------------------
 5816|   583k|      av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
 5817|   583k|      break;
 5818|  46.1k|    case TX_16X4:
  ------------------
  |  Branch (5818:5): [True: 46.1k, False: 717k]
  ------------------
 5819|  46.1k|      av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
 5820|  46.1k|      break;
 5821|  20.8k|    case TX_4X16:
  ------------------
  |  Branch (5821:5): [True: 20.8k, False: 742k]
  ------------------
 5822|  20.8k|      av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
 5823|  20.8k|      break;
 5824|      0|    default:
  ------------------
  |  Branch (5824:5): [True: 0, False: 763k]
  ------------------
 5825|      0|      av1_highbd_inv_txfm2d_add_universe_sse4_1(
 5826|      0|          input, dest, stride, txfm_param->tx_type, tx_size, txfm_param->eob,
 5827|      0|          txfm_param->bd);
 5828|      0|      break;
 5829|   763k|  }
 5830|   763k|}
highbd_inv_txfm_sse4.c:load_buffer_4x4:
  141|   507k|static inline void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
  142|   507k|  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
  143|   507k|  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
  144|   507k|  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
  145|   507k|  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
  146|   507k|}
highbd_inv_txfm_sse4.c:highbd_clamp_epi16:
   24|  1.69M|static inline __m128i highbd_clamp_epi16(__m128i u, int bd) {
   25|  1.69M|  const __m128i zero = _mm_setzero_si128();
   26|  1.69M|  const __m128i one = _mm_set1_epi16(1);
   27|  1.69M|  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
   28|  1.69M|  __m128i clamped, mask;
   29|       |
   30|  1.69M|  mask = _mm_cmpgt_epi16(u, max);
   31|  1.69M|  clamped = _mm_andnot_si128(mask, u);
   32|  1.69M|  mask = _mm_and_si128(mask, max);
   33|  1.69M|  clamped = _mm_or_si128(mask, clamped);
   34|  1.69M|  mask = _mm_cmpgt_epi16(clamped, zero);
   35|  1.69M|  clamped = _mm_and_si128(clamped, mask);
   36|       |
   37|  1.69M|  return clamped;
   38|  1.69M|}
highbd_inv_txfm_sse4.c:idct4x4_sse4_1:
  456|   394k|                           int bd, int out_shift) {
  457|   394k|  const int32_t *cospi = cospi_arr(bit);
  458|   394k|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  459|   394k|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  460|   394k|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  461|   394k|  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  462|   394k|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  463|   394k|  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   788k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 394k]
  |  |  |  Branch (35:31): [True: 236k, False: 157k]
  |  |  |  Branch (35:44): [True: 236k, False: 157k]
  |  |  ------------------
  ------------------
  464|   394k|  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  465|   394k|  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  466|   394k|  __m128i u0, u1, u2, u3;
  467|   394k|  __m128i v0, v1, v2, v3, x, y;
  468|       |
  469|       |  // Stage 0
  470|       |  // Stage 1
  471|       |  // Stage 2
  472|   394k|  u0 = in[0];
  473|   394k|  u1 = in[1];
  474|   394k|  u2 = in[2];
  475|   394k|  u3 = in[3];
  476|       |
  477|   394k|  x = _mm_mullo_epi32(u0, cospi32);
  478|   394k|  y = _mm_mullo_epi32(u2, cospi32);
  479|   394k|  v0 = _mm_add_epi32(x, y);
  480|   394k|  v0 = _mm_add_epi32(v0, rnding);
  481|   394k|  v0 = _mm_srai_epi32(v0, bit);
  482|       |
  483|   394k|  v1 = _mm_sub_epi32(x, y);
  484|   394k|  v1 = _mm_add_epi32(v1, rnding);
  485|   394k|  v1 = _mm_srai_epi32(v1, bit);
  486|       |
  487|   394k|  x = _mm_mullo_epi32(u1, cospi48);
  488|   394k|  y = _mm_mullo_epi32(u3, cospim16);
  489|   394k|  v2 = _mm_add_epi32(x, y);
  490|   394k|  v2 = _mm_add_epi32(v2, rnding);
  491|   394k|  v2 = _mm_srai_epi32(v2, bit);
  492|       |
  493|   394k|  x = _mm_mullo_epi32(u1, cospi16);
  494|   394k|  y = _mm_mullo_epi32(u3, cospi48);
  495|   394k|  v3 = _mm_add_epi32(x, y);
  496|   394k|  v3 = _mm_add_epi32(v3, rnding);
  497|   394k|  v3 = _mm_srai_epi32(v3, bit);
  498|       |
  499|       |  // Stage 3
  500|   394k|  addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
  501|   394k|  addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);
  502|       |
  503|   394k|  if (!do_cols) {
  ------------------
  |  Branch (503:7): [True: 157k, False: 236k]
  ------------------
  504|   157k|    log_range = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   157k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 157k]
  |  |  ------------------
  ------------------
  505|   157k|    clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  506|   157k|    clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  507|       |
  508|   157k|    shift_and_clamp_sse4_1(out + 0, out + 3, &clamp_lo, &clamp_hi, out_shift);
  509|   157k|    shift_and_clamp_sse4_1(out + 1, out + 2, &clamp_lo, &clamp_hi, out_shift);
  510|   157k|  }
  511|   394k|}
highbd_inv_txfm_sse4.c:addsub_sse4_1:
  219|  3.44M|                          const __m128i *clamp_hi) {
  220|  3.44M|  __m128i a0 = _mm_add_epi32(in0, in1);
  221|  3.44M|  __m128i a1 = _mm_sub_epi32(in0, in1);
  222|       |
  223|  3.44M|  a0 = _mm_max_epi32(a0, *clamp_lo);
  224|  3.44M|  a0 = _mm_min_epi32(a0, *clamp_hi);
  225|  3.44M|  a1 = _mm_max_epi32(a1, *clamp_lo);
  226|  3.44M|  a1 = _mm_min_epi32(a1, *clamp_hi);
  227|       |
  228|  3.44M|  *out0 = a0;
  229|  3.44M|  *out1 = a1;
  230|  3.44M|}
highbd_inv_txfm_sse4.c:shift_and_clamp_sse4_1:
  234|   315k|                                   const __m128i *clamp_hi, int shift) {
  235|   315k|  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  236|   315k|  __m128i in0_w_offset = _mm_add_epi32(*in0, offset);
  237|   315k|  __m128i in1_w_offset = _mm_add_epi32(*in1, offset);
  238|       |
  239|   315k|  in0_w_offset = _mm_sra_epi32(in0_w_offset, _mm_cvtsi32_si128(shift));
  240|   315k|  in1_w_offset = _mm_sra_epi32(in1_w_offset, _mm_cvtsi32_si128(shift));
  241|       |
  242|   315k|  in0_w_offset = _mm_max_epi32(in0_w_offset, *clamp_lo);
  243|   315k|  in0_w_offset = _mm_min_epi32(in0_w_offset, *clamp_hi);
  244|   315k|  in1_w_offset = _mm_max_epi32(in1_w_offset, *clamp_lo);
  245|   315k|  in1_w_offset = _mm_min_epi32(in1_w_offset, *clamp_hi);
  246|       |
  247|   315k|  *in0 = in0_w_offset;
  248|   315k|  *in1 = in1_w_offset;
  249|   315k|}
highbd_inv_txfm_sse4.c:write_buffer_4x4:
  634|   147k|                             int fliplr, int flipud, int shift, int bd) {
  635|   147k|  const __m128i zero = _mm_setzero_si128();
  636|   147k|  __m128i u0, u1, u2, u3;
  637|   147k|  __m128i v0, v1, v2, v3;
  638|       |
  639|   147k|  round_shift_4x4(in, shift);
  640|       |
  641|   147k|  v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
  642|   147k|  v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
  643|   147k|  v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
  644|   147k|  v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
  645|       |
  646|   147k|  v0 = _mm_unpacklo_epi16(v0, zero);
  647|   147k|  v1 = _mm_unpacklo_epi16(v1, zero);
  648|   147k|  v2 = _mm_unpacklo_epi16(v2, zero);
  649|   147k|  v3 = _mm_unpacklo_epi16(v3, zero);
  650|       |
  651|   147k|  if (fliplr) {
  ------------------
  |  Branch (651:7): [True: 448, False: 147k]
  ------------------
  652|    448|    in[0] = _mm_shuffle_epi32(in[0], 0x1B);
  653|    448|    in[1] = _mm_shuffle_epi32(in[1], 0x1B);
  654|    448|    in[2] = _mm_shuffle_epi32(in[2], 0x1B);
  655|    448|    in[3] = _mm_shuffle_epi32(in[3], 0x1B);
  656|    448|  }
  657|       |
  658|   147k|  if (flipud) {
  ------------------
  |  Branch (658:7): [True: 340, False: 147k]
  ------------------
  659|    340|    u0 = _mm_add_epi32(in[3], v0);
  660|    340|    u1 = _mm_add_epi32(in[2], v1);
  661|    340|    u2 = _mm_add_epi32(in[1], v2);
  662|    340|    u3 = _mm_add_epi32(in[0], v3);
  663|   147k|  } else {
  664|   147k|    u0 = _mm_add_epi32(in[0], v0);
  665|   147k|    u1 = _mm_add_epi32(in[1], v1);
  666|   147k|    u2 = _mm_add_epi32(in[2], v2);
  667|   147k|    u3 = _mm_add_epi32(in[3], v3);
  668|   147k|  }
  669|       |
  670|   147k|  v0 = _mm_packus_epi32(u0, u1);
  671|   147k|  v2 = _mm_packus_epi32(u2, u3);
  672|       |
  673|   147k|  u0 = highbd_clamp_epi16(v0, bd);
  674|   147k|  u2 = highbd_clamp_epi16(v2, bd);
  675|       |
  676|   147k|  v0 = _mm_unpacklo_epi64(u0, u0);
  677|   147k|  v1 = _mm_unpackhi_epi64(u0, u0);
  678|   147k|  v2 = _mm_unpacklo_epi64(u2, u2);
  679|   147k|  v3 = _mm_unpackhi_epi64(u2, u2);
  680|       |
  681|   147k|  _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
  682|   147k|  _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
  683|   147k|  _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
  684|   147k|  _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
  685|   147k|}
highbd_inv_txfm_sse4.c:round_shift_4x4:
   40|   707k|static inline void round_shift_4x4(__m128i *in, int shift) {
   41|   707k|  if (shift != 0) {
  ------------------
  |  Branch (41:7): [True: 503k, False: 203k]
  ------------------
   42|   503k|    __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
   43|   503k|    in[0] = _mm_add_epi32(in[0], rnding);
   44|   503k|    in[1] = _mm_add_epi32(in[1], rnding);
   45|   503k|    in[2] = _mm_add_epi32(in[2], rnding);
   46|   503k|    in[3] = _mm_add_epi32(in[3], rnding);
   47|       |
   48|   503k|    in[0] = _mm_srai_epi32(in[0], shift);
   49|   503k|    in[1] = _mm_srai_epi32(in[1], shift);
   50|   503k|    in[2] = _mm_srai_epi32(in[2], shift);
   51|   503k|    in[3] = _mm_srai_epi32(in[3], shift);
   52|   503k|  }
   53|   707k|}
highbd_inv_txfm_sse4.c:iadst4x4_sse4_1:
  514|   305k|                            int bd, int out_shift) {
  515|   305k|  const int32_t *sinpi = sinpi_arr(bit);
  516|   305k|  const __m128i zero = _mm_setzero_si128();
  517|   305k|  __m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1));
  518|   305k|  rnding = _mm_unpacklo_epi32(rnding, zero);
  519|   305k|  const __m128i mul = _mm_set1_epi32(1 << 4);
  520|   305k|  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
  521|   305k|  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
  522|   305k|  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
  523|   305k|  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
  524|   305k|  __m128i t;
  525|   305k|  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  526|   305k|  __m128i x0, x1, x2, x3;
  527|   305k|  __m128i u0, u1, u2, u3;
  528|   305k|  __m128i u0_low, u1_low, u2_low, u3_low;
  529|   305k|  __m128i u0_high, u1_high, u2_high, u3_high;
  530|       |
  531|   305k|  x0 = in[0];
  532|   305k|  x1 = in[1];
  533|   305k|  x2 = in[2];
  534|   305k|  x3 = in[3];
  535|       |
  536|   305k|  s0 = _mm_mullo_epi32(x0, sinpi1);
  537|   305k|  s1 = _mm_mullo_epi32(x0, sinpi2);
  538|   305k|  s2 = _mm_mullo_epi32(x1, sinpi3);
  539|   305k|  s3 = _mm_mullo_epi32(x2, sinpi4);
  540|   305k|  s4 = _mm_mullo_epi32(x2, sinpi1);
  541|   305k|  s5 = _mm_mullo_epi32(x3, sinpi2);
  542|   305k|  s6 = _mm_mullo_epi32(x3, sinpi4);
  543|   305k|  t = _mm_sub_epi32(x0, x2);
  544|   305k|  s7 = _mm_add_epi32(t, x3);
  545|       |
  546|   305k|  t = _mm_add_epi32(s0, s3);
  547|   305k|  s0 = _mm_add_epi32(t, s5);
  548|   305k|  t = _mm_sub_epi32(s1, s4);
  549|   305k|  s1 = _mm_sub_epi32(t, s6);
  550|   305k|  s3 = s2;
  551|   305k|  s2 = _mm_mullo_epi32(s7, sinpi3);
  552|       |
  553|   305k|  u0 = _mm_add_epi32(s0, s3);
  554|   305k|  u1 = _mm_add_epi32(s1, s3);
  555|   305k|  u2 = s2;
  556|   305k|  t = _mm_add_epi32(s0, s1);
  557|   305k|  u3 = _mm_sub_epi32(t, s3);
  558|       |
  559|       |  // u0
  560|   305k|  u0_low = _mm_mul_epi32(u0, mul);
  561|   305k|  u0_low = _mm_add_epi64(u0_low, rnding);
  562|       |
  563|   305k|  u0 = _mm_srli_si128(u0, 4);
  564|   305k|  u0_high = _mm_mul_epi32(u0, mul);
  565|   305k|  u0_high = _mm_add_epi64(u0_high, rnding);
  566|       |
  567|   305k|  u0_low = _mm_srli_si128(u0_low, 2);
  568|   305k|  u0_high = _mm_srli_si128(u0_high, 2);
  569|       |
  570|   305k|  u0 = _mm_unpacklo_epi32(u0_low, u0_high);
  571|   305k|  u0_high = _mm_unpackhi_epi32(u0_low, u0_high);
  572|   305k|  u0 = _mm_unpacklo_epi64(u0, u0_high);
  573|       |
  574|       |  // u1
  575|   305k|  u1_low = _mm_mul_epi32(u1, mul);
  576|   305k|  u1_low = _mm_add_epi64(u1_low, rnding);
  577|       |
  578|   305k|  u1 = _mm_srli_si128(u1, 4);
  579|   305k|  u1_high = _mm_mul_epi32(u1, mul);
  580|   305k|  u1_high = _mm_add_epi64(u1_high, rnding);
  581|       |
  582|   305k|  u1_low = _mm_srli_si128(u1_low, 2);
  583|   305k|  u1_high = _mm_srli_si128(u1_high, 2);
  584|       |
  585|   305k|  u1 = _mm_unpacklo_epi32(u1_low, u1_high);
  586|   305k|  u1_high = _mm_unpackhi_epi32(u1_low, u1_high);
  587|   305k|  u1 = _mm_unpacklo_epi64(u1, u1_high);
  588|       |
  589|       |  // u2
  590|   305k|  u2_low = _mm_mul_epi32(u2, mul);
  591|   305k|  u2_low = _mm_add_epi64(u2_low, rnding);
  592|       |
  593|   305k|  u2 = _mm_srli_si128(u2, 4);
  594|   305k|  u2_high = _mm_mul_epi32(u2, mul);
  595|   305k|  u2_high = _mm_add_epi64(u2_high, rnding);
  596|       |
  597|   305k|  u2_low = _mm_srli_si128(u2_low, 2);
  598|   305k|  u2_high = _mm_srli_si128(u2_high, 2);
  599|       |
  600|   305k|  u2 = _mm_unpacklo_epi32(u2_low, u2_high);
  601|   305k|  u2_high = _mm_unpackhi_epi32(u2_low, u2_high);
  602|   305k|  u2 = _mm_unpacklo_epi64(u2, u2_high);
  603|       |
  604|       |  // u3
  605|   305k|  u3_low = _mm_mul_epi32(u3, mul);
  606|   305k|  u3_low = _mm_add_epi64(u3_low, rnding);
  607|       |
  608|   305k|  u3 = _mm_srli_si128(u3, 4);
  609|   305k|  u3_high = _mm_mul_epi32(u3, mul);
  610|   305k|  u3_high = _mm_add_epi64(u3_high, rnding);
  611|       |
  612|   305k|  u3_low = _mm_srli_si128(u3_low, 2);
  613|   305k|  u3_high = _mm_srli_si128(u3_high, 2);
  614|       |
  615|   305k|  u3 = _mm_unpacklo_epi32(u3_low, u3_high);
  616|   305k|  u3_high = _mm_unpackhi_epi32(u3_low, u3_high);
  617|   305k|  u3 = _mm_unpacklo_epi64(u3, u3_high);
  618|       |
  619|   305k|  out[0] = u0;
  620|   305k|  out[1] = u1;
  621|   305k|  out[2] = u2;
  622|   305k|  out[3] = u3;
  623|       |
  624|   305k|  if (!do_cols) {
  ------------------
  |  Branch (624:7): [True: 131k, False: 174k]
  ------------------
  625|   131k|    const int log_range = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   131k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 131k]
  |  |  ------------------
  ------------------
  626|   131k|    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  627|   131k|    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  628|   131k|    round_shift_4x4(out, out_shift);
  629|   131k|    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
  630|   131k|  }
  631|   305k|}
highbd_inv_txfm_sse4.c:highbd_clamp_epi32_sse4_1:
   64|   313k|                                      const __m128i *clamp_hi, int size) {
   65|   313k|  __m128i a0, a1;
   66|   873k|  for (int i = 0; i < size; i += 4) {
  ------------------
  |  Branch (66:19): [True: 559k, False: 313k]
  ------------------
   67|   559k|    a0 = _mm_max_epi32(in[i], *clamp_lo);
   68|   559k|    out[i] = _mm_min_epi32(a0, *clamp_hi);
   69|       |
   70|   559k|    a1 = _mm_max_epi32(in[i + 1], *clamp_lo);
   71|   559k|    out[i + 1] = _mm_min_epi32(a1, *clamp_hi);
   72|       |
   73|   559k|    a0 = _mm_max_epi32(in[i + 2], *clamp_lo);
   74|   559k|    out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
   75|       |
   76|   559k|    a1 = _mm_max_epi32(in[i + 3], *clamp_lo);
   77|   559k|    out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
   78|   559k|  }
   79|   313k|}
highbd_inv_txfm_sse4.c:iidentity4_sse4_1:
  688|  88.3k|                              int bd, int out_shift) {
  689|  88.3k|  (void)bit;
  690|  88.3k|  __m128i zero = _mm_setzero_si128();
  691|  88.3k|  __m128i fact = _mm_set1_epi32(NewSqrt2);
  692|  88.3k|  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
  ------------------
  |  |   41|  88.3k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
  693|  88.3k|  __m128i a0_low, a1_low;
  694|  88.3k|  __m128i a0_high, a1_high;
  695|       |
  696|  88.3k|  offset = _mm_unpacklo_epi32(offset, zero);
  697|       |
  698|   441k|  for (int i = 0; i < 4; i++) {
  ------------------
  |  Branch (698:19): [True: 353k, False: 88.3k]
  ------------------
  699|   353k|    a0_low = _mm_mul_epi32(in[i], fact);
  700|   353k|    a0_low = _mm_add_epi32(a0_low, offset);
  701|   353k|    a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);
  ------------------
  |  |   41|   353k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
  702|       |
  703|   353k|    a0_high = _mm_srli_si128(in[i], 4);
  704|   353k|    a0_high = _mm_mul_epi32(a0_high, fact);
  705|   353k|    a0_high = _mm_add_epi32(a0_high, offset);
  706|   353k|    a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);
  ------------------
  |  |   41|   353k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
  707|       |
  708|   353k|    a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
  709|   353k|    a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
  710|   353k|    out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
  711|   353k|  }
  712|       |
  713|  88.3k|  if (!do_cols) {
  ------------------
  |  Branch (713:7): [True: 28.9k, False: 59.4k]
  ------------------
  714|  28.9k|    const int log_range = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  28.9k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 28.9k]
  |  |  ------------------
  ------------------
  715|  28.9k|    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  716|  28.9k|    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  717|  28.9k|    round_shift_4x4(out, out_shift);
  718|  28.9k|    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
  719|  28.9k|  }
  720|  88.3k|}
highbd_inv_txfm_sse4.c:round_shift_8x8:
   55|  45.9k|static void round_shift_8x8(__m128i *in, int shift) {
   56|  45.9k|  round_shift_4x4(&in[0], shift);
   57|  45.9k|  round_shift_4x4(&in[4], shift);
   58|  45.9k|  round_shift_4x4(&in[8], shift);
   59|  45.9k|  round_shift_4x4(&in[12], shift);
   60|  45.9k|}
highbd_inv_txfm_sse4.c:neg_shift_sse4_1:
  438|   265k|                             int shift) {
  439|   265k|  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  440|   265k|  __m128i a0 = _mm_add_epi32(offset, in0);
  441|   265k|  __m128i a1 = _mm_sub_epi32(offset, in1);
  442|       |
  443|   265k|  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  444|   265k|  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
  445|       |
  446|   265k|  a0 = _mm_max_epi32(a0, *clamp_lo);
  447|   265k|  a0 = _mm_min_epi32(a0, *clamp_hi);
  448|   265k|  a1 = _mm_max_epi32(a1, *clamp_lo);
  449|   265k|  a1 = _mm_min_epi32(a1, *clamp_hi);
  450|       |
  451|   265k|  *out0 = a0;
  452|   265k|  *out1 = a1;
  453|   265k|}
highbd_inv_txfm_sse4.c:idct8x8_low1_sse4_1:
 1471|  1.08k|                                int bd, int out_shift) {
 1472|  1.08k|  const int32_t *cospi = cospi_arr(bit);
 1473|  1.08k|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 1474|  1.08k|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 1475|  1.08k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  2.16k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 1.08k]
  |  |  |  Branch (35:31): [True: 368, False: 714]
  |  |  |  Branch (35:44): [True: 368, False: 714]
  |  |  ------------------
  ------------------
 1476|  1.08k|  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 1477|  1.08k|  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 1478|  1.08k|  __m128i x;
 1479|       |
 1480|       |  // stage 0
 1481|       |  // stage 1
 1482|       |  // stage 2
 1483|       |  // stage 3
 1484|  1.08k|  x = _mm_mullo_epi32(in[0], cospi32);
 1485|  1.08k|  x = _mm_add_epi32(x, rnding);
 1486|  1.08k|  x = _mm_srai_epi32(x, bit);
 1487|       |
 1488|       |  // stage 4
 1489|       |  // stage 5
 1490|  1.08k|  if (!do_cols) {
  ------------------
  |  Branch (1490:7): [True: 714, False: 368]
  ------------------
 1491|    714|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|    714|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 714]
  |  |  ------------------
  ------------------
 1492|    714|    clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 1493|    714|    clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 1494|       |
 1495|    714|    __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
 1496|    714|    x = _mm_add_epi32(x, offset);
 1497|    714|    x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
 1498|    714|  }
 1499|       |
 1500|  1.08k|  x = _mm_max_epi32(x, clamp_lo);
 1501|  1.08k|  x = _mm_min_epi32(x, clamp_hi);
 1502|  1.08k|  out[0] = x;
 1503|  1.08k|  out[1] = x;
 1504|  1.08k|  out[2] = x;
 1505|  1.08k|  out[3] = x;
 1506|  1.08k|  out[4] = x;
 1507|  1.08k|  out[5] = x;
 1508|  1.08k|  out[6] = x;
 1509|  1.08k|  out[7] = x;
 1510|  1.08k|}
highbd_inv_txfm_sse4.c:idct8x8_new_sse4_1:
 1513|  90.7k|                               int bd, int out_shift) {
 1514|  90.7k|  const int32_t *cospi = cospi_arr(bit);
 1515|  90.7k|  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
 1516|  90.7k|  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
 1517|  90.7k|  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
 1518|  90.7k|  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
 1519|  90.7k|  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
 1520|  90.7k|  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
 1521|  90.7k|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 1522|  90.7k|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 1523|  90.7k|  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
 1524|  90.7k|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 1525|  90.7k|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 1526|  90.7k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   181k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 90.7k]
  |  |  |  Branch (35:31): [True: 34.7k, False: 55.9k]
  |  |  |  Branch (35:44): [True: 34.7k, False: 55.9k]
  |  |  ------------------
  ------------------
 1527|  90.7k|  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 1528|  90.7k|  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 1529|  90.7k|  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
 1530|  90.7k|  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
 1531|  90.7k|  __m128i x, y;
 1532|       |
 1533|       |  // stage 0
 1534|       |  // stage 1
 1535|       |  // stage 2
 1536|  90.7k|  u0 = in[0];
 1537|  90.7k|  u1 = in[4];
 1538|  90.7k|  u2 = in[2];
 1539|  90.7k|  u3 = in[6];
 1540|       |
 1541|  90.7k|  x = _mm_mullo_epi32(in[1], cospi56);
 1542|  90.7k|  y = _mm_mullo_epi32(in[7], cospim8);
 1543|  90.7k|  u4 = _mm_add_epi32(x, y);
 1544|  90.7k|  u4 = _mm_add_epi32(u4, rnding);
 1545|  90.7k|  u4 = _mm_srai_epi32(u4, bit);
 1546|       |
 1547|  90.7k|  x = _mm_mullo_epi32(in[1], cospi8);
 1548|  90.7k|  y = _mm_mullo_epi32(in[7], cospi56);
 1549|  90.7k|  u7 = _mm_add_epi32(x, y);
 1550|  90.7k|  u7 = _mm_add_epi32(u7, rnding);
 1551|  90.7k|  u7 = _mm_srai_epi32(u7, bit);
 1552|       |
 1553|  90.7k|  x = _mm_mullo_epi32(in[5], cospi24);
 1554|  90.7k|  y = _mm_mullo_epi32(in[3], cospim40);
 1555|  90.7k|  u5 = _mm_add_epi32(x, y);
 1556|  90.7k|  u5 = _mm_add_epi32(u5, rnding);
 1557|  90.7k|  u5 = _mm_srai_epi32(u5, bit);
 1558|       |
 1559|  90.7k|  x = _mm_mullo_epi32(in[5], cospi40);
 1560|  90.7k|  y = _mm_mullo_epi32(in[3], cospi24);
 1561|  90.7k|  u6 = _mm_add_epi32(x, y);
 1562|  90.7k|  u6 = _mm_add_epi32(u6, rnding);
 1563|  90.7k|  u6 = _mm_srai_epi32(u6, bit);
 1564|       |
 1565|       |  // stage 3
 1566|  90.7k|  x = _mm_mullo_epi32(u0, cospi32);
 1567|  90.7k|  y = _mm_mullo_epi32(u1, cospi32);
 1568|  90.7k|  v0 = _mm_add_epi32(x, y);
 1569|  90.7k|  v0 = _mm_add_epi32(v0, rnding);
 1570|  90.7k|  v0 = _mm_srai_epi32(v0, bit);
 1571|       |
 1572|  90.7k|  v1 = _mm_sub_epi32(x, y);
 1573|  90.7k|  v1 = _mm_add_epi32(v1, rnding);
 1574|  90.7k|  v1 = _mm_srai_epi32(v1, bit);
 1575|       |
 1576|  90.7k|  x = _mm_mullo_epi32(u2, cospi48);
 1577|  90.7k|  y = _mm_mullo_epi32(u3, cospim16);
 1578|  90.7k|  v2 = _mm_add_epi32(x, y);
 1579|  90.7k|  v2 = _mm_add_epi32(v2, rnding);
 1580|  90.7k|  v2 = _mm_srai_epi32(v2, bit);
 1581|       |
 1582|  90.7k|  x = _mm_mullo_epi32(u2, cospi16);
 1583|  90.7k|  y = _mm_mullo_epi32(u3, cospi48);
 1584|  90.7k|  v3 = _mm_add_epi32(x, y);
 1585|  90.7k|  v3 = _mm_add_epi32(v3, rnding);
 1586|  90.7k|  v3 = _mm_srai_epi32(v3, bit);
 1587|       |
 1588|  90.7k|  addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
 1589|  90.7k|  addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
 1590|       |
 1591|       |  // stage 4
 1592|  90.7k|  addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
 1593|  90.7k|  addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
 1594|  90.7k|  u4 = v4;
 1595|  90.7k|  u7 = v7;
 1596|       |
 1597|  90.7k|  x = _mm_mullo_epi32(v5, cospi32);
 1598|  90.7k|  y = _mm_mullo_epi32(v6, cospi32);
 1599|  90.7k|  u6 = _mm_add_epi32(y, x);
 1600|  90.7k|  u6 = _mm_add_epi32(u6, rnding);
 1601|  90.7k|  u6 = _mm_srai_epi32(u6, bit);
 1602|       |
 1603|  90.7k|  u5 = _mm_sub_epi32(y, x);
 1604|  90.7k|  u5 = _mm_add_epi32(u5, rnding);
 1605|  90.7k|  u5 = _mm_srai_epi32(u5, bit);
 1606|       |
 1607|       |  // stage 5
 1608|  90.7k|  addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
 1609|  90.7k|  addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
 1610|  90.7k|  addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
 1611|  90.7k|  addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
 1612|       |
 1613|  90.7k|  if (!do_cols) {
  ------------------
  |  Branch (1613:7): [True: 55.9k, False: 34.7k]
  ------------------
 1614|  55.9k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  55.9k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 55.9k]
  |  |  ------------------
  ------------------
 1615|  55.9k|    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 1616|  55.9k|    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 1617|       |
 1618|  55.9k|    round_shift_4x4(out, out_shift);
 1619|  55.9k|    round_shift_4x4(out + 4, out_shift);
 1620|  55.9k|    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8);
 1621|  55.9k|  }
 1622|  90.7k|}
highbd_inv_txfm_sse4.c:iadst8x8_low1_sse4_1:
 1625|     44|                                 int do_cols, int bd, int out_shift) {
 1626|     44|  const int32_t *cospi = cospi_arr(bit);
 1627|     44|  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
 1628|     44|  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
 1629|     44|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 1630|     44|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 1631|     44|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 1632|     44|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 1633|     44|  const __m128i kZero = _mm_setzero_si128();
 1634|     44|  __m128i u[8], x;
 1635|       |
 1636|       |  // stage 0
 1637|       |  // stage 1
 1638|       |  // stage 2
 1639|       |
 1640|     44|  x = _mm_mullo_epi32(in[0], cospi60);
 1641|     44|  u[0] = _mm_add_epi32(x, rnding);
 1642|     44|  u[0] = _mm_srai_epi32(u[0], bit);
 1643|       |
 1644|     44|  x = _mm_mullo_epi32(in[0], cospi4);
 1645|     44|  u[1] = _mm_sub_epi32(kZero, x);
 1646|     44|  u[1] = _mm_add_epi32(u[1], rnding);
 1647|     44|  u[1] = _mm_srai_epi32(u[1], bit);
 1648|       |
 1649|       |  // stage 3
 1650|       |  // stage 4
 1651|     44|  __m128i temp1, temp2;
 1652|     44|  temp1 = _mm_mullo_epi32(u[0], cospi16);
 1653|     44|  x = _mm_mullo_epi32(u[1], cospi48);
 1654|     44|  temp1 = _mm_add_epi32(temp1, x);
 1655|     44|  temp1 = _mm_add_epi32(temp1, rnding);
 1656|     44|  temp1 = _mm_srai_epi32(temp1, bit);
 1657|     44|  u[4] = temp1;
 1658|       |
 1659|     44|  temp2 = _mm_mullo_epi32(u[0], cospi48);
 1660|     44|  x = _mm_mullo_epi32(u[1], cospi16);
 1661|     44|  u[5] = _mm_sub_epi32(temp2, x);
 1662|     44|  u[5] = _mm_add_epi32(u[5], rnding);
 1663|     44|  u[5] = _mm_srai_epi32(u[5], bit);
 1664|       |
 1665|       |  // stage 5
 1666|       |  // stage 6
 1667|     44|  temp1 = _mm_mullo_epi32(u[0], cospi32);
 1668|     44|  x = _mm_mullo_epi32(u[1], cospi32);
 1669|     44|  u[2] = _mm_add_epi32(temp1, x);
 1670|     44|  u[2] = _mm_add_epi32(u[2], rnding);
 1671|     44|  u[2] = _mm_srai_epi32(u[2], bit);
 1672|       |
 1673|     44|  u[3] = _mm_sub_epi32(temp1, x);
 1674|     44|  u[3] = _mm_add_epi32(u[3], rnding);
 1675|     44|  u[3] = _mm_srai_epi32(u[3], bit);
 1676|       |
 1677|     44|  temp1 = _mm_mullo_epi32(u[4], cospi32);
 1678|     44|  x = _mm_mullo_epi32(u[5], cospi32);
 1679|     44|  u[6] = _mm_add_epi32(temp1, x);
 1680|     44|  u[6] = _mm_add_epi32(u[6], rnding);
 1681|     44|  u[6] = _mm_srai_epi32(u[6], bit);
 1682|       |
 1683|     44|  u[7] = _mm_sub_epi32(temp1, x);
 1684|     44|  u[7] = _mm_add_epi32(u[7], rnding);
 1685|     44|  u[7] = _mm_srai_epi32(u[7], bit);
 1686|       |
 1687|       |  // stage 7
 1688|     44|  if (do_cols) {
  ------------------
  |  Branch (1688:7): [True: 8, False: 36]
  ------------------
 1689|      8|    out[0] = u[0];
 1690|      8|    out[1] = _mm_sub_epi32(kZero, u[4]);
 1691|      8|    out[2] = u[6];
 1692|      8|    out[3] = _mm_sub_epi32(kZero, u[2]);
 1693|      8|    out[4] = u[3];
 1694|      8|    out[5] = _mm_sub_epi32(kZero, u[7]);
 1695|      8|    out[6] = u[5];
 1696|      8|    out[7] = _mm_sub_epi32(kZero, u[1]);
 1697|     36|  } else {
 1698|     36|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|     36|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 36]
  |  |  ------------------
  ------------------
 1699|     36|    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 1700|     36|    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 1701|       |
 1702|     36|    neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 1703|     36|                     out_shift);
 1704|     36|    neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
 1705|     36|                     out_shift);
 1706|     36|    neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
 1707|     36|                     out_shift);
 1708|     36|    neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
 1709|     36|                     out_shift);
 1710|     36|  }
 1711|     44|}
highbd_inv_txfm_sse4.c:iadst8x8_new_sse4_1:
 1714|  43.0k|                                int bd, int out_shift) {
 1715|  43.0k|  const int32_t *cospi = cospi_arr(bit);
 1716|  43.0k|  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
 1717|  43.0k|  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
 1718|  43.0k|  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
 1719|  43.0k|  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
 1720|  43.0k|  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
 1721|  43.0k|  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
 1722|  43.0k|  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
 1723|  43.0k|  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
 1724|  43.0k|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 1725|  43.0k|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 1726|  43.0k|  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
 1727|  43.0k|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 1728|  43.0k|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 1729|  43.0k|  const __m128i kZero = _mm_setzero_si128();
 1730|  43.0k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  86.0k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 43.0k]
  |  |  |  Branch (35:31): [True: 16.0k, False: 26.9k]
  |  |  |  Branch (35:44): [True: 16.0k, False: 26.9k]
  |  |  ------------------
  ------------------
 1731|  43.0k|  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 1732|  43.0k|  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 1733|  43.0k|  __m128i u[8], v[8], x;
 1734|       |
 1735|       |  // stage 0
 1736|       |  // stage 1
 1737|       |  // stage 2
 1738|       |
 1739|  43.0k|  u[0] = _mm_mullo_epi32(in[7], cospi4);
 1740|  43.0k|  x = _mm_mullo_epi32(in[0], cospi60);
 1741|  43.0k|  u[0] = _mm_add_epi32(u[0], x);
 1742|  43.0k|  u[0] = _mm_add_epi32(u[0], rnding);
 1743|  43.0k|  u[0] = _mm_srai_epi32(u[0], bit);
 1744|       |
 1745|  43.0k|  u[1] = _mm_mullo_epi32(in[7], cospi60);
 1746|  43.0k|  x = _mm_mullo_epi32(in[0], cospi4);
 1747|  43.0k|  u[1] = _mm_sub_epi32(u[1], x);
 1748|  43.0k|  u[1] = _mm_add_epi32(u[1], rnding);
 1749|  43.0k|  u[1] = _mm_srai_epi32(u[1], bit);
 1750|       |
 1751|       |  // (2)
 1752|  43.0k|  u[2] = _mm_mullo_epi32(in[5], cospi20);
 1753|  43.0k|  x = _mm_mullo_epi32(in[2], cospi44);
 1754|  43.0k|  u[2] = _mm_add_epi32(u[2], x);
 1755|  43.0k|  u[2] = _mm_add_epi32(u[2], rnding);
 1756|  43.0k|  u[2] = _mm_srai_epi32(u[2], bit);
 1757|       |
 1758|  43.0k|  u[3] = _mm_mullo_epi32(in[5], cospi44);
 1759|  43.0k|  x = _mm_mullo_epi32(in[2], cospi20);
 1760|  43.0k|  u[3] = _mm_sub_epi32(u[3], x);
 1761|  43.0k|  u[3] = _mm_add_epi32(u[3], rnding);
 1762|  43.0k|  u[3] = _mm_srai_epi32(u[3], bit);
 1763|       |
 1764|       |  // (3)
 1765|  43.0k|  u[4] = _mm_mullo_epi32(in[3], cospi36);
 1766|  43.0k|  x = _mm_mullo_epi32(in[4], cospi28);
 1767|  43.0k|  u[4] = _mm_add_epi32(u[4], x);
 1768|  43.0k|  u[4] = _mm_add_epi32(u[4], rnding);
 1769|  43.0k|  u[4] = _mm_srai_epi32(u[4], bit);
 1770|       |
 1771|  43.0k|  u[5] = _mm_mullo_epi32(in[3], cospi28);
 1772|  43.0k|  x = _mm_mullo_epi32(in[4], cospi36);
 1773|  43.0k|  u[5] = _mm_sub_epi32(u[5], x);
 1774|  43.0k|  u[5] = _mm_add_epi32(u[5], rnding);
 1775|  43.0k|  u[5] = _mm_srai_epi32(u[5], bit);
 1776|       |
 1777|       |  // (4)
 1778|  43.0k|  u[6] = _mm_mullo_epi32(in[1], cospi52);
 1779|  43.0k|  x = _mm_mullo_epi32(in[6], cospi12);
 1780|  43.0k|  u[6] = _mm_add_epi32(u[6], x);
 1781|  43.0k|  u[6] = _mm_add_epi32(u[6], rnding);
 1782|  43.0k|  u[6] = _mm_srai_epi32(u[6], bit);
 1783|       |
 1784|  43.0k|  u[7] = _mm_mullo_epi32(in[1], cospi12);
 1785|  43.0k|  x = _mm_mullo_epi32(in[6], cospi52);
 1786|  43.0k|  u[7] = _mm_sub_epi32(u[7], x);
 1787|  43.0k|  u[7] = _mm_add_epi32(u[7], rnding);
 1788|  43.0k|  u[7] = _mm_srai_epi32(u[7], bit);
 1789|       |
 1790|       |  // stage 3
 1791|  43.0k|  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
 1792|  43.0k|  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
 1793|  43.0k|  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
 1794|  43.0k|  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
 1795|       |
 1796|       |  // stage 4
 1797|  43.0k|  u[0] = v[0];
 1798|  43.0k|  u[1] = v[1];
 1799|  43.0k|  u[2] = v[2];
 1800|  43.0k|  u[3] = v[3];
 1801|       |
 1802|  43.0k|  u[4] = _mm_mullo_epi32(v[4], cospi16);
 1803|  43.0k|  x = _mm_mullo_epi32(v[5], cospi48);
 1804|  43.0k|  u[4] = _mm_add_epi32(u[4], x);
 1805|  43.0k|  u[4] = _mm_add_epi32(u[4], rnding);
 1806|  43.0k|  u[4] = _mm_srai_epi32(u[4], bit);
 1807|       |
 1808|  43.0k|  u[5] = _mm_mullo_epi32(v[4], cospi48);
 1809|  43.0k|  x = _mm_mullo_epi32(v[5], cospi16);
 1810|  43.0k|  u[5] = _mm_sub_epi32(u[5], x);
 1811|  43.0k|  u[5] = _mm_add_epi32(u[5], rnding);
 1812|  43.0k|  u[5] = _mm_srai_epi32(u[5], bit);
 1813|       |
 1814|  43.0k|  u[6] = _mm_mullo_epi32(v[6], cospim48);
 1815|  43.0k|  x = _mm_mullo_epi32(v[7], cospi16);
 1816|  43.0k|  u[6] = _mm_add_epi32(u[6], x);
 1817|  43.0k|  u[6] = _mm_add_epi32(u[6], rnding);
 1818|  43.0k|  u[6] = _mm_srai_epi32(u[6], bit);
 1819|       |
 1820|  43.0k|  u[7] = _mm_mullo_epi32(v[6], cospi16);
 1821|  43.0k|  x = _mm_mullo_epi32(v[7], cospim48);
 1822|  43.0k|  u[7] = _mm_sub_epi32(u[7], x);
 1823|  43.0k|  u[7] = _mm_add_epi32(u[7], rnding);
 1824|  43.0k|  u[7] = _mm_srai_epi32(u[7], bit);
 1825|       |
 1826|       |  // stage 5
 1827|  43.0k|  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
 1828|  43.0k|  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
 1829|  43.0k|  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
 1830|  43.0k|  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
 1831|       |
 1832|       |  // stage 6
 1833|  43.0k|  u[0] = v[0];
 1834|  43.0k|  u[1] = v[1];
 1835|  43.0k|  u[4] = v[4];
 1836|  43.0k|  u[5] = v[5];
 1837|       |
 1838|  43.0k|  v[0] = _mm_mullo_epi32(v[2], cospi32);
 1839|  43.0k|  x = _mm_mullo_epi32(v[3], cospi32);
 1840|  43.0k|  u[2] = _mm_add_epi32(v[0], x);
 1841|  43.0k|  u[2] = _mm_add_epi32(u[2], rnding);
 1842|  43.0k|  u[2] = _mm_srai_epi32(u[2], bit);
 1843|       |
 1844|  43.0k|  u[3] = _mm_sub_epi32(v[0], x);
 1845|  43.0k|  u[3] = _mm_add_epi32(u[3], rnding);
 1846|  43.0k|  u[3] = _mm_srai_epi32(u[3], bit);
 1847|       |
 1848|  43.0k|  v[0] = _mm_mullo_epi32(v[6], cospi32);
 1849|  43.0k|  x = _mm_mullo_epi32(v[7], cospi32);
 1850|  43.0k|  u[6] = _mm_add_epi32(v[0], x);
 1851|  43.0k|  u[6] = _mm_add_epi32(u[6], rnding);
 1852|  43.0k|  u[6] = _mm_srai_epi32(u[6], bit);
 1853|       |
 1854|  43.0k|  u[7] = _mm_sub_epi32(v[0], x);
 1855|  43.0k|  u[7] = _mm_add_epi32(u[7], rnding);
 1856|  43.0k|  u[7] = _mm_srai_epi32(u[7], bit);
 1857|       |
 1858|       |  // stage 7
 1859|  43.0k|  if (do_cols) {
  ------------------
  |  Branch (1859:7): [True: 16.0k, False: 26.9k]
  ------------------
 1860|  16.0k|    out[0] = u[0];
 1861|  16.0k|    out[1] = _mm_sub_epi32(kZero, u[4]);
 1862|  16.0k|    out[2] = u[6];
 1863|  16.0k|    out[3] = _mm_sub_epi32(kZero, u[2]);
 1864|  16.0k|    out[4] = u[3];
 1865|  16.0k|    out[5] = _mm_sub_epi32(kZero, u[7]);
 1866|  16.0k|    out[6] = u[5];
 1867|  16.0k|    out[7] = _mm_sub_epi32(kZero, u[1]);
 1868|  26.9k|  } else {
 1869|  26.9k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  26.9k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 26.9k]
  |  |  ------------------
  ------------------
 1870|  26.9k|    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 1871|  26.9k|    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 1872|       |
 1873|  26.9k|    neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 1874|  26.9k|                     out_shift);
 1875|  26.9k|    neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
 1876|  26.9k|                     out_shift);
 1877|  26.9k|    neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
 1878|  26.9k|                     out_shift);
 1879|  26.9k|    neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
 1880|  26.9k|                     out_shift);
 1881|  26.9k|  }
 1882|  43.0k|}
highbd_inv_txfm_sse4.c:iidentity8_sse4_1:
 1307|   119k|                              int bd, int out_shift) {
 1308|   119k|  (void)bit;
 1309|   119k|  out[0] = _mm_add_epi32(in[0], in[0]);
 1310|   119k|  out[1] = _mm_add_epi32(in[1], in[1]);
 1311|   119k|  out[2] = _mm_add_epi32(in[2], in[2]);
 1312|   119k|  out[3] = _mm_add_epi32(in[3], in[3]);
 1313|   119k|  out[4] = _mm_add_epi32(in[4], in[4]);
 1314|   119k|  out[5] = _mm_add_epi32(in[5], in[5]);
 1315|   119k|  out[6] = _mm_add_epi32(in[6], in[6]);
 1316|   119k|  out[7] = _mm_add_epi32(in[7], in[7]);
 1317|       |
 1318|   119k|  if (!do_cols) {
  ------------------
  |  Branch (1318:7): [True: 51.8k, False: 67.2k]
  ------------------
 1319|  51.8k|    const int log_range = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  51.8k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 51.8k]
  |  |  ------------------
  ------------------
 1320|  51.8k|    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 1321|  51.8k|    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 1322|  51.8k|    round_shift_4x4(out, out_shift);
 1323|  51.8k|    round_shift_4x4(out + 4, out_shift);
 1324|  51.8k|    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 8);
 1325|  51.8k|  }
 1326|   119k|}
highbd_inv_txfm_sse4.c:idct16x16_low1_sse4_1:
 1885|    140|                                  int do_cols, int bd, int out_shift) {
 1886|    140|  const int32_t *cospi = cospi_arr(bit);
 1887|    140|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 1888|    140|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 1889|    140|  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|    280|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 140]
  |  |  |  Branch (35:31): [True: 48, False: 92]
  |  |  |  Branch (35:44): [True: 48, False: 92]
  |  |  ------------------
  ------------------
 1890|    140|  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 1891|    140|  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 1892|       |  // stage 0
 1893|       |  // stage 1
 1894|       |  // stage 2
 1895|       |  // stage 3
 1896|       |  // stage 4
 1897|    140|  in[0] = _mm_mullo_epi32(in[0], cospi32);
 1898|    140|  in[0] = _mm_add_epi32(in[0], rnding);
 1899|    140|  in[0] = _mm_srai_epi32(in[0], bit);
 1900|       |
 1901|       |  // stage 5
 1902|       |  // stage 6
 1903|       |  // stage 7
 1904|    140|  if (!do_cols) {
  ------------------
  |  Branch (1904:7): [True: 92, False: 48]
  ------------------
 1905|     92|    log_range = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|     92|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 92]
  |  |  ------------------
  ------------------
 1906|     92|    clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 1907|     92|    clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 1908|     92|    if (out_shift != 0) {
  ------------------
  |  Branch (1908:9): [True: 92, False: 0]
  ------------------
 1909|     92|      __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
 1910|     92|      in[0] = _mm_add_epi32(in[0], offset);
 1911|     92|      in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
 1912|     92|    }
 1913|     92|  }
 1914|       |
 1915|    140|  in[0] = _mm_max_epi32(in[0], clamp_lo);
 1916|    140|  in[0] = _mm_min_epi32(in[0], clamp_hi);
 1917|    140|  out[0] = in[0];
 1918|    140|  out[1] = in[0];
 1919|    140|  out[2] = in[0];
 1920|    140|  out[3] = in[0];
 1921|    140|  out[4] = in[0];
 1922|    140|  out[5] = in[0];
 1923|    140|  out[6] = in[0];
 1924|    140|  out[7] = in[0];
 1925|    140|  out[8] = in[0];
 1926|    140|  out[9] = in[0];
 1927|    140|  out[10] = in[0];
 1928|    140|  out[11] = in[0];
 1929|    140|  out[12] = in[0];
 1930|    140|  out[13] = in[0];
 1931|    140|  out[14] = in[0];
 1932|    140|  out[15] = in[0];
 1933|    140|}
highbd_inv_txfm_sse4.c:idct16x16_low8_sse4_1:
 1936|    748|                                  int do_cols, int bd, int out_shift) {
 1937|    748|  const int32_t *cospi = cospi_arr(bit);
 1938|    748|  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
 1939|    748|  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
 1940|    748|  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
 1941|    748|  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
 1942|    748|  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
 1943|    748|  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
 1944|    748|  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
 1945|    748|  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
 1946|    748|  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
 1947|    748|  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
 1948|    748|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 1949|    748|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 1950|    748|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 1951|    748|  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
 1952|    748|  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
 1953|    748|  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
 1954|    748|  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
 1955|    748|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 1956|    748|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  1.49k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 748]
  |  |  |  Branch (35:31): [True: 206, False: 542]
  |  |  |  Branch (35:44): [True: 206, False: 542]
  |  |  ------------------
  ------------------
 1957|    748|  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 1958|    748|  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 1959|    748|  __m128i u[16], x, y;
 1960|       |  // stage 0
 1961|       |  // stage 1
 1962|    748|  u[0] = in[0];
 1963|    748|  u[2] = in[4];
 1964|    748|  u[4] = in[2];
 1965|    748|  u[6] = in[6];
 1966|    748|  u[8] = in[1];
 1967|    748|  u[10] = in[5];
 1968|    748|  u[12] = in[3];
 1969|    748|  u[14] = in[7];
 1970|       |
 1971|       |  // stage 2
 1972|    748|  u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
 1973|    748|  u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
 1974|       |
 1975|    748|  u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
 1976|    748|  u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
 1977|       |
 1978|    748|  u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
 1979|    748|  u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
 1980|       |
 1981|    748|  u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
 1982|    748|  u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
 1983|       |
 1984|       |  // stage 3
 1985|    748|  u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
 1986|    748|  u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
 1987|    748|  u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
 1988|    748|  u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
 1989|       |
 1990|    748|  addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
 1991|    748|  addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
 1992|    748|  addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
 1993|    748|  addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
 1994|       |
 1995|       |  // stage 4
 1996|    748|  x = _mm_mullo_epi32(u[0], cospi32);
 1997|    748|  u[0] = _mm_add_epi32(x, rnding);
 1998|    748|  u[0] = _mm_srai_epi32(u[0], bit);
 1999|    748|  u[1] = u[0];
 2000|       |
 2001|    748|  u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
 2002|    748|  u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
 2003|       |
 2004|    748|  addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
 2005|    748|  addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
 2006|       |
 2007|    748|  x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
 2008|    748|  u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
 2009|    748|  u[9] = x;
 2010|    748|  y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
 2011|    748|  u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
 2012|    748|  u[10] = y;
 2013|       |
 2014|       |  // stage 5
 2015|    748|  addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
 2016|    748|  addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
 2017|       |
 2018|    748|  x = _mm_mullo_epi32(u[5], cospi32);
 2019|    748|  y = _mm_mullo_epi32(u[6], cospi32);
 2020|    748|  u[5] = _mm_sub_epi32(y, x);
 2021|    748|  u[5] = _mm_add_epi32(u[5], rnding);
 2022|    748|  u[5] = _mm_srai_epi32(u[5], bit);
 2023|       |
 2024|    748|  u[6] = _mm_add_epi32(y, x);
 2025|    748|  u[6] = _mm_add_epi32(u[6], rnding);
 2026|    748|  u[6] = _mm_srai_epi32(u[6], bit);
 2027|       |
 2028|    748|  addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
 2029|    748|  addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
 2030|    748|  addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
 2031|    748|  addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
 2032|       |
 2033|       |  // stage 6
 2034|    748|  addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
 2035|    748|  addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
 2036|    748|  addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
 2037|    748|  addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
 2038|       |
 2039|    748|  x = _mm_mullo_epi32(u[10], cospi32);
 2040|    748|  y = _mm_mullo_epi32(u[13], cospi32);
 2041|    748|  u[10] = _mm_sub_epi32(y, x);
 2042|    748|  u[10] = _mm_add_epi32(u[10], rnding);
 2043|    748|  u[10] = _mm_srai_epi32(u[10], bit);
 2044|       |
 2045|    748|  u[13] = _mm_add_epi32(x, y);
 2046|    748|  u[13] = _mm_add_epi32(u[13], rnding);
 2047|    748|  u[13] = _mm_srai_epi32(u[13], bit);
 2048|       |
 2049|    748|  x = _mm_mullo_epi32(u[11], cospi32);
 2050|    748|  y = _mm_mullo_epi32(u[12], cospi32);
 2051|    748|  u[11] = _mm_sub_epi32(y, x);
 2052|    748|  u[11] = _mm_add_epi32(u[11], rnding);
 2053|    748|  u[11] = _mm_srai_epi32(u[11], bit);
 2054|       |
 2055|    748|  u[12] = _mm_add_epi32(x, y);
 2056|    748|  u[12] = _mm_add_epi32(u[12], rnding);
 2057|    748|  u[12] = _mm_srai_epi32(u[12], bit);
 2058|       |  // stage 7
 2059|    748|  addsub_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
 2060|    748|  addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
 2061|    748|  addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
 2062|    748|  addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
 2063|    748|  addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
 2064|    748|  addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
 2065|    748|  addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
 2066|    748|  addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
 2067|       |
 2068|    748|  if (!do_cols) {
  ------------------
  |  Branch (2068:7): [True: 542, False: 206]
  ------------------
 2069|    542|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|    542|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 542]
  |  |  ------------------
  ------------------
 2070|    542|    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 2071|    542|    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 2072|    542|    round_shift_8x8(out, out_shift);
 2073|    542|    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
 2074|    542|  }
 2075|    748|}
highbd_inv_txfm_sse4.c:idct16x16_sse4_1:
 2567|  37.6k|                             int bd, int out_shift) {
 2568|  37.6k|  const int32_t *cospi = cospi_arr(bit);
 2569|  37.6k|  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
 2570|  37.6k|  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
 2571|  37.6k|  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
 2572|  37.6k|  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
 2573|  37.6k|  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
 2574|  37.6k|  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
 2575|  37.6k|  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
 2576|  37.6k|  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
 2577|  37.6k|  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
 2578|  37.6k|  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
 2579|  37.6k|  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
 2580|  37.6k|  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
 2581|  37.6k|  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
 2582|  37.6k|  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
 2583|  37.6k|  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
 2584|  37.6k|  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
 2585|  37.6k|  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
 2586|  37.6k|  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
 2587|  37.6k|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 2588|  37.6k|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 2589|  37.6k|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 2590|  37.6k|  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
 2591|  37.6k|  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
 2592|  37.6k|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 2593|  37.6k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  75.2k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 37.6k]
  |  |  |  Branch (35:31): [True: 11.9k, False: 25.7k]
  |  |  |  Branch (35:44): [True: 11.9k, False: 25.7k]
  |  |  ------------------
  ------------------
 2594|  37.6k|  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 2595|  37.6k|  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 2596|  37.6k|  __m128i u[16], v[16], x, y;
 2597|       |
 2598|  37.6k|  {
 2599|       |    // stage 0
 2600|       |    // stage 1
 2601|  37.6k|    u[0] = in[0];
 2602|  37.6k|    u[1] = in[8];
 2603|  37.6k|    u[2] = in[4];
 2604|  37.6k|    u[3] = in[12];
 2605|  37.6k|    u[4] = in[2];
 2606|  37.6k|    u[5] = in[10];
 2607|  37.6k|    u[6] = in[6];
 2608|  37.6k|    u[7] = in[14];
 2609|  37.6k|    u[8] = in[1];
 2610|  37.6k|    u[9] = in[9];
 2611|  37.6k|    u[10] = in[5];
 2612|  37.6k|    u[11] = in[13];
 2613|  37.6k|    u[12] = in[3];
 2614|  37.6k|    u[13] = in[11];
 2615|  37.6k|    u[14] = in[7];
 2616|  37.6k|    u[15] = in[15];
 2617|       |
 2618|       |    // stage 2
 2619|  37.6k|    v[0] = u[0];
 2620|  37.6k|    v[1] = u[1];
 2621|  37.6k|    v[2] = u[2];
 2622|  37.6k|    v[3] = u[3];
 2623|  37.6k|    v[4] = u[4];
 2624|  37.6k|    v[5] = u[5];
 2625|  37.6k|    v[6] = u[6];
 2626|  37.6k|    v[7] = u[7];
 2627|       |
 2628|  37.6k|    v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
 2629|  37.6k|    v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
 2630|  37.6k|    v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
 2631|  37.6k|    v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
 2632|  37.6k|    v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
 2633|  37.6k|    v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
 2634|  37.6k|    v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
 2635|  37.6k|    v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
 2636|       |
 2637|       |    // stage 3
 2638|  37.6k|    u[0] = v[0];
 2639|  37.6k|    u[1] = v[1];
 2640|  37.6k|    u[2] = v[2];
 2641|  37.6k|    u[3] = v[3];
 2642|  37.6k|    u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
 2643|  37.6k|    u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
 2644|  37.6k|    u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
 2645|  37.6k|    u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
 2646|  37.6k|    addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
 2647|  37.6k|    addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
 2648|  37.6k|    addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
 2649|  37.6k|    addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
 2650|       |
 2651|       |    // stage 4
 2652|  37.6k|    x = _mm_mullo_epi32(u[0], cospi32);
 2653|  37.6k|    y = _mm_mullo_epi32(u[1], cospi32);
 2654|  37.6k|    v[0] = _mm_add_epi32(x, y);
 2655|  37.6k|    v[0] = _mm_add_epi32(v[0], rnding);
 2656|  37.6k|    v[0] = _mm_srai_epi32(v[0], bit);
 2657|       |
 2658|  37.6k|    v[1] = _mm_sub_epi32(x, y);
 2659|  37.6k|    v[1] = _mm_add_epi32(v[1], rnding);
 2660|  37.6k|    v[1] = _mm_srai_epi32(v[1], bit);
 2661|       |
 2662|  37.6k|    v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
 2663|  37.6k|    v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
 2664|  37.6k|    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
 2665|  37.6k|    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
 2666|  37.6k|    v[8] = u[8];
 2667|  37.6k|    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
 2668|  37.6k|    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
 2669|  37.6k|    v[11] = u[11];
 2670|  37.6k|    v[12] = u[12];
 2671|  37.6k|    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
 2672|  37.6k|    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
 2673|  37.6k|    v[15] = u[15];
 2674|       |
 2675|       |    // stage 5
 2676|  37.6k|    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
 2677|  37.6k|    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
 2678|  37.6k|    u[4] = v[4];
 2679|       |
 2680|  37.6k|    x = _mm_mullo_epi32(v[5], cospi32);
 2681|  37.6k|    y = _mm_mullo_epi32(v[6], cospi32);
 2682|  37.6k|    u[5] = _mm_sub_epi32(y, x);
 2683|  37.6k|    u[5] = _mm_add_epi32(u[5], rnding);
 2684|  37.6k|    u[5] = _mm_srai_epi32(u[5], bit);
 2685|       |
 2686|  37.6k|    u[6] = _mm_add_epi32(y, x);
 2687|  37.6k|    u[6] = _mm_add_epi32(u[6], rnding);
 2688|  37.6k|    u[6] = _mm_srai_epi32(u[6], bit);
 2689|       |
 2690|  37.6k|    u[7] = v[7];
 2691|  37.6k|    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
 2692|  37.6k|    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
 2693|  37.6k|    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
 2694|  37.6k|    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
 2695|       |
 2696|       |    // stage 6
 2697|  37.6k|    addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
 2698|  37.6k|    addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
 2699|  37.6k|    addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
 2700|  37.6k|    addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
 2701|  37.6k|    v[8] = u[8];
 2702|  37.6k|    v[9] = u[9];
 2703|       |
 2704|  37.6k|    x = _mm_mullo_epi32(u[10], cospi32);
 2705|  37.6k|    y = _mm_mullo_epi32(u[13], cospi32);
 2706|  37.6k|    v[10] = _mm_sub_epi32(y, x);
 2707|  37.6k|    v[10] = _mm_add_epi32(v[10], rnding);
 2708|  37.6k|    v[10] = _mm_srai_epi32(v[10], bit);
 2709|       |
 2710|  37.6k|    v[13] = _mm_add_epi32(x, y);
 2711|  37.6k|    v[13] = _mm_add_epi32(v[13], rnding);
 2712|  37.6k|    v[13] = _mm_srai_epi32(v[13], bit);
 2713|       |
 2714|  37.6k|    x = _mm_mullo_epi32(u[11], cospi32);
 2715|  37.6k|    y = _mm_mullo_epi32(u[12], cospi32);
 2716|  37.6k|    v[11] = _mm_sub_epi32(y, x);
 2717|  37.6k|    v[11] = _mm_add_epi32(v[11], rnding);
 2718|  37.6k|    v[11] = _mm_srai_epi32(v[11], bit);
 2719|       |
 2720|  37.6k|    v[12] = _mm_add_epi32(x, y);
 2721|  37.6k|    v[12] = _mm_add_epi32(v[12], rnding);
 2722|  37.6k|    v[12] = _mm_srai_epi32(v[12], bit);
 2723|       |
 2724|  37.6k|    v[14] = u[14];
 2725|  37.6k|    v[15] = u[15];
 2726|       |
 2727|       |    // stage 7
 2728|  37.6k|    addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
 2729|  37.6k|    addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
 2730|  37.6k|    addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
 2731|  37.6k|    addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
 2732|  37.6k|    addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
 2733|  37.6k|    addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
 2734|  37.6k|    addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
 2735|  37.6k|    addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
 2736|       |
 2737|  37.6k|    if (!do_cols) {
  ------------------
  |  Branch (2737:9): [True: 25.7k, False: 11.9k]
  ------------------
 2738|  25.7k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  25.7k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 25.7k]
  |  |  ------------------
  ------------------
 2739|  25.7k|      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 2740|  25.7k|      const __m128i clamp_hi_out =
 2741|  25.7k|          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 2742|  25.7k|      round_shift_8x8(out, out_shift);
 2743|  25.7k|      highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
 2744|  25.7k|    }
 2745|  37.6k|  }
 2746|  37.6k|}
highbd_inv_txfm_sse4.c:iadst16x16_low1_sse4_1:
 2078|      8|                                   int do_cols, int bd, int out_shift) {
 2079|      8|  const int32_t *cospi = cospi_arr(bit);
 2080|      8|  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
 2081|      8|  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
 2082|      8|  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
 2083|      8|  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
 2084|      8|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 2085|      8|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 2086|      8|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 2087|      8|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 2088|      8|  const __m128i zero = _mm_setzero_si128();
 2089|      8|  __m128i v[16], x, y, temp1, temp2;
 2090|       |  // stage 0
 2091|       |  // stage 1
 2092|       |  // stage 2
 2093|      8|  x = _mm_mullo_epi32(in[0], cospi62);
 2094|      8|  v[0] = _mm_add_epi32(x, rnding);
 2095|      8|  v[0] = _mm_srai_epi32(v[0], bit);
 2096|       |
 2097|      8|  x = _mm_mullo_epi32(in[0], cospi2);
 2098|      8|  v[1] = _mm_sub_epi32(zero, x);
 2099|      8|  v[1] = _mm_add_epi32(v[1], rnding);
 2100|      8|  v[1] = _mm_srai_epi32(v[1], bit);
 2101|       |
 2102|       |  // stage 3
 2103|      8|  v[8] = v[0];
 2104|      8|  v[9] = v[1];
 2105|       |
 2106|       |  // stage 4
 2107|      8|  temp1 = _mm_mullo_epi32(v[8], cospi8);
 2108|      8|  x = _mm_mullo_epi32(v[9], cospi56);
 2109|      8|  temp1 = _mm_add_epi32(temp1, x);
 2110|      8|  temp1 = _mm_add_epi32(temp1, rnding);
 2111|      8|  temp1 = _mm_srai_epi32(temp1, bit);
 2112|       |
 2113|      8|  temp2 = _mm_mullo_epi32(v[8], cospi56);
 2114|      8|  x = _mm_mullo_epi32(v[9], cospi8);
 2115|      8|  temp2 = _mm_sub_epi32(temp2, x);
 2116|      8|  temp2 = _mm_add_epi32(temp2, rnding);
 2117|      8|  temp2 = _mm_srai_epi32(temp2, bit);
 2118|      8|  v[8] = temp1;
 2119|      8|  v[9] = temp2;
 2120|       |
 2121|       |  // stage 5
 2122|      8|  v[4] = v[0];
 2123|      8|  v[5] = v[1];
 2124|      8|  v[12] = v[8];
 2125|      8|  v[13] = v[9];
 2126|       |
 2127|       |  // stage 6
 2128|      8|  temp1 = _mm_mullo_epi32(v[4], cospi16);
 2129|      8|  x = _mm_mullo_epi32(v[5], cospi48);
 2130|      8|  temp1 = _mm_add_epi32(temp1, x);
 2131|      8|  temp1 = _mm_add_epi32(temp1, rnding);
 2132|      8|  temp1 = _mm_srai_epi32(temp1, bit);
 2133|       |
 2134|      8|  temp2 = _mm_mullo_epi32(v[4], cospi48);
 2135|      8|  x = _mm_mullo_epi32(v[5], cospi16);
 2136|      8|  temp2 = _mm_sub_epi32(temp2, x);
 2137|      8|  temp2 = _mm_add_epi32(temp2, rnding);
 2138|      8|  temp2 = _mm_srai_epi32(temp2, bit);
 2139|      8|  v[4] = temp1;
 2140|      8|  v[5] = temp2;
 2141|       |
 2142|      8|  temp1 = _mm_mullo_epi32(v[12], cospi16);
 2143|      8|  x = _mm_mullo_epi32(v[13], cospi48);
 2144|      8|  temp1 = _mm_add_epi32(temp1, x);
 2145|      8|  temp1 = _mm_add_epi32(temp1, rnding);
 2146|      8|  temp1 = _mm_srai_epi32(temp1, bit);
 2147|       |
 2148|      8|  temp2 = _mm_mullo_epi32(v[12], cospi48);
 2149|      8|  x = _mm_mullo_epi32(v[13], cospi16);
 2150|      8|  temp2 = _mm_sub_epi32(temp2, x);
 2151|      8|  temp2 = _mm_add_epi32(temp2, rnding);
 2152|      8|  temp2 = _mm_srai_epi32(temp2, bit);
 2153|      8|  v[12] = temp1;
 2154|      8|  v[13] = temp2;
 2155|       |
 2156|       |  // stage 7
 2157|      8|  v[2] = v[0];
 2158|      8|  v[3] = v[1];
 2159|      8|  v[6] = v[4];
 2160|      8|  v[7] = v[5];
 2161|      8|  v[10] = v[8];
 2162|      8|  v[11] = v[9];
 2163|      8|  v[14] = v[12];
 2164|      8|  v[15] = v[13];
 2165|       |
 2166|       |  // stage 8
 2167|      8|  y = _mm_mullo_epi32(v[2], cospi32);
 2168|      8|  x = _mm_mullo_epi32(v[3], cospi32);
 2169|      8|  v[2] = _mm_add_epi32(y, x);
 2170|      8|  v[2] = _mm_add_epi32(v[2], rnding);
 2171|      8|  v[2] = _mm_srai_epi32(v[2], bit);
 2172|       |
 2173|      8|  v[3] = _mm_sub_epi32(y, x);
 2174|      8|  v[3] = _mm_add_epi32(v[3], rnding);
 2175|      8|  v[3] = _mm_srai_epi32(v[3], bit);
 2176|       |
 2177|      8|  y = _mm_mullo_epi32(v[6], cospi32);
 2178|      8|  x = _mm_mullo_epi32(v[7], cospi32);
 2179|      8|  v[6] = _mm_add_epi32(y, x);
 2180|      8|  v[6] = _mm_add_epi32(v[6], rnding);
 2181|      8|  v[6] = _mm_srai_epi32(v[6], bit);
 2182|       |
 2183|      8|  v[7] = _mm_sub_epi32(y, x);
 2184|      8|  v[7] = _mm_add_epi32(v[7], rnding);
 2185|      8|  v[7] = _mm_srai_epi32(v[7], bit);
 2186|       |
 2187|      8|  y = _mm_mullo_epi32(v[10], cospi32);
 2188|      8|  x = _mm_mullo_epi32(v[11], cospi32);
 2189|      8|  v[10] = _mm_add_epi32(y, x);
 2190|      8|  v[10] = _mm_add_epi32(v[10], rnding);
 2191|      8|  v[10] = _mm_srai_epi32(v[10], bit);
 2192|       |
 2193|      8|  v[11] = _mm_sub_epi32(y, x);
 2194|      8|  v[11] = _mm_add_epi32(v[11], rnding);
 2195|      8|  v[11] = _mm_srai_epi32(v[11], bit);
 2196|       |
 2197|      8|  y = _mm_mullo_epi32(v[14], cospi32);
 2198|      8|  x = _mm_mullo_epi32(v[15], cospi32);
 2199|      8|  v[14] = _mm_add_epi32(y, x);
 2200|      8|  v[14] = _mm_add_epi32(v[14], rnding);
 2201|      8|  v[14] = _mm_srai_epi32(v[14], bit);
 2202|       |
 2203|      8|  v[15] = _mm_sub_epi32(y, x);
 2204|      8|  v[15] = _mm_add_epi32(v[15], rnding);
 2205|      8|  v[15] = _mm_srai_epi32(v[15], bit);
 2206|       |
 2207|       |  // stage 9
 2208|      8|  if (do_cols) {
  ------------------
  |  Branch (2208:7): [True: 4, False: 4]
  ------------------
 2209|      4|    out[0] = v[0];
 2210|      4|    out[1] = _mm_sub_epi32(zero, v[8]);
 2211|      4|    out[2] = v[12];
 2212|      4|    out[3] = _mm_sub_epi32(zero, v[4]);
 2213|      4|    out[4] = v[6];
 2214|      4|    out[5] = _mm_sub_epi32(zero, v[14]);
 2215|      4|    out[6] = v[10];
 2216|      4|    out[7] = _mm_sub_epi32(zero, v[2]);
 2217|      4|    out[8] = v[3];
 2218|      4|    out[9] = _mm_sub_epi32(zero, v[11]);
 2219|      4|    out[10] = v[15];
 2220|      4|    out[11] = _mm_sub_epi32(zero, v[7]);
 2221|      4|    out[12] = v[5];
 2222|      4|    out[13] = _mm_sub_epi32(zero, v[13]);
 2223|      4|    out[14] = v[9];
 2224|      4|    out[15] = _mm_sub_epi32(zero, v[1]);
 2225|      4|  } else {
 2226|      4|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|      4|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 4]
  |  |  ------------------
  ------------------
 2227|      4|    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 2228|      4|    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 2229|       |
 2230|      4|    neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 2231|      4|                     out_shift);
 2232|      4|    neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
 2233|      4|                     &clamp_hi_out, out_shift);
 2234|      4|    neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
 2235|      4|                     &clamp_hi_out, out_shift);
 2236|      4|    neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
 2237|      4|                     &clamp_hi_out, out_shift);
 2238|      4|    neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
 2239|      4|                     &clamp_hi_out, out_shift);
 2240|      4|    neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
 2241|      4|                     &clamp_hi_out, out_shift);
 2242|      4|    neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
 2243|      4|                     &clamp_hi_out, out_shift);
 2244|      4|    neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
 2245|      4|                     &clamp_hi_out, out_shift);
 2246|      4|  }
 2247|      8|}
highbd_inv_txfm_sse4.c:iadst16x16_low8_sse4_1:
 2250|     64|                                   int do_cols, int bd, int out_shift) {
 2251|     64|  const int32_t *cospi = cospi_arr(bit);
 2252|     64|  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
 2253|     64|  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
 2254|     64|  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
 2255|     64|  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
 2256|     64|  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
 2257|     64|  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
 2258|     64|  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
 2259|     64|  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
 2260|     64|  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
 2261|     64|  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
 2262|     64|  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
 2263|     64|  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
 2264|     64|  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
 2265|     64|  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
 2266|     64|  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
 2267|     64|  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
 2268|     64|  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
 2269|     64|  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
 2270|     64|  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
 2271|     64|  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
 2272|     64|  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
 2273|     64|  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
 2274|     64|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 2275|     64|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 2276|     64|  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
 2277|     64|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 2278|     64|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 2279|     64|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|    128|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 64]
  |  |  |  Branch (35:31): [True: 16, False: 48]
  |  |  |  Branch (35:44): [True: 16, False: 48]
  |  |  ------------------
  ------------------
 2280|     64|  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 2281|     64|  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 2282|     64|  __m128i zero = _mm_setzero_si128();
 2283|     64|  __m128i u[16], x, y;
 2284|       |
 2285|       |  // stage 0
 2286|       |  // stage 1
 2287|       |  // stage 2
 2288|     64|  x = _mm_mullo_epi32(in[0], cospi62);
 2289|     64|  u[0] = _mm_add_epi32(x, rnding);
 2290|     64|  u[0] = _mm_srai_epi32(u[0], bit);
 2291|       |
 2292|     64|  x = _mm_mullo_epi32(in[0], cospi2);
 2293|     64|  u[1] = _mm_sub_epi32(zero, x);
 2294|     64|  u[1] = _mm_add_epi32(u[1], rnding);
 2295|     64|  u[1] = _mm_srai_epi32(u[1], bit);
 2296|       |
 2297|     64|  x = _mm_mullo_epi32(in[2], cospi54);
 2298|     64|  u[2] = _mm_add_epi32(x, rnding);
 2299|     64|  u[2] = _mm_srai_epi32(u[2], bit);
 2300|       |
 2301|     64|  x = _mm_mullo_epi32(in[2], cospi10);
 2302|     64|  u[3] = _mm_sub_epi32(zero, x);
 2303|     64|  u[3] = _mm_add_epi32(u[3], rnding);
 2304|     64|  u[3] = _mm_srai_epi32(u[3], bit);
 2305|       |
 2306|     64|  x = _mm_mullo_epi32(in[4], cospi46);
 2307|     64|  u[4] = _mm_add_epi32(x, rnding);
 2308|     64|  u[4] = _mm_srai_epi32(u[4], bit);
 2309|       |
 2310|     64|  x = _mm_mullo_epi32(in[4], cospi18);
 2311|     64|  u[5] = _mm_sub_epi32(zero, x);
 2312|     64|  u[5] = _mm_add_epi32(u[5], rnding);
 2313|     64|  u[5] = _mm_srai_epi32(u[5], bit);
 2314|       |
 2315|     64|  x = _mm_mullo_epi32(in[6], cospi38);
 2316|     64|  u[6] = _mm_add_epi32(x, rnding);
 2317|     64|  u[6] = _mm_srai_epi32(u[6], bit);
 2318|       |
 2319|     64|  x = _mm_mullo_epi32(in[6], cospi26);
 2320|     64|  u[7] = _mm_sub_epi32(zero, x);
 2321|     64|  u[7] = _mm_add_epi32(u[7], rnding);
 2322|     64|  u[7] = _mm_srai_epi32(u[7], bit);
 2323|       |
 2324|     64|  u[8] = _mm_mullo_epi32(in[7], cospi34);
 2325|     64|  u[8] = _mm_add_epi32(u[8], rnding);
 2326|     64|  u[8] = _mm_srai_epi32(u[8], bit);
 2327|       |
 2328|     64|  u[9] = _mm_mullo_epi32(in[7], cospi30);
 2329|     64|  u[9] = _mm_add_epi32(u[9], rnding);
 2330|     64|  u[9] = _mm_srai_epi32(u[9], bit);
 2331|       |
 2332|     64|  u[10] = _mm_mullo_epi32(in[5], cospi42);
 2333|     64|  u[10] = _mm_add_epi32(u[10], rnding);
 2334|     64|  u[10] = _mm_srai_epi32(u[10], bit);
 2335|       |
 2336|     64|  u[11] = _mm_mullo_epi32(in[5], cospi22);
 2337|     64|  u[11] = _mm_add_epi32(u[11], rnding);
 2338|     64|  u[11] = _mm_srai_epi32(u[11], bit);
 2339|       |
 2340|     64|  u[12] = _mm_mullo_epi32(in[3], cospi50);
 2341|     64|  u[12] = _mm_add_epi32(u[12], rnding);
 2342|     64|  u[12] = _mm_srai_epi32(u[12], bit);
 2343|       |
 2344|     64|  u[13] = _mm_mullo_epi32(in[3], cospi14);
 2345|     64|  u[13] = _mm_add_epi32(u[13], rnding);
 2346|     64|  u[13] = _mm_srai_epi32(u[13], bit);
 2347|       |
 2348|     64|  u[14] = _mm_mullo_epi32(in[1], cospi58);
 2349|     64|  u[14] = _mm_add_epi32(u[14], rnding);
 2350|     64|  u[14] = _mm_srai_epi32(u[14], bit);
 2351|       |
 2352|     64|  u[15] = _mm_mullo_epi32(in[1], cospi6);
 2353|     64|  u[15] = _mm_add_epi32(u[15], rnding);
 2354|     64|  u[15] = _mm_srai_epi32(u[15], bit);
 2355|       |
 2356|       |  // stage 3
 2357|     64|  addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
 2358|     64|  addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
 2359|     64|  addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
 2360|     64|  addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
 2361|     64|  addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
 2362|     64|  addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
 2363|     64|  addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
 2364|     64|  addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
 2365|       |
 2366|       |  // stage 4
 2367|     64|  y = _mm_mullo_epi32(u[8], cospi56);
 2368|     64|  x = _mm_mullo_epi32(u[9], cospi56);
 2369|     64|  u[8] = _mm_mullo_epi32(u[8], cospi8);
 2370|     64|  u[8] = _mm_add_epi32(u[8], x);
 2371|     64|  u[8] = _mm_add_epi32(u[8], rnding);
 2372|     64|  u[8] = _mm_srai_epi32(u[8], bit);
 2373|       |
 2374|     64|  x = _mm_mullo_epi32(u[9], cospi8);
 2375|     64|  u[9] = _mm_sub_epi32(y, x);
 2376|     64|  u[9] = _mm_add_epi32(u[9], rnding);
 2377|     64|  u[9] = _mm_srai_epi32(u[9], bit);
 2378|       |
 2379|     64|  x = _mm_mullo_epi32(u[11], cospi24);
 2380|     64|  y = _mm_mullo_epi32(u[10], cospi24);
 2381|     64|  u[10] = _mm_mullo_epi32(u[10], cospi40);
 2382|     64|  u[10] = _mm_add_epi32(u[10], x);
 2383|     64|  u[10] = _mm_add_epi32(u[10], rnding);
 2384|     64|  u[10] = _mm_srai_epi32(u[10], bit);
 2385|       |
 2386|     64|  x = _mm_mullo_epi32(u[11], cospi40);
 2387|     64|  u[11] = _mm_sub_epi32(y, x);
 2388|     64|  u[11] = _mm_add_epi32(u[11], rnding);
 2389|     64|  u[11] = _mm_srai_epi32(u[11], bit);
 2390|       |
 2391|     64|  x = _mm_mullo_epi32(u[13], cospi8);
 2392|     64|  y = _mm_mullo_epi32(u[12], cospi8);
 2393|     64|  u[12] = _mm_mullo_epi32(u[12], cospim56);
 2394|     64|  u[12] = _mm_add_epi32(u[12], x);
 2395|     64|  u[12] = _mm_add_epi32(u[12], rnding);
 2396|     64|  u[12] = _mm_srai_epi32(u[12], bit);
 2397|       |
 2398|     64|  x = _mm_mullo_epi32(u[13], cospim56);
 2399|     64|  u[13] = _mm_sub_epi32(y, x);
 2400|     64|  u[13] = _mm_add_epi32(u[13], rnding);
 2401|     64|  u[13] = _mm_srai_epi32(u[13], bit);
 2402|       |
 2403|     64|  x = _mm_mullo_epi32(u[15], cospi40);
 2404|     64|  y = _mm_mullo_epi32(u[14], cospi40);
 2405|     64|  u[14] = _mm_mullo_epi32(u[14], cospim24);
 2406|     64|  u[14] = _mm_add_epi32(u[14], x);
 2407|     64|  u[14] = _mm_add_epi32(u[14], rnding);
 2408|     64|  u[14] = _mm_srai_epi32(u[14], bit);
 2409|       |
 2410|     64|  x = _mm_mullo_epi32(u[15], cospim24);
 2411|     64|  u[15] = _mm_sub_epi32(y, x);
 2412|     64|  u[15] = _mm_add_epi32(u[15], rnding);
 2413|     64|  u[15] = _mm_srai_epi32(u[15], bit);
 2414|       |
 2415|       |  // stage 5
 2416|     64|  addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
 2417|     64|  addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
 2418|     64|  addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
 2419|     64|  addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
 2420|     64|  addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
 2421|     64|  addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
 2422|     64|  addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
 2423|     64|  addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
 2424|       |
 2425|       |  // stage 6
 2426|     64|  x = _mm_mullo_epi32(u[5], cospi48);
 2427|     64|  y = _mm_mullo_epi32(u[4], cospi48);
 2428|     64|  u[4] = _mm_mullo_epi32(u[4], cospi16);
 2429|     64|  u[4] = _mm_add_epi32(u[4], x);
 2430|     64|  u[4] = _mm_add_epi32(u[4], rnding);
 2431|     64|  u[4] = _mm_srai_epi32(u[4], bit);
 2432|       |
 2433|     64|  x = _mm_mullo_epi32(u[5], cospi16);
 2434|     64|  u[5] = _mm_sub_epi32(y, x);
 2435|     64|  u[5] = _mm_add_epi32(u[5], rnding);
 2436|     64|  u[5] = _mm_srai_epi32(u[5], bit);
 2437|       |
 2438|     64|  x = _mm_mullo_epi32(u[7], cospi16);
 2439|     64|  y = _mm_mullo_epi32(u[6], cospi16);
 2440|     64|  u[6] = _mm_mullo_epi32(u[6], cospim48);
 2441|     64|  u[6] = _mm_add_epi32(u[6], x);
 2442|     64|  u[6] = _mm_add_epi32(u[6], rnding);
 2443|     64|  u[6] = _mm_srai_epi32(u[6], bit);
 2444|       |
 2445|     64|  x = _mm_mullo_epi32(u[7], cospim48);
 2446|     64|  u[7] = _mm_sub_epi32(y, x);
 2447|     64|  u[7] = _mm_add_epi32(u[7], rnding);
 2448|     64|  u[7] = _mm_srai_epi32(u[7], bit);
 2449|       |
 2450|     64|  x = _mm_mullo_epi32(u[13], cospi48);
 2451|     64|  y = _mm_mullo_epi32(u[12], cospi48);
 2452|     64|  u[12] = _mm_mullo_epi32(u[12], cospi16);
 2453|     64|  u[12] = _mm_add_epi32(u[12], x);
 2454|     64|  u[12] = _mm_add_epi32(u[12], rnding);
 2455|     64|  u[12] = _mm_srai_epi32(u[12], bit);
 2456|       |
 2457|     64|  x = _mm_mullo_epi32(u[13], cospi16);
 2458|     64|  u[13] = _mm_sub_epi32(y, x);
 2459|     64|  u[13] = _mm_add_epi32(u[13], rnding);
 2460|     64|  u[13] = _mm_srai_epi32(u[13], bit);
 2461|       |
 2462|     64|  x = _mm_mullo_epi32(u[15], cospi16);
 2463|     64|  y = _mm_mullo_epi32(u[14], cospi16);
 2464|     64|  u[14] = _mm_mullo_epi32(u[14], cospim48);
 2465|     64|  u[14] = _mm_add_epi32(u[14], x);
 2466|     64|  u[14] = _mm_add_epi32(u[14], rnding);
 2467|     64|  u[14] = _mm_srai_epi32(u[14], bit);
 2468|       |
 2469|     64|  x = _mm_mullo_epi32(u[15], cospim48);
 2470|     64|  u[15] = _mm_sub_epi32(y, x);
 2471|     64|  u[15] = _mm_add_epi32(u[15], rnding);
 2472|     64|  u[15] = _mm_srai_epi32(u[15], bit);
 2473|       |
 2474|       |  // stage 7
 2475|     64|  addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
 2476|     64|  addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
 2477|     64|  addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
 2478|     64|  addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
 2479|     64|  addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
 2480|     64|  addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
 2481|     64|  addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
 2482|     64|  addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
 2483|       |
 2484|       |  // stage 8
 2485|     64|  y = _mm_mullo_epi32(u[2], cospi32);
 2486|     64|  x = _mm_mullo_epi32(u[3], cospi32);
 2487|     64|  u[2] = _mm_add_epi32(y, x);
 2488|     64|  u[2] = _mm_add_epi32(u[2], rnding);
 2489|     64|  u[2] = _mm_srai_epi32(u[2], bit);
 2490|       |
 2491|     64|  u[3] = _mm_sub_epi32(y, x);
 2492|     64|  u[3] = _mm_add_epi32(u[3], rnding);
 2493|     64|  u[3] = _mm_srai_epi32(u[3], bit);
 2494|     64|  y = _mm_mullo_epi32(u[6], cospi32);
 2495|     64|  x = _mm_mullo_epi32(u[7], cospi32);
 2496|     64|  u[6] = _mm_add_epi32(y, x);
 2497|     64|  u[6] = _mm_add_epi32(u[6], rnding);
 2498|     64|  u[6] = _mm_srai_epi32(u[6], bit);
 2499|       |
 2500|     64|  u[7] = _mm_sub_epi32(y, x);
 2501|     64|  u[7] = _mm_add_epi32(u[7], rnding);
 2502|     64|  u[7] = _mm_srai_epi32(u[7], bit);
 2503|       |
 2504|     64|  y = _mm_mullo_epi32(u[10], cospi32);
 2505|     64|  x = _mm_mullo_epi32(u[11], cospi32);
 2506|     64|  u[10] = _mm_add_epi32(y, x);
 2507|     64|  u[10] = _mm_add_epi32(u[10], rnding);
 2508|     64|  u[10] = _mm_srai_epi32(u[10], bit);
 2509|       |
 2510|     64|  u[11] = _mm_sub_epi32(y, x);
 2511|     64|  u[11] = _mm_add_epi32(u[11], rnding);
 2512|     64|  u[11] = _mm_srai_epi32(u[11], bit);
 2513|       |
 2514|     64|  y = _mm_mullo_epi32(u[14], cospi32);
 2515|     64|  x = _mm_mullo_epi32(u[15], cospi32);
 2516|     64|  u[14] = _mm_add_epi32(y, x);
 2517|     64|  u[14] = _mm_add_epi32(u[14], rnding);
 2518|     64|  u[14] = _mm_srai_epi32(u[14], bit);
 2519|       |
 2520|     64|  u[15] = _mm_sub_epi32(y, x);
 2521|     64|  u[15] = _mm_add_epi32(u[15], rnding);
 2522|     64|  u[15] = _mm_srai_epi32(u[15], bit);
 2523|       |
 2524|       |  // stage 9
 2525|     64|  if (do_cols) {
  ------------------
  |  Branch (2525:7): [True: 16, False: 48]
  ------------------
 2526|     16|    out[0] = u[0];
 2527|     16|    out[1] = _mm_sub_epi32(zero, u[8]);
 2528|     16|    out[2] = u[12];
 2529|     16|    out[3] = _mm_sub_epi32(zero, u[4]);
 2530|     16|    out[4] = u[6];
 2531|     16|    out[5] = _mm_sub_epi32(zero, u[14]);
 2532|     16|    out[6] = u[10];
 2533|     16|    out[7] = _mm_sub_epi32(zero, u[2]);
 2534|     16|    out[8] = u[3];
 2535|     16|    out[9] = _mm_sub_epi32(zero, u[11]);
 2536|     16|    out[10] = u[15];
 2537|     16|    out[11] = _mm_sub_epi32(zero, u[7]);
 2538|     16|    out[12] = u[5];
 2539|     16|    out[13] = _mm_sub_epi32(zero, u[13]);
 2540|     16|    out[14] = u[9];
 2541|     16|    out[15] = _mm_sub_epi32(zero, u[1]);
 2542|     48|  } else {
 2543|     48|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|     48|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 48]
  |  |  ------------------
  ------------------
 2544|     48|    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 2545|     48|    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 2546|       |
 2547|     48|    neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 2548|     48|                     out_shift);
 2549|     48|    neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
 2550|     48|                     &clamp_hi_out, out_shift);
 2551|     48|    neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
 2552|     48|                     &clamp_hi_out, out_shift);
 2553|     48|    neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
 2554|     48|                     &clamp_hi_out, out_shift);
 2555|     48|    neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
 2556|     48|                     &clamp_hi_out, out_shift);
 2557|     48|    neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
 2558|     48|                     &clamp_hi_out, out_shift);
 2559|     48|    neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
 2560|     48|                     &clamp_hi_out, out_shift);
 2561|     48|    neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
 2562|     48|                     &clamp_hi_out, out_shift);
 2563|     48|  }
 2564|     64|}
highbd_inv_txfm_sse4.c:iadst16x16_sse4_1:
 2749|  27.6k|                              int bd, int out_shift) {
 2750|  27.6k|  const int32_t *cospi = cospi_arr(bit);
 2751|  27.6k|  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
 2752|  27.6k|  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
 2753|  27.6k|  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
 2754|  27.6k|  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
 2755|  27.6k|  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
 2756|  27.6k|  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
 2757|  27.6k|  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
 2758|  27.6k|  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
 2759|  27.6k|  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
 2760|  27.6k|  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
 2761|  27.6k|  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
 2762|  27.6k|  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
 2763|  27.6k|  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
 2764|  27.6k|  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
 2765|  27.6k|  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
 2766|  27.6k|  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
 2767|  27.6k|  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
 2768|  27.6k|  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
 2769|  27.6k|  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
 2770|  27.6k|  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
 2771|  27.6k|  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
 2772|  27.6k|  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
 2773|  27.6k|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 2774|  27.6k|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 2775|  27.6k|  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
 2776|  27.6k|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 2777|  27.6k|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 2778|  27.6k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  55.3k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 27.6k]
  |  |  |  Branch (35:31): [True: 8.03k, False: 19.6k]
  |  |  |  Branch (35:44): [True: 8.03k, False: 19.6k]
  |  |  ------------------
  ------------------
 2779|  27.6k|  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 2780|  27.6k|  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 2781|  27.6k|  const __m128i zero = _mm_setzero_si128();
 2782|  27.6k|  __m128i u[16], v[16], x, y;
 2783|       |  // Calculate the column 0, 1, 2, 3
 2784|       |  // stage 0
 2785|       |  // stage 1
 2786|       |  // stage 2
 2787|  27.6k|  v[0] = _mm_mullo_epi32(in[15], cospi2);
 2788|  27.6k|  x = _mm_mullo_epi32(in[0], cospi62);
 2789|  27.6k|  v[0] = _mm_add_epi32(v[0], x);
 2790|  27.6k|  v[0] = _mm_add_epi32(v[0], rnding);
 2791|  27.6k|  v[0] = _mm_srai_epi32(v[0], bit);
 2792|       |
 2793|  27.6k|  v[1] = _mm_mullo_epi32(in[15], cospi62);
 2794|  27.6k|  x = _mm_mullo_epi32(in[0], cospi2);
 2795|  27.6k|  v[1] = _mm_sub_epi32(v[1], x);
 2796|  27.6k|  v[1] = _mm_add_epi32(v[1], rnding);
 2797|  27.6k|  v[1] = _mm_srai_epi32(v[1], bit);
 2798|       |
 2799|  27.6k|  v[2] = _mm_mullo_epi32(in[13], cospi10);
 2800|  27.6k|  x = _mm_mullo_epi32(in[2], cospi54);
 2801|  27.6k|  v[2] = _mm_add_epi32(v[2], x);
 2802|  27.6k|  v[2] = _mm_add_epi32(v[2], rnding);
 2803|  27.6k|  v[2] = _mm_srai_epi32(v[2], bit);
 2804|       |
 2805|  27.6k|  v[3] = _mm_mullo_epi32(in[13], cospi54);
 2806|  27.6k|  x = _mm_mullo_epi32(in[2], cospi10);
 2807|  27.6k|  v[3] = _mm_sub_epi32(v[3], x);
 2808|  27.6k|  v[3] = _mm_add_epi32(v[3], rnding);
 2809|  27.6k|  v[3] = _mm_srai_epi32(v[3], bit);
 2810|       |
 2811|  27.6k|  v[4] = _mm_mullo_epi32(in[11], cospi18);
 2812|  27.6k|  x = _mm_mullo_epi32(in[4], cospi46);
 2813|  27.6k|  v[4] = _mm_add_epi32(v[4], x);
 2814|  27.6k|  v[4] = _mm_add_epi32(v[4], rnding);
 2815|  27.6k|  v[4] = _mm_srai_epi32(v[4], bit);
 2816|       |
 2817|  27.6k|  v[5] = _mm_mullo_epi32(in[11], cospi46);
 2818|  27.6k|  x = _mm_mullo_epi32(in[4], cospi18);
 2819|  27.6k|  v[5] = _mm_sub_epi32(v[5], x);
 2820|  27.6k|  v[5] = _mm_add_epi32(v[5], rnding);
 2821|  27.6k|  v[5] = _mm_srai_epi32(v[5], bit);
 2822|       |
 2823|  27.6k|  v[6] = _mm_mullo_epi32(in[9], cospi26);
 2824|  27.6k|  x = _mm_mullo_epi32(in[6], cospi38);
 2825|  27.6k|  v[6] = _mm_add_epi32(v[6], x);
 2826|  27.6k|  v[6] = _mm_add_epi32(v[6], rnding);
 2827|  27.6k|  v[6] = _mm_srai_epi32(v[6], bit);
 2828|       |
 2829|  27.6k|  v[7] = _mm_mullo_epi32(in[9], cospi38);
 2830|  27.6k|  x = _mm_mullo_epi32(in[6], cospi26);
 2831|  27.6k|  v[7] = _mm_sub_epi32(v[7], x);
 2832|  27.6k|  v[7] = _mm_add_epi32(v[7], rnding);
 2833|  27.6k|  v[7] = _mm_srai_epi32(v[7], bit);
 2834|       |
 2835|  27.6k|  v[8] = _mm_mullo_epi32(in[7], cospi34);
 2836|  27.6k|  x = _mm_mullo_epi32(in[8], cospi30);
 2837|  27.6k|  v[8] = _mm_add_epi32(v[8], x);
 2838|  27.6k|  v[8] = _mm_add_epi32(v[8], rnding);
 2839|  27.6k|  v[8] = _mm_srai_epi32(v[8], bit);
 2840|       |
 2841|  27.6k|  v[9] = _mm_mullo_epi32(in[7], cospi30);
 2842|  27.6k|  x = _mm_mullo_epi32(in[8], cospi34);
 2843|  27.6k|  v[9] = _mm_sub_epi32(v[9], x);
 2844|  27.6k|  v[9] = _mm_add_epi32(v[9], rnding);
 2845|  27.6k|  v[9] = _mm_srai_epi32(v[9], bit);
 2846|       |
 2847|  27.6k|  v[10] = _mm_mullo_epi32(in[5], cospi42);
 2848|  27.6k|  x = _mm_mullo_epi32(in[10], cospi22);
 2849|  27.6k|  v[10] = _mm_add_epi32(v[10], x);
 2850|  27.6k|  v[10] = _mm_add_epi32(v[10], rnding);
 2851|  27.6k|  v[10] = _mm_srai_epi32(v[10], bit);
 2852|       |
 2853|  27.6k|  v[11] = _mm_mullo_epi32(in[5], cospi22);
 2854|  27.6k|  x = _mm_mullo_epi32(in[10], cospi42);
 2855|  27.6k|  v[11] = _mm_sub_epi32(v[11], x);
 2856|  27.6k|  v[11] = _mm_add_epi32(v[11], rnding);
 2857|  27.6k|  v[11] = _mm_srai_epi32(v[11], bit);
 2858|       |
 2859|  27.6k|  v[12] = _mm_mullo_epi32(in[3], cospi50);
 2860|  27.6k|  x = _mm_mullo_epi32(in[12], cospi14);
 2861|  27.6k|  v[12] = _mm_add_epi32(v[12], x);
 2862|  27.6k|  v[12] = _mm_add_epi32(v[12], rnding);
 2863|  27.6k|  v[12] = _mm_srai_epi32(v[12], bit);
 2864|       |
 2865|  27.6k|  v[13] = _mm_mullo_epi32(in[3], cospi14);
 2866|  27.6k|  x = _mm_mullo_epi32(in[12], cospi50);
 2867|  27.6k|  v[13] = _mm_sub_epi32(v[13], x);
 2868|  27.6k|  v[13] = _mm_add_epi32(v[13], rnding);
 2869|  27.6k|  v[13] = _mm_srai_epi32(v[13], bit);
 2870|       |
 2871|  27.6k|  v[14] = _mm_mullo_epi32(in[1], cospi58);
 2872|  27.6k|  x = _mm_mullo_epi32(in[14], cospi6);
 2873|  27.6k|  v[14] = _mm_add_epi32(v[14], x);
 2874|  27.6k|  v[14] = _mm_add_epi32(v[14], rnding);
 2875|  27.6k|  v[14] = _mm_srai_epi32(v[14], bit);
 2876|       |
 2877|  27.6k|  v[15] = _mm_mullo_epi32(in[1], cospi6);
 2878|  27.6k|  x = _mm_mullo_epi32(in[14], cospi58);
 2879|  27.6k|  v[15] = _mm_sub_epi32(v[15], x);
 2880|  27.6k|  v[15] = _mm_add_epi32(v[15], rnding);
 2881|  27.6k|  v[15] = _mm_srai_epi32(v[15], bit);
 2882|       |
 2883|       |  // stage 3
 2884|  27.6k|  addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
 2885|  27.6k|  addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
 2886|  27.6k|  addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
 2887|  27.6k|  addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
 2888|  27.6k|  addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
 2889|  27.6k|  addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
 2890|  27.6k|  addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
 2891|  27.6k|  addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
 2892|       |
 2893|       |  // stage 4
 2894|  27.6k|  v[0] = u[0];
 2895|  27.6k|  v[1] = u[1];
 2896|  27.6k|  v[2] = u[2];
 2897|  27.6k|  v[3] = u[3];
 2898|  27.6k|  v[4] = u[4];
 2899|  27.6k|  v[5] = u[5];
 2900|  27.6k|  v[6] = u[6];
 2901|  27.6k|  v[7] = u[7];
 2902|       |
 2903|  27.6k|  v[8] = _mm_mullo_epi32(u[8], cospi8);
 2904|  27.6k|  x = _mm_mullo_epi32(u[9], cospi56);
 2905|  27.6k|  v[8] = _mm_add_epi32(v[8], x);
 2906|  27.6k|  v[8] = _mm_add_epi32(v[8], rnding);
 2907|  27.6k|  v[8] = _mm_srai_epi32(v[8], bit);
 2908|       |
 2909|  27.6k|  v[9] = _mm_mullo_epi32(u[8], cospi56);
 2910|  27.6k|  x = _mm_mullo_epi32(u[9], cospi8);
 2911|  27.6k|  v[9] = _mm_sub_epi32(v[9], x);
 2912|  27.6k|  v[9] = _mm_add_epi32(v[9], rnding);
 2913|  27.6k|  v[9] = _mm_srai_epi32(v[9], bit);
 2914|       |
 2915|  27.6k|  v[10] = _mm_mullo_epi32(u[10], cospi40);
 2916|  27.6k|  x = _mm_mullo_epi32(u[11], cospi24);
 2917|  27.6k|  v[10] = _mm_add_epi32(v[10], x);
 2918|  27.6k|  v[10] = _mm_add_epi32(v[10], rnding);
 2919|  27.6k|  v[10] = _mm_srai_epi32(v[10], bit);
 2920|       |
 2921|  27.6k|  v[11] = _mm_mullo_epi32(u[10], cospi24);
 2922|  27.6k|  x = _mm_mullo_epi32(u[11], cospi40);
 2923|  27.6k|  v[11] = _mm_sub_epi32(v[11], x);
 2924|  27.6k|  v[11] = _mm_add_epi32(v[11], rnding);
 2925|  27.6k|  v[11] = _mm_srai_epi32(v[11], bit);
 2926|       |
 2927|  27.6k|  v[12] = _mm_mullo_epi32(u[12], cospim56);
 2928|  27.6k|  x = _mm_mullo_epi32(u[13], cospi8);
 2929|  27.6k|  v[12] = _mm_add_epi32(v[12], x);
 2930|  27.6k|  v[12] = _mm_add_epi32(v[12], rnding);
 2931|  27.6k|  v[12] = _mm_srai_epi32(v[12], bit);
 2932|       |
 2933|  27.6k|  v[13] = _mm_mullo_epi32(u[12], cospi8);
 2934|  27.6k|  x = _mm_mullo_epi32(u[13], cospim56);
 2935|  27.6k|  v[13] = _mm_sub_epi32(v[13], x);
 2936|  27.6k|  v[13] = _mm_add_epi32(v[13], rnding);
 2937|  27.6k|  v[13] = _mm_srai_epi32(v[13], bit);
 2938|       |
 2939|  27.6k|  v[14] = _mm_mullo_epi32(u[14], cospim24);
 2940|  27.6k|  x = _mm_mullo_epi32(u[15], cospi40);
 2941|  27.6k|  v[14] = _mm_add_epi32(v[14], x);
 2942|  27.6k|  v[14] = _mm_add_epi32(v[14], rnding);
 2943|  27.6k|  v[14] = _mm_srai_epi32(v[14], bit);
 2944|       |
 2945|  27.6k|  v[15] = _mm_mullo_epi32(u[14], cospi40);
 2946|  27.6k|  x = _mm_mullo_epi32(u[15], cospim24);
 2947|  27.6k|  v[15] = _mm_sub_epi32(v[15], x);
 2948|  27.6k|  v[15] = _mm_add_epi32(v[15], rnding);
 2949|  27.6k|  v[15] = _mm_srai_epi32(v[15], bit);
 2950|       |
 2951|       |  // stage 5
 2952|  27.6k|  addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
 2953|  27.6k|  addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
 2954|  27.6k|  addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
 2955|  27.6k|  addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
 2956|  27.6k|  addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
 2957|  27.6k|  addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
 2958|  27.6k|  addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
 2959|  27.6k|  addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
 2960|       |
 2961|       |  // stage 6
 2962|  27.6k|  v[0] = u[0];
 2963|  27.6k|  v[1] = u[1];
 2964|  27.6k|  v[2] = u[2];
 2965|  27.6k|  v[3] = u[3];
 2966|       |
 2967|  27.6k|  v[4] = _mm_mullo_epi32(u[4], cospi16);
 2968|  27.6k|  x = _mm_mullo_epi32(u[5], cospi48);
 2969|  27.6k|  v[4] = _mm_add_epi32(v[4], x);
 2970|  27.6k|  v[4] = _mm_add_epi32(v[4], rnding);
 2971|  27.6k|  v[4] = _mm_srai_epi32(v[4], bit);
 2972|       |
 2973|  27.6k|  v[5] = _mm_mullo_epi32(u[4], cospi48);
 2974|  27.6k|  x = _mm_mullo_epi32(u[5], cospi16);
 2975|  27.6k|  v[5] = _mm_sub_epi32(v[5], x);
 2976|  27.6k|  v[5] = _mm_add_epi32(v[5], rnding);
 2977|  27.6k|  v[5] = _mm_srai_epi32(v[5], bit);
 2978|       |
 2979|  27.6k|  v[6] = _mm_mullo_epi32(u[6], cospim48);
 2980|  27.6k|  x = _mm_mullo_epi32(u[7], cospi16);
 2981|  27.6k|  v[6] = _mm_add_epi32(v[6], x);
 2982|  27.6k|  v[6] = _mm_add_epi32(v[6], rnding);
 2983|  27.6k|  v[6] = _mm_srai_epi32(v[6], bit);
 2984|       |
 2985|  27.6k|  v[7] = _mm_mullo_epi32(u[6], cospi16);
 2986|  27.6k|  x = _mm_mullo_epi32(u[7], cospim48);
 2987|  27.6k|  v[7] = _mm_sub_epi32(v[7], x);
 2988|  27.6k|  v[7] = _mm_add_epi32(v[7], rnding);
 2989|  27.6k|  v[7] = _mm_srai_epi32(v[7], bit);
 2990|       |
 2991|  27.6k|  v[8] = u[8];
 2992|  27.6k|  v[9] = u[9];
 2993|  27.6k|  v[10] = u[10];
 2994|  27.6k|  v[11] = u[11];
 2995|       |
 2996|  27.6k|  v[12] = _mm_mullo_epi32(u[12], cospi16);
 2997|  27.6k|  x = _mm_mullo_epi32(u[13], cospi48);
 2998|  27.6k|  v[12] = _mm_add_epi32(v[12], x);
 2999|  27.6k|  v[12] = _mm_add_epi32(v[12], rnding);
 3000|  27.6k|  v[12] = _mm_srai_epi32(v[12], bit);
 3001|       |
 3002|  27.6k|  v[13] = _mm_mullo_epi32(u[12], cospi48);
 3003|  27.6k|  x = _mm_mullo_epi32(u[13], cospi16);
 3004|  27.6k|  v[13] = _mm_sub_epi32(v[13], x);
 3005|  27.6k|  v[13] = _mm_add_epi32(v[13], rnding);
 3006|  27.6k|  v[13] = _mm_srai_epi32(v[13], bit);
 3007|       |
 3008|  27.6k|  v[14] = _mm_mullo_epi32(u[14], cospim48);
 3009|  27.6k|  x = _mm_mullo_epi32(u[15], cospi16);
 3010|  27.6k|  v[14] = _mm_add_epi32(v[14], x);
 3011|  27.6k|  v[14] = _mm_add_epi32(v[14], rnding);
 3012|  27.6k|  v[14] = _mm_srai_epi32(v[14], bit);
 3013|       |
 3014|  27.6k|  v[15] = _mm_mullo_epi32(u[14], cospi16);
 3015|  27.6k|  x = _mm_mullo_epi32(u[15], cospim48);
 3016|  27.6k|  v[15] = _mm_sub_epi32(v[15], x);
 3017|  27.6k|  v[15] = _mm_add_epi32(v[15], rnding);
 3018|  27.6k|  v[15] = _mm_srai_epi32(v[15], bit);
 3019|       |
 3020|       |  // stage 7
 3021|  27.6k|  addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
 3022|  27.6k|  addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
 3023|  27.6k|  addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
 3024|  27.6k|  addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
 3025|  27.6k|  addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
 3026|  27.6k|  addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
 3027|  27.6k|  addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
 3028|  27.6k|  addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
 3029|       |
 3030|       |  // stage 8
 3031|  27.6k|  v[0] = u[0];
 3032|  27.6k|  v[1] = u[1];
 3033|       |
 3034|  27.6k|  y = _mm_mullo_epi32(u[2], cospi32);
 3035|  27.6k|  x = _mm_mullo_epi32(u[3], cospi32);
 3036|  27.6k|  v[2] = _mm_add_epi32(y, x);
 3037|  27.6k|  v[2] = _mm_add_epi32(v[2], rnding);
 3038|  27.6k|  v[2] = _mm_srai_epi32(v[2], bit);
 3039|       |
 3040|  27.6k|  v[3] = _mm_sub_epi32(y, x);
 3041|  27.6k|  v[3] = _mm_add_epi32(v[3], rnding);
 3042|  27.6k|  v[3] = _mm_srai_epi32(v[3], bit);
 3043|       |
 3044|  27.6k|  v[4] = u[4];
 3045|  27.6k|  v[5] = u[5];
 3046|       |
 3047|  27.6k|  y = _mm_mullo_epi32(u[6], cospi32);
 3048|  27.6k|  x = _mm_mullo_epi32(u[7], cospi32);
 3049|  27.6k|  v[6] = _mm_add_epi32(y, x);
 3050|  27.6k|  v[6] = _mm_add_epi32(v[6], rnding);
 3051|  27.6k|  v[6] = _mm_srai_epi32(v[6], bit);
 3052|       |
 3053|  27.6k|  v[7] = _mm_sub_epi32(y, x);
 3054|  27.6k|  v[7] = _mm_add_epi32(v[7], rnding);
 3055|  27.6k|  v[7] = _mm_srai_epi32(v[7], bit);
 3056|       |
 3057|  27.6k|  v[8] = u[8];
 3058|  27.6k|  v[9] = u[9];
 3059|       |
 3060|  27.6k|  y = _mm_mullo_epi32(u[10], cospi32);
 3061|  27.6k|  x = _mm_mullo_epi32(u[11], cospi32);
 3062|  27.6k|  v[10] = _mm_add_epi32(y, x);
 3063|  27.6k|  v[10] = _mm_add_epi32(v[10], rnding);
 3064|  27.6k|  v[10] = _mm_srai_epi32(v[10], bit);
 3065|       |
 3066|  27.6k|  v[11] = _mm_sub_epi32(y, x);
 3067|  27.6k|  v[11] = _mm_add_epi32(v[11], rnding);
 3068|  27.6k|  v[11] = _mm_srai_epi32(v[11], bit);
 3069|       |
 3070|  27.6k|  v[12] = u[12];
 3071|  27.6k|  v[13] = u[13];
 3072|       |
 3073|  27.6k|  y = _mm_mullo_epi32(u[14], cospi32);
 3074|  27.6k|  x = _mm_mullo_epi32(u[15], cospi32);
 3075|  27.6k|  v[14] = _mm_add_epi32(y, x);
 3076|  27.6k|  v[14] = _mm_add_epi32(v[14], rnding);
 3077|  27.6k|  v[14] = _mm_srai_epi32(v[14], bit);
 3078|       |
 3079|  27.6k|  v[15] = _mm_sub_epi32(y, x);
 3080|  27.6k|  v[15] = _mm_add_epi32(v[15], rnding);
 3081|  27.6k|  v[15] = _mm_srai_epi32(v[15], bit);
 3082|       |
 3083|       |  // stage 9
 3084|  27.6k|  if (do_cols) {
  ------------------
  |  Branch (3084:7): [True: 8.03k, False: 19.6k]
  ------------------
 3085|  8.03k|    out[0] = v[0];
 3086|  8.03k|    out[1] = _mm_sub_epi32(zero, v[8]);
 3087|  8.03k|    out[2] = v[12];
 3088|  8.03k|    out[3] = _mm_sub_epi32(zero, v[4]);
 3089|  8.03k|    out[4] = v[6];
 3090|  8.03k|    out[5] = _mm_sub_epi32(zero, v[14]);
 3091|  8.03k|    out[6] = v[10];
 3092|  8.03k|    out[7] = _mm_sub_epi32(zero, v[2]);
 3093|  8.03k|    out[8] = v[3];
 3094|  8.03k|    out[9] = _mm_sub_epi32(zero, v[11]);
 3095|  8.03k|    out[10] = v[15];
 3096|  8.03k|    out[11] = _mm_sub_epi32(zero, v[7]);
 3097|  8.03k|    out[12] = v[5];
 3098|  8.03k|    out[13] = _mm_sub_epi32(zero, v[13]);
 3099|  8.03k|    out[14] = v[9];
 3100|  8.03k|    out[15] = _mm_sub_epi32(zero, v[1]);
 3101|  19.6k|  } else {
 3102|  19.6k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  19.6k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 19.6k]
  |  |  ------------------
  ------------------
 3103|  19.6k|    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 3104|  19.6k|    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 3105|       |
 3106|  19.6k|    neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 3107|  19.6k|                     out_shift);
 3108|  19.6k|    neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
 3109|  19.6k|                     &clamp_hi_out, out_shift);
 3110|  19.6k|    neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
 3111|  19.6k|                     &clamp_hi_out, out_shift);
 3112|  19.6k|    neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
 3113|  19.6k|                     &clamp_hi_out, out_shift);
 3114|  19.6k|    neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
 3115|  19.6k|                     &clamp_hi_out, out_shift);
 3116|  19.6k|    neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
 3117|  19.6k|                     &clamp_hi_out, out_shift);
 3118|  19.6k|    neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
 3119|  19.6k|                     &clamp_hi_out, out_shift);
 3120|  19.6k|    neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
 3121|  19.6k|                     &clamp_hi_out, out_shift);
 3122|  19.6k|  }
 3123|  27.6k|}
highbd_inv_txfm_sse4.c:iidentity16_sse4_1:
 3125|  34.8k|                               int bd, int out_shift) {
 3126|  34.8k|  (void)bit;
 3127|  34.8k|  __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
 3128|  34.8k|  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
  ------------------
  |  |   41|  34.8k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 3129|  34.8k|  __m128i a0_low, a0_high, a1_low, a1_high;
 3130|  34.8k|  __m128i zero = _mm_setzero_si128();
 3131|  34.8k|  offset = _mm_unpacklo_epi32(offset, zero);
 3132|       |
 3133|   592k|  for (int i = 0; i < 16; i++) {
  ------------------
  |  Branch (3133:19): [True: 557k, False: 34.8k]
  ------------------
 3134|   557k|    a0_low = _mm_mul_epi32(in[i], fact);
 3135|   557k|    a0_low = _mm_add_epi32(a0_low, offset);
 3136|   557k|    a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);
  ------------------
  |  |   41|   557k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 3137|       |
 3138|   557k|    a0_high = _mm_srli_si128(in[i], 4);
 3139|   557k|    a0_high = _mm_mul_epi32(a0_high, fact);
 3140|   557k|    a0_high = _mm_add_epi32(a0_high, offset);
 3141|   557k|    a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);
  ------------------
  |  |   41|   557k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 3142|       |
 3143|   557k|    a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
 3144|   557k|    a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
 3145|   557k|    out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
 3146|   557k|  }
 3147|       |
 3148|  34.8k|  if (!do_cols) {
  ------------------
  |  Branch (3148:7): [True: 19.0k, False: 15.7k]
  ------------------
 3149|  19.0k|    const int log_range = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  19.0k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 19.0k]
  |  |  ------------------
  ------------------
 3150|  19.0k|    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 3151|  19.0k|    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 3152|  19.0k|    round_shift_8x8(out, out_shift);
 3153|  19.0k|    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16);
 3154|  19.0k|  }
 3155|  34.8k|}
highbd_inv_txfm_sse4.c:iidentity32_sse4_1:
 5170|    564|                               int bd, int out_shift) {
 5171|    564|  (void)bit;
 5172|  1.69k|  for (int i = 0; i < 32; i += 16) {
  ------------------
  |  Branch (5172:19): [True: 1.12k, False: 564]
  ------------------
 5173|  1.12k|    out[i] = _mm_slli_epi32(in[i], 2);
 5174|  1.12k|    out[i + 1] = _mm_slli_epi32(in[i + 1], 2);
 5175|  1.12k|    out[i + 2] = _mm_slli_epi32(in[i + 2], 2);
 5176|  1.12k|    out[i + 3] = _mm_slli_epi32(in[i + 3], 2);
 5177|  1.12k|    out[i + 4] = _mm_slli_epi32(in[i + 4], 2);
 5178|  1.12k|    out[i + 5] = _mm_slli_epi32(in[i + 5], 2);
 5179|  1.12k|    out[i + 6] = _mm_slli_epi32(in[i + 6], 2);
 5180|  1.12k|    out[i + 7] = _mm_slli_epi32(in[i + 7], 2);
 5181|  1.12k|    out[i + 8] = _mm_slli_epi32(in[i + 8], 2);
 5182|  1.12k|    out[i + 9] = _mm_slli_epi32(in[i + 9], 2);
 5183|  1.12k|    out[i + 10] = _mm_slli_epi32(in[i + 10], 2);
 5184|  1.12k|    out[i + 11] = _mm_slli_epi32(in[i + 11], 2);
 5185|  1.12k|    out[i + 12] = _mm_slli_epi32(in[i + 12], 2);
 5186|  1.12k|    out[i + 13] = _mm_slli_epi32(in[i + 13], 2);
 5187|  1.12k|    out[i + 14] = _mm_slli_epi32(in[i + 14], 2);
 5188|  1.12k|    out[i + 15] = _mm_slli_epi32(in[i + 15], 2);
 5189|  1.12k|  }
 5190|       |
 5191|    564|  if (!do_cols) {
  ------------------
  |  Branch (5191:7): [True: 304, False: 260]
  ------------------
 5192|    304|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|    304|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 304]
  |  |  ------------------
  ------------------
 5193|    304|    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 5194|    304|    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 5195|    304|    round_shift_8x8(out, out_shift);
 5196|    304|    round_shift_8x8(out + 16, out_shift);
 5197|    304|    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
 5198|    304|  }
 5199|    564|}
highbd_inv_txfm_sse4.c:load_buffer_32bit_input:
  135|   371k|                                           __m128i *out, int out_size) {
  136|  3.18M|  for (int i = 0; i < out_size; ++i) {
  ------------------
  |  Branch (136:19): [True: 2.81M, False: 371k]
  ------------------
  137|  2.81M|    out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride));
  138|  2.81M|  }
  139|   371k|}
highbd_inv_txfm_sse4.c:highbd_write_buffer_8xn_sse4_1:
  123|   206k|                                                  int height, const int bd) {
  124|   206k|  int j = flipud ? (height - 1) : 0;
  ------------------
  |  Branch (124:11): [True: 396, False: 205k]
  ------------------
  125|   206k|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (125:20): [True: 396, False: 205k]
  ------------------
  126|  1.27M|  for (int i = 0; i < height; ++i, j += step) {
  ------------------
  |  Branch (126:19): [True: 1.06M, False: 206k]
  ------------------
  127|  1.06M|    __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
  128|  1.06M|    __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd);
  129|       |
  130|  1.06M|    _mm_storeu_si128((__m128i *)(output + i * stride), u);
  131|  1.06M|  }
  132|   206k|}
highbd_inv_txfm_sse4.c:highbd_get_recon_8x8_sse4_1:
   83|  1.06M|                                                  const int bd) {
   84|  1.06M|  __m128i x0 = _mm_cvtepi16_epi32(pred);
   85|       |  __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));
   86|  1.06M|  __m128i min_clip_val = _mm_setzero_si128();
   87|  1.06M|  __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1);
   88|  1.06M|  x0 = _mm_add_epi32(res0, x0);
   89|  1.06M|  x1 = _mm_add_epi32(res1, x1);
   90|  1.06M|  x0 = _mm_max_epi32(x0, min_clip_val);
   91|  1.06M|  x0 = _mm_min_epi32(x0, max_clip_val);
   92|  1.06M|  x1 = _mm_max_epi32(x1, min_clip_val);
   93|  1.06M|  x1 = _mm_min_epi32(x1, max_clip_val);
   94|  1.06M|  x0 = _mm_packus_epi32(x0, x1);
   95|  1.06M|  return x0;
   96|  1.06M|}
highbd_inv_txfm_sse4.c:highbd_inv_txfm2d_add_h_identity_ssse41:
 5230|  5.90k|                                                    const int bd) {
 5231|  5.90k|  __m128i buf1[64];
 5232|  5.90k|  int eobx, eoby;
 5233|  5.90k|  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
 5234|  5.90k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 5235|  5.90k|  const int txw_idx = get_txw_idx(tx_size);
 5236|  5.90k|  const int txh_idx = get_txh_idx(tx_size);
 5237|  5.90k|  const int txfm_size_col = tx_size_wide[tx_size];
 5238|  5.90k|  const int txfm_size_row = tx_size_high[tx_size];
 5239|  5.90k|  const int buf_size_w = AOMMIN(32, txfm_size_col);
  ------------------
  |  |   34|  5.90k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 5.90k]
  |  |  ------------------
  ------------------
 5240|  5.90k|  const int buf_size_w_div4 = buf_size_w >> 2;
 5241|  5.90k|  const int buf_size_h_div8 = (eoby + 8) >> 3;
 5242|  5.90k|  const int row_max = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  5.90k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 5.90k]
  |  |  ------------------
  ------------------
 5243|  5.90k|  const int input_stride = row_max;
 5244|  5.90k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 5245|  5.90k|  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
 5246|  5.90k|  const transform_1d_sse4_1 row_txfm =
 5247|  5.90k|      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
 5248|  5.90k|  const transform_1d_sse4_1 col_txfm =
 5249|  5.90k|      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
 5250|  5.90k|  int ud_flip, lr_flip;
 5251|  5.90k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 5252|       |
 5253|  18.9k|  for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
  ------------------
  |  Branch (5253:19): [True: 13.0k, False: 5.90k]
  ------------------
 5254|  13.0k|    __m128i buf0[16];
 5255|  13.0k|    load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
 5256|  13.0k|    if (rect_type == 1 || rect_type == -1) {
  ------------------
  |  Branch (5256:9): [True: 1.91k, False: 11.1k]
  |  Branch (5256:27): [True: 2.52k, False: 8.59k]
  ------------------
 5257|  4.44k|      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0,
 5258|  4.44k|                                           NewInvSqrt2);
 5259|  4.44k|    }
 5260|  13.0k|    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|  13.0k|#define INV_COS_BIT 12
  ------------------
 5261|       |
 5262|  13.0k|    __m128i *_buf1 = buf1 + i * 4;
 5263|       |
 5264|  43.2k|    for (int j = 0; j < buf_size_w_div4; ++j) {
  ------------------
  |  Branch (5264:21): [True: 30.2k, False: 13.0k]
  ------------------
 5265|  30.2k|      __m128i *buf0_cur = buf0 + j * 4;
 5266|  30.2k|      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
  ------------------
  |  |   18|  30.2k|  do {                                                \
  |  |   19|  30.2k|    __m128i u0, u1, u2, u3;                           \
  |  |   20|  30.2k|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|  30.2k|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|  30.2k|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|  30.2k|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|  30.2k|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|  30.2k|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|  30.2k|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|  30.2k|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|  30.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded, False: 30.2k]
  |  |  ------------------
  ------------------
 5267|  30.2k|                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
 5268|  30.2k|      _buf1[j * txfm_size_row + 0] = buf0_cur[0];
 5269|  30.2k|      _buf1[j * txfm_size_row + 1] = buf0_cur[1];
 5270|  30.2k|      _buf1[j * txfm_size_row + 2] = buf0_cur[2];
 5271|  30.2k|      _buf1[j * txfm_size_row + 3] = buf0_cur[3];
 5272|  30.2k|    }
 5273|  13.0k|  }
 5274|  19.7k|  for (int i = 0; i < buf_size_w_div4; i++) {
  ------------------
  |  Branch (5274:19): [True: 13.8k, False: 5.90k]
  ------------------
 5275|  13.8k|    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
  ------------------
  |  |   43|  13.8k|#define INV_COS_BIT 12
  ------------------
 5276|  13.8k|             bd, 0);
 5277|       |
 5278|  13.8k|    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
 5279|  13.8k|                                    buf1 + i * txfm_size_row, txfm_size_row,
 5280|  13.8k|                                    -shift[1]);
 5281|  13.8k|  }
 5282|       |
 5283|       |  // write to buffer
 5284|  12.8k|  for (int i = 0; i < (txfm_size_col >> 3); i++) {
  ------------------
  |  Branch (5284:19): [True: 6.91k, False: 5.90k]
  ------------------
 5285|  6.91k|    highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i,
 5286|  6.91k|                                   stride, ud_flip, txfm_size_row, bd);
 5287|  6.91k|  }
 5288|  5.90k|}
highbd_inv_txfm_sse4.c:highbd_inv_txfm2d_add_v_identity_ssse41:
 5293|  11.0k|                                                    const int bd) {
 5294|  11.0k|  __m128i buf1[64];
 5295|  11.0k|  int eobx, eoby;
 5296|  11.0k|  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
 5297|  11.0k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 5298|  11.0k|  const int txw_idx = get_txw_idx(tx_size);
 5299|  11.0k|  const int txh_idx = get_txh_idx(tx_size);
 5300|  11.0k|  const int txfm_size_col = tx_size_wide[tx_size];
 5301|  11.0k|  const int txfm_size_row = tx_size_high[tx_size];
 5302|  11.0k|  const int buf_size_w_div4 = AOMMIN(32, txfm_size_col) >> 2;
  ------------------
  |  |   34|  11.0k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 11.0k]
  |  |  ------------------
  ------------------
 5303|  11.0k|  const int row_max = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  11.0k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 11.0k]
  |  |  ------------------
  ------------------
 5304|  11.0k|  const int input_stride = row_max;
 5305|  11.0k|  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
 5306|  11.0k|  const int buf_size_nonzero_w = buf_size_nonzero_w_div8 << 3;
 5307|  11.0k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 5308|  11.0k|  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
 5309|  11.0k|  const transform_1d_sse4_1 row_txfm =
 5310|  11.0k|      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
 5311|  11.0k|  const transform_1d_sse4_1 col_txfm =
 5312|  11.0k|      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
 5313|  11.0k|  int ud_flip, lr_flip;
 5314|  11.0k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 5315|       |
 5316|  35.9k|  for (int i = 0; i < (row_max >> 2); ++i) {
  ------------------
  |  Branch (5316:19): [True: 24.8k, False: 11.0k]
  ------------------
 5317|  24.8k|    __m128i buf0[16];
 5318|  24.8k|    load_buffer_32bit_input(input + i * 4, input_stride, buf0,
 5319|  24.8k|                            buf_size_nonzero_w);
 5320|  24.8k|    if (rect_type == 1 || rect_type == -1) {
  ------------------
  |  Branch (5320:9): [True: 3.58k, False: 21.2k]
  |  Branch (5320:27): [True: 4.97k, False: 16.2k]
  ------------------
 5321|  8.56k|      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_nonzero_w, 0,
 5322|  8.56k|                                           NewInvSqrt2);
 5323|  8.56k|    }
 5324|  24.8k|    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|  24.8k|#define INV_COS_BIT 12
  ------------------
 5325|       |
 5326|  24.8k|    __m128i *_buf1 = buf1 + i * 4;
 5327|  24.8k|    if (lr_flip) {
  ------------------
  |  Branch (5327:9): [True: 340, False: 24.4k]
  ------------------
 5328|  1.18k|      for (int j = 0; j < buf_size_w_div4; ++j) {
  ------------------
  |  Branch (5328:23): [True: 848, False: 340]
  ------------------
 5329|    848|        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
  ------------------
  |  |   18|    848|  do {                                                \
  |  |   19|    848|    __m128i u0, u1, u2, u3;                           \
  |  |   20|    848|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|    848|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|    848|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|    848|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|    848|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|    848|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|    848|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|    848|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|    848|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded, False: 848]
  |  |  ------------------
  ------------------
 5330|    848|                      buf0[4 * j],
 5331|    848|                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
 5332|    848|                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
 5333|    848|                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
 5334|    848|                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
 5335|    848|      }
 5336|  24.4k|    } else {
 5337|  81.1k|      for (int j = 0; j < buf_size_w_div4; ++j) {
  ------------------
  |  Branch (5337:23): [True: 56.6k, False: 24.4k]
  ------------------
 5338|  56.6k|        TRANSPOSE_4X4(
  ------------------
  |  |   18|  56.6k|  do {                                                \
  |  |   19|  56.6k|    __m128i u0, u1, u2, u3;                           \
  |  |   20|  56.6k|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|  56.6k|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|  56.6k|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|  56.6k|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|  56.6k|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|  56.6k|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|  56.6k|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|  56.6k|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|  56.6k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded, False: 56.6k]
  |  |  ------------------
  ------------------
 5339|  56.6k|            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
 5340|  56.6k|            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
 5341|  56.6k|            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
 5342|  56.6k|      }
 5343|  24.4k|    }
 5344|  24.8k|  }
 5345|  37.0k|  for (int i = 0; i < buf_size_w_div4; i++) {
  ------------------
  |  Branch (5345:19): [True: 25.9k, False: 11.0k]
  ------------------
 5346|  25.9k|    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
  ------------------
  |  |   43|  25.9k|#define INV_COS_BIT 12
  ------------------
 5347|  25.9k|             bd, 0);
 5348|       |
 5349|  25.9k|    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
 5350|  25.9k|                                    buf1 + i * txfm_size_row, txfm_size_row,
 5351|  25.9k|                                    -shift[1]);
 5352|  25.9k|  }
 5353|       |
 5354|       |  // write to buffer
 5355|  11.0k|  {
 5356|  24.0k|    for (int i = 0; i < (txfm_size_col >> 3); i++) {
  ------------------
  |  Branch (5356:21): [True: 12.9k, False: 11.0k]
  ------------------
 5357|  12.9k|      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
 5358|  12.9k|                                     output + 8 * i, stride, ud_flip,
 5359|  12.9k|                                     txfm_size_row, bd);
 5360|  12.9k|    }
 5361|  11.0k|  }
 5362|  11.0k|}
highbd_inv_txfm_sse4.c:highbd_inv_txfm2d_add_idtx_ssse41:
 5366|  19.8k|                                              int eob, const int bd) {
 5367|  19.8k|  (void)eob;
 5368|  19.8k|  __m128i buf1[64 * 4];
 5369|  19.8k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 5370|  19.8k|  const int txw_idx = get_txw_idx(tx_size);
 5371|  19.8k|  const int txh_idx = get_txh_idx(tx_size);
 5372|  19.8k|  const int txfm_size_col = tx_size_wide[tx_size];
 5373|  19.8k|  const int txfm_size_row = tx_size_high[tx_size];
 5374|  19.8k|  const int row_max = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  19.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 19.8k]
  |  |  ------------------
  ------------------
 5375|  19.8k|  const int input_stride = row_max;
 5376|  19.8k|  const int buf_size_w = AOMMIN(32, txfm_size_col);
  ------------------
  |  |   34|  19.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 19.8k]
  |  |  ------------------
  ------------------
 5377|  19.8k|  const int buf_size_w_div4 = buf_size_w >> 2;
 5378|  19.8k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 5379|  19.8k|  const transform_1d_sse4_1 row_txfm =
 5380|  19.8k|      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
 5381|  19.8k|  const transform_1d_sse4_1 col_txfm =
 5382|  19.8k|      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
 5383|       |
 5384|  67.8k|  for (int i = 0; i < (row_max >> 2); ++i) {
  ------------------
  |  Branch (5384:19): [True: 47.9k, False: 19.8k]
  ------------------
 5385|  47.9k|    __m128i buf0[32];
 5386|  47.9k|    load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
 5387|  47.9k|    if (rect_type == 1 || rect_type == -1) {
  ------------------
  |  Branch (5387:9): [True: 7.19k, False: 40.8k]
  |  Branch (5387:27): [True: 10.2k, False: 30.5k]
  ------------------
 5388|  17.4k|      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0,
 5389|  17.4k|                                           NewInvSqrt2);
 5390|  17.4k|    }
 5391|  47.9k|    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|  47.9k|#define INV_COS_BIT 12
  ------------------
 5392|       |
 5393|  47.9k|    __m128i *_buf1 = buf1 + i * 4;
 5394|   171k|    for (int j = 0; j < buf_size_w_div4; ++j) {
  ------------------
  |  Branch (5394:21): [True: 123k, False: 47.9k]
  ------------------
 5395|   123k|      __m128i *buf0_cur = buf0 + j * 4;
 5396|   123k|      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
  ------------------
  |  |   18|   123k|  do {                                                \
  |  |   19|   123k|    __m128i u0, u1, u2, u3;                           \
  |  |   20|   123k|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|   123k|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|   123k|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|   123k|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|   123k|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|   123k|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|   123k|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|   123k|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|   123k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded, False: 123k]
  |  |  ------------------
  ------------------
 5397|   123k|                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
 5398|   123k|      _buf1[j * txfm_size_row + 0] = buf0_cur[0];
 5399|   123k|      _buf1[j * txfm_size_row + 1] = buf0_cur[1];
 5400|   123k|      _buf1[j * txfm_size_row + 2] = buf0_cur[2];
 5401|   123k|      _buf1[j * txfm_size_row + 3] = buf0_cur[3];
 5402|   123k|    }
 5403|  47.9k|  }
 5404|  70.1k|  for (int i = 0; i < buf_size_w_div4; i++) {
  ------------------
  |  Branch (5404:19): [True: 50.2k, False: 19.8k]
  ------------------
 5405|  50.2k|    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
  ------------------
  |  |   43|  50.2k|#define INV_COS_BIT 12
  ------------------
 5406|  50.2k|             bd, 0);
 5407|       |
 5408|  50.2k|    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
 5409|  50.2k|                                    buf1 + i * txfm_size_row, txfm_size_row,
 5410|  50.2k|                                    -shift[1]);
 5411|  50.2k|  }
 5412|       |
 5413|       |  // write to buffer
 5414|  19.8k|  {
 5415|  44.9k|    for (int i = 0; i < (txfm_size_col >> 3); i++) {
  ------------------
  |  Branch (5415:21): [True: 25.1k, False: 19.8k]
  ------------------
 5416|  25.1k|      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
 5417|  25.1k|                                     output + 8 * i, stride, 0, txfm_size_row,
 5418|  25.1k|                                     bd);
 5419|  25.1k|    }
 5420|  19.8k|  }
 5421|  19.8k|}
highbd_inv_txfm_sse4.c:av1_highbd_inv_txfm_add_4x8_sse4_1:
 5759|  43.8k|                                               const TxfmParam *txfm_param) {
 5760|  43.8k|  int bd = txfm_param->bd;
 5761|  43.8k|  const TX_TYPE tx_type = txfm_param->tx_type;
 5762|  43.8k|  const TX_SIZE tx_size = txfm_param->tx_size;
 5763|  43.8k|  int eob = txfm_param->eob;
 5764|  43.8k|  highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
  ------------------
  |  |   75|  43.8k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5765|  43.8k|                                  tx_type, tx_size, eob, bd);
 5766|  43.8k|}
highbd_inv_txfm_sse4.c:highbd_inv_txfm2d_add_4x8_sse41:
 5506|  43.8k|                                            int eob, const int bd) {
 5507|  43.8k|  (void)eob;
 5508|  43.8k|  __m128i buf1[8];
 5509|  43.8k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 5510|  43.8k|  const int txw_idx = get_txw_idx(tx_size);
 5511|  43.8k|  const int txh_idx = get_txh_idx(tx_size);
 5512|  43.8k|  const int txfm_size_col = tx_size_wide[tx_size];
 5513|  43.8k|  const int txfm_size_row = tx_size_high[tx_size];
 5514|  43.8k|  const transform_1d_sse4_1 row_txfm =
 5515|  43.8k|      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
 5516|  43.8k|  const transform_1d_sse4_1 col_txfm =
 5517|  43.8k|      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];
 5518|  43.8k|  const int input_stride = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  43.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 43.8k]
  |  |  ------------------
  ------------------
 5519|       |
 5520|  43.8k|  assert(col_txfm != NULL);
 5521|  43.8k|  assert(row_txfm != NULL);
 5522|  43.8k|  int ud_flip, lr_flip;
 5523|  43.8k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 5524|       |
 5525|       |  // 1st stage: column transform
 5526|  43.8k|  __m128i buf0[8];
 5527|  43.8k|  load_buffer_32bit_input(input, input_stride, buf0, txfm_size_col);
 5528|  43.8k|  load_buffer_32bit_input(input + 4, input_stride, buf0 + 4, txfm_size_col);
 5529|  43.8k|  av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0,
 5530|  43.8k|                                       NewInvSqrt2);
 5531|  43.8k|  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|  43.8k|#define INV_COS_BIT 12
  ------------------
 5532|  43.8k|  row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|  43.8k|#define INV_COS_BIT 12
  ------------------
 5533|       |
 5534|  43.8k|  if (lr_flip) {
  ------------------
  |  Branch (5534:7): [True: 150, False: 43.6k]
  ------------------
 5535|    150|    TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
  ------------------
  |  |   18|    150|  do {                                                \
  |  |   19|    150|    __m128i u0, u1, u2, u3;                           \
  |  |   20|    150|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|    150|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|    150|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|    150|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|    150|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|    150|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|    150|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|    150|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|    150|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded, False: 150]
  |  |  ------------------
  ------------------
 5536|    150|                  buf1[3]);
 5537|       |
 5538|    150|    TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
  ------------------
  |  |   18|    150|  do {                                                \
  |  |   19|    150|    __m128i u0, u1, u2, u3;                           \
  |  |   20|    150|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|    150|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|    150|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|    150|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|    150|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|    150|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|    150|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|    150|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|    150|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded, False: 150]
  |  |  ------------------
  ------------------
 5539|    150|                  buf1[7]);
 5540|  43.6k|  } else {
 5541|  43.6k|    TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
  ------------------
  |  |   18|  43.6k|  do {                                                \
  |  |   19|  43.6k|    __m128i u0, u1, u2, u3;                           \
  |  |   20|  43.6k|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|  43.6k|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|  43.6k|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|  43.6k|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|  43.6k|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|  43.6k|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|  43.6k|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|  43.6k|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|  43.6k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded, False: 43.6k]
  |  |  ------------------
  ------------------
 5542|  43.6k|                  buf1[3]);
 5543|       |
 5544|  43.6k|    TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
  ------------------
  |  |   18|  43.6k|  do {                                                \
  |  |   19|  43.6k|    __m128i u0, u1, u2, u3;                           \
  |  |   20|  43.6k|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|  43.6k|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|  43.6k|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|  43.6k|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|  43.6k|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|  43.6k|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|  43.6k|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|  43.6k|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|  43.6k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded, False: 43.6k]
  |  |  ------------------
  ------------------
 5545|  43.6k|                  buf1[7]);
 5546|  43.6k|  }
 5547|       |
 5548|       |  // 2nd stage: column transform
 5549|  43.8k|  col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  43.8k|#define INV_COS_BIT 12
  ------------------
 5550|       |
 5551|  43.8k|  av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);
 5552|       |
 5553|       |  // write to buffer
 5554|  43.8k|  highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
 5555|  43.8k|                                 bd);
 5556|  43.8k|}
highbd_inv_txfm_sse4.c:highbd_write_buffer_4xn_sse4_1:
  110|  64.6k|                                                  int height, const int bd) {
  111|  64.6k|  int j = flipud ? (height - 1) : 0;
  ------------------
  |  Branch (111:11): [True: 150, False: 64.5k]
  ------------------
  112|  64.6k|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (112:20): [True: 150, False: 64.5k]
  ------------------
  113|   748k|  for (int i = 0; i < height; ++i, j += step) {
  ------------------
  |  Branch (113:19): [True: 684k, False: 64.6k]
  ------------------
  114|   684k|    __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
  115|   684k|    __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd);
  116|       |
  117|   684k|    _mm_storel_epi64((__m128i *)(output + i * stride), u);
  118|   684k|  }
  119|  64.6k|}
highbd_inv_txfm_sse4.c:highbd_get_recon_4xn_sse4_1:
   99|   684k|                                                  __m128i res0, const int bd) {
  100|   684k|  __m128i x0 = _mm_cvtepi16_epi32(pred);
  101|       |
  102|   684k|  x0 = _mm_add_epi32(res0, x0);
  103|   684k|  x0 = _mm_packus_epi32(x0, x0);
  104|   684k|  x0 = highbd_clamp_epi16(x0, bd);
  105|   684k|  return x0;
  106|   684k|}
highbd_inv_txfm_sse4.c:av1_highbd_inv_txfm_add_8x4_sse4_1:
 5770|  68.9k|                                               const TxfmParam *txfm_param) {
 5771|  68.9k|  int bd = txfm_param->bd;
 5772|  68.9k|  const TX_TYPE tx_type = txfm_param->tx_type;
 5773|  68.9k|  const TX_SIZE tx_size = txfm_param->tx_size;
 5774|  68.9k|  int eob = txfm_param->eob;
 5775|  68.9k|  highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
  ------------------
  |  |   75|  68.9k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5776|  68.9k|                                  tx_type, tx_size, eob, bd);
 5777|  68.9k|}
highbd_inv_txfm_sse4.c:highbd_inv_txfm2d_add_8x4_sse41:
 5561|  68.9k|                                            int eob, const int bd) {
 5562|  68.9k|  (void)eob;
 5563|  68.9k|  __m128i buf1[8];
 5564|  68.9k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 5565|  68.9k|  const int txw_idx = get_txw_idx(tx_size);
 5566|  68.9k|  const int txh_idx = get_txh_idx(tx_size);
 5567|  68.9k|  const int txfm_size_col = tx_size_wide[tx_size];
 5568|  68.9k|  const int txfm_size_row = tx_size_high[tx_size];
 5569|  68.9k|  const transform_1d_sse4_1 row_txfm =
 5570|  68.9k|      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];
 5571|  68.9k|  const transform_1d_sse4_1 col_txfm =
 5572|  68.9k|      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
 5573|       |
 5574|  68.9k|  assert(col_txfm != NULL);
 5575|  68.9k|  assert(row_txfm != NULL);
 5576|  68.9k|  int ud_flip, lr_flip;
 5577|  68.9k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 5578|       |
 5579|       |  // 1st stage: column transform
 5580|  68.9k|  __m128i buf0[8];
 5581|  68.9k|  const int32_t *input_row = input;
 5582|  68.9k|  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
 5583|       |
 5584|  68.9k|  av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_col, 0,
 5585|  68.9k|                                       NewInvSqrt2);
 5586|  68.9k|  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|  68.9k|#define INV_COS_BIT 12
  ------------------
 5587|       |
 5588|  68.9k|  __m128i *buf1_ptr;
 5589|  68.9k|  if (lr_flip) {
  ------------------
  |  Branch (5589:7): [True: 287, False: 68.6k]
  ------------------
 5590|    287|    flip_buf_sse2(buf0, buf1, txfm_size_col);
 5591|    287|    buf1_ptr = buf1;
 5592|  68.6k|  } else {
 5593|  68.6k|    buf1_ptr = buf0;
 5594|  68.6k|  }
 5595|       |
 5596|       |  // 2nd stage: column transform
 5597|   206k|  for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (5597:19): [True: 137k, False: 68.9k]
  ------------------
 5598|   137k|    __m128i *buf1_cur = buf1_ptr + i * txfm_size_row;
 5599|   137k|    transpose_32bit_4x4(buf1_cur, buf1_cur);
 5600|   137k|    col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|   137k|#define INV_COS_BIT 12
  ------------------
 5601|   137k|  }
 5602|  68.9k|  av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
 5603|       |  // write to buffer
 5604|  68.9k|  highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip,
 5605|  68.9k|                                 txfm_size_row, bd);
 5606|  68.9k|}
highbd_inv_txfm_sse4.c:av1_highbd_inv_txfm_add_4x4_sse4_1:
 5154|   583k|                                               const TxfmParam *txfm_param) {
 5155|   583k|  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
 5156|   583k|  int eob = txfm_param->eob;
 5157|   583k|  int bd = txfm_param->bd;
 5158|   583k|  int lossless = txfm_param->lossless;
 5159|   583k|  const int32_t *src = cast_to_int32(input);
 5160|   583k|  const TX_TYPE tx_type = txfm_param->tx_type;
 5161|   583k|  if (lossless) {
  ------------------
  |  Branch (5161:7): [True: 436k, False: 147k]
  ------------------
 5162|   436k|    assert(tx_type == DCT_DCT);
 5163|   436k|    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
 5164|   436k|    return;
 5165|   436k|  }
 5166|   147k|  av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
  ------------------
  |  |   75|   147k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5167|   147k|                                bd);
 5168|   147k|}
highbd_inv_txfm_sse4.c:av1_highbd_inv_txfm_add_16x4_sse4_1:
 5792|  46.1k|                                                const TxfmParam *txfm_param) {
 5793|  46.1k|  int bd = txfm_param->bd;
 5794|  46.1k|  const TX_TYPE tx_type = txfm_param->tx_type;
 5795|  46.1k|  const TX_SIZE tx_size = txfm_param->tx_size;
 5796|  46.1k|  int eob = txfm_param->eob;
 5797|  46.1k|  highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
  ------------------
  |  |   75|  46.1k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5798|  46.1k|                                    tx_type, tx_size, eob, bd);
 5799|  46.1k|}
highbd_inv_txfm_sse4.c:highbd_inv_txfm2d_add_16x4_sse4_1:
 5667|  46.1k|                                              int eob, const int bd) {
 5668|  46.1k|  (void)eob;
 5669|  46.1k|  __m128i buf1[16];
 5670|  46.1k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 5671|  46.1k|  const int txw_idx = get_txw_idx(tx_size);
 5672|  46.1k|  const int txh_idx = get_txh_idx(tx_size);
 5673|  46.1k|  const int txfm_size_col = tx_size_wide[tx_size];
 5674|  46.1k|  const int txfm_size_row = tx_size_high[tx_size];
 5675|  46.1k|  const int buf_size_w_div8 = txfm_size_col >> 2;
 5676|  46.1k|  const transform_1d_sse4_1 row_txfm =
 5677|  46.1k|      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
 5678|  46.1k|  const transform_1d_sse4_1 col_txfm =
 5679|  46.1k|      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
 5680|       |
 5681|  46.1k|  assert(col_txfm != NULL);
 5682|  46.1k|  assert(row_txfm != NULL);
 5683|  46.1k|  int ud_flip, lr_flip;
 5684|  46.1k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 5685|       |
 5686|       |  // 1st stage: column transform
 5687|  46.1k|  __m128i buf0[16];
 5688|  46.1k|  const int32_t *input_row = input;
 5689|  46.1k|  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
 5690|       |
 5691|  46.1k|  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|  46.1k|#define INV_COS_BIT 12
  ------------------
 5692|       |
 5693|  46.1k|  __m128i *buf1_ptr;
 5694|  46.1k|  if (lr_flip) {
  ------------------
  |  Branch (5694:7): [True: 214, False: 45.9k]
  ------------------
 5695|    214|    flip_buf_sse2(buf0, buf1, txfm_size_col);
 5696|    214|    buf1_ptr = buf1;
 5697|  45.9k|  } else {
 5698|  45.9k|    buf1_ptr = buf0;
 5699|  45.9k|  }
 5700|       |
 5701|       |  // 2nd stage: column transform
 5702|   230k|  for (int i = 0; i < buf_size_w_div8; i++) {
  ------------------
  |  Branch (5702:19): [True: 184k, False: 46.1k]
  ------------------
 5703|   184k|    __m128i *buf1_cur = buf1_ptr + i * txfm_size_row;
 5704|   184k|    transpose_32bit_4x4(buf1_cur, buf1_cur);
 5705|   184k|    col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|   184k|#define INV_COS_BIT 12
  ------------------
 5706|   184k|  }
 5707|  46.1k|  av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
 5708|       |
 5709|       |  // write to buffer
 5710|   138k|  for (int i = 0; i < (txfm_size_col >> 3); i++) {
  ------------------
  |  Branch (5710:19): [True: 92.3k, False: 46.1k]
  ------------------
 5711|  92.3k|    highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2,
 5712|  92.3k|                                   output + 8 * i, stride, ud_flip,
 5713|  92.3k|                                   txfm_size_row, bd);
 5714|  92.3k|  }
 5715|  46.1k|}
highbd_inv_txfm_sse4.c:av1_highbd_inv_txfm_add_4x16_sse4_1:
 5781|  20.8k|                                                const TxfmParam *txfm_param) {
 5782|  20.8k|  int bd = txfm_param->bd;
 5783|  20.8k|  const TX_TYPE tx_type = txfm_param->tx_type;
 5784|  20.8k|  const TX_SIZE tx_size = txfm_param->tx_size;
 5785|  20.8k|  int eob = txfm_param->eob;
 5786|  20.8k|  highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
  ------------------
  |  |   75|  20.8k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5787|  20.8k|                                    tx_type, tx_size, eob, bd);
 5788|  20.8k|}
highbd_inv_txfm_sse4.c:highbd_inv_txfm2d_add_4x16_sse4_1:
 5611|  20.8k|                                              int eob, const int bd) {
 5612|  20.8k|  (void)eob;
 5613|  20.8k|  __m128i buf1[16];
 5614|  20.8k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 5615|  20.8k|  const int txw_idx = get_txw_idx(tx_size);
 5616|  20.8k|  const int txh_idx = get_txh_idx(tx_size);
 5617|  20.8k|  const int txfm_size_col = tx_size_wide[tx_size];
 5618|  20.8k|  const int txfm_size_row = tx_size_high[tx_size];
 5619|  20.8k|  const int buf_size_h_div8 = txfm_size_row >> 2;
 5620|  20.8k|  const transform_1d_sse4_1 row_txfm =
 5621|  20.8k|      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
 5622|  20.8k|  const transform_1d_sse4_1 col_txfm =
 5623|  20.8k|      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
 5624|  20.8k|  const int input_stride = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  20.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 20.8k]
  |  |  ------------------
  ------------------
 5625|       |
 5626|  20.8k|  assert(col_txfm != NULL);
 5627|  20.8k|  assert(row_txfm != NULL);
 5628|  20.8k|  int ud_flip, lr_flip;
 5629|  20.8k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 5630|       |
 5631|       |  // 1st stage: column transform
 5632|  20.8k|  __m128i buf0[16];
 5633|   104k|  for (int i = 0; i < (txfm_size_row >> 2); i++) {
  ------------------
  |  Branch (5633:19): [True: 83.3k, False: 20.8k]
  ------------------
 5634|  83.3k|    const int32_t *input_row = input + i * 4;
 5635|  83.3k|    __m128i *buf0_cur = buf0 + i * 4;
 5636|  83.3k|    load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_col);
 5637|  83.3k|    row_txfm(buf0_cur, buf0_cur, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|  83.3k|#define INV_COS_BIT 12
  ------------------
 5638|  83.3k|  }
 5639|       |
 5640|  20.8k|  if (lr_flip) {
  ------------------
  |  Branch (5640:7): [True: 112, False: 20.7k]
  ------------------
 5641|    560|    for (int j = 0; j < buf_size_h_div8; ++j) {
  ------------------
  |  Branch (5641:21): [True: 448, False: 112]
  ------------------
 5642|    448|      TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
  ------------------
  |  |   18|    448|  do {                                                \
  |  |   19|    448|    __m128i u0, u1, u2, u3;                           \
  |  |   20|    448|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|    448|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|    448|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|    448|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|    448|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|    448|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|    448|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|    448|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|    448|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded, False: 448]
  |  |  ------------------
  ------------------
 5643|    448|                    buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
 5644|    448|                    buf1[4 * j + 3]);
 5645|    448|    }
 5646|  20.7k|  } else {
 5647|   103k|    for (int j = 0; j < buf_size_h_div8; ++j) {
  ------------------
  |  Branch (5647:21): [True: 82.9k, False: 20.7k]
  ------------------
 5648|  82.9k|      TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
  ------------------
  |  |   18|  82.9k|  do {                                                \
  |  |   19|  82.9k|    __m128i u0, u1, u2, u3;                           \
  |  |   20|  82.9k|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|  82.9k|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|  82.9k|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|  82.9k|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|  82.9k|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|  82.9k|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|  82.9k|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|  82.9k|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|  82.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded, False: 82.9k]
  |  |  ------------------
  ------------------
 5649|  82.9k|                    buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
 5650|  82.9k|                    buf1[4 * j + 2], buf1[4 * j + 3]);
 5651|  82.9k|    }
 5652|  20.7k|  }
 5653|       |
 5654|       |  // 2nd stage: column transform
 5655|  20.8k|  col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  20.8k|#define INV_COS_BIT 12
  ------------------
 5656|       |
 5657|  20.8k|  av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);
 5658|       |
 5659|       |  // write to buffer
 5660|  20.8k|  highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
 5661|  20.8k|                                 bd);
 5662|  20.8k|}

av1_highbd_dist_wtd_convolve_2d_copy_avx2:
   29|  1.37k|                                               int bd) {
   30|  1.37k|  CONV_BUF_TYPE *dst = conv_params->dst;
   31|  1.37k|  int dst_stride = conv_params->dst_stride;
   32|       |
   33|  1.37k|  const int bits =
   34|  1.37k|      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
  ------------------
  |  |   21|  1.37k|#define FILTER_BITS 7
  ------------------
   35|  1.37k|  const __m128i left_shift = _mm_cvtsi32_si128(bits);
   36|  1.37k|  const int do_average = conv_params->do_average;
   37|  1.37k|  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   38|  1.37k|  const int w0 = conv_params->fwd_offset;
   39|  1.37k|  const int w1 = conv_params->bck_offset;
   40|  1.37k|  const __m256i wt0 = _mm256_set1_epi32(w0);
   41|  1.37k|  const __m256i wt1 = _mm256_set1_epi32(w1);
   42|  1.37k|  const __m256i zero = _mm256_setzero_si256();
   43|  1.37k|  int i, j;
   44|       |
   45|  1.37k|  const int offset_0 =
   46|  1.37k|      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  1.37k|#define FILTER_BITS 7
  ------------------
   47|  1.37k|  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
   48|  1.37k|  const __m256i offset_const = _mm256_set1_epi32(offset);
   49|  1.37k|  const __m256i offset_const_16b = _mm256_set1_epi16(offset);
   50|  1.37k|  const int rounding_shift =
   51|  1.37k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  1.37k|#define FILTER_BITS 7
  ------------------
   52|  1.37k|  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
   53|  1.37k|  const __m256i clip_pixel_to_bd =
   54|  1.37k|      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (54:25): [True: 1.10k, False: 266]
  |  Branch (54:44): [True: 266, False: 0]
  ------------------
   55|       |
   56|  1.37k|  assert(bits <= 4);
   57|       |
   58|  1.37k|  if (!(w % 16)) {
  ------------------
  |  Branch (58:7): [True: 644, False: 730]
  ------------------
   59|  10.0k|    for (i = 0; i < h; i += 1) {
  ------------------
  |  Branch (59:17): [True: 9.39k, False: 644]
  ------------------
   60|  25.4k|      for (j = 0; j < w; j += 16) {
  ------------------
  |  Branch (60:19): [True: 16.0k, False: 9.39k]
  ------------------
   61|  16.0k|        const __m256i src_16bit =
   62|  16.0k|            _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j]));
   63|       |
   64|  16.0k|        const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
   65|       |
   66|  16.0k|        if (do_average) {
  ------------------
  |  Branch (66:13): [True: 7.04k, False: 9.05k]
  ------------------
   67|  7.04k|          const __m256i data_0 =
   68|  7.04k|              _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
   69|       |
   70|  7.04k|          const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero);
   71|  7.04k|          const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero);
   72|       |
   73|  7.04k|          const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
   74|  7.04k|          const __m256i res_unsigned_lo =
   75|  7.04k|              _mm256_add_epi32(res_32b_lo, offset_const);
   76|       |
   77|  7.04k|          const __m256i comp_avg_res_lo =
   78|  7.04k|              highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
   79|  7.04k|                              use_dist_wtd_comp_avg);
   80|       |
   81|  7.04k|          const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
   82|  7.04k|          const __m256i res_unsigned_hi =
   83|  7.04k|              _mm256_add_epi32(res_32b_hi, offset_const);
   84|       |
   85|  7.04k|          const __m256i comp_avg_res_hi =
   86|  7.04k|              highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
   87|  7.04k|                              use_dist_wtd_comp_avg);
   88|       |
   89|  7.04k|          const __m256i round_result_lo = highbd_convolve_rounding(
   90|  7.04k|              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
   91|  7.04k|          const __m256i round_result_hi = highbd_convolve_rounding(
   92|  7.04k|              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
   93|       |
   94|  7.04k|          const __m256i res_16b =
   95|  7.04k|              _mm256_packus_epi32(round_result_lo, round_result_hi);
   96|  7.04k|          const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
   97|       |
   98|  7.04k|          _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip);
   99|  9.05k|        } else {
  100|  9.05k|          const __m256i res_unsigned_16b =
  101|  9.05k|              _mm256_adds_epu16(res, offset_const_16b);
  102|       |
  103|  9.05k|          _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
  104|  9.05k|                             res_unsigned_16b);
  105|  9.05k|        }
  106|  16.0k|      }
  107|  9.39k|    }
  108|    730|  } else if (!(w % 4)) {
  ------------------
  |  Branch (108:14): [True: 730, False: 0]
  ------------------
  109|  3.85k|    for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (109:17): [True: 3.12k, False: 730]
  ------------------
  110|  6.24k|      for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (110:19): [True: 3.12k, False: 3.12k]
  ------------------
  111|  3.12k|        const __m128i src_row_0 =
  112|  3.12k|            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
  113|  3.12k|        const __m128i src_row_1 =
  114|  3.12k|            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride]));
  115|       |        // since not all compilers yet support _mm256_set_m128i()
  116|  3.12k|        const __m256i src_10 = _mm256_insertf128_si256(
  117|  3.12k|            _mm256_castsi128_si256(src_row_0), src_row_1, 1);
  118|       |
  119|  3.12k|        const __m256i res = _mm256_sll_epi16(src_10, left_shift);
  120|       |
  121|  3.12k|        if (w - j < 8) {
  ------------------
  |  Branch (121:13): [True: 920, False: 2.20k]
  ------------------
  122|    920|          if (do_average) {
  ------------------
  |  Branch (122:15): [True: 368, False: 552]
  ------------------
  123|    368|            const __m256i data_0 = _mm256_castsi128_si256(
  124|    368|                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
  125|    368|            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
  126|    368|                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
  127|    368|            const __m256i data_01 =
  128|    368|                _mm256_permute2x128_si256(data_0, data_1, 0x20);
  129|       |
  130|    368|            const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
  131|       |
  132|    368|            const __m256i res_32b = _mm256_unpacklo_epi16(res, zero);
  133|    368|            const __m256i res_unsigned_lo =
  134|    368|                _mm256_add_epi32(res_32b, offset_const);
  135|       |
  136|    368|            const __m256i comp_avg_res =
  137|    368|                highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
  138|    368|                                use_dist_wtd_comp_avg);
  139|       |
  140|    368|            const __m256i round_result = highbd_convolve_rounding(
  141|    368|                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  142|       |
  143|    368|            const __m256i res_16b =
  144|    368|                _mm256_packus_epi32(round_result, round_result);
  145|    368|            const __m256i res_clip =
  146|    368|                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
  147|       |
  148|    368|            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
  149|    368|            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
  150|       |
  151|    368|            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  152|    368|            _mm_storel_epi64(
  153|    368|                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
  154|    552|          } else {
  155|    552|            const __m256i res_unsigned_16b =
  156|    552|                _mm256_adds_epu16(res, offset_const_16b);
  157|       |
  158|    552|            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
  159|    552|            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
  160|       |
  161|    552|            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
  162|    552|            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  163|    552|                             res_1);
  164|    552|          }
  165|  2.20k|        } else {
  166|  2.20k|          if (do_average) {
  ------------------
  |  Branch (166:15): [True: 952, False: 1.24k]
  ------------------
  167|    952|            const __m256i data_0 = _mm256_castsi128_si256(
  168|    952|                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
  169|    952|            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
  170|    952|                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
  171|    952|            const __m256i data_01 =
  172|    952|                _mm256_permute2x128_si256(data_0, data_1, 0x20);
  173|       |
  174|    952|            const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
  175|    952|            const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
  176|       |
  177|    952|            const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
  178|    952|            const __m256i res_unsigned_lo =
  179|    952|                _mm256_add_epi32(res_32b_lo, offset_const);
  180|       |
  181|    952|            const __m256i comp_avg_res_lo =
  182|    952|                highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
  183|    952|                                use_dist_wtd_comp_avg);
  184|       |
  185|    952|            const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
  186|    952|            const __m256i res_unsigned_hi =
  187|    952|                _mm256_add_epi32(res_32b_hi, offset_const);
  188|       |
  189|    952|            const __m256i comp_avg_res_hi =
  190|    952|                highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
  191|    952|                                use_dist_wtd_comp_avg);
  192|       |
  193|    952|            const __m256i round_result_lo =
  194|    952|                highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
  195|    952|                                         &rounding_const, rounding_shift);
  196|    952|            const __m256i round_result_hi =
  197|    952|                highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
  198|    952|                                         &rounding_const, rounding_shift);
  199|       |
  200|    952|            const __m256i res_16b =
  201|    952|                _mm256_packus_epi32(round_result_lo, round_result_hi);
  202|    952|            const __m256i res_clip =
  203|    952|                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
  204|       |
  205|    952|            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
  206|    952|            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
  207|       |
  208|    952|            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  209|    952|            _mm_store_si128(
  210|    952|                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
  211|  1.24k|          } else {
  212|  1.24k|            const __m256i res_unsigned_16b =
  213|  1.24k|                _mm256_adds_epu16(res, offset_const_16b);
  214|  1.24k|            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
  215|  1.24k|            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
  216|       |
  217|  1.24k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  218|  1.24k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  219|  1.24k|                            res_1);
  220|  1.24k|          }
  221|  2.20k|        }
  222|  3.12k|      }
  223|  3.12k|    }
  224|    730|  }
  225|  1.37k|}
av1_highbd_dist_wtd_convolve_2d_avx2:
  231|  2.39k|    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  232|  2.39k|  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
  ------------------
  |  |   19|  2.39k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  233|  2.39k|  CONV_BUF_TYPE *dst = conv_params->dst;
  234|  2.39k|  int dst_stride = conv_params->dst_stride;
  235|  2.39k|  int im_h = h + filter_params_y->taps - 1;
  236|  2.39k|  int im_stride = 8;
  237|  2.39k|  int i, j;
  238|  2.39k|  const int fo_vert = filter_params_y->taps / 2 - 1;
  239|  2.39k|  const int fo_horiz = filter_params_x->taps / 2 - 1;
  240|  2.39k|  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
  241|       |
  242|       |  // Check that, even with 12-bit input, the intermediate values will fit
  243|       |  // into an unsigned 16-bit intermediate array.
  244|  2.39k|  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
  245|       |
  246|  2.39k|  __m256i s[8], coeffs_y[4], coeffs_x[4];
  247|  2.39k|  const int do_average = conv_params->do_average;
  248|  2.39k|  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  249|       |
  250|  2.39k|  const int w0 = conv_params->fwd_offset;
  251|  2.39k|  const int w1 = conv_params->bck_offset;
  252|  2.39k|  const __m256i wt0 = _mm256_set1_epi32(w0);
  253|  2.39k|  const __m256i wt1 = _mm256_set1_epi32(w1);
  254|  2.39k|  const __m256i zero = _mm256_setzero_si256();
  255|       |
  256|  2.39k|  const __m256i round_const_x = _mm256_set1_epi32(
  257|  2.39k|      ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
  ------------------
  |  |   21|  2.39k|#define FILTER_BITS 7
  ------------------
  258|  2.39k|  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
  259|       |
  260|  2.39k|  const __m256i round_const_y = _mm256_set1_epi32(
  261|  2.39k|      ((1 << conv_params->round_1) >> 1) -
  262|  2.39k|      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
  ------------------
  |  |   21|  2.39k|#define FILTER_BITS 7
  ------------------
  263|  2.39k|  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
  264|       |
  265|  2.39k|  const int offset_0 =
  266|  2.39k|      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  2.39k|#define FILTER_BITS 7
  ------------------
  267|  2.39k|  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  268|  2.39k|  const __m256i offset_const = _mm256_set1_epi32(offset);
  269|  2.39k|  const int rounding_shift =
  270|  2.39k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  2.39k|#define FILTER_BITS 7
  ------------------
  271|  2.39k|  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
  272|       |
  273|  2.39k|  const __m256i clip_pixel_to_bd =
  274|  2.39k|      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (274:25): [True: 2.17k, False: 220]
  |  Branch (274:44): [True: 220, False: 0]
  ------------------
  275|       |
  276|  2.39k|  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
  277|  2.39k|  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
  278|       |
  279|  6.15k|  for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (279:15): [True: 3.76k, False: 2.39k]
  ------------------
  280|       |    /* Horizontal filter */
  281|  3.76k|    {
  282|  46.1k|      for (i = 0; i < im_h; i += 2) {
  ------------------
  |  Branch (282:19): [True: 42.4k, False: 3.76k]
  ------------------
  283|  42.4k|        const __m256i row0 =
  284|  42.4k|            _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
  285|  42.4k|        __m256i row1 = _mm256_setzero_si256();
  286|  42.4k|        if (i + 1 < im_h)
  ------------------
  |  Branch (286:13): [True: 38.6k, False: 3.76k]
  ------------------
  287|  38.6k|          row1 =
  288|  38.6k|              _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
  289|       |
  290|  42.4k|        const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
  291|  42.4k|        const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
  292|       |
  293|       |        // even pixels
  294|  42.4k|        s[0] = _mm256_alignr_epi8(r1, r0, 0);
  295|  42.4k|        s[1] = _mm256_alignr_epi8(r1, r0, 4);
  296|  42.4k|        s[2] = _mm256_alignr_epi8(r1, r0, 8);
  297|  42.4k|        s[3] = _mm256_alignr_epi8(r1, r0, 12);
  298|       |
  299|  42.4k|        __m256i res_even = convolve(s, coeffs_x);
  300|  42.4k|        res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
  301|  42.4k|                                    round_shift_x);
  302|       |
  303|       |        // odd pixels
  304|  42.4k|        s[0] = _mm256_alignr_epi8(r1, r0, 2);
  305|  42.4k|        s[1] = _mm256_alignr_epi8(r1, r0, 6);
  306|  42.4k|        s[2] = _mm256_alignr_epi8(r1, r0, 10);
  307|  42.4k|        s[3] = _mm256_alignr_epi8(r1, r0, 14);
  308|       |
  309|  42.4k|        __m256i res_odd = convolve(s, coeffs_x);
  310|  42.4k|        res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
  311|  42.4k|                                   round_shift_x);
  312|       |
  313|  42.4k|        __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
  314|  42.4k|        __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
  315|  42.4k|        __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
  316|       |
  317|  42.4k|        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
  318|  42.4k|      }
  319|  3.76k|    }
  320|       |
  321|       |    /* Vertical filter */
  322|  3.76k|    {
  323|  3.76k|      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
  324|  3.76k|      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
  325|  3.76k|      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
  326|  3.76k|      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
  327|  3.76k|      __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
  328|  3.76k|      __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
  329|       |
  330|  3.76k|      s[0] = _mm256_unpacklo_epi16(s0, s1);
  331|  3.76k|      s[1] = _mm256_unpacklo_epi16(s2, s3);
  332|  3.76k|      s[2] = _mm256_unpacklo_epi16(s4, s5);
  333|       |
  334|  3.76k|      s[4] = _mm256_unpackhi_epi16(s0, s1);
  335|  3.76k|      s[5] = _mm256_unpackhi_epi16(s2, s3);
  336|  3.76k|      s[6] = _mm256_unpackhi_epi16(s4, s5);
  337|       |
  338|  31.1k|      for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (338:19): [True: 27.3k, False: 3.76k]
  ------------------
  339|  27.3k|        const int16_t *data = &im_block[i * im_stride];
  340|       |
  341|  27.3k|        const __m256i s6 =
  342|  27.3k|            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
  343|  27.3k|        const __m256i s7 =
  344|  27.3k|            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
  345|       |
  346|  27.3k|        s[3] = _mm256_unpacklo_epi16(s6, s7);
  347|  27.3k|        s[7] = _mm256_unpackhi_epi16(s6, s7);
  348|       |
  349|  27.3k|        const __m256i res_a = convolve(s, coeffs_y);
  350|       |
  351|  27.3k|        const __m256i res_a_round = _mm256_sra_epi32(
  352|  27.3k|            _mm256_add_epi32(res_a, round_const_y), round_shift_y);
  353|       |
  354|  27.3k|        const __m256i res_unsigned_lo =
  355|  27.3k|            _mm256_add_epi32(res_a_round, offset_const);
  356|       |
  357|  27.3k|        if (w - j < 8) {
  ------------------
  |  Branch (357:13): [True: 2.64k, False: 24.7k]
  ------------------
  358|  2.64k|          if (do_average) {
  ------------------
  |  Branch (358:15): [True: 1.18k, False: 1.45k]
  ------------------
  359|  1.18k|            const __m256i data_0 = _mm256_castsi128_si256(
  360|  1.18k|                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
  361|  1.18k|            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
  362|  1.18k|                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
  363|  1.18k|            const __m256i data_01 =
  364|  1.18k|                _mm256_permute2x128_si256(data_0, data_1, 0x20);
  365|       |
  366|  1.18k|            const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
  367|       |
  368|  1.18k|            const __m256i comp_avg_res =
  369|  1.18k|                highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
  370|  1.18k|                                use_dist_wtd_comp_avg);
  371|       |
  372|  1.18k|            const __m256i round_result = highbd_convolve_rounding(
  373|  1.18k|                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  374|       |
  375|  1.18k|            const __m256i res_16b =
  376|  1.18k|                _mm256_packus_epi32(round_result, round_result);
  377|  1.18k|            const __m256i res_clip =
  378|  1.18k|                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
  379|       |
  380|  1.18k|            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
  381|  1.18k|            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
  382|       |
  383|  1.18k|            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  384|  1.18k|            _mm_storel_epi64(
  385|  1.18k|                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
  386|  1.45k|          } else {
  387|  1.45k|            __m256i res_16b =
  388|  1.45k|                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
  389|  1.45k|            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
  390|  1.45k|            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
  391|       |
  392|  1.45k|            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
  393|  1.45k|            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  394|  1.45k|                             res_1);
  395|  1.45k|          }
  396|  24.7k|        } else {
  397|  24.7k|          const __m256i res_b = convolve(s + 4, coeffs_y);
  398|  24.7k|          const __m256i res_b_round = _mm256_sra_epi32(
  399|  24.7k|              _mm256_add_epi32(res_b, round_const_y), round_shift_y);
  400|       |
  401|  24.7k|          __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
  402|       |
  403|  24.7k|          if (do_average) {
  ------------------
  |  Branch (403:15): [True: 10.1k, False: 14.6k]
  ------------------
  404|  10.1k|            const __m256i data_0 = _mm256_castsi128_si256(
  405|  10.1k|                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
  406|  10.1k|            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
  407|  10.1k|                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
  408|  10.1k|            const __m256i data_01 =
  409|  10.1k|                _mm256_permute2x128_si256(data_0, data_1, 0x20);
  410|       |
  411|  10.1k|            const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
  412|  10.1k|            const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
  413|       |
  414|  10.1k|            const __m256i comp_avg_res_lo =
  415|  10.1k|                highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
  416|  10.1k|                                use_dist_wtd_comp_avg);
  417|  10.1k|            const __m256i comp_avg_res_hi =
  418|  10.1k|                highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
  419|  10.1k|                                use_dist_wtd_comp_avg);
  420|       |
  421|  10.1k|            const __m256i round_result_lo =
  422|  10.1k|                highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
  423|  10.1k|                                         &rounding_const, rounding_shift);
  424|  10.1k|            const __m256i round_result_hi =
  425|  10.1k|                highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
  426|  10.1k|                                         &rounding_const, rounding_shift);
  427|       |
  428|  10.1k|            const __m256i res_16b =
  429|  10.1k|                _mm256_packus_epi32(round_result_lo, round_result_hi);
  430|  10.1k|            const __m256i res_clip =
  431|  10.1k|                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
  432|       |
  433|  10.1k|            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
  434|  10.1k|            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
  435|       |
  436|  10.1k|            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  437|  10.1k|            _mm_store_si128(
  438|  10.1k|                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
  439|  14.6k|          } else {
  440|  14.6k|            __m256i res_16b =
  441|  14.6k|                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
  442|  14.6k|            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
  443|  14.6k|            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
  444|       |
  445|  14.6k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  446|  14.6k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  447|  14.6k|                            res_1);
  448|  14.6k|          }
  449|  24.7k|        }
  450|       |
  451|  27.3k|        s[0] = s[1];
  452|  27.3k|        s[1] = s[2];
  453|  27.3k|        s[2] = s[3];
  454|       |
  455|  27.3k|        s[4] = s[5];
  456|  27.3k|        s[5] = s[6];
  457|  27.3k|        s[6] = s[7];
  458|  27.3k|      }
  459|  3.76k|    }
  460|  3.76k|  }
  461|  2.39k|}
av1_highbd_dist_wtd_convolve_x_avx2:
  466|  1.12k|    ConvolveParams *conv_params, int bd) {
  467|  1.12k|  CONV_BUF_TYPE *dst = conv_params->dst;
  468|  1.12k|  int dst_stride = conv_params->dst_stride;
  469|  1.12k|  const int fo_horiz = filter_params_x->taps / 2 - 1;
  470|  1.12k|  const uint16_t *const src_ptr = src - fo_horiz;
  471|  1.12k|  const int bits = FILTER_BITS - conv_params->round_1;
  ------------------
  |  |   21|  1.12k|#define FILTER_BITS 7
  ------------------
  472|       |
  473|  1.12k|  int i, j;
  474|  1.12k|  __m256i s[4], coeffs_x[4];
  475|       |
  476|  1.12k|  const int do_average = conv_params->do_average;
  477|  1.12k|  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  478|  1.12k|  const int w0 = conv_params->fwd_offset;
  479|  1.12k|  const int w1 = conv_params->bck_offset;
  480|  1.12k|  const __m256i wt0 = _mm256_set1_epi32(w0);
  481|  1.12k|  const __m256i wt1 = _mm256_set1_epi32(w1);
  482|  1.12k|  const __m256i zero = _mm256_setzero_si256();
  483|       |
  484|  1.12k|  const __m256i round_const_x =
  485|  1.12k|      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
  486|  1.12k|  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
  487|  1.12k|  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  488|       |
  489|  1.12k|  const int offset_0 =
  490|  1.12k|      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  1.12k|#define FILTER_BITS 7
  ------------------
  491|  1.12k|  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  492|  1.12k|  const __m256i offset_const = _mm256_set1_epi32(offset);
  493|  1.12k|  const int rounding_shift =
  494|  1.12k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  1.12k|#define FILTER_BITS 7
  ------------------
  495|  1.12k|  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
  496|  1.12k|  const __m256i clip_pixel_to_bd =
  497|  1.12k|      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (497:25): [True: 1.04k, False: 86]
  |  Branch (497:44): [True: 86, False: 0]
  ------------------
  498|       |
  499|  1.12k|  assert(bits >= 0);
  500|  1.12k|  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
  501|       |
  502|  3.18k|  for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (502:15): [True: 2.06k, False: 1.12k]
  ------------------
  503|       |    /* Horizontal filter */
  504|  23.7k|    for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (504:17): [True: 21.7k, False: 2.06k]
  ------------------
  505|  21.7k|      const __m256i row0 =
  506|  21.7k|          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
  507|  21.7k|      __m256i row1 =
  508|  21.7k|          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
  509|       |
  510|  21.7k|      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
  511|  21.7k|      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
  512|       |
  513|       |      // even pixels
  514|  21.7k|      s[0] = _mm256_alignr_epi8(r1, r0, 0);
  515|  21.7k|      s[1] = _mm256_alignr_epi8(r1, r0, 4);
  516|  21.7k|      s[2] = _mm256_alignr_epi8(r1, r0, 8);
  517|  21.7k|      s[3] = _mm256_alignr_epi8(r1, r0, 12);
  518|       |
  519|  21.7k|      __m256i res_even = convolve(s, coeffs_x);
  520|  21.7k|      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
  521|  21.7k|                                  round_shift_x);
  522|       |
  523|       |      // odd pixels
  524|  21.7k|      s[0] = _mm256_alignr_epi8(r1, r0, 2);
  525|  21.7k|      s[1] = _mm256_alignr_epi8(r1, r0, 6);
  526|  21.7k|      s[2] = _mm256_alignr_epi8(r1, r0, 10);
  527|  21.7k|      s[3] = _mm256_alignr_epi8(r1, r0, 14);
  528|       |
  529|  21.7k|      __m256i res_odd = convolve(s, coeffs_x);
  530|  21.7k|      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
  531|  21.7k|                                 round_shift_x);
  532|       |
  533|  21.7k|      res_even = _mm256_sll_epi32(res_even, round_shift_bits);
  534|  21.7k|      res_odd = _mm256_sll_epi32(res_odd, round_shift_bits);
  535|       |
  536|  21.7k|      __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd);
  537|       |
  538|  21.7k|      __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const);
  539|       |
  540|  21.7k|      if (w - j < 8) {
  ------------------
  |  Branch (540:11): [True: 1.11k, False: 20.5k]
  ------------------
  541|  1.11k|        if (do_average) {
  ------------------
  |  Branch (541:13): [True: 424, False: 688]
  ------------------
  542|    424|          const __m256i data_0 = _mm256_castsi128_si256(
  543|    424|              _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
  544|    424|          const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
  545|    424|              (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
  546|    424|          const __m256i data_01 =
  547|    424|              _mm256_permute2x128_si256(data_0, data_1, 0x20);
  548|       |
  549|    424|          const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
  550|       |
  551|    424|          const __m256i comp_avg_res = highbd_comp_avg(
  552|    424|              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
  553|       |
  554|    424|          const __m256i round_result = highbd_convolve_rounding(
  555|    424|              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  556|       |
  557|    424|          const __m256i res_16b =
  558|    424|              _mm256_packus_epi32(round_result, round_result);
  559|    424|          const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
  560|       |
  561|    424|          const __m128i res_0 = _mm256_castsi256_si128(res_clip);
  562|    424|          const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
  563|       |
  564|    424|          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  565|    424|          _mm_storel_epi64(
  566|    424|              (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
  567|    688|        } else {
  568|    688|          __m256i res_16b =
  569|    688|              _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
  570|    688|          const __m128i res_0 = _mm256_castsi256_si128(res_16b);
  571|    688|          const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
  572|       |
  573|    688|          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
  574|    688|          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  575|    688|                           res_1);
  576|    688|        }
  577|  20.5k|      } else {
  578|  20.5k|        __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd);
  579|  20.5k|        __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const);
  580|       |
  581|  20.5k|        if (do_average) {
  ------------------
  |  Branch (581:13): [True: 10.2k, False: 10.3k]
  ------------------
  582|  10.2k|          const __m256i data_0 = _mm256_castsi128_si256(
  583|  10.2k|              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
  584|  10.2k|          const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
  585|  10.2k|              (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
  586|  10.2k|          const __m256i data_01 =
  587|  10.2k|              _mm256_permute2x128_si256(data_0, data_1, 0x20);
  588|       |
  589|  10.2k|          const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
  590|  10.2k|          const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
  591|       |
  592|  10.2k|          const __m256i comp_avg_res_lo =
  593|  10.2k|              highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
  594|  10.2k|                              use_dist_wtd_comp_avg);
  595|  10.2k|          const __m256i comp_avg_res_hi =
  596|  10.2k|              highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
  597|  10.2k|                              use_dist_wtd_comp_avg);
  598|       |
  599|  10.2k|          const __m256i round_result_lo = highbd_convolve_rounding(
  600|  10.2k|              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
  601|  10.2k|          const __m256i round_result_hi = highbd_convolve_rounding(
  602|  10.2k|              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
  603|       |
  604|  10.2k|          const __m256i res_16b =
  605|  10.2k|              _mm256_packus_epi32(round_result_lo, round_result_hi);
  606|  10.2k|          const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
  607|       |
  608|  10.2k|          const __m128i res_0 = _mm256_castsi256_si128(res_clip);
  609|  10.2k|          const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
  610|       |
  611|  10.2k|          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  612|  10.2k|          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
  613|  10.2k|                          res_1);
  614|  10.3k|        } else {
  615|  10.3k|          __m256i res_16b =
  616|  10.3k|              _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
  617|  10.3k|          const __m128i res_0 = _mm256_castsi256_si128(res_16b);
  618|  10.3k|          const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
  619|       |
  620|  10.3k|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  621|  10.3k|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  622|  10.3k|                          res_1);
  623|  10.3k|        }
  624|  20.5k|      }
  625|  21.7k|    }
  626|  2.06k|  }
  627|  1.12k|}
av1_highbd_dist_wtd_convolve_y_avx2:
  632|    332|    ConvolveParams *conv_params, int bd) {
  633|    332|  CONV_BUF_TYPE *dst = conv_params->dst;
  634|    332|  int dst_stride = conv_params->dst_stride;
  635|    332|  const int fo_vert = filter_params_y->taps / 2 - 1;
  636|    332|  const uint16_t *const src_ptr = src - fo_vert * src_stride;
  637|    332|  const int bits = FILTER_BITS - conv_params->round_0;
  ------------------
  |  |   21|    332|#define FILTER_BITS 7
  ------------------
  638|       |
  639|    332|  assert(bits >= 0);
  640|    332|  int i, j;
  641|    332|  __m256i s[8], coeffs_y[4];
  642|    332|  const int do_average = conv_params->do_average;
  643|    332|  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  644|       |
  645|    332|  const int w0 = conv_params->fwd_offset;
  646|    332|  const int w1 = conv_params->bck_offset;
  647|    332|  const __m256i wt0 = _mm256_set1_epi32(w0);
  648|    332|  const __m256i wt1 = _mm256_set1_epi32(w1);
  649|    332|  const __m256i round_const_y =
  650|    332|      _mm256_set1_epi32(((1 << conv_params->round_1) >> 1));
  651|    332|  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
  652|    332|  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  653|       |
  654|    332|  const int offset_0 =
  655|    332|      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|    332|#define FILTER_BITS 7
  ------------------
  656|    332|  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  657|    332|  const __m256i offset_const = _mm256_set1_epi32(offset);
  658|    332|  const int rounding_shift =
  659|    332|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|    332|#define FILTER_BITS 7
  ------------------
  660|    332|  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
  661|    332|  const __m256i clip_pixel_to_bd =
  662|    332|      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (662:25): [True: 226, False: 106]
  |  Branch (662:44): [True: 106, False: 0]
  ------------------
  663|    332|  const __m256i zero = _mm256_setzero_si256();
  664|       |
  665|    332|  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
  666|       |
  667|  1.27k|  for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (667:15): [True: 942, False: 332]
  ------------------
  668|    942|    const uint16_t *data = &src_ptr[j];
  669|       |    /* Vertical filter */
  670|    942|    {
  671|    942|      __m256i src6;
  672|    942|      __m256i s01 = _mm256_permute2x128_si256(
  673|    942|          _mm256_castsi128_si256(
  674|    942|              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
  675|    942|          _mm256_castsi128_si256(
  676|    942|              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
  677|    942|          0x20);
  678|    942|      __m256i s12 = _mm256_permute2x128_si256(
  679|    942|          _mm256_castsi128_si256(
  680|    942|              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
  681|    942|          _mm256_castsi128_si256(
  682|    942|              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
  683|    942|          0x20);
  684|    942|      __m256i s23 = _mm256_permute2x128_si256(
  685|    942|          _mm256_castsi128_si256(
  686|    942|              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
  687|    942|          _mm256_castsi128_si256(
  688|    942|              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
  689|    942|          0x20);
  690|    942|      __m256i s34 = _mm256_permute2x128_si256(
  691|    942|          _mm256_castsi128_si256(
  692|    942|              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
  693|    942|          _mm256_castsi128_si256(
  694|    942|              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
  695|    942|          0x20);
  696|    942|      __m256i s45 = _mm256_permute2x128_si256(
  697|    942|          _mm256_castsi128_si256(
  698|    942|              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
  699|    942|          _mm256_castsi128_si256(
  700|    942|              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
  701|    942|          0x20);
  702|    942|      src6 = _mm256_castsi128_si256(
  703|    942|          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
  704|    942|      __m256i s56 = _mm256_permute2x128_si256(
  705|    942|          _mm256_castsi128_si256(
  706|    942|              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
  707|    942|          src6, 0x20);
  708|       |
  709|    942|      s[0] = _mm256_unpacklo_epi16(s01, s12);
  710|    942|      s[1] = _mm256_unpacklo_epi16(s23, s34);
  711|    942|      s[2] = _mm256_unpacklo_epi16(s45, s56);
  712|       |
  713|    942|      s[4] = _mm256_unpackhi_epi16(s01, s12);
  714|    942|      s[5] = _mm256_unpackhi_epi16(s23, s34);
  715|    942|      s[6] = _mm256_unpackhi_epi16(s45, s56);
  716|       |
  717|  8.39k|      for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (717:19): [True: 7.44k, False: 942]
  ------------------
  718|  7.44k|        data = &src_ptr[i * src_stride + j];
  719|       |
  720|  7.44k|        const __m256i s67 = _mm256_permute2x128_si256(
  721|  7.44k|            src6,
  722|  7.44k|            _mm256_castsi128_si256(
  723|  7.44k|                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
  724|  7.44k|            0x20);
  725|       |
  726|  7.44k|        src6 = _mm256_castsi128_si256(
  727|  7.44k|            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
  728|       |
  729|  7.44k|        const __m256i s78 = _mm256_permute2x128_si256(
  730|  7.44k|            _mm256_castsi128_si256(
  731|  7.44k|                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
  732|  7.44k|            src6, 0x20);
  733|       |
  734|  7.44k|        s[3] = _mm256_unpacklo_epi16(s67, s78);
  735|  7.44k|        s[7] = _mm256_unpackhi_epi16(s67, s78);
  736|       |
  737|  7.44k|        const __m256i res_a = convolve(s, coeffs_y);
  738|       |
  739|  7.44k|        __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits);
  740|  7.44k|        res_a_round = _mm256_sra_epi32(
  741|  7.44k|            _mm256_add_epi32(res_a_round, round_const_y), round_shift_y);
  742|       |
  743|  7.44k|        __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const);
  744|       |
  745|  7.44k|        if (w - j < 8) {
  ------------------
  |  Branch (745:13): [True: 272, False: 7.17k]
  ------------------
  746|    272|          if (do_average) {
  ------------------
  |  Branch (746:15): [True: 120, False: 152]
  ------------------
  747|    120|            const __m256i data_0 = _mm256_castsi128_si256(
  748|    120|                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
  749|    120|            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
  750|    120|                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
  751|    120|            const __m256i data_01 =
  752|    120|                _mm256_permute2x128_si256(data_0, data_1, 0x20);
  753|       |
  754|    120|            const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
  755|       |
  756|    120|            const __m256i comp_avg_res =
  757|    120|                highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
  758|    120|                                use_dist_wtd_comp_avg);
  759|       |
  760|    120|            const __m256i round_result = highbd_convolve_rounding(
  761|    120|                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  762|       |
  763|    120|            const __m256i res_16b =
  764|    120|                _mm256_packus_epi32(round_result, round_result);
  765|    120|            const __m256i res_clip =
  766|    120|                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
  767|       |
  768|    120|            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
  769|    120|            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
  770|       |
  771|    120|            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  772|    120|            _mm_storel_epi64(
  773|    120|                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
  774|    152|          } else {
  775|    152|            __m256i res_16b =
  776|    152|                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
  777|    152|            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
  778|    152|            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
  779|       |
  780|    152|            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
  781|    152|            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  782|    152|                             res_1);
  783|    152|          }
  784|  7.17k|        } else {
  785|  7.17k|          const __m256i res_b = convolve(s + 4, coeffs_y);
  786|  7.17k|          __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits);
  787|  7.17k|          res_b_round = _mm256_sra_epi32(
  788|  7.17k|              _mm256_add_epi32(res_b_round, round_const_y), round_shift_y);
  789|       |
  790|  7.17k|          __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
  791|       |
  792|  7.17k|          if (do_average) {
  ------------------
  |  Branch (792:15): [True: 1.23k, False: 5.94k]
  ------------------
  793|  1.23k|            const __m256i data_0 = _mm256_castsi128_si256(
  794|  1.23k|                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
  795|  1.23k|            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
  796|  1.23k|                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
  797|  1.23k|            const __m256i data_01 =
  798|  1.23k|                _mm256_permute2x128_si256(data_0, data_1, 0x20);
  799|       |
  800|  1.23k|            const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
  801|  1.23k|            const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
  802|       |
  803|  1.23k|            const __m256i comp_avg_res_lo =
  804|  1.23k|                highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
  805|  1.23k|                                use_dist_wtd_comp_avg);
  806|  1.23k|            const __m256i comp_avg_res_hi =
  807|  1.23k|                highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
  808|  1.23k|                                use_dist_wtd_comp_avg);
  809|       |
  810|  1.23k|            const __m256i round_result_lo =
  811|  1.23k|                highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
  812|  1.23k|                                         &rounding_const, rounding_shift);
  813|  1.23k|            const __m256i round_result_hi =
  814|  1.23k|                highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
  815|  1.23k|                                         &rounding_const, rounding_shift);
  816|       |
  817|  1.23k|            const __m256i res_16b =
  818|  1.23k|                _mm256_packus_epi32(round_result_lo, round_result_hi);
  819|  1.23k|            const __m256i res_clip =
  820|  1.23k|                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
  821|       |
  822|  1.23k|            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
  823|  1.23k|            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
  824|       |
  825|  1.23k|            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  826|  1.23k|            _mm_store_si128(
  827|  1.23k|                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
  828|  5.94k|          } else {
  829|  5.94k|            __m256i res_16b =
  830|  5.94k|                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
  831|  5.94k|            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
  832|  5.94k|            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
  833|       |
  834|  5.94k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  835|  5.94k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  836|  5.94k|                            res_1);
  837|  5.94k|          }
  838|  7.17k|        }
  839|  7.44k|        s[0] = s[1];
  840|  7.44k|        s[1] = s[2];
  841|  7.44k|        s[2] = s[3];
  842|       |
  843|  7.44k|        s[4] = s[5];
  844|  7.44k|        s[5] = s[6];
  845|  7.44k|        s[6] = s[7];
  846|  7.44k|      }
  847|    942|    }
  848|    942|  }
  849|    332|}

highbd_inv_txfm_sse4.c:half_btf_0_sse4_1:
  112|  10.4k|                                        const __m128i *rounding, int bit) {
  113|  10.4k|  __m128i x;
  114|       |
  115|  10.4k|  x = _mm_mullo_epi32(*w0, *n0);
  116|  10.4k|  x = _mm_add_epi32(x, *rounding);
  117|  10.4k|  x = _mm_srai_epi32(x, bit);
  118|  10.4k|  return x;
  119|  10.4k|}
highbd_inv_txfm_sse4.c:half_btf_sse4_1:
  100|   680k|                                      const __m128i *rounding, int bit) {
  101|   680k|  __m128i x, y;
  102|       |
  103|   680k|  x = _mm_mullo_epi32(*w0, *n0);
  104|   680k|  y = _mm_mullo_epi32(*w1, *n1);
  105|   680k|  x = _mm_add_epi32(x, y);
  106|   680k|  x = _mm_add_epi32(x, *rounding);
  107|   680k|  x = _mm_srai_epi32(x, bit);
  108|   680k|  return x;
  109|   680k|}

av1_highbd_warp_affine_avx2:
   23|    956|                                 int16_t beta, int16_t gamma, int16_t delta) {
   24|    956|  __m256i tmp[15];
   25|    956|  const int reduce_bits_horiz = conv_params->round_0;
   26|    956|  const int reduce_bits_vert = conv_params->is_compound
  ------------------
  |  Branch (26:32): [True: 50, False: 906]
  ------------------
   27|    956|                                   ? conv_params->round_1
   28|    956|                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  ------------------
  |  |   21|    906|#define FILTER_BITS 7
  ------------------
   29|    956|  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  ------------------
  |  |   21|    956|#define FILTER_BITS 7
  ------------------
   30|    956|  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  ------------------
  |  |   21|    956|#define FILTER_BITS 7
  ------------------
   31|    956|  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  ------------------
  |  |   21|    956|#define FILTER_BITS 7
  ------------------
   32|    956|  const int round_bits =
   33|    956|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|    956|#define FILTER_BITS 7
  ------------------
   34|    956|  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  ------------------
  |  |   21|    956|#define FILTER_BITS 7
  ------------------
   35|    956|  (void)max_bits_horiz;
   36|    956|  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
   37|       |
   38|       |  // Check that, even with 12-bit input, the intermediate values will fit
   39|       |  // into an unsigned 16-bit intermediate array.
   40|    956|  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
   41|       |
   42|    956|  const __m256i clip_pixel =
   43|    956|      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (43:25): [True: 766, False: 190]
  |  Branch (43:44): [True: 190, False: 0]
  ------------------
   44|    956|  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
   45|    956|  const __m256i reduce_bits_vert_const =
   46|    956|      _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
   47|    956|  const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
   48|    956|  const __m256i res_sub_const =
   49|    956|      _mm256_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
   50|    956|                        (1 << (offset_bits - conv_params->round_1 - 1)));
   51|    956|  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
   52|    956|  __m256i round_bits_const = _mm256_set1_epi32(((1 << round_bits) >> 1));
   53|       |
   54|    956|  const int w0 = conv_params->fwd_offset;
   55|    956|  const int w1 = conv_params->bck_offset;
   56|    956|  const __m256i wt0 = _mm256_set1_epi32(w0);
   57|    956|  const __m256i wt1 = _mm256_set1_epi32(w1);
   58|       |
   59|    956|  __m256i v_rbhoriz = _mm256_set1_epi32(1 << (reduce_bits_horiz - 1));
   60|    956|  __m256i v_zeros = _mm256_setzero_si256();
   61|    956|  int ohoriz = 1 << offset_bits_horiz;
   62|    956|  int mhoriz = 1 << max_bits_horiz;
   63|    956|  (void)mhoriz;
   64|    956|  int sx;
   65|       |
   66|  2.69k|  for (int i = 0; i < p_height; i += 8) {
  ------------------
  |  Branch (66:19): [True: 1.73k, False: 956]
  ------------------
   67|  6.00k|    for (int j = 0; j < p_width; j += 8) {
  ------------------
  |  Branch (67:21): [True: 4.26k, False: 1.73k]
  ------------------
   68|       |      // Calculate the center of this 8x8 block,
   69|       |      // project to luma coordinates (if in a subsampled chroma plane),
   70|       |      // apply the affine transformation,
   71|       |      // then convert back to the original coordinates (if necessary)
   72|  4.26k|      const int32_t src_x = (p_col + j + 4) << subsampling_x;
   73|  4.26k|      const int32_t src_y = (p_row + i + 4) << subsampling_y;
   74|  4.26k|      const int64_t dst_x =
   75|  4.26k|          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
   76|  4.26k|      const int64_t dst_y =
   77|  4.26k|          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
   78|  4.26k|      const int64_t x4 = dst_x >> subsampling_x;
   79|  4.26k|      const int64_t y4 = dst_y >> subsampling_y;
   80|       |
   81|  4.26k|      const int16_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
  ------------------
  |  |   96|  4.26k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
   82|  4.26k|      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
  ------------------
  |  |   96|  4.26k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
   83|  4.26k|      const int16_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
  ------------------
  |  |   96|  4.26k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
   84|  4.26k|      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
  ------------------
  |  |   96|  4.26k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
   85|       |
   86|  4.26k|      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
  ------------------
  |  |  107|  4.26k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  4.26k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  4.26k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
   87|  4.26k|             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  ------------------
  |  |  103|  4.26k|#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  4.26k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
                           (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  ------------------
  |  |  107|  4.26k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  4.26k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  4.26k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
   88|  4.26k|      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
  ------------------
  |  |  107|  4.26k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  4.26k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  4.26k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
   89|  4.26k|             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  ------------------
  |  |  103|  4.26k|#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  4.26k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
                           (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  ------------------
  |  |  107|  4.26k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  4.26k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  4.26k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
   90|       |
   91|  4.26k|      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
  ------------------
  |  |  105|  4.26k|#define WARP_PARAM_REDUCE_BITS 6
  ------------------
   92|  4.26k|      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
  ------------------
  |  |  105|  4.26k|#define WARP_PARAM_REDUCE_BITS 6
  ------------------
   93|       |
   94|       |      // Horizontal filter
   95|  4.26k|      if (ix4 <= -7) {
  ------------------
  |  Branch (95:11): [True: 16, False: 4.24k]
  ------------------
   96|    256|        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
  ------------------
  |  |   34|    256|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 128, False: 128]
  |  |  ------------------
  ------------------
  |  Branch (96:26): [True: 240, False: 16]
  ------------------
   97|    240|          int iy = iy4 + k;
   98|    240|          if (iy < 0)
  ------------------
  |  Branch (98:15): [True: 0, False: 240]
  ------------------
   99|      0|            iy = 0;
  100|    240|          else if (iy > height - 1)
  ------------------
  |  Branch (100:20): [True: 0, False: 240]
  ------------------
  101|      0|            iy = height - 1;
  102|    240|          tmp[k + 7] = _mm256_cvtepi16_epi32(_mm_set1_epi16(
  103|    240|              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
  ------------------
  |  |   21|    240|#define FILTER_BITS 7
  ------------------
  104|    240|              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))));
  ------------------
  |  |   21|    240|#define FILTER_BITS 7
  ------------------
  105|    240|        }
  106|  4.24k|      } else if (ix4 >= width + 6) {
  ------------------
  |  Branch (106:18): [True: 108, False: 4.14k]
  ------------------
  107|  1.72k|        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
  ------------------
  |  |   34|  1.72k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.12k, False: 608]
  |  |  ------------------
  ------------------
  |  Branch (107:26): [True: 1.62k, False: 108]
  ------------------
  108|  1.62k|          int iy = iy4 + k;
  109|  1.62k|          if (iy < 0)
  ------------------
  |  Branch (109:15): [True: 38, False: 1.58k]
  ------------------
  110|     38|            iy = 0;
  111|  1.58k|          else if (iy > height - 1)
  ------------------
  |  Branch (111:20): [True: 32, False: 1.55k]
  ------------------
  112|     32|            iy = height - 1;
  113|  1.62k|          tmp[k + 7] = _mm256_cvtepi16_epi32(
  114|  1.62k|              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
  ------------------
  |  |   21|  1.62k|#define FILTER_BITS 7
  ------------------
  115|  1.62k|                             ref[iy * stride + (width - 1)] *
  116|  1.62k|                                 (1 << (FILTER_BITS - reduce_bits_horiz))));
  ------------------
  |  |   21|  1.62k|#define FILTER_BITS 7
  ------------------
  117|  1.62k|        }
  118|  4.14k|      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
  ------------------
  |  Branch (118:18): [True: 312, False: 3.82k]
  |  Branch (118:37): [True: 466, False: 3.36k]
  ------------------
  119|    778|        int32_t tmp1[8];
  120|  12.4k|        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
  ------------------
  |  |   34|  12.4k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.79k, False: 6.65k]
  |  |  ------------------
  ------------------
  |  Branch (120:26): [True: 11.6k, False: 778]
  ------------------
  121|  11.6k|          const int iy = clamp(iy4 + k, 0, height - 1);
  122|       |
  123|  11.6k|          sx = sx4 + beta * (k + 4);
  124|   105k|          for (int l = -4; l < 4; ++l) {
  ------------------
  |  Branch (124:28): [True: 93.3k, False: 11.6k]
  ------------------
  125|  93.3k|            int ix = ix4 + l - 3;
  126|  93.3k|            const int offs = sx >> WARPEDDIFF_PREC_BITS;
  ------------------
  |  |  107|  93.3k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  93.3k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  93.3k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  127|  93.3k|            const int16_t *coeffs = av1_warped_filter[offs];
  128|       |
  129|  93.3k|            int32_t sum = 1 << offset_bits_horiz;
  130|   840k|            for (int m = 0; m < 8; ++m) {
  ------------------
  |  Branch (130:29): [True: 746k, False: 93.3k]
  ------------------
  131|   746k|              const int sample_x = clamp(ix + m, 0, width - 1);
  132|   746k|              sum += ref[iy * stride + sample_x] * coeffs[m];
  133|   746k|            }
  134|  93.3k|            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
  ------------------
  |  |   41|  93.3k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  135|  93.3k|            tmp1[(l + 4) / 2 + ((l + 4) % 2) * 4] = sum;
  136|  93.3k|            sx += alpha;
  137|  93.3k|          }
  138|  11.6k|          tmp[k + 7] = _mm256_loadu_si256((__m256i *)tmp1);
  139|  11.6k|        }
  140|  3.36k|      } else {
  141|  3.36k|        if (beta == 0 && alpha == 0) {
  ------------------
  |  Branch (141:13): [True: 2.06k, False: 1.29k]
  |  Branch (141:26): [True: 1.07k, False: 990]
  ------------------
  142|  1.07k|          sx = sx4;
  143|  1.07k|          __m128i v_01 = _mm_loadu_si128(
  144|  1.07k|              (__m128i *)
  145|  1.07k|                  av1_warped_filter[sx >>
  146|  1.07k|                                    WARPEDDIFF_PREC_BITS]);  // A7A6A5A4A3A2A1A0
  ------------------
  |  |  107|  1.07k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  1.07k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  1.07k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  147|  1.07k|          __m256i v_c01 = _mm256_broadcastd_epi32(v_01);     // A1A0A1A0A1A0A1A0
  148|  1.07k|          __m256i v_c23 = _mm256_broadcastd_epi32(
  149|  1.07k|              _mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
  150|  1.07k|          __m256i v_c45 = _mm256_broadcastd_epi32(
  151|  1.07k|              _mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
  152|  1.07k|          __m256i v_c67 = _mm256_broadcastd_epi32(
  153|  1.07k|              _mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6
  154|  17.2k|          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
  ------------------
  |  |   34|  17.2k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 10.8k, False: 6.36k]
  |  |  ------------------
  ------------------
  |  Branch (154:28): [True: 16.1k, False: 1.07k]
  ------------------
  155|  16.1k|            int iy = iy4 + k;
  156|  16.1k|            if (iy < 0)
  ------------------
  |  Branch (156:17): [True: 400, False: 15.7k]
  ------------------
  157|    400|              iy = 0;
  158|  15.7k|            else if (iy > height - 1)
  ------------------
  |  Branch (158:22): [True: 558, False: 15.1k]
  ------------------
  159|    558|              iy = height - 1;
  160|  16.1k|            iy = iy * stride;
  161|       |
  162|  16.1k|            __m256i v_refl = _mm256_inserti128_si256(
  163|  16.1k|                _mm256_setzero_si256(),
  164|  16.1k|                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
  165|  16.1k|            v_refl = _mm256_inserti128_si256(
  166|  16.1k|                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
  167|  16.1k|                1);  // R15 .. R0
  168|       |
  169|  16.1k|            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
  170|       |
  171|  16.1k|            __m256i v_refu =
  172|  16.1k|                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
  173|  16.1k|            v_refl = _mm256_inserti128_si256(
  174|  16.1k|                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
  175|  16.1k|            v_refu = _mm256_inserti128_si256(
  176|  16.1k|                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
  177|       |
  178|  16.1k|            __m256i v_sum = _mm256_set1_epi32(ohoriz);
  179|  16.1k|            __m256i parsum = _mm256_madd_epi16(
  180|  16.1k|                v_c01, _mm256_alignr_epi8(v_refu, v_refl,
  181|  16.1k|                                          0));  // R8R7R6..R1R7R6R5..R1R0
  182|  16.1k|            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
  183|       |
  184|  16.1k|            parsum = _mm256_madd_epi16(
  185|  16.1k|                v_c23,
  186|  16.1k|                _mm256_alignr_epi8(v_refu, v_refl, 4));  // R10R9..R3R9R8..R3R2
  187|  16.1k|            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
  188|  16.1k|            parsum = _mm256_madd_epi16(
  189|  16.1k|                v_c45, _mm256_alignr_epi8(v_refu, v_refl,
  190|  16.1k|                                          8));  // R12R11..R5R11R10..R5R4
  191|  16.1k|            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
  192|  16.1k|            parsum = _mm256_madd_epi16(
  193|  16.1k|                v_c67, _mm256_alignr_epi8(v_refu, v_refl,
  194|  16.1k|                                          12));  // R14R13..R7R13R12..R7R6
  195|  16.1k|            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
  196|       |
  197|  16.1k|            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
  198|  16.1k|                                           reduce_bits_horiz);
  199|  16.1k|          }
  200|  2.28k|        } else if (alpha == 0) {
  ------------------
  |  Branch (200:20): [True: 258, False: 2.02k]
  ------------------
  201|  4.12k|          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
  ------------------
  |  |   34|  4.12k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.91k, False: 1.21k]
  |  |  ------------------
  ------------------
  |  Branch (201:28): [True: 3.87k, False: 258]
  ------------------
  202|  3.87k|            int iy = iy4 + k;
  203|  3.87k|            if (iy < 0)
  ------------------
  |  Branch (203:17): [True: 0, False: 3.87k]
  ------------------
  204|      0|              iy = 0;
  205|  3.87k|            else if (iy > height - 1)
  ------------------
  |  Branch (205:22): [True: 0, False: 3.87k]
  ------------------
  206|      0|              iy = height - 1;
  207|  3.87k|            iy = iy * stride;
  208|       |
  209|  3.87k|            sx = sx4 + beta * (k + 4);
  210|       |
  211|  3.87k|            __m128i v_01 = _mm_loadu_si128(
  212|  3.87k|                (__m128i *)av1_warped_filter
  213|  3.87k|                    [sx >> WARPEDDIFF_PREC_BITS]);          // A7A6A5A4A3A2A1A0
  ------------------
  |  |  107|  3.87k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  214|  3.87k|            __m256i v_c01 = _mm256_broadcastd_epi32(v_01);  // A1A0A1A0A1A0A1A0
  215|  3.87k|            __m256i v_c23 = _mm256_broadcastd_epi32(
  216|  3.87k|                _mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
  217|  3.87k|            __m256i v_c45 = _mm256_broadcastd_epi32(
  218|  3.87k|                _mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
  219|  3.87k|            __m256i v_c67 = _mm256_broadcastd_epi32(
  220|  3.87k|                _mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6
  221|       |
  222|  3.87k|            __m256i v_refl = _mm256_inserti128_si256(
  223|  3.87k|                _mm256_setzero_si256(),
  224|  3.87k|                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
  225|  3.87k|            v_refl = _mm256_inserti128_si256(
  226|  3.87k|                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
  227|  3.87k|                1);  // R15 .. R0
  228|       |
  229|  3.87k|            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
  230|       |
  231|  3.87k|            __m256i v_refu =
  232|  3.87k|                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
  233|       |
  234|  3.87k|            v_refl = _mm256_inserti128_si256(
  235|  3.87k|                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
  236|  3.87k|            v_refu = _mm256_inserti128_si256(
  237|  3.87k|                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
  238|       |
  239|  3.87k|            __m256i v_sum = _mm256_set1_epi32(ohoriz);
  240|  3.87k|            __m256i parsum =
  241|  3.87k|                _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
  242|  3.87k|            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
  243|       |
  244|  3.87k|            parsum =
  245|  3.87k|                _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
  246|  3.87k|            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
  247|  3.87k|            parsum =
  248|  3.87k|                _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
  249|  3.87k|            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
  250|  3.87k|            parsum = _mm256_madd_epi16(v_c67,
  251|  3.87k|                                       _mm256_alignr_epi8(v_refu, v_refl, 12));
  252|  3.87k|            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
  253|       |
  254|  3.87k|            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
  255|  3.87k|                                           reduce_bits_horiz);
  256|  3.87k|          }
  257|  2.02k|        } else if (beta == 0) {
  ------------------
  |  Branch (257:20): [True: 990, False: 1.03k]
  ------------------
  258|    990|          sx = sx4;
  259|    990|          __m256i v_coeff01 = _mm256_inserti128_si256(
  260|    990|              v_zeros,
  261|    990|              _mm_loadu_si128(
  262|    990|                  (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
  263|    990|              0);
  264|    990|          v_coeff01 = _mm256_inserti128_si256(
  265|    990|              v_coeff01,
  266|    990|              _mm_loadu_si128(
  267|    990|                  (__m128i *)
  268|    990|                      av1_warped_filter[(sx + alpha) >> WARPEDDIFF_PREC_BITS]),
  269|    990|              1);  // B7B6..B1B0A7A6..A1A0
  270|    990|          __m256i v_coeff23 = _mm256_inserti128_si256(
  271|    990|              v_zeros,
  272|    990|              _mm_loadu_si128(
  273|    990|                  (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
  274|    990|                                               WARPEDDIFF_PREC_BITS]),
  275|    990|              0);
  276|    990|          v_coeff23 = _mm256_inserti128_si256(
  277|    990|              v_coeff23,
  278|    990|              _mm_loadu_si128(
  279|    990|                  (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
  280|    990|                                               WARPEDDIFF_PREC_BITS]),
  281|    990|              1);  // D7D6..D1D0C7C6..C1C0
  282|    990|          __m256i v_coeff45 = _mm256_inserti128_si256(
  283|    990|              v_zeros,
  284|    990|              _mm_loadu_si128(
  285|    990|                  (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
  286|    990|                                               WARPEDDIFF_PREC_BITS]),
  287|    990|              0);
  288|    990|          v_coeff45 = _mm256_inserti128_si256(
  289|    990|              v_coeff45,
  290|    990|              _mm_loadu_si128(
  291|    990|                  (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
  292|    990|                                               WARPEDDIFF_PREC_BITS]),
  293|    990|              1);  // F7F6..F1F0E7E6..E1E0
  294|    990|          __m256i v_coeff67 = _mm256_inserti128_si256(
  295|    990|              v_zeros,
  296|    990|              _mm_loadu_si128(
  297|    990|                  (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
  298|    990|                                               WARPEDDIFF_PREC_BITS]),
  299|    990|              0);
  300|    990|          v_coeff67 = _mm256_inserti128_si256(
  301|    990|              v_coeff67,
  302|    990|              _mm_loadu_si128(
  303|    990|                  (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
  304|    990|                                               WARPEDDIFF_PREC_BITS]),
  305|    990|              1);  // H7H6..H1H0G7G6..G1G0
  306|       |
  307|    990|          __m256i v_c0123 = _mm256_unpacklo_epi32(
  308|    990|              v_coeff01,
  309|    990|              v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
  310|    990|          __m256i v_c0123u = _mm256_unpackhi_epi32(
  311|    990|              v_coeff01,
  312|    990|              v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
  313|    990|          __m256i v_c4567 = _mm256_unpacklo_epi32(
  314|    990|              v_coeff45,
  315|    990|              v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
  316|    990|          __m256i v_c4567u = _mm256_unpackhi_epi32(
  317|    990|              v_coeff45,
  318|    990|              v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
  319|       |
  320|    990|          __m256i v_c01 = _mm256_unpacklo_epi64(
  321|    990|              v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
  322|    990|          __m256i v_c23 =
  323|    990|              _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
  324|    990|          __m256i v_c45 =
  325|    990|              _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
  326|    990|          __m256i v_c67 =
  327|    990|              _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
  328|       |
  329|  15.8k|          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
  ------------------
  |  |   34|  15.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 8.00k, False: 7.84k]
  |  |  ------------------
  ------------------
  |  Branch (329:28): [True: 14.8k, False: 990]
  ------------------
  330|  14.8k|            int iy = iy4 + k;
  331|  14.8k|            if (iy < 0)
  ------------------
  |  Branch (331:17): [True: 722, False: 14.1k]
  ------------------
  332|    722|              iy = 0;
  333|  14.1k|            else if (iy > height - 1)
  ------------------
  |  Branch (333:22): [True: 156, False: 13.9k]
  ------------------
  334|    156|              iy = height - 1;
  335|  14.8k|            iy = iy * stride;
  336|       |
  337|  14.8k|            __m256i v_refl = _mm256_inserti128_si256(
  338|  14.8k|                _mm256_setzero_si256(),
  339|  14.8k|                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
  340|  14.8k|            v_refl = _mm256_inserti128_si256(
  341|  14.8k|                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
  342|  14.8k|                1);  // R15 .. R0
  343|       |
  344|  14.8k|            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
  345|       |
  346|  14.8k|            __m256i v_refu =
  347|  14.8k|                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
  348|       |
  349|  14.8k|            v_refl = _mm256_inserti128_si256(
  350|  14.8k|                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
  351|  14.8k|            v_refu = _mm256_inserti128_si256(
  352|  14.8k|                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
  353|       |
  354|  14.8k|            __m256i v_sum = _mm256_set1_epi32(ohoriz);
  355|  14.8k|            __m256i parsum = _mm256_madd_epi16(
  356|  14.8k|                v_c01, _mm256_alignr_epi8(v_refu, v_refl,
  357|  14.8k|                                          0));  // R8R7R6..R1R7R6R5..R1R0
  358|  14.8k|            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
  359|       |
  360|  14.8k|            parsum = _mm256_madd_epi16(
  361|  14.8k|                v_c23,
  362|  14.8k|                _mm256_alignr_epi8(v_refu, v_refl, 4));  // R10R9..R3R9R8..R3R2
  363|  14.8k|            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
  364|  14.8k|            parsum = _mm256_madd_epi16(
  365|  14.8k|                v_c45, _mm256_alignr_epi8(v_refu, v_refl,
  366|  14.8k|                                          8));  // R12R11..R5R11R10..R5R4
  367|  14.8k|            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
  368|  14.8k|            parsum = _mm256_madd_epi16(
  369|  14.8k|                v_c67, _mm256_alignr_epi8(v_refu, v_refl,
  370|  14.8k|                                          12));  // R14R13..R7R13R12..R7R6
  371|  14.8k|            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
  372|       |
  373|  14.8k|            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
  374|  14.8k|                                           reduce_bits_horiz);
  375|  14.8k|          }
  376|       |
  377|  1.03k|        } else {
  378|  16.6k|          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
  ------------------
  |  |   34|  16.6k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 10.2k, False: 6.36k]
  |  |  ------------------
  ------------------
  |  Branch (378:28): [True: 15.5k, False: 1.03k]
  ------------------
  379|  15.5k|            int iy = iy4 + k;
  380|  15.5k|            if (iy < 0)
  ------------------
  |  Branch (380:17): [True: 150, False: 15.4k]
  ------------------
  381|    150|              iy = 0;
  382|  15.4k|            else if (iy > height - 1)
  ------------------
  |  Branch (382:22): [True: 104, False: 15.3k]
  ------------------
  383|    104|              iy = height - 1;
  384|  15.5k|            iy = iy * stride;
  385|       |
  386|  15.5k|            sx = sx4 + beta * (k + 4);
  387|       |
  388|  15.5k|            __m256i v_coeff01 = _mm256_inserti128_si256(
  389|  15.5k|                v_zeros,
  390|  15.5k|                _mm_loadu_si128(
  391|  15.5k|                    (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
  392|  15.5k|                0);
  393|  15.5k|            v_coeff01 = _mm256_inserti128_si256(
  394|  15.5k|                v_coeff01,
  395|  15.5k|                _mm_loadu_si128(
  396|  15.5k|                    (__m128i *)av1_warped_filter[(sx + alpha) >>
  397|  15.5k|                                                 WARPEDDIFF_PREC_BITS]),
  398|  15.5k|                1);  // B7B6..B1B0A7A6..A1A0
  399|  15.5k|            __m256i v_coeff23 = _mm256_inserti128_si256(
  400|  15.5k|                v_zeros,
  401|  15.5k|                _mm_loadu_si128(
  402|  15.5k|                    (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
  403|  15.5k|                                                 WARPEDDIFF_PREC_BITS]),
  404|  15.5k|                0);
  405|  15.5k|            v_coeff23 = _mm256_inserti128_si256(
  406|  15.5k|                v_coeff23,
  407|  15.5k|                _mm_loadu_si128(
  408|  15.5k|                    (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
  409|  15.5k|                                                 WARPEDDIFF_PREC_BITS]),
  410|  15.5k|                1);  // D7D6..D1D0C7C6..C1C0
  411|  15.5k|            __m256i v_coeff45 = _mm256_inserti128_si256(
  412|  15.5k|                v_zeros,
  413|  15.5k|                _mm_loadu_si128(
  414|  15.5k|                    (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
  415|  15.5k|                                                 WARPEDDIFF_PREC_BITS]),
  416|  15.5k|                0);
  417|  15.5k|            v_coeff45 = _mm256_inserti128_si256(
  418|  15.5k|                v_coeff45,
  419|  15.5k|                _mm_loadu_si128(
  420|  15.5k|                    (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
  421|  15.5k|                                                 WARPEDDIFF_PREC_BITS]),
  422|  15.5k|                1);  // F7F6..F1F0E7E6..E1E0
  423|  15.5k|            __m256i v_coeff67 = _mm256_inserti128_si256(
  424|  15.5k|                v_zeros,
  425|  15.5k|                _mm_loadu_si128(
  426|  15.5k|                    (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
  427|  15.5k|                                                 WARPEDDIFF_PREC_BITS]),
  428|  15.5k|                0);
  429|  15.5k|            v_coeff67 = _mm256_inserti128_si256(
  430|  15.5k|                v_coeff67,
  431|  15.5k|                _mm_loadu_si128(
  432|  15.5k|                    (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
  433|  15.5k|                                                 WARPEDDIFF_PREC_BITS]),
  434|  15.5k|                1);  // H7H6..H1H0G7G6..G1G0
  435|       |
  436|  15.5k|            __m256i v_c0123 = _mm256_unpacklo_epi32(
  437|  15.5k|                v_coeff01,
  438|  15.5k|                v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
  439|  15.5k|            __m256i v_c0123u = _mm256_unpackhi_epi32(
  440|  15.5k|                v_coeff01,
  441|  15.5k|                v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
  442|  15.5k|            __m256i v_c4567 = _mm256_unpacklo_epi32(
  443|  15.5k|                v_coeff45,
  444|  15.5k|                v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
  445|  15.5k|            __m256i v_c4567u = _mm256_unpackhi_epi32(
  446|  15.5k|                v_coeff45,
  447|  15.5k|                v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
  448|       |
  449|  15.5k|            __m256i v_c01 = _mm256_unpacklo_epi64(
  450|  15.5k|                v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
  451|  15.5k|            __m256i v_c23 =
  452|  15.5k|                _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
  453|  15.5k|            __m256i v_c45 =
  454|  15.5k|                _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
  455|  15.5k|            __m256i v_c67 =
  456|  15.5k|                _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
  457|       |
  458|  15.5k|            __m256i v_refl = _mm256_inserti128_si256(
  459|  15.5k|                _mm256_setzero_si256(),
  460|  15.5k|                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
  461|  15.5k|            v_refl = _mm256_inserti128_si256(
  462|  15.5k|                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
  463|  15.5k|                1);  // R15 .. R0
  464|       |
  465|  15.5k|            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
  466|       |
  467|  15.5k|            __m256i v_refu =
  468|  15.5k|                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
  469|       |
  470|  15.5k|            v_refl = _mm256_inserti128_si256(
  471|  15.5k|                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
  472|  15.5k|            v_refu = _mm256_inserti128_si256(
  473|  15.5k|                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
  474|       |
  475|  15.5k|            __m256i v_sum = _mm256_set1_epi32(ohoriz);
  476|  15.5k|            __m256i parsum =
  477|  15.5k|                _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
  478|  15.5k|            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
  479|       |
  480|  15.5k|            parsum =
  481|  15.5k|                _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
  482|  15.5k|            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
  483|  15.5k|            parsum =
  484|  15.5k|                _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
  485|  15.5k|            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
  486|  15.5k|            parsum = _mm256_madd_epi16(v_c67,
  487|  15.5k|                                       _mm256_alignr_epi8(v_refu, v_refl, 12));
  488|  15.5k|            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
  489|       |
  490|  15.5k|            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
  491|  15.5k|                                           reduce_bits_horiz);
  492|  15.5k|          }
  493|  1.03k|        }
  494|  3.36k|      }
  495|       |
  496|       |      // Vertical filter
  497|  38.3k|      for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
  ------------------
  |  |   34|  38.3k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 21.9k, False: 16.4k]
  |  |  ------------------
  ------------------
  |  Branch (497:24): [True: 34.1k, False: 4.26k]
  ------------------
  498|  34.1k|        int sy = sy4 + delta * (k + 4);
  499|  34.1k|        const __m256i *src = tmp + (k + 4);
  500|       |
  501|  34.1k|        __m256i v_coeff01 = _mm256_inserti128_si256(
  502|  34.1k|            v_zeros,
  503|  34.1k|            _mm_loadu_si128(
  504|  34.1k|                (__m128i *)av1_warped_filter[(sy) >> WARPEDDIFF_PREC_BITS]),
  505|  34.1k|            0);
  506|  34.1k|        v_coeff01 = _mm256_inserti128_si256(
  507|  34.1k|            v_coeff01,
  508|  34.1k|            _mm_loadu_si128(
  509|  34.1k|                (__m128i *)
  510|  34.1k|                    av1_warped_filter[(sy + gamma) >> WARPEDDIFF_PREC_BITS]),
  511|  34.1k|            1);
  512|  34.1k|        __m256i v_coeff23 = _mm256_inserti128_si256(
  513|  34.1k|            v_zeros,
  514|  34.1k|            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 2 * gamma) >>
  515|  34.1k|                                                         WARPEDDIFF_PREC_BITS]),
  516|  34.1k|            0);
  517|  34.1k|        v_coeff23 = _mm256_inserti128_si256(
  518|  34.1k|            v_coeff23,
  519|  34.1k|            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 3 * gamma) >>
  520|  34.1k|                                                         WARPEDDIFF_PREC_BITS]),
  521|  34.1k|            1);
  522|  34.1k|        __m256i v_coeff45 = _mm256_inserti128_si256(
  523|  34.1k|            v_zeros,
  524|  34.1k|            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 4 * gamma) >>
  525|  34.1k|                                                         WARPEDDIFF_PREC_BITS]),
  526|  34.1k|            0);
  527|  34.1k|        v_coeff45 = _mm256_inserti128_si256(
  528|  34.1k|            v_coeff45,
  529|  34.1k|            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 5 * gamma) >>
  530|  34.1k|                                                         WARPEDDIFF_PREC_BITS]),
  531|  34.1k|            1);
  532|  34.1k|        __m256i v_coeff67 = _mm256_inserti128_si256(
  533|  34.1k|            v_zeros,
  534|  34.1k|            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 6 * gamma) >>
  535|  34.1k|                                                         WARPEDDIFF_PREC_BITS]),
  536|  34.1k|            0);
  537|  34.1k|        v_coeff67 = _mm256_inserti128_si256(
  538|  34.1k|            v_coeff67,
  539|  34.1k|            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 7 * gamma) >>
  540|  34.1k|                                                         WARPEDDIFF_PREC_BITS]),
  541|  34.1k|            1);
  542|       |
  543|  34.1k|        __m256i v_c0123 = _mm256_unpacklo_epi32(
  544|  34.1k|            v_coeff01,
  545|  34.1k|            v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
  546|  34.1k|        __m256i v_c0123u = _mm256_unpackhi_epi32(
  547|  34.1k|            v_coeff01,
  548|  34.1k|            v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
  549|  34.1k|        __m256i v_c4567 = _mm256_unpacklo_epi32(
  550|  34.1k|            v_coeff45,
  551|  34.1k|            v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
  552|  34.1k|        __m256i v_c4567u = _mm256_unpackhi_epi32(
  553|  34.1k|            v_coeff45,
  554|  34.1k|            v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
  555|       |
  556|  34.1k|        __m256i v_c01 = _mm256_unpacklo_epi64(
  557|  34.1k|            v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
  558|  34.1k|        __m256i v_c23 =
  559|  34.1k|            _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
  560|  34.1k|        __m256i v_c45 =
  561|  34.1k|            _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
  562|  34.1k|        __m256i v_c67 =
  563|  34.1k|            _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
  564|       |
  565|  34.1k|        __m256i v_src01l =
  566|  34.1k|            _mm256_unpacklo_epi32(src[0], src[1]);  // T13T03T11T01T12T02T10T00
  567|  34.1k|        __m256i v_src01u =
  568|  34.1k|            _mm256_unpackhi_epi32(src[0], src[1]);  // T17T07T15T05T16T06T14T04
  569|  34.1k|        __m256i v_sum =
  570|  34.1k|            _mm256_madd_epi16(_mm256_packus_epi32(v_src01l, v_src01u),
  571|  34.1k|                              v_c01);  // S7S5S3S1S6S4S2S0
  572|       |
  573|  34.1k|        __m256i v_src23l = _mm256_unpacklo_epi32(src[2], src[3]);
  574|  34.1k|        __m256i v_src23u = _mm256_unpackhi_epi32(src[2], src[3]);
  575|  34.1k|        v_sum = _mm256_add_epi32(
  576|  34.1k|            v_sum,
  577|  34.1k|            _mm256_madd_epi16(_mm256_packus_epi32(v_src23l, v_src23u), v_c23));
  578|       |
  579|  34.1k|        __m256i v_src45l = _mm256_unpacklo_epi32(src[4], src[5]);
  580|  34.1k|        __m256i v_src45u = _mm256_unpackhi_epi32(src[4], src[5]);
  581|  34.1k|        v_sum = _mm256_add_epi32(
  582|  34.1k|            v_sum,
  583|  34.1k|            _mm256_madd_epi16(_mm256_packus_epi32(v_src45l, v_src45u), v_c45));
  584|       |
  585|  34.1k|        __m256i v_src67l = _mm256_unpacklo_epi32(src[6], src[7]);
  586|  34.1k|        __m256i v_src67u = _mm256_unpackhi_epi32(src[6], src[7]);
  587|  34.1k|        v_sum = _mm256_add_epi32(
  588|  34.1k|            v_sum,
  589|  34.1k|            _mm256_madd_epi16(_mm256_packus_epi32(v_src67l, v_src67u), v_c67));
  590|       |
  591|       |        // unpack S7S5S3S1S6S4S2S0 to S7S6S5S4S3S2S1S0
  592|       |
  593|  34.1k|        __m256i v_suml =
  594|  34.1k|            _mm256_permute4x64_epi64(v_sum, 0xD8);  // S7S5S6S4S3S1S2S0
  595|  34.1k|        __m256i v_sumh =
  596|  34.1k|            _mm256_permute4x64_epi64(v_sum, 0x32);      // S2S0S7S5S2S0S3S1
  597|  34.1k|        v_sum = _mm256_unpacklo_epi32(v_suml, v_sumh);  // S7S6S5S4S3S2S1S0
  598|       |
  599|  34.1k|        if (conv_params->is_compound) {
  ------------------
  |  Branch (599:13): [True: 3.18k, False: 30.9k]
  ------------------
  600|  3.18k|          __m128i *const p =
  601|  3.18k|              (__m128i *)&conv_params
  602|  3.18k|                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
  603|       |
  604|  3.18k|          v_sum = _mm256_add_epi32(v_sum, res_add_const);
  605|  3.18k|          v_sum =
  606|  3.18k|              _mm256_sra_epi32(_mm256_add_epi32(v_sum, reduce_bits_vert_const),
  607|  3.18k|                               reduce_bits_vert_shift);
  608|  3.18k|          if (conv_params->do_average) {
  ------------------
  |  Branch (608:15): [True: 1.15k, False: 2.03k]
  ------------------
  609|  1.15k|            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
  610|  1.15k|            __m256i p_32 = _mm256_cvtepu16_epi32(_mm_loadu_si128(p));
  611|       |
  612|  1.15k|            if (conv_params->use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (612:17): [True: 512, False: 640]
  ------------------
  613|    512|              v_sum = _mm256_add_epi32(_mm256_mullo_epi32(p_32, wt0),
  614|    512|                                       _mm256_mullo_epi32(v_sum, wt1));
  615|    512|              v_sum = _mm256_srai_epi32(v_sum, DIST_PRECISION_BITS);
  ------------------
  |  |   76|    512|#define DIST_PRECISION_BITS 4
  ------------------
  616|    640|            } else {
  617|    640|              v_sum = _mm256_srai_epi32(_mm256_add_epi32(p_32, v_sum), 1);
  618|    640|            }
  619|       |
  620|  1.15k|            __m256i v_sum1 = _mm256_add_epi32(v_sum, res_sub_const);
  621|  1.15k|            v_sum1 = _mm256_sra_epi32(
  622|  1.15k|                _mm256_add_epi32(v_sum1, round_bits_const), round_bits_shift);
  623|       |
  624|  1.15k|            __m256i v_sum16 = _mm256_packus_epi32(v_sum1, v_sum1);
  625|  1.15k|            v_sum16 = _mm256_permute4x64_epi64(v_sum16, 0xD8);
  626|  1.15k|            v_sum16 = _mm256_min_epi16(v_sum16, clip_pixel);
  627|  1.15k|            _mm_storeu_si128(dst16, _mm256_extracti128_si256(v_sum16, 0));
  628|  2.03k|          } else {
  629|  2.03k|            v_sum = _mm256_packus_epi32(v_sum, v_sum);
  630|  2.03k|            __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum, 0xD8);
  631|  2.03k|            _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
  632|  2.03k|          }
  633|  30.9k|        } else {
  634|       |          // Round and pack into 8 bits
  635|  30.9k|          const __m256i round_const =
  636|  30.9k|              _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
  637|  30.9k|                                ((1 << reduce_bits_vert) >> 1));
  638|       |
  639|  30.9k|          __m256i v_sum1 = _mm256_srai_epi32(
  640|  30.9k|              _mm256_add_epi32(v_sum, round_const), reduce_bits_vert);
  641|       |
  642|  30.9k|          v_sum1 = _mm256_packus_epi32(v_sum1, v_sum1);
  643|  30.9k|          __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum1, 0xD8);
  644|       |          // Clamp res_16bit to the range [0, 2^bd - 1]
  645|  30.9k|          const __m256i max_val = _mm256_set1_epi16((1 << bd) - 1);
  646|  30.9k|          const __m256i zero = _mm256_setzero_si256();
  647|  30.9k|          v_sum16 = _mm256_max_epi16(_mm256_min_epi16(v_sum16, max_val), zero);
  648|       |
  649|  30.9k|          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
  650|       |
  651|       |          _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
  652|  30.9k|        }
  653|  34.1k|      }
  654|  4.26k|    }
  655|  1.73k|  }
  656|    956|}

av1_highbd_wiener_convolve_add_src_avx2:
   32|  1.42k|    const WienerConvolveParams *conv_params, int bd) {
   33|  1.42k|  assert(x_step_q4 == 16 && y_step_q4 == 16);
   34|  1.42k|  assert(!(w & 7));
   35|  1.42k|  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
   36|  1.42k|  (void)x_step_q4;
   37|  1.42k|  (void)y_step_q4;
   38|       |
   39|  1.42k|  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  ------------------
  |  |   75|  1.42k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   40|  1.42k|  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
  ------------------
  |  |   75|  1.42k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   41|       |
   42|  1.42k|  DECLARE_ALIGNED(32, uint16_t,
  ------------------
  |  |   19|  1.42k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
   43|  1.42k|                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
   44|  1.42k|  int intermediate_height = h + SUBPEL_TAPS - 1;
  ------------------
  |  |   26|  1.42k|#define SUBPEL_TAPS 8
  ------------------
   45|  1.42k|  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
  ------------------
  |  |   26|  1.42k|#define SUBPEL_TAPS 8
  ------------------
   46|  1.42k|  const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
   47|       |
   48|  1.42k|  const __m128i zero_128 = _mm_setzero_si128();
   49|  1.42k|  const __m256i zero_256 = _mm256_setzero_si256();
   50|       |
   51|       |  // Add an offset to account for the "add_src" part of the convolve function.
   52|  1.42k|  const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
   53|       |
   54|  1.42k|  const __m256i clamp_low = zero_256;
   55|       |
   56|       |  /* Horizontal filter */
   57|  1.42k|  {
   58|  1.42k|    const __m256i clamp_high_ep =
   59|  1.42k|        _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
  ------------------
  |  |   43|  1.42k|#define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - r0))
  |  |  ------------------
  |  |  |  |   21|  1.42k|#define FILTER_BITS 7
  |  |  ------------------
  ------------------
   60|       |
   61|       |    // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
   62|  1.42k|    const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
   63|       |
   64|       |    // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
   65|  1.42k|    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
   66|       |    // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
   67|  1.42k|    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
   68|       |
   69|       |    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
   70|  1.42k|    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
   71|       |    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
   72|  1.42k|    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
   73|       |    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
   74|  1.42k|    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
   75|       |    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
   76|  1.42k|    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
   77|       |
   78|       |    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
   79|  1.42k|    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
   80|       |    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
   81|  1.42k|    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
   82|       |    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
   83|  1.42k|    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
   84|       |    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
   85|  1.42k|    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
   86|       |
   87|  1.42k|    const __m256i round_const = _mm256_set1_epi32(
   88|  1.42k|        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
  ------------------
  |  |   21|  1.42k|#define FILTER_BITS 7
  ------------------
   89|       |
   90|  66.1k|    for (int i = 0; i < intermediate_height; ++i) {
  ------------------
  |  Branch (90:21): [True: 64.7k, False: 1.42k]
  ------------------
   91|   203k|      for (int j = 0; j < w; j += 16) {
  ------------------
  |  Branch (91:23): [True: 138k, False: 64.7k]
  ------------------
   92|   138k|        const uint16_t *src_ij = src_ptr + i * src_stride + j;
   93|       |
   94|       |        // Load 16-bit src data
   95|   138k|        const __m256i src_0 = yy_loadu_256(src_ij + 0);
   96|   138k|        const __m256i src_1 = yy_loadu_256(src_ij + 1);
   97|   138k|        const __m256i src_2 = yy_loadu_256(src_ij + 2);
   98|   138k|        const __m256i src_3 = yy_loadu_256(src_ij + 3);
   99|   138k|        const __m256i src_4 = yy_loadu_256(src_ij + 4);
  100|   138k|        const __m256i src_5 = yy_loadu_256(src_ij + 5);
  101|   138k|        const __m256i src_6 = yy_loadu_256(src_ij + 6);
  102|   138k|        const __m256i src_7 = yy_loadu_256(src_ij + 7);
  103|       |
  104|       |        // Multiply src data by filter coeffs and sum pairs
  105|   138k|        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
  106|   138k|        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
  107|   138k|        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
  108|   138k|        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
  109|   138k|        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
  110|   138k|        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
  111|   138k|        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
  112|   138k|        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
  113|       |
  114|       |        // Calculate scalar product for even- and odd-indices separately,
  115|       |        // increasing to 32-bit precision
  116|   138k|        const __m256i res_even_sum = _mm256_add_epi32(
  117|   138k|            _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
  118|   138k|        const __m256i res_even = _mm256_srai_epi32(
  119|   138k|            _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
  120|       |
  121|   138k|        const __m256i res_odd_sum = _mm256_add_epi32(
  122|   138k|            _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
  123|   138k|        const __m256i res_odd = _mm256_srai_epi32(
  124|   138k|            _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
  125|       |
  126|       |        // Reduce to 16-bit precision and pack even- and odd-index results
  127|       |        // back into one register. The _mm256_packs_epi32 intrinsic returns
  128|       |        // a register with the pixels ordered as follows:
  129|       |        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
  130|   138k|        const __m256i res = _mm256_packs_epi32(res_even, res_odd);
  131|   138k|        const __m256i res_clamped =
  132|   138k|            _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep);
  133|       |
  134|       |        // Store in a temporary array
  135|   138k|        yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
  ------------------
  |  |   32|   138k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   138k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  136|   138k|      }
  137|  64.7k|    }
  138|  1.42k|  }
  139|       |
  140|       |  /* Vertical filter */
  141|  1.42k|  {
  142|  1.42k|    const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1);
  143|       |
  144|       |    // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
  145|  1.42k|    const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
  146|       |
  147|       |    // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
  148|  1.42k|    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
  149|       |    // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
  150|  1.42k|    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
  151|       |
  152|       |    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
  153|  1.42k|    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
  154|       |    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
  155|  1.42k|    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
  156|       |    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
  157|  1.42k|    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
  158|       |    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
  159|  1.42k|    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
  160|       |
  161|       |    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
  162|  1.42k|    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
  163|       |    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
  164|  1.42k|    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
  165|       |    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
  166|  1.42k|    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
  167|       |    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
  168|  1.42k|    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
  169|       |
  170|  1.42k|    const __m256i round_const =
  171|  1.42k|        _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
  172|  1.42k|                          (1 << (bd + conv_params->round_1 - 1)));
  173|       |
  174|  72.9k|    for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (174:21): [True: 71.5k, False: 1.42k]
  ------------------
  175|   216k|      for (int j = 0; j < w; j += 16) {
  ------------------
  |  Branch (175:23): [True: 144k, False: 71.5k]
  ------------------
  176|   144k|        const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j;
  ------------------
  |  |   32|   144k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   144k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  177|       |
  178|       |        // Load 16-bit data from the output of the horizontal filter in
  179|       |        // which the pixels are ordered as follows:
  180|       |        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
  181|   144k|        const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE);
  ------------------
  |  |   32|   144k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   144k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  182|   144k|        const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE);
  ------------------
  |  |   32|   144k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   144k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  183|   144k|        const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE);
  ------------------
  |  |   32|   144k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   144k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  184|   144k|        const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE);
  ------------------
  |  |   32|   144k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   144k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  185|   144k|        const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE);
  ------------------
  |  |   32|   144k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   144k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  186|   144k|        const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE);
  ------------------
  |  |   32|   144k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   144k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  187|   144k|        const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE);
  ------------------
  |  |   32|   144k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   144k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  188|   144k|        const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE);
  ------------------
  |  |   32|   144k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   144k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  189|       |
  190|       |        // Filter the even-indices, increasing to 32-bit precision
  191|   144k|        const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
  192|   144k|        const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
  193|   144k|        const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
  194|   144k|        const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
  195|       |
  196|   144k|        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
  197|   144k|        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
  198|   144k|        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
  199|   144k|        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
  200|       |
  201|   144k|        const __m256i res_even = _mm256_add_epi32(
  202|   144k|            _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
  203|       |
  204|       |        // Filter the odd-indices, increasing to 32-bit precision
  205|   144k|        const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
  206|   144k|        const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
  207|   144k|        const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
  208|   144k|        const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
  209|       |
  210|   144k|        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
  211|   144k|        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
  212|   144k|        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
  213|   144k|        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
  214|       |
  215|   144k|        const __m256i res_odd = _mm256_add_epi32(
  216|   144k|            _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
  217|       |
  218|       |        // Pixels are currently in the following order:
  219|       |        // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
  220|       |        // res_odd order:  [ 15 13 11 9 ] [ 7 5 3 1 ]
  221|       |        //
  222|       |        // Rearrange the pixels into the following order:
  223|       |        // res_lo order: [ 11 10  9  8 ] [ 3 2 1 0 ]
  224|       |        // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
  225|   144k|        const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
  226|   144k|        const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
  227|       |
  228|   144k|        const __m256i res_lo_round = _mm256_srai_epi32(
  229|   144k|            _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
  230|   144k|        const __m256i res_hi_round = _mm256_srai_epi32(
  231|   144k|            _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
  232|       |
  233|       |        // Reduce to 16-bit precision and pack into the correct order:
  234|       |        // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
  235|   144k|        const __m256i res_16bit =
  236|   144k|            _mm256_packs_epi32(res_lo_round, res_hi_round);
  237|   144k|        const __m256i res_16bit_clamped = _mm256_min_epi16(
  238|   144k|            _mm256_max_epi16(res_16bit, clamp_low), clamp_high);
  239|       |
  240|       |        // Store in the dst array
  241|   144k|        yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped);
  242|   144k|      }
  243|  71.5k|    }
  244|  1.42k|  }
  245|  1.42k|}

av1_filter_intra_edge_sse4_1:
   18|   659k|void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
   19|   659k|  if (!strength) return;
  ------------------
  |  Branch (19:7): [True: 301k, False: 358k]
  ------------------
   20|       |
   21|   358k|  DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
  ------------------
  |  |   19|   358k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
   22|   358k|    { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 },  // strength 1: 4,8,4
   23|   358k|    { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 },  // strength 2: 5,6,5
   24|   358k|    { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 }  // strength 3: 2,4,4,4,2
   25|   358k|  };
   26|       |
   27|   358k|  DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
  ------------------
  |  |   19|   358k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
   28|   358k|    { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
   29|   358k|    { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },
   30|   358k|    { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },
   31|   358k|    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
   32|   358k|  };
   33|       |
   34|       |  // Extend the first and last samples to simplify the loop for the 5-tap case
   35|   358k|  p[-1] = p[0];
   36|   358k|  __m128i last = _mm_set1_epi8((char)p[sz - 1]);
   37|   358k|  _mm_storeu_si128((__m128i *)&p[sz], last);
   38|       |
   39|       |  // Adjust input pointer for filter support area
   40|   358k|  uint8_t *in = (strength == 3) ? p - 1 : p;
  ------------------
  |  Branch (40:17): [True: 159k, False: 198k]
  ------------------
   41|       |
   42|       |  // Avoid modifying first sample
   43|   358k|  uint8_t *out = p + 1;
   44|   358k|  int len = sz - 1;
   45|       |
   46|   358k|  const int use_3tap_filter = (strength < 3);
   47|       |
   48|   358k|  if (use_3tap_filter) {
  ------------------
  |  Branch (48:7): [True: 198k, False: 159k]
  ------------------
   49|   198k|    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
   50|   198k|    __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
   51|   198k|    __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
   52|   198k|    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
   53|   198k|    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
   54|   501k|    while (len > 0) {
  ------------------
  |  Branch (54:12): [True: 302k, False: 198k]
  ------------------
   55|   302k|      int n_out = (len < 8) ? len : 8;
  ------------------
  |  Branch (55:19): [True: 86.3k, False: 216k]
  ------------------
   56|   302k|      __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
   57|   302k|      __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
   58|   302k|      d0 = _mm_maddubs_epi16(d0, coef0);
   59|   302k|      d1 = _mm_maddubs_epi16(d1, coef0);
   60|   302k|      d0 = _mm_hadd_epi16(d0, d1);
   61|   302k|      __m128i eight = _mm_set1_epi16(8);
   62|   302k|      d0 = _mm_add_epi16(d0, eight);
   63|   302k|      d0 = _mm_srai_epi16(d0, 4);
   64|   302k|      d0 = _mm_packus_epi16(d0, d0);
   65|   302k|      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
   66|   302k|      __m128i n0 = _mm_set1_epi8(n_out);
   67|   302k|      __m128i mask = _mm_cmpgt_epi8(n0, iden);
   68|   302k|      out0 = _mm_blendv_epi8(out0, d0, mask);
   69|   302k|      _mm_storel_epi64((__m128i *)out, out0);
   70|   302k|      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
   71|   302k|      in0 = _mm_alignr_epi8(in1, in0, 8);
   72|   302k|      in += 8;
   73|   302k|      out += 8;
   74|   302k|      len -= n_out;
   75|   302k|    }
   76|   198k|  } else {  // 5-tap filter
   77|   159k|    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
   78|   159k|    __m128i two = _mm_set1_epi8(2);
   79|   159k|    __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);
   80|   159k|    __m128i shuf_b = _mm_add_epi8(shuf_a, two);
   81|   159k|    __m128i shuf_c = _mm_add_epi8(shuf_b, two);
   82|   159k|    __m128i shuf_d = _mm_add_epi8(shuf_c, two);
   83|   159k|    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
   84|   159k|    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
   85|   645k|    while (len > 0) {
  ------------------
  |  Branch (85:12): [True: 486k, False: 159k]
  ------------------
   86|   486k|      int n_out = (len < 8) ? len : 8;
  ------------------
  |  Branch (86:19): [True: 18.4k, False: 467k]
  ------------------
   87|   486k|      __m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
   88|   486k|      __m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
   89|   486k|      __m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
   90|   486k|      __m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
   91|   486k|      d0 = _mm_maddubs_epi16(d0, coef0);
   92|   486k|      d1 = _mm_maddubs_epi16(d1, coef0);
   93|   486k|      d2 = _mm_maddubs_epi16(d2, coef0);
   94|   486k|      d3 = _mm_maddubs_epi16(d3, coef0);
   95|   486k|      d0 = _mm_hadd_epi16(d0, d1);
   96|   486k|      d2 = _mm_hadd_epi16(d2, d3);
   97|   486k|      d0 = _mm_hadd_epi16(d0, d2);
   98|   486k|      __m128i eight = _mm_set1_epi16(8);
   99|   486k|      d0 = _mm_add_epi16(d0, eight);
  100|   486k|      d0 = _mm_srai_epi16(d0, 4);
  101|   486k|      d0 = _mm_packus_epi16(d0, d0);
  102|   486k|      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
  103|   486k|      __m128i n0 = _mm_set1_epi8(n_out);
  104|   486k|      __m128i mask = _mm_cmpgt_epi8(n0, iden);
  105|   486k|      out0 = _mm_blendv_epi8(out0, d0, mask);
  106|   486k|      _mm_storel_epi64((__m128i *)out, out0);
  107|   486k|      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
  108|       |      in0 = _mm_alignr_epi8(in1, in0, 8);
  109|   486k|      in += 8;
  110|   486k|      out += 8;
  111|   486k|      len -= n_out;
  112|   486k|    }
  113|   159k|  }
  114|   358k|}
av1_upsample_intra_edge_sse4_1:
  116|   242k|void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
  117|       |  // interpolate half-sample positions
  118|   242k|  assert(sz <= 24);
  119|       |
  120|   242k|  DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
  ------------------
  |  |   19|   242k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  121|   242k|    { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
  122|   242k|  };
  123|       |
  124|   242k|  DECLARE_ALIGNED(
  ------------------
  |  |   19|   242k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  125|   242k|      16, static const int8_t,
  126|   242k|      v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
  127|   242k|                          { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
  128|       |
  129|       |  // Extend first/last samples (upper-left p[-1], last p[sz-1])
  130|       |  // to support 4-tap filter
  131|   242k|  p[-2] = p[-1];
  132|   242k|  p[sz] = p[sz - 1];
  133|       |
  134|   242k|  uint8_t *in = &p[-2];
  135|   242k|  uint8_t *out = &p[-2];
  136|       |
  137|   242k|  int n = sz + 1;  // Input length including upper-left sample
  138|       |
  139|   242k|  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
  140|   242k|  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
  141|       |
  142|   242k|  __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
  143|   242k|  __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
  144|   242k|  __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
  145|       |
  146|   517k|  while (n > 0) {
  ------------------
  |  Branch (146:10): [True: 275k, False: 242k]
  ------------------
  147|   275k|    __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
  148|   275k|    __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
  149|   275k|    __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
  150|   275k|    __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
  151|   275k|    __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
  152|   275k|    d0 = _mm_maddubs_epi16(d0, coef0);
  153|   275k|    d1 = _mm_maddubs_epi16(d1, coef0);
  154|   275k|    d2 = _mm_maddubs_epi16(d2, coef0);
  155|   275k|    d3 = _mm_maddubs_epi16(d3, coef0);
  156|   275k|    d0 = _mm_hadd_epi16(d0, d1);
  157|   275k|    d2 = _mm_hadd_epi16(d2, d3);
  158|   275k|    __m128i eight = _mm_set1_epi16(8);
  159|   275k|    d0 = _mm_add_epi16(d0, eight);
  160|   275k|    d2 = _mm_add_epi16(d2, eight);
  161|   275k|    d0 = _mm_srai_epi16(d0, 4);
  162|   275k|    d2 = _mm_srai_epi16(d2, 4);
  163|   275k|    d0 = _mm_packus_epi16(d0, d2);
  164|       |    __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
  165|   275k|    __m128i out0 = _mm_unpacklo_epi8(in1, d0);
  166|   275k|    __m128i out1 = _mm_unpackhi_epi8(in1, d0);
  167|   275k|    _mm_storeu_si128((__m128i *)&out[0], out0);
  168|   275k|    _mm_storeu_si128((__m128i *)&out[16], out1);
  169|   275k|    in0 = in16;
  170|   275k|    in16 = _mm_setzero_si128();
  171|   275k|    out += 32;
  172|   275k|    n -= 16;
  173|   275k|  }
  174|   242k|}
av1_highbd_filter_intra_edge_sse4_1:
  178|   487k|void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) {
  179|   487k|  if (!strength) return;
  ------------------
  |  Branch (179:7): [True: 221k, False: 266k]
  ------------------
  180|       |
  181|   266k|  DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
  ------------------
  |  |   19|   266k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  182|   266k|    { 4, 8, 4, 8, 4, 8, 4, 8 },  // strength 1: 4,8,4
  183|   266k|    { 5, 6, 5, 6, 5, 6, 5, 6 },  // strength 2: 5,6,5
  184|   266k|    { 2, 4, 2, 4, 2, 4, 2, 4 }   // strength 3: 2,4,4,4,2
  185|   266k|  };
  186|       |
  187|   266k|  DECLARE_ALIGNED(16, static const int16_t,
  ------------------
  |  |   19|   266k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  188|   266k|                  v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
  189|       |
  190|       |  // Extend the first and last samples to simplify the loop for the 5-tap case
  191|   266k|  p[-1] = p[0];
  192|   266k|  __m128i last = _mm_set1_epi16(p[sz - 1]);
  193|   266k|  _mm_storeu_si128((__m128i *)&p[sz], last);
  194|       |
  195|       |  // Adjust input pointer for filter support area
  196|   266k|  uint16_t *in = (strength == 3) ? p - 1 : p;
  ------------------
  |  Branch (196:18): [True: 115k, False: 151k]
  ------------------
  197|       |
  198|       |  // Avoid modifying first sample
  199|   266k|  uint16_t *out = p + 1;
  200|   266k|  int len = sz - 1;
  201|       |
  202|   266k|  const int use_3tap_filter = (strength < 3);
  203|       |
  204|   266k|  if (use_3tap_filter) {
  ------------------
  |  Branch (204:7): [True: 151k, False: 115k]
  ------------------
  205|   151k|    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
  206|   151k|    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
  207|   151k|    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
  208|   151k|    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
  209|   376k|    while (len > 0) {
  ------------------
  |  Branch (209:12): [True: 224k, False: 151k]
  ------------------
  210|   224k|      int n_out = (len < 8) ? len : 8;
  ------------------
  |  Branch (210:19): [True: 59.5k, False: 165k]
  ------------------
  211|   224k|      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
  212|   224k|      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
  213|   224k|      __m128i in02 = _mm_add_epi16(in0, in2);
  214|   224k|      __m128i d0 = _mm_unpacklo_epi16(in02, in1);
  215|   224k|      __m128i d1 = _mm_unpackhi_epi16(in02, in1);
  216|   224k|      d0 = _mm_mullo_epi16(d0, coef0);
  217|   224k|      d1 = _mm_mullo_epi16(d1, coef0);
  218|   224k|      d0 = _mm_hadd_epi16(d0, d1);
  219|   224k|      __m128i eight = _mm_set1_epi16(8);
  220|   224k|      d0 = _mm_add_epi16(d0, eight);
  221|   224k|      d0 = _mm_srli_epi16(d0, 4);
  222|   224k|      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
  223|   224k|      __m128i n0 = _mm_set1_epi16(n_out);
  224|   224k|      __m128i mask = _mm_cmpgt_epi16(n0, iden);
  225|   224k|      out0 = _mm_blendv_epi8(out0, d0, mask);
  226|   224k|      _mm_storeu_si128((__m128i *)out, out0);
  227|   224k|      in += 8;
  228|   224k|      in0 = in8;
  229|   224k|      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
  230|   224k|      out += 8;
  231|   224k|      len -= n_out;
  232|   224k|    }
  233|   151k|  } else {  // 5-tap filter
  234|   115k|    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
  235|   115k|    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
  236|   115k|    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
  237|   115k|    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
  238|   457k|    while (len > 0) {
  ------------------
  |  Branch (238:12): [True: 342k, False: 115k]
  ------------------
  239|   342k|      int n_out = (len < 8) ? len : 8;
  ------------------
  |  Branch (239:19): [True: 13.1k, False: 329k]
  ------------------
  240|   342k|      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
  241|   342k|      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
  242|   342k|      __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
  243|       |      __m128i in4 = _mm_alignr_epi8(in8, in0, 8);
  244|   342k|      __m128i in04 = _mm_add_epi16(in0, in4);
  245|   342k|      __m128i in123 = _mm_add_epi16(in1, in2);
  246|   342k|      in123 = _mm_add_epi16(in123, in3);
  247|   342k|      __m128i d0 = _mm_unpacklo_epi16(in04, in123);
  248|   342k|      __m128i d1 = _mm_unpackhi_epi16(in04, in123);
  249|   342k|      d0 = _mm_mullo_epi16(d0, coef0);
  250|   342k|      d1 = _mm_mullo_epi16(d1, coef0);
  251|   342k|      d0 = _mm_hadd_epi16(d0, d1);
  252|   342k|      __m128i eight = _mm_set1_epi16(8);
  253|   342k|      d0 = _mm_add_epi16(d0, eight);
  254|   342k|      d0 = _mm_srli_epi16(d0, 4);
  255|   342k|      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
  256|   342k|      __m128i n0 = _mm_set1_epi16(n_out);
  257|   342k|      __m128i mask = _mm_cmpgt_epi16(n0, iden);
  258|   342k|      out0 = _mm_blendv_epi8(out0, d0, mask);
  259|   342k|      _mm_storeu_si128((__m128i *)out, out0);
  260|   342k|      in += 8;
  261|   342k|      in0 = in8;
  262|   342k|      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
  263|   342k|      out += 8;
  264|   342k|      len -= n_out;
  265|   342k|    }
  266|   115k|  }
  267|   266k|}
av1_highbd_upsample_intra_edge_sse4_1:
  269|   172k|void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) {
  270|       |  // interpolate half-sample positions
  271|   172k|  assert(sz <= 24);
  272|       |
  273|   172k|  DECLARE_ALIGNED(16, static const int16_t,
  ------------------
  |  |   19|   172k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  274|   172k|                  kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } };
  275|       |
  276|       |  // Extend first/last samples (upper-left p[-1], last p[sz-1])
  277|       |  // to support 4-tap filter
  278|   172k|  p[-2] = p[-1];
  279|   172k|  p[sz] = p[sz - 1];
  280|       |
  281|   172k|  uint16_t *in = &p[-2];
  282|   172k|  uint16_t *out = in;
  283|   172k|  int n = sz + 1;
  284|       |
  285|   172k|  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
  286|   172k|  __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
  287|   172k|  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
  288|   172k|  __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]);
  289|       |
  290|   499k|  while (n > 0) {
  ------------------
  |  Branch (290:10): [True: 327k, False: 172k]
  ------------------
  291|   327k|    __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
  292|   327k|    __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
  293|       |    __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
  294|   327k|    __m128i sum0 = _mm_add_epi16(in0, in3);
  295|   327k|    __m128i sum1 = _mm_add_epi16(in1, in2);
  296|   327k|    __m128i d0 = _mm_unpacklo_epi16(sum0, sum1);
  297|   327k|    __m128i d1 = _mm_unpackhi_epi16(sum0, sum1);
  298|   327k|    __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
  299|   327k|    d0 = _mm_madd_epi16(d0, coef0);
  300|   327k|    d1 = _mm_madd_epi16(d1, coef0);
  301|   327k|    __m128i eight = _mm_set1_epi32(8);
  302|   327k|    d0 = _mm_add_epi32(d0, eight);
  303|   327k|    d1 = _mm_add_epi32(d1, eight);
  304|   327k|    d0 = _mm_srai_epi32(d0, 4);
  305|   327k|    d1 = _mm_srai_epi32(d1, 4);
  306|   327k|    d0 = _mm_packus_epi32(d0, d1);
  307|   327k|    __m128i max0 = _mm_set1_epi16((1 << bd) - 1);
  308|   327k|    d0 = _mm_min_epi16(d0, max0);
  309|   327k|    __m128i out0 = _mm_unpacklo_epi16(in1, d0);
  310|   327k|    __m128i out1 = _mm_unpackhi_epi16(in1, d0);
  311|   327k|    _mm_storeu_si128((__m128i *)&out[0], out0);
  312|   327k|    _mm_storeu_si128((__m128i *)&out[8], out1);
  313|   327k|    in0 = in8;
  314|   327k|    in8 = in16;
  315|   327k|    in16 = in24;
  316|   327k|    in24 = _mm_setzero_si128();
  317|   327k|    out += 16;
  318|   327k|    n -= 8;
  319|   327k|  }
  320|   172k|}

av1_dist_wtd_convolve_x_avx2:
   46|  4.16k|                                  ConvolveParams *conv_params) {
   47|  4.16k|  CONV_BUF_TYPE *dst = conv_params->dst;
   48|  4.16k|  int dst_stride = conv_params->dst_stride;
   49|  4.16k|  const int bd = 8;
   50|  4.16k|  int i, j, is_horiz_4tap = 0;
   51|  4.16k|  const int bits = FILTER_BITS - conv_params->round_1;
  ------------------
  |  |   21|  4.16k|#define FILTER_BITS 7
  ------------------
   52|  4.16k|  const __m256i wt = unpack_weights_avx2(conv_params);
   53|  4.16k|  const int do_average = conv_params->do_average;
   54|  4.16k|  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   55|  4.16k|  const int offset_0 =
   56|  4.16k|      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  4.16k|#define FILTER_BITS 7
  ------------------
   57|  4.16k|  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
   58|  4.16k|  const __m256i offset_const = _mm256_set1_epi16(offset);
   59|  4.16k|  const int rounding_shift =
   60|  4.16k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  4.16k|#define FILTER_BITS 7
  ------------------
   61|  4.16k|  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
   62|       |
   63|  4.16k|  assert(bits >= 0);
   64|  4.16k|  assert(conv_params->round_0 > 0);
   65|       |
   66|  4.16k|  const __m256i round_const =
   67|  4.16k|      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
   68|  4.16k|  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
   69|       |
   70|  4.16k|  __m256i filt[4], coeffs[4];
   71|       |
   72|  4.16k|  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
   73|  4.16k|  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
   74|       |
   75|  4.16k|  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
   76|       |
   77|       |  // Condition for checking valid horz_filt taps
   78|  4.16k|  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
  ------------------
  |  Branch (78:7): [True: 1.35k, False: 2.81k]
  ------------------
   79|  1.35k|    is_horiz_4tap = 1;
   80|       |
   81|       |  // horz_filt as 4 tap
   82|  4.16k|  if (is_horiz_4tap) {
  ------------------
  |  Branch (82:7): [True: 1.35k, False: 2.81k]
  ------------------
   83|  1.35k|    const int fo_horiz = 1;
   84|  1.35k|    const uint8_t *const src_ptr = src - fo_horiz;
   85|  7.07k|    for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (85:17): [True: 5.71k, False: 1.35k]
  ------------------
   86|  5.71k|      const uint8_t *src_data = src_ptr + i * src_stride;
   87|  5.71k|      CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
   88|  15.0k|      for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (88:19): [True: 9.35k, False: 5.71k]
  ------------------
   89|  9.35k|        const __m256i data =
   90|  9.35k|            load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
   91|       |
   92|  9.35k|        __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
   93|  9.35k|        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
   94|  9.35k|        res = _mm256_slli_epi16(res, bits);
   95|       |
   96|  9.35k|        const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
   97|       |
   98|       |        // Accumulate values into the destination buffer
   99|  9.35k|        if (do_average) {
  ------------------
  |  Branch (99:13): [True: 3.34k, False: 6.00k]
  ------------------
  100|  3.34k|          const __m256i data_ref_0 =
  101|  3.34k|              load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
  102|  3.34k|          const __m256i comp_avg_res =
  103|  3.34k|              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
  104|       |
  105|  3.34k|          const __m256i round_result = convolve_rounding(
  106|  3.34k|              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  107|       |
  108|  3.34k|          const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
  109|  3.34k|          const __m128i res_0 = _mm256_castsi256_si128(res_8);
  110|  3.34k|          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
  111|       |
  112|  3.34k|          if (w > 4) {
  ------------------
  |  Branch (112:15): [True: 2.10k, False: 1.24k]
  ------------------
  113|  2.10k|            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  114|  2.10k|            _mm_storel_epi64(
  115|  2.10k|                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
  116|  2.10k|          } else {
  117|  1.24k|            *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
  118|  1.24k|            *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
  119|  1.24k|                _mm_cvtsi128_si32(res_1);
  120|  1.24k|          }
  121|  6.00k|        } else {
  122|  6.00k|          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
  123|  6.00k|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  124|       |
  125|  6.00k|          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
  126|  6.00k|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  127|  6.00k|                          res_1);
  128|  6.00k|        }
  129|  9.35k|      }
  130|  5.71k|    }
  131|  2.81k|  } else {
  132|  2.81k|    const int fo_horiz = filter_params_x->taps / 2 - 1;
  133|  2.81k|    const uint8_t *const src_ptr = src - fo_horiz;
  134|       |
  135|  2.81k|    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
  136|  2.81k|    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
  137|  22.0k|    for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (137:17): [True: 19.2k, False: 2.81k]
  ------------------
  138|  19.2k|      const uint8_t *src_data = src_ptr + i * src_stride;
  139|  19.2k|      CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
  140|  67.6k|      for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (140:19): [True: 48.4k, False: 19.2k]
  ------------------
  141|  48.4k|        const __m256i data =
  142|  48.4k|            load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
  143|       |
  144|  48.4k|        __m256i res = convolve_lowbd_x(data, coeffs, filt);
  145|       |
  146|  48.4k|        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
  147|       |
  148|  48.4k|        res = _mm256_slli_epi16(res, bits);
  149|       |
  150|  48.4k|        const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
  151|       |
  152|       |        // Accumulate values into the destination buffer
  153|  48.4k|        if (do_average) {
  ------------------
  |  Branch (153:13): [True: 23.5k, False: 24.8k]
  ------------------
  154|  23.5k|          const __m256i data_ref_0 =
  155|  23.5k|              load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
  156|  23.5k|          const __m256i comp_avg_res =
  157|  23.5k|              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
  158|       |
  159|  23.5k|          const __m256i round_result = convolve_rounding(
  160|  23.5k|              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  161|       |
  162|  23.5k|          const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
  163|  23.5k|          const __m128i res_0 = _mm256_castsi256_si128(res_8);
  164|  23.5k|          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
  165|       |
  166|  23.5k|          if (w > 4) {
  ------------------
  |  Branch (166:15): [True: 23.5k, False: 0]
  ------------------
  167|  23.5k|            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  168|  23.5k|            _mm_storel_epi64(
  169|  23.5k|                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
  170|  23.5k|          } else {
  171|      0|            *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
  172|      0|            *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
  173|      0|                _mm_cvtsi128_si32(res_1);
  174|      0|          }
  175|  24.8k|        } else {
  176|  24.8k|          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
  177|  24.8k|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  178|       |
  179|       |          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
  180|  24.8k|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  181|  24.8k|                          res_1);
  182|  24.8k|        }
  183|  48.4k|      }
  184|  19.2k|    }
  185|  2.81k|  }
  186|  4.16k|}
av1_dist_wtd_convolve_y_avx2:
  192|  2.95k|                                  ConvolveParams *conv_params) {
  193|  2.95k|  CONV_BUF_TYPE *dst = conv_params->dst;
  194|  2.95k|  int dst_stride = conv_params->dst_stride;
  195|  2.95k|  const int bd = 8;
  196|  2.95k|  int i, j, is_vert_4tap = 0;
  197|       |  // +1 to compensate for dividing the filter coeffs by 2
  198|  2.95k|  const int left_shift = FILTER_BITS - conv_params->round_0 + 1;
  ------------------
  |  |   21|  2.95k|#define FILTER_BITS 7
  ------------------
  199|  2.95k|  const __m256i round_const =
  200|  2.95k|      _mm256_set1_epi32((1 << conv_params->round_1) >> 1);
  201|  2.95k|  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
  202|  2.95k|  const __m256i wt = unpack_weights_avx2(conv_params);
  203|  2.95k|  const int do_average = conv_params->do_average;
  204|  2.95k|  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  205|  2.95k|  const int offset_0 =
  206|  2.95k|      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  2.95k|#define FILTER_BITS 7
  ------------------
  207|  2.95k|  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  208|  2.95k|  const __m256i offset_const = _mm256_set1_epi16(offset);
  209|  2.95k|  const int offset_1 = (1 << (bd + FILTER_BITS - 2));
  ------------------
  |  |   21|  2.95k|#define FILTER_BITS 7
  ------------------
  210|  2.95k|  const __m256i offset_const_1 = _mm256_set1_epi16(offset_1);
  211|  2.95k|  const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0));
  212|  2.95k|  const int rounding_shift =
  213|  2.95k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  2.95k|#define FILTER_BITS 7
  ------------------
  214|  2.95k|  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
  215|  2.95k|  const __m256i zero = _mm256_setzero_si256();
  216|  2.95k|  __m256i coeffs[4], s[8];
  217|       |
  218|  2.95k|  assert((FILTER_BITS - conv_params->round_0) >= 0);
  219|       |
  220|  2.95k|  prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
  221|       |
  222|       |  // Condition for checking valid vert_filt taps
  223|  2.95k|  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
  ------------------
  |  Branch (223:7): [True: 1.06k, False: 1.88k]
  ------------------
  224|  1.06k|    is_vert_4tap = 1;
  225|       |
  226|  2.95k|  if (is_vert_4tap) {
  ------------------
  |  Branch (226:7): [True: 1.06k, False: 1.88k]
  ------------------
  227|  1.06k|    const int fo_vert = 1;
  228|  1.06k|    const uint8_t *const src_ptr = src - fo_vert * src_stride;
  229|  2.17k|    for (j = 0; j < w; j += 16) {
  ------------------
  |  Branch (229:17): [True: 1.11k, False: 1.06k]
  ------------------
  230|  1.11k|      const uint8_t *data = &src_ptr[j];
  231|  1.11k|      __m256i src4;
  232|       |      // Load lines a and b. Line a to lower 128, line b to upper 128
  233|  1.11k|      {
  234|  1.11k|        __m256i src_ab[4];
  235|  1.11k|        __m256i src_a[5];
  236|  1.11k|        src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
  237|  5.56k|        for (int kk = 0; kk < 4; ++kk) {
  ------------------
  |  Branch (237:26): [True: 4.45k, False: 1.11k]
  ------------------
  238|  4.45k|          data += src_stride;
  239|  4.45k|          src_a[kk + 1] =
  240|  4.45k|              _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
  241|  4.45k|          src_ab[kk] =
  242|  4.45k|              _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
  243|  4.45k|        }
  244|  1.11k|        src4 = src_a[4];
  245|  1.11k|        s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
  246|  1.11k|        s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
  247|       |
  248|  1.11k|        s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
  249|  1.11k|        s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
  250|  1.11k|      }
  251|       |
  252|  4.99k|      for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (252:19): [True: 3.88k, False: 1.11k]
  ------------------
  253|  3.88k|        data = &src_ptr[(i + 5) * src_stride + j];
  254|  3.88k|        const __m256i src5 =
  255|  3.88k|            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
  256|  3.88k|        const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20);
  257|       |
  258|  3.88k|        src4 = _mm256_castsi128_si256(
  259|  3.88k|            _mm_loadu_si128((__m128i *)(data + src_stride)));
  260|  3.88k|        const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20);
  261|       |
  262|  3.88k|        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
  263|  3.88k|        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
  264|       |
  265|  3.88k|        __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
  266|       |
  267|  3.88k|        res_lo = _mm256_add_epi16(res_lo, offset_const_1);
  268|       |
  269|  3.88k|        const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
  270|  3.88k|        const __m256i res_lo_0_shift =
  271|  3.88k|            _mm256_slli_epi32(res_lo_0_32b, left_shift);
  272|  3.88k|        const __m256i res_lo_0_round = _mm256_sra_epi32(
  273|  3.88k|            _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
  274|       |
  275|  3.88k|        const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
  276|  3.88k|        const __m256i res_lo_1_shift =
  277|  3.88k|            _mm256_slli_epi32(res_lo_1_32b, left_shift);
  278|  3.88k|        const __m256i res_lo_1_round = _mm256_sra_epi32(
  279|  3.88k|            _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
  280|       |
  281|  3.88k|        const __m256i res_lo_round =
  282|  3.88k|            _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
  283|       |
  284|  3.88k|        const __m256i res_lo_unsigned =
  285|  3.88k|            _mm256_add_epi16(res_lo_round, offset_const_2);
  286|       |
  287|  3.88k|        if (w - j < 16) {
  ------------------
  |  Branch (287:13): [True: 1.86k, False: 2.01k]
  ------------------
  288|  1.86k|          if (do_average) {
  ------------------
  |  Branch (288:15): [True: 740, False: 1.12k]
  ------------------
  289|    740|            const __m256i data_ref_0 =
  290|    740|                load_line2_avx2(&dst[i * dst_stride + j],
  291|    740|                                &dst[i * dst_stride + j + dst_stride]);
  292|    740|            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned,
  293|    740|                                                  &wt, use_dist_wtd_comp_avg);
  294|       |
  295|    740|            const __m256i round_result = convolve_rounding(
  296|    740|                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  297|       |
  298|    740|            const __m256i res_8 =
  299|    740|                _mm256_packus_epi16(round_result, round_result);
  300|    740|            const __m128i res_0 = _mm256_castsi256_si128(res_8);
  301|    740|            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
  302|       |
  303|    740|            if (w - j > 4) {
  ------------------
  |  Branch (303:17): [True: 268, False: 472]
  ------------------
  304|    268|              _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  305|    268|              _mm_storel_epi64(
  306|    268|                  (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
  307|    268|                  res_1);
  308|    472|            } else {
  309|    472|              *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
  310|    472|              *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
  311|    472|                  _mm_cvtsi128_si32(res_1);
  312|    472|            }
  313|  1.12k|          } else {
  314|  1.12k|            const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
  315|  1.12k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  316|       |
  317|  1.12k|            const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
  318|  1.12k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  319|  1.12k|                            res_1);
  320|  1.12k|          }
  321|  2.01k|        } else {
  322|  2.01k|          __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
  323|       |
  324|  2.01k|          res_hi = _mm256_add_epi16(res_hi, offset_const_1);
  325|       |
  326|  2.01k|          const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
  327|  2.01k|          const __m256i res_hi_0_shift =
  328|  2.01k|              _mm256_slli_epi32(res_hi_0_32b, left_shift);
  329|  2.01k|          const __m256i res_hi_0_round = _mm256_sra_epi32(
  330|  2.01k|              _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
  331|       |
  332|  2.01k|          const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
  333|  2.01k|          const __m256i res_hi_1_shift =
  334|  2.01k|              _mm256_slli_epi32(res_hi_1_32b, left_shift);
  335|  2.01k|          const __m256i res_hi_1_round = _mm256_sra_epi32(
  336|  2.01k|              _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
  337|       |
  338|  2.01k|          const __m256i res_hi_round =
  339|  2.01k|              _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
  340|       |
  341|  2.01k|          const __m256i res_hi_unsigned =
  342|  2.01k|              _mm256_add_epi16(res_hi_round, offset_const_2);
  343|       |
  344|  2.01k|          if (do_average) {
  ------------------
  |  Branch (344:15): [True: 1.20k, False: 816]
  ------------------
  345|  1.20k|            const __m256i data_ref_0_lo =
  346|  1.20k|                load_line2_avx2(&dst[i * dst_stride + j],
  347|  1.20k|                                &dst[i * dst_stride + j + dst_stride]);
  348|       |
  349|  1.20k|            const __m256i data_ref_0_hi =
  350|  1.20k|                load_line2_avx2(&dst[i * dst_stride + j + 8],
  351|  1.20k|                                &dst[i * dst_stride + j + 8 + dst_stride]);
  352|       |
  353|  1.20k|            const __m256i comp_avg_res_lo = comp_avg(
  354|  1.20k|                &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg);
  355|       |
  356|  1.20k|            const __m256i comp_avg_res_hi = comp_avg(
  357|  1.20k|                &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg);
  358|       |
  359|  1.20k|            const __m256i round_result_lo =
  360|  1.20k|                convolve_rounding(&comp_avg_res_lo, &offset_const,
  361|  1.20k|                                  &rounding_const, rounding_shift);
  362|       |
  363|  1.20k|            const __m256i round_result_hi =
  364|  1.20k|                convolve_rounding(&comp_avg_res_hi, &offset_const,
  365|  1.20k|                                  &rounding_const, rounding_shift);
  366|       |
  367|  1.20k|            const __m256i res_8 =
  368|  1.20k|                _mm256_packus_epi16(round_result_lo, round_result_hi);
  369|  1.20k|            const __m128i res_0 = _mm256_castsi256_si128(res_8);
  370|  1.20k|            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
  371|       |
  372|  1.20k|            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  373|  1.20k|            _mm_store_si128(
  374|  1.20k|                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
  375|       |
  376|  1.20k|          } else {
  377|    816|            const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
  378|    816|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
  379|       |
  380|    816|            const __m128i res_lo_1 =
  381|    816|                _mm256_extracti128_si256(res_lo_unsigned, 1);
  382|    816|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  383|    816|                            res_lo_1);
  384|       |
  385|    816|            const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
  386|    816|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
  387|    816|                            res_hi_0);
  388|       |
  389|    816|            const __m128i res_hi_1 =
  390|    816|                _mm256_extracti128_si256(res_hi_unsigned, 1);
  391|    816|            _mm_store_si128(
  392|    816|                (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
  393|    816|                res_hi_1);
  394|    816|          }
  395|  2.01k|        }
  396|  3.88k|        s[0] = s[1];
  397|  3.88k|        s[1] = s[2];
  398|       |
  399|  3.88k|        s[3] = s[4];
  400|  3.88k|        s[4] = s[5];
  401|  3.88k|      }
  402|  1.11k|    }
  403|  1.88k|  } else {
  404|  1.88k|    const int fo_vert = filter_params_y->taps / 2 - 1;
  405|  1.88k|    const uint8_t *const src_ptr = src - fo_vert * src_stride;
  406|  4.12k|    for (j = 0; j < w; j += 16) {
  ------------------
  |  Branch (406:17): [True: 2.23k, False: 1.88k]
  ------------------
  407|  2.23k|      const uint8_t *data = &src_ptr[j];
  408|  2.23k|      __m256i src6;
  409|       |      // Load lines a and b. Line a to lower 128, line b to upper 128
  410|  2.23k|      {
  411|  2.23k|        __m256i src_ab[7];
  412|  2.23k|        __m256i src_a[7];
  413|  2.23k|        src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
  414|  15.6k|        for (int kk = 0; kk < 6; ++kk) {
  ------------------
  |  Branch (414:26): [True: 13.4k, False: 2.23k]
  ------------------
  415|  13.4k|          data += src_stride;
  416|  13.4k|          src_a[kk + 1] =
  417|  13.4k|              _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
  418|  13.4k|          src_ab[kk] =
  419|  13.4k|              _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
  420|  13.4k|        }
  421|  2.23k|        src6 = src_a[6];
  422|  2.23k|        s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
  423|  2.23k|        s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
  424|  2.23k|        s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
  425|  2.23k|        s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
  426|  2.23k|        s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
  427|  2.23k|        s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
  428|  2.23k|      }
  429|       |
  430|  21.9k|      for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (430:19): [True: 19.6k, False: 2.23k]
  ------------------
  431|  19.6k|        data = &src_ptr[(i + 7) * src_stride + j];
  432|  19.6k|        const __m256i src7 =
  433|  19.6k|            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
  434|  19.6k|        const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
  435|       |
  436|  19.6k|        src6 = _mm256_castsi128_si256(
  437|  19.6k|            _mm_loadu_si128((__m128i *)(data + src_stride)));
  438|  19.6k|        const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
  439|       |
  440|  19.6k|        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
  441|  19.6k|        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
  442|       |
  443|  19.6k|        __m256i res_lo = convolve_lowbd(s, coeffs);
  444|       |
  445|  19.6k|        res_lo = _mm256_add_epi16(res_lo, offset_const_1);
  446|       |
  447|  19.6k|        const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
  448|  19.6k|        const __m256i res_lo_0_shift =
  449|  19.6k|            _mm256_slli_epi32(res_lo_0_32b, left_shift);
  450|  19.6k|        const __m256i res_lo_0_round = _mm256_sra_epi32(
  451|  19.6k|            _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
  452|       |
  453|  19.6k|        const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
  454|  19.6k|        const __m256i res_lo_1_shift =
  455|  19.6k|            _mm256_slli_epi32(res_lo_1_32b, left_shift);
  456|  19.6k|        const __m256i res_lo_1_round = _mm256_sra_epi32(
  457|  19.6k|            _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
  458|       |
  459|  19.6k|        const __m256i res_lo_round =
  460|  19.6k|            _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
  461|       |
  462|  19.6k|        const __m256i res_lo_unsigned =
  463|  19.6k|            _mm256_add_epi16(res_lo_round, offset_const_2);
  464|       |
  465|  19.6k|        if (w - j < 16) {
  ------------------
  |  Branch (465:13): [True: 5.28k, False: 14.3k]
  ------------------
  466|  5.28k|          if (do_average) {
  ------------------
  |  Branch (466:15): [True: 2.52k, False: 2.76k]
  ------------------
  467|  2.52k|            const __m256i data_ref_0 =
  468|  2.52k|                load_line2_avx2(&dst[i * dst_stride + j],
  469|  2.52k|                                &dst[i * dst_stride + j + dst_stride]);
  470|  2.52k|            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned,
  471|  2.52k|                                                  &wt, use_dist_wtd_comp_avg);
  472|       |
  473|  2.52k|            const __m256i round_result = convolve_rounding(
  474|  2.52k|                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  475|       |
  476|  2.52k|            const __m256i res_8 =
  477|  2.52k|                _mm256_packus_epi16(round_result, round_result);
  478|  2.52k|            const __m128i res_0 = _mm256_castsi256_si128(res_8);
  479|  2.52k|            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
  480|       |
  481|  2.52k|            if (w - j > 4) {
  ------------------
  |  Branch (481:17): [True: 2.09k, False: 432]
  ------------------
  482|  2.09k|              _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  483|  2.09k|              _mm_storel_epi64(
  484|  2.09k|                  (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
  485|  2.09k|                  res_1);
  486|  2.09k|            } else {
  487|    432|              *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
  488|    432|              *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
  489|    432|                  _mm_cvtsi128_si32(res_1);
  490|    432|            }
  491|  2.76k|          } else {
  492|  2.76k|            const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
  493|  2.76k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  494|       |
  495|  2.76k|            const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
  496|  2.76k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  497|  2.76k|                            res_1);
  498|  2.76k|          }
  499|  14.3k|        } else {
  500|  14.3k|          __m256i res_hi = convolve_lowbd(s + 4, coeffs);
  501|       |
  502|  14.3k|          res_hi = _mm256_add_epi16(res_hi, offset_const_1);
  503|       |
  504|  14.3k|          const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
  505|  14.3k|          const __m256i res_hi_0_shift =
  506|  14.3k|              _mm256_slli_epi32(res_hi_0_32b, left_shift);
  507|  14.3k|          const __m256i res_hi_0_round = _mm256_sra_epi32(
  508|  14.3k|              _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
  509|       |
  510|  14.3k|          const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
  511|  14.3k|          const __m256i res_hi_1_shift =
  512|  14.3k|              _mm256_slli_epi32(res_hi_1_32b, left_shift);
  513|  14.3k|          const __m256i res_hi_1_round = _mm256_sra_epi32(
  514|  14.3k|              _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
  515|       |
  516|  14.3k|          const __m256i res_hi_round =
  517|  14.3k|              _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
  518|       |
  519|  14.3k|          const __m256i res_hi_unsigned =
  520|  14.3k|              _mm256_add_epi16(res_hi_round, offset_const_2);
  521|       |
  522|  14.3k|          if (do_average) {
  ------------------
  |  Branch (522:15): [True: 6.22k, False: 8.16k]
  ------------------
  523|  6.22k|            const __m256i data_ref_0_lo =
  524|  6.22k|                load_line2_avx2(&dst[i * dst_stride + j],
  525|  6.22k|                                &dst[i * dst_stride + j + dst_stride]);
  526|       |
  527|  6.22k|            const __m256i data_ref_0_hi =
  528|  6.22k|                load_line2_avx2(&dst[i * dst_stride + j + 8],
  529|  6.22k|                                &dst[i * dst_stride + j + 8 + dst_stride]);
  530|       |
  531|  6.22k|            const __m256i comp_avg_res_lo = comp_avg(
  532|  6.22k|                &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg);
  533|       |
  534|  6.22k|            const __m256i comp_avg_res_hi = comp_avg(
  535|  6.22k|                &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg);
  536|       |
  537|  6.22k|            const __m256i round_result_lo =
  538|  6.22k|                convolve_rounding(&comp_avg_res_lo, &offset_const,
  539|  6.22k|                                  &rounding_const, rounding_shift);
  540|       |
  541|  6.22k|            const __m256i round_result_hi =
  542|  6.22k|                convolve_rounding(&comp_avg_res_hi, &offset_const,
  543|  6.22k|                                  &rounding_const, rounding_shift);
  544|       |
  545|  6.22k|            const __m256i res_8 =
  546|  6.22k|                _mm256_packus_epi16(round_result_lo, round_result_hi);
  547|  6.22k|            const __m128i res_0 = _mm256_castsi256_si128(res_8);
  548|  6.22k|            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
  549|       |
  550|  6.22k|            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  551|  6.22k|            _mm_store_si128(
  552|  6.22k|                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
  553|       |
  554|  8.16k|          } else {
  555|  8.16k|            const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
  556|  8.16k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
  557|       |
  558|  8.16k|            const __m128i res_lo_1 =
  559|  8.16k|                _mm256_extracti128_si256(res_lo_unsigned, 1);
  560|  8.16k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  561|  8.16k|                            res_lo_1);
  562|       |
  563|  8.16k|            const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
  564|  8.16k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
  565|  8.16k|                            res_hi_0);
  566|       |
  567|  8.16k|            const __m128i res_hi_1 =
  568|  8.16k|                _mm256_extracti128_si256(res_hi_unsigned, 1);
  569|  8.16k|            _mm_store_si128(
  570|  8.16k|                (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
  571|  8.16k|                res_hi_1);
  572|  8.16k|          }
  573|  14.3k|        }
  574|  19.6k|        s[0] = s[1];
  575|  19.6k|        s[1] = s[2];
  576|  19.6k|        s[2] = s[3];
  577|       |
  578|  19.6k|        s[4] = s[5];
  579|  19.6k|        s[5] = s[6];
  580|  19.6k|        s[6] = s[7];
  581|  19.6k|      }
  582|  2.23k|    }
  583|  1.88k|  }
  584|  2.95k|}
av1_dist_wtd_convolve_2d_avx2:
  591|  9.41k|                                   ConvolveParams *conv_params) {
  592|  9.41k|  CONV_BUF_TYPE *dst = conv_params->dst;
  593|  9.41k|  int dst_stride = conv_params->dst_stride;
  594|  9.41k|  const int bd = 8;
  595|       |
  596|  9.41k|  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
  ------------------
  |  |   19|  9.41k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  597|       |
  598|  9.41k|  int im_stride = 8;
  599|  9.41k|  int i, is_horiz_4tap = 0, is_vert_4tap = 0;
  600|  9.41k|  const __m256i wt = unpack_weights_avx2(conv_params);
  601|  9.41k|  const int do_average = conv_params->do_average;
  602|  9.41k|  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  603|  9.41k|  const int offset_0 =
  604|  9.41k|      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  9.41k|#define FILTER_BITS 7
  ------------------
  605|  9.41k|  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  606|  9.41k|  const __m256i offset_const = _mm256_set1_epi16(offset);
  607|  9.41k|  const int rounding_shift =
  608|  9.41k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  9.41k|#define FILTER_BITS 7
  ------------------
  609|  9.41k|  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
  610|       |
  611|  9.41k|  assert(conv_params->round_0 > 0);
  612|       |
  613|  9.41k|  const __m256i round_const_h = _mm256_set1_epi16(
  614|  9.41k|      ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
  ------------------
  |  |   21|  9.41k|#define FILTER_BITS 7
  ------------------
  615|  9.41k|  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
  616|       |
  617|  9.41k|  const __m256i round_const_v = _mm256_set1_epi32(
  618|  9.41k|      ((1 << conv_params->round_1) >> 1) -
  619|  9.41k|      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
  ------------------
  |  |   21|  9.41k|#define FILTER_BITS 7
  ------------------
  620|  9.41k|  const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
  621|       |
  622|  9.41k|  __m256i filt[4], coeffs_x[4], coeffs_y[4];
  623|       |
  624|  9.41k|  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
  625|  9.41k|  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  626|       |
  627|  9.41k|  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x);
  628|  9.41k|  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
  629|       |
  630|       |  // Condition for checking valid horz_filt taps
  631|  9.41k|  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0)))
  ------------------
  |  Branch (631:7): [True: 3.33k, False: 6.08k]
  ------------------
  632|  3.33k|    is_horiz_4tap = 1;
  633|       |
  634|       |  // Condition for checking valid vert_filt taps
  635|  9.41k|  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0)))
  ------------------
  |  Branch (635:7): [True: 3.46k, False: 5.95k]
  ------------------
  636|  3.46k|    is_vert_4tap = 1;
  637|       |
  638|  9.41k|  if (is_horiz_4tap) {
  ------------------
  |  Branch (638:7): [True: 3.33k, False: 6.08k]
  ------------------
  639|  3.33k|    int im_h = h + filter_params_y->taps - 1;
  640|  3.33k|    const int fo_vert = filter_params_y->taps / 2 - 1;
  641|  3.33k|    const int fo_horiz = 1;
  642|  3.33k|    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
  643|  7.19k|    for (int j = 0; j < w; j += 8) {
  ------------------
  |  Branch (643:21): [True: 3.86k, False: 3.33k]
  ------------------
  644|       |      /* Horizontal filter */
  645|  3.86k|      const uint8_t *src_h = src_ptr + j;
  646|  38.9k|      for (i = 0; i < im_h; i += 2) {
  ------------------
  |  Branch (646:19): [True: 35.1k, False: 3.86k]
  ------------------
  647|  35.1k|        __m256i data =
  648|  35.1k|            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));
  649|  35.1k|        if (i + 1 < im_h)
  ------------------
  |  Branch (649:13): [True: 31.2k, False: 3.86k]
  ------------------
  650|  31.2k|          data = _mm256_inserti128_si256(
  651|  35.1k|              data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
  652|  35.1k|        src_h += (src_stride << 1);
  653|  35.1k|        __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt);
  654|       |
  655|  35.1k|        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
  656|  35.1k|                               round_shift_h);
  657|       |
  658|  35.1k|        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
  659|  35.1k|      }
  660|  3.86k|      DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
  ------------------
  |  |  501|  3.86k|  do {                                                                         \
  |  |  502|  3.86k|    __m256i s[8];                                                              \
  |  |  503|  3.86k|    __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));    \
  |  |  504|  3.86k|    __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));    \
  |  |  505|  3.86k|    __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));    \
  |  |  506|  3.86k|    __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));    \
  |  |  507|  3.86k|    __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));    \
  |  |  508|  3.86k|    __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));    \
  |  |  509|  3.86k|                                                                               \
  |  |  510|  3.86k|    s[0] = _mm256_unpacklo_epi16(s0, s1);                                      \
  |  |  511|  3.86k|    s[1] = _mm256_unpacklo_epi16(s2, s3);                                      \
  |  |  512|  3.86k|    s[2] = _mm256_unpacklo_epi16(s4, s5);                                      \
  |  |  513|  3.86k|                                                                               \
  |  |  514|  3.86k|    s[4] = _mm256_unpackhi_epi16(s0, s1);                                      \
  |  |  515|  3.86k|    s[5] = _mm256_unpackhi_epi16(s2, s3);                                      \
  |  |  516|  3.86k|    s[6] = _mm256_unpackhi_epi16(s4, s5);                                      \
  |  |  517|  3.86k|                                                                               \
  |  |  518|  23.5k|    for (i = 0; i < h; i += 2) {                                               \
  |  |  ------------------
  |  |  |  Branch (518:17): [True: 19.6k, False: 3.86k]
  |  |  ------------------
  |  |  519|  19.6k|      const int16_t *data = &im_block[i * im_stride];                          \
  |  |  520|  19.6k|                                                                               \
  |  |  521|  19.6k|      const __m256i s6 =                                                       \
  |  |  522|  19.6k|          _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));               \
  |  |  523|  19.6k|      const __m256i s7 =                                                       \
  |  |  524|  19.6k|          _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));               \
  |  |  525|  19.6k|                                                                               \
  |  |  526|  19.6k|      s[3] = _mm256_unpacklo_epi16(s6, s7);                                    \
  |  |  527|  19.6k|      s[7] = _mm256_unpackhi_epi16(s6, s7);                                    \
  |  |  528|  19.6k|                                                                               \
  |  |  529|  19.6k|      const __m256i res_a = convolve(s, coeffs_y);                             \
  |  |  530|  19.6k|      const __m256i res_a_round = _mm256_sra_epi32(                            \
  |  |  531|  19.6k|          _mm256_add_epi32(res_a, round_const_v), round_shift_v);              \
  |  |  532|  19.6k|                                                                               \
  |  |  533|  19.6k|      if (w - j > 4) {                                                         \
  |  |  ------------------
  |  |  |  Branch (533:11): [True: 10.7k, False: 8.90k]
  |  |  ------------------
  |  |  534|  10.7k|        const __m256i res_b = convolve(s + 4, coeffs_y);                       \
  |  |  535|  10.7k|        const __m256i res_b_round = _mm256_sra_epi32(                          \
  |  |  536|  10.7k|            _mm256_add_epi32(res_b, round_const_v), round_shift_v);            \
  |  |  537|  10.7k|        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);  \
  |  |  538|  10.7k|        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);  \
  |  |  539|  10.7k|                                                                               \
  |  |  540|  10.7k|        if (do_average) {                                                      \
  |  |  ------------------
  |  |  |  Branch (540:13): [True: 4.55k, False: 6.21k]
  |  |  ------------------
  |  |  541|  4.55k|          const __m256i data_ref_0 =                                           \
  |  |  542|  4.55k|              load_line2_avx2(&dst[i * dst_stride + j],                        \
  |  |  543|  4.55k|                              &dst[i * dst_stride + j + dst_stride]);          \
  |  |  544|  4.55k|          const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,    \
  |  |  545|  4.55k|                                                &wt, use_dist_wtd_comp_avg);   \
  |  |  546|  4.55k|                                                                               \
  |  |  547|  4.55k|          const __m256i round_result = convolve_rounding(                      \
  |  |  548|  4.55k|              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);  \
  |  |  549|  4.55k|                                                                               \
  |  |  550|  4.55k|          const __m256i res_8 =                                                \
  |  |  551|  4.55k|              _mm256_packus_epi16(round_result, round_result);                 \
  |  |  552|  4.55k|          const __m128i res_0 = _mm256_castsi256_si128(res_8);                 \
  |  |  553|  4.55k|          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);            \
  |  |  554|  4.55k|                                                                               \
  |  |  555|  4.55k|          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);    \
  |  |  556|  4.55k|          _mm_storel_epi64(                                                    \
  |  |  557|  4.55k|              (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \
  |  |  558|  6.21k|        } else {                                                               \
  |  |  559|  6.21k|          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);          \
  |  |  560|  6.21k|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);       \
  |  |  561|  6.21k|                                                                               \
  |  |  562|  6.21k|          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);     \
  |  |  563|  6.21k|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),  \
  |  |  564|  6.21k|                          res_1);                                              \
  |  |  565|  6.21k|        }                                                                      \
  |  |  566|  10.7k|      } else {                                                                 \
  |  |  567|  8.90k|        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);  \
  |  |  568|  8.90k|        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);  \
  |  |  569|  8.90k|                                                                               \
  |  |  570|  8.90k|        if (do_average) {                                                      \
  |  |  ------------------
  |  |  |  Branch (570:13): [True: 3.67k, False: 5.23k]
  |  |  ------------------
  |  |  571|  3.67k|          const __m256i data_ref_0 =                                           \
  |  |  572|  3.67k|              load_line2_avx2(&dst[i * dst_stride + j],                        \
  |  |  573|  3.67k|                              &dst[i * dst_stride + j + dst_stride]);          \
  |  |  574|  3.67k|                                                                               \
  |  |  575|  3.67k|          const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,    \
  |  |  576|  3.67k|                                                &wt, use_dist_wtd_comp_avg);   \
  |  |  577|  3.67k|                                                                               \
  |  |  578|  3.67k|          const __m256i round_result = convolve_rounding(                      \
  |  |  579|  3.67k|              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);  \
  |  |  580|  3.67k|                                                                               \
  |  |  581|  3.67k|          const __m256i res_8 =                                                \
  |  |  582|  3.67k|              _mm256_packus_epi16(round_result, round_result);                 \
  |  |  583|  3.67k|          const __m128i res_0 = _mm256_castsi256_si128(res_8);                 \
  |  |  584|  3.67k|          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);            \
  |  |  585|  3.67k|                                                                               \
  |  |  586|  3.67k|          *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);     \
  |  |  587|  3.67k|          *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =                 \
  |  |  588|  3.67k|              _mm_cvtsi128_si32(res_1);                                        \
  |  |  589|  3.67k|                                                                               \
  |  |  590|  5.23k|        } else {                                                               \
  |  |  591|  5.23k|          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);          \
  |  |  592|  5.23k|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);       \
  |  |  593|  5.23k|                                                                               \
  |  |  594|  5.23k|          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);     \
  |  |  595|  5.23k|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),  \
  |  |  596|  5.23k|                          res_1);                                              \
  |  |  597|  5.23k|        }                                                                      \
  |  |  598|  8.90k|      }                                                                        \
  |  |  599|  19.6k|                                                                               \
  |  |  600|  19.6k|      s[0] = s[1];                                                             \
  |  |  601|  19.6k|      s[1] = s[2];                                                             \
  |  |  602|  19.6k|      s[2] = s[3];                                                             \
  |  |  603|  19.6k|                                                                               \
  |  |  604|  19.6k|      s[4] = s[5];                                                             \
  |  |  605|  19.6k|      s[5] = s[6];                                                             \
  |  |  606|  19.6k|      s[6] = s[7];                                                             \
  |  |  607|  19.6k|    }                                                                          \
  |  |  608|  3.86k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (608:12): [Folded, False: 3.86k]
  |  |  ------------------
  ------------------
  661|  3.86k|    }
  662|  6.08k|  } else if (is_vert_4tap) {
  ------------------
  |  Branch (662:14): [True: 1.33k, False: 4.74k]
  ------------------
  663|  1.33k|    int im_h = h + 3;
  664|  1.33k|    const int fo_vert = 1;
  665|  1.33k|    const int fo_horiz = filter_params_x->taps / 2 - 1;
  666|  1.33k|    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
  667|       |
  668|  1.33k|    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
  669|  1.33k|    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
  670|       |
  671|  3.24k|    for (int j = 0; j < w; j += 8) {
  ------------------
  |  Branch (671:21): [True: 1.90k, False: 1.33k]
  ------------------
  672|       |      /* Horizontal filter */
  673|  1.90k|      const uint8_t *src_h = src_ptr + j;
  674|  1.90k|      DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
  ------------------
  |  |  483|  1.90k|  do {                                                                  \
  |  |  484|  16.8k|    for (i = 0; i < im_h; i += 2) {                                     \
  |  |  ------------------
  |  |  |  Branch (484:17): [True: 14.9k, False: 1.90k]
  |  |  ------------------
  |  |  485|  14.9k|      __m256i data =                                                    \
  |  |  486|  14.9k|          _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));    \
  |  |  487|  14.9k|      if (i + 1 < im_h)                                                 \
  |  |  ------------------
  |  |  |  Branch (487:11): [True: 13.0k, False: 1.90k]
  |  |  ------------------
  |  |  488|  14.9k|        data = _mm256_inserti128_si256(                                 \
  |  |  489|  14.9k|            data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \
  |  |  490|  14.9k|      src_h += (src_stride << 1);                                       \
  |  |  491|  14.9k|      __m256i res = convolve_lowbd_x(data, coeffs_x, filt);             \
  |  |  492|  14.9k|                                                                        \
  |  |  493|  14.9k|      res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),      \
  |  |  494|  14.9k|                             round_shift_h);                            \
  |  |  495|  14.9k|                                                                        \
  |  |  496|  14.9k|      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);     \
  |  |  497|  14.9k|    }                                                                   \
  |  |  498|  1.90k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (498:12): [Folded, False: 1.90k]
  |  |  ------------------
  ------------------
  675|       |
  676|       |      /* Vertical filter */
  677|  1.90k|      __m256i s[6];
  678|  1.90k|      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
  679|  1.90k|      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
  680|  1.90k|      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
  681|  1.90k|      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
  682|       |
  683|  1.90k|      s[0] = _mm256_unpacklo_epi16(s0, s1);
  684|  1.90k|      s[1] = _mm256_unpacklo_epi16(s2, s3);
  685|       |
  686|  1.90k|      s[3] = _mm256_unpackhi_epi16(s0, s1);
  687|  1.90k|      s[4] = _mm256_unpackhi_epi16(s2, s3);
  688|       |
  689|  13.0k|      for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (689:19): [True: 11.1k, False: 1.90k]
  ------------------
  690|  11.1k|        const int16_t *data = &im_block[i * im_stride];
  691|       |
  692|  11.1k|        const __m256i s4 =
  693|  11.1k|            _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
  694|  11.1k|        const __m256i s5 =
  695|  11.1k|            _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
  696|       |
  697|  11.1k|        s[2] = _mm256_unpacklo_epi16(s4, s5);
  698|  11.1k|        s[5] = _mm256_unpackhi_epi16(s4, s5);
  699|       |
  700|  11.1k|        const __m256i res_a = convolve_4tap(s, coeffs_y + 1);
  701|  11.1k|        const __m256i res_a_round = _mm256_sra_epi32(
  702|  11.1k|            _mm256_add_epi32(res_a, round_const_v), round_shift_v);
  703|       |
  704|  11.1k|        if (w - j > 4) {
  ------------------
  |  Branch (704:13): [True: 11.1k, False: 0]
  ------------------
  705|  11.1k|          const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1);
  706|  11.1k|          const __m256i res_b_round = _mm256_sra_epi32(
  707|  11.1k|              _mm256_add_epi32(res_b, round_const_v), round_shift_v);
  708|  11.1k|          const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);
  709|  11.1k|          const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
  710|       |
  711|  11.1k|          if (do_average) {
  ------------------
  |  Branch (711:15): [True: 4.24k, False: 6.92k]
  ------------------
  712|  4.24k|            const __m256i data_ref_0 =
  713|  4.24k|                load_line2_avx2(&dst[i * dst_stride + j],
  714|  4.24k|                                &dst[i * dst_stride + j + dst_stride]);
  715|  4.24k|            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
  716|  4.24k|                                                  &wt, use_dist_wtd_comp_avg);
  717|       |
  718|  4.24k|            const __m256i round_result = convolve_rounding(
  719|  4.24k|                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  720|       |
  721|  4.24k|            const __m256i res_8 =
  722|  4.24k|                _mm256_packus_epi16(round_result, round_result);
  723|  4.24k|            const __m128i res_0 = _mm256_castsi256_si128(res_8);
  724|  4.24k|            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
  725|       |
  726|  4.24k|            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  727|  4.24k|            _mm_storel_epi64(
  728|  4.24k|                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
  729|  6.92k|          } else {
  730|  6.92k|            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
  731|  6.92k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  732|       |
  733|  6.92k|            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
  734|  6.92k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  735|  6.92k|                            res_1);
  736|  6.92k|          }
  737|  11.1k|        } else {
  738|      0|          const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);
  739|      0|          const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
  740|       |
  741|      0|          if (do_average) {
  ------------------
  |  Branch (741:15): [True: 0, False: 0]
  ------------------
  742|      0|            const __m256i data_ref_0 =
  743|      0|                load_line2_avx2(&dst[i * dst_stride + j],
  744|      0|                                &dst[i * dst_stride + j + dst_stride]);
  745|       |
  746|      0|            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
  747|      0|                                                  &wt, use_dist_wtd_comp_avg);
  748|       |
  749|      0|            const __m256i round_result = convolve_rounding(
  750|      0|                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  751|       |
  752|      0|            const __m256i res_8 =
  753|      0|                _mm256_packus_epi16(round_result, round_result);
  754|      0|            const __m128i res_0 = _mm256_castsi256_si128(res_8);
  755|      0|            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
  756|       |
  757|      0|            *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
  758|      0|            *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
  759|      0|                _mm_cvtsi128_si32(res_1);
  760|       |
  761|      0|          } else {
  762|      0|            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
  763|      0|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  764|       |
  765|      0|            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
  766|      0|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  767|      0|                            res_1);
  768|      0|          }
  769|      0|        }
  770|  11.1k|        s[0] = s[1];
  771|  11.1k|        s[1] = s[2];
  772|  11.1k|        s[3] = s[4];
  773|  11.1k|        s[4] = s[5];
  774|  11.1k|      }
  775|  1.90k|    }
  776|  4.74k|  } else {
  777|  4.74k|    int im_h = h + filter_params_y->taps - 1;
  778|  4.74k|    const int fo_vert = filter_params_y->taps / 2 - 1;
  779|  4.74k|    const int fo_horiz = filter_params_x->taps / 2 - 1;
  780|  4.74k|    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
  781|       |
  782|  4.74k|    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
  783|  4.74k|    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
  784|       |
  785|  13.3k|    for (int j = 0; j < w; j += 8) {
  ------------------
  |  Branch (785:21): [True: 8.58k, False: 4.74k]
  ------------------
  786|       |      /* Horizontal filter */
  787|  8.58k|      const uint8_t *src_h = src_ptr + j;
  788|  8.58k|      DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
  ------------------
  |  |  483|  8.58k|  do {                                                                  \
  |  |  484|   127k|    for (i = 0; i < im_h; i += 2) {                                     \
  |  |  ------------------
  |  |  |  Branch (484:17): [True: 118k, False: 8.58k]
  |  |  ------------------
  |  |  485|   118k|      __m256i data =                                                    \
  |  |  486|   118k|          _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));    \
  |  |  487|   118k|      if (i + 1 < im_h)                                                 \
  |  |  ------------------
  |  |  |  Branch (487:11): [True: 109k, False: 8.58k]
  |  |  ------------------
  |  |  488|   118k|        data = _mm256_inserti128_si256(                                 \
  |  |  489|   118k|            data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \
  |  |  490|   118k|      src_h += (src_stride << 1);                                       \
  |  |  491|   118k|      __m256i res = convolve_lowbd_x(data, coeffs_x, filt);             \
  |  |  492|   118k|                                                                        \
  |  |  493|   118k|      res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),      \
  |  |  494|   118k|                             round_shift_h);                            \
  |  |  495|   118k|                                                                        \
  |  |  496|   118k|      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);     \
  |  |  497|   118k|    }                                                                   \
  |  |  498|  8.58k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (498:12): [Folded, False: 8.58k]
  |  |  ------------------
  ------------------
  789|       |
  790|       |      DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
  ------------------
  |  |  501|  8.58k|  do {                                                                         \
  |  |  502|  8.58k|    __m256i s[8];                                                              \
  |  |  503|  8.58k|    __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));    \
  |  |  504|  8.58k|    __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));    \
  |  |  505|  8.58k|    __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));    \
  |  |  506|  8.58k|    __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));    \
  |  |  507|  8.58k|    __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));    \
  |  |  508|  8.58k|    __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));    \
  |  |  509|  8.58k|                                                                               \
  |  |  510|  8.58k|    s[0] = _mm256_unpacklo_epi16(s0, s1);                                      \
  |  |  511|  8.58k|    s[1] = _mm256_unpacklo_epi16(s2, s3);                                      \
  |  |  512|  8.58k|    s[2] = _mm256_unpacklo_epi16(s4, s5);                                      \
  |  |  513|  8.58k|                                                                               \
  |  |  514|  8.58k|    s[4] = _mm256_unpackhi_epi16(s0, s1);                                      \
  |  |  515|  8.58k|    s[5] = _mm256_unpackhi_epi16(s2, s3);                                      \
  |  |  516|  8.58k|    s[6] = _mm256_unpackhi_epi16(s4, s5);                                      \
  |  |  517|  8.58k|                                                                               \
  |  |  518|  92.7k|    for (i = 0; i < h; i += 2) {                                               \
  |  |  ------------------
  |  |  |  Branch (518:17): [True: 84.1k, False: 8.58k]
  |  |  ------------------
  |  |  519|  84.1k|      const int16_t *data = &im_block[i * im_stride];                          \
  |  |  520|  84.1k|                                                                               \
  |  |  521|  84.1k|      const __m256i s6 =                                                       \
  |  |  522|  84.1k|          _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));               \
  |  |  523|  84.1k|      const __m256i s7 =                                                       \
  |  |  524|  84.1k|          _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));               \
  |  |  525|  84.1k|                                                                               \
  |  |  526|  84.1k|      s[3] = _mm256_unpacklo_epi16(s6, s7);                                    \
  |  |  527|  84.1k|      s[7] = _mm256_unpackhi_epi16(s6, s7);                                    \
  |  |  528|  84.1k|                                                                               \
  |  |  529|  84.1k|      const __m256i res_a = convolve(s, coeffs_y);                             \
  |  |  530|  84.1k|      const __m256i res_a_round = _mm256_sra_epi32(                            \
  |  |  531|  84.1k|          _mm256_add_epi32(res_a, round_const_v), round_shift_v);              \
  |  |  532|  84.1k|                                                                               \
  |  |  533|  84.1k|      if (w - j > 4) {                                                         \
  |  |  ------------------
  |  |  |  Branch (533:11): [True: 84.1k, False: 0]
  |  |  ------------------
  |  |  534|  84.1k|        const __m256i res_b = convolve(s + 4, coeffs_y);                       \
  |  |  535|  84.1k|        const __m256i res_b_round = _mm256_sra_epi32(                          \
  |  |  536|  84.1k|            _mm256_add_epi32(res_b, round_const_v), round_shift_v);            \
  |  |  537|  84.1k|        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);  \
  |  |  538|  84.1k|        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);  \
  |  |  539|  84.1k|                                                                               \
  |  |  540|  84.1k|        if (do_average) {                                                      \
  |  |  ------------------
  |  |  |  Branch (540:13): [True: 32.3k, False: 51.8k]
  |  |  ------------------
  |  |  541|  32.3k|          const __m256i data_ref_0 =                                           \
  |  |  542|  32.3k|              load_line2_avx2(&dst[i * dst_stride + j],                        \
  |  |  543|  32.3k|                              &dst[i * dst_stride + j + dst_stride]);          \
  |  |  544|  32.3k|          const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,    \
  |  |  545|  32.3k|                                                &wt, use_dist_wtd_comp_avg);   \
  |  |  546|  32.3k|                                                                               \
  |  |  547|  32.3k|          const __m256i round_result = convolve_rounding(                      \
  |  |  548|  32.3k|              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);  \
  |  |  549|  32.3k|                                                                               \
  |  |  550|  32.3k|          const __m256i res_8 =                                                \
  |  |  551|  32.3k|              _mm256_packus_epi16(round_result, round_result);                 \
  |  |  552|  32.3k|          const __m128i res_0 = _mm256_castsi256_si128(res_8);                 \
  |  |  553|  32.3k|          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);            \
  |  |  554|  32.3k|                                                                               \
  |  |  555|  32.3k|          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);    \
  |  |  556|  32.3k|          _mm_storel_epi64(                                                    \
  |  |  557|  32.3k|              (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \
  |  |  558|  51.8k|        } else {                                                               \
  |  |  559|  51.8k|          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);          \
  |  |  560|  51.8k|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);       \
  |  |  561|  51.8k|                                                                               \
  |  |  562|  51.8k|          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);     \
  |  |  563|  51.8k|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),  \
  |  |  564|  51.8k|                          res_1);                                              \
  |  |  565|  51.8k|        }                                                                      \
  |  |  566|  84.1k|      } else {                                                                 \
  |  |  567|      0|        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);  \
  |  |  568|      0|        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);  \
  |  |  569|      0|                                                                               \
  |  |  570|      0|        if (do_average) {                                                      \
  |  |  ------------------
  |  |  |  Branch (570:13): [True: 0, False: 0]
  |  |  ------------------
  |  |  571|      0|          const __m256i data_ref_0 =                                           \
  |  |  572|      0|              load_line2_avx2(&dst[i * dst_stride + j],                        \
  |  |  573|      0|                              &dst[i * dst_stride + j + dst_stride]);          \
  |  |  574|      0|                                                                               \
  |  |  575|      0|          const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,    \
  |  |  576|      0|                                                &wt, use_dist_wtd_comp_avg);   \
  |  |  577|      0|                                                                               \
  |  |  578|      0|          const __m256i round_result = convolve_rounding(                      \
  |  |  579|      0|              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);  \
  |  |  580|      0|                                                                               \
  |  |  581|      0|          const __m256i res_8 =                                                \
  |  |  582|      0|              _mm256_packus_epi16(round_result, round_result);                 \
  |  |  583|      0|          const __m128i res_0 = _mm256_castsi256_si128(res_8);                 \
  |  |  584|      0|          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);            \
  |  |  585|      0|                                                                               \
  |  |  586|      0|          *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);     \
  |  |  587|      0|          *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =                 \
  |  |  588|      0|              _mm_cvtsi128_si32(res_1);                                        \
  |  |  589|      0|                                                                               \
  |  |  590|      0|        } else {                                                               \
  |  |  591|      0|          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);          \
  |  |  592|      0|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);       \
  |  |  593|      0|                                                                               \
  |  |  594|      0|          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);     \
  |  |  595|      0|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),  \
  |  |  596|      0|                          res_1);                                              \
  |  |  597|      0|        }                                                                      \
  |  |  598|      0|      }                                                                        \
  |  |  599|  84.1k|                                                                               \
  |  |  600|  84.1k|      s[0] = s[1];                                                             \
  |  |  601|  84.1k|      s[1] = s[2];                                                             \
  |  |  602|  84.1k|      s[2] = s[3];                                                             \
  |  |  603|  84.1k|                                                                               \
  |  |  604|  84.1k|      s[4] = s[5];                                                             \
  |  |  605|  84.1k|      s[5] = s[6];                                                             \
  |  |  606|  84.1k|      s[6] = s[7];                                                             \
  |  |  607|  84.1k|    }                                                                          \
  |  |  608|  8.58k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (608:12): [Folded, False: 8.58k]
  |  |  ------------------
  ------------------
  791|  8.58k|    }
  792|  4.74k|  }
  793|  9.41k|}
av1_dist_wtd_convolve_2d_copy_avx2:
 1091|  13.6k|                                        int h, ConvolveParams *conv_params) {
 1092|  13.6k|  const int bd = 8;
 1093|  13.6k|  CONV_BUF_TYPE *dst = conv_params->dst;
 1094|  13.6k|  int dst_stride = conv_params->dst_stride;
 1095|  13.6k|  assert(conv_params->round_0 == 3);
 1096|  13.6k|  assert(conv_params->round_1 == 7);
 1097|  13.6k|  assert(w % 4 == 0);
 1098|  13.6k|  assert(h % 4 == 0);
 1099|       |
 1100|  13.6k|  const int do_average = conv_params->do_average;
 1101|  13.6k|  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
 1102|  13.6k|  const __m256i wt = unpack_weights_avx2(conv_params);
 1103|  13.6k|  const __m256i zero = _mm256_setzero_si256();
 1104|       |
 1105|  13.6k|  const int offset_0 =
 1106|  13.6k|      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  13.6k|#define FILTER_BITS 7
  ------------------
 1107|  13.6k|  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
 1108|  13.6k|  const __m256i offset_const = _mm256_set1_epi16(offset);
 1109|  13.6k|  const int rounding_shift =
 1110|  13.6k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  13.6k|#define FILTER_BITS 7
  ------------------
 1111|  13.6k|  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
 1112|       |
 1113|  13.6k|  if (do_average) {
  ------------------
  |  Branch (1113:7): [True: 5.56k, False: 8.12k]
  ------------------
 1114|  5.56k|    if (use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (1114:9): [True: 1.10k, False: 4.45k]
  ------------------
 1115|  1.10k|      DO_AVG_2D_COPY(1)
  ------------------
  |  |  957|  1.10k|  int i = h;                                                                  \
  |  |  958|  1.10k|  if (w >= 16) {                                                              \
  |  |  ------------------
  |  |  |  Branch (958:7): [True: 435, False: 668]
  |  |  ------------------
  |  |  959|    435|    __m256i src_0, src_1, src_2, src_3;                                       \
  |  |  960|    435|    __m256i ref_0, ref_1, ref_2, ref_3;                                       \
  |  |  961|    435|    __m256i res_0, res_1, res_2, res_3;                                       \
  |  |  962|    435|    __m256i res_10, res_32;                                                   \
  |  |  963|    435|    if (w == 128) {                                                           \
  |  |  ------------------
  |  |  |  Branch (963:9): [True: 6, False: 429]
  |  |  ------------------
  |  |  964|    768|      do {                                                                    \
  |  |  965|    768|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48);    \
  |  |  ------------------
  |  |  |  |  903|    768|  do {                                                                         \
  |  |  |  |  904|    768|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|    768|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|    768|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|    768|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|    768|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|    768|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|    768|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|    768|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|    768|                                                                               \
  |  |  |  |  913|    768|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|    768|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|    768|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|    768|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|    768|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|    768|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|    768|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|    768|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|    768|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|    768|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|    768|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|    768|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|    768|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|    768|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|    768|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|    768|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|    768|                                                                               \
  |  |  |  |  922|    768|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|    768|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|    768|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|    768|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|    768|                                                                               \
  |  |  |  |  927|    768|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|    768|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|    768|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|    768|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|    768|                                                                               \
  |  |  |  |  932|    768|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|    768|                              rounding_shift);                                 \
  |  |  |  |  934|    768|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|    768|                              rounding_shift);                                 \
  |  |  |  |  936|    768|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|    768|                              rounding_shift);                                 \
  |  |  |  |  938|    768|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|    768|                              rounding_shift);                                 \
  |  |  |  |  940|    768|                                                                               \
  |  |  |  |  941|    768|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|    768|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|    768|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|    768|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|    768|                                                                               \
  |  |  |  |  946|    768|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|    768|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|    768|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|    768|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|    768|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|    768|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|    768|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|    768|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|    768|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded, False: 768]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  966|    768|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 64, 0, 80, 0, 96, 0, 112);  \
  |  |  ------------------
  |  |  |  |  903|    768|  do {                                                                         \
  |  |  |  |  904|    768|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|    768|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|    768|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|    768|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|    768|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|    768|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|    768|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|    768|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|    768|                                                                               \
  |  |  |  |  913|    768|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|    768|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|    768|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|    768|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|    768|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|    768|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|    768|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|    768|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|    768|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|    768|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|    768|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|    768|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|    768|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|    768|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|    768|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|    768|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|    768|                                                                               \
  |  |  |  |  922|    768|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|    768|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|    768|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|    768|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|    768|                                                                               \
  |  |  |  |  927|    768|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|    768|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|    768|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|    768|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|    768|                                                                               \
  |  |  |  |  932|    768|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|    768|                              rounding_shift);                                 \
  |  |  |  |  934|    768|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|    768|                              rounding_shift);                                 \
  |  |  |  |  936|    768|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|    768|                              rounding_shift);                                 \
  |  |  |  |  938|    768|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|    768|                              rounding_shift);                                 \
  |  |  |  |  940|    768|                                                                               \
  |  |  |  |  941|    768|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|    768|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|    768|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|    768|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|    768|                                                                               \
  |  |  |  |  946|    768|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|    768|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|    768|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|    768|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|    768|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|    768|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|    768|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|    768|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|    768|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded, False: 768]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  967|    768|        i -= 1;                                                               \
  |  |  968|    768|        src += 1 * src_stride;                                                \
  |  |  969|    768|        dst += 1 * dst_stride;                                                \
  |  |  970|    768|        dst0 += 1 * dst_stride0;                                              \
  |  |  971|    768|      } while (i);                                                            \
  |  |  ------------------
  |  |  |  Branch (971:16): [True: 762, False: 6]
  |  |  ------------------
  |  |  972|    429|    } else if (w == 64) {                                                     \
  |  |  ------------------
  |  |  |  Branch (972:16): [True: 25, False: 404]
  |  |  ------------------
  |  |  973|  1.44k|      do {                                                                    \
  |  |  974|  1.44k|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48);    \
  |  |  ------------------
  |  |  |  |  903|  1.44k|  do {                                                                         \
  |  |  |  |  904|  1.44k|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  1.44k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  1.44k|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  1.44k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  1.44k|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  1.44k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  1.44k|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  1.44k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  1.44k|                                                                               \
  |  |  |  |  913|  1.44k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  1.44k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  1.44k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  1.44k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  1.44k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  1.44k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  1.44k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  1.44k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  1.44k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  1.44k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  1.44k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  1.44k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  1.44k|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  1.44k|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  1.44k|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  1.44k|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  1.44k|                                                                               \
  |  |  |  |  922|  1.44k|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  1.44k|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  1.44k|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  1.44k|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  1.44k|                                                                               \
  |  |  |  |  927|  1.44k|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  1.44k|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  1.44k|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  1.44k|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  1.44k|                                                                               \
  |  |  |  |  932|  1.44k|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  1.44k|                              rounding_shift);                                 \
  |  |  |  |  934|  1.44k|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  1.44k|                              rounding_shift);                                 \
  |  |  |  |  936|  1.44k|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  1.44k|                              rounding_shift);                                 \
  |  |  |  |  938|  1.44k|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  1.44k|                              rounding_shift);                                 \
  |  |  |  |  940|  1.44k|                                                                               \
  |  |  |  |  941|  1.44k|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  1.44k|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  1.44k|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  1.44k|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  1.44k|                                                                               \
  |  |  |  |  946|  1.44k|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  1.44k|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  1.44k|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  1.44k|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  1.44k|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  1.44k|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  1.44k|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  1.44k|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  1.44k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded, False: 1.44k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  975|  1.44k|                                                                              \
  |  |  976|  1.44k|        i -= 1;                                                               \
  |  |  977|  1.44k|        src += 1 * src_stride;                                                \
  |  |  978|  1.44k|        dst += 1 * dst_stride;                                                \
  |  |  979|  1.44k|        dst0 += 1 * dst_stride0;                                              \
  |  |  980|  1.44k|      } while (i);                                                            \
  |  |  ------------------
  |  |  |  Branch (980:16): [True: 1.41k, False: 25]
  |  |  ------------------
  |  |  981|    404|    } else if (w == 32) {                                                     \
  |  |  ------------------
  |  |  |  Branch (981:16): [True: 101, False: 303]
  |  |  ------------------
  |  |  982|  1.19k|      do {                                                                    \
  |  |  983|  1.19k|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 0, 16, 1, 16);     \
  |  |  ------------------
  |  |  |  |  903|  1.19k|  do {                                                                         \
  |  |  |  |  904|  1.19k|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  1.19k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  1.19k|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  1.19k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  1.19k|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  1.19k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  1.19k|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  1.19k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  1.19k|                                                                               \
  |  |  |  |  913|  1.19k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  1.19k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  1.19k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  1.19k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  1.19k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  1.19k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  1.19k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  1.19k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  1.19k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  1.19k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  1.19k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  1.19k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  1.19k|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  1.19k|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  1.19k|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  1.19k|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  1.19k|                                                                               \
  |  |  |  |  922|  1.19k|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  1.19k|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  1.19k|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  1.19k|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  1.19k|                                                                               \
  |  |  |  |  927|  1.19k|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  1.19k|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  1.19k|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  1.19k|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  1.19k|                                                                               \
  |  |  |  |  932|  1.19k|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  1.19k|                              rounding_shift);                                 \
  |  |  |  |  934|  1.19k|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  1.19k|                              rounding_shift);                                 \
  |  |  |  |  936|  1.19k|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  1.19k|                              rounding_shift);                                 \
  |  |  |  |  938|  1.19k|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  1.19k|                              rounding_shift);                                 \
  |  |  |  |  940|  1.19k|                                                                               \
  |  |  |  |  941|  1.19k|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  1.19k|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  1.19k|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  1.19k|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  1.19k|                                                                               \
  |  |  |  |  946|  1.19k|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  1.19k|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  1.19k|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  1.19k|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  1.19k|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  1.19k|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  1.19k|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  1.19k|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  1.19k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded, False: 1.19k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  984|  1.19k|                                                                              \
  |  |  985|  1.19k|        i -= 2;                                                               \
  |  |  986|  1.19k|        src += 2 * src_stride;                                                \
  |  |  987|  1.19k|        dst += 2 * dst_stride;                                                \
  |  |  988|  1.19k|        dst0 += 2 * dst_stride0;                                              \
  |  |  989|  1.19k|      } while (i);                                                            \
  |  |  ------------------
  |  |  |  Branch (989:16): [True: 1.09k, False: 101]
  |  |  ------------------
  |  |  990|    303|    } else {                                                                  \
  |  |  991|    303|      assert(w == 16);                                                        \
  |  |  992|    998|      do {                                                                    \
  |  |  993|    998|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 2, 0, 3, 0);       \
  |  |  ------------------
  |  |  |  |  903|    998|  do {                                                                         \
  |  |  |  |  904|    998|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|    998|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|    998|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|    998|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|    998|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|    998|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|    998|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|    998|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|    998|                                                                               \
  |  |  |  |  913|    998|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|    998|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|    998|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|    998|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|    998|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|    998|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|    998|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|    998|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|    998|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|    998|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|    998|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|    998|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|    998|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|    998|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|    998|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|    998|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|    998|                                                                               \
  |  |  |  |  922|    998|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|    998|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|    998|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|    998|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|    998|                                                                               \
  |  |  |  |  927|    998|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|    998|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|    998|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|    998|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|    998|                                                                               \
  |  |  |  |  932|    998|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|    998|                              rounding_shift);                                 \
  |  |  |  |  934|    998|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|    998|                              rounding_shift);                                 \
  |  |  |  |  936|    998|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|    998|                              rounding_shift);                                 \
  |  |  |  |  938|    998|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|    998|                              rounding_shift);                                 \
  |  |  |  |  940|    998|                                                                               \
  |  |  |  |  941|    998|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|    998|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|    998|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|    998|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|    998|                                                                               \
  |  |  |  |  946|    998|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|    998|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|    998|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|    998|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|    998|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|    998|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|    998|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|    998|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|    998|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded, False: 998]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  994|    998|                                                                              \
  |  |  995|    998|        i -= 4;                                                               \
  |  |  996|    998|        src += 4 * src_stride;                                                \
  |  |  997|    998|        dst += 4 * dst_stride;                                                \
  |  |  998|    998|        dst0 += 4 * dst_stride0;                                              \
  |  |  999|    998|      } while (i);                                                            \
  |  |  ------------------
  |  |  |  Branch (999:16): [True: 695, False: 303]
  |  |  ------------------
  |  | 1000|    303|    }                                                                         \
  |  | 1001|    668|  } else if (w == 8) {                                                        \
  |  |  ------------------
  |  |  |  Branch (1001:14): [True: 408, False: 260]
  |  |  ------------------
  |  | 1002|  1.02k|    do {                                                                      \
  |  | 1003|  1.02k|      const __m128i src_0 =                                                   \
  |  | 1004|  1.02k|          _mm_loadl_epi64((__m128i *)(&src[0 * src_stride]));                 \
  |  | 1005|  1.02k|      const __m128i src_1 =                                                   \
  |  | 1006|  1.02k|          _mm_loadl_epi64((__m128i *)(&src[1 * src_stride]));                 \
  |  | 1007|  1.02k|      const __m128i src_2 =                                                   \
  |  | 1008|  1.02k|          _mm_loadl_epi64((__m128i *)(&src[2 * src_stride]));                 \
  |  | 1009|  1.02k|      const __m128i src_3 =                                                   \
  |  | 1010|  1.02k|          _mm_loadl_epi64((__m128i *)(&src[3 * src_stride]));                 \
  |  | 1011|  1.02k|      __m256i src_10 =                                                        \
  |  | 1012|  1.02k|          _mm256_insertf128_si256(_mm256_castsi128_si256(src_0), src_1, 1);   \
  |  | 1013|  1.02k|      __m256i src_32 =                                                        \
  |  | 1014|  1.02k|          _mm256_insertf128_si256(_mm256_castsi128_si256(src_2), src_3, 1);   \
  |  | 1015|  1.02k|                                                                              \
  |  | 1016|  1.02k|      src_10 = _mm256_unpacklo_epi8(src_10, zero);                            \
  |  | 1017|  1.02k|      src_32 = _mm256_unpacklo_epi8(src_32, zero);                            \
  |  | 1018|  1.02k|                                                                              \
  |  | 1019|  1.02k|      src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT);                         \
  |  |  ------------------
  |  |  |  |  822|  1.02k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  1.02k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 1020|  1.02k|      src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT);                         \
  |  |  ------------------
  |  |  |  |  822|  1.02k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  1.02k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 1021|  1.02k|                                                                              \
  |  | 1022|  1.02k|      src_10 = _mm256_add_epi16(src_10, offset_const);                        \
  |  | 1023|  1.02k|      src_32 = _mm256_add_epi16(src_32, offset_const);                        \
  |  | 1024|  1.02k|                                                                              \
  |  | 1025|  1.02k|      const __m256i ref_10 =                                                  \
  |  | 1026|  1.02k|          load_line2_avx2(&dst[0 * dst_stride], &dst[1 * dst_stride]);        \
  |  | 1027|  1.02k|      const __m256i ref_32 =                                                  \
  |  | 1028|  1.02k|          load_line2_avx2(&dst[2 * dst_stride], &dst[3 * dst_stride]);        \
  |  | 1029|  1.02k|      __m256i res_10 = comp_avg(&ref_10, &src_10, &wt, USE_DIST_WEIGHTED);    \
  |  | 1030|  1.02k|      __m256i res_32 = comp_avg(&ref_32, &src_32, &wt, USE_DIST_WEIGHTED);    \
  |  | 1031|  1.02k|                                                                              \
  |  | 1032|  1.02k|      res_10 = convolve_rounding(&res_10, &offset_const, &rounding_const,     \
  |  | 1033|  1.02k|                                 rounding_shift);                             \
  |  | 1034|  1.02k|      res_32 = convolve_rounding(&res_32, &offset_const, &rounding_const,     \
  |  | 1035|  1.02k|                                 rounding_shift);                             \
  |  | 1036|  1.02k|                                                                              \
  |  | 1037|  1.02k|      __m256i res = _mm256_packus_epi16(res_10, res_32);                      \
  |  | 1038|  1.02k|      const __m128i res_20 = _mm256_castsi256_si128(res);                     \
  |  | 1039|  1.02k|      const __m128i res_31 = _mm256_extracti128_si256(res, 1);                \
  |  | 1040|  1.02k|                                                                              \
  |  | 1041|  1.02k|      _mm_storel_epi64((__m128i *)(&dst0[0 * dst_stride0]), res_20);          \
  |  | 1042|  1.02k|      _mm_storel_epi64((__m128i *)((&dst0[1 * dst_stride0])), res_31);        \
  |  | 1043|  1.02k|      _mm_storeh_epi64((__m128i *)(&dst0[2 * dst_stride0]), res_20);          \
  |  | 1044|  1.02k|      _mm_storeh_epi64((__m128i *)((&dst0[3 * dst_stride0])), res_31);        \
  |  | 1045|  1.02k|      i -= 4;                                                                 \
  |  | 1046|  1.02k|      src += 4 * src_stride;                                                  \
  |  | 1047|  1.02k|      dst += 4 * dst_stride;                                                  \
  |  | 1048|  1.02k|      dst0 += 4 * dst_stride0;                                                \
  |  | 1049|  1.02k|    } while (i);                                                              \
  |  |  ------------------
  |  |  |  Branch (1049:14): [True: 620, False: 408]
  |  |  ------------------
  |  | 1050|    408|  } else {                                                                    \
  |  | 1051|    260|    assert(w == 4);                                                           \
  |  | 1052|    464|    do {                                                                      \
  |  | 1053|    464|      __m256i src_3210_8bit =                                                 \
  |  | 1054|    464|          _mm256_setr_epi32(loadu_int32(src + 0 * src_stride),                \
  |  | 1055|    464|                            loadu_int32(src + 1 * src_stride), 0, 0,          \
  |  | 1056|    464|                            loadu_int32(src + 2 * src_stride),                \
  |  | 1057|    464|                            loadu_int32(src + 3 * src_stride), 0, 0);         \
  |  | 1058|    464|                                                                              \
  |  | 1059|    464|      __m256i src_3210 = _mm256_unpacklo_epi8(src_3210_8bit, zero);           \
  |  | 1060|    464|      src_3210 = _mm256_slli_epi16(src_3210, LEFT_SHIFT);                     \
  |  |  ------------------
  |  |  |  |  822|    464|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|    464|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 1061|    464|      src_3210 = _mm256_add_epi16(src_3210, offset_const);                    \
  |  | 1062|    464|                                                                              \
  |  | 1063|    464|      __m256i ref_3210 =                                                      \
  |  | 1064|    464|          _mm256_setr_epi64x(*(int64_t *)(dst + 0 * dst_stride),              \
  |  | 1065|    464|                             *(int64_t *)(dst + 1 * dst_stride),              \
  |  | 1066|    464|                             *(int64_t *)(dst + 2 * dst_stride),              \
  |  | 1067|    464|                             *(int64_t *)(dst + 3 * dst_stride));             \
  |  | 1068|    464|      __m256i res_3210 =                                                      \
  |  | 1069|    464|          comp_avg(&ref_3210, &src_3210, &wt, USE_DIST_WEIGHTED);             \
  |  | 1070|    464|                                                                              \
  |  | 1071|    464|      res_3210 = convolve_rounding(&res_3210, &offset_const, &rounding_const, \
  |  | 1072|    464|                                   rounding_shift);                           \
  |  | 1073|    464|                                                                              \
  |  | 1074|    464|      res_3210 = _mm256_packus_epi16(res_3210, res_3210);                     \
  |  | 1075|    464|      const __m128i res_10 = _mm256_castsi256_si128(res_3210);                \
  |  | 1076|    464|      const __m128i res_32 = _mm256_extracti128_si256(res_3210, 1);           \
  |  | 1077|    464|                                                                              \
  |  | 1078|    464|      *(int *)(&dst0[0 * dst_stride0]) = _mm_cvtsi128_si32(res_10);           \
  |  | 1079|    464|      *(int *)(&dst0[2 * dst_stride0]) = _mm_cvtsi128_si32(res_32);           \
  |  | 1080|    464|      *(int *)(&dst0[1 * dst_stride0]) = _mm_extract_epi32(res_10, 1);        \
  |  | 1081|    464|      *(int *)(&dst0[3 * dst_stride0]) = _mm_extract_epi32(res_32, 1);        \
  |  | 1082|    464|      i -= 4;                                                                 \
  |  | 1083|    464|      src += 4 * src_stride;                                                  \
  |  | 1084|    464|      dst += 4 * dst_stride;                                                  \
  |  | 1085|    464|      dst0 += 4 * dst_stride0;                                                \
  |  | 1086|    464|    } while (i);                                                              \
  |  |  ------------------
  |  |  |  Branch (1086:14): [True: 204, False: 260]
  |  |  ------------------
  |  | 1087|    260|  }
  ------------------
 1116|  4.45k|    } else {
 1117|  4.45k|      DO_AVG_2D_COPY(0)
  ------------------
  |  |  957|  4.45k|  int i = h;                                                                  \
  |  |  958|  4.45k|  if (w >= 16) {                                                              \
  |  |  ------------------
  |  |  |  Branch (958:7): [True: 1.92k, False: 2.53k]
  |  |  ------------------
  |  |  959|  1.92k|    __m256i src_0, src_1, src_2, src_3;                                       \
  |  |  960|  1.92k|    __m256i ref_0, ref_1, ref_2, ref_3;                                       \
  |  |  961|  1.92k|    __m256i res_0, res_1, res_2, res_3;                                       \
  |  |  962|  1.92k|    __m256i res_10, res_32;                                                   \
  |  |  963|  1.92k|    if (w == 128) {                                                           \
  |  |  ------------------
  |  |  |  Branch (963:9): [True: 9, False: 1.91k]
  |  |  ------------------
  |  |  964|  1.02k|      do {                                                                    \
  |  |  965|  1.02k|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48);    \
  |  |  ------------------
  |  |  |  |  903|  1.02k|  do {                                                                         \
  |  |  |  |  904|  1.02k|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  1.02k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  1.02k|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  1.02k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  1.02k|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  1.02k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  1.02k|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  1.02k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  1.02k|                                                                               \
  |  |  |  |  913|  1.02k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  1.02k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  1.02k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  1.02k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  1.02k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  1.02k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  1.02k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  1.02k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  1.02k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  1.02k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  1.02k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  1.02k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  1.02k|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  1.02k|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  1.02k|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  1.02k|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  1.02k|                                                                               \
  |  |  |  |  922|  1.02k|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  1.02k|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  1.02k|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  1.02k|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  1.02k|                                                                               \
  |  |  |  |  927|  1.02k|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  1.02k|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  1.02k|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  1.02k|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  1.02k|                                                                               \
  |  |  |  |  932|  1.02k|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  1.02k|                              rounding_shift);                                 \
  |  |  |  |  934|  1.02k|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  1.02k|                              rounding_shift);                                 \
  |  |  |  |  936|  1.02k|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  1.02k|                              rounding_shift);                                 \
  |  |  |  |  938|  1.02k|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  1.02k|                              rounding_shift);                                 \
  |  |  |  |  940|  1.02k|                                                                               \
  |  |  |  |  941|  1.02k|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  1.02k|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  1.02k|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  1.02k|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  1.02k|                                                                               \
  |  |  |  |  946|  1.02k|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  1.02k|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  1.02k|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  1.02k|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  1.02k|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  1.02k|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  1.02k|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  1.02k|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  1.02k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded, False: 1.02k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  966|  1.02k|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 64, 0, 80, 0, 96, 0, 112);  \
  |  |  ------------------
  |  |  |  |  903|  1.02k|  do {                                                                         \
  |  |  |  |  904|  1.02k|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  1.02k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  1.02k|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  1.02k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  1.02k|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  1.02k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  1.02k|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  1.02k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  1.02k|                                                                               \
  |  |  |  |  913|  1.02k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  1.02k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  1.02k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  1.02k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  1.02k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  1.02k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  1.02k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  1.02k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  1.02k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  1.02k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  1.02k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  1.02k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  1.02k|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  1.02k|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  1.02k|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  1.02k|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  1.02k|                                                                               \
  |  |  |  |  922|  1.02k|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  1.02k|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  1.02k|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  1.02k|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  1.02k|                                                                               \
  |  |  |  |  927|  1.02k|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  1.02k|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  1.02k|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  1.02k|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  1.02k|                                                                               \
  |  |  |  |  932|  1.02k|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  1.02k|                              rounding_shift);                                 \
  |  |  |  |  934|  1.02k|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  1.02k|                              rounding_shift);                                 \
  |  |  |  |  936|  1.02k|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  1.02k|                              rounding_shift);                                 \
  |  |  |  |  938|  1.02k|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  1.02k|                              rounding_shift);                                 \
  |  |  |  |  940|  1.02k|                                                                               \
  |  |  |  |  941|  1.02k|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  1.02k|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  1.02k|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  1.02k|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  1.02k|                                                                               \
  |  |  |  |  946|  1.02k|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  1.02k|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  1.02k|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  1.02k|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  1.02k|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  1.02k|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  1.02k|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  1.02k|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  1.02k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded, False: 1.02k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  967|  1.02k|        i -= 1;                                                               \
  |  |  968|  1.02k|        src += 1 * src_stride;                                                \
  |  |  969|  1.02k|        dst += 1 * dst_stride;                                                \
  |  |  970|  1.02k|        dst0 += 1 * dst_stride0;                                              \
  |  |  971|  1.02k|      } while (i);                                                            \
  |  |  ------------------
  |  |  |  Branch (971:16): [True: 1.01k, False: 9]
  |  |  ------------------
  |  |  972|  1.91k|    } else if (w == 64) {                                                     \
  |  |  ------------------
  |  |  |  Branch (972:16): [True: 187, False: 1.72k]
  |  |  ------------------
  |  |  973|  9.88k|      do {                                                                    \
  |  |  974|  9.88k|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48);    \
  |  |  ------------------
  |  |  |  |  903|  9.88k|  do {                                                                         \
  |  |  |  |  904|  9.88k|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  9.88k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  9.88k|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  9.88k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  9.88k|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  9.88k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  9.88k|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  9.88k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  9.88k|                                                                               \
  |  |  |  |  913|  9.88k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  9.88k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  9.88k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  9.88k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  9.88k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  9.88k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  9.88k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  9.88k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  9.88k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  9.88k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  9.88k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  9.88k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  9.88k|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  9.88k|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  9.88k|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  9.88k|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  9.88k|                                                                               \
  |  |  |  |  922|  9.88k|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  9.88k|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  9.88k|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  9.88k|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  9.88k|                                                                               \
  |  |  |  |  927|  9.88k|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  9.88k|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  9.88k|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  9.88k|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  9.88k|                                                                               \
  |  |  |  |  932|  9.88k|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  9.88k|                              rounding_shift);                                 \
  |  |  |  |  934|  9.88k|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  9.88k|                              rounding_shift);                                 \
  |  |  |  |  936|  9.88k|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  9.88k|                              rounding_shift);                                 \
  |  |  |  |  938|  9.88k|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  9.88k|                              rounding_shift);                                 \
  |  |  |  |  940|  9.88k|                                                                               \
  |  |  |  |  941|  9.88k|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  9.88k|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  9.88k|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  9.88k|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  9.88k|                                                                               \
  |  |  |  |  946|  9.88k|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  9.88k|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  9.88k|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  9.88k|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  9.88k|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  9.88k|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  9.88k|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  9.88k|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  9.88k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded, False: 9.88k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  975|  9.88k|                                                                              \
  |  |  976|  9.88k|        i -= 1;                                                               \
  |  |  977|  9.88k|        src += 1 * src_stride;                                                \
  |  |  978|  9.88k|        dst += 1 * dst_stride;                                                \
  |  |  979|  9.88k|        dst0 += 1 * dst_stride0;                                              \
  |  |  980|  9.88k|      } while (i);                                                            \
  |  |  ------------------
  |  |  |  Branch (980:16): [True: 9.70k, False: 187]
  |  |  ------------------
  |  |  981|  1.72k|    } else if (w == 32) {                                                     \
  |  |  ------------------
  |  |  |  Branch (981:16): [True: 586, False: 1.14k]
  |  |  ------------------
  |  |  982|  9.00k|      do {                                                                    \
  |  |  983|  9.00k|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 0, 16, 1, 16);     \
  |  |  ------------------
  |  |  |  |  903|  9.00k|  do {                                                                         \
  |  |  |  |  904|  9.00k|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  9.00k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  9.00k|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  9.00k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  9.00k|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  9.00k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  9.00k|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  9.00k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  9.00k|                                                                               \
  |  |  |  |  913|  9.00k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  9.00k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  9.00k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  9.00k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  9.00k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  9.00k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  9.00k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  9.00k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  9.00k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  9.00k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  9.00k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  9.00k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  9.00k|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  9.00k|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  9.00k|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  9.00k|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  9.00k|                                                                               \
  |  |  |  |  922|  9.00k|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  9.00k|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  9.00k|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  9.00k|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  9.00k|                                                                               \
  |  |  |  |  927|  9.00k|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  9.00k|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  9.00k|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  9.00k|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  9.00k|                                                                               \
  |  |  |  |  932|  9.00k|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  9.00k|                              rounding_shift);                                 \
  |  |  |  |  934|  9.00k|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  9.00k|                              rounding_shift);                                 \
  |  |  |  |  936|  9.00k|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  9.00k|                              rounding_shift);                                 \
  |  |  |  |  938|  9.00k|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  9.00k|                              rounding_shift);                                 \
  |  |  |  |  940|  9.00k|                                                                               \
  |  |  |  |  941|  9.00k|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  9.00k|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  9.00k|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  9.00k|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  9.00k|                                                                               \
  |  |  |  |  946|  9.00k|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  9.00k|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  9.00k|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  9.00k|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  9.00k|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  9.00k|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  9.00k|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  9.00k|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  9.00k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded, False: 9.00k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  984|  9.00k|                                                                              \
  |  |  985|  9.00k|        i -= 2;                                                               \
  |  |  986|  9.00k|        src += 2 * src_stride;                                                \
  |  |  987|  9.00k|        dst += 2 * dst_stride;                                                \
  |  |  988|  9.00k|        dst0 += 2 * dst_stride0;                                              \
  |  |  989|  9.00k|      } while (i);                                                            \
  |  |  ------------------
  |  |  |  Branch (989:16): [True: 8.42k, False: 586]
  |  |  ------------------
  |  |  990|  1.14k|    } else {                                                                  \
  |  |  991|  1.14k|      assert(w == 16);                                                        \
  |  |  992|  4.65k|      do {                                                                    \
  |  |  993|  4.65k|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 2, 0, 3, 0);       \
  |  |  ------------------
  |  |  |  |  903|  4.65k|  do {                                                                         \
  |  |  |  |  904|  4.65k|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  4.65k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  4.65k|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  4.65k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  4.65k|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  4.65k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  4.65k|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  4.65k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  4.65k|                                                                               \
  |  |  |  |  913|  4.65k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  4.65k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  4.65k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  4.65k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  4.65k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  4.65k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  4.65k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  4.65k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  4.65k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  4.65k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  4.65k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  4.65k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  4.65k|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  4.65k|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  4.65k|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  4.65k|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  4.65k|                                                                               \
  |  |  |  |  922|  4.65k|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  4.65k|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  4.65k|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  4.65k|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  4.65k|                                                                               \
  |  |  |  |  927|  4.65k|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  4.65k|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  4.65k|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  4.65k|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  4.65k|                                                                               \
  |  |  |  |  932|  4.65k|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  4.65k|                              rounding_shift);                                 \
  |  |  |  |  934|  4.65k|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  4.65k|                              rounding_shift);                                 \
  |  |  |  |  936|  4.65k|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  4.65k|                              rounding_shift);                                 \
  |  |  |  |  938|  4.65k|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  4.65k|                              rounding_shift);                                 \
  |  |  |  |  940|  4.65k|                                                                               \
  |  |  |  |  941|  4.65k|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  4.65k|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  4.65k|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  4.65k|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  4.65k|                                                                               \
  |  |  |  |  946|  4.65k|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  4.65k|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  4.65k|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  4.65k|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  4.65k|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  4.65k|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  4.65k|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  4.65k|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  4.65k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded, False: 4.65k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  994|  4.65k|                                                                              \
  |  |  995|  4.65k|        i -= 4;                                                               \
  |  |  996|  4.65k|        src += 4 * src_stride;                                                \
  |  |  997|  4.65k|        dst += 4 * dst_stride;                                                \
  |  |  998|  4.65k|        dst0 += 4 * dst_stride0;                                              \
  |  |  999|  4.65k|      } while (i);                                                            \
  |  |  ------------------
  |  |  |  Branch (999:16): [True: 3.51k, False: 1.14k]
  |  |  ------------------
  |  | 1000|  1.14k|    }                                                                         \
  |  | 1001|  2.53k|  } else if (w == 8) {                                                        \
  |  |  ------------------
  |  |  |  Branch (1001:14): [True: 1.64k, False: 892]
  |  |  ------------------
  |  | 1002|  4.19k|    do {                                                                      \
  |  | 1003|  4.19k|      const __m128i src_0 =                                                   \
  |  | 1004|  4.19k|          _mm_loadl_epi64((__m128i *)(&src[0 * src_stride]));                 \
  |  | 1005|  4.19k|      const __m128i src_1 =                                                   \
  |  | 1006|  4.19k|          _mm_loadl_epi64((__m128i *)(&src[1 * src_stride]));                 \
  |  | 1007|  4.19k|      const __m128i src_2 =                                                   \
  |  | 1008|  4.19k|          _mm_loadl_epi64((__m128i *)(&src[2 * src_stride]));                 \
  |  | 1009|  4.19k|      const __m128i src_3 =                                                   \
  |  | 1010|  4.19k|          _mm_loadl_epi64((__m128i *)(&src[3 * src_stride]));                 \
  |  | 1011|  4.19k|      __m256i src_10 =                                                        \
  |  | 1012|  4.19k|          _mm256_insertf128_si256(_mm256_castsi128_si256(src_0), src_1, 1);   \
  |  | 1013|  4.19k|      __m256i src_32 =                                                        \
  |  | 1014|  4.19k|          _mm256_insertf128_si256(_mm256_castsi128_si256(src_2), src_3, 1);   \
  |  | 1015|  4.19k|                                                                              \
  |  | 1016|  4.19k|      src_10 = _mm256_unpacklo_epi8(src_10, zero);                            \
  |  | 1017|  4.19k|      src_32 = _mm256_unpacklo_epi8(src_32, zero);                            \
  |  | 1018|  4.19k|                                                                              \
  |  | 1019|  4.19k|      src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT);                         \
  |  |  ------------------
  |  |  |  |  822|  4.19k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  4.19k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 1020|  4.19k|      src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT);                         \
  |  |  ------------------
  |  |  |  |  822|  4.19k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  4.19k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 1021|  4.19k|                                                                              \
  |  | 1022|  4.19k|      src_10 = _mm256_add_epi16(src_10, offset_const);                        \
  |  | 1023|  4.19k|      src_32 = _mm256_add_epi16(src_32, offset_const);                        \
  |  | 1024|  4.19k|                                                                              \
  |  | 1025|  4.19k|      const __m256i ref_10 =                                                  \
  |  | 1026|  4.19k|          load_line2_avx2(&dst[0 * dst_stride], &dst[1 * dst_stride]);        \
  |  | 1027|  4.19k|      const __m256i ref_32 =                                                  \
  |  | 1028|  4.19k|          load_line2_avx2(&dst[2 * dst_stride], &dst[3 * dst_stride]);        \
  |  | 1029|  4.19k|      __m256i res_10 = comp_avg(&ref_10, &src_10, &wt, USE_DIST_WEIGHTED);    \
  |  | 1030|  4.19k|      __m256i res_32 = comp_avg(&ref_32, &src_32, &wt, USE_DIST_WEIGHTED);    \
  |  | 1031|  4.19k|                                                                              \
  |  | 1032|  4.19k|      res_10 = convolve_rounding(&res_10, &offset_const, &rounding_const,     \
  |  | 1033|  4.19k|                                 rounding_shift);                             \
  |  | 1034|  4.19k|      res_32 = convolve_rounding(&res_32, &offset_const, &rounding_const,     \
  |  | 1035|  4.19k|                                 rounding_shift);                             \
  |  | 1036|  4.19k|                                                                              \
  |  | 1037|  4.19k|      __m256i res = _mm256_packus_epi16(res_10, res_32);                      \
  |  | 1038|  4.19k|      const __m128i res_20 = _mm256_castsi256_si128(res);                     \
  |  | 1039|  4.19k|      const __m128i res_31 = _mm256_extracti128_si256(res, 1);                \
  |  | 1040|  4.19k|                                                                              \
  |  | 1041|  4.19k|      _mm_storel_epi64((__m128i *)(&dst0[0 * dst_stride0]), res_20);          \
  |  | 1042|  4.19k|      _mm_storel_epi64((__m128i *)((&dst0[1 * dst_stride0])), res_31);        \
  |  | 1043|  4.19k|      _mm_storeh_epi64((__m128i *)(&dst0[2 * dst_stride0]), res_20);          \
  |  | 1044|  4.19k|      _mm_storeh_epi64((__m128i *)((&dst0[3 * dst_stride0])), res_31);        \
  |  | 1045|  4.19k|      i -= 4;                                                                 \
  |  | 1046|  4.19k|      src += 4 * src_stride;                                                  \
  |  | 1047|  4.19k|      dst += 4 * dst_stride;                                                  \
  |  | 1048|  4.19k|      dst0 += 4 * dst_stride0;                                                \
  |  | 1049|  4.19k|    } while (i);                                                              \
  |  |  ------------------
  |  |  |  Branch (1049:14): [True: 2.55k, False: 1.64k]
  |  |  ------------------
  |  | 1050|  1.64k|  } else {                                                                    \
  |  | 1051|    892|    assert(w == 4);                                                           \
  |  | 1052|  1.44k|    do {                                                                      \
  |  | 1053|  1.44k|      __m256i src_3210_8bit =                                                 \
  |  | 1054|  1.44k|          _mm256_setr_epi32(loadu_int32(src + 0 * src_stride),                \
  |  | 1055|  1.44k|                            loadu_int32(src + 1 * src_stride), 0, 0,          \
  |  | 1056|  1.44k|                            loadu_int32(src + 2 * src_stride),                \
  |  | 1057|  1.44k|                            loadu_int32(src + 3 * src_stride), 0, 0);         \
  |  | 1058|  1.44k|                                                                              \
  |  | 1059|  1.44k|      __m256i src_3210 = _mm256_unpacklo_epi8(src_3210_8bit, zero);           \
  |  | 1060|  1.44k|      src_3210 = _mm256_slli_epi16(src_3210, LEFT_SHIFT);                     \
  |  |  ------------------
  |  |  |  |  822|  1.44k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  1.44k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 1061|  1.44k|      src_3210 = _mm256_add_epi16(src_3210, offset_const);                    \
  |  | 1062|  1.44k|                                                                              \
  |  | 1063|  1.44k|      __m256i ref_3210 =                                                      \
  |  | 1064|  1.44k|          _mm256_setr_epi64x(*(int64_t *)(dst + 0 * dst_stride),              \
  |  | 1065|  1.44k|                             *(int64_t *)(dst + 1 * dst_stride),              \
  |  | 1066|  1.44k|                             *(int64_t *)(dst + 2 * dst_stride),              \
  |  | 1067|  1.44k|                             *(int64_t *)(dst + 3 * dst_stride));             \
  |  | 1068|  1.44k|      __m256i res_3210 =                                                      \
  |  | 1069|  1.44k|          comp_avg(&ref_3210, &src_3210, &wt, USE_DIST_WEIGHTED);             \
  |  | 1070|  1.44k|                                                                              \
  |  | 1071|  1.44k|      res_3210 = convolve_rounding(&res_3210, &offset_const, &rounding_const, \
  |  | 1072|  1.44k|                                   rounding_shift);                           \
  |  | 1073|  1.44k|                                                                              \
  |  | 1074|  1.44k|      res_3210 = _mm256_packus_epi16(res_3210, res_3210);                     \
  |  | 1075|  1.44k|      const __m128i res_10 = _mm256_castsi256_si128(res_3210);                \
  |  | 1076|  1.44k|      const __m128i res_32 = _mm256_extracti128_si256(res_3210, 1);           \
  |  | 1077|  1.44k|                                                                              \
  |  | 1078|  1.44k|      *(int *)(&dst0[0 * dst_stride0]) = _mm_cvtsi128_si32(res_10);           \
  |  | 1079|  1.44k|      *(int *)(&dst0[2 * dst_stride0]) = _mm_cvtsi128_si32(res_32);           \
  |  | 1080|  1.44k|      *(int *)(&dst0[1 * dst_stride0]) = _mm_extract_epi32(res_10, 1);        \
  |  | 1081|  1.44k|      *(int *)(&dst0[3 * dst_stride0]) = _mm_extract_epi32(res_32, 1);        \
  |  | 1082|  1.44k|      i -= 4;                                                                 \
  |  | 1083|  1.44k|      src += 4 * src_stride;                                                  \
  |  | 1084|  1.44k|      dst += 4 * dst_stride;                                                  \
  |  | 1085|  1.44k|      dst0 += 4 * dst_stride0;                                                \
  |  | 1086|  1.44k|    } while (i);                                                              \
  |  |  ------------------
  |  |  |  Branch (1086:14): [True: 552, False: 892]
  |  |  ------------------
  |  | 1087|    892|  }
  ------------------
 1118|  4.45k|    }
 1119|  8.12k|  } else {
 1120|  8.12k|    av1_dist_wtd_convolve_2d_no_avg_copy_avx2(src, src_stride, dst, dst_stride,
 1121|  8.12k|                                              w, h, offset_const);
 1122|  8.12k|  }
 1123|  13.6k|}
jnt_convolve_avx2.c:unpack_weights_avx2:
   27|  30.2k|static inline __m256i unpack_weights_avx2(ConvolveParams *conv_params) {
   28|  30.2k|  const int w0 = conv_params->fwd_offset;
   29|  30.2k|  const int w1 = conv_params->bck_offset;
   30|  30.2k|  const __m256i wt0 = _mm256_set1_epi16((int16_t)w0);
   31|  30.2k|  const __m256i wt1 = _mm256_set1_epi16((int16_t)w1);
   32|  30.2k|  const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
   33|  30.2k|  return wt;
   34|  30.2k|}
jnt_convolve_avx2.c:load_line2_avx2:
   36|   158k|static inline __m256i load_line2_avx2(const void *a, const void *b) {
   37|       |  return _mm256_permute2x128_si256(
   38|   158k|      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)),
   39|   158k|      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20);
   40|   158k|}
jnt_convolve_avx2.c:av1_dist_wtd_convolve_2d_no_avg_copy_avx2:
  825|  8.12k|    int w, int h, const __m256i offset_const) {
  826|  8.12k|  int i = h;
  827|  8.12k|  if (w >= 16) {
  ------------------
  |  Branch (827:7): [True: 3.43k, False: 4.68k]
  ------------------
  828|  3.43k|    __m256i src_0, src_1, src_2, src_3;
  829|  3.43k|    if (w == 128) {
  ------------------
  |  Branch (829:9): [True: 23, False: 3.41k]
  ------------------
  830|  2.56k|      do {
  831|  2.56k|        DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48);
  ------------------
  |  |  796|  2.56k|  do {                                                                  \
  |  |  797|  2.56k|    src_0 = _mm256_cvtepu8_epi16(                                       \
  |  |  798|  2.56k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));      \
  |  |  799|  2.56k|    src_1 = _mm256_cvtepu8_epi16(                                       \
  |  |  800|  2.56k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));      \
  |  |  801|  2.56k|    src_2 = _mm256_cvtepu8_epi16(                                       \
  |  |  802|  2.56k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));      \
  |  |  803|  2.56k|    src_3 = _mm256_cvtepu8_epi16(                                       \
  |  |  804|  2.56k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));      \
  |  |  805|  2.56k|                                                                        \
  |  |  806|  2.56k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  2.56k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  2.56k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  807|  2.56k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  2.56k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  2.56k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  808|  2.56k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  2.56k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  2.56k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  809|  2.56k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  2.56k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  2.56k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  810|  2.56k|                                                                        \
  |  |  811|  2.56k|    src_0 = _mm256_add_epi16(src_0, offset_const);                      \
  |  |  812|  2.56k|    src_1 = _mm256_add_epi16(src_1, offset_const);                      \
  |  |  813|  2.56k|    src_2 = _mm256_add_epi16(src_2, offset_const);                      \
  |  |  814|  2.56k|    src_3 = _mm256_add_epi16(src_3, offset_const);                      \
  |  |  815|  2.56k|                                                                        \
  |  |  816|  2.56k|    _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \
  |  |  817|  2.56k|    _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \
  |  |  818|  2.56k|    _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \
  |  |  819|  2.56k|    _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \
  |  |  820|  2.56k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (820:12): [Folded, False: 2.56k]
  |  |  ------------------
  ------------------
  832|  2.56k|        DO_NO_AVG_2D_COPY_4X16(0, 64, 0, 80, 0, 96, 0, 112);
  ------------------
  |  |  796|  2.56k|  do {                                                                  \
  |  |  797|  2.56k|    src_0 = _mm256_cvtepu8_epi16(                                       \
  |  |  798|  2.56k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));      \
  |  |  799|  2.56k|    src_1 = _mm256_cvtepu8_epi16(                                       \
  |  |  800|  2.56k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));      \
  |  |  801|  2.56k|    src_2 = _mm256_cvtepu8_epi16(                                       \
  |  |  802|  2.56k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));      \
  |  |  803|  2.56k|    src_3 = _mm256_cvtepu8_epi16(                                       \
  |  |  804|  2.56k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));      \
  |  |  805|  2.56k|                                                                        \
  |  |  806|  2.56k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  2.56k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  2.56k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  807|  2.56k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  2.56k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  2.56k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  808|  2.56k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  2.56k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  2.56k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  809|  2.56k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  2.56k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  2.56k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  810|  2.56k|                                                                        \
  |  |  811|  2.56k|    src_0 = _mm256_add_epi16(src_0, offset_const);                      \
  |  |  812|  2.56k|    src_1 = _mm256_add_epi16(src_1, offset_const);                      \
  |  |  813|  2.56k|    src_2 = _mm256_add_epi16(src_2, offset_const);                      \
  |  |  814|  2.56k|    src_3 = _mm256_add_epi16(src_3, offset_const);                      \
  |  |  815|  2.56k|                                                                        \
  |  |  816|  2.56k|    _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \
  |  |  817|  2.56k|    _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \
  |  |  818|  2.56k|    _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \
  |  |  819|  2.56k|    _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \
  |  |  820|  2.56k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (820:12): [Folded, False: 2.56k]
  |  |  ------------------
  ------------------
  833|  2.56k|        src += 1 * src_stride;
  834|  2.56k|        dst += 1 * dst_stride;
  835|  2.56k|        i -= 1;
  836|  2.56k|      } while (i);
  ------------------
  |  Branch (836:16): [True: 2.53k, False: 23]
  ------------------
  837|  3.41k|    } else if (w == 64) {
  ------------------
  |  Branch (837:16): [True: 287, False: 3.12k]
  ------------------
  838|  15.5k|      do {
  839|  15.5k|        DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48);
  ------------------
  |  |  796|  15.5k|  do {                                                                  \
  |  |  797|  15.5k|    src_0 = _mm256_cvtepu8_epi16(                                       \
  |  |  798|  15.5k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));      \
  |  |  799|  15.5k|    src_1 = _mm256_cvtepu8_epi16(                                       \
  |  |  800|  15.5k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));      \
  |  |  801|  15.5k|    src_2 = _mm256_cvtepu8_epi16(                                       \
  |  |  802|  15.5k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));      \
  |  |  803|  15.5k|    src_3 = _mm256_cvtepu8_epi16(                                       \
  |  |  804|  15.5k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));      \
  |  |  805|  15.5k|                                                                        \
  |  |  806|  15.5k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  15.5k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  15.5k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  807|  15.5k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  15.5k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  15.5k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  808|  15.5k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  15.5k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  15.5k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  809|  15.5k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  15.5k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  15.5k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  810|  15.5k|                                                                        \
  |  |  811|  15.5k|    src_0 = _mm256_add_epi16(src_0, offset_const);                      \
  |  |  812|  15.5k|    src_1 = _mm256_add_epi16(src_1, offset_const);                      \
  |  |  813|  15.5k|    src_2 = _mm256_add_epi16(src_2, offset_const);                      \
  |  |  814|  15.5k|    src_3 = _mm256_add_epi16(src_3, offset_const);                      \
  |  |  815|  15.5k|                                                                        \
  |  |  816|  15.5k|    _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \
  |  |  817|  15.5k|    _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \
  |  |  818|  15.5k|    _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \
  |  |  819|  15.5k|    _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \
  |  |  820|  15.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (820:12): [Folded, False: 15.5k]
  |  |  ------------------
  ------------------
  840|  15.5k|        src += 1 * src_stride;
  841|  15.5k|        dst += 1 * dst_stride;
  842|  15.5k|        i -= 1;
  843|  15.5k|      } while (i);
  ------------------
  |  Branch (843:16): [True: 15.2k, False: 287]
  ------------------
  844|  3.12k|    } else if (w == 32) {
  ------------------
  |  Branch (844:16): [True: 963, False: 2.16k]
  ------------------
  845|  14.7k|      do {
  846|  14.7k|        DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 0, 16, 1, 16);
  ------------------
  |  |  796|  14.7k|  do {                                                                  \
  |  |  797|  14.7k|    src_0 = _mm256_cvtepu8_epi16(                                       \
  |  |  798|  14.7k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));      \
  |  |  799|  14.7k|    src_1 = _mm256_cvtepu8_epi16(                                       \
  |  |  800|  14.7k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));      \
  |  |  801|  14.7k|    src_2 = _mm256_cvtepu8_epi16(                                       \
  |  |  802|  14.7k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));      \
  |  |  803|  14.7k|    src_3 = _mm256_cvtepu8_epi16(                                       \
  |  |  804|  14.7k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));      \
  |  |  805|  14.7k|                                                                        \
  |  |  806|  14.7k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  14.7k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  14.7k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  807|  14.7k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  14.7k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  14.7k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  808|  14.7k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  14.7k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  14.7k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  809|  14.7k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  14.7k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  14.7k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  810|  14.7k|                                                                        \
  |  |  811|  14.7k|    src_0 = _mm256_add_epi16(src_0, offset_const);                      \
  |  |  812|  14.7k|    src_1 = _mm256_add_epi16(src_1, offset_const);                      \
  |  |  813|  14.7k|    src_2 = _mm256_add_epi16(src_2, offset_const);                      \
  |  |  814|  14.7k|    src_3 = _mm256_add_epi16(src_3, offset_const);                      \
  |  |  815|  14.7k|                                                                        \
  |  |  816|  14.7k|    _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \
  |  |  817|  14.7k|    _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \
  |  |  818|  14.7k|    _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \
  |  |  819|  14.7k|    _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \
  |  |  820|  14.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (820:12): [Folded, False: 14.7k]
  |  |  ------------------
  ------------------
  847|  14.7k|        src += 2 * src_stride;
  848|  14.7k|        dst += 2 * dst_stride;
  849|  14.7k|        i -= 2;
  850|  14.7k|      } while (i);
  ------------------
  |  Branch (850:16): [True: 13.7k, False: 963]
  ------------------
  851|  2.16k|    } else if (w == 16) {
  ------------------
  |  Branch (851:16): [True: 2.16k, False: 0]
  ------------------
  852|  8.42k|      do {
  853|  8.42k|        DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 2, 0, 3, 0);
  ------------------
  |  |  796|  8.42k|  do {                                                                  \
  |  |  797|  8.42k|    src_0 = _mm256_cvtepu8_epi16(                                       \
  |  |  798|  8.42k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));      \
  |  |  799|  8.42k|    src_1 = _mm256_cvtepu8_epi16(                                       \
  |  |  800|  8.42k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));      \
  |  |  801|  8.42k|    src_2 = _mm256_cvtepu8_epi16(                                       \
  |  |  802|  8.42k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));      \
  |  |  803|  8.42k|    src_3 = _mm256_cvtepu8_epi16(                                       \
  |  |  804|  8.42k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));      \
  |  |  805|  8.42k|                                                                        \
  |  |  806|  8.42k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  8.42k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  8.42k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  807|  8.42k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  8.42k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  8.42k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  808|  8.42k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  8.42k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  8.42k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  809|  8.42k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  8.42k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  8.42k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  810|  8.42k|                                                                        \
  |  |  811|  8.42k|    src_0 = _mm256_add_epi16(src_0, offset_const);                      \
  |  |  812|  8.42k|    src_1 = _mm256_add_epi16(src_1, offset_const);                      \
  |  |  813|  8.42k|    src_2 = _mm256_add_epi16(src_2, offset_const);                      \
  |  |  814|  8.42k|    src_3 = _mm256_add_epi16(src_3, offset_const);                      \
  |  |  815|  8.42k|                                                                        \
  |  |  816|  8.42k|    _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \
  |  |  817|  8.42k|    _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \
  |  |  818|  8.42k|    _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \
  |  |  819|  8.42k|    _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \
  |  |  820|  8.42k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (820:12): [Folded, False: 8.42k]
  |  |  ------------------
  ------------------
  854|  8.42k|        src += 4 * src_stride;
  855|  8.42k|        dst += 4 * dst_stride;
  856|  8.42k|        i -= 4;
  857|  8.42k|      } while (i);
  ------------------
  |  Branch (857:16): [True: 6.26k, False: 2.16k]
  ------------------
  858|  2.16k|    }
  859|  4.68k|  } else {
  860|  4.68k|    const __m256i zero = _mm256_setzero_si256();
  861|  10.2k|    do {
  862|  10.2k|      const __m128i src_row_0 =
  863|  10.2k|          _mm_loadl_epi64((__m128i *)(&src[0 * src_stride]));
  864|  10.2k|      const __m128i src_row_1 =
  865|  10.2k|          _mm_loadl_epi64((__m128i *)(&src[1 * src_stride]));
  866|  10.2k|      const __m128i src_row_2 =
  867|  10.2k|          _mm_loadl_epi64((__m128i *)(&src[2 * src_stride]));
  868|  10.2k|      const __m128i src_row_3 =
  869|  10.2k|          _mm_loadl_epi64((__m128i *)(&src[3 * src_stride]));
  870|       |
  871|  10.2k|      __m256i src_10 = _mm256_insertf128_si256(
  872|  10.2k|          _mm256_castsi128_si256(src_row_0), src_row_1, 1);
  873|  10.2k|      __m256i src_32 = _mm256_insertf128_si256(
  874|  10.2k|          _mm256_castsi128_si256(src_row_2), src_row_3, 1);
  875|       |
  876|  10.2k|      src_10 = _mm256_unpacklo_epi8(src_10, zero);
  877|  10.2k|      src_32 = _mm256_unpacklo_epi8(src_32, zero);
  878|       |
  879|  10.2k|      src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT);
  ------------------
  |  |  822|  10.2k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  ------------------
  |  |  |  |   21|  10.2k|#define FILTER_BITS 7
  |  |  ------------------
  ------------------
  880|  10.2k|      src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT);
  ------------------
  |  |  822|  10.2k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  ------------------
  |  |  |  |   21|  10.2k|#define FILTER_BITS 7
  |  |  ------------------
  ------------------
  881|       |
  882|  10.2k|      src_10 = _mm256_add_epi16(src_10, offset_const);
  883|  10.2k|      src_32 = _mm256_add_epi16(src_32, offset_const);
  884|       |
  885|       |      // Accumulate values into the destination buffer
  886|  10.2k|      _mm_store_si128((__m128i *)(&dst[0 * dst_stride]),
  887|  10.2k|                      _mm256_castsi256_si128(src_10));
  888|  10.2k|      _mm_store_si128((__m128i *)(&dst[1 * dst_stride]),
  889|  10.2k|                      _mm256_extracti128_si256(src_10, 1));
  890|  10.2k|      _mm_store_si128((__m128i *)(&dst[2 * dst_stride]),
  891|  10.2k|                      _mm256_castsi256_si128(src_32));
  892|  10.2k|      _mm_store_si128((__m128i *)(&dst[3 * dst_stride]),
  893|  10.2k|                      _mm256_extracti128_si256(src_32, 1));
  894|       |
  895|  10.2k|      src += 4 * src_stride;
  896|  10.2k|      dst += 4 * dst_stride;
  897|  10.2k|      i -= 4;
  898|  10.2k|    } while (i);
  ------------------
  |  Branch (898:14): [True: 5.54k, False: 4.68k]
  ------------------
  899|  4.68k|  }
  900|  8.12k|}

av1_build_compound_diffwtd_mask_d16_avx2:
  498|    604|    ConvolveParams *conv_params, int bd) {
  499|    604|  const int shift =
  500|    604|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
  ------------------
  |  |   21|    604|#define FILTER_BITS 7
  ------------------
  501|       |  // When rounding constant is added, there is a possibility of overflow.
  502|       |  // However that much precision is not required. Code should very well work for
  503|       |  // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But
  504|       |  // there is a possibility of corner case bugs.
  505|    604|  assert(DIFF_FACTOR_LOG2 == 4);
  506|    604|  assert(AOM_BLEND_A64_MAX_ALPHA == 64);
  507|       |
  508|    604|  if (mask_type == DIFFWTD_38) {
  ------------------
  |  Branch (508:7): [True: 323, False: 281]
  ------------------
  509|    323|    build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1,
  510|    323|                                         src1_stride, h, w, shift);
  511|    323|  } else {
  512|    281|    build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1,
  513|    281|                                             src1_stride, h, w, shift);
  514|    281|  }
  515|    604|}
reconinter_avx2.c:build_compound_diffwtd_mask_d16_avx2:
  174|    323|    const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
  175|    323|  const int mask_base = 38;
  176|    323|  const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
  177|    323|  const __m256i y38 = _mm256_set1_epi16(mask_base);
  178|    323|  const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|    323|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|    323|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  179|    323|  int i = 0;
  180|    323|  if (w == 4) {
  ------------------
  |  Branch (180:7): [True: 0, False: 323]
  ------------------
  181|      0|    do {
  182|      0|      const __m128i s0A = xx_loadl_64(src0);
  183|      0|      const __m128i s0B = xx_loadl_64(src0 + src0_stride);
  184|      0|      const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
  185|      0|      const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
  186|      0|      const __m128i s1A = xx_loadl_64(src1);
  187|      0|      const __m128i s1B = xx_loadl_64(src1 + src1_stride);
  188|      0|      const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
  189|      0|      const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
  190|      0|      const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
  191|      0|                                      _mm_unpacklo_epi64(s0A, s0B));
  192|      0|      const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
  193|      0|                                      _mm_unpacklo_epi64(s1A, s1B));
  194|      0|      const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift);
  195|      0|      const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
  196|      0|      xx_storeu_128(mask,
  197|      0|                    _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
  198|      0|      src0 += src0_stride << 2;
  199|      0|      src1 += src1_stride << 2;
  200|      0|      mask += 16;
  201|      0|      i += 4;
  202|      0|    } while (i < h);
  ------------------
  |  Branch (202:14): [True: 0, False: 0]
  ------------------
  203|    323|  } else if (w == 8) {
  ------------------
  |  Branch (203:14): [True: 84, False: 239]
  ------------------
  204|    346|    do {
  205|    346|      const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
  206|    346|      const __m256i s0CD =
  207|    346|          yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
  208|    346|      const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
  209|    346|      const __m256i s1CD =
  210|    346|          yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
  211|    346|      const __m256i m16AB =
  212|    346|          calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
  213|    346|      const __m256i m16CD =
  214|    346|          calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
  215|    346|      const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
  216|    346|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
  217|    346|      src0 += src0_stride << 2;
  218|    346|      src1 += src1_stride << 2;
  219|    346|      mask += 32;
  220|    346|      i += 4;
  221|    346|    } while (i < h);
  ------------------
  |  Branch (221:14): [True: 262, False: 84]
  ------------------
  222|    239|  } else if (w == 16) {
  ------------------
  |  Branch (222:14): [True: 126, False: 113]
  ------------------
  223|  1.19k|    do {
  224|  1.19k|      const __m256i s0A = yy_loadu_256(src0);
  225|  1.19k|      const __m256i s0B = yy_loadu_256(src0 + src0_stride);
  226|  1.19k|      const __m256i s1A = yy_loadu_256(src1);
  227|  1.19k|      const __m256i s1B = yy_loadu_256(src1 + src1_stride);
  228|  1.19k|      const __m256i m16A =
  229|  1.19k|          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
  230|  1.19k|      const __m256i m16B =
  231|  1.19k|          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
  232|  1.19k|      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
  233|  1.19k|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
  234|  1.19k|      src0 += src0_stride << 1;
  235|  1.19k|      src1 += src1_stride << 1;
  236|  1.19k|      mask += 32;
  237|  1.19k|      i += 2;
  238|  1.19k|    } while (i < h);
  ------------------
  |  Branch (238:14): [True: 1.06k, False: 126]
  ------------------
  239|    126|  } else if (w == 32) {
  ------------------
  |  Branch (239:14): [True: 71, False: 42]
  ------------------
  240|  2.30k|    do {
  241|  2.30k|      const __m256i s0A = yy_loadu_256(src0);
  242|  2.30k|      const __m256i s0B = yy_loadu_256(src0 + 16);
  243|  2.30k|      const __m256i s1A = yy_loadu_256(src1);
  244|  2.30k|      const __m256i s1B = yy_loadu_256(src1 + 16);
  245|  2.30k|      const __m256i m16A =
  246|  2.30k|          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
  247|  2.30k|      const __m256i m16B =
  248|  2.30k|          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
  249|  2.30k|      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
  250|  2.30k|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
  251|  2.30k|      src0 += src0_stride;
  252|  2.30k|      src1 += src1_stride;
  253|  2.30k|      mask += 32;
  254|  2.30k|      i += 1;
  255|  2.30k|    } while (i < h);
  ------------------
  |  Branch (255:14): [True: 2.23k, False: 71]
  ------------------
  256|     71|  } else if (w == 64) {
  ------------------
  |  Branch (256:14): [True: 40, False: 2]
  ------------------
  257|  2.08k|    do {
  258|  2.08k|      const __m256i s0A = yy_loadu_256(src0);
  259|  2.08k|      const __m256i s0B = yy_loadu_256(src0 + 16);
  260|  2.08k|      const __m256i s0C = yy_loadu_256(src0 + 32);
  261|  2.08k|      const __m256i s0D = yy_loadu_256(src0 + 48);
  262|  2.08k|      const __m256i s1A = yy_loadu_256(src1);
  263|  2.08k|      const __m256i s1B = yy_loadu_256(src1 + 16);
  264|  2.08k|      const __m256i s1C = yy_loadu_256(src1 + 32);
  265|  2.08k|      const __m256i s1D = yy_loadu_256(src1 + 48);
  266|  2.08k|      const __m256i m16A =
  267|  2.08k|          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
  268|  2.08k|      const __m256i m16B =
  269|  2.08k|          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
  270|  2.08k|      const __m256i m16C =
  271|  2.08k|          calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
  272|  2.08k|      const __m256i m16D =
  273|  2.08k|          calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
  274|  2.08k|      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
  275|  2.08k|      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
  276|  2.08k|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
  277|  2.08k|      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
  278|  2.08k|      src0 += src0_stride;
  279|  2.08k|      src1 += src1_stride;
  280|  2.08k|      mask += 64;
  281|  2.08k|      i += 1;
  282|  2.08k|    } while (i < h);
  ------------------
  |  Branch (282:14): [True: 2.04k, False: 40]
  ------------------
  283|     40|  } else {
  284|    128|    do {
  285|    128|      const __m256i s0A = yy_loadu_256(src0);
  286|    128|      const __m256i s0B = yy_loadu_256(src0 + 16);
  287|    128|      const __m256i s0C = yy_loadu_256(src0 + 32);
  288|    128|      const __m256i s0D = yy_loadu_256(src0 + 48);
  289|    128|      const __m256i s0E = yy_loadu_256(src0 + 64);
  290|    128|      const __m256i s0F = yy_loadu_256(src0 + 80);
  291|    128|      const __m256i s0G = yy_loadu_256(src0 + 96);
  292|    128|      const __m256i s0H = yy_loadu_256(src0 + 112);
  293|    128|      const __m256i s1A = yy_loadu_256(src1);
  294|    128|      const __m256i s1B = yy_loadu_256(src1 + 16);
  295|    128|      const __m256i s1C = yy_loadu_256(src1 + 32);
  296|    128|      const __m256i s1D = yy_loadu_256(src1 + 48);
  297|    128|      const __m256i s1E = yy_loadu_256(src1 + 64);
  298|    128|      const __m256i s1F = yy_loadu_256(src1 + 80);
  299|    128|      const __m256i s1G = yy_loadu_256(src1 + 96);
  300|    128|      const __m256i s1H = yy_loadu_256(src1 + 112);
  301|    128|      const __m256i m16A =
  302|    128|          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
  303|    128|      const __m256i m16B =
  304|    128|          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
  305|    128|      const __m256i m16C =
  306|    128|          calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
  307|    128|      const __m256i m16D =
  308|    128|          calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
  309|    128|      const __m256i m16E =
  310|    128|          calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
  311|    128|      const __m256i m16F =
  312|    128|          calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
  313|    128|      const __m256i m16G =
  314|    128|          calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
  315|    128|      const __m256i m16H =
  316|    128|          calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
  317|    128|      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
  318|    128|      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
  319|    128|      const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
  320|    128|      const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
  321|    128|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
  322|    128|      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
  323|    128|      yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
  324|    128|      yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
  325|    128|      src0 += src0_stride;
  326|    128|      src1 += src1_stride;
  327|    128|      mask += 128;
  328|    128|      i += 1;
  329|    128|    } while (i < h);
  ------------------
  |  Branch (329:14): [True: 126, False: 2]
  ------------------
  330|      2|  }
  331|    323|}
reconinter_avx2.c:calc_mask_d16_avx2:
  142|  17.0k|                                         const __m256i *clip_diff, int round) {
  143|  17.0k|  const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
  144|  17.0k|  const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
  145|  17.0k|  const __m256i diff = _mm256_max_epu16(diffa, diffb);
  146|  17.0k|  const __m256i diff_round =
  147|  17.0k|      _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
  148|  17.0k|  const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
  ------------------
  |  |   42|  17.0k|#define DIFF_FACTOR_LOG2 4
  ------------------
  149|  17.0k|  const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
  150|  17.0k|  const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
  151|  17.0k|  return diff_clamp;
  152|  17.0k|}
reconinter_avx2.c:build_compound_diffwtd_mask_d16_inv_avx2:
  335|    281|    const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
  336|    281|  const int mask_base = 38;
  337|    281|  const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
  338|    281|  const __m256i y38 = _mm256_set1_epi16(mask_base);
  339|    281|  const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|    281|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|    281|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  340|    281|  int i = 0;
  341|    281|  if (w == 4) {
  ------------------
  |  Branch (341:7): [True: 0, False: 281]
  ------------------
  342|      0|    do {
  343|      0|      const __m128i s0A = xx_loadl_64(src0);
  344|      0|      const __m128i s0B = xx_loadl_64(src0 + src0_stride);
  345|      0|      const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
  346|      0|      const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
  347|      0|      const __m128i s1A = xx_loadl_64(src1);
  348|      0|      const __m128i s1B = xx_loadl_64(src1 + src1_stride);
  349|      0|      const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
  350|      0|      const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
  351|      0|      const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
  352|      0|                                      _mm_unpacklo_epi64(s0A, s0B));
  353|      0|      const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
  354|      0|                                      _mm_unpacklo_epi64(s1A, s1B));
  355|      0|      const __m256i m16 =
  356|      0|          calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift);
  357|      0|      const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
  358|      0|      xx_storeu_128(mask,
  359|      0|                    _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
  360|      0|      src0 += src0_stride << 2;
  361|      0|      src1 += src1_stride << 2;
  362|      0|      mask += 16;
  363|      0|      i += 4;
  364|      0|    } while (i < h);
  ------------------
  |  Branch (364:14): [True: 0, False: 0]
  ------------------
  365|    281|  } else if (w == 8) {
  ------------------
  |  Branch (365:14): [True: 72, False: 209]
  ------------------
  366|    282|    do {
  367|    282|      const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
  368|    282|      const __m256i s0CD =
  369|    282|          yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
  370|    282|      const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
  371|    282|      const __m256i s1CD =
  372|    282|          yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
  373|    282|      const __m256i m16AB =
  374|    282|          calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
  375|    282|      const __m256i m16CD =
  376|    282|          calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
  377|    282|      const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
  378|    282|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
  379|    282|      src0 += src0_stride << 2;
  380|    282|      src1 += src1_stride << 2;
  381|    282|      mask += 32;
  382|    282|      i += 4;
  383|    282|    } while (i < h);
  ------------------
  |  Branch (383:14): [True: 210, False: 72]
  ------------------
  384|    209|  } else if (w == 16) {
  ------------------
  |  Branch (384:14): [True: 131, False: 78]
  ------------------
  385|  1.05k|    do {
  386|  1.05k|      const __m256i s0A = yy_loadu_256(src0);
  387|  1.05k|      const __m256i s0B = yy_loadu_256(src0 + src0_stride);
  388|  1.05k|      const __m256i s1A = yy_loadu_256(src1);
  389|  1.05k|      const __m256i s1B = yy_loadu_256(src1 + src1_stride);
  390|  1.05k|      const __m256i m16A =
  391|  1.05k|          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
  392|  1.05k|      const __m256i m16B =
  393|  1.05k|          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
  394|  1.05k|      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
  395|  1.05k|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
  396|  1.05k|      src0 += src0_stride << 1;
  397|  1.05k|      src1 += src1_stride << 1;
  398|  1.05k|      mask += 32;
  399|  1.05k|      i += 2;
  400|  1.05k|    } while (i < h);
  ------------------
  |  Branch (400:14): [True: 925, False: 131]
  ------------------
  401|    131|  } else if (w == 32) {
  ------------------
  |  Branch (401:14): [True: 60, False: 18]
  ------------------
  402|  1.37k|    do {
  403|  1.37k|      const __m256i s0A = yy_loadu_256(src0);
  404|  1.37k|      const __m256i s0B = yy_loadu_256(src0 + 16);
  405|  1.37k|      const __m256i s1A = yy_loadu_256(src1);
  406|  1.37k|      const __m256i s1B = yy_loadu_256(src1 + 16);
  407|  1.37k|      const __m256i m16A =
  408|  1.37k|          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
  409|  1.37k|      const __m256i m16B =
  410|  1.37k|          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
  411|  1.37k|      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
  412|  1.37k|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
  413|  1.37k|      src0 += src0_stride;
  414|  1.37k|      src1 += src1_stride;
  415|  1.37k|      mask += 32;
  416|  1.37k|      i += 1;
  417|  1.37k|    } while (i < h);
  ------------------
  |  Branch (417:14): [True: 1.31k, False: 60]
  ------------------
  418|     60|  } else if (w == 64) {
  ------------------
  |  Branch (418:14): [True: 13, False: 5]
  ------------------
  419|    512|    do {
  420|    512|      const __m256i s0A = yy_loadu_256(src0);
  421|    512|      const __m256i s0B = yy_loadu_256(src0 + 16);
  422|    512|      const __m256i s0C = yy_loadu_256(src0 + 32);
  423|    512|      const __m256i s0D = yy_loadu_256(src0 + 48);
  424|    512|      const __m256i s1A = yy_loadu_256(src1);
  425|    512|      const __m256i s1B = yy_loadu_256(src1 + 16);
  426|    512|      const __m256i s1C = yy_loadu_256(src1 + 32);
  427|    512|      const __m256i s1D = yy_loadu_256(src1 + 48);
  428|    512|      const __m256i m16A =
  429|    512|          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
  430|    512|      const __m256i m16B =
  431|    512|          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
  432|    512|      const __m256i m16C =
  433|    512|          calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
  434|    512|      const __m256i m16D =
  435|    512|          calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
  436|    512|      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
  437|    512|      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
  438|    512|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
  439|    512|      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
  440|    512|      src0 += src0_stride;
  441|    512|      src1 += src1_stride;
  442|    512|      mask += 64;
  443|    512|      i += 1;
  444|    512|    } while (i < h);
  ------------------
  |  Branch (444:14): [True: 499, False: 13]
  ------------------
  445|     13|  } else {
  446|    512|    do {
  447|    512|      const __m256i s0A = yy_loadu_256(src0);
  448|    512|      const __m256i s0B = yy_loadu_256(src0 + 16);
  449|    512|      const __m256i s0C = yy_loadu_256(src0 + 32);
  450|    512|      const __m256i s0D = yy_loadu_256(src0 + 48);
  451|    512|      const __m256i s0E = yy_loadu_256(src0 + 64);
  452|    512|      const __m256i s0F = yy_loadu_256(src0 + 80);
  453|    512|      const __m256i s0G = yy_loadu_256(src0 + 96);
  454|    512|      const __m256i s0H = yy_loadu_256(src0 + 112);
  455|    512|      const __m256i s1A = yy_loadu_256(src1);
  456|    512|      const __m256i s1B = yy_loadu_256(src1 + 16);
  457|    512|      const __m256i s1C = yy_loadu_256(src1 + 32);
  458|    512|      const __m256i s1D = yy_loadu_256(src1 + 48);
  459|    512|      const __m256i s1E = yy_loadu_256(src1 + 64);
  460|    512|      const __m256i s1F = yy_loadu_256(src1 + 80);
  461|    512|      const __m256i s1G = yy_loadu_256(src1 + 96);
  462|    512|      const __m256i s1H = yy_loadu_256(src1 + 112);
  463|    512|      const __m256i m16A =
  464|    512|          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
  465|    512|      const __m256i m16B =
  466|    512|          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
  467|    512|      const __m256i m16C =
  468|    512|          calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
  469|    512|      const __m256i m16D =
  470|    512|          calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
  471|    512|      const __m256i m16E =
  472|    512|          calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
  473|    512|      const __m256i m16F =
  474|    512|          calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
  475|    512|      const __m256i m16G =
  476|    512|          calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
  477|    512|      const __m256i m16H =
  478|    512|          calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
  479|    512|      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
  480|    512|      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
  481|    512|      const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
  482|    512|      const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
  483|    512|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
  484|    512|      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
  485|    512|      yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
  486|    512|      yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
  487|    512|      src0 += src0_stride;
  488|    512|      src1 += src1_stride;
  489|    512|      mask += 128;
  490|    512|      i += 1;
  491|    512|    } while (i < h);
  ------------------
  |  Branch (491:14): [True: 507, False: 5]
  ------------------
  492|      5|  }
  493|    281|}
reconinter_avx2.c:calc_mask_d16_inv_avx2:
  159|  11.5k|                                             int round) {
  160|  11.5k|  const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
  161|  11.5k|  const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
  162|  11.5k|  const __m256i diff = _mm256_max_epu16(diffa, diffb);
  163|  11.5k|  const __m256i diff_round =
  164|  11.5k|      _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
  165|  11.5k|  const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
  ------------------
  |  |   42|  11.5k|#define DIFF_FACTOR_LOG2 4
  ------------------
  166|  11.5k|  const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
  167|  11.5k|  const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
  168|  11.5k|  const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp);
  169|  11.5k|  return diff_const_16;
  170|  11.5k|}

av1_selfguided_restoration_avx2:
  553|  2.65k|                                    int highbd) {
  554|       |  // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
  555|       |  // Ctl and Dtl is 32-byte aligned.
  556|  2.65k|  const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);
  ------------------
  |  |   69|  2.65k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  557|       |
  558|  2.65k|  int32_t *buf = aom_memalign(
  559|  2.65k|      32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3));
  ------------------
  |  |   69|  2.65k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  560|  2.65k|  if (!buf) return -1;
  ------------------
  |  Branch (560:7): [True: 0, False: 2.65k]
  ------------------
  561|       |
  562|  2.65k|  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  ------------------
  |  |   40|  2.65k|#define SGRPROJ_BORDER_HORZ 3  // Horizontal border used for Sgr
  ------------------
  563|  2.65k|  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
  ------------------
  |  |   39|  2.65k|#define SGRPROJ_BORDER_VERT 3  // Vertical border used for Sgr
  ------------------
  564|       |
  565|       |  // Adjusting the stride of A and B here appears to avoid bad cache effects,
  566|       |  // leading to a significant speed improvement.
  567|       |  // We also align the stride to a multiple of 32 bytes for efficiency.
  568|  2.65k|  int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3);
  ------------------
  |  |   69|  2.65k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  569|       |
  570|       |  // The "tl" pointers point at the top-left of the initialised data for the
  571|       |  // array.
  572|  2.65k|  int32_t *Atl = buf + 0 * buf_elts + 7;
  573|  2.65k|  int32_t *Btl = buf + 1 * buf_elts + 7;
  574|  2.65k|  int32_t *Ctl = buf + 2 * buf_elts + 7;
  575|  2.65k|  int32_t *Dtl = buf + 3 * buf_elts + 7;
  576|       |
  577|       |  // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
  578|       |  // there's a zero row and column in A, B (integral images), so we move down
  579|       |  // and right one for them.
  580|  2.65k|  const int buf_diag_border =
  581|  2.65k|      SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
  ------------------
  |  |   40|  2.65k|#define SGRPROJ_BORDER_HORZ 3  // Horizontal border used for Sgr
  ------------------
                    SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
  ------------------
  |  |   39|  2.65k|#define SGRPROJ_BORDER_VERT 3  // Vertical border used for Sgr
  ------------------
  582|       |
  583|  2.65k|  int32_t *A0 = Atl + 1 + buf_stride;
  584|  2.65k|  int32_t *B0 = Btl + 1 + buf_stride;
  585|  2.65k|  int32_t *C0 = Ctl + 1 + buf_stride;
  586|  2.65k|  int32_t *D0 = Dtl + 1 + buf_stride;
  587|       |
  588|       |  // Finally, A, B, C, D point at position (0, 0).
  589|  2.65k|  int32_t *A = A0 + buf_diag_border;
  590|  2.65k|  int32_t *B = B0 + buf_diag_border;
  591|  2.65k|  int32_t *C = C0 + buf_diag_border;
  592|  2.65k|  int32_t *D = D0 + buf_diag_border;
  593|       |
  594|  2.65k|  const int dgd_diag_border =
  595|  2.65k|      SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
  ------------------
  |  |   40|  2.65k|#define SGRPROJ_BORDER_HORZ 3  // Horizontal border used for Sgr
  ------------------
                    SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
  ------------------
  |  |   39|  2.65k|#define SGRPROJ_BORDER_VERT 3  // Vertical border used for Sgr
  ------------------
  596|  2.65k|  const uint8_t *dgd0 = dgd8 - dgd_diag_border;
  597|       |
  598|       |  // Generate integral images from the input. C will contain sums of squares; D
  599|       |  // will contain just sums
  600|  2.65k|  if (highbd)
  ------------------
  |  Branch (600:7): [True: 944, False: 1.70k]
  ------------------
  601|    944|    integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
  ------------------
  |  |   75|    944|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  602|    944|                           height_ext, Ctl, Dtl, buf_stride);
  603|  1.70k|  else
  604|  1.70k|    integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
  605|  1.70k|                    buf_stride);
  606|       |
  607|  2.65k|  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
  608|       |  // Write to flt0 and flt1
  609|       |  // If params->r == 0 we skip the corresponding filter. We only allow one of
  610|       |  // the radii to be 0, as having both equal to 0 would be equivalent to
  611|       |  // skipping SGR entirely.
  612|  2.65k|  assert(!(params->r[0] == 0 && params->r[1] == 0));
  613|  2.65k|  assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
  614|  2.65k|  assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
  615|       |
  616|  2.65k|  if (params->r[0] > 0) {
  ------------------
  |  Branch (616:7): [True: 1.66k, False: 988]
  ------------------
  617|  1.66k|    calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
  618|  1.66k|                 sgr_params_idx, 0);
  619|  1.66k|    final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
  620|  1.66k|                      width, height, highbd);
  621|  1.66k|  }
  622|       |
  623|  2.65k|  if (params->r[1] > 0) {
  ------------------
  |  Branch (623:7): [True: 2.01k, False: 639]
  ------------------
  624|  2.01k|    calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
  625|  2.01k|            1);
  626|  2.01k|    final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
  627|  2.01k|                 height, highbd);
  628|  2.01k|  }
  629|  2.65k|  aom_free(buf);
  630|  2.65k|  return 0;
  631|  2.65k|}
av1_apply_selfguided_restoration_avx2:
  637|  2.65k|                                          int bit_depth, int highbd) {
  638|  2.65k|  int32_t *flt0 = tmpbuf;
  639|  2.65k|  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
  ------------------
  |  |   87|  2.65k|  (RESTORATION_UNITPELS_HORZ_MAX * RESTORATION_UNITPELS_VERT_MAX)
  |  |  ------------------
  |  |  |  |   82|  2.65k|  (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16)
  |  |  |  |  ------------------
  |  |  |  |  |  |   80|  2.65k|#define RESTORATION_UNITSIZE_MAX 256
  |  |  |  |  ------------------
  |  |  |  |                 (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16)
  |  |  |  |  ------------------
  |  |  |  |  |  |   56|  2.65k|#define RESTORATION_BORDER_HORZ (SGRPROJ_BORDER_HORZ)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   40|  2.65k|#define SGRPROJ_BORDER_HORZ 3  // Horizontal border used for Sgr
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (RESTORATION_UNITPELS_HORZ_MAX * RESTORATION_UNITPELS_VERT_MAX)
  |  |  ------------------
  |  |  |  |   84|  2.65k|  ((RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \
  |  |  |  |  ------------------
  |  |  |  |  |  |   80|  2.65k|#define RESTORATION_UNITSIZE_MAX 256
  |  |  |  |  ------------------
  |  |  |  |                 ((RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \
  |  |  |  |  ------------------
  |  |  |  |  |  |   50|  2.65k|#define RESTORATION_BORDER_VERT (SGRPROJ_BORDER_VERT)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   39|  2.65k|#define SGRPROJ_BORDER_VERT 3  // Vertical border used for Sgr
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   85|  2.65k|    RESTORATION_UNIT_OFFSET))
  |  |  |  |  ------------------
  |  |  |  |  |  |   37|  2.65k|#define RESTORATION_UNIT_OFFSET 8
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  640|  2.65k|  assert(width * height <= RESTORATION_UNITPELS_MAX);
  641|  2.65k|  const int ret = av1_selfguided_restoration_avx2(
  642|  2.65k|      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
  643|  2.65k|  if (ret != 0) return ret;
  ------------------
  |  Branch (643:7): [True: 0, False: 2.65k]
  ------------------
  644|  2.65k|  const sgr_params_type *const params = &av1_sgr_params[eps];
  645|  2.65k|  int xq[2];
  646|  2.65k|  av1_decode_xq(xqd, xq, params);
  647|       |
  648|  2.65k|  __m256i xq0 = _mm256_set1_epi32(xq[0]);
  649|  2.65k|  __m256i xq1 = _mm256_set1_epi32(xq[1]);
  650|       |
  651|   120k|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (651:19): [True: 118k, False: 2.65k]
  ------------------
  652|       |    // Calculate output in batches of 16 pixels
  653|   474k|    for (int j = 0; j < width; j += 16) {
  ------------------
  |  Branch (653:21): [True: 356k, False: 118k]
  ------------------
  654|   356k|      const int k = i * width + j;
  655|   356k|      const int m = i * dst_stride + j;
  656|       |
  657|   356k|      const uint8_t *dat8ij = dat8 + i * stride + j;
  658|   356k|      __m256i ep_0, ep_1;
  659|   356k|      __m128i src_0, src_1;
  660|   356k|      if (highbd) {
  ------------------
  |  Branch (660:11): [True: 100k, False: 255k]
  ------------------
  661|   100k|        src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
  ------------------
  |  |   75|   100k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  662|   100k|        src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8));
  ------------------
  |  |   75|   100k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  663|   100k|        ep_0 = _mm256_cvtepu16_epi32(src_0);
  664|   100k|        ep_1 = _mm256_cvtepu16_epi32(src_1);
  665|   255k|      } else {
  666|   255k|        src_0 = xx_loadu_128(dat8ij);
  667|   255k|        ep_0 = _mm256_cvtepu8_epi32(src_0);
  668|   255k|        ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8));
  669|   255k|      }
  670|       |
  671|   356k|      const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS);
  ------------------
  |  |  101|   356k|#define SGRPROJ_RST_BITS 4
  ------------------
  672|   356k|      const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS);
  ------------------
  |  |  101|   356k|#define SGRPROJ_RST_BITS 4
  ------------------
  673|       |
  674|   356k|      __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
  ------------------
  |  |   99|   356k|#define SGRPROJ_PRJ_BITS 7
  ------------------
  675|   356k|      __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
  ------------------
  |  |   99|   356k|#define SGRPROJ_PRJ_BITS 7
  ------------------
  676|       |
  677|   356k|      if (params->r[0] > 0) {
  ------------------
  |  Branch (677:11): [True: 246k, False: 109k]
  ------------------
  678|   246k|        const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0);
  679|   246k|        v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0));
  680|       |
  681|   246k|        const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1);
  682|   246k|        v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1));
  683|   246k|      }
  684|       |
  685|   356k|      if (params->r[1] > 0) {
  ------------------
  |  Branch (685:11): [True: 281k, False: 74.9k]
  ------------------
  686|   281k|        const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0);
  687|   281k|        v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0));
  688|       |
  689|   281k|        const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1);
  690|   281k|        v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1));
  691|   281k|      }
  692|       |
  693|   356k|      const __m256i rounding =
  694|   356k|          round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
  ------------------
  |  |   99|   356k|#define SGRPROJ_PRJ_BITS 7
  ------------------
                        round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
  ------------------
  |  |  101|   356k|#define SGRPROJ_RST_BITS 4
  ------------------
  695|   356k|      const __m256i w_0 = _mm256_srai_epi32(
  696|   356k|          _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
  ------------------
  |  |   99|   356k|#define SGRPROJ_PRJ_BITS 7
  ------------------
                        _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
  ------------------
  |  |  101|   356k|#define SGRPROJ_RST_BITS 4
  ------------------
  697|   356k|      const __m256i w_1 = _mm256_srai_epi32(
  698|   356k|          _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
  ------------------
  |  |   99|   356k|#define SGRPROJ_PRJ_BITS 7
  ------------------
                        _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
  ------------------
  |  |  101|   356k|#define SGRPROJ_RST_BITS 4
  ------------------
  699|       |
  700|   356k|      if (highbd) {
  ------------------
  |  Branch (700:11): [True: 100k, False: 255k]
  ------------------
  701|       |        // Pack into 16 bits and clamp to [0, 2^bit_depth)
  702|       |        // Note that packing into 16 bits messes up the order of the bits,
  703|       |        // so we use a permute function to correct this
  704|   100k|        const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
  705|   100k|        const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
  706|   100k|        const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
  707|   100k|        const __m256i res = _mm256_min_epi16(tmp2, max);
  708|   100k|        yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res);
  ------------------
  |  |   75|   100k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  709|   255k|      } else {
  710|       |        // Pack into 8 bits and clamp to [0, 256)
  711|       |        // Note that each pack messes up the order of the bits,
  712|       |        // so we use a permute function to correct this
  713|   255k|        const __m256i tmp = _mm256_packs_epi32(w_0, w_1);
  714|   255k|        const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
  715|   255k|        const __m256i res =
  716|   255k|            _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */);
  717|   255k|        const __m128i res2 =
  718|       |            _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8));
  719|   255k|        xx_storeu_128(dst8 + m, res2);
  720|   255k|      }
  721|   356k|    }
  722|   118k|  }
  723|  2.65k|  return 0;
  724|  2.65k|}
selfguided_avx2.c:integral_images_highbd:
  135|    945|                                   int32_t *B, int buf_stride) {
  136|    945|  const __m256i zero = _mm256_setzero_si256();
  137|       |  // Write out the zero top row
  138|    945|  memset_zero_avx(A, &zero, (width + 8));
  139|    945|  memset_zero_avx(B, &zero, (width + 8));
  140|       |
  141|  46.8k|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (141:19): [True: 45.9k, False: 945]
  ------------------
  142|       |    // Zero the left column.
  143|  45.9k|    A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
  144|       |
  145|       |    // ldiff is the difference H - D where H is the output sample immediately
  146|       |    // to the left and D is the output sample above it. These are scalars,
  147|       |    // replicated across the eight lanes.
  148|  45.9k|    __m256i ldiff1 = zero, ldiff2 = zero;
  149|   297k|    for (int j = 0; j < width; j += 8) {
  ------------------
  |  Branch (149:21): [True: 251k, False: 45.9k]
  ------------------
  150|   251k|      const int ABj = 1 + j;
  151|       |
  152|   251k|      const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
  153|   251k|      const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
  154|       |
  155|   251k|      const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride);
  156|   251k|      const __m256i x2 = _mm256_madd_epi16(x1, x1);
  157|       |
  158|   251k|      const __m256i sc1 = scan_32(x1);
  159|   251k|      const __m256i sc2 = scan_32(x2);
  160|       |
  161|   251k|      const __m256i row1 =
  162|   251k|          _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
  163|   251k|      const __m256i row2 =
  164|   251k|          _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
  165|       |
  166|   251k|      yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
  167|   251k|      yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
  168|       |
  169|       |      // Calculate the new H - D.
  170|   251k|      ldiff1 = _mm256_set1_epi32(
  171|   251k|          _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
  172|   251k|      ldiff2 = _mm256_set1_epi32(
  173|       |          _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
  174|   251k|    }
  175|  45.9k|  }
  176|    945|}
selfguided_avx2.c:memset_zero_avx:
   69|  5.29k|static void *memset_zero_avx(int32_t *dest, const __m256i *zero, size_t count) {
   70|  5.29k|  unsigned int i = 0;
   71|  12.6k|  for (i = 0; i < (count & 0xffffffe0); i += 32) {
  ------------------
  |  Branch (71:15): [True: 7.33k, False: 5.29k]
  ------------------
   72|  7.33k|    _mm256_storeu_si256((__m256i *)(dest + i), *zero);
   73|  7.33k|    _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero);
   74|  7.33k|    _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero);
   75|  7.33k|    _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero);
   76|  7.33k|  }
   77|  12.5k|  for (; i < (count & 0xfffffff8); i += 8) {
  ------------------
  |  Branch (77:10): [True: 7.22k, False: 5.29k]
  ------------------
   78|  7.22k|    _mm256_storeu_si256((__m256i *)(dest + i), *zero);
   79|  7.22k|  }
   80|  35.5k|  for (; i < count; i++) {
  ------------------
  |  Branch (80:10): [True: 30.2k, False: 5.29k]
  ------------------
   81|  30.2k|    dest[i] = 0;
   82|  30.2k|  }
   83|  5.29k|  return dest;
   84|  5.29k|}
selfguided_avx2.c:yy256_load_extend_16_32:
   29|   242k|static __m256i yy256_load_extend_16_32(const void *p) {
   30|   242k|  return _mm256_cvtepu16_epi32(xx_loadu_128(p));
   31|   242k|}
selfguided_avx2.c:scan_32:
   51|  1.69M|static __m256i scan_32(__m256i x) {
   52|  1.69M|  const __m256i x01 = _mm256_slli_si256(x, 4);
   53|  1.69M|  const __m256i x02 = _mm256_add_epi32(x, x01);
   54|  1.69M|  const __m256i x03 = _mm256_slli_si256(x02, 8);
   55|  1.69M|  const __m256i x04 = _mm256_add_epi32(x02, x03);
   56|  1.69M|  const int32_t s = _mm256_extract_epi32(x04, 3);
   57|  1.69M|  const __m128i s01 = _mm_set1_epi32(s);
   58|       |  const __m256i s02 = _mm256_insertf128_si256(_mm256_setzero_si256(), s01, 1);
   59|  1.69M|  return _mm256_add_epi32(x04, s02);
   60|  1.69M|}
selfguided_avx2.c:integral_images:
   88|  1.70k|                            int buf_stride) {
   89|  1.70k|  const __m256i zero = _mm256_setzero_si256();
   90|       |  // Write out the zero top row
   91|  1.70k|  memset_zero_avx(A, &zero, (width + 8));
   92|  1.70k|  memset_zero_avx(B, &zero, (width + 8));
   93|  81.7k|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (93:19): [True: 80.0k, False: 1.70k]
  ------------------
   94|       |    // Zero the left column.
   95|  80.0k|    A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
   96|       |
   97|       |    // ldiff is the difference H - D where H is the output sample immediately
   98|       |    // to the left and D is the output sample above it. These are scalars,
   99|       |    // replicated across the eight lanes.
  100|  80.0k|    __m256i ldiff1 = zero, ldiff2 = zero;
  101|   690k|    for (int j = 0; j < width; j += 8) {
  ------------------
  |  Branch (101:21): [True: 610k, False: 80.0k]
  ------------------
  102|   610k|      const int ABj = 1 + j;
  103|       |
  104|   610k|      const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
  105|   610k|      const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
  106|       |
  107|   610k|      const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride);
  108|   610k|      const __m256i x2 = _mm256_madd_epi16(x1, x1);
  109|       |
  110|   610k|      const __m256i sc1 = scan_32(x1);
  111|   610k|      const __m256i sc2 = scan_32(x2);
  112|       |
  113|   610k|      const __m256i row1 =
  114|   610k|          _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
  115|   610k|      const __m256i row2 =
  116|   610k|          _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
  117|       |
  118|   610k|      yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
  119|   610k|      yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
  120|       |
  121|       |      // Calculate the new H - D.
  122|   610k|      ldiff1 = _mm256_set1_epi32(
  123|   610k|          _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
  124|   610k|      ldiff2 = _mm256_set1_epi32(
  125|       |          _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
  126|   610k|    }
  127|  80.0k|  }
  128|  1.70k|}
selfguided_avx2.c:yy256_load_extend_8_32:
   23|   603k|static __m256i yy256_load_extend_8_32(const void *p) {
   24|   603k|  return _mm256_cvtepu8_epi32(xx_loadl_64(p));
   25|   603k|}
selfguided_avx2.c:calc_ab_fast:
  358|  1.66k|                         int radius_idx) {
  359|  1.66k|  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
  360|  1.66k|  const int r = params->r[radius_idx];
  361|  1.66k|  const int n = (2 * r + 1) * (2 * r + 1);
  362|  1.66k|  const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
  363|       |  // one_over_n[n-1] is 2^12/n, so easily fits in an int16
  364|  1.66k|  const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]);
  365|       |
  366|  1.66k|  const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
  ------------------
  |  |  117|  1.66k|#define SGRPROJ_MTABLE_BITS 20
  ------------------
  367|  1.66k|  const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
  ------------------
  |  |  118|  1.66k|#define SGRPROJ_RECIP_BITS 12
  ------------------
  368|       |
  369|       |  // Set up masks
  370|  1.66k|  const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
  371|  1.66k|  __m256i mask[8];
  372|  14.9k|  for (int idx = 0; idx < 8; idx++) {
  ------------------
  |  Branch (372:21): [True: 13.3k, False: 1.66k]
  ------------------
  373|  13.3k|    const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
  374|  13.3k|    mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
  375|  13.3k|  }
  376|       |
  377|  40.8k|  for (int i = -1; i < height + 1; i += 2) {
  ------------------
  |  Branch (377:20): [True: 39.1k, False: 1.66k]
  ------------------
  378|   316k|    for (int j = -1; j < width + 1; j += 8) {
  ------------------
  |  Branch (378:22): [True: 277k, False: 39.1k]
  ------------------
  379|   277k|      const int32_t *Cij = C + i * buf_stride + j;
  380|   277k|      const int32_t *Dij = D + i * buf_stride + j;
  381|       |
  382|   277k|      __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
  383|   277k|      __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);
  384|       |
  385|       |      // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
  386|       |      // some uninitialised data in their upper words. We use a mask to
  387|       |      // ensure that these bits are set to 0.
  388|   277k|      int idx = AOMMIN(8, width + 1 - j);
  ------------------
  |  |   34|   277k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 239k, False: 37.6k]
  |  |  ------------------
  ------------------
  389|   277k|      assert(idx >= 1);
  390|       |
  391|   277k|      if (idx < 8) {
  ------------------
  |  Branch (391:11): [True: 38.2k, False: 239k]
  ------------------
  392|  38.2k|        sum1 = _mm256_and_si256(mask[idx], sum1);
  393|  38.2k|        sum2 = _mm256_and_si256(mask[idx], sum2);
  394|  38.2k|      }
  395|       |
  396|   277k|      const __m256i p = compute_p(sum1, sum2, bit_depth, n);
  397|       |
  398|   277k|      const __m256i z = _mm256_min_epi32(
  399|   277k|          _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
  400|   277k|                            SGRPROJ_MTABLE_BITS),
  ------------------
  |  |  117|   277k|#define SGRPROJ_MTABLE_BITS 20
  ------------------
  401|   277k|          _mm256_set1_epi32(255));
  402|       |
  403|   277k|      const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4);
  404|       |
  405|   277k|      yy_storeu_256(A + i * buf_stride + j, a_res);
  406|       |
  407|   277k|      const __m256i a_complement =
  408|   277k|          _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
  ------------------
  |  |  104|   277k|#define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS)
  |  |  ------------------
  |  |  |  |  103|   277k|#define SGRPROJ_SGR_BITS 8
  |  |  ------------------
  ------------------
  409|       |
  410|       |      // sum1 might have lanes greater than 2^15, so we can't use madd to do
  411|       |      // multiplication involving sum1. However, a_complement and one_over_n
  412|       |      // are both less than 256, so we can multiply them first.
  413|   277k|      const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
  414|   277k|      const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
  415|   277k|      const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
  416|   277k|                                              SGRPROJ_RECIP_BITS);
  ------------------
  |  |  118|   277k|#define SGRPROJ_RECIP_BITS 12
  ------------------
  417|       |
  418|   277k|      yy_storeu_256(B + i * buf_stride + j, b_res);
  419|   277k|    }
  420|  39.1k|  }
  421|  1.66k|}
selfguided_avx2.c:boxsum_from_ii:
  180|  1.64M|static inline __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) {
  181|  1.64M|  const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride);
  182|  1.64M|  const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride);
  183|  1.64M|  const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride);
  184|  1.64M|  const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride);
  185|  1.64M|  const __m256i u = _mm256_sub_epi32(tr, tl);
  186|  1.64M|  const __m256i v = _mm256_sub_epi32(br, bl);
  187|  1.64M|  return _mm256_sub_epi32(v, u);
  188|  1.64M|}
selfguided_avx2.c:compute_p:
  194|   822k|static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) {
  195|   822k|  __m256i an, bb;
  196|   822k|  if (bit_depth > 8) {
  ------------------
  |  Branch (196:7): [True: 202k, False: 620k]
  ------------------
  197|   202k|    const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8));
  198|   202k|    const __m256i rounding_b = round_for_shift(bit_depth - 8);
  199|   202k|    const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
  200|   202k|    const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
  201|   202k|    const __m256i a =
  202|   202k|        _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a);
  203|   202k|    const __m256i b =
  204|   202k|        _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b);
  205|       |    // b < 2^14, so we can use a 16-bit madd rather than a 32-bit
  206|       |    // mullo to square it
  207|   202k|    bb = _mm256_madd_epi16(b, b);
  208|   202k|    an = _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb);
  209|   620k|  } else {
  210|   620k|    bb = _mm256_madd_epi16(sum1, sum1);
  211|   620k|    an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n));
  212|   620k|  }
  213|   822k|  return _mm256_sub_epi32(an, bb);
  214|   822k|}
selfguided_avx2.c:final_filter_fast:
  496|  1.66k|                              int height, int highbd) {
  497|  1.66k|  const int nb0 = 5;
  498|  1.66k|  const int nb1 = 4;
  499|       |
  500|  1.66k|  const __m256i rounding0 =
  501|  1.66k|      round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
  ------------------
  |  |  103|  1.66k|#define SGRPROJ_SGR_BITS 8
  ------------------
                    round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
  ------------------
  |  |  101|  1.66k|#define SGRPROJ_RST_BITS 4
  ------------------
  502|  1.66k|  const __m256i rounding1 =
  503|  1.66k|      round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
  ------------------
  |  |  103|  1.66k|#define SGRPROJ_SGR_BITS 8
  ------------------
                    round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
  ------------------
  |  |  101|  1.66k|#define SGRPROJ_RST_BITS 4
  ------------------
  504|       |
  505|  1.66k|  const uint8_t *dgd_real =
  506|  1.66k|      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
  ------------------
  |  |   75|    800|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  |  Branch (506:7): [True: 800, False: 861]
  ------------------
  507|       |
  508|  78.5k|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (508:19): [True: 76.8k, False: 1.66k]
  ------------------
  509|  76.8k|    if (!(i & 1)) {  // even row
  ------------------
  |  Branch (509:9): [True: 38.6k, False: 38.2k]
  ------------------
  510|   279k|      for (int j = 0; j < width; j += 8) {
  ------------------
  |  Branch (510:23): [True: 240k, False: 38.6k]
  ------------------
  511|   240k|        const __m256i a =
  512|   240k|            cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
  513|   240k|        const __m256i b =
  514|   240k|            cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
  515|       |
  516|   240k|        const __m128i raw =
  517|   240k|            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
  518|   240k|        const __m256i src =
  519|   240k|            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
  ------------------
  |  Branch (519:13): [True: 81.8k, False: 159k]
  ------------------
  520|       |
  521|   240k|        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
  522|   240k|        __m256i w =
  523|   240k|            _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
  524|   240k|                              SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
  ------------------
  |  |  103|   240k|#define SGRPROJ_SGR_BITS 8
  ------------------
                                            SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
  ------------------
  |  |  101|   240k|#define SGRPROJ_RST_BITS 4
  ------------------
  525|       |
  526|   240k|        yy_storeu_256(dst + i * dst_stride + j, w);
  527|   240k|      }
  528|  38.6k|    } else {  // odd row
  529|   279k|      for (int j = 0; j < width; j += 8) {
  ------------------
  |  Branch (529:23): [True: 241k, False: 38.2k]
  ------------------
  530|   241k|        const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
  531|   241k|        const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
  532|       |
  533|   241k|        const __m128i raw =
  534|   241k|            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
  535|   241k|        const __m256i src =
  536|   241k|            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
  ------------------
  |  Branch (536:13): [True: 82.0k, False: 159k]
  ------------------
  537|       |
  538|   241k|        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
  539|   241k|        __m256i w =
  540|   241k|            _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
  541|   241k|                              SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
  ------------------
  |  |  103|   241k|#define SGRPROJ_SGR_BITS 8
  ------------------
                                            SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
  ------------------
  |  |  101|   241k|#define SGRPROJ_RST_BITS 4
  ------------------
  542|       |
  543|   241k|        yy_storeu_256(dst + i * dst_stride + j, w);
  544|   241k|      }
  545|  38.2k|    }
  546|  76.8k|  }
  547|  1.66k|}
selfguided_avx2.c:cross_sum_fast_even_row:
  440|   465k|static inline __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) {
  441|   465k|  const __m256i xtl = yy_loadu_256(buf - 1 - stride);
  442|   465k|  const __m256i xt = yy_loadu_256(buf - stride);
  443|   465k|  const __m256i xtr = yy_loadu_256(buf + 1 - stride);
  444|   465k|  const __m256i xbl = yy_loadu_256(buf - 1 + stride);
  445|   465k|  const __m256i xb = yy_loadu_256(buf + stride);
  446|   465k|  const __m256i xbr = yy_loadu_256(buf + 1 + stride);
  447|       |
  448|   465k|  const __m256i fives =
  449|   465k|      _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
  450|   465k|  const __m256i sixes = _mm256_add_epi32(xt, xb);
  451|   465k|  const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
  452|       |
  453|   465k|  return _mm256_add_epi32(
  454|   465k|      _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
  455|   465k|                       fives_plus_sixes),
  456|   465k|      sixes);
  457|   465k|}
selfguided_avx2.c:cross_sum_fast_odd_row:
  474|   464k|static inline __m256i cross_sum_fast_odd_row(const int32_t *buf) {
  475|   464k|  const __m256i xl = yy_loadu_256(buf - 1);
  476|   464k|  const __m256i x = yy_loadu_256(buf);
  477|   464k|  const __m256i xr = yy_loadu_256(buf + 1);
  478|       |
  479|   464k|  const __m256i fives = _mm256_add_epi32(xl, xr);
  480|   464k|  const __m256i sixes = x;
  481|       |
  482|   464k|  const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
  483|       |
  484|   464k|  return _mm256_add_epi32(
  485|   464k|      _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
  486|   464k|                       fives_plus_sixes),
  487|   464k|      sixes);
  488|   464k|}
selfguided_avx2.c:calc_ab:
  221|  2.01k|                    int sgr_params_idx, int radius_idx) {
  222|  2.01k|  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
  223|  2.01k|  const int r = params->r[radius_idx];
  224|  2.01k|  const int n = (2 * r + 1) * (2 * r + 1);
  225|  2.01k|  const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
  226|       |  // one_over_n[n-1] is 2^12/n, so easily fits in an int16
  227|  2.01k|  const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]);
  228|       |
  229|  2.01k|  const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
  ------------------
  |  |  117|  2.01k|#define SGRPROJ_MTABLE_BITS 20
  ------------------
  230|  2.01k|  const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
  ------------------
  |  |  118|  2.01k|#define SGRPROJ_RECIP_BITS 12
  ------------------
  231|       |
  232|       |  // Set up masks
  233|  2.01k|  const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
  234|  2.01k|  __m256i mask[8];
  235|  18.1k|  for (int idx = 0; idx < 8; idx++) {
  ------------------
  |  Branch (235:21): [True: 16.0k, False: 2.01k]
  ------------------
  236|  16.0k|    const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
  237|  16.0k|    mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
  238|  16.0k|  }
  239|       |
  240|  94.2k|  for (int i = -1; i < height + 1; ++i) {
  ------------------
  |  Branch (240:20): [True: 92.2k, False: 2.01k]
  ------------------
  241|   700k|    for (int j = -1; j < width + 1; j += 8) {
  ------------------
  |  Branch (241:22): [True: 608k, False: 92.2k]
  ------------------
  242|   608k|      const int32_t *Cij = C + i * buf_stride + j;
  243|   608k|      const int32_t *Dij = D + i * buf_stride + j;
  244|       |
  245|   608k|      __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
  246|   608k|      __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);
  247|       |
  248|       |      // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
  249|       |      // some uninitialised data in their upper words. We use a mask to
  250|       |      // ensure that these bits are set to 0.
  251|   608k|      int idx = AOMMIN(8, width + 1 - j);
  ------------------
  |  |   34|   608k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 522k, False: 85.8k]
  |  |  ------------------
  ------------------
  252|   608k|      assert(idx >= 1);
  253|       |
  254|   608k|      if (idx < 8) {
  ------------------
  |  Branch (254:11): [True: 92.6k, False: 515k]
  ------------------
  255|  92.6k|        sum1 = _mm256_and_si256(mask[idx], sum1);
  256|  92.6k|        sum2 = _mm256_and_si256(mask[idx], sum2);
  257|  92.6k|      }
  258|       |
  259|   608k|      const __m256i p = compute_p(sum1, sum2, bit_depth, n);
  260|       |
  261|   608k|      const __m256i z = _mm256_min_epi32(
  262|   608k|          _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
  263|   608k|                            SGRPROJ_MTABLE_BITS),
  ------------------
  |  |  117|   608k|#define SGRPROJ_MTABLE_BITS 20
  ------------------
  264|   608k|          _mm256_set1_epi32(255));
  265|       |
  266|   608k|      const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4);
  267|       |
  268|   608k|      yy_storeu_256(A + i * buf_stride + j, a_res);
  269|       |
  270|   608k|      const __m256i a_complement =
  271|   608k|          _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
  ------------------
  |  |  104|   608k|#define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS)
  |  |  ------------------
  |  |  |  |  103|   608k|#define SGRPROJ_SGR_BITS 8
  |  |  ------------------
  ------------------
  272|       |
  273|       |      // sum1 might have lanes greater than 2^15, so we can't use madd to do
  274|       |      // multiplication involving sum1. However, a_complement and one_over_n
  275|       |      // are both less than 256, so we can multiply them first.
  276|   608k|      const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
  277|   608k|      const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
  278|   608k|      const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
  279|   608k|                                              SGRPROJ_RECIP_BITS);
  ------------------
  |  |  118|   608k|#define SGRPROJ_RECIP_BITS 12
  ------------------
  280|       |
  281|   608k|      yy_storeu_256(B + i * buf_stride + j, b_res);
  282|   608k|    }
  283|  92.2k|  }
  284|  2.01k|}
selfguided_avx2.c:final_filter:
  326|  2.00k|                         int dgd_stride, int width, int height, int highbd) {
  327|  2.00k|  const int nb = 5;
  328|  2.00k|  const __m256i rounding =
  329|  2.00k|      round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  ------------------
  |  |  103|  2.00k|#define SGRPROJ_SGR_BITS 8
  ------------------
                    round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  ------------------
  |  |  101|  2.00k|#define SGRPROJ_RST_BITS 4
  ------------------
  330|  2.00k|  const uint8_t *dgd_real =
  331|  2.00k|      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
  ------------------
  |  |   75|    479|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  |  Branch (331:7): [True: 479, False: 1.52k]
  ------------------
  332|       |
  333|  96.5k|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (333:19): [True: 94.5k, False: 2.00k]
  ------------------
  334|   640k|    for (int j = 0; j < width; j += 8) {
  ------------------
  |  Branch (334:21): [True: 545k, False: 94.5k]
  ------------------
  335|   545k|      const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride);
  336|   545k|      const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride);
  337|       |
  338|   545k|      const __m128i raw =
  339|   545k|          xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
  340|   545k|      const __m256i src =
  341|   545k|          highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
  ------------------
  |  Branch (341:11): [True: 90.1k, False: 455k]
  ------------------
  342|       |
  343|   545k|      __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
  344|   545k|      __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding),
  345|   545k|                                    SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  ------------------
  |  |  103|   545k|#define SGRPROJ_SGR_BITS 8
  ------------------
                                                  SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  ------------------
  |  |  101|   545k|#define SGRPROJ_RST_BITS 4
  ------------------
  346|       |
  347|   545k|      yy_storeu_256(dst + i * dst_stride + j, w);
  348|   545k|    }
  349|  94.5k|  }
  350|  2.00k|}
selfguided_avx2.c:cross_sum:
  302|  1.01M|static inline __m256i cross_sum(const int32_t *buf, int stride) {
  303|  1.01M|  const __m256i xtl = yy_loadu_256(buf - 1 - stride);
  304|  1.01M|  const __m256i xt = yy_loadu_256(buf - stride);
  305|  1.01M|  const __m256i xtr = yy_loadu_256(buf + 1 - stride);
  306|  1.01M|  const __m256i xl = yy_loadu_256(buf - 1);
  307|  1.01M|  const __m256i x = yy_loadu_256(buf);
  308|  1.01M|  const __m256i xr = yy_loadu_256(buf + 1);
  309|  1.01M|  const __m256i xbl = yy_loadu_256(buf - 1 + stride);
  310|  1.01M|  const __m256i xb = yy_loadu_256(buf + stride);
  311|  1.01M|  const __m256i xbr = yy_loadu_256(buf + 1 + stride);
  312|       |
  313|  1.01M|  const __m256i fours = _mm256_add_epi32(
  314|  1.01M|      xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x))));
  315|  1.01M|  const __m256i threes =
  316|  1.01M|      _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
  317|       |
  318|  1.01M|  return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2),
  319|  1.01M|                          threes);
  320|  1.01M|}
selfguided_avx2.c:round_for_shift:
  190|   700k|static __m256i round_for_shift(unsigned shift) {
  191|   700k|  return _mm256_set1_epi32((1 << shift) >> 1);
  192|   700k|}

av1_warp_affine_avx2:
 1030|  3.22k|                          int16_t beta, int16_t gamma, int16_t delta) {
 1031|  3.22k|  __m256i horz_out[8];
 1032|  3.22k|  int i, j, k;
 1033|  3.22k|  const int bd = 8;
 1034|  3.22k|  const int reduce_bits_horiz = conv_params->round_0;
 1035|  3.22k|  const int reduce_bits_vert = conv_params->is_compound
  ------------------
  |  Branch (1035:32): [True: 430, False: 2.79k]
  ------------------
 1036|  3.22k|                                   ? conv_params->round_1
 1037|  3.22k|                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  ------------------
  |  |   21|  2.79k|#define FILTER_BITS 7
  ------------------
 1038|  3.22k|  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  ------------------
  |  |   21|  3.22k|#define FILTER_BITS 7
  ------------------
 1039|  3.22k|  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
 1040|       |
 1041|  3.22k|  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  ------------------
  |  |   21|  3.22k|#define FILTER_BITS 7
  ------------------
 1042|  3.22k|  const __m256i reduce_bits_vert_const =
 1043|  3.22k|      _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
 1044|  3.22k|  const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
 1045|  3.22k|  const int round_bits =
 1046|  3.22k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  3.22k|#define FILTER_BITS 7
  ------------------
 1047|  3.22k|  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  ------------------
  |  |   21|  3.22k|#define FILTER_BITS 7
  ------------------
 1048|  3.22k|  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
 1049|       |
 1050|  3.22k|  const __m256i round_const = _mm256_set1_epi16(
 1051|  3.22k|      (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
 1052|  3.22k|  const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz);
 1053|       |
 1054|  3.22k|  __m256i res_sub_const, round_bits_const, wt;
 1055|  3.22k|  unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits,
 1056|  3.22k|                                          &res_sub_const, &round_bits_const,
 1057|  3.22k|                                          &wt);
 1058|       |
 1059|  3.22k|  __m256i res_add_const_1;
 1060|  3.22k|  if (conv_params->is_compound == 1) {
  ------------------
  |  Branch (1060:7): [True: 430, False: 2.79k]
  ------------------
 1061|    430|    res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const);
 1062|  2.79k|  } else {
 1063|  2.79k|    res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
 1064|  2.79k|                                        ((1 << reduce_bits_vert) >> 1));
 1065|  2.79k|  }
 1066|  3.22k|  const int32_t const1 = alpha * (-4) + beta * (-4) +
 1067|  3.22k|                         (1 << (WARPEDDIFF_PREC_BITS - 1)) +
  ------------------
  |  |  107|  3.22k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.22k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.22k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
 1068|  3.22k|                         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  ------------------
  |  |  103|  3.22k|#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.22k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
                                       (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  ------------------
  |  |  107|  3.22k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.22k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.22k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
 1069|  3.22k|  const int32_t const2 = gamma * (-4) + delta * (-4) +
 1070|  3.22k|                         (1 << (WARPEDDIFF_PREC_BITS - 1)) +
  ------------------
  |  |  107|  3.22k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.22k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.22k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
 1071|  3.22k|                         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  ------------------
  |  |  103|  3.22k|#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.22k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
                                       (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  ------------------
  |  |  107|  3.22k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.22k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.22k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
 1072|  3.22k|  const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1);
  ------------------
  |  |  105|  3.22k|#define WARP_PARAM_REDUCE_BITS 6
  ------------------
 1073|  3.22k|  const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1));
  ------------------
  |  |   21|  3.22k|#define FILTER_BITS 7
  ------------------
 1074|  3.22k|  const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz));
  ------------------
  |  |   21|  3.22k|#define FILTER_BITS 7
  ------------------
 1075|       |
 1076|  3.22k|  __m256i shuffle_src[4];
 1077|  3.22k|  shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0);
 1078|  3.22k|  shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1);
 1079|  3.22k|  shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2);
 1080|  3.22k|  shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3);
 1081|       |
 1082|  9.12k|  for (i = 0; i < p_height; i += 8) {
  ------------------
  |  Branch (1082:15): [True: 5.90k, False: 3.22k]
  ------------------
 1083|  20.6k|    for (j = 0; j < p_width; j += 8) {
  ------------------
  |  Branch (1083:17): [True: 14.7k, False: 5.90k]
  ------------------
 1084|  14.7k|      const int32_t src_x = (p_col + j + 4) << subsampling_x;
 1085|  14.7k|      const int32_t src_y = (p_row + i + 4) << subsampling_y;
 1086|  14.7k|      const int64_t dst_x =
 1087|  14.7k|          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
 1088|  14.7k|      const int64_t dst_y =
 1089|  14.7k|          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
 1090|  14.7k|      const int64_t x4 = dst_x >> subsampling_x;
 1091|  14.7k|      const int64_t y4 = dst_y >> subsampling_y;
 1092|       |
 1093|  14.7k|      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
  ------------------
  |  |   96|  14.7k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
 1094|  14.7k|      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
  ------------------
  |  |   96|  14.7k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
 1095|  14.7k|      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
  ------------------
  |  |   96|  14.7k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
 1096|  14.7k|      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
  ------------------
  |  |   96|  14.7k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
 1097|       |
 1098|       |      // Add in all the constant terms, including rounding and offset
 1099|  14.7k|      sx4 += const1;
 1100|  14.7k|      sy4 += const2;
 1101|       |
 1102|  14.7k|      sx4 &= ~const3;
 1103|  14.7k|      sy4 &= ~const3;
 1104|       |
 1105|       |      // Horizontal filter
 1106|       |      // If the block is aligned such that, after clamping, every sample
 1107|       |      // would be taken from the leftmost/rightmost column, then we can
 1108|       |      // skip the expensive horizontal filter.
 1109|       |
 1110|  14.7k|      if (ix4 <= -7) {
  ------------------
  |  Branch (1110:11): [True: 98, False: 14.6k]
  ------------------
 1111|     98|        int iy, row = 0;
 1112|    784|        for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
  ------------------
  |  |   34|    784|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 416, False: 368]
  |  |  ------------------
  ------------------
  |  Branch (1112:22): [True: 686, False: 98]
  ------------------
 1113|    686|          iy = iy4 + k;
 1114|    686|          iy = clamp(iy, 0, height - 1);
 1115|    686|          const __m256i temp_0 =
 1116|    686|              _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
 1117|    686|          iy = iy4 + k + 1;
 1118|    686|          iy = clamp(iy, 0, height - 1);
 1119|    686|          const __m256i temp_1 =
 1120|    686|              _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
 1121|    686|          horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
 1122|    686|          row += 1;
 1123|    686|        }
 1124|     98|        iy = iy4 + k;
 1125|     98|        iy = clamp(iy, 0, height - 1);
 1126|     98|        horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
 1127|  14.6k|      } else if (ix4 >= width + 6) {
  ------------------
  |  Branch (1127:18): [True: 730, False: 13.8k]
  ------------------
 1128|    730|        int iy, row = 0;
 1129|  5.84k|        for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
  ------------------
  |  |   34|  5.84k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.19k, False: 648]
  |  |  ------------------
  ------------------
  |  Branch (1129:22): [True: 5.11k, False: 730]
  ------------------
 1130|  5.11k|          iy = iy4 + k;
 1131|  5.11k|          iy = clamp(iy, 0, height - 1);
 1132|  5.11k|          const __m256i temp_0 = _mm256_set1_epi16(
 1133|  5.11k|              const4 + ref[iy * stride + (width - 1)] * const5);
 1134|  5.11k|          iy = iy4 + k + 1;
 1135|  5.11k|          iy = clamp(iy, 0, height - 1);
 1136|  5.11k|          const __m256i temp_1 = _mm256_set1_epi16(
 1137|  5.11k|              const4 + ref[iy * stride + (width - 1)] * const5);
 1138|  5.11k|          horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
 1139|  5.11k|          row += 1;
 1140|  5.11k|        }
 1141|    730|        iy = iy4 + k;
 1142|    730|        iy = clamp(iy, 0, height - 1);
 1143|    730|        horz_out[row] =
 1144|    730|            _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5);
 1145|  13.8k|      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
  ------------------
  |  Branch (1145:18): [True: 875, False: 13.0k]
  |  Branch (1145:37): [True: 1.18k, False: 11.8k]
  ------------------
 1146|  2.06k|        const int out_of_boundary_left = -(ix4 - 6);
 1147|  2.06k|        const int out_of_boundary_right = (ix4 + 8) - width;
 1148|  2.06k|        int iy, sx, row = 0;
 1149|  16.4k|        for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
  ------------------
  |  |   34|  16.4k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 10.4k, False: 6.03k]
  |  |  ------------------
  ------------------
  |  Branch (1149:22): [True: 14.4k, False: 2.06k]
  ------------------
 1150|  14.4k|          iy = iy4 + k;
 1151|  14.4k|          iy = clamp(iy, 0, height - 1);
 1152|  14.4k|          __m128i src0 =
 1153|  14.4k|              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
 1154|  14.4k|          iy = iy4 + k + 1;
 1155|  14.4k|          iy = clamp(iy, 0, height - 1);
 1156|  14.4k|          __m128i src1 =
 1157|  14.4k|              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
 1158|       |
 1159|  14.4k|          if (out_of_boundary_left >= 0) {
  ------------------
  |  Branch (1159:15): [True: 6.12k, False: 8.30k]
  ------------------
 1160|  6.12k|            const __m128i shuffle_reg_left =
 1161|  6.12k|                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
 1162|  6.12k|            src0 = _mm_shuffle_epi8(src0, shuffle_reg_left);
 1163|  6.12k|            src1 = _mm_shuffle_epi8(src1, shuffle_reg_left);
 1164|  6.12k|          }
 1165|  14.4k|          if (out_of_boundary_right >= 0) {
  ------------------
  |  Branch (1165:15): [True: 8.30k, False: 6.12k]
  ------------------
 1166|  8.30k|            const __m128i shuffle_reg_right = _mm_loadu_si128(
 1167|  8.30k|                (__m128i *)warp_pad_right[out_of_boundary_right]);
 1168|  8.30k|            src0 = _mm_shuffle_epi8(src0, shuffle_reg_right);
 1169|  8.30k|            src1 = _mm_shuffle_epi8(src1, shuffle_reg_right);
 1170|  8.30k|          }
 1171|  14.4k|          sx = sx4 + beta * (k + 4);
 1172|  14.4k|          const __m256i src_01 =
 1173|  14.4k|              _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
 1174|  14.4k|          horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row,
 1175|  14.4k|                                 shuffle_src, &round_const, &shift);
 1176|  14.4k|          row += 1;
 1177|  14.4k|        }
 1178|  2.06k|        iy = iy4 + k;
 1179|  2.06k|        iy = clamp(iy, 0, height - 1);
 1180|  2.06k|        __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
 1181|  2.06k|        if (out_of_boundary_left >= 0) {
  ------------------
  |  Branch (1181:13): [True: 875, False: 1.18k]
  ------------------
 1182|    875|          const __m128i shuffle_reg_left =
 1183|    875|              _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
 1184|    875|          src = _mm_shuffle_epi8(src, shuffle_reg_left);
 1185|    875|        }
 1186|  2.06k|        if (out_of_boundary_right >= 0) {
  ------------------
  |  Branch (1186:13): [True: 1.18k, False: 875]
  ------------------
 1187|  1.18k|          const __m128i shuffle_reg_right =
 1188|  1.18k|              _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]);
 1189|  1.18k|          src = _mm_shuffle_epi8(src, shuffle_reg_right);
 1190|  1.18k|        }
 1191|  2.06k|        sx = sx4 + beta * (k + 4);
 1192|  2.06k|        const __m256i src_01 = _mm256_castsi128_si256(src);
 1193|  2.06k|        __m256i coeff[4];
 1194|  2.06k|        prepare_horizontal_filter_coeff(alpha, sx, coeff);
 1195|  2.06k|        filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src,
 1196|  2.06k|                               &round_const, &shift, row);
 1197|  11.8k|      } else {
 1198|  11.8k|        prepare_warp_horizontal_filter_avx2(
 1199|  11.8k|            ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height,
 1200|  11.8k|            i, &round_const, &shift, shuffle_src);
 1201|  11.8k|      }
 1202|       |
 1203|       |      // Vertical filter
 1204|  14.7k|      prepare_warp_vertical_filter_avx2(
 1205|  14.7k|          pred, horz_out, conv_params, gamma, delta, p_height, p_stride,
 1206|  14.7k|          p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits,
 1207|  14.7k|          &res_sub_const, &round_bits_const, &wt);
 1208|  14.7k|    }
 1209|  5.90k|  }
 1210|  3.22k|}
warp_plane_avx2.c:unpack_weights_and_set_round_const_avx2:
  433|  3.22k|    __m256i *res_sub_const, __m256i *round_bits_const, __m256i *wt) {
  434|  3.22k|  *res_sub_const =
  435|  3.22k|      _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
  436|  3.22k|                        (1 << (offset_bits - conv_params->round_1 - 1)));
  437|  3.22k|  *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1));
  438|       |
  439|  3.22k|  const int w0 = conv_params->fwd_offset;
  440|  3.22k|  const int w1 = conv_params->bck_offset;
  441|  3.22k|  const __m256i wt0 = _mm256_set1_epi16((short)w0);
  442|  3.22k|  const __m256i wt1 = _mm256_set1_epi16((short)w1);
  443|  3.22k|  *wt = _mm256_unpacklo_epi16(wt0, wt1);
  444|  3.22k|}
warp_plane_avx2.c:horizontal_filter_avx2:
  258|  36.6k|                                          const __m128i *shift) {
  259|  36.6k|  __m256i coeff[4];
  260|  36.6k|  prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff);
  261|  36.6k|  filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift,
  262|  36.6k|                         row);
  263|  36.6k|}
warp_plane_avx2.c:prepare_horizontal_filter_coeff_avx2:
  101|  36.6k|                                                        __m256i *coeff) {
  102|  36.6k|  __m128i tmp_0 = _mm_loadl_epi64(
  103|  36.6k|      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >>
  104|  36.6k|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  36.6k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  36.6k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  36.6k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  105|  36.6k|  __m128i tmp_1 = _mm_loadl_epi64(
  106|  36.6k|      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >>
  107|  36.6k|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  36.6k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  36.6k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  36.6k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  108|  36.6k|  __m128i tmp_2 = _mm_loadl_epi64(
  109|  36.6k|      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >>
  110|  36.6k|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  36.6k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  36.6k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  36.6k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  111|  36.6k|  __m128i tmp_3 = _mm_loadl_epi64(
  112|  36.6k|      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >>
  113|  36.6k|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  36.6k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  36.6k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  36.6k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  114|       |
  115|  36.6k|  __m128i tmp_4 = _mm_loadl_epi64(
  116|  36.6k|      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >>
  117|  36.6k|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  36.6k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  36.6k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  36.6k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  118|  36.6k|  __m128i tmp_5 = _mm_loadl_epi64(
  119|  36.6k|      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >>
  120|  36.6k|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  36.6k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  36.6k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  36.6k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  121|  36.6k|  __m128i tmp_6 = _mm_loadl_epi64(
  122|  36.6k|      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >>
  123|  36.6k|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  36.6k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  36.6k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  36.6k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  124|  36.6k|  __m128i tmp_7 = _mm_loadl_epi64(
  125|  36.6k|      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >>
  126|  36.6k|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  36.6k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  36.6k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  36.6k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  127|       |
  128|  36.6k|  __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0);
  129|  36.6k|  __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2);
  130|  36.6k|  __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1);
  131|  36.6k|  __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3);
  132|       |
  133|  36.6k|  __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4);
  134|  36.6k|  __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6);
  135|  36.6k|  __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5);
  136|  36.6k|  __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7);
  137|       |
  138|  36.6k|  __m128i tmp_8 = _mm_loadl_epi64(
  139|  36.6k|      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >>
  140|  36.6k|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  36.6k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  36.6k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  36.6k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  141|  36.6k|  tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1);
  142|       |
  143|  36.6k|  __m128i tmp_9 = _mm_loadl_epi64(
  144|  36.6k|      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * alpha) >>
  145|  36.6k|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  36.6k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  36.6k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  36.6k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  146|  36.6k|  tmp1_256 = _mm256_inserti128_si256(tmp1_256, tmp_9, 1);
  147|       |
  148|  36.6k|  __m128i tmp_10 = _mm_loadl_epi64(
  149|  36.6k|      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >>
  150|  36.6k|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  36.6k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  36.6k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  36.6k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  151|  36.6k|  tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1);
  152|       |
  153|  36.6k|  __m128i tmp_11 = _mm_loadl_epi64(
  154|  36.6k|      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >>
  155|  36.6k|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  36.6k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  36.6k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  36.6k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  156|  36.6k|  tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1);
  157|       |
  158|  36.6k|  tmp_2 = _mm_loadl_epi64(
  159|  36.6k|      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >>
  160|  36.6k|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  36.6k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  36.6k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  36.6k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  161|  36.6k|  tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1);
  162|       |
  163|  36.6k|  tmp_3 = _mm_loadl_epi64(
  164|  36.6k|      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >>
  165|  36.6k|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  36.6k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  36.6k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  36.6k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  166|  36.6k|  tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1);
  167|       |
  168|  36.6k|  tmp_6 = _mm_loadl_epi64(
  169|  36.6k|      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >>
  170|  36.6k|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  36.6k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  36.6k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  36.6k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  171|  36.6k|  tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1);
  172|       |
  173|  36.6k|  tmp_7 = _mm_loadl_epi64(
  174|  36.6k|      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >>
  175|  36.6k|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  36.6k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  36.6k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  36.6k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  176|  36.6k|  tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1);
  177|       |
  178|  36.6k|  const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256);
  179|  36.6k|  const __m256i tmp_13 = _mm256_unpacklo_epi16(tmp1_256, tmp3_256);
  180|  36.6k|  const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256);
  181|  36.6k|  const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256);
  182|       |
  183|  36.6k|  const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
  184|  36.6k|  const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
  185|  36.6k|  const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
  186|  36.6k|  const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
  187|       |
  188|  36.6k|  coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
  189|  36.6k|  coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
  190|  36.6k|  coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
  191|  36.6k|  coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
  192|  36.6k|}
warp_plane_avx2.c:prepare_horizontal_filter_coeff:
  265|  5.23k|                                                   __m256i *coeff) {
  266|  5.23k|  const __m128i tmp_0 = _mm_loadl_epi64(
  267|  5.23k|      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  5.23k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  5.23k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  5.23k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  268|  5.23k|  const __m128i tmp_1 = _mm_loadl_epi64(
  269|  5.23k|      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  5.23k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  5.23k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  5.23k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  270|  5.23k|  const __m128i tmp_2 = _mm_loadl_epi64(
  271|  5.23k|      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  5.23k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  5.23k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  5.23k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  272|  5.23k|  const __m128i tmp_3 = _mm_loadl_epi64(
  273|  5.23k|      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  5.23k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  5.23k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  5.23k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  274|  5.23k|  const __m128i tmp_4 = _mm_loadl_epi64(
  275|  5.23k|      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  5.23k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  5.23k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  5.23k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  276|  5.23k|  const __m128i tmp_5 = _mm_loadl_epi64(
  277|  5.23k|      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  5.23k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  5.23k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  5.23k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  278|  5.23k|  const __m128i tmp_6 = _mm_loadl_epi64(
  279|  5.23k|      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  5.23k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  5.23k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  5.23k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  280|  5.23k|  const __m128i tmp_7 = _mm_loadl_epi64(
  281|  5.23k|      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  5.23k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  5.23k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  5.23k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  282|       |
  283|  5.23k|  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  284|  5.23k|  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  285|  5.23k|  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  286|  5.23k|  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
  287|       |
  288|  5.23k|  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
  289|  5.23k|  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
  290|  5.23k|  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
  291|  5.23k|  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
  292|       |
  293|  5.23k|  coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14));
  294|  5.23k|  coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14));
  295|  5.23k|  coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15));
  296|  5.23k|  coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15));
  297|  5.23k|}
warp_plane_avx2.c:filter_src_pixels_avx2:
   81|   111k|                                          const __m128i *shift, int row) {
   82|   111k|  const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]);
   83|   111k|  const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]);
   84|   111k|  const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]);
   85|   111k|  const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]);
   86|       |
   87|   111k|  const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]);
   88|   111k|  const __m256i res_46 = _mm256_maddubs_epi16(src_1, coeff[1]);
   89|   111k|  const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]);
   90|   111k|  const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]);
   91|       |
   92|   111k|  const __m256i res_even = _mm256_add_epi16(res_02, res_46);
   93|   111k|  const __m256i res_odd = _mm256_add_epi16(res_13, res_57);
   94|   111k|  const __m256i res =
   95|   111k|      _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const);
   96|   111k|  horz_out[row] = _mm256_srl_epi16(res, *shift);
   97|   111k|}
warp_plane_avx2.c:prepare_warp_horizontal_filter_avx2:
 1006|  11.8k|    const __m256i *shuffle_src) {
 1007|  11.8k|  if (alpha == 0 && beta == 0)
  ------------------
  |  Branch (1007:7): [True: 4.51k, False: 7.29k]
  |  Branch (1007:21): [True: 3.98k, False: 534]
  ------------------
 1008|  3.98k|    warp_horizontal_filter_alpha0_beta0_avx2(
 1009|  3.98k|        ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
 1010|  3.98k|        round_const, shift, shuffle_src);
 1011|  7.83k|  else if (alpha == 0 && beta != 0)
  ------------------
  |  Branch (1011:12): [True: 534, False: 7.29k]
  |  Branch (1011:26): [True: 534, False: 0]
  ------------------
 1012|    534|    warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
 1013|    534|                                       alpha, beta, p_height, height, i,
 1014|    534|                                       round_const, shift, shuffle_src);
 1015|  7.29k|  else if (alpha != 0 && beta == 0)
  ------------------
  |  Branch (1015:12): [True: 7.29k, False: 0]
  |  Branch (1015:26): [True: 4.12k, False: 3.17k]
  ------------------
 1016|  4.12k|    warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
 1017|  4.12k|                                      alpha, beta, p_height, height, i,
 1018|  4.12k|                                      round_const, shift, shuffle_src);
 1019|  3.17k|  else
 1020|  3.17k|    warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha,
 1021|  3.17k|                                beta, p_height, height, i, round_const, shift,
 1022|  3.17k|                                shuffle_src);
 1023|  11.8k|}
warp_plane_avx2.c:warp_horizontal_filter_alpha0_beta0_avx2:
  403|  3.98k|    const __m256i *shuffle_src) {
  404|  3.98k|  (void)alpha;
  405|  3.98k|  int k, iy, row = 0;
  406|  3.98k|  __m256i coeff[4];
  407|  3.98k|  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff);
  408|  31.8k|  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
  ------------------
  |  |   34|  31.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 19.7k, False: 12.1k]
  |  |  ------------------
  ------------------
  |  Branch (408:16): [True: 27.8k, False: 3.98k]
  ------------------
  409|  27.8k|    iy = iy4 + k;
  410|  27.8k|    iy = clamp(iy, 0, height - 1);
  411|  27.8k|    const __m128i src0 =
  412|  27.8k|        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
  413|  27.8k|    iy = iy4 + k + 1;
  414|  27.8k|    iy = clamp(iy, 0, height - 1);
  415|  27.8k|    const __m128i src1 =
  416|  27.8k|        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
  417|  27.8k|    const __m256i src_01 =
  418|       |        _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
  419|  27.8k|    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
  420|  27.8k|                           shift, row);
  421|  27.8k|    row += 1;
  422|  27.8k|  }
  423|  3.98k|  iy = iy4 + k;
  424|  3.98k|  iy = clamp(iy, 0, height - 1);
  425|  3.98k|  const __m256i src_01 = _mm256_castsi128_si256(
  426|  3.98k|      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  427|  3.98k|  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
  428|  3.98k|                         shift, row);
  429|  3.98k|}
warp_plane_avx2.c:prepare_horizontal_filter_coeff_alpha0_avx2:
  235|  8.25k|                                                               __m256i *coeff) {
  236|  8.25k|  const __m128i tmp_0 =
  237|  8.25k|      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  8.25k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  8.25k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  8.25k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  238|  8.25k|  const __m128i tmp_1 = _mm_loadl_epi64(
  239|  8.25k|      (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  8.25k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  8.25k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  8.25k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  240|       |
  241|  8.25k|  const __m256i res_0 =
  242|  8.25k|      _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1);
  243|       |
  244|  8.25k|  coeff[0] = _mm256_shuffle_epi8(
  245|  8.25k|      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2));
  246|  8.25k|  coeff[1] = _mm256_shuffle_epi8(
  247|  8.25k|      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2));
  248|  8.25k|  coeff[2] = _mm256_shuffle_epi8(
  249|  8.25k|      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2));
  250|  8.25k|  coeff[3] = _mm256_shuffle_epi8(
  251|  8.25k|      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2));
  252|  8.25k|}
warp_plane_avx2.c:warp_horizontal_filter_alpha0_avx2:
  336|    534|    const __m256i *shuffle_src) {
  337|    534|  (void)alpha;
  338|    534|  int k, iy, sx, row = 0;
  339|    534|  __m256i coeff[4];
  340|  4.27k|  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
  ------------------
  |  |   34|  4.27k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.84k, False: 1.42k]
  |  |  ------------------
  ------------------
  |  Branch (340:16): [True: 3.73k, False: 534]
  ------------------
  341|  3.73k|    iy = iy4 + k;
  342|  3.73k|    iy = clamp(iy, 0, height - 1);
  343|  3.73k|    const __m128i src_0 =
  344|  3.73k|        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
  345|  3.73k|    iy = iy4 + k + 1;
  346|  3.73k|    iy = clamp(iy, 0, height - 1);
  347|  3.73k|    const __m128i src_1 =
  348|  3.73k|        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
  349|  3.73k|    const __m256i src_01 =
  350|       |        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
  351|  3.73k|    sx = sx4 + beta * (k + 4);
  352|  3.73k|    prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
  353|  3.73k|    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
  354|  3.73k|                           shift, row);
  355|  3.73k|    row += 1;
  356|  3.73k|  }
  357|    534|  iy = iy4 + k;
  358|    534|  iy = clamp(iy, 0, height - 1);
  359|    534|  const __m256i src_01 = _mm256_castsi128_si256(
  360|    534|      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  361|    534|  sx = sx4 + beta * (k + 4);
  362|    534|  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
  363|    534|  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
  364|    534|                         shift, row);
  365|    534|}
warp_plane_avx2.c:warp_horizontal_filter_beta0_avx2:
  371|  4.12k|    const __m256i *shuffle_src) {
  372|  4.12k|  (void)beta;
  373|  4.12k|  int k, iy, row = 0;
  374|  4.12k|  __m256i coeff[4];
  375|  4.12k|  prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff);
  376|  32.9k|  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
  ------------------
  |  |   34|  32.9k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 18.3k, False: 14.6k]
  |  |  ------------------
  ------------------
  |  Branch (376:16): [True: 28.8k, False: 4.12k]
  ------------------
  377|  28.8k|    iy = iy4 + k;
  378|  28.8k|    iy = clamp(iy, 0, height - 1);
  379|  28.8k|    const __m128i src_0 =
  380|  28.8k|        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
  381|  28.8k|    iy = iy4 + k + 1;
  382|  28.8k|    iy = clamp(iy, 0, height - 1);
  383|  28.8k|    const __m128i src_1 =
  384|  28.8k|        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
  385|  28.8k|    const __m256i src_01 =
  386|       |        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
  387|  28.8k|    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
  388|  28.8k|                           shift, row);
  389|  28.8k|    row += 1;
  390|  28.8k|  }
  391|  4.12k|  iy = iy4 + k;
  392|  4.12k|  iy = clamp(iy, 0, height - 1);
  393|  4.12k|  const __m256i src_01 = _mm256_castsi128_si256(
  394|  4.12k|      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  395|  4.12k|  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
  396|  4.12k|                         shift, row);
  397|  4.12k|}
warp_plane_avx2.c:prepare_horizontal_filter_coeff_beta0_avx2:
  195|  4.12k|                                                              __m256i *coeff) {
  196|  4.12k|  __m128i tmp_0 = _mm_loadl_epi64(
  197|  4.12k|      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  4.12k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  4.12k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  4.12k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  198|  4.12k|  __m128i tmp_1 = _mm_loadl_epi64(
  199|  4.12k|      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  4.12k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  4.12k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  4.12k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  200|  4.12k|  __m128i tmp_2 = _mm_loadl_epi64(
  201|  4.12k|      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  4.12k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  4.12k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  4.12k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  202|  4.12k|  __m128i tmp_3 = _mm_loadl_epi64(
  203|  4.12k|      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  4.12k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  4.12k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  4.12k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  204|  4.12k|  __m128i tmp_4 = _mm_loadl_epi64(
  205|  4.12k|      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  4.12k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  4.12k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  4.12k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  206|  4.12k|  __m128i tmp_5 = _mm_loadl_epi64(
  207|  4.12k|      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  4.12k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  4.12k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  4.12k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  208|  4.12k|  __m128i tmp_6 = _mm_loadl_epi64(
  209|  4.12k|      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  4.12k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  4.12k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  4.12k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  210|  4.12k|  __m128i tmp_7 = _mm_loadl_epi64(
  211|  4.12k|      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  4.12k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  4.12k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  4.12k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  212|       |
  213|  4.12k|  tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  214|  4.12k|  tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  215|  4.12k|  tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  216|  4.12k|  tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7);
  217|       |
  218|  4.12k|  const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0);
  219|  4.12k|  const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1);
  220|  4.12k|  const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4);
  221|  4.12k|  const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5);
  222|       |
  223|  4.12k|  const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
  224|  4.12k|  const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
  225|  4.12k|  const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
  226|  4.12k|  const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
  227|       |
  228|  4.12k|  coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
  229|  4.12k|  coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
  230|  4.12k|  coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
  231|  4.12k|  coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
  232|  4.12k|}
warp_plane_avx2.c:warp_horizontal_filter_avx2:
  303|  3.17k|    const __m256i *shuffle_src) {
  304|  3.17k|  int k, iy, sx, row = 0;
  305|  3.17k|  __m256i coeff[4];
  306|  25.4k|  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
  ------------------
  |  |   34|  25.4k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 12.8k, False: 12.5k]
  |  |  ------------------
  ------------------
  |  Branch (306:16): [True: 22.2k, False: 3.17k]
  ------------------
  307|  22.2k|    iy = iy4 + k;
  308|  22.2k|    iy = clamp(iy, 0, height - 1);
  309|  22.2k|    const __m128i src_0 =
  310|  22.2k|        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
  311|  22.2k|    iy = iy4 + k + 1;
  312|  22.2k|    iy = clamp(iy, 0, height - 1);
  313|  22.2k|    const __m128i src_1 =
  314|  22.2k|        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
  315|  22.2k|    const __m256i src_01 =
  316|       |        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
  317|  22.2k|    sx = sx4 + beta * (k + 4);
  318|  22.2k|    horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src,
  319|  22.2k|                           round_const, shift);
  320|  22.2k|    row += 1;
  321|  22.2k|  }
  322|  3.17k|  iy = iy4 + k;
  323|  3.17k|  iy = clamp(iy, 0, height - 1);
  324|  3.17k|  const __m256i src_01 = _mm256_castsi128_si256(
  325|  3.17k|      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  326|  3.17k|  sx = sx4 + beta * (k + 4);
  327|  3.17k|  prepare_horizontal_filter_coeff(alpha, sx, coeff);
  328|  3.17k|  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
  329|  3.17k|                         shift, row);
  330|  3.17k|}
warp_plane_avx2.c:prepare_warp_vertical_filter_avx2:
  979|  14.7k|    const __m256i *wt) {
  980|  14.7k|  if (gamma == 0 && delta == 0)
  ------------------
  |  Branch (980:7): [True: 9.52k, False: 5.18k]
  |  Branch (980:21): [True: 5.59k, False: 3.93k]
  ------------------
  981|  5.59k|    warp_vertical_filter_gamma0_delta0_avx2(
  982|  5.59k|        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
  983|  5.59k|        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
  984|  5.59k|        round_bits_const, wt);
  985|  9.11k|  else if (gamma == 0 && delta != 0)
  ------------------
  |  Branch (985:12): [True: 3.93k, False: 5.18k]
  |  Branch (985:26): [True: 3.93k, False: 0]
  ------------------
  986|  3.93k|    warp_vertical_filter_gamma0_avx2(
  987|  3.93k|        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
  988|  3.93k|        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
  989|  3.93k|        round_bits_const, wt);
  990|  5.18k|  else if (gamma != 0 && delta == 0)
  ------------------
  |  Branch (990:12): [True: 5.18k, False: 0]
  |  Branch (990:26): [True: 1.42k, False: 3.75k]
  ------------------
  991|  1.42k|    warp_vertical_filter_delta0_avx2(
  992|  1.42k|        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
  993|  1.42k|        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
  994|  1.42k|        round_bits_const, wt);
  995|  3.75k|  else
  996|  3.75k|    warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta,
  997|  3.75k|                              p_height, p_stride, p_width, i, j, sy4,
  998|  3.75k|                              reduce_bits_vert, res_add_const, round_bits,
  999|  3.75k|                              res_sub_const, round_bits_const, wt);
 1000|  14.7k|}
warp_plane_avx2.c:warp_vertical_filter_gamma0_delta0_avx2:
  931|  5.59k|    const __m256i *wt) {
  932|  5.59k|  (void)gamma;
  933|  5.59k|  int k, row = 0;
  934|  5.59k|  __m256i src[8], coeffs[8];
  935|  5.59k|  const __m256i src_0 = horz_out[0];
  936|  5.59k|  const __m256i src_1 =
  937|  5.59k|      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  938|  5.59k|  const __m256i src_2 = horz_out[1];
  939|  5.59k|  const __m256i src_3 =
  940|  5.59k|      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  941|  5.59k|  const __m256i src_4 = horz_out[2];
  942|  5.59k|  const __m256i src_5 =
  943|  5.59k|      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
  944|       |
  945|  5.59k|  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  946|  5.59k|  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  947|  5.59k|  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
  948|       |
  949|  5.59k|  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  950|  5.59k|  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  951|  5.59k|  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
  952|       |
  953|  5.59k|  prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs);
  954|       |
  955|  27.9k|  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
  ------------------
  |  |   34|  27.9k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 18.4k, False: 9.50k]
  |  |  ------------------
  ------------------
  |  Branch (955:16): [True: 22.3k, False: 5.59k]
  ------------------
  956|  22.3k|    __m256i res_lo, res_hi;
  957|  22.3k|    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
  958|  22.3k|                                    row);
  959|  22.3k|    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
  960|  22.3k|                                      res_sub_const, round_bits_const, pred,
  961|  22.3k|                                      conv_params, i, j, k, reduce_bits_vert,
  962|  22.3k|                                      p_stride, p_width, round_bits);
  963|  22.3k|    src[0] = src[2];
  964|  22.3k|    src[2] = src[4];
  965|  22.3k|    src[4] = src[6];
  966|  22.3k|    src[1] = src[3];
  967|  22.3k|    src[3] = src[5];
  968|  22.3k|    src[5] = src[7];
  969|  22.3k|    row += 1;
  970|  22.3k|  }
  971|  5.59k|}
warp_plane_avx2.c:prepare_vertical_filter_coeffs_gamma0_avx2:
  600|  21.3k|                                                              __m256i *coeffs) {
  601|  21.3k|  const __m128i filt_0 = _mm_loadu_si128(
  602|  21.3k|      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  21.3k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  21.3k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  21.3k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  603|  21.3k|  const __m128i filt_1 = _mm_loadu_si128(
  604|  21.3k|      (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  21.3k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  21.3k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  21.3k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  605|       |
  606|  21.3k|  __m256i res_0 =
  607|  21.3k|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1);
  608|       |
  609|  21.3k|  coeffs[0] = _mm256_shuffle_epi8(
  610|  21.3k|      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2));
  611|  21.3k|  coeffs[1] = _mm256_shuffle_epi8(
  612|  21.3k|      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2));
  613|  21.3k|  coeffs[2] = _mm256_shuffle_epi8(
  614|  21.3k|      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2));
  615|  21.3k|  coeffs[3] = _mm256_shuffle_epi8(
  616|  21.3k|      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2));
  617|       |
  618|  21.3k|  coeffs[4] = coeffs[0];
  619|  21.3k|  coeffs[5] = coeffs[1];
  620|  21.3k|  coeffs[6] = coeffs[2];
  621|  21.3k|  coeffs[7] = coeffs[3];
  622|  21.3k|}
warp_plane_avx2.c:filter_src_pixels_vertical_avx2:
  628|  58.8k|                                                   __m256i *res_hi, int row) {
  629|  58.8k|  const __m256i src_6 = horz_out[row + 3];
  630|  58.8k|  const __m256i src_7 =
  631|  58.8k|      _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21);
  632|       |
  633|  58.8k|  src[6] = _mm256_unpacklo_epi16(src_6, src_7);
  634|       |
  635|  58.8k|  const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]);
  636|  58.8k|  const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]);
  637|  58.8k|  const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]);
  638|  58.8k|  const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]);
  639|       |
  640|  58.8k|  const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_2),
  641|  58.8k|                                            _mm256_add_epi32(res_4, res_6));
  642|       |
  643|  58.8k|  src[7] = _mm256_unpackhi_epi16(src_6, src_7);
  644|       |
  645|  58.8k|  const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]);
  646|  58.8k|  const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]);
  647|  58.8k|  const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]);
  648|  58.8k|  const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]);
  649|       |
  650|  58.8k|  const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3),
  651|  58.8k|                                           _mm256_add_epi32(res_5, res_7));
  652|       |
  653|       |  // Rearrange pixels back into the order 0 ... 7
  654|  58.8k|  *res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
  655|  58.8k|  *res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
  656|  58.8k|}
warp_plane_avx2.c:store_vertical_filter_output_avx2:
  663|  58.8k|    const int round_bits) {
  664|  58.8k|  __m256i res_lo_1 = *res_lo;
  665|  58.8k|  __m256i res_hi_1 = *res_hi;
  666|       |
  667|  58.8k|  if (conv_params->is_compound) {
  ------------------
  |  Branch (667:7): [True: 7.64k, False: 51.1k]
  ------------------
  668|  7.64k|    __m128i *const p_0 =
  669|  7.64k|        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
  670|  7.64k|    __m128i *const p_1 =
  671|  7.64k|        (__m128i *)&conv_params
  672|  7.64k|            ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j];
  673|       |
  674|  7.64k|    res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const),
  675|  7.64k|                                 reduce_bits_vert);
  676|       |
  677|  7.64k|    const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1);
  678|  7.64k|    __m256i res_lo_16;
  679|  7.64k|    if (conv_params->do_average) {
  ------------------
  |  Branch (679:9): [True: 3.23k, False: 4.40k]
  ------------------
  680|  3.23k|      __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
  681|  3.23k|      __m128i *const dst8_1 =
  682|  3.23k|          (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
  683|  3.23k|      const __m128i p_16_0 = _mm_loadl_epi64(p_0);
  684|  3.23k|      const __m128i p_16_1 = _mm_loadl_epi64(p_1);
  685|  3.23k|      const __m256i p_16 =
  686|  3.23k|          _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1);
  687|  3.23k|      if (conv_params->use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (687:11): [True: 1.10k, False: 2.12k]
  ------------------
  688|  1.10k|        const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16);
  689|  1.10k|        const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt);
  690|  1.10k|        const __m256i shifted_32 =
  691|  1.10k|            _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
  ------------------
  |  |   76|  1.10k|#define DIST_PRECISION_BITS 4
  ------------------
  692|  1.10k|        res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32);
  693|  2.12k|      } else {
  694|  2.12k|        res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1);
  695|  2.12k|      }
  696|  3.23k|      res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const);
  697|  3.23k|      res_lo_16 = _mm256_srai_epi16(
  698|  3.23k|          _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits);
  699|  3.23k|      const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16);
  700|  3.23k|      const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo);
  701|  3.23k|      const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1);
  702|  3.23k|      *(int *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0);
  703|  3.23k|      *(int *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1);
  704|  4.40k|    } else {
  705|  4.40k|      const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16);
  706|  4.40k|      const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1);
  707|  4.40k|      _mm_storel_epi64(p_0, temp_lo_16_0);
  708|  4.40k|      _mm_storel_epi64(p_1, temp_lo_16_1);
  709|  4.40k|    }
  710|  7.64k|    if (p_width > 4) {
  ------------------
  |  Branch (710:9): [True: 7.64k, False: 0]
  ------------------
  711|  7.64k|      __m128i *const p4_0 =
  712|  7.64k|          (__m128i *)&conv_params
  713|  7.64k|              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
  714|  7.64k|      __m128i *const p4_1 =
  715|  7.64k|          (__m128i *)&conv_params
  716|  7.64k|              ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4];
  717|  7.64k|      res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const),
  718|  7.64k|                                   reduce_bits_vert);
  719|  7.64k|      const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1);
  720|  7.64k|      __m256i res_hi_16;
  721|  7.64k|      if (conv_params->do_average) {
  ------------------
  |  Branch (721:11): [True: 3.23k, False: 4.40k]
  ------------------
  722|  3.23k|        __m128i *const dst8_4_0 =
  723|  3.23k|            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
  724|  3.23k|        __m128i *const dst8_4_1 =
  725|  3.23k|            (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4];
  726|  3.23k|        const __m128i p4_16_0 = _mm_loadl_epi64(p4_0);
  727|  3.23k|        const __m128i p4_16_1 = _mm_loadl_epi64(p4_1);
  728|  3.23k|        const __m256i p4_16 = _mm256_inserti128_si256(
  729|  3.23k|            _mm256_castsi128_si256(p4_16_0), p4_16_1, 1);
  730|  3.23k|        if (conv_params->use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (730:13): [True: 1.10k, False: 2.12k]
  ------------------
  731|  1.10k|          const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16);
  732|  1.10k|          const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt);
  733|  1.10k|          const __m256i shifted_32 =
  734|  1.10k|              _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
  ------------------
  |  |   76|  1.10k|#define DIST_PRECISION_BITS 4
  ------------------
  735|  1.10k|          res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32);
  736|  2.12k|        } else {
  737|  2.12k|          res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1);
  738|  2.12k|        }
  739|  3.23k|        res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const);
  740|  3.23k|        res_hi_16 = _mm256_srai_epi16(
  741|  3.23k|            _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits);
  742|  3.23k|        __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16);
  743|  3.23k|        const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi);
  744|  3.23k|        const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1);
  745|  3.23k|        *(int *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0);
  746|  3.23k|        *(int *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1);
  747|  4.40k|      } else {
  748|  4.40k|        const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16);
  749|  4.40k|        const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1);
  750|  4.40k|        _mm_storel_epi64(p4_0, temp_hi_16_0);
  751|  4.40k|        _mm_storel_epi64(p4_1, temp_hi_16_1);
  752|  4.40k|      }
  753|  7.64k|    }
  754|  51.1k|  } else {
  755|  51.1k|    const __m256i res_lo_round = _mm256_srai_epi32(
  756|  51.1k|        _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
  757|  51.1k|    const __m256i res_hi_round = _mm256_srai_epi32(
  758|  51.1k|        _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
  759|       |
  760|  51.1k|    const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round);
  761|  51.1k|    const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit);
  762|  51.1k|    const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit);
  763|  51.1k|    const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1);
  764|       |
  765|       |    // Store, blending with 'pred' if needed
  766|  51.1k|    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
  767|  51.1k|    __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
  768|       |
  769|  51.1k|    if (p_width == 4) {
  ------------------
  |  Branch (769:9): [True: 0, False: 51.1k]
  ------------------
  770|      0|      *(int *)p = _mm_cvtsi128_si32(res_8bit0);
  771|      0|      *(int *)p1 = _mm_cvtsi128_si32(res_8bit1);
  772|  51.1k|    } else {
  773|  51.1k|      _mm_storel_epi64(p, res_8bit0);
  774|  51.1k|      _mm_storel_epi64(p1, res_8bit1);
  775|  51.1k|    }
  776|  51.1k|  }
  777|  58.8k|}
warp_plane_avx2.c:warp_vertical_filter_gamma0_avx2:
  834|  3.93k|    const __m256i *wt) {
  835|  3.93k|  (void)gamma;
  836|  3.93k|  int k, row = 0;
  837|  3.93k|  __m256i src[8];
  838|  3.93k|  const __m256i src_0 = horz_out[0];
  839|  3.93k|  const __m256i src_1 =
  840|  3.93k|      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  841|  3.93k|  const __m256i src_2 = horz_out[1];
  842|  3.93k|  const __m256i src_3 =
  843|  3.93k|      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  844|  3.93k|  const __m256i src_4 = horz_out[2];
  845|  3.93k|  const __m256i src_5 =
  846|  3.93k|      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
  847|       |
  848|  3.93k|  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  849|  3.93k|  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  850|  3.93k|  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
  851|       |
  852|  3.93k|  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  853|  3.93k|  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  854|  3.93k|  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
  855|       |
  856|  19.6k|  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
  ------------------
  |  |   34|  19.6k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 10.0k, False: 9.60k]
  |  |  ------------------
  ------------------
  |  Branch (856:16): [True: 15.7k, False: 3.93k]
  ------------------
  857|  15.7k|    int sy = sy4 + delta * (k + 4);
  858|  15.7k|    __m256i coeffs[8];
  859|  15.7k|    prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs);
  860|  15.7k|    __m256i res_lo, res_hi;
  861|  15.7k|    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
  862|  15.7k|                                    row);
  863|  15.7k|    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
  864|  15.7k|                                      res_sub_const, round_bits_const, pred,
  865|  15.7k|                                      conv_params, i, j, k, reduce_bits_vert,
  866|  15.7k|                                      p_stride, p_width, round_bits);
  867|  15.7k|    src[0] = src[2];
  868|  15.7k|    src[2] = src[4];
  869|  15.7k|    src[4] = src[6];
  870|  15.7k|    src[1] = src[3];
  871|  15.7k|    src[3] = src[5];
  872|  15.7k|    src[5] = src[7];
  873|  15.7k|    row += 1;
  874|  15.7k|  }
  875|  3.93k|}
warp_plane_avx2.c:warp_vertical_filter_delta0_avx2:
  883|  1.42k|    const __m256i *wt) {
  884|  1.42k|  (void)delta;
  885|  1.42k|  int k, row = 0;
  886|  1.42k|  __m256i src[8], coeffs[8];
  887|  1.42k|  const __m256i src_0 = horz_out[0];
  888|  1.42k|  const __m256i src_1 =
  889|  1.42k|      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  890|  1.42k|  const __m256i src_2 = horz_out[1];
  891|  1.42k|  const __m256i src_3 =
  892|  1.42k|      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  893|  1.42k|  const __m256i src_4 = horz_out[2];
  894|  1.42k|  const __m256i src_5 =
  895|  1.42k|      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
  896|       |
  897|  1.42k|  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  898|  1.42k|  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  899|  1.42k|  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
  900|       |
  901|  1.42k|  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  902|  1.42k|  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  903|  1.42k|  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
  904|       |
  905|  1.42k|  prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs);
  906|       |
  907|  7.12k|  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
  ------------------
  |  |   34|  7.12k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.58k, False: 1.53k]
  |  |  ------------------
  ------------------
  |  Branch (907:16): [True: 5.69k, False: 1.42k]
  ------------------
  908|  5.69k|    __m256i res_lo, res_hi;
  909|  5.69k|    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
  910|  5.69k|                                    row);
  911|  5.69k|    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
  912|  5.69k|                                      res_sub_const, round_bits_const, pred,
  913|  5.69k|                                      conv_params, i, j, k, reduce_bits_vert,
  914|  5.69k|                                      p_stride, p_width, round_bits);
  915|  5.69k|    src[0] = src[2];
  916|  5.69k|    src[2] = src[4];
  917|  5.69k|    src[4] = src[6];
  918|  5.69k|    src[1] = src[3];
  919|  5.69k|    src[3] = src[5];
  920|  5.69k|    src[5] = src[7];
  921|  5.69k|    row += 1;
  922|  5.69k|  }
  923|  1.42k|}
warp_plane_avx2.c:prepare_vertical_filter_coeffs_delta0_avx2:
  541|  1.42k|                                                              __m256i *coeffs) {
  542|  1.42k|  __m128i filt_00 =
  543|  1.42k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  544|  1.42k|                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  1.42k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  1.42k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  1.42k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  545|  1.42k|  __m128i filt_01 =
  546|  1.42k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  547|  1.42k|                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  1.42k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  1.42k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  1.42k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  548|  1.42k|  __m128i filt_02 =
  549|  1.42k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  550|  1.42k|                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  1.42k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  1.42k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  1.42k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  551|  1.42k|  __m128i filt_03 =
  552|  1.42k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  553|  1.42k|                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  1.42k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  1.42k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  1.42k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  554|       |
  555|  1.42k|  __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00);
  556|  1.42k|  __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01);
  557|  1.42k|  __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02);
  558|  1.42k|  __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03);
  559|       |
  560|  1.42k|  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  561|  1.42k|  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  562|  1.42k|  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  563|  1.42k|  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
  564|       |
  565|  1.42k|  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
  566|  1.42k|  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
  567|  1.42k|  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
  568|  1.42k|  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
  569|       |
  570|  1.42k|  filt_00 =
  571|  1.42k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  572|  1.42k|                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  1.42k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  1.42k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  1.42k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  573|  1.42k|  filt_01 =
  574|  1.42k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  575|  1.42k|                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  1.42k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  1.42k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  1.42k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  576|  1.42k|  filt_02 =
  577|  1.42k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  578|  1.42k|                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  1.42k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  1.42k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  1.42k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  579|  1.42k|  filt_03 =
  580|  1.42k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  581|  1.42k|                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  1.42k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  1.42k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  1.42k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  582|       |
  583|  1.42k|  filt_0 = _mm256_broadcastsi128_si256(filt_00);
  584|  1.42k|  filt_1 = _mm256_broadcastsi128_si256(filt_01);
  585|  1.42k|  filt_2 = _mm256_broadcastsi128_si256(filt_02);
  586|  1.42k|  filt_3 = _mm256_broadcastsi128_si256(filt_03);
  587|       |
  588|  1.42k|  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  589|  1.42k|  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  590|  1.42k|  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  591|  1.42k|  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
  592|       |
  593|  1.42k|  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
  594|  1.42k|  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
  595|  1.42k|  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
  596|  1.42k|  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
  597|  1.42k|}
warp_plane_avx2.c:warp_vertical_filter_avx2:
  785|  3.75k|    const __m256i *wt) {
  786|  3.75k|  int k, row = 0;
  787|  3.75k|  __m256i src[8];
  788|  3.75k|  const __m256i src_0 = horz_out[0];
  789|  3.75k|  const __m256i src_1 =
  790|  3.75k|      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  791|  3.75k|  const __m256i src_2 = horz_out[1];
  792|  3.75k|  const __m256i src_3 =
  793|  3.75k|      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  794|  3.75k|  const __m256i src_4 = horz_out[2];
  795|  3.75k|  const __m256i src_5 =
  796|  3.75k|      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
  797|       |
  798|  3.75k|  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  799|  3.75k|  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  800|  3.75k|  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
  801|       |
  802|  3.75k|  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  803|  3.75k|  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  804|  3.75k|  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
  805|       |
  806|  18.7k|  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
  ------------------
  |  |   34|  18.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 9.56k, False: 9.21k]
  |  |  ------------------
  ------------------
  |  Branch (806:16): [True: 15.0k, False: 3.75k]
  ------------------
  807|  15.0k|    int sy = sy4 + delta * (k + 4);
  808|  15.0k|    __m256i coeffs[8];
  809|  15.0k|    prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs);
  810|  15.0k|    __m256i res_lo, res_hi;
  811|  15.0k|    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
  812|  15.0k|                                    row);
  813|  15.0k|    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
  814|  15.0k|                                      res_sub_const, round_bits_const, pred,
  815|  15.0k|                                      conv_params, i, j, k, reduce_bits_vert,
  816|  15.0k|                                      p_stride, p_width, round_bits);
  817|  15.0k|    src[0] = src[2];
  818|  15.0k|    src[2] = src[4];
  819|  15.0k|    src[4] = src[6];
  820|  15.0k|    src[1] = src[3];
  821|  15.0k|    src[3] = src[5];
  822|  15.0k|    src[5] = src[7];
  823|       |
  824|  15.0k|    row += 1;
  825|  15.0k|  }
  826|  3.75k|}
warp_plane_avx2.c:prepare_vertical_filter_coeffs_avx2:
  448|  15.0k|                                                       __m256i *coeffs) {
  449|  15.0k|  __m128i filt_00 =
  450|  15.0k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  451|  15.0k|                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  15.0k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  15.0k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  15.0k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  452|  15.0k|  __m128i filt_01 =
  453|  15.0k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  454|  15.0k|                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  15.0k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  15.0k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  15.0k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  455|  15.0k|  __m128i filt_02 =
  456|  15.0k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  457|  15.0k|                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  15.0k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  15.0k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  15.0k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  458|  15.0k|  __m128i filt_03 =
  459|  15.0k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  460|  15.0k|                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  15.0k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  15.0k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  15.0k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  461|       |
  462|  15.0k|  __m128i filt_10 = _mm_loadu_si128(
  463|  15.0k|      (__m128i *)(av1_warped_filter +
  464|  15.0k|                  (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  15.0k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  15.0k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  15.0k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  465|  15.0k|  __m128i filt_11 = _mm_loadu_si128(
  466|  15.0k|      (__m128i *)(av1_warped_filter +
  467|  15.0k|                  (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  15.0k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  15.0k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  15.0k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  468|  15.0k|  __m128i filt_12 = _mm_loadu_si128(
  469|  15.0k|      (__m128i *)(av1_warped_filter +
  470|  15.0k|                  (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  15.0k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  15.0k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  15.0k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  471|  15.0k|  __m128i filt_13 = _mm_loadu_si128(
  472|  15.0k|      (__m128i *)(av1_warped_filter +
  473|  15.0k|                  (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  15.0k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  15.0k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  15.0k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  474|       |
  475|  15.0k|  __m256i filt_0 =
  476|  15.0k|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
  477|  15.0k|  __m256i filt_1 =
  478|  15.0k|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
  479|  15.0k|  __m256i filt_2 =
  480|  15.0k|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
  481|  15.0k|  __m256i filt_3 =
  482|  15.0k|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
  483|       |
  484|  15.0k|  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  485|  15.0k|  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  486|  15.0k|  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  487|  15.0k|  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
  488|       |
  489|  15.0k|  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
  490|  15.0k|  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
  491|  15.0k|  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
  492|  15.0k|  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
  493|       |
  494|  15.0k|  filt_00 =
  495|  15.0k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  496|  15.0k|                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  15.0k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  15.0k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  15.0k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  497|  15.0k|  filt_01 =
  498|  15.0k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  499|  15.0k|                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  15.0k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  15.0k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  15.0k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  500|  15.0k|  filt_02 =
  501|  15.0k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  502|  15.0k|                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  15.0k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  15.0k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  15.0k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  503|  15.0k|  filt_03 =
  504|  15.0k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  505|  15.0k|                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  15.0k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  15.0k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  15.0k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  506|       |
  507|  15.0k|  filt_10 = _mm_loadu_si128(
  508|  15.0k|      (__m128i *)(av1_warped_filter +
  509|  15.0k|                  (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  15.0k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  15.0k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  15.0k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  510|  15.0k|  filt_11 = _mm_loadu_si128(
  511|  15.0k|      (__m128i *)(av1_warped_filter +
  512|  15.0k|                  (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  15.0k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  15.0k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  15.0k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  513|  15.0k|  filt_12 = _mm_loadu_si128(
  514|  15.0k|      (__m128i *)(av1_warped_filter +
  515|  15.0k|                  (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  15.0k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  15.0k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  15.0k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  516|  15.0k|  filt_13 = _mm_loadu_si128(
  517|  15.0k|      (__m128i *)(av1_warped_filter +
  518|  15.0k|                  (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  15.0k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  15.0k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  15.0k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  519|       |
  520|  15.0k|  filt_0 =
  521|  15.0k|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
  522|  15.0k|  filt_1 =
  523|  15.0k|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
  524|  15.0k|  filt_2 =
  525|  15.0k|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
  526|  15.0k|  filt_3 =
  527|  15.0k|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
  528|       |
  529|  15.0k|  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  530|  15.0k|  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  531|  15.0k|  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  532|  15.0k|  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
  533|       |
  534|  15.0k|  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
  535|  15.0k|  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
  536|  15.0k|  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
  537|  15.0k|  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
  538|  15.0k|}

av1_wiener_convolve_add_src_avx2:
   48|  1.47k|                                      const WienerConvolveParams *conv_params) {
   49|  1.47k|  const int bd = 8;
   50|  1.47k|  assert(x_step_q4 == 16 && y_step_q4 == 16);
   51|  1.47k|  assert(!(w & 7));
   52|  1.47k|  (void)x_step_q4;
   53|  1.47k|  (void)y_step_q4;
   54|       |
   55|  1.47k|  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]);
  ------------------
  |  |   19|  1.47k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
   56|  1.47k|  int im_h = h + SUBPEL_TAPS - 2;
  ------------------
  |  |   26|  1.47k|#define SUBPEL_TAPS 8
  ------------------
   57|  1.47k|  int im_stride = 8;
   58|  1.47k|  memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE);
  ------------------
  |  |   32|  1.47k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  1.47k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
   59|  1.47k|  int i, j;
   60|  1.47k|  const int center_tap = (SUBPEL_TAPS - 1) / 2;
  ------------------
  |  |   26|  1.47k|#define SUBPEL_TAPS 8
  ------------------
   61|  1.47k|  const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
   62|       |
   63|  1.47k|  __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center;
   64|       |
   65|  1.47k|  assert(conv_params->round_0 > 0);
   66|       |
   67|  1.47k|  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
   68|  1.47k|  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
   69|  1.47k|  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
   70|  1.47k|  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
   71|       |
   72|  1.47k|  filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2);
   73|       |
   74|  1.47k|  const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x);
   75|  1.47k|  const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x);
   76|       |
   77|       |  // coeffs 0 1 0 1 0 1 0 1
   78|  1.47k|  coeffs_h[0] =
   79|  1.47k|      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u));
   80|       |  // coeffs 2 3 2 3 2 3 2 3
   81|  1.47k|  coeffs_h[1] =
   82|  1.47k|      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u));
   83|       |  // coeffs 4 5 4 5 4 5 4 5
   84|  1.47k|  coeffs_h[2] =
   85|  1.47k|      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u));
   86|       |  // coeffs 6 7 6 7 6 7 6 7
   87|  1.47k|  coeffs_h[3] =
   88|  1.47k|      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu));
   89|       |
   90|  1.47k|  const __m256i round_const_h =
   91|  1.47k|      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)));
   92|  1.47k|  const __m256i round_const_horz =
   93|  1.47k|      _mm256_set1_epi16((1 << (bd + FILTER_BITS - conv_params->round_0 - 1)));
  ------------------
  |  |   21|  1.47k|#define FILTER_BITS 7
  ------------------
   94|  1.47k|  const __m256i clamp_low = _mm256_setzero_si256();
   95|  1.47k|  const __m256i clamp_high =
   96|  1.47k|      _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
  ------------------
  |  |   43|  1.47k|#define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - r0))
  |  |  ------------------
  |  |  |  |   21|  1.47k|#define FILTER_BITS 7
  |  |  ------------------
  ------------------
   97|  1.47k|  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0);
   98|       |
   99|       |  // Add an offset to account for the "add_src" part of the convolve function.
  100|  1.47k|  const __m128i zero_128 = _mm_setzero_si128();
  101|  1.47k|  const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
  102|  1.47k|  const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0);
  103|       |
  104|  1.47k|  const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y);
  105|       |
  106|       |  // coeffs 0 1 0 1 0 1 0 1
  107|  1.47k|  coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00);
  108|       |  // coeffs 2 3 2 3 2 3 2 3
  109|  1.47k|  coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55);
  110|       |  // coeffs 4 5 4 5 4 5 4 5
  111|  1.47k|  coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa);
  112|       |  // coeffs 6 7 6 7 6 7 6 7
  113|  1.47k|  coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff);
  114|       |
  115|  1.47k|  const __m256i round_const_v =
  116|  1.47k|      _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
  117|  1.47k|                        (1 << (bd + conv_params->round_1 - 1)));
  118|  1.47k|  const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
  119|       |
  120|  12.2k|  for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (120:15): [True: 10.8k, False: 1.47k]
  ------------------
  121|   279k|    for (i = 0; i < im_h; i += 2) {
  ------------------
  |  Branch (121:17): [True: 268k, False: 10.8k]
  ------------------
  122|   268k|      __m256i data = _mm256_castsi128_si256(
  123|   268k|          _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
  124|       |
  125|       |      // Load the next line
  126|   268k|      if (i + 1 < im_h)
  ------------------
  |  Branch (126:11): [True: 288k, False: 18.4E]
  ------------------
  127|   288k|        data = _mm256_inserti128_si256(
  128|   268k|            data,
  129|   268k|            _mm_loadu_si128(
  130|   268k|                (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
  131|   268k|            1);
  132|       |
  133|   268k|      __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
  134|       |
  135|   268k|      res =
  136|   268k|          _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
  137|       |
  138|   268k|      __m256i data_0 = _mm256_shuffle_epi8(data, filt_center);
  139|       |
  140|       |      // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to
  141|       |      // the result
  142|   268k|      data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0);
  ------------------
  |  |   21|   268k|#define FILTER_BITS 7
  ------------------
  143|   268k|      res = _mm256_add_epi16(res, data_0);
  144|   268k|      res = _mm256_add_epi16(res, round_const_horz);
  145|   268k|      const __m256i res_clamped =
  146|   268k|          _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
  147|   268k|      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped);
  148|   268k|    }
  149|       |
  150|       |    /* Vertical filter */
  151|  10.8k|    {
  152|  10.8k|      __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
  153|  10.8k|      __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
  154|  10.8k|      __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
  155|  10.8k|      __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
  156|  10.8k|      __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
  157|  10.8k|      __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
  158|       |
  159|  10.8k|      __m256i s[8];
  160|  10.8k|      s[0] = _mm256_unpacklo_epi16(src_0, src_1);
  161|  10.8k|      s[1] = _mm256_unpacklo_epi16(src_2, src_3);
  162|  10.8k|      s[2] = _mm256_unpacklo_epi16(src_4, src_5);
  163|       |
  164|  10.8k|      s[4] = _mm256_unpackhi_epi16(src_0, src_1);
  165|  10.8k|      s[5] = _mm256_unpackhi_epi16(src_2, src_3);
  166|  10.8k|      s[6] = _mm256_unpackhi_epi16(src_4, src_5);
  167|       |
  168|   268k|      for (i = 0; i < h - 1; i += 2) {
  ------------------
  |  Branch (168:19): [True: 257k, False: 10.8k]
  ------------------
  169|   257k|        const int16_t *data = &im_block[i * im_stride];
  170|       |
  171|   257k|        const __m256i s6 =
  172|   257k|            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
  173|   257k|        const __m256i s7 =
  174|   257k|            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
  175|       |
  176|   257k|        s[3] = _mm256_unpacklo_epi16(s6, s7);
  177|   257k|        s[7] = _mm256_unpackhi_epi16(s6, s7);
  178|       |
  179|   257k|        __m256i res_a = convolve(s, coeffs_v);
  180|   257k|        __m256i res_b = convolve(s + 4, coeffs_v);
  181|       |
  182|   257k|        const __m256i res_a_round = _mm256_sra_epi32(
  183|   257k|            _mm256_add_epi32(res_a, round_const_v), round_shift_v);
  184|   257k|        const __m256i res_b_round = _mm256_sra_epi32(
  185|   257k|            _mm256_add_epi32(res_b, round_const_v), round_shift_v);
  186|       |
  187|       |        /* rounding code */
  188|       |        // 16 bit conversion
  189|   257k|        const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
  190|       |        // 8 bit conversion and saturation to uint8
  191|   257k|        const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
  192|       |
  193|   257k|        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
  194|   257k|        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
  195|       |
  196|       |        // Store values into the destination buffer
  197|   257k|        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
  198|   257k|        __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
  199|       |
  200|   257k|        _mm_storel_epi64(p_0, res_0);
  201|   257k|        _mm_storel_epi64(p_1, res_1);
  202|       |
  203|   257k|        s[0] = s[1];
  204|   257k|        s[1] = s[2];
  205|   257k|        s[2] = s[3];
  206|       |
  207|   257k|        s[4] = s[5];
  208|   257k|        s[5] = s[6];
  209|   257k|        s[6] = s[7];
  210|   257k|      }
  211|  10.8k|      if (h - i) {
  ------------------
  |  Branch (211:11): [True: 260, False: 10.5k]
  ------------------
  212|    260|        s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20);
  213|    260|        s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20);
  214|    260|        s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20);
  215|       |
  216|    260|        const int16_t *data = &im_block[i * im_stride];
  217|    260|        const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride));
  218|    260|        const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
  219|       |
  220|    260|        __m128i s3 = _mm_unpacklo_epi16(s6_, s7_);
  221|    260|        __m128i s7 = _mm_unpackhi_epi16(s6_, s7_);
  222|       |
  223|    260|        s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1);
  224|    260|        __m256i convolveres = convolve(s, coeffs_v);
  225|       |
  226|    260|        const __m256i res_round = _mm256_sra_epi32(
  227|    260|            _mm256_add_epi32(convolveres, round_const_v), round_shift_v);
  228|       |
  229|       |        /* rounding code */
  230|       |        // 16 bit conversion
  231|    260|        __m128i reslo = _mm256_castsi256_si128(res_round);
  232|    260|        __m128i reshi = _mm256_extracti128_si256(res_round, 1);
  233|    260|        const __m128i res_16bit = _mm_packus_epi32(reslo, reshi);
  234|       |
  235|       |        // 8 bit conversion and saturation to uint8
  236|    260|        const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit);
  237|    260|        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
  238|    260|        _mm_storel_epi64(p_0, res_8b);
  239|    260|      }
  240|  10.8k|    }
  241|  10.8k|  }
  242|  1.47k|}

av1_check_trailing_bits:
   89|  18.8k|int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
   90|       |  // bit_offset is set to 0 (mod 8) when the reader is already byte aligned
   91|  18.8k|  int bits_before_alignment = 8 - rb->bit_offset % 8;
   92|  18.8k|  int trailing = aom_rb_read_literal(rb, bits_before_alignment);
   93|  18.8k|  if (trailing != (1 << (bits_before_alignment - 1))) {
  ------------------
  |  Branch (93:7): [True: 63, False: 18.8k]
  ------------------
   94|     63|    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
   95|     63|    return -1;
   96|     63|  }
   97|  18.8k|  return 0;
   98|  18.8k|}
av1_dec_row_mt_dealloc:
 2542|  30.7k|void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync) {
 2543|  30.7k|  if (dec_row_mt_sync != NULL) {
  ------------------
  |  Branch (2543:7): [True: 30.7k, False: 0]
  ------------------
 2544|  30.7k|#if CONFIG_MULTITHREAD
 2545|  30.7k|    int i;
 2546|  30.7k|    if (dec_row_mt_sync->mutex_ != NULL) {
  ------------------
  |  Branch (2546:9): [True: 11.1k, False: 19.6k]
  ------------------
 2547|  36.5k|      for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) {
  ------------------
  |  Branch (2547:19): [True: 25.4k, False: 11.1k]
  ------------------
 2548|  25.4k|        pthread_mutex_destroy(&dec_row_mt_sync->mutex_[i]);
 2549|  25.4k|      }
 2550|  11.1k|      aom_free(dec_row_mt_sync->mutex_);
 2551|  11.1k|    }
 2552|  30.7k|    if (dec_row_mt_sync->cond_ != NULL) {
  ------------------
  |  Branch (2552:9): [True: 11.1k, False: 19.6k]
  ------------------
 2553|  36.5k|      for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) {
  ------------------
  |  Branch (2553:19): [True: 25.4k, False: 11.1k]
  ------------------
 2554|  25.4k|        pthread_cond_destroy(&dec_row_mt_sync->cond_[i]);
 2555|  25.4k|      }
 2556|  11.1k|      aom_free(dec_row_mt_sync->cond_);
 2557|  11.1k|    }
 2558|  30.7k|#endif  // CONFIG_MULTITHREAD
 2559|  30.7k|    aom_free(dec_row_mt_sync->cur_sb_col);
 2560|       |
 2561|       |    // clear the structure as the source of this call may be a resize in which
 2562|       |    // case this call will be followed by an _alloc() which may fail.
 2563|  30.7k|    av1_zero(*dec_row_mt_sync);
  ------------------
  |  |   43|  30.7k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 2564|  30.7k|  }
 2565|  30.7k|}
av1_free_mc_tmp_buf:
 3378|   597k|void av1_free_mc_tmp_buf(ThreadData *thread_data) {
 3379|   597k|  int ref;
 3380|  1.79M|  for (ref = 0; ref < 2; ref++) {
  ------------------
  |  Branch (3380:17): [True: 1.19M, False: 597k]
  ------------------
 3381|  1.19M|    if (thread_data->mc_buf_use_highbd)
  ------------------
  |  Branch (3381:9): [True: 178k, False: 1.01M]
  ------------------
 3382|   178k|      aom_free(CONVERT_TO_SHORTPTR(thread_data->mc_buf[ref]));
  ------------------
  |  |   75|   178k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 3383|  1.01M|    else
 3384|  1.01M|      aom_free(thread_data->mc_buf[ref]);
 3385|  1.19M|    thread_data->mc_buf[ref] = NULL;
 3386|  1.19M|  }
 3387|   597k|  thread_data->mc_buf_size = 0;
 3388|   597k|  thread_data->mc_buf_use_highbd = 0;
 3389|       |
 3390|   597k|  aom_free(thread_data->tmp_conv_dst);
 3391|   597k|  thread_data->tmp_conv_dst = NULL;
 3392|   597k|  aom_free(thread_data->seg_mask);
 3393|   597k|  thread_data->seg_mask = NULL;
 3394|  1.79M|  for (int i = 0; i < 2; ++i) {
  ------------------
  |  Branch (3394:19): [True: 1.19M, False: 597k]
  ------------------
 3395|  1.19M|    aom_free(thread_data->tmp_obmc_bufs[i]);
 3396|       |    thread_data->tmp_obmc_bufs[i] = NULL;
 3397|  1.19M|  }
 3398|   597k|}
av1_read_color_config:
 4086|  18.5k|                           struct aom_internal_error_info *error_info) {
 4087|  18.5k|  read_bitdepth(rb, seq_params, error_info);
 4088|       |
 4089|  18.5k|  seq_params->use_highbitdepth =
 4090|  18.5k|      seq_params->bit_depth > AOM_BITS_8 || !allow_lowbitdepth;
  ------------------
  |  Branch (4090:7): [True: 7.07k, False: 11.5k]
  |  Branch (4090:45): [True: 0, False: 11.5k]
  ------------------
 4091|       |  // monochrome bit (not needed for PROFILE_1)
 4092|  18.5k|  const int is_monochrome =
 4093|  18.5k|      seq_params->profile != PROFILE_1 ? aom_rb_read_bit(rb) : 0;
  ------------------
  |  Branch (4093:7): [True: 8.60k, False: 9.98k]
  ------------------
 4094|  18.5k|  seq_params->monochrome = is_monochrome;
 4095|  18.5k|  int color_description_present_flag = aom_rb_read_bit(rb);
 4096|  18.5k|  if (color_description_present_flag) {
  ------------------
  |  Branch (4096:7): [True: 10.1k, False: 8.46k]
  ------------------
 4097|  10.1k|    seq_params->color_primaries = aom_rb_read_literal(rb, 8);
 4098|  10.1k|    seq_params->transfer_characteristics = aom_rb_read_literal(rb, 8);
 4099|  10.1k|    seq_params->matrix_coefficients = aom_rb_read_literal(rb, 8);
 4100|  10.1k|  } else {
 4101|  8.46k|    seq_params->color_primaries = AOM_CICP_CP_UNSPECIFIED;
 4102|  8.46k|    seq_params->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED;
 4103|  8.46k|    seq_params->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED;
 4104|  8.46k|  }
 4105|  18.5k|  if (is_monochrome) {
  ------------------
  |  Branch (4105:7): [True: 4.10k, False: 14.4k]
  ------------------
 4106|       |    // [16,235] (including xvycc) vs [0,255] range
 4107|  4.10k|    seq_params->color_range = aom_rb_read_bit(rb);
 4108|  4.10k|    seq_params->subsampling_y = seq_params->subsampling_x = 1;
 4109|  4.10k|    seq_params->chroma_sample_position = AOM_CSP_UNKNOWN;
 4110|  4.10k|    seq_params->separate_uv_delta_q = 0;
 4111|  4.10k|    return;
 4112|  4.10k|  }
 4113|  14.4k|  if (seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
  ------------------
  |  Branch (4113:7): [True: 6.80k, False: 7.68k]
  ------------------
 4114|  6.80k|      seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
  ------------------
  |  Branch (4114:7): [True: 4.26k, False: 2.54k]
  ------------------
 4115|  4.26k|      seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
  ------------------
  |  Branch (4115:7): [True: 5, False: 4.25k]
  ------------------
 4116|      5|    seq_params->subsampling_y = seq_params->subsampling_x = 0;
 4117|      5|    seq_params->color_range = 1;  // assume full color-range
 4118|      5|    if (!(seq_params->profile == PROFILE_1 ||
  ------------------
  |  Branch (4118:11): [True: 3, False: 2]
  ------------------
 4119|      2|          (seq_params->profile == PROFILE_2 &&
  ------------------
  |  Branch (4119:12): [True: 0, False: 2]
  ------------------
 4120|      2|           seq_params->bit_depth == AOM_BITS_12))) {
  ------------------
  |  Branch (4120:12): [True: 0, False: 0]
  ------------------
 4121|      2|      aom_internal_error(
 4122|      2|          error_info, AOM_CODEC_UNSUP_BITSTREAM,
 4123|      2|          "sRGB colorspace not compatible with specified profile");
 4124|      2|    }
 4125|  14.4k|  } else {
 4126|       |    // [16,235] (including xvycc) vs [0,255] range
 4127|  14.4k|    seq_params->color_range = aom_rb_read_bit(rb);
 4128|  14.4k|    if (seq_params->profile == PROFILE_0) {
  ------------------
  |  Branch (4128:9): [True: 3.27k, False: 11.2k]
  ------------------
 4129|       |      // 420 only
 4130|  3.27k|      seq_params->subsampling_x = seq_params->subsampling_y = 1;
 4131|  11.2k|    } else if (seq_params->profile == PROFILE_1) {
  ------------------
  |  Branch (4131:16): [True: 9.97k, False: 1.22k]
  ------------------
 4132|       |      // 444 only
 4133|  9.97k|      seq_params->subsampling_x = seq_params->subsampling_y = 0;
 4134|  9.97k|    } else {
 4135|  1.22k|      assert(seq_params->profile == PROFILE_2);
 4136|  1.22k|      if (seq_params->bit_depth == AOM_BITS_12) {
  ------------------
  |  Branch (4136:11): [True: 838, False: 390]
  ------------------
 4137|    838|        seq_params->subsampling_x = aom_rb_read_bit(rb);
 4138|    838|        if (seq_params->subsampling_x)
  ------------------
  |  Branch (4138:13): [True: 835, False: 3]
  ------------------
 4139|    835|          seq_params->subsampling_y = aom_rb_read_bit(rb);  // 422 or 420
 4140|      3|        else
 4141|      3|          seq_params->subsampling_y = 0;  // 444
 4142|    838|      } else {
 4143|       |        // 422
 4144|    390|        seq_params->subsampling_x = 1;
 4145|    390|        seq_params->subsampling_y = 0;
 4146|    390|      }
 4147|  1.22k|    }
 4148|  14.4k|    if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY &&
  ------------------
  |  Branch (4148:9): [True: 3, False: 14.4k]
  ------------------
 4149|      3|        (seq_params->subsampling_x || seq_params->subsampling_y)) {
  ------------------
  |  Branch (4149:10): [True: 0, False: 3]
  |  Branch (4149:39): [True: 0, False: 3]
  ------------------
 4150|      0|      aom_internal_error(
 4151|      0|          error_info, AOM_CODEC_UNSUP_BITSTREAM,
 4152|      0|          "Identity CICP Matrix incompatible with non 4:4:4 color sampling");
 4153|      0|    }
 4154|  14.4k|    if (seq_params->subsampling_x && seq_params->subsampling_y) {
  ------------------
  |  Branch (4154:9): [True: 4.48k, False: 9.99k]
  |  Branch (4154:38): [True: 3.27k, False: 1.20k]
  ------------------
 4155|  3.27k|      seq_params->chroma_sample_position = aom_rb_read_literal(rb, 2);
 4156|  3.27k|    }
 4157|  14.4k|  }
 4158|  14.4k|  seq_params->separate_uv_delta_q = aom_rb_read_bit(rb);
 4159|  14.4k|}
av1_read_timing_info_header:
 4163|     84|                                 struct aom_read_bit_buffer *rb) {
 4164|     84|  timing_info->num_units_in_display_tick =
 4165|     84|      aom_rb_read_unsigned_literal(rb,
 4166|     84|                                   32);  // Number of units in a display tick
 4167|     84|  timing_info->time_scale = aom_rb_read_unsigned_literal(rb, 32);  // Time scale
 4168|     84|  if (timing_info->num_units_in_display_tick == 0 ||
  ------------------
  |  Branch (4168:7): [True: 21, False: 63]
  ------------------
 4169|     63|      timing_info->time_scale == 0) {
  ------------------
  |  Branch (4169:7): [True: 0, False: 63]
  ------------------
 4170|      0|    aom_internal_error(
 4171|      0|        error, AOM_CODEC_UNSUP_BITSTREAM,
 4172|      0|        "num_units_in_display_tick and time_scale must be greater than 0.");
 4173|      0|  }
 4174|     84|  timing_info->equal_picture_interval =
 4175|     84|      aom_rb_read_bit(rb);  // Equal picture interval bit
 4176|     84|  if (timing_info->equal_picture_interval) {
  ------------------
  |  Branch (4176:7): [True: 27, False: 57]
  ------------------
 4177|     27|    const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb);
 4178|     27|    if (num_ticks_per_picture_minus_1 == UINT32_MAX) {
  ------------------
  |  Branch (4178:9): [True: 0, False: 27]
  ------------------
 4179|      0|      aom_internal_error(
 4180|      0|          error, AOM_CODEC_UNSUP_BITSTREAM,
 4181|      0|          "num_ticks_per_picture_minus_1 cannot be (1 << 32) - 1.");
 4182|      0|    }
 4183|     27|    timing_info->num_ticks_per_picture = num_ticks_per_picture_minus_1 + 1;
 4184|     27|  }
 4185|     84|}
av1_read_decoder_model_info:
 4188|     21|                                 struct aom_read_bit_buffer *rb) {
 4189|     21|  decoder_model_info->encoder_decoder_buffer_delay_length =
 4190|     21|      aom_rb_read_literal(rb, 5) + 1;
 4191|     21|  decoder_model_info->num_units_in_decoding_tick =
 4192|     21|      aom_rb_read_unsigned_literal(rb,
 4193|     21|                                   32);  // Number of units in a decoding tick
 4194|     21|  decoder_model_info->buffer_removal_time_length =
 4195|     21|      aom_rb_read_literal(rb, 5) + 1;
 4196|     21|  decoder_model_info->frame_presentation_time_length =
 4197|     21|      aom_rb_read_literal(rb, 5) + 1;
 4198|     21|}
av1_read_sequence_header:
 4217|  18.5k|                              SequenceHeader *seq_params) {
 4218|  18.5k|  const int num_bits_width = aom_rb_read_literal(rb, 4) + 1;
 4219|  18.5k|  const int num_bits_height = aom_rb_read_literal(rb, 4) + 1;
 4220|  18.5k|  const int max_frame_width = aom_rb_read_literal(rb, num_bits_width) + 1;
 4221|  18.5k|  const int max_frame_height = aom_rb_read_literal(rb, num_bits_height) + 1;
 4222|       |
 4223|  18.5k|  seq_params->num_bits_width = num_bits_width;
 4224|  18.5k|  seq_params->num_bits_height = num_bits_height;
 4225|  18.5k|  seq_params->max_frame_width = max_frame_width;
 4226|  18.5k|  seq_params->max_frame_height = max_frame_height;
 4227|       |
 4228|  18.5k|  if (seq_params->reduced_still_picture_hdr) {
  ------------------
  |  Branch (4228:7): [True: 9.08k, False: 9.51k]
  ------------------
 4229|  9.08k|    seq_params->frame_id_numbers_present_flag = 0;
 4230|  9.51k|  } else {
 4231|  9.51k|    seq_params->frame_id_numbers_present_flag = aom_rb_read_bit(rb);
 4232|  9.51k|  }
 4233|  18.5k|  if (seq_params->frame_id_numbers_present_flag) {
  ------------------
  |  Branch (4233:7): [True: 21, False: 18.5k]
  ------------------
 4234|       |    // We must always have delta_frame_id_length < frame_id_length,
 4235|       |    // in order for a frame to be referenced with a unique delta.
 4236|       |    // Avoid wasting bits by using a coding that enforces this restriction.
 4237|     21|    seq_params->delta_frame_id_length = aom_rb_read_literal(rb, 4) + 2;
 4238|     21|    seq_params->frame_id_length =
 4239|     21|        aom_rb_read_literal(rb, 3) + seq_params->delta_frame_id_length + 1;
 4240|     21|    if (seq_params->frame_id_length > 16)
  ------------------
  |  Branch (4240:9): [True: 7, False: 14]
  ------------------
 4241|      7|      aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 4242|      7|                         "Invalid frame_id_length");
 4243|     21|  }
 4244|       |
 4245|  18.5k|  setup_sb_size(seq_params, rb);
 4246|       |
 4247|  18.5k|  seq_params->enable_filter_intra = aom_rb_read_bit(rb);
 4248|  18.5k|  seq_params->enable_intra_edge_filter = aom_rb_read_bit(rb);
 4249|       |
 4250|  18.5k|  if (seq_params->reduced_still_picture_hdr) {
  ------------------
  |  Branch (4250:7): [True: 9.08k, False: 9.51k]
  ------------------
 4251|  9.08k|    seq_params->enable_interintra_compound = 0;
 4252|  9.08k|    seq_params->enable_masked_compound = 0;
 4253|  9.08k|    seq_params->enable_warped_motion = 0;
 4254|  9.08k|    seq_params->enable_dual_filter = 0;
 4255|  9.08k|    seq_params->order_hint_info.enable_order_hint = 0;
 4256|  9.08k|    seq_params->order_hint_info.enable_dist_wtd_comp = 0;
 4257|  9.08k|    seq_params->order_hint_info.enable_ref_frame_mvs = 0;
 4258|  9.08k|    seq_params->force_screen_content_tools = 2;  // SELECT_SCREEN_CONTENT_TOOLS
 4259|  9.08k|    seq_params->force_integer_mv = 2;            // SELECT_INTEGER_MV
 4260|  9.08k|    seq_params->order_hint_info.order_hint_bits_minus_1 = -1;
 4261|  9.51k|  } else {
 4262|  9.51k|    seq_params->enable_interintra_compound = aom_rb_read_bit(rb);
 4263|  9.51k|    seq_params->enable_masked_compound = aom_rb_read_bit(rb);
 4264|  9.51k|    seq_params->enable_warped_motion = aom_rb_read_bit(rb);
 4265|  9.51k|    seq_params->enable_dual_filter = aom_rb_read_bit(rb);
 4266|       |
 4267|  9.51k|    seq_params->order_hint_info.enable_order_hint = aom_rb_read_bit(rb);
 4268|  9.51k|    seq_params->order_hint_info.enable_dist_wtd_comp =
 4269|  9.51k|        seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0;
  ------------------
  |  Branch (4269:9): [True: 9.45k, False: 66]
  ------------------
 4270|  9.51k|    seq_params->order_hint_info.enable_ref_frame_mvs =
 4271|  9.51k|        seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0;
  ------------------
  |  Branch (4271:9): [True: 9.45k, False: 66]
  ------------------
 4272|       |
 4273|  9.51k|    if (aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (4273:9): [True: 9.37k, False: 138]
  ------------------
 4274|  9.37k|      seq_params->force_screen_content_tools =
 4275|  9.37k|          2;  // SELECT_SCREEN_CONTENT_TOOLS
 4276|  9.37k|    } else {
 4277|    138|      seq_params->force_screen_content_tools = aom_rb_read_bit(rb);
 4278|    138|    }
 4279|       |
 4280|  9.51k|    if (seq_params->force_screen_content_tools > 0) {
  ------------------
  |  Branch (4280:9): [True: 9.41k, False: 105]
  ------------------
 4281|  9.41k|      if (aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (4281:11): [True: 9.37k, False: 32]
  ------------------
 4282|  9.37k|        seq_params->force_integer_mv = 2;  // SELECT_INTEGER_MV
 4283|  9.37k|      } else {
 4284|     32|        seq_params->force_integer_mv = aom_rb_read_bit(rb);
 4285|     32|      }
 4286|  9.41k|    } else {
 4287|    105|      seq_params->force_integer_mv = 2;  // SELECT_INTEGER_MV
 4288|    105|    }
 4289|  9.51k|    seq_params->order_hint_info.order_hint_bits_minus_1 =
 4290|  9.51k|        seq_params->order_hint_info.enable_order_hint
  ------------------
  |  Branch (4290:9): [True: 9.45k, False: 66]
  ------------------
 4291|  9.51k|            ? aom_rb_read_literal(rb, 3)
 4292|  9.51k|            : -1;
 4293|  9.51k|  }
 4294|       |
 4295|  18.5k|  seq_params->enable_superres = aom_rb_read_bit(rb);
 4296|  18.5k|  seq_params->enable_cdef = aom_rb_read_bit(rb);
 4297|  18.5k|  seq_params->enable_restoration = aom_rb_read_bit(rb);
 4298|  18.5k|}
av1_init_read_bit_buffer:
 5150|  63.2k|    const uint8_t *data_end) {
 5151|  63.2k|  rb->bit_offset = 0;
 5152|  63.2k|  rb->error_handler = error_handler;
 5153|  63.2k|  rb->error_handler_data = &pbi->common;
 5154|  63.2k|  rb->bit_buffer = data;
 5155|  63.2k|  rb->bit_buffer_end = data_end;
 5156|  63.2k|  return rb;
 5157|  63.2k|}
av1_read_profile:
 5159|  56.0k|BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb) {
 5160|  56.0k|  int profile = aom_rb_read_literal(rb, PROFILE_BITS);
  ------------------
  |  |   79|  56.0k|#define PROFILE_BITS 3
  ------------------
 5161|  56.0k|  return (BITSTREAM_PROFILE)profile;
 5162|  56.0k|}
av1_decode_frame_headers_and_setup:
 5176|  26.8k|                                            int trailing_bits_present) {
 5177|  26.8k|  AV1_COMMON *const cm = &pbi->common;
 5178|  26.8k|  const int num_planes = av1_num_planes(cm);
 5179|  26.8k|  MACROBLOCKD *const xd = &pbi->dcb.xd;
 5180|       |
 5181|       |#if CONFIG_BITSTREAM_DEBUG
 5182|       |  if (cm->seq_params->order_hint_info.enable_order_hint) {
 5183|       |    aom_bitstream_queue_set_frame_read(cm->current_frame.order_hint * 2 +
 5184|       |                                       cm->show_frame);
 5185|       |  } else {
 5186|       |    // This is currently used in RTC encoding. cm->show_frame is always 1.
 5187|       |    assert(cm->show_frame);
 5188|       |    aom_bitstream_queue_set_frame_read(cm->current_frame.frame_number);
 5189|       |  }
 5190|       |#endif
 5191|       |#if CONFIG_MISMATCH_DEBUG
 5192|       |  mismatch_move_frame_idx_r();
 5193|       |#endif
 5194|       |
 5195|   215k|  for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
  ------------------
  |  Branch (5195:28): [True: 188k, False: 26.8k]
  ------------------
 5196|   188k|    cm->global_motion[i] = default_warp_params;
 5197|   188k|    cm->cur_frame->global_motion[i] = default_warp_params;
 5198|   188k|  }
 5199|  26.8k|  xd->global_motion = cm->global_motion;
 5200|       |
 5201|  26.8k|  read_uncompressed_header(pbi, rb);
 5202|       |
 5203|  26.8k|  if (trailing_bits_present) av1_check_trailing_bits(pbi, rb);
  ------------------
  |  Branch (5203:7): [True: 292, False: 26.5k]
  ------------------
 5204|       |
 5205|  26.8k|  if (!cm->tiles.single_tile_decoding &&
  ------------------
  |  Branch (5205:7): [True: 26.3k, False: 501]
  ------------------
 5206|  26.3k|      (pbi->dec_tile_row >= 0 || pbi->dec_tile_col >= 0)) {
  ------------------
  |  Branch (5206:8): [True: 0, False: 26.3k]
  |  Branch (5206:34): [True: 0, False: 26.3k]
  ------------------
 5207|      0|    pbi->dec_tile_row = -1;
 5208|      0|    pbi->dec_tile_col = -1;
 5209|      0|  }
 5210|       |
 5211|  26.8k|  const uint32_t uncomp_hdr_size =
 5212|  26.8k|      (uint32_t)aom_rb_bytes_read(rb);  // Size of the uncompressed header
 5213|  26.8k|  YV12_BUFFER_CONFIG *new_fb = &cm->cur_frame->buf;
 5214|  26.8k|  xd->cur_buf = new_fb;
 5215|  26.8k|  if (av1_allow_intrabc(cm)) {
  ------------------
  |  Branch (5215:7): [True: 1.67k, False: 25.2k]
  ------------------
 5216|  1.67k|    av1_setup_scale_factors_for_frame(
 5217|  1.67k|        &cm->sf_identity, xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height,
 5218|  1.67k|        xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height);
 5219|  1.67k|  }
 5220|       |
 5221|       |  // Showing a frame directly.
 5222|  26.8k|  if (cm->show_existing_frame) {
  ------------------
  |  Branch (5222:7): [True: 252, False: 26.6k]
  ------------------
 5223|    252|    if (pbi->reset_decoder_state) {
  ------------------
  |  Branch (5223:9): [True: 0, False: 252]
  ------------------
 5224|       |      // Use the default frame context values.
 5225|      0|      *cm->fc = *cm->default_frame_context;
 5226|      0|      if (!cm->fc->initialized)
  ------------------
  |  Branch (5226:11): [True: 0, False: 0]
  ------------------
 5227|      0|        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 5228|      0|                           "Uninitialized entropy context.");
 5229|      0|    }
 5230|    252|    return uncomp_hdr_size;
 5231|    252|  }
 5232|       |
 5233|  26.6k|  cm->mi_params.setup_mi(&cm->mi_params);
 5234|       |
 5235|  26.6k|  av1_calculate_ref_frame_side(cm);
 5236|  26.6k|  if (cm->features.allow_ref_frame_mvs) av1_setup_motion_field(cm);
  ------------------
  |  Branch (5236:7): [True: 6.84k, False: 19.7k]
  ------------------
 5237|       |
 5238|  26.6k|  av1_setup_block_planes(xd, cm->seq_params->subsampling_x,
 5239|  26.6k|                         cm->seq_params->subsampling_y, num_planes);
 5240|  26.6k|  if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) {
  ------------------
  |  |   66|  26.6k|#define PRIMARY_REF_NONE 7
  ------------------
  |  Branch (5240:7): [True: 22.5k, False: 4.03k]
  ------------------
 5241|       |    // use the default frame context values
 5242|  22.5k|    *cm->fc = *cm->default_frame_context;
 5243|  22.5k|  } else {
 5244|  4.03k|    *cm->fc = get_primary_ref_frame_buf(cm)->frame_context;
 5245|  4.03k|  }
 5246|  26.6k|  if (!cm->fc->initialized)
  ------------------
  |  Branch (5246:7): [True: 0, False: 26.6k]
  ------------------
 5247|      0|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 5248|      0|                       "Uninitialized entropy context.");
 5249|       |
 5250|  26.6k|  pbi->dcb.corrupted = 0;
 5251|  26.6k|  return uncomp_hdr_size;
 5252|  26.8k|}
av1_decode_tg_tiles_and_wrapup:
 5278|  25.8k|                                    int end_tile, int initialize_flag) {
 5279|  25.8k|  AV1_COMMON *const cm = &pbi->common;
 5280|  25.8k|  CommonTileParams *const tiles = &cm->tiles;
 5281|  25.8k|  MACROBLOCKD *const xd = &pbi->dcb.xd;
 5282|  25.8k|  const int tile_count_tg = end_tile - start_tile + 1;
 5283|       |
 5284|  25.8k|  xd->error_info = cm->error;
 5285|  25.8k|  if (initialize_flag) setup_frame_info(pbi);
  ------------------
  |  Branch (5285:7): [True: 25.8k, False: 16]
  ------------------
 5286|  25.8k|  const int num_planes = av1_num_planes(cm);
 5287|       |
 5288|  25.8k|  if (pbi->max_threads > 1 && !(tiles->large_scale && !pbi->ext_tile_debug) &&
  ------------------
  |  Branch (5288:7): [True: 13.1k, False: 12.7k]
  |  Branch (5288:33): [True: 0, False: 13.1k]
  |  Branch (5288:55): [True: 0, False: 0]
  ------------------
 5289|  13.1k|      pbi->row_mt)
  ------------------
  |  Branch (5289:7): [True: 13.1k, False: 0]
  ------------------
 5290|  13.1k|    *p_data_end =
 5291|  13.1k|        decode_tiles_row_mt(pbi, data, data_end, start_tile, end_tile);
 5292|  12.7k|  else if (pbi->max_threads > 1 && tile_count_tg > 1 &&
  ------------------
  |  Branch (5292:12): [True: 0, False: 12.7k]
  |  Branch (5292:36): [True: 0, False: 0]
  ------------------
 5293|      0|           !(tiles->large_scale && !pbi->ext_tile_debug))
  ------------------
  |  Branch (5293:14): [True: 0, False: 0]
  |  Branch (5293:36): [True: 0, False: 0]
  ------------------
 5294|      0|    *p_data_end = decode_tiles_mt(pbi, data, data_end, start_tile, end_tile);
 5295|  12.7k|  else
 5296|  12.7k|    *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile);
 5297|       |
 5298|       |  // If the bit stream is monochrome, set the U and V buffers to a constant.
 5299|  25.8k|  if (num_planes < 3) {
  ------------------
  |  Branch (5299:7): [True: 2.27k, False: 23.5k]
  ------------------
 5300|  2.27k|    set_planes_to_neutral_grey(cm->seq_params, xd->cur_buf, 1);
 5301|  2.27k|  }
 5302|       |
 5303|  25.8k|  if (end_tile != tiles->rows * tiles->cols - 1) {
  ------------------
  |  Branch (5303:7): [True: 22, False: 25.8k]
  ------------------
 5304|     22|    return;
 5305|     22|  }
 5306|       |
 5307|  25.8k|  av1_alloc_cdef_buffers(cm, &pbi->cdef_worker, &pbi->cdef_sync,
 5308|  25.8k|                         pbi->num_workers, 1);
 5309|  25.8k|  av1_alloc_cdef_sync(cm, &pbi->cdef_sync, pbi->num_workers);
 5310|       |
 5311|  25.8k|  if (!cm->features.allow_intrabc && !tiles->single_tile_decoding) {
  ------------------
  |  Branch (5311:7): [True: 12.4k, False: 13.3k]
  |  Branch (5311:38): [True: 12.4k, False: 0]
  ------------------
 5312|  12.4k|    if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) {
  ------------------
  |  Branch (5312:9): [True: 1.47k, False: 10.9k]
  |  Branch (5312:35): [True: 387, False: 10.6k]
  ------------------
 5313|  1.86k|      av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &pbi->dcb.xd, 0,
 5314|  1.86k|                               num_planes, 0, pbi->tile_workers,
 5315|  1.86k|                               pbi->num_workers, &pbi->lf_row_sync, 0);
 5316|  1.86k|    }
 5317|       |
 5318|  12.4k|    const int do_cdef =
 5319|  12.4k|        !pbi->skip_loop_filter && !cm->features.coded_lossless &&
  ------------------
  |  Branch (5319:9): [True: 12.4k, False: 0]
  |  Branch (5319:35): [True: 11.2k, False: 1.26k]
  ------------------
 5320|  11.2k|        (cm->cdef_info.cdef_bits || cm->cdef_info.cdef_strengths[0] ||
  ------------------
  |  Branch (5320:10): [True: 554, False: 10.6k]
  |  Branch (5320:37): [True: 1.39k, False: 9.25k]
  ------------------
 5321|  9.25k|         cm->cdef_info.cdef_uv_strengths[0]);
  ------------------
  |  Branch (5321:10): [True: 904, False: 8.35k]
  ------------------
 5322|  12.4k|    const int do_superres = av1_superres_scaled(cm);
 5323|  12.4k|    const int optimized_loop_restoration = !do_cdef && !do_superres;
  ------------------
  |  Branch (5323:44): [True: 9.61k, False: 2.85k]
  |  Branch (5323:56): [True: 9.61k, False: 0]
  ------------------
 5324|  12.4k|    const int do_loop_restoration =
 5325|  12.4k|        cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
  ------------------
  |  Branch (5325:9): [True: 613, False: 11.8k]
  ------------------
 5326|  11.8k|        cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
  ------------------
  |  Branch (5326:9): [True: 52, False: 11.8k]
  ------------------
 5327|  11.8k|        cm->rst_info[2].frame_restoration_type != RESTORE_NONE;
  ------------------
  |  Branch (5327:9): [True: 118, False: 11.6k]
  ------------------
 5328|       |    // Frame border extension is not required in the decoder
 5329|       |    // as it happens in extend_mc_border().
 5330|  12.4k|    int do_extend_border_mt = 0;
 5331|  12.4k|    if (!optimized_loop_restoration) {
  ------------------
  |  Branch (5331:9): [True: 2.85k, False: 9.61k]
  ------------------
 5332|  2.85k|      if (do_loop_restoration)
  ------------------
  |  Branch (5332:11): [True: 680, False: 2.17k]
  ------------------
 5333|    680|        av1_loop_restoration_save_boundary_lines(&pbi->common.cur_frame->buf,
 5334|    680|                                                 cm, 0);
 5335|       |
 5336|  2.85k|      if (do_cdef) {
  ------------------
  |  Branch (5336:11): [True: 2.85k, False: 0]
  ------------------
 5337|  2.85k|        if (pbi->num_workers > 1) {
  ------------------
  |  Branch (5337:13): [True: 1.67k, False: 1.17k]
  ------------------
 5338|  1.67k|          av1_cdef_frame_mt(cm, &pbi->dcb.xd, pbi->cdef_worker,
 5339|  1.67k|                            pbi->tile_workers, &pbi->cdef_sync,
 5340|  1.67k|                            pbi->num_workers, av1_cdef_init_fb_row_mt,
 5341|  1.67k|                            do_extend_border_mt);
 5342|  1.67k|        } else {
 5343|  1.17k|          av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd,
 5344|  1.17k|                         av1_cdef_init_fb_row);
 5345|  1.17k|        }
 5346|  2.85k|      }
 5347|       |
 5348|  2.85k|      superres_post_decode(pbi);
 5349|       |
 5350|  2.85k|      if (do_loop_restoration) {
  ------------------
  |  Branch (5350:11): [True: 680, False: 2.17k]
  ------------------
 5351|    680|        av1_loop_restoration_save_boundary_lines(&pbi->common.cur_frame->buf,
 5352|    680|                                                 cm, 1);
 5353|    680|        if (pbi->num_workers > 1) {
  ------------------
  |  Branch (5353:13): [True: 628, False: 52]
  ------------------
 5354|    628|          av1_loop_restoration_filter_frame_mt(
 5355|    628|              (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration,
 5356|    628|              pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync,
 5357|    628|              &pbi->lr_ctxt, do_extend_border_mt);
 5358|    628|        } else {
 5359|     52|          av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf,
 5360|     52|                                            cm, optimized_loop_restoration,
 5361|     52|                                            &pbi->lr_ctxt);
 5362|     52|        }
 5363|    680|      }
 5364|  9.61k|    } else {
 5365|       |      // In no cdef and no superres case. Provide an optimized version of
 5366|       |      // loop_restoration_filter.
 5367|  9.61k|      if (do_loop_restoration) {
  ------------------
  |  Branch (5367:11): [True: 103, False: 9.51k]
  ------------------
 5368|    103|        if (pbi->num_workers > 1) {
  ------------------
  |  Branch (5368:13): [True: 90, False: 13]
  ------------------
 5369|     90|          av1_loop_restoration_filter_frame_mt(
 5370|     90|              (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration,
 5371|     90|              pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync,
 5372|     90|              &pbi->lr_ctxt, do_extend_border_mt);
 5373|     90|        } else {
 5374|     13|          av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf,
 5375|     13|                                            cm, optimized_loop_restoration,
 5376|     13|                                            &pbi->lr_ctxt);
 5377|     13|        }
 5378|    103|      }
 5379|  9.61k|    }
 5380|  12.4k|  }
 5381|       |
 5382|  25.8k|  if (!pbi->dcb.corrupted) {
  ------------------
  |  Branch (5382:7): [True: 12.5k, False: 13.3k]
  ------------------
 5383|  12.5k|    if (cm->features.refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
  ------------------
  |  Branch (5383:9): [True: 11.0k, False: 1.50k]
  ------------------
 5384|  11.0k|      assert(pbi->context_update_tile_id < pbi->allocated_tiles);
 5385|  11.0k|      *cm->fc = pbi->tile_data[pbi->context_update_tile_id].tctx;
 5386|  11.0k|      av1_reset_cdf_symbol_counters(cm->fc);
 5387|  11.0k|    }
 5388|  13.3k|  } else {
 5389|  13.3k|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 5390|  13.3k|                       "Decode failed. Frame data is corrupted.");
 5391|  13.3k|  }
 5392|       |
 5393|       |#if CONFIG_INSPECTION
 5394|       |  if (pbi->inspect_cb != NULL) {
 5395|       |    (*pbi->inspect_cb)(pbi, pbi->inspect_ctx);
 5396|       |  }
 5397|       |#endif
 5398|       |
 5399|       |  // Non frame parallel update frame context here.
 5400|  25.8k|  if (!tiles->large_scale) {
  ------------------
  |  Branch (5400:7): [True: 12.5k, False: 13.3k]
  ------------------
 5401|  12.5k|    cm->cur_frame->frame_context = *cm->fc;
 5402|  12.5k|  }
 5403|       |
 5404|  25.8k|  if (cm->show_frame && !cm->seq_params->order_hint_info.enable_order_hint) {
  ------------------
  |  Branch (5404:7): [True: 10.0k, False: 15.8k]
  |  Branch (5404:25): [True: 1.35k, False: 8.64k]
  ------------------
 5405|  1.35k|    ++cm->current_frame.frame_number;
 5406|  1.35k|  }
 5407|  25.8k|}
decodeframe.c:read_bitdepth:
 3888|  18.5k|                                 struct aom_internal_error_info *error_info) {
 3889|  18.5k|  const int high_bitdepth = aom_rb_read_bit(rb);
 3890|  18.5k|  if (seq_params->profile == PROFILE_2 && high_bitdepth) {
  ------------------
  |  Branch (3890:7): [True: 1.64k, False: 16.9k]
  |  Branch (3890:43): [True: 1.27k, False: 375]
  ------------------
 3891|  1.27k|    const int twelve_bit = aom_rb_read_bit(rb);
 3892|  1.27k|    seq_params->bit_depth = twelve_bit ? AOM_BITS_12 : AOM_BITS_10;
  ------------------
  |  Branch (3892:29): [True: 1.26k, False: 4]
  ------------------
 3893|  17.3k|  } else if (seq_params->profile <= PROFILE_2) {
  ------------------
  |  Branch (3893:14): [True: 17.3k, False: 0]
  ------------------
 3894|  17.3k|    seq_params->bit_depth = high_bitdepth ? AOM_BITS_10 : AOM_BITS_8;
  ------------------
  |  Branch (3894:29): [True: 5.80k, False: 11.5k]
  ------------------
 3895|  17.3k|  } else {
 3896|      0|    aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM,
 3897|      0|                       "Unsupported profile/bit-depth combination");
 3898|      0|  }
 3899|       |#if !CONFIG_AV1_HIGHBITDEPTH
 3900|       |  if (seq_params->bit_depth > AOM_BITS_8) {
 3901|       |    aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM,
 3902|       |                       "Bit-depth %d not supported", seq_params->bit_depth);
 3903|       |  }
 3904|       |#endif
 3905|  18.5k|}
decodeframe.c:setup_sb_size:
 2005|  18.5k|                                 struct aom_read_bit_buffer *rb) {
 2006|  18.5k|  set_sb_size(seq_params, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64);
  ------------------
  |  Branch (2006:27): [True: 4.71k, False: 13.8k]
  ------------------
 2007|  18.5k|}
decodeframe.c:error_handler:
 3877|    471|static inline void error_handler(void *data) {
 3878|    471|  AV1_COMMON *const cm = (AV1_COMMON *)data;
 3879|    471|  aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet");
 3880|    471|}
decodeframe.c:read_uncompressed_header:
 4487|  26.8k|                                    struct aom_read_bit_buffer *rb) {
 4488|  26.8k|  AV1_COMMON *const cm = &pbi->common;
 4489|  26.8k|  const SequenceHeader *const seq_params = cm->seq_params;
 4490|  26.8k|  CurrentFrame *const current_frame = &cm->current_frame;
 4491|  26.8k|  FeatureFlags *const features = &cm->features;
 4492|  26.8k|  MACROBLOCKD *const xd = &pbi->dcb.xd;
 4493|  26.8k|  BufferPool *const pool = cm->buffer_pool;
 4494|  26.8k|  RefCntBuffer *const frame_bufs = pool->frame_bufs;
 4495|  26.8k|  aom_s_frame_info *sframe_info = &pbi->sframe_info;
 4496|  26.8k|  sframe_info->is_s_frame = 0;
 4497|  26.8k|  sframe_info->is_s_frame_at_altref = 0;
 4498|       |
 4499|  26.8k|  if (!pbi->sequence_header_ready) {
  ------------------
  |  Branch (4499:7): [True: 3, False: 26.8k]
  ------------------
 4500|      3|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4501|      3|                       "No sequence header");
 4502|      3|  }
 4503|       |
 4504|  26.8k|  if (seq_params->reduced_still_picture_hdr) {
  ------------------
  |  Branch (4504:7): [True: 9.03k, False: 17.8k]
  ------------------
 4505|  9.03k|    cm->show_existing_frame = 0;
 4506|  9.03k|    cm->show_frame = 1;
 4507|  9.03k|    current_frame->frame_type = KEY_FRAME;
 4508|  9.03k|    if (pbi->sequence_header_changed) {
  ------------------
  |  Branch (4508:9): [True: 45, False: 8.99k]
  ------------------
 4509|       |      // This is the start of a new coded video sequence.
 4510|     45|      pbi->sequence_header_changed = 0;
 4511|     45|      pbi->decoding_first_frame = 1;
 4512|     45|      reset_frame_buffers(cm);
 4513|     45|    }
 4514|  9.03k|    features->error_resilient_mode = 1;
 4515|  17.8k|  } else {
 4516|  17.8k|    cm->show_existing_frame = aom_rb_read_bit(rb);
 4517|  17.8k|    pbi->reset_decoder_state = 0;
 4518|       |
 4519|  17.8k|    if (cm->show_existing_frame) {
  ------------------
  |  Branch (4519:9): [True: 266, False: 17.5k]
  ------------------
 4520|    266|      if (pbi->sequence_header_changed) {
  ------------------
  |  Branch (4520:11): [True: 6, False: 260]
  ------------------
 4521|      6|        aom_internal_error(
 4522|      6|            &pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4523|      6|            "New sequence header starts with a show_existing_frame.");
 4524|      6|      }
 4525|       |      // Show an existing frame directly.
 4526|    266|      const int existing_frame_idx = aom_rb_read_literal(rb, 3);
 4527|    266|      RefCntBuffer *const frame_to_show = cm->ref_frame_map[existing_frame_idx];
 4528|    266|      if (frame_to_show == NULL) {
  ------------------
  |  Branch (4528:11): [True: 2, False: 264]
  ------------------
 4529|      2|        aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
 4530|      2|                           "Buffer does not contain a decoded frame");
 4531|      2|      }
 4532|    266|      if (seq_params->decoder_model_info_present_flag &&
  ------------------
  |  Branch (4532:11): [True: 0, False: 266]
  ------------------
 4533|      0|          seq_params->timing_info.equal_picture_interval == 0) {
  ------------------
  |  Branch (4533:11): [True: 0, False: 0]
  ------------------
 4534|      0|        read_temporal_point_info(cm, rb);
 4535|      0|      }
 4536|    266|      if (seq_params->frame_id_numbers_present_flag) {
  ------------------
  |  Branch (4536:11): [True: 0, False: 266]
  ------------------
 4537|      0|        int frame_id_length = seq_params->frame_id_length;
 4538|      0|        int display_frame_id = aom_rb_read_literal(rb, frame_id_length);
 4539|       |        /* Compare display_frame_id with ref_frame_id and check valid for
 4540|       |         * referencing */
 4541|      0|        if (display_frame_id != cm->ref_frame_id[existing_frame_idx] ||
  ------------------
  |  Branch (4541:13): [True: 0, False: 0]
  ------------------
 4542|      0|            pbi->valid_for_referencing[existing_frame_idx] == 0)
  ------------------
  |  Branch (4542:13): [True: 0, False: 0]
  ------------------
 4543|      0|          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4544|      0|                             "Reference buffer frame ID mismatch");
 4545|      0|      }
 4546|    266|      lock_buffer_pool(pool);
 4547|    266|      assert(frame_to_show->ref_count > 0);
 4548|       |      // cm->cur_frame should be the buffer referenced by the return value
 4549|       |      // of the get_free_fb() call in assign_cur_frame_new_fb() (called by
 4550|       |      // av1_receive_compressed_data()), so the ref_count should be 1.
 4551|    266|      assert(cm->cur_frame->ref_count == 1);
 4552|       |      // assign_frame_buffer_p() decrements ref_count directly rather than
 4553|       |      // call decrease_ref_count(). If cm->cur_frame->raw_frame_buffer has
 4554|       |      // already been allocated, it will not be released by
 4555|       |      // assign_frame_buffer_p()!
 4556|    266|      assert(!cm->cur_frame->raw_frame_buffer.data);
 4557|    266|      assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
 4558|    266|      pbi->reset_decoder_state = frame_to_show->frame_type == KEY_FRAME;
 4559|    266|      unlock_buffer_pool(pool);
 4560|       |
 4561|    266|      cm->lf.filter_level[0] = 0;
 4562|    266|      cm->lf.filter_level[1] = 0;
 4563|    266|      cm->show_frame = 1;
 4564|    266|      current_frame->order_hint = frame_to_show->order_hint;
 4565|       |
 4566|       |      // Section 6.8.2: It is a requirement of bitstream conformance that when
 4567|       |      // show_existing_frame is used to show a previous frame, that the value
 4568|       |      // of showable_frame for the previous frame was equal to 1.
 4569|    266|      if (!frame_to_show->showable_frame) {
  ------------------
  |  Branch (4569:11): [True: 6, False: 260]
  ------------------
 4570|      6|        aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
 4571|      6|                           "Buffer does not contain a showable frame");
 4572|      6|      }
 4573|       |      // Section 6.8.2: It is a requirement of bitstream conformance that when
 4574|       |      // show_existing_frame is used to show a previous frame with
 4575|       |      // RefFrameType[ frame_to_show_map_idx ] equal to KEY_FRAME, that the
 4576|       |      // frame is output via the show_existing_frame mechanism at most once.
 4577|    266|      if (pbi->reset_decoder_state) frame_to_show->showable_frame = 0;
  ------------------
  |  Branch (4577:11): [True: 0, False: 266]
  ------------------
 4578|       |
 4579|    266|      cm->film_grain_params = frame_to_show->film_grain_params;
 4580|       |
 4581|    266|      if (pbi->reset_decoder_state) {
  ------------------
  |  Branch (4581:11): [True: 0, False: 266]
  ------------------
 4582|      0|        show_existing_frame_reset(pbi, existing_frame_idx);
 4583|    266|      } else {
 4584|    266|        current_frame->refresh_frame_flags = 0;
 4585|    266|      }
 4586|       |
 4587|    266|      return 0;
 4588|    266|    }
 4589|       |
 4590|  17.5k|    current_frame->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2);
 4591|  17.5k|    if (pbi->sequence_header_changed) {
  ------------------
  |  Branch (4591:9): [True: 212, False: 17.3k]
  ------------------
 4592|    212|      if (current_frame->frame_type == KEY_FRAME) {
  ------------------
  |  Branch (4592:11): [True: 210, False: 2]
  ------------------
 4593|       |        // This is the start of a new coded video sequence.
 4594|    210|        pbi->sequence_header_changed = 0;
 4595|    210|        pbi->decoding_first_frame = 1;
 4596|    210|        reset_frame_buffers(cm);
 4597|    210|      } else {
 4598|      2|        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4599|      2|                           "Sequence header has changed without a keyframe.");
 4600|      2|      }
 4601|    212|    }
 4602|       |
 4603|  17.5k|    cm->show_frame = aom_rb_read_bit(rb);
 4604|  17.5k|    if (cm->show_frame == 0) pbi->is_arf_frame_present = 1;
  ------------------
  |  Branch (4604:9): [True: 3.90k, False: 13.6k]
  ------------------
 4605|  17.5k|    if (cm->show_frame == 0 && cm->current_frame.frame_type == KEY_FRAME)
  ------------------
  |  Branch (4605:9): [True: 3.90k, False: 13.6k]
  |  Branch (4605:32): [True: 199, False: 3.70k]
  ------------------
 4606|    199|      pbi->is_fwd_kf_present = 1;
 4607|  17.5k|    if (cm->current_frame.frame_type == S_FRAME) {
  ------------------
  |  Branch (4607:9): [True: 29, False: 17.5k]
  ------------------
 4608|     29|      sframe_info->is_s_frame = 1;
 4609|     29|      sframe_info->is_s_frame_at_altref = cm->show_frame ? 0 : 1;
  ------------------
  |  Branch (4609:43): [True: 13, False: 16]
  ------------------
 4610|     29|    }
 4611|  17.5k|    if (seq_params->still_picture &&
  ------------------
  |  Branch (4611:9): [True: 23, False: 17.5k]
  ------------------
 4612|     23|        (current_frame->frame_type != KEY_FRAME || !cm->show_frame)) {
  ------------------
  |  Branch (4612:10): [True: 8, False: 15]
  |  Branch (4612:52): [True: 2, False: 13]
  ------------------
 4613|     10|      aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4614|     10|                         "Still pictures must be coded as shown keyframes");
 4615|     10|    }
 4616|  17.5k|    cm->showable_frame = current_frame->frame_type != KEY_FRAME;
 4617|  17.5k|    if (cm->show_frame) {
  ------------------
  |  Branch (4617:9): [True: 13.6k, False: 3.91k]
  ------------------
 4618|  13.6k|      if (seq_params->decoder_model_info_present_flag &&
  ------------------
  |  Branch (4618:11): [True: 0, False: 13.6k]
  ------------------
 4619|      0|          seq_params->timing_info.equal_picture_interval == 0)
  ------------------
  |  Branch (4619:11): [True: 0, False: 0]
  ------------------
 4620|      0|        read_temporal_point_info(cm, rb);
 4621|  13.6k|    } else {
 4622|       |      // See if this frame can be used as show_existing_frame in future
 4623|  3.91k|      cm->showable_frame = aom_rb_read_bit(rb);
 4624|  3.91k|    }
 4625|  17.5k|    cm->cur_frame->showable_frame = cm->showable_frame;
 4626|  17.5k|    features->error_resilient_mode =
 4627|  17.5k|        frame_is_sframe(cm) ||
  ------------------
  |  Branch (4627:9): [True: 41, False: 17.5k]
  ------------------
 4628|  17.5k|                (current_frame->frame_type == KEY_FRAME && cm->show_frame)
  ------------------
  |  Branch (4628:18): [True: 9.29k, False: 8.24k]
  |  Branch (4628:60): [True: 9.09k, False: 197]
  ------------------
 4629|  17.5k|            ? 1
 4630|  17.5k|            : aom_rb_read_bit(rb);
 4631|  17.5k|  }
 4632|       |
 4633|  26.6k|  if (current_frame->frame_type == KEY_FRAME && cm->show_frame) {
  ------------------
  |  Branch (4633:7): [True: 18.3k, False: 8.28k]
  |  Branch (4633:49): [True: 18.1k, False: 197]
  ------------------
 4634|       |    /* All frames need to be marked as not valid for referencing */
 4635|   163k|    for (int i = 0; i < REF_FRAMES; i++) {
  ------------------
  |  Branch (4635:21): [True: 145k, False: 18.1k]
  ------------------
 4636|   145k|      pbi->valid_for_referencing[i] = 0;
 4637|   145k|    }
 4638|  18.1k|  }
 4639|  26.6k|  features->disable_cdf_update = aom_rb_read_bit(rb);
 4640|  26.6k|  if (seq_params->force_screen_content_tools == 2) {
  ------------------
  |  Branch (4640:7): [True: 26.4k, False: 136]
  ------------------
 4641|  26.4k|    features->allow_screen_content_tools = aom_rb_read_bit(rb);
 4642|  26.4k|  } else {
 4643|    136|    features->allow_screen_content_tools =
 4644|    136|        seq_params->force_screen_content_tools;
 4645|    136|  }
 4646|       |
 4647|  26.6k|  if (features->allow_screen_content_tools) {
  ------------------
  |  Branch (4647:7): [True: 9.77k, False: 16.8k]
  ------------------
 4648|  9.77k|    if (seq_params->force_integer_mv == 2) {
  ------------------
  |  Branch (4648:9): [True: 9.75k, False: 24]
  ------------------
 4649|  9.75k|      features->cur_frame_force_integer_mv = aom_rb_read_bit(rb);
 4650|  9.75k|    } else {
 4651|     24|      features->cur_frame_force_integer_mv = seq_params->force_integer_mv;
 4652|     24|    }
 4653|  16.8k|  } else {
 4654|  16.8k|    features->cur_frame_force_integer_mv = 0;
 4655|  16.8k|  }
 4656|       |
 4657|  26.6k|  int frame_size_override_flag = 0;
 4658|  26.6k|  features->allow_intrabc = 0;
 4659|  26.6k|  features->primary_ref_frame = PRIMARY_REF_NONE;
  ------------------
  |  |   66|  26.6k|#define PRIMARY_REF_NONE 7
  ------------------
 4660|       |
 4661|  26.6k|  if (!seq_params->reduced_still_picture_hdr) {
  ------------------
  |  Branch (4661:7): [True: 17.5k, False: 9.05k]
  ------------------
 4662|  17.5k|    if (seq_params->frame_id_numbers_present_flag) {
  ------------------
  |  Branch (4662:9): [True: 12, False: 17.5k]
  ------------------
 4663|     12|      int frame_id_length = seq_params->frame_id_length;
 4664|     12|      int diff_len = seq_params->delta_frame_id_length;
 4665|     12|      int prev_frame_id = 0;
 4666|     12|      int have_prev_frame_id =
 4667|     12|          !pbi->decoding_first_frame &&
  ------------------
  |  Branch (4667:11): [True: 0, False: 12]
  ------------------
 4668|      0|          !(current_frame->frame_type == KEY_FRAME && cm->show_frame);
  ------------------
  |  Branch (4668:13): [True: 0, False: 0]
  |  Branch (4668:55): [True: 0, False: 0]
  ------------------
 4669|     12|      if (have_prev_frame_id) {
  ------------------
  |  Branch (4669:11): [True: 0, False: 12]
  ------------------
 4670|      0|        prev_frame_id = cm->current_frame_id;
 4671|      0|      }
 4672|     12|      cm->current_frame_id = aom_rb_read_literal(rb, frame_id_length);
 4673|       |
 4674|     12|      if (have_prev_frame_id) {
  ------------------
  |  Branch (4674:11): [True: 0, False: 12]
  ------------------
 4675|      0|        int diff_frame_id;
 4676|      0|        if (cm->current_frame_id > prev_frame_id) {
  ------------------
  |  Branch (4676:13): [True: 0, False: 0]
  ------------------
 4677|      0|          diff_frame_id = cm->current_frame_id - prev_frame_id;
 4678|      0|        } else {
 4679|      0|          diff_frame_id =
 4680|      0|              (1 << frame_id_length) + cm->current_frame_id - prev_frame_id;
 4681|      0|        }
 4682|       |        /* Check current_frame_id for conformance */
 4683|      0|        if (prev_frame_id == cm->current_frame_id ||
  ------------------
  |  Branch (4683:13): [True: 0, False: 0]
  ------------------
 4684|      0|            diff_frame_id >= (1 << (frame_id_length - 1))) {
  ------------------
  |  Branch (4684:13): [True: 0, False: 0]
  ------------------
 4685|      0|          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4686|      0|                             "Invalid value of current_frame_id");
 4687|      0|        }
 4688|      0|      }
 4689|       |      /* Check if some frames need to be marked as not valid for referencing */
 4690|    108|      for (int i = 0; i < REF_FRAMES; i++) {
  ------------------
  |  Branch (4690:23): [True: 96, False: 12]
  ------------------
 4691|     96|        if (cm->current_frame_id - (1 << diff_len) > 0) {
  ------------------
  |  Branch (4691:13): [True: 32, False: 64]
  ------------------
 4692|     32|          if (cm->ref_frame_id[i] > cm->current_frame_id ||
  ------------------
  |  Branch (4692:15): [True: 0, False: 32]
  ------------------
 4693|     32|              cm->ref_frame_id[i] < cm->current_frame_id - (1 << diff_len))
  ------------------
  |  Branch (4693:15): [True: 32, False: 0]
  ------------------
 4694|     32|            pbi->valid_for_referencing[i] = 0;
 4695|     64|        } else {
 4696|     64|          if (cm->ref_frame_id[i] > cm->current_frame_id &&
  ------------------
  |  Branch (4696:15): [True: 0, False: 64]
  ------------------
 4697|      0|              cm->ref_frame_id[i] < (1 << frame_id_length) +
  ------------------
  |  Branch (4697:15): [True: 0, False: 0]
  ------------------
 4698|      0|                                        cm->current_frame_id - (1 << diff_len))
 4699|      0|            pbi->valid_for_referencing[i] = 0;
 4700|     64|        }
 4701|     96|      }
 4702|     12|    }
 4703|       |
 4704|  17.5k|    frame_size_override_flag = frame_is_sframe(cm) ? 1 : aom_rb_read_bit(rb);
  ------------------
  |  Branch (4704:32): [True: 26, False: 17.5k]
  ------------------
 4705|       |
 4706|  17.5k|    current_frame->order_hint = aom_rb_read_literal(
 4707|  17.5k|        rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
 4708|       |
 4709|  17.5k|    if (seq_params->order_hint_info.enable_order_hint)
  ------------------
  |  Branch (4709:9): [True: 17.5k, False: 39]
  ------------------
 4710|  17.5k|      current_frame->frame_number = current_frame->order_hint;
 4711|       |
 4712|  17.5k|    if (!features->error_resilient_mode && !frame_is_intra_only(cm)) {
  ------------------
  |  Branch (4712:9): [True: 8.27k, False: 9.28k]
  |  Branch (4712:44): [True: 7.98k, False: 288]
  ------------------
 4713|  7.98k|      features->primary_ref_frame = aom_rb_read_literal(rb, PRIMARY_REF_BITS);
  ------------------
  |  |   65|  7.98k|#define PRIMARY_REF_BITS 3
  ------------------
 4714|  7.98k|    }
 4715|  17.5k|  }
 4716|       |
 4717|  26.6k|  if (seq_params->decoder_model_info_present_flag) {
  ------------------
  |  Branch (4717:7): [True: 0, False: 26.6k]
  ------------------
 4718|      0|    pbi->buffer_removal_time_present = aom_rb_read_bit(rb);
 4719|      0|    if (pbi->buffer_removal_time_present) {
  ------------------
  |  Branch (4719:9): [True: 0, False: 0]
  ------------------
 4720|      0|      for (int op_num = 0;
 4721|      0|           op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
  ------------------
  |  Branch (4721:12): [True: 0, False: 0]
  ------------------
 4722|      0|        if (seq_params->op_params[op_num].decoder_model_param_present_flag) {
  ------------------
  |  Branch (4722:13): [True: 0, False: 0]
  ------------------
 4723|      0|          if (seq_params->operating_point_idc[op_num] == 0 ||
  ------------------
  |  Branch (4723:15): [True: 0, False: 0]
  ------------------
 4724|      0|              (((seq_params->operating_point_idc[op_num] >>
  ------------------
  |  Branch (4724:16): [True: 0, False: 0]
  ------------------
 4725|      0|                 cm->temporal_layer_id) &
 4726|      0|                0x1) &&
 4727|      0|               ((seq_params->operating_point_idc[op_num] >>
  ------------------
  |  Branch (4727:16): [True: 0, False: 0]
  ------------------
 4728|      0|                 (cm->spatial_layer_id + 8)) &
 4729|      0|                0x1))) {
 4730|      0|            cm->buffer_removal_times[op_num] = aom_rb_read_unsigned_literal(
 4731|      0|                rb, seq_params->decoder_model_info.buffer_removal_time_length);
 4732|      0|          } else {
 4733|      0|            cm->buffer_removal_times[op_num] = 0;
 4734|      0|          }
 4735|      0|        } else {
 4736|      0|          cm->buffer_removal_times[op_num] = 0;
 4737|      0|        }
 4738|      0|      }
 4739|      0|    }
 4740|      0|  }
 4741|  26.6k|  if (current_frame->frame_type == KEY_FRAME) {
  ------------------
  |  Branch (4741:7): [True: 18.3k, False: 8.28k]
  ------------------
 4742|  18.3k|    if (!cm->show_frame) {  // unshown keyframe (forward keyframe)
  ------------------
  |  Branch (4742:9): [True: 197, False: 18.1k]
  ------------------
 4743|    197|      current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
 4744|  18.1k|    } else {  // shown keyframe
 4745|  18.1k|      current_frame->refresh_frame_flags = (1 << REF_FRAMES) - 1;
 4746|  18.1k|    }
 4747|       |
 4748|   146k|    for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
  ------------------
  |  Branch (4748:21): [True: 128k, False: 18.3k]
  ------------------
 4749|   128k|      cm->remapped_ref_idx[i] = INVALID_IDX;
  ------------------
  |  |   15|   128k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 4750|   128k|    }
 4751|  18.3k|    if (pbi->need_resync) {
  ------------------
  |  Branch (4751:9): [True: 17.4k, False: 907]
  ------------------
 4752|  17.4k|      reset_ref_frame_map(cm);
 4753|  17.4k|      pbi->need_resync = 0;
 4754|  17.4k|    }
 4755|  18.3k|  } else {
 4756|  8.28k|    if (current_frame->frame_type == INTRA_ONLY_FRAME) {
  ------------------
  |  Branch (4756:9): [True: 226, False: 8.06k]
  ------------------
 4757|    226|      current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
 4758|    226|      if (current_frame->refresh_frame_flags == 0xFF) {
  ------------------
  |  Branch (4758:11): [True: 0, False: 226]
  ------------------
 4759|      0|        aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
 4760|      0|                           "Intra only frames cannot have refresh flags 0xFF");
 4761|      0|      }
 4762|    226|      if (pbi->need_resync) {
  ------------------
  |  Branch (4762:11): [True: 214, False: 12]
  ------------------
 4763|    214|        reset_ref_frame_map(cm);
 4764|    214|        pbi->need_resync = 0;
 4765|    214|      }
 4766|  8.06k|    } else if (pbi->need_resync != 1) { /* Skip if need resync */
  ------------------
  |  Branch (4766:16): [True: 8.04k, False: 19]
  ------------------
 4767|  8.04k|      current_frame->refresh_frame_flags =
 4768|  8.04k|          frame_is_sframe(cm) ? 0xFF : aom_rb_read_literal(rb, REF_FRAMES);
  ------------------
  |  Branch (4768:11): [True: 26, False: 8.01k]
  ------------------
 4769|  8.04k|    }
 4770|  8.28k|  }
 4771|       |
 4772|  26.6k|  if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xFF) {
  ------------------
  |  Branch (4772:7): [True: 8.06k, False: 18.5k]
  |  Branch (4772:35): [True: 423, False: 18.1k]
  ------------------
 4773|       |    // Read all ref frame order hints if error_resilient_mode == 1
 4774|  8.46k|    if (features->error_resilient_mode &&
  ------------------
  |  Branch (4774:9): [True: 193, False: 8.27k]
  ------------------
 4775|    193|        seq_params->order_hint_info.enable_order_hint) {
  ------------------
  |  Branch (4775:9): [True: 191, False: 2]
  ------------------
 4776|  1.63k|      for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
  ------------------
  |  Branch (4776:29): [True: 1.44k, False: 191]
  ------------------
 4777|       |        // Read order hint from bit stream
 4778|  1.44k|        unsigned int order_hint = aom_rb_read_literal(
 4779|  1.44k|            rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
 4780|       |        // Get buffer
 4781|  1.44k|        RefCntBuffer *buf = cm->ref_frame_map[ref_idx];
 4782|  1.44k|        if (buf == NULL || order_hint != buf->order_hint) {
  ------------------
  |  Branch (4782:13): [True: 830, False: 616]
  |  Branch (4782:28): [True: 434, False: 182]
  ------------------
 4783|  1.24k|          if (buf != NULL) {
  ------------------
  |  Branch (4783:15): [True: 434, False: 810]
  ------------------
 4784|    434|            lock_buffer_pool(pool);
 4785|    434|            decrease_ref_count(buf, pool);
 4786|    434|            unlock_buffer_pool(pool);
 4787|    434|            cm->ref_frame_map[ref_idx] = NULL;
 4788|    434|          }
 4789|       |          // If no corresponding buffer exists, allocate a new buffer with all
 4790|       |          // pixels set to neutral grey.
 4791|  1.24k|          int buf_idx = get_free_fb(cm);
 4792|  1.24k|          if (buf_idx == INVALID_IDX) {
  ------------------
  |  |   15|  1.24k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (4792:15): [True: 0, False: 1.24k]
  ------------------
 4793|      0|            aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
 4794|      0|                               "Unable to find free frame buffer");
 4795|      0|          }
 4796|  1.24k|          buf = &frame_bufs[buf_idx];
 4797|  1.24k|          lock_buffer_pool(pool);
 4798|       |#if CONFIG_SIZE_LIMIT
 4799|       |          if (seq_params->max_frame_width > DECODE_WIDTH_LIMIT ||
 4800|       |              seq_params->max_frame_height > DECODE_HEIGHT_LIMIT) {
 4801|       |            decrease_ref_count(buf, pool);
 4802|       |            unlock_buffer_pool(pool);
 4803|       |            aom_internal_error(
 4804|       |                cm->error, AOM_CODEC_CORRUPT_FRAME,
 4805|       |                "Dimensions of %dx%d beyond allowed size of %dx%d.",
 4806|       |                seq_params->max_frame_width, seq_params->max_frame_height,
 4807|       |                DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
 4808|       |          }
 4809|       |#endif
 4810|  1.24k|          if (aom_realloc_frame_buffer(
  ------------------
  |  Branch (4810:15): [True: 0, False: 1.24k]
  ------------------
 4811|  1.24k|                  &buf->buf, seq_params->max_frame_width,
 4812|  1.24k|                  seq_params->max_frame_height, seq_params->subsampling_x,
 4813|  1.24k|                  seq_params->subsampling_y, seq_params->use_highbitdepth,
 4814|  1.24k|                  AOM_BORDER_IN_PIXELS, features->byte_alignment,
  ------------------
  |  |   32|  1.24k|#define AOM_BORDER_IN_PIXELS 288
  ------------------
 4815|  1.24k|                  &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, false,
 4816|  1.24k|                  0)) {
 4817|      0|            decrease_ref_count(buf, pool);
 4818|      0|            unlock_buffer_pool(pool);
 4819|      0|            aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
 4820|      0|                               "Failed to allocate frame buffer");
 4821|      0|          }
 4822|  1.24k|          unlock_buffer_pool(pool);
 4823|       |          // According to the specification, valid bitstreams are required to
 4824|       |          // never use missing reference frames so the filling process for
 4825|       |          // missing frames is not normatively defined and RefValid for missing
 4826|       |          // frames is set to 0.
 4827|       |
 4828|       |          // To make libaom more robust when the bitstream has been corrupted
 4829|       |          // by the loss of some frames of data, this code adds a neutral grey
 4830|       |          // buffer in place of missing frames, i.e.
 4831|       |          //
 4832|  1.24k|          set_planes_to_neutral_grey(seq_params, &buf->buf, 0);
 4833|       |          //
 4834|       |          // and allows the frames to be used for referencing, i.e.
 4835|       |          //
 4836|  1.24k|          pbi->valid_for_referencing[ref_idx] = 1;
 4837|       |          //
 4838|       |          // Please note such behavior is not normative and other decoders may
 4839|       |          // use a different approach.
 4840|  1.24k|          cm->ref_frame_map[ref_idx] = buf;
 4841|  1.24k|          buf->order_hint = order_hint;
 4842|  1.24k|        }
 4843|  1.44k|      }
 4844|    191|    }
 4845|  8.46k|  }
 4846|       |
 4847|  26.6k|  if (current_frame->frame_type == KEY_FRAME) {
  ------------------
  |  Branch (4847:7): [True: 18.3k, False: 8.30k]
  ------------------
 4848|  18.3k|    setup_frame_size(cm, frame_size_override_flag, rb);
 4849|       |
 4850|  18.3k|    if (features->allow_screen_content_tools && !av1_superres_scaled(cm))
  ------------------
  |  Branch (4850:9): [True: 7.92k, False: 10.3k]
  |  Branch (4850:49): [True: 7.90k, False: 19]
  ------------------
 4851|  7.90k|      features->allow_intrabc = aom_rb_read_bit(rb);
 4852|  18.3k|    features->allow_ref_frame_mvs = 0;
 4853|  18.3k|    cm->prev_frame = NULL;
 4854|  18.3k|  } else {
 4855|  8.30k|    features->allow_ref_frame_mvs = 0;
 4856|       |
 4857|  8.30k|    if (current_frame->frame_type == INTRA_ONLY_FRAME) {
  ------------------
  |  Branch (4857:9): [True: 226, False: 8.08k]
  ------------------
 4858|    226|      cm->cur_frame->film_grain_params_present =
 4859|    226|          seq_params->film_grain_params_present;
 4860|    226|      setup_frame_size(cm, frame_size_override_flag, rb);
 4861|    226|      if (features->allow_screen_content_tools && !av1_superres_scaled(cm))
  ------------------
  |  Branch (4861:11): [True: 192, False: 34]
  |  Branch (4861:51): [True: 114, False: 78]
  ------------------
 4862|    114|        features->allow_intrabc = aom_rb_read_bit(rb);
 4863|       |
 4864|  8.08k|    } else if (pbi->need_resync != 1) { /* Skip if need resync */
  ------------------
  |  Branch (4864:16): [True: 8.04k, False: 39]
  ------------------
 4865|  8.04k|      int frame_refs_short_signaling = 0;
 4866|       |      // Frame refs short signaling is off when error resilient mode is on.
 4867|  8.04k|      if (seq_params->order_hint_info.enable_order_hint)
  ------------------
  |  Branch (4867:11): [True: 8.04k, False: 0]
  ------------------
 4868|  8.04k|        frame_refs_short_signaling = aom_rb_read_bit(rb);
 4869|       |
 4870|  8.04k|      if (frame_refs_short_signaling) {
  ------------------
  |  Branch (4870:11): [True: 221, False: 7.82k]
  ------------------
 4871|       |        // == LAST_FRAME ==
 4872|    221|        const int lst_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
  ------------------
  |  |  554|    221|#define REF_FRAMES_LOG2 3
  ------------------
 4873|    221|        const RefCntBuffer *const lst_buf = cm->ref_frame_map[lst_ref];
 4874|       |
 4875|       |        // == GOLDEN_FRAME ==
 4876|    221|        const int gld_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
  ------------------
  |  |  554|    221|#define REF_FRAMES_LOG2 3
  ------------------
 4877|    221|        const RefCntBuffer *const gld_buf = cm->ref_frame_map[gld_ref];
 4878|       |
 4879|       |        // Most of the time, streams start with a keyframe. In that case,
 4880|       |        // ref_frame_map will have been filled in at that point and will not
 4881|       |        // contain any NULLs. However, streams are explicitly allowed to start
 4882|       |        // with an intra-only frame, so long as they don't then signal a
 4883|       |        // reference to a slot that hasn't been set yet. That's what we are
 4884|       |        // checking here.
 4885|    221|        if (lst_buf == NULL)
  ------------------
  |  Branch (4885:13): [True: 0, False: 221]
  ------------------
 4886|      0|          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4887|      0|                             "Inter frame requests nonexistent reference");
 4888|    221|        if (gld_buf == NULL)
  ------------------
  |  Branch (4888:13): [True: 0, False: 221]
  ------------------
 4889|      0|          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4890|      0|                             "Inter frame requests nonexistent reference");
 4891|       |
 4892|    221|        av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_ref, gld_ref);
 4893|    221|      }
 4894|       |
 4895|  64.2k|      for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
  ------------------
  |  Branch (4895:23): [True: 56.2k, False: 8.04k]
  ------------------
 4896|  56.2k|        int ref = 0;
 4897|  56.2k|        if (!frame_refs_short_signaling) {
  ------------------
  |  Branch (4897:13): [True: 54.7k, False: 1.50k]
  ------------------
 4898|  54.7k|          ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
  ------------------
  |  |  554|  54.7k|#define REF_FRAMES_LOG2 3
  ------------------
 4899|       |
 4900|       |          // Most of the time, streams start with a keyframe. In that case,
 4901|       |          // ref_frame_map will have been filled in at that point and will not
 4902|       |          // contain any NULLs. However, streams are explicitly allowed to start
 4903|       |          // with an intra-only frame, so long as they don't then signal a
 4904|       |          // reference to a slot that hasn't been set yet. That's what we are
 4905|       |          // checking here.
 4906|  54.7k|          if (cm->ref_frame_map[ref] == NULL)
  ------------------
  |  Branch (4906:15): [True: 0, False: 54.7k]
  ------------------
 4907|      0|            aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4908|      0|                               "Inter frame requests nonexistent reference");
 4909|  54.7k|          cm->remapped_ref_idx[i] = ref;
 4910|  54.7k|        } else {
 4911|  1.50k|          ref = cm->remapped_ref_idx[i];
 4912|  1.50k|        }
 4913|       |        // Check valid for referencing
 4914|  56.2k|        if (pbi->valid_for_referencing[ref] == 0)
  ------------------
  |  Branch (4914:13): [True: 0, False: 56.2k]
  ------------------
 4915|      0|          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4916|      0|                             "Reference frame not valid for referencing");
 4917|       |
 4918|  56.2k|        cm->ref_frame_sign_bias[LAST_FRAME + i] = 0;
 4919|       |
 4920|  56.2k|        if (seq_params->frame_id_numbers_present_flag) {
  ------------------
  |  Branch (4920:13): [True: 0, False: 56.2k]
  ------------------
 4921|      0|          int frame_id_length = seq_params->frame_id_length;
 4922|      0|          int diff_len = seq_params->delta_frame_id_length;
 4923|      0|          int delta_frame_id_minus_1 = aom_rb_read_literal(rb, diff_len);
 4924|      0|          int ref_frame_id =
 4925|      0|              ((cm->current_frame_id - (delta_frame_id_minus_1 + 1) +
 4926|      0|                (1 << frame_id_length)) %
 4927|      0|               (1 << frame_id_length));
 4928|       |          // Compare values derived from delta_frame_id_minus_1 and
 4929|       |          // refresh_frame_flags.
 4930|      0|          if (ref_frame_id != cm->ref_frame_id[ref])
  ------------------
  |  Branch (4930:15): [True: 0, False: 0]
  ------------------
 4931|      0|            aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4932|      0|                               "Reference buffer frame ID mismatch");
 4933|      0|        }
 4934|  56.2k|      }
 4935|       |
 4936|  8.04k|      if (!features->error_resilient_mode && frame_size_override_flag) {
  ------------------
  |  Branch (4936:11): [True: 7.98k, False: 59]
  |  Branch (4936:46): [True: 189, False: 7.79k]
  ------------------
 4937|    189|        setup_frame_size_with_refs(cm, rb);
 4938|  7.85k|      } else {
 4939|  7.85k|        setup_frame_size(cm, frame_size_override_flag, rb);
 4940|  7.85k|      }
 4941|       |
 4942|  8.04k|      if (features->cur_frame_force_integer_mv) {
  ------------------
  |  Branch (4942:11): [True: 1.06k, False: 6.97k]
  ------------------
 4943|  1.06k|        features->allow_high_precision_mv = 0;
 4944|  6.97k|      } else {
 4945|  6.97k|        features->allow_high_precision_mv = aom_rb_read_bit(rb);
 4946|  6.97k|      }
 4947|  8.04k|      features->interp_filter = read_frame_interp_filter(rb);
 4948|  8.04k|      features->switchable_motion_mode = aom_rb_read_bit(rb);
 4949|  8.04k|    }
 4950|       |
 4951|  8.30k|    cm->prev_frame = get_primary_ref_frame_buf(cm);
 4952|  8.30k|    if (features->primary_ref_frame != PRIMARY_REF_NONE &&
  ------------------
  |  |   66|  16.6k|#define PRIMARY_REF_NONE 7
  ------------------
  |  Branch (4952:9): [True: 3.68k, False: 4.62k]
  ------------------
 4953|  3.68k|        get_primary_ref_frame_buf(cm) == NULL) {
  ------------------
  |  Branch (4953:9): [True: 2, False: 3.68k]
  ------------------
 4954|      2|      aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4955|      2|                         "Reference frame containing this frame's initial "
 4956|      2|                         "frame context is unavailable.");
 4957|      2|    }
 4958|       |
 4959|  8.30k|    if (!(current_frame->frame_type == INTRA_ONLY_FRAME) &&
  ------------------
  |  Branch (4959:9): [True: 7.99k, False: 310]
  ------------------
 4960|  7.99k|        pbi->need_resync != 1) {
  ------------------
  |  Branch (4960:9): [True: 7.99k, False: 2]
  ------------------
 4961|  7.99k|      if (frame_might_allow_ref_frame_mvs(cm))
  ------------------
  |  Branch (4961:11): [True: 7.81k, False: 184]
  ------------------
 4962|  7.81k|        features->allow_ref_frame_mvs = aom_rb_read_bit(rb);
 4963|    184|      else
 4964|    184|        features->allow_ref_frame_mvs = 0;
 4965|       |
 4966|  63.8k|      for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
  ------------------
  |  Branch (4966:32): [True: 55.8k, False: 7.99k]
  ------------------
 4967|  55.8k|        const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i);
 4968|  55.8k|        struct scale_factors *const ref_scale_factors =
 4969|  55.8k|            get_ref_scale_factors(cm, i);
 4970|  55.8k|        av1_setup_scale_factors_for_frame(
 4971|  55.8k|            ref_scale_factors, ref_buf->buf.y_crop_width,
 4972|  55.8k|            ref_buf->buf.y_crop_height, cm->width, cm->height);
 4973|  55.8k|        if ((!av1_is_valid_scale(ref_scale_factors)))
  ------------------
  |  Branch (4973:13): [True: 12, False: 55.8k]
  ------------------
 4974|     12|          aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
 4975|     12|                             "Reference frame has invalid dimensions");
 4976|  55.8k|      }
 4977|  7.99k|    }
 4978|  8.30k|  }
 4979|       |
 4980|  26.6k|  av1_setup_frame_buf_refs(cm);
 4981|       |
 4982|  26.6k|  av1_setup_frame_sign_bias(cm);
 4983|       |
 4984|  26.6k|  cm->cur_frame->frame_type = current_frame->frame_type;
 4985|       |
 4986|  26.6k|  update_ref_frame_id(pbi);
 4987|       |
 4988|  26.6k|  const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) &&
  ------------------
  |  Branch (4988:31): [True: 17.4k, False: 9.14k]
  ------------------
 4989|  17.4k|                              !(features->disable_cdf_update);
  ------------------
  |  Branch (4989:31): [True: 16.6k, False: 826]
  ------------------
 4990|  26.6k|  if (might_bwd_adapt) {
  ------------------
  |  Branch (4990:7): [True: 16.6k, False: 9.96k]
  ------------------
 4991|  16.6k|    features->refresh_frame_context = aom_rb_read_bit(rb)
  ------------------
  |  Branch (4991:39): [True: 1.54k, False: 15.1k]
  ------------------
 4992|  16.6k|                                          ? REFRESH_FRAME_CONTEXT_DISABLED
 4993|  16.6k|                                          : REFRESH_FRAME_CONTEXT_BACKWARD;
 4994|  16.6k|  } else {
 4995|  9.96k|    features->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
 4996|  9.96k|  }
 4997|       |
 4998|  26.6k|  cm->cur_frame->buf.bit_depth = seq_params->bit_depth;
 4999|  26.6k|  cm->cur_frame->buf.color_primaries = seq_params->color_primaries;
 5000|  26.6k|  cm->cur_frame->buf.transfer_characteristics =
 5001|  26.6k|      seq_params->transfer_characteristics;
 5002|  26.6k|  cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients;
 5003|  26.6k|  cm->cur_frame->buf.monochrome = seq_params->monochrome;
 5004|  26.6k|  cm->cur_frame->buf.chroma_sample_position =
 5005|  26.6k|      seq_params->chroma_sample_position;
 5006|  26.6k|  cm->cur_frame->buf.color_range = seq_params->color_range;
 5007|  26.6k|  cm->cur_frame->buf.render_width = cm->render_width;
 5008|  26.6k|  cm->cur_frame->buf.render_height = cm->render_height;
 5009|       |
 5010|  26.6k|  if (pbi->need_resync) {
  ------------------
  |  Branch (5010:7): [True: 2, False: 26.6k]
  ------------------
 5011|      2|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 5012|      2|                       "Keyframe / intra-only frame required to reset decoder"
 5013|      2|                       " state");
 5014|      2|  }
 5015|       |
 5016|  26.6k|  if (features->allow_intrabc) {
  ------------------
  |  Branch (5016:7): [True: 1.69k, False: 24.9k]
  ------------------
 5017|       |    // Set parameters corresponding to no filtering.
 5018|  1.69k|    struct loopfilter *lf = &cm->lf;
 5019|  1.69k|    lf->filter_level[0] = 0;
 5020|  1.69k|    lf->filter_level[1] = 0;
 5021|  1.69k|    cm->cdef_info.cdef_bits = 0;
 5022|  1.69k|    cm->cdef_info.cdef_strengths[0] = 0;
 5023|  1.69k|    cm->cdef_info.nb_cdef_strengths = 1;
 5024|  1.69k|    cm->cdef_info.cdef_uv_strengths[0] = 0;
 5025|  1.69k|    cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
 5026|  1.69k|    cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
 5027|  1.69k|    cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
 5028|  1.69k|  }
 5029|       |
 5030|  26.6k|  read_tile_info(pbi, rb);
 5031|  26.6k|  if (!av1_is_min_tile_width_satisfied(cm)) {
  ------------------
  |  Branch (5031:7): [True: 3, False: 26.6k]
  ------------------
 5032|      3|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 5033|      3|                       "Minimum tile width requirement not satisfied");
 5034|      3|  }
 5035|       |
 5036|  26.6k|  CommonQuantParams *const quant_params = &cm->quant_params;
 5037|  26.6k|  setup_quantization(quant_params, av1_num_planes(cm),
 5038|  26.6k|                     cm->seq_params->separate_uv_delta_q, rb);
 5039|  26.6k|  xd->bd = (int)seq_params->bit_depth;
 5040|       |
 5041|  26.6k|  CommonContexts *const above_contexts = &cm->above_contexts;
 5042|  26.6k|  if (above_contexts->num_planes < av1_num_planes(cm) ||
  ------------------
  |  Branch (5042:7): [True: 17.7k, False: 8.82k]
  ------------------
 5043|  8.82k|      above_contexts->num_mi_cols < cm->mi_params.mi_cols ||
  ------------------
  |  Branch (5043:7): [True: 2, False: 8.82k]
  ------------------
 5044|  18.4k|      above_contexts->num_tile_rows < cm->tiles.rows) {
  ------------------
  |  Branch (5044:7): [True: 751, False: 8.07k]
  ------------------
 5045|  18.4k|    av1_free_above_context_buffers(above_contexts);
 5046|  18.4k|    if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows,
  ------------------
  |  Branch (5046:9): [True: 0, False: 18.4k]
  ------------------
 5047|  18.4k|                                        cm->mi_params.mi_cols,
 5048|  18.4k|                                        av1_num_planes(cm))) {
 5049|      0|      aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
 5050|      0|                         "Failed to allocate context buffers");
 5051|      0|    }
 5052|  18.4k|  }
 5053|       |
 5054|  26.6k|  if (features->primary_ref_frame == PRIMARY_REF_NONE) {
  ------------------
  |  |   66|  26.6k|#define PRIMARY_REF_NONE 7
  ------------------
  |  Branch (5054:7): [True: 22.8k, False: 3.79k]
  ------------------
 5055|  22.8k|    av1_setup_past_independence(cm);
 5056|  22.8k|  }
 5057|       |
 5058|  26.6k|  setup_segmentation(cm, rb);
 5059|       |
 5060|  26.6k|  cm->delta_q_info.delta_q_res = 1;
 5061|  26.6k|  cm->delta_q_info.delta_lf_res = 1;
 5062|  26.6k|  cm->delta_q_info.delta_lf_present_flag = 0;
 5063|  26.6k|  cm->delta_q_info.delta_lf_multi = 0;
 5064|  26.6k|  cm->delta_q_info.delta_q_present_flag =
 5065|  26.6k|      quant_params->base_qindex > 0 ? aom_rb_read_bit(rb) : 0;
  ------------------
  |  Branch (5065:7): [True: 23.2k, False: 3.38k]
  ------------------
 5066|  26.6k|  if (cm->delta_q_info.delta_q_present_flag) {
  ------------------
  |  Branch (5066:7): [True: 4.28k, False: 22.3k]
  ------------------
 5067|  4.28k|    xd->current_base_qindex = quant_params->base_qindex;
 5068|  4.28k|    cm->delta_q_info.delta_q_res = 1 << aom_rb_read_literal(rb, 2);
 5069|  4.28k|    if (!features->allow_intrabc)
  ------------------
  |  Branch (5069:9): [True: 3.45k, False: 833]
  ------------------
 5070|  3.45k|      cm->delta_q_info.delta_lf_present_flag = aom_rb_read_bit(rb);
 5071|  4.28k|    if (cm->delta_q_info.delta_lf_present_flag) {
  ------------------
  |  Branch (5071:9): [True: 1.85k, False: 2.43k]
  ------------------
 5072|  1.85k|      cm->delta_q_info.delta_lf_res = 1 << aom_rb_read_literal(rb, 2);
 5073|  1.85k|      cm->delta_q_info.delta_lf_multi = aom_rb_read_bit(rb);
 5074|  1.85k|      av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
 5075|  1.85k|    }
 5076|  4.28k|  }
 5077|       |
 5078|  26.6k|  xd->cur_frame_force_integer_mv = features->cur_frame_force_integer_mv;
 5079|       |
 5080|   237k|  for (int i = 0; i < MAX_SEGMENTS; ++i) {
  ------------------
  |  |   21|   237k|#define MAX_SEGMENTS 8
  ------------------
  |  Branch (5080:19): [True: 211k, False: 26.6k]
  ------------------
 5081|   211k|    const int qindex = av1_get_qindex(&cm->seg, i, quant_params->base_qindex);
 5082|   211k|    xd->lossless[i] =
 5083|   211k|        qindex == 0 && quant_params->y_dc_delta_q == 0 &&
  ------------------
  |  Branch (5083:9): [True: 28.6k, False: 182k]
  |  Branch (5083:24): [True: 26.1k, False: 2.49k]
  ------------------
 5084|  26.1k|        quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 &&
  ------------------
  |  Branch (5084:9): [True: 25.6k, False: 470]
  |  Branch (5084:44): [True: 24.9k, False: 677]
  ------------------
 5085|  24.9k|        quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0;
  ------------------
  |  Branch (5085:9): [True: 24.9k, False: 16]
  |  Branch (5085:44): [True: 24.9k, False: 0]
  ------------------
 5086|   211k|    xd->qindex[i] = qindex;
 5087|   211k|  }
 5088|  26.6k|  features->coded_lossless = is_coded_lossless(cm, xd);
 5089|  26.6k|  features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm);
  ------------------
  |  Branch (5089:28): [True: 2.98k, False: 23.6k]
  |  Branch (5089:56): [True: 2.97k, False: 12]
  ------------------
 5090|  26.6k|  setup_segmentation_dequant(cm, xd);
 5091|  26.6k|  if (features->coded_lossless) {
  ------------------
  |  Branch (5091:7): [True: 2.98k, False: 23.6k]
  ------------------
 5092|  2.98k|    cm->lf.filter_level[0] = 0;
 5093|  2.98k|    cm->lf.filter_level[1] = 0;
 5094|  2.98k|  }
 5095|  26.6k|  if (features->coded_lossless || !seq_params->enable_cdef) {
  ------------------
  |  Branch (5095:7): [True: 3.19k, False: 23.4k]
  |  Branch (5095:35): [True: 6.53k, False: 16.8k]
  ------------------
 5096|  9.52k|    cm->cdef_info.cdef_bits = 0;
 5097|  9.52k|    cm->cdef_info.cdef_strengths[0] = 0;
 5098|  9.52k|    cm->cdef_info.cdef_uv_strengths[0] = 0;
 5099|  9.52k|  }
 5100|  26.6k|  if (features->all_lossless || !seq_params->enable_restoration) {
  ------------------
  |  Branch (5100:7): [True: 3.17k, False: 23.4k]
  |  Branch (5100:33): [True: 15.7k, False: 7.67k]
  ------------------
 5101|  18.7k|    cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
 5102|  18.7k|    cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
 5103|  18.7k|    cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
 5104|  18.7k|  }
 5105|  26.6k|  setup_loopfilter(cm, rb);
 5106|       |
 5107|  26.6k|  if (!features->coded_lossless && seq_params->enable_cdef) {
  ------------------
  |  Branch (5107:7): [True: 23.3k, False: 3.25k]
  |  Branch (5107:36): [True: 16.8k, False: 6.51k]
  ------------------
 5108|  16.8k|    setup_cdef(cm, rb);
 5109|  16.8k|  }
 5110|  26.6k|  if (!features->all_lossless && seq_params->enable_restoration) {
  ------------------
  |  Branch (5110:7): [True: 23.3k, False: 3.28k]
  |  Branch (5110:34): [True: 7.63k, False: 15.6k]
  ------------------
 5111|  7.63k|    decode_restoration_mode(cm, rb);
 5112|  7.63k|  }
 5113|       |
 5114|  26.6k|  features->tx_mode = read_tx_mode(rb, features->coded_lossless);
 5115|  26.6k|  current_frame->reference_mode = read_frame_reference_mode(cm, rb);
 5116|       |
 5117|  26.6k|  av1_setup_skip_mode_allowed(cm);
 5118|  26.6k|  current_frame->skip_mode_info.skip_mode_flag =
 5119|  26.6k|      current_frame->skip_mode_info.skip_mode_allowed ? aom_rb_read_bit(rb) : 0;
  ------------------
  |  Branch (5119:7): [True: 506, False: 26.1k]
  ------------------
 5120|       |
 5121|  26.6k|  if (frame_might_allow_warped_motion(cm))
  ------------------
  |  Branch (5121:7): [True: 7.00k, False: 19.6k]
  ------------------
 5122|  7.00k|    features->allow_warped_motion = aom_rb_read_bit(rb);
 5123|  19.6k|  else
 5124|  19.6k|    features->allow_warped_motion = 0;
 5125|       |
 5126|  26.6k|  features->reduced_tx_set_used = aom_rb_read_bit(rb);
 5127|       |
 5128|  26.6k|  if (features->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) {
  ------------------
  |  Branch (5128:7): [True: 6.89k, False: 19.7k]
  |  Branch (5128:40): [True: 0, False: 6.89k]
  ------------------
 5129|      0|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 5130|      0|                       "Frame wrongly requests reference frame MVs");
 5131|      0|  }
 5132|       |
 5133|  26.6k|  if (!frame_is_intra_only(cm)) read_global_motion(cm, rb);
  ------------------
  |  Branch (5133:7): [True: 7.91k, False: 18.6k]
  ------------------
 5134|       |
 5135|  26.6k|  cm->cur_frame->film_grain_params_present =
 5136|  26.6k|      seq_params->film_grain_params_present;
 5137|  26.6k|  read_film_grain(cm, rb);
 5138|       |
 5139|  26.6k|#if EXT_TILE_DEBUG
 5140|  26.6k|  if (pbi->ext_tile_debug && cm->tiles.large_scale) {
  ------------------
  |  Branch (5140:7): [True: 0, False: 26.6k]
  |  Branch (5140:30): [True: 0, False: 0]
  ------------------
 5141|      0|    read_ext_tile_info(pbi, rb);
 5142|      0|    av1_set_single_tile_decoding_mode(cm);
 5143|      0|  }
 5144|  26.6k|#endif  // EXT_TILE_DEBUG
 5145|  26.6k|  return 0;
 5146|  26.8k|}
decodeframe.c:reset_frame_buffers:
 4464|    255|static inline void reset_frame_buffers(AV1_COMMON *cm) {
 4465|    255|  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
 4466|    255|  int i;
 4467|       |
 4468|    255|  lock_buffer_pool(cm->buffer_pool);
 4469|    255|  reset_ref_frame_map(cm);
 4470|    255|  assert(cm->cur_frame->ref_count == 1);
 4471|  4.33k|  for (i = 0; i < cm->buffer_pool->num_frame_bufs; ++i) {
  ------------------
  |  Branch (4471:15): [True: 4.08k, False: 255]
  ------------------
 4472|       |    // Reset all unreferenced frame buffers. We can also reset cm->cur_frame
 4473|       |    // because we are the sole owner of cm->cur_frame.
 4474|  4.08k|    if (frame_bufs[i].ref_count > 0 && &frame_bufs[i] != cm->cur_frame) {
  ------------------
  |  Branch (4474:9): [True: 341, False: 3.73k]
  |  Branch (4474:40): [True: 86, False: 255]
  ------------------
 4475|     86|      continue;
 4476|     86|    }
 4477|  3.99k|    frame_bufs[i].order_hint = 0;
 4478|  3.99k|    av1_zero(frame_bufs[i].ref_order_hints);
  ------------------
  |  |   43|  3.99k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 4479|  3.99k|  }
 4480|    255|  av1_zero_unused_internal_frame_buffers(&cm->buffer_pool->int_frame_buffers);
 4481|    255|  unlock_buffer_pool(cm->buffer_pool);
 4482|    255|}
decodeframe.c:reset_ref_frame_map:
 4415|  17.8k|static inline void reset_ref_frame_map(AV1_COMMON *const cm) {
 4416|  17.8k|  BufferPool *const pool = cm->buffer_pool;
 4417|       |
 4418|   161k|  for (int i = 0; i < REF_FRAMES; i++) {
  ------------------
  |  Branch (4418:19): [True: 143k, False: 17.8k]
  ------------------
 4419|   143k|    decrease_ref_count(cm->ref_frame_map[i], pool);
 4420|       |    cm->ref_frame_map[i] = NULL;
 4421|   143k|  }
 4422|  17.8k|}
decodeframe.c:setup_frame_size:
 1980|  26.3k|                                    struct aom_read_bit_buffer *rb) {
 1981|  26.3k|  const SequenceHeader *const seq_params = cm->seq_params;
 1982|  26.3k|  int width, height;
 1983|       |
 1984|  26.3k|  if (frame_size_override_flag) {
  ------------------
  |  Branch (1984:7): [True: 383, False: 25.9k]
  ------------------
 1985|    383|    int num_bits_width = seq_params->num_bits_width;
 1986|    383|    int num_bits_height = seq_params->num_bits_height;
 1987|    383|    read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
 1988|    383|    if (width > seq_params->max_frame_width ||
  ------------------
  |  Branch (1988:9): [True: 9, False: 374]
  ------------------
 1989|    374|        height > seq_params->max_frame_height) {
  ------------------
  |  Branch (1989:9): [True: 6, False: 368]
  ------------------
 1990|     13|      aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 1991|     13|                         "Frame dimensions are larger than the maximum values");
 1992|     13|    }
 1993|  25.9k|  } else {
 1994|  25.9k|    width = seq_params->max_frame_width;
 1995|  25.9k|    height = seq_params->max_frame_height;
 1996|  25.9k|  }
 1997|       |
 1998|  26.3k|  setup_superres(cm, rb, &width, &height);
 1999|  26.3k|  resize_context_buffers(cm, width, height);
 2000|  26.3k|  setup_render_size(cm, rb);
 2001|  26.3k|  setup_buffer_pool(cm);
 2002|  26.3k|}
decodeframe.c:read_frame_size:
 1873|  2.95k|                            int num_bits_height, int *width, int *height) {
 1874|  2.95k|  *width = aom_rb_read_literal(rb, num_bits_width) + 1;
 1875|  2.95k|  *height = aom_rb_read_literal(rb, num_bits_height) + 1;
 1876|  2.95k|}
decodeframe.c:setup_superres:
 1889|  26.5k|                                  int *height) {
 1890|  26.5k|  cm->superres_upscaled_width = *width;
 1891|  26.5k|  cm->superres_upscaled_height = *height;
 1892|       |
 1893|  26.5k|  const SequenceHeader *const seq_params = cm->seq_params;
 1894|  26.5k|  if (!seq_params->enable_superres) return;
  ------------------
  |  Branch (1894:7): [True: 25.8k, False: 725]
  ------------------
 1895|       |
 1896|    725|  if (aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (1896:7): [True: 185, False: 540]
  ------------------
 1897|    185|    cm->superres_scale_denominator =
 1898|    185|        (uint8_t)aom_rb_read_literal(rb, SUPERRES_SCALE_BITS);
  ------------------
  |  |  638|    185|#define SUPERRES_SCALE_BITS 3
  ------------------
 1899|    185|    cm->superres_scale_denominator += SUPERRES_SCALE_DENOMINATOR_MIN;
  ------------------
  |  |  639|    185|#define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1)
  |  |  ------------------
  |  |  |  |   22|    185|#define SCALE_NUMERATOR 8
  |  |  ------------------
  ------------------
 1900|       |    // Don't edit cm->width or cm->height directly, or the buffers won't get
 1901|       |    // resized correctly
 1902|    185|    av1_calculate_scaled_superres_size(width, height,
 1903|    185|                                       cm->superres_scale_denominator);
 1904|    540|  } else {
 1905|       |    // 1:1 scaling - ie. no scaling, scale not provided
 1906|    540|    cm->superres_scale_denominator = SCALE_NUMERATOR;
  ------------------
  |  |   22|    540|#define SCALE_NUMERATOR 8
  ------------------
 1907|    540|  }
 1908|    725|}
decodeframe.c:resize_context_buffers:
 1911|  26.5k|                                          int height) {
 1912|       |#if CONFIG_SIZE_LIMIT
 1913|       |  if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
 1914|       |    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 1915|       |                       "Dimensions of %dx%d beyond allowed size of %dx%d.",
 1916|       |                       width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
 1917|       |#endif
 1918|  26.5k|  if (cm->width != width || cm->height != height) {
  ------------------
  |  Branch (1918:7): [True: 17.7k, False: 8.78k]
  |  Branch (1918:29): [True: 33, False: 8.74k]
  ------------------
 1919|  17.8k|    const int new_mi_rows = CEIL_POWER_OF_TWO(height, MI_SIZE_LOG2);
  ------------------
  |  |   62|  17.8k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
 1920|  17.8k|    const int new_mi_cols = CEIL_POWER_OF_TWO(width, MI_SIZE_LOG2);
  ------------------
  |  |   62|  17.8k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
 1921|       |
 1922|       |    // Allocations in av1_alloc_context_buffers() depend on individual
 1923|       |    // dimensions as well as the overall size.
 1924|  17.8k|    if (new_mi_cols > cm->mi_params.mi_cols ||
  ------------------
  |  Branch (1924:9): [True: 17.6k, False: 143]
  ------------------
 1925|  17.6k|        new_mi_rows > cm->mi_params.mi_rows) {
  ------------------
  |  Branch (1925:9): [True: 22, False: 121]
  ------------------
 1926|  17.6k|      if (av1_alloc_context_buffers(cm, width, height, BLOCK_4X4)) {
  ------------------
  |  Branch (1926:11): [True: 0, False: 17.6k]
  ------------------
 1927|       |        // The cm->mi_* values have been cleared and any existing context
 1928|       |        // buffers have been freed. Clear cm->width and cm->height to be
 1929|       |        // consistent and to force a realloc next time.
 1930|      0|        cm->width = 0;
 1931|      0|        cm->height = 0;
 1932|      0|        aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
 1933|      0|                           "Failed to allocate context buffers");
 1934|      0|      }
 1935|  17.6k|    } else {
 1936|    121|      cm->mi_params.set_mb_mi(&cm->mi_params, width, height, BLOCK_4X4);
 1937|    121|    }
 1938|  17.8k|    av1_init_mi_buffers(&cm->mi_params);
 1939|  17.8k|    cm->width = width;
 1940|  17.8k|    cm->height = height;
 1941|  17.8k|  }
 1942|       |
 1943|  26.5k|  ensure_mv_buffer(cm->cur_frame, cm);
 1944|  26.5k|  cm->cur_frame->width = cm->width;
 1945|  26.5k|  cm->cur_frame->height = cm->height;
 1946|  26.5k|}
decodeframe.c:setup_render_size:
 1879|  26.3k|                                     struct aom_read_bit_buffer *rb) {
 1880|  26.3k|  cm->render_width = cm->superres_upscaled_width;
 1881|  26.3k|  cm->render_height = cm->superres_upscaled_height;
 1882|  26.3k|  if (aom_rb_read_bit(rb))
  ------------------
  |  Branch (1882:7): [True: 2.55k, False: 23.8k]
  ------------------
 1883|  2.55k|    read_frame_size(rb, 16, 16, &cm->render_width, &cm->render_height);
 1884|  26.3k|}
decodeframe.c:setup_buffer_pool:
 1948|  26.5k|static inline void setup_buffer_pool(AV1_COMMON *cm) {
 1949|  26.5k|  BufferPool *const pool = cm->buffer_pool;
 1950|  26.5k|  const SequenceHeader *const seq_params = cm->seq_params;
 1951|       |
 1952|  26.5k|  lock_buffer_pool(pool);
 1953|  26.5k|  if (aom_realloc_frame_buffer(
  ------------------
  |  Branch (1953:7): [True: 0, False: 26.5k]
  ------------------
 1954|  26.5k|          &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
 1955|  26.5k|          seq_params->subsampling_y, seq_params->use_highbitdepth,
 1956|  26.5k|          AOM_DEC_BORDER_IN_PIXELS, cm->features.byte_alignment,
  ------------------
  |  |   35|  26.5k|#define AOM_DEC_BORDER_IN_PIXELS 64
  ------------------
 1957|  26.5k|          &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv,
 1958|  26.5k|          false, 0)) {
 1959|      0|    unlock_buffer_pool(pool);
 1960|      0|    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
 1961|      0|                       "Failed to allocate frame buffer");
 1962|      0|  }
 1963|  26.5k|  unlock_buffer_pool(pool);
 1964|       |
 1965|  26.5k|  cm->cur_frame->buf.bit_depth = (unsigned int)seq_params->bit_depth;
 1966|  26.5k|  cm->cur_frame->buf.color_primaries = seq_params->color_primaries;
 1967|  26.5k|  cm->cur_frame->buf.transfer_characteristics =
 1968|  26.5k|      seq_params->transfer_characteristics;
 1969|  26.5k|  cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients;
 1970|  26.5k|  cm->cur_frame->buf.monochrome = seq_params->monochrome;
 1971|  26.5k|  cm->cur_frame->buf.chroma_sample_position =
 1972|  26.5k|      seq_params->chroma_sample_position;
 1973|  26.5k|  cm->cur_frame->buf.color_range = seq_params->color_range;
 1974|  26.5k|  cm->cur_frame->buf.render_width = cm->render_width;
 1975|  26.5k|  cm->cur_frame->buf.render_height = cm->render_height;
 1976|  26.5k|}
decodeframe.c:setup_frame_size_with_refs:
 2018|    189|                                              struct aom_read_bit_buffer *rb) {
 2019|    189|  int width, height;
 2020|    189|  int found = 0;
 2021|    189|  int has_valid_ref_frame = 0;
 2022|    644|  for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
  ------------------
  |  Branch (2022:28): [True: 627, False: 17]
  ------------------
 2023|    627|    if (aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (2023:9): [True: 172, False: 455]
  ------------------
 2024|    172|      const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i);
 2025|       |      // This will never be NULL in a normal stream, as streams are required to
 2026|       |      // have a shown keyframe before any inter frames, which would refresh all
 2027|       |      // the reference buffers. However, it might be null if we're starting in
 2028|       |      // the middle of a stream, and static analysis will error if we don't do
 2029|       |      // a null check here.
 2030|    172|      if (ref_buf == NULL) {
  ------------------
  |  Branch (2030:11): [True: 0, False: 172]
  ------------------
 2031|      0|        aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 2032|      0|                           "Invalid condition: invalid reference buffer");
 2033|    172|      } else {
 2034|    172|        const YV12_BUFFER_CONFIG *const buf = &ref_buf->buf;
 2035|    172|        width = buf->y_crop_width;
 2036|    172|        height = buf->y_crop_height;
 2037|    172|        cm->render_width = buf->render_width;
 2038|    172|        cm->render_height = buf->render_height;
 2039|    172|        setup_superres(cm, rb, &width, &height);
 2040|    172|        resize_context_buffers(cm, width, height);
 2041|    172|        found = 1;
 2042|    172|        break;
 2043|    172|      }
 2044|    172|    }
 2045|    627|  }
 2046|       |
 2047|    189|  const SequenceHeader *const seq_params = cm->seq_params;
 2048|    189|  if (!found) {
  ------------------
  |  Branch (2048:7): [True: 17, False: 172]
  ------------------
 2049|     17|    int num_bits_width = seq_params->num_bits_width;
 2050|     17|    int num_bits_height = seq_params->num_bits_height;
 2051|       |
 2052|     17|    read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
 2053|     17|    setup_superres(cm, rb, &width, &height);
 2054|     17|    resize_context_buffers(cm, width, height);
 2055|     17|    setup_render_size(cm, rb);
 2056|     17|  }
 2057|       |
 2058|    189|  if (width <= 0 || height <= 0)
  ------------------
  |  Branch (2058:7): [True: 0, False: 189]
  |  Branch (2058:21): [True: 0, False: 189]
  ------------------
 2059|      0|    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 2060|      0|                       "Invalid frame size");
 2061|       |
 2062|       |  // Check to make sure at least one of frames that this frame references
 2063|       |  // has valid dimensions.
 2064|  1.51k|  for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
  ------------------
  |  Branch (2064:28): [True: 1.32k, False: 189]
  ------------------
 2065|  1.32k|    const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i);
 2066|  1.32k|    has_valid_ref_frame |=
 2067|  1.32k|        valid_ref_frame_size(ref_frame->buf.y_crop_width,
 2068|  1.32k|                             ref_frame->buf.y_crop_height, width, height);
 2069|  1.32k|  }
 2070|    189|  if (!has_valid_ref_frame)
  ------------------
  |  Branch (2070:7): [True: 12, False: 177]
  ------------------
 2071|     12|    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 2072|     12|                       "Referenced frame has invalid size");
 2073|  1.42k|  for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
  ------------------
  |  Branch (2073:28): [True: 1.23k, False: 189]
  ------------------
 2074|  1.23k|    const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i);
 2075|  1.23k|    if (!valid_ref_frame_img_fmt(
  ------------------
  |  Branch (2075:9): [True: 0, False: 1.23k]
  ------------------
 2076|  1.23k|            ref_frame->buf.bit_depth, ref_frame->buf.subsampling_x,
 2077|  1.23k|            ref_frame->buf.subsampling_y, seq_params->bit_depth,
 2078|  1.23k|            seq_params->subsampling_x, seq_params->subsampling_y))
 2079|      0|      aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 2080|      0|                         "Referenced frame has incompatible color format");
 2081|  1.23k|  }
 2082|    189|  setup_buffer_pool(cm);
 2083|    189|}
decodeframe.c:valid_ref_frame_img_fmt:
 2012|  1.23k|                                          int this_xss, int this_yss) {
 2013|  1.23k|  return ref_bit_depth == this_bit_depth && ref_xss == this_xss &&
  ------------------
  |  Branch (2013:10): [True: 1.23k, False: 0]
  |  Branch (2013:45): [True: 1.23k, False: 0]
  ------------------
 2014|  1.23k|         ref_yss == this_yss;
  ------------------
  |  Branch (2014:10): [True: 1.23k, False: 0]
  ------------------
 2015|  1.23k|}
decodeframe.c:read_frame_interp_filter:
 1867|  7.99k|static InterpFilter read_frame_interp_filter(struct aom_read_bit_buffer *rb) {
 1868|  7.99k|  return aom_rb_read_bit(rb) ? SWITCHABLE
  ------------------
  |  Branch (1868:10): [True: 1.46k, False: 6.53k]
  ------------------
 1869|  7.99k|                             : aom_rb_read_literal(rb, LOG_SWITCHABLE_FILTERS);
  ------------------
  |  |   98|  6.53k|#define LOG_SWITCHABLE_FILTERS 2
  ------------------
 1870|  7.99k|}
decodeframe.c:update_ref_frame_id:
 4426|  26.5k|static inline void update_ref_frame_id(AV1Decoder *const pbi) {
 4427|  26.5k|  AV1_COMMON *const cm = &pbi->common;
 4428|  26.5k|  int refresh_frame_flags = cm->current_frame.refresh_frame_flags;
 4429|   238k|  for (int i = 0; i < REF_FRAMES; i++) {
  ------------------
  |  Branch (4429:19): [True: 212k, False: 26.5k]
  ------------------
 4430|   212k|    if ((refresh_frame_flags >> i) & 1) {
  ------------------
  |  Branch (4430:9): [True: 155k, False: 56.4k]
  ------------------
 4431|   155k|      cm->ref_frame_id[i] = cm->current_frame_id;
 4432|   155k|      pbi->valid_for_referencing[i] = 1;
 4433|   155k|    }
 4434|   212k|  }
 4435|  26.5k|}
decodeframe.c:read_tile_info:
 2182|  26.5k|                                  struct aom_read_bit_buffer *const rb) {
 2183|  26.5k|  AV1_COMMON *const cm = &pbi->common;
 2184|       |
 2185|  26.5k|  read_tile_info_max_tile(cm, rb);
 2186|       |
 2187|  26.5k|  pbi->context_update_tile_id = 0;
 2188|  26.5k|  if (cm->tiles.rows * cm->tiles.cols > 1) {
  ------------------
  |  Branch (2188:7): [True: 1.19k, False: 25.3k]
  ------------------
 2189|       |    // tile to use for cdf update
 2190|  1.19k|    pbi->context_update_tile_id =
 2191|  1.19k|        aom_rb_read_literal(rb, cm->tiles.log2_rows + cm->tiles.log2_cols);
 2192|  1.19k|    if (pbi->context_update_tile_id >= cm->tiles.rows * cm->tiles.cols) {
  ------------------
  |  Branch (2192:9): [True: 11, False: 1.17k]
  ------------------
 2193|     11|      aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 2194|     11|                         "Invalid context_update_tile_id");
 2195|     11|    }
 2196|       |    // tile size magnitude
 2197|  1.19k|    pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
 2198|  1.19k|  }
 2199|  26.5k|}
decodeframe.c:read_tile_info_max_tile:
 2098|  26.5k|    AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) {
 2099|  26.5k|  const SequenceHeader *const seq_params = cm->seq_params;
 2100|  26.5k|  CommonTileParams *const tiles = &cm->tiles;
 2101|  26.5k|  int width_sb =
 2102|  26.5k|      CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2);
  ------------------
  |  |   62|  26.5k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
 2103|  26.5k|  int height_sb =
 2104|  26.5k|      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2);
  ------------------
  |  |   62|  26.5k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
 2105|       |
 2106|  26.5k|  av1_get_tile_limits(cm);
 2107|  26.5k|  tiles->uniform_spacing = aom_rb_read_bit(rb);
 2108|       |
 2109|       |  // Read tile columns
 2110|  26.5k|  if (tiles->uniform_spacing) {
  ------------------
  |  Branch (2110:7): [True: 23.9k, False: 2.53k]
  ------------------
 2111|  23.9k|    tiles->log2_cols = tiles->min_log2_cols;
 2112|  24.2k|    while (tiles->log2_cols < tiles->max_log2_cols) {
  ------------------
  |  Branch (2112:12): [True: 15.6k, False: 8.56k]
  ------------------
 2113|  15.6k|      if (!aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (2113:11): [True: 15.4k, False: 279]
  ------------------
 2114|  15.4k|        break;
 2115|  15.4k|      }
 2116|    279|      tiles->log2_cols++;
 2117|    279|    }
 2118|  23.9k|  } else {
 2119|  2.53k|    int i;
 2120|  2.53k|    int start_sb;
 2121|  5.41k|    for (i = 0, start_sb = 0; width_sb > 0 && i < MAX_TILE_COLS; i++) {
  ------------------
  |  |   54|  2.88k|#define MAX_TILE_COLS 64
  ------------------
  |  Branch (2121:31): [True: 2.88k, False: 2.53k]
  |  Branch (2121:47): [True: 2.88k, False: 0]
  ------------------
 2122|  2.88k|      const int size_sb =
 2123|  2.88k|          1 + rb_read_uniform(rb, AOMMIN(width_sb, tiles->max_width_sb));
  ------------------
  |  |   34|  2.88k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.88k, False: 0]
  |  |  ------------------
  ------------------
 2124|  2.88k|      tiles->col_start_sb[i] = start_sb;
 2125|  2.88k|      start_sb += size_sb;
 2126|  2.88k|      width_sb -= size_sb;
 2127|  2.88k|    }
 2128|  2.53k|    tiles->cols = i;
 2129|  2.53k|    tiles->col_start_sb[i] = start_sb + width_sb;
 2130|  2.53k|  }
 2131|  26.5k|  av1_calculate_tile_cols(seq_params, cm->mi_params.mi_rows,
 2132|  26.5k|                          cm->mi_params.mi_cols, tiles);
 2133|       |
 2134|       |  // Read tile rows
 2135|  26.5k|  if (tiles->uniform_spacing) {
  ------------------
  |  Branch (2135:7): [True: 23.9k, False: 2.53k]
  ------------------
 2136|  23.9k|    tiles->log2_rows = tiles->min_log2_rows;
 2137|  24.8k|    while (tiles->log2_rows < tiles->max_log2_rows) {
  ------------------
  |  Branch (2137:12): [True: 15.0k, False: 9.78k]
  ------------------
 2138|  15.0k|      if (!aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (2138:11): [True: 14.1k, False: 903]
  ------------------
 2139|  14.1k|        break;
 2140|  14.1k|      }
 2141|    903|      tiles->log2_rows++;
 2142|    903|    }
 2143|  23.9k|  } else {
 2144|  2.53k|    int i;
 2145|  2.53k|    int start_sb;
 2146|  5.40k|    for (i = 0, start_sb = 0; height_sb > 0 && i < MAX_TILE_ROWS; i++) {
  ------------------
  |  |   53|  2.87k|#define MAX_TILE_ROWS 64
  ------------------
  |  Branch (2146:31): [True: 2.87k, False: 2.53k]
  |  Branch (2146:48): [True: 2.87k, False: 0]
  ------------------
 2147|  2.87k|      const int size_sb =
 2148|  2.87k|          1 + rb_read_uniform(rb, AOMMIN(height_sb, tiles->max_height_sb));
  ------------------
  |  |   34|  2.87k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 536, False: 2.33k]
  |  |  ------------------
  ------------------
 2149|  2.87k|      tiles->row_start_sb[i] = start_sb;
 2150|  2.87k|      start_sb += size_sb;
 2151|  2.87k|      height_sb -= size_sb;
 2152|  2.87k|    }
 2153|  2.53k|    tiles->rows = i;
 2154|  2.53k|    tiles->row_start_sb[i] = start_sb + height_sb;
 2155|  2.53k|  }
 2156|  26.5k|  av1_calculate_tile_rows(seq_params, cm->mi_params.mi_rows, tiles);
 2157|  26.5k|}
decodeframe.c:rb_read_uniform:
 2086|  5.75k|static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) {
 2087|  5.75k|  const int l = get_unsigned_bits(n);
 2088|  5.75k|  const int m = (1 << l) - n;
 2089|  5.75k|  const int v = aom_rb_read_literal(rb, l - 1);
 2090|  5.75k|  assert(l != 0);
 2091|  5.75k|  if (v < m)
  ------------------
  |  Branch (2091:7): [True: 5.57k, False: 176]
  ------------------
 2092|  5.57k|    return v;
 2093|    176|  else
 2094|    176|    return (v << 1) - m + aom_rb_read_bit(rb);
 2095|  5.75k|}
decodeframe.c:setup_quantization:
 1778|  26.4k|                                      struct aom_read_bit_buffer *rb) {
 1779|  26.4k|  quant_params->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS);
  ------------------
  |  |   28|  26.4k|#define QINDEX_BITS 8
  ------------------
 1780|  26.4k|  quant_params->y_dc_delta_q = read_delta_q(rb);
 1781|  26.4k|  if (num_planes > 1) {
  ------------------
  |  Branch (1781:7): [True: 21.4k, False: 5.06k]
  ------------------
 1782|  21.4k|    int diff_uv_delta = 0;
 1783|  21.4k|    if (separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb);
  ------------------
  |  Branch (1783:9): [True: 628, False: 20.7k]
  ------------------
 1784|  21.4k|    quant_params->u_dc_delta_q = read_delta_q(rb);
 1785|  21.4k|    quant_params->u_ac_delta_q = read_delta_q(rb);
 1786|  21.4k|    if (diff_uv_delta) {
  ------------------
  |  Branch (1786:9): [True: 145, False: 21.2k]
  ------------------
 1787|    145|      quant_params->v_dc_delta_q = read_delta_q(rb);
 1788|    145|      quant_params->v_ac_delta_q = read_delta_q(rb);
 1789|  21.2k|    } else {
 1790|  21.2k|      quant_params->v_dc_delta_q = quant_params->u_dc_delta_q;
 1791|  21.2k|      quant_params->v_ac_delta_q = quant_params->u_ac_delta_q;
 1792|  21.2k|    }
 1793|  21.4k|  } else {
 1794|  5.06k|    quant_params->u_dc_delta_q = 0;
 1795|  5.06k|    quant_params->u_ac_delta_q = 0;
 1796|  5.06k|    quant_params->v_dc_delta_q = 0;
 1797|  5.06k|    quant_params->v_ac_delta_q = 0;
 1798|  5.06k|  }
 1799|  26.4k|  quant_params->using_qmatrix = aom_rb_read_bit(rb);
 1800|  26.4k|  if (quant_params->using_qmatrix) {
  ------------------
  |  Branch (1800:7): [True: 3.40k, False: 23.0k]
  ------------------
 1801|  3.40k|    quant_params->qmatrix_level_y = aom_rb_read_literal(rb, QM_LEVEL_BITS);
  ------------------
  |  |   30|  3.40k|#define QM_LEVEL_BITS 4
  ------------------
 1802|  3.40k|    quant_params->qmatrix_level_u = aom_rb_read_literal(rb, QM_LEVEL_BITS);
  ------------------
  |  |   30|  3.40k|#define QM_LEVEL_BITS 4
  ------------------
 1803|  3.40k|    if (!separate_uv_delta_q)
  ------------------
  |  Branch (1803:9): [True: 3.36k, False: 37]
  ------------------
 1804|  3.36k|      quant_params->qmatrix_level_v = quant_params->qmatrix_level_u;
 1805|     37|    else
 1806|     37|      quant_params->qmatrix_level_v = aom_rb_read_literal(rb, QM_LEVEL_BITS);
  ------------------
  |  |   30|     37|#define QM_LEVEL_BITS 4
  ------------------
 1807|  23.0k|  } else {
 1808|  23.0k|    quant_params->qmatrix_level_y = 0;
 1809|  23.0k|    quant_params->qmatrix_level_u = 0;
 1810|  23.0k|    quant_params->qmatrix_level_v = 0;
 1811|  23.0k|  }
 1812|  26.4k|}
decodeframe.c:read_delta_q:
 1772|  69.6k|static inline int read_delta_q(struct aom_read_bit_buffer *rb) {
 1773|  69.6k|  return aom_rb_read_bit(rb) ? aom_rb_read_inv_signed_literal(rb, 6) : 0;
  ------------------
  |  Branch (1773:10): [True: 7.63k, False: 62.0k]
  ------------------
 1774|  69.6k|}
decodeframe.c:setup_segmentation:
 1432|  26.4k|                                      struct aom_read_bit_buffer *rb) {
 1433|  26.4k|  struct segmentation *const seg = &cm->seg;
 1434|       |
 1435|  26.4k|  seg->update_map = 0;
 1436|  26.4k|  seg->update_data = 0;
 1437|  26.4k|  seg->temporal_update = 0;
 1438|       |
 1439|  26.4k|  seg->enabled = aom_rb_read_bit(rb);
 1440|  26.4k|  if (!seg->enabled) {
  ------------------
  |  Branch (1440:7): [True: 23.4k, False: 3.05k]
  ------------------
 1441|  23.4k|    if (cm->cur_frame->seg_map) {
  ------------------
  |  Branch (1441:9): [True: 23.4k, False: 0]
  ------------------
 1442|  23.4k|      memset(cm->cur_frame->seg_map, 0,
 1443|  23.4k|             (cm->cur_frame->mi_rows * cm->cur_frame->mi_cols));
 1444|  23.4k|    }
 1445|       |
 1446|  23.4k|    memset(seg, 0, sizeof(*seg));
 1447|  23.4k|    segfeatures_copy(&cm->cur_frame->seg, seg);
 1448|  23.4k|    return;
 1449|  23.4k|  }
 1450|  3.05k|  if (cm->seg.enabled && cm->prev_frame &&
  ------------------
  |  Branch (1450:7): [True: 3.05k, False: 0]
  |  Branch (1450:26): [True: 178, False: 2.87k]
  ------------------
 1451|    178|      (cm->mi_params.mi_rows == cm->prev_frame->mi_rows) &&
  ------------------
  |  Branch (1451:7): [True: 178, False: 0]
  ------------------
 1452|    178|      (cm->mi_params.mi_cols == cm->prev_frame->mi_cols)) {
  ------------------
  |  Branch (1452:7): [True: 176, False: 2]
  ------------------
 1453|    176|    cm->last_frame_seg_map = cm->prev_frame->seg_map;
 1454|  2.87k|  } else {
 1455|  2.87k|    cm->last_frame_seg_map = NULL;
 1456|  2.87k|  }
 1457|       |  // Read update flags
 1458|  3.05k|  if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) {
  ------------------
  |  |   66|  3.05k|#define PRIMARY_REF_NONE 7
  ------------------
  |  Branch (1458:7): [True: 2.87k, False: 178]
  ------------------
 1459|       |    // These frames can't use previous frames, so must signal map + features
 1460|  2.87k|    seg->update_map = 1;
 1461|  2.87k|    seg->temporal_update = 0;
 1462|  2.87k|    seg->update_data = 1;
 1463|  2.87k|  } else {
 1464|    178|    seg->update_map = aom_rb_read_bit(rb);
 1465|    178|    if (seg->update_map) {
  ------------------
  |  Branch (1465:9): [True: 73, False: 105]
  ------------------
 1466|     73|      seg->temporal_update = aom_rb_read_bit(rb);
 1467|    105|    } else {
 1468|    105|      seg->temporal_update = 0;
 1469|    105|    }
 1470|    178|    seg->update_data = aom_rb_read_bit(rb);
 1471|    178|  }
 1472|       |
 1473|       |  // Segmentation data update
 1474|  3.05k|  if (seg->update_data) {
  ------------------
  |  Branch (1474:7): [True: 2.92k, False: 132]
  ------------------
 1475|  2.92k|    av1_clearall_segfeatures(seg);
 1476|       |
 1477|  25.9k|    for (int i = 0; i < MAX_SEGMENTS; i++) {
  ------------------
  |  |   21|  25.9k|#define MAX_SEGMENTS 8
  ------------------
  |  Branch (1477:21): [True: 22.9k, False: 2.92k]
  ------------------
 1478|   206k|      for (int j = 0; j < SEG_LVL_MAX; j++) {
  ------------------
  |  Branch (1478:23): [True: 183k, False: 22.9k]
  ------------------
 1479|   183k|        int data = 0;
 1480|   183k|        const int feature_enabled = aom_rb_read_bit(rb);
 1481|   183k|        if (feature_enabled) {
  ------------------
  |  Branch (1481:13): [True: 77.8k, False: 105k]
  ------------------
 1482|  77.8k|          av1_enable_segfeature(seg, i, j);
 1483|       |
 1484|  77.8k|          const int data_max = av1_seg_feature_data_max(j);
 1485|  77.8k|          const int data_min = -data_max;
 1486|  77.8k|          const int ubits = get_unsigned_bits(data_max);
 1487|       |
 1488|  77.8k|          if (av1_is_segfeature_signed(j)) {
  ------------------
  |  Branch (1488:15): [True: 48.9k, False: 28.9k]
  ------------------
 1489|  48.9k|            data = aom_rb_read_inv_signed_literal(rb, ubits);
 1490|  48.9k|          } else {
 1491|  28.9k|            data = aom_rb_read_literal(rb, ubits);
 1492|  28.9k|          }
 1493|       |
 1494|  77.8k|          data = clamp(data, data_min, data_max);
 1495|  77.8k|        }
 1496|   183k|        av1_set_segdata(seg, i, j, data);
 1497|   183k|      }
 1498|  22.9k|    }
 1499|  2.92k|    av1_calculate_segdata(seg);
 1500|  2.92k|  } else if (cm->prev_frame) {
  ------------------
  |  Branch (1500:14): [True: 132, False: 0]
  ------------------
 1501|    132|    segfeatures_copy(seg, &cm->prev_frame->seg);
 1502|    132|  }
 1503|  3.05k|  segfeatures_copy(&cm->cur_frame->seg, seg);
 1504|  3.05k|}
decodeframe.c:setup_segmentation_dequant:
 1824|  26.4k|                                              MACROBLOCKD *const xd) {
 1825|  26.4k|  const int bit_depth = cm->seq_params->bit_depth;
 1826|       |  // When segmentation is disabled, only the first value is used.  The
 1827|       |  // remaining are don't cares.
 1828|  26.4k|  const int max_segments = cm->seg.enabled ? MAX_SEGMENTS : 1;
  ------------------
  |  |   21|  2.97k|#define MAX_SEGMENTS 8
  ------------------
  |  Branch (1828:28): [True: 2.97k, False: 23.4k]
  ------------------
 1829|  26.4k|  CommonQuantParams *const quant_params = &cm->quant_params;
 1830|  73.6k|  for (int i = 0; i < max_segments; ++i) {
  ------------------
  |  Branch (1830:19): [True: 47.2k, False: 26.4k]
  ------------------
 1831|  47.2k|    const int qindex = xd->qindex[i];
 1832|  47.2k|    quant_params->y_dequant_QTX[i][0] =
 1833|  47.2k|        av1_dc_quant_QTX(qindex, quant_params->y_dc_delta_q, bit_depth);
 1834|  47.2k|    quant_params->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, bit_depth);
 1835|  47.2k|    quant_params->u_dequant_QTX[i][0] =
 1836|  47.2k|        av1_dc_quant_QTX(qindex, quant_params->u_dc_delta_q, bit_depth);
 1837|  47.2k|    quant_params->u_dequant_QTX[i][1] =
 1838|  47.2k|        av1_ac_quant_QTX(qindex, quant_params->u_ac_delta_q, bit_depth);
 1839|  47.2k|    quant_params->v_dequant_QTX[i][0] =
 1840|  47.2k|        av1_dc_quant_QTX(qindex, quant_params->v_dc_delta_q, bit_depth);
 1841|  47.2k|    quant_params->v_dequant_QTX[i][1] =
 1842|  47.2k|        av1_ac_quant_QTX(qindex, quant_params->v_ac_delta_q, bit_depth);
 1843|  47.2k|    const int use_qmatrix = av1_use_qmatrix(quant_params, xd, i);
 1844|       |    // NB: depends on base index so there is only 1 set per frame
 1845|       |    // No quant weighting when lossless or signalled not using QM
 1846|  47.2k|    const int qmlevel_y =
 1847|  47.2k|        use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1;
  ------------------
  |  |   31|  37.3k|#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
  |  |  ------------------
  |  |  |  |   30|  37.3k|#define QM_LEVEL_BITS 4
  |  |  ------------------
  ------------------
  |  Branch (1847:9): [True: 9.91k, False: 37.3k]
  ------------------
 1848|   944k|    for (int j = 0; j < TX_SIZES_ALL; ++j) {
  ------------------
  |  Branch (1848:21): [True: 897k, False: 47.2k]
  ------------------
 1849|   897k|      quant_params->y_iqmatrix[i][j] =
 1850|   897k|          get_iqmatrix(quant_params, qmlevel_y, AOM_PLANE_Y, j);
  ------------------
  |  |  210|   897k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
 1851|   897k|    }
 1852|  47.2k|    const int qmlevel_u =
 1853|  47.2k|        use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1;
  ------------------
  |  |   31|  37.3k|#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
  |  |  ------------------
  |  |  |  |   30|  37.3k|#define QM_LEVEL_BITS 4
  |  |  ------------------
  ------------------
  |  Branch (1853:9): [True: 9.91k, False: 37.3k]
  ------------------
 1854|   944k|    for (int j = 0; j < TX_SIZES_ALL; ++j) {
  ------------------
  |  Branch (1854:21): [True: 897k, False: 47.2k]
  ------------------
 1855|   897k|      quant_params->u_iqmatrix[i][j] =
 1856|   897k|          get_iqmatrix(quant_params, qmlevel_u, AOM_PLANE_U, j);
  ------------------
  |  |  211|   897k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
 1857|   897k|    }
 1858|  47.2k|    const int qmlevel_v =
 1859|  47.2k|        use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1;
  ------------------
  |  |   31|  37.3k|#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
  |  |  ------------------
  |  |  |  |   30|  37.3k|#define QM_LEVEL_BITS 4
  |  |  ------------------
  ------------------
  |  Branch (1859:9): [True: 9.91k, False: 37.3k]
  ------------------
 1860|   944k|    for (int j = 0; j < TX_SIZES_ALL; ++j) {
  ------------------
  |  Branch (1860:21): [True: 897k, False: 47.2k]
  ------------------
 1861|   897k|      quant_params->v_iqmatrix[i][j] =
 1862|   897k|          get_iqmatrix(quant_params, qmlevel_v, AOM_PLANE_V, j);
  ------------------
  |  |  212|   897k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
 1863|   897k|    }
 1864|  47.2k|  }
 1865|  26.4k|}
decodeframe.c:get_iqmatrix:
 1816|  2.69M|                                    int qmlevel, int plane, TX_SIZE tx_size) {
 1817|       |  assert(quant_params->giqmatrix[qmlevel][plane][tx_size] != NULL ||
 1818|  2.69M|         qmlevel == NUM_QM_LEVELS - 1);
 1819|  2.69M|  return quant_params->giqmatrix[qmlevel][plane][tx_size];
 1820|  2.69M|}
decodeframe.c:setup_loopfilter:
 1705|  26.4k|                                    struct aom_read_bit_buffer *rb) {
 1706|  26.4k|  const int num_planes = av1_num_planes(cm);
 1707|  26.4k|  struct loopfilter *lf = &cm->lf;
 1708|       |
 1709|  26.4k|  if (cm->features.allow_intrabc || cm->features.coded_lossless) {
  ------------------
  |  Branch (1709:7): [True: 1.67k, False: 24.7k]
  |  Branch (1709:37): [True: 2.95k, False: 21.7k]
  ------------------
 1710|       |    // write default deltas to frame buffer
 1711|  4.63k|    av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
 1712|  4.63k|    av1_set_default_mode_deltas(cm->cur_frame->mode_deltas);
 1713|  4.63k|    return;
 1714|  4.63k|  }
 1715|  26.4k|  assert(!cm->features.coded_lossless);
 1716|  21.7k|  if (cm->prev_frame) {
  ------------------
  |  Branch (1716:7): [True: 2.27k, False: 19.4k]
  ------------------
 1717|       |    // write deltas to frame buffer
 1718|  2.27k|    memcpy(lf->ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES);
 1719|  2.27k|    memcpy(lf->mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS);
  ------------------
  |  |   74|  2.27k|#define MAX_MODE_LF_DELTAS 2
  ------------------
 1720|  19.4k|  } else {
 1721|  19.4k|    av1_set_default_ref_deltas(lf->ref_deltas);
 1722|  19.4k|    av1_set_default_mode_deltas(lf->mode_deltas);
 1723|  19.4k|  }
 1724|  21.7k|  lf->filter_level[0] = aom_rb_read_literal(rb, 6);
 1725|  21.7k|  lf->filter_level[1] = aom_rb_read_literal(rb, 6);
 1726|  21.7k|  if (num_planes > 1) {
  ------------------
  |  Branch (1726:7): [True: 18.7k, False: 3.03k]
  ------------------
 1727|  18.7k|    if (lf->filter_level[0] || lf->filter_level[1]) {
  ------------------
  |  Branch (1727:9): [True: 6.63k, False: 12.1k]
  |  Branch (1727:32): [True: 1.02k, False: 11.0k]
  ------------------
 1728|  7.66k|      lf->filter_level_u = aom_rb_read_literal(rb, 6);
 1729|  7.66k|      lf->filter_level_v = aom_rb_read_literal(rb, 6);
 1730|  7.66k|    }
 1731|  18.7k|  }
 1732|  21.7k|  lf->sharpness_level = aom_rb_read_literal(rb, 3);
 1733|       |
 1734|       |  // Read in loop filter deltas applied at the MB level based on mode or ref
 1735|       |  // frame.
 1736|  21.7k|  lf->mode_ref_delta_update = 0;
 1737|       |
 1738|  21.7k|  lf->mode_ref_delta_enabled = aom_rb_read_bit(rb);
 1739|  21.7k|  if (lf->mode_ref_delta_enabled) {
  ------------------
  |  Branch (1739:7): [True: 16.3k, False: 5.42k]
  ------------------
 1740|  16.3k|    lf->mode_ref_delta_update = aom_rb_read_bit(rb);
 1741|  16.3k|    if (lf->mode_ref_delta_update) {
  ------------------
  |  Branch (1741:9): [True: 1.91k, False: 14.4k]
  ------------------
 1742|  17.1k|      for (int i = 0; i < REF_FRAMES; i++)
  ------------------
  |  Branch (1742:23): [True: 15.2k, False: 1.91k]
  ------------------
 1743|  15.2k|        if (aom_rb_read_bit(rb))
  ------------------
  |  Branch (1743:13): [True: 6.82k, False: 8.44k]
  ------------------
 1744|  6.82k|          lf->ref_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6);
 1745|       |
 1746|  5.71k|      for (int i = 0; i < MAX_MODE_LF_DELTAS; i++)
  ------------------
  |  |   74|  5.71k|#define MAX_MODE_LF_DELTAS 2
  ------------------
  |  Branch (1746:23): [True: 3.80k, False: 1.91k]
  ------------------
 1747|  3.80k|        if (aom_rb_read_bit(rb))
  ------------------
  |  Branch (1747:13): [True: 1.92k, False: 1.87k]
  ------------------
 1748|  1.92k|          lf->mode_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6);
 1749|  1.91k|    }
 1750|  16.3k|  }
 1751|       |
 1752|       |  // write deltas to frame buffer
 1753|  21.7k|  memcpy(cm->cur_frame->ref_deltas, lf->ref_deltas, REF_FRAMES);
 1754|  21.7k|  memcpy(cm->cur_frame->mode_deltas, lf->mode_deltas, MAX_MODE_LF_DELTAS);
  ------------------
  |  |   74|  21.7k|#define MAX_MODE_LF_DELTAS 2
  ------------------
 1755|  21.7k|}
decodeframe.c:setup_cdef:
 1757|  16.8k|static inline void setup_cdef(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
 1758|  16.8k|  const int num_planes = av1_num_planes(cm);
 1759|  16.8k|  CdefInfo *const cdef_info = &cm->cdef_info;
 1760|       |
 1761|  16.8k|  if (cm->features.allow_intrabc) return;
  ------------------
  |  Branch (1761:7): [True: 445, False: 16.4k]
  ------------------
 1762|  16.4k|  cdef_info->cdef_damping = aom_rb_read_literal(rb, 2) + 3;
 1763|  16.4k|  cdef_info->cdef_bits = aom_rb_read_literal(rb, 2);
 1764|  16.4k|  cdef_info->nb_cdef_strengths = 1 << cdef_info->cdef_bits;
 1765|  38.8k|  for (int i = 0; i < cdef_info->nb_cdef_strengths; i++) {
  ------------------
  |  Branch (1765:19): [True: 22.4k, False: 16.4k]
  ------------------
 1766|  22.4k|    cdef_info->cdef_strengths[i] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS);
  ------------------
  |  |   14|  22.4k|#define CDEF_STRENGTH_BITS 6
  ------------------
 1767|  22.4k|    cdef_info->cdef_uv_strengths[i] =
 1768|  22.4k|        num_planes > 1 ? aom_rb_read_literal(rb, CDEF_STRENGTH_BITS) : 0;
  ------------------
  |  |   14|  17.8k|#define CDEF_STRENGTH_BITS 6
  ------------------
  |  Branch (1768:9): [True: 17.8k, False: 4.60k]
  ------------------
 1769|  22.4k|  }
 1770|  16.4k|}
decodeframe.c:decode_restoration_mode:
 1507|  7.63k|                                           struct aom_read_bit_buffer *rb) {
 1508|  7.63k|  assert(!cm->features.all_lossless);
 1509|  7.63k|  const int num_planes = av1_num_planes(cm);
 1510|  7.63k|  if (cm->features.allow_intrabc) return;
  ------------------
  |  Branch (1510:7): [True: 309, False: 7.32k]
  ------------------
 1511|  7.32k|  int all_none = 1, chroma_none = 1;
 1512|  24.9k|  for (int p = 0; p < num_planes; ++p) {
  ------------------
  |  Branch (1512:19): [True: 17.5k, False: 7.32k]
  ------------------
 1513|  17.5k|    RestorationInfo *rsi = &cm->rst_info[p];
 1514|  17.5k|    if (aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (1514:9): [True: 4.37k, False: 13.2k]
  ------------------
 1515|  4.37k|      rsi->frame_restoration_type =
 1516|  4.37k|          aom_rb_read_bit(rb) ? RESTORE_SGRPROJ : RESTORE_WIENER;
  ------------------
  |  Branch (1516:11): [True: 2.45k, False: 1.91k]
  ------------------
 1517|  13.2k|    } else {
 1518|  13.2k|      rsi->frame_restoration_type =
 1519|  13.2k|          aom_rb_read_bit(rb) ? RESTORE_SWITCHABLE : RESTORE_NONE;
  ------------------
  |  Branch (1519:11): [True: 2.51k, False: 10.6k]
  ------------------
 1520|  13.2k|    }
 1521|  17.5k|    if (rsi->frame_restoration_type != RESTORE_NONE) {
  ------------------
  |  Branch (1521:9): [True: 6.89k, False: 10.6k]
  ------------------
 1522|  6.89k|      all_none = 0;
 1523|  6.89k|      chroma_none &= p == 0;
 1524|  6.89k|    }
 1525|  17.5k|  }
 1526|  7.32k|  if (!all_none) {
  ------------------
  |  Branch (1526:7): [True: 3.69k, False: 3.63k]
  ------------------
 1527|  3.69k|    assert(cm->seq_params->sb_size == BLOCK_64X64 ||
 1528|  3.69k|           cm->seq_params->sb_size == BLOCK_128X128);
 1529|  3.69k|    const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 128 : 64;
  ------------------
  |  Branch (1529:25): [True: 1.75k, False: 1.94k]
  ------------------
 1530|       |
 1531|  12.9k|    for (int p = 0; p < num_planes; ++p)
  ------------------
  |  Branch (1531:21): [True: 9.22k, False: 3.69k]
  ------------------
 1532|  9.22k|      cm->rst_info[p].restoration_unit_size = sb_size;
 1533|       |
 1534|  3.69k|    RestorationInfo *rsi = &cm->rst_info[0];
 1535|       |
 1536|  3.69k|    if (sb_size == 64) {
  ------------------
  |  Branch (1536:9): [True: 1.94k, False: 1.75k]
  ------------------
 1537|  1.94k|      rsi->restoration_unit_size <<= aom_rb_read_bit(rb);
 1538|  1.94k|    }
 1539|  3.69k|    if (rsi->restoration_unit_size > 64) {
  ------------------
  |  Branch (1539:9): [True: 2.46k, False: 1.23k]
  ------------------
 1540|  2.46k|      rsi->restoration_unit_size <<= aom_rb_read_bit(rb);
 1541|  2.46k|    }
 1542|  3.69k|  } else {
 1543|  3.63k|    const int size = RESTORATION_UNITSIZE_MAX;
  ------------------
  |  |   80|  3.63k|#define RESTORATION_UNITSIZE_MAX 256
  ------------------
 1544|  11.9k|    for (int p = 0; p < num_planes; ++p)
  ------------------
  |  Branch (1544:21): [True: 8.36k, False: 3.63k]
  ------------------
 1545|  8.36k|      cm->rst_info[p].restoration_unit_size = size;
 1546|  3.63k|  }
 1547|       |
 1548|  7.32k|  if (num_planes > 1) {
  ------------------
  |  Branch (1548:7): [True: 5.12k, False: 2.20k]
  ------------------
 1549|  5.12k|    int s =
 1550|  5.12k|        AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y);
  ------------------
  |  |   34|  5.12k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 5.12k]
  |  |  ------------------
  ------------------
 1551|  5.12k|    if (s && !chroma_none) {
  ------------------
  |  Branch (1551:9): [True: 1.36k, False: 3.76k]
  |  Branch (1551:14): [True: 211, False: 1.15k]
  ------------------
 1552|    211|      cm->rst_info[1].restoration_unit_size =
 1553|    211|          cm->rst_info[0].restoration_unit_size >> (aom_rb_read_bit(rb) * s);
 1554|  4.91k|    } else {
 1555|  4.91k|      cm->rst_info[1].restoration_unit_size =
 1556|  4.91k|          cm->rst_info[0].restoration_unit_size;
 1557|  4.91k|    }
 1558|  5.12k|    cm->rst_info[2].restoration_unit_size =
 1559|  5.12k|        cm->rst_info[1].restoration_unit_size;
 1560|  5.12k|  }
 1561|  7.32k|}
decodeframe.c:read_tx_mode:
  140|  26.3k|                            int coded_lossless) {
  141|  26.3k|  if (coded_lossless) return ONLY_4X4;
  ------------------
  |  Branch (141:7): [True: 2.98k, False: 23.3k]
  ------------------
  142|  23.3k|  return aom_rb_read_bit(rb) ? TX_MODE_SELECT : TX_MODE_LARGEST;
  ------------------
  |  Branch (142:10): [True: 8.18k, False: 15.1k]
  ------------------
  143|  26.3k|}
decodeframe.c:read_frame_reference_mode:
  146|  26.3k|    const AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
  147|  26.3k|  if (frame_is_intra_only(cm)) {
  ------------------
  |  Branch (147:7): [True: 18.3k, False: 7.92k]
  ------------------
  148|  18.3k|    return SINGLE_REFERENCE;
  149|  18.3k|  } else {
  150|  7.92k|    return aom_rb_read_bit(rb) ? REFERENCE_MODE_SELECT : SINGLE_REFERENCE;
  ------------------
  |  Branch (150:12): [True: 1.93k, False: 5.98k]
  ------------------
  151|  7.92k|  }
  152|  26.3k|}
decodeframe.c:read_global_motion:
 4371|  7.91k|                                      struct aom_read_bit_buffer *rb) {
 4372|  63.0k|  for (int frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
  ------------------
  |  Branch (4372:32): [True: 55.0k, False: 7.91k]
  ------------------
 4373|  55.0k|    const WarpedMotionParams *ref_params =
 4374|  55.0k|        cm->prev_frame ? &cm->prev_frame->global_motion[frame]
  ------------------
  |  Branch (4374:9): [True: 25.0k, False: 30.0k]
  ------------------
 4375|  55.0k|                       : &default_warp_params;
 4376|  55.0k|    int good_params =
 4377|  55.0k|        read_global_motion_params(&cm->global_motion[frame], ref_params, rb,
 4378|  55.0k|                                  cm->features.allow_high_precision_mv);
 4379|  55.0k|    if (!good_params) {
  ------------------
  |  Branch (4379:9): [True: 12, False: 55.0k]
  ------------------
 4380|       |#if WARPED_MOTION_DEBUG
 4381|       |      printf("Warning: unexpected global motion shear params from aomenc\n");
 4382|       |#endif
 4383|     12|      cm->global_motion[frame].invalid = 1;
 4384|     12|    }
 4385|       |
 4386|       |    // TODO(sarahparker, debargha): The logic in the commented out code below
 4387|       |    // does not work currently and causes mismatches when resize is on. Fix it
 4388|       |    // before turning the optimization back on.
 4389|       |    /*
 4390|       |    YV12_BUFFER_CONFIG *ref_buf = get_ref_frame(cm, frame);
 4391|       |    if (cm->width == ref_buf->y_crop_width &&
 4392|       |        cm->height == ref_buf->y_crop_height) {
 4393|       |      read_global_motion_params(&cm->global_motion[frame],
 4394|       |                                &cm->prev_frame->global_motion[frame], rb,
 4395|       |                                cm->features.allow_high_precision_mv);
 4396|       |    } else {
 4397|       |      cm->global_motion[frame] = default_warp_params;
 4398|       |    }
 4399|       |    */
 4400|       |    /*
 4401|       |    printf("Dec Ref %d [%d/%d]: %d %d %d %d\n",
 4402|       |           frame, cm->current_frame.frame_number, cm->show_frame,
 4403|       |           cm->global_motion[frame].wmmat[0],
 4404|       |           cm->global_motion[frame].wmmat[1],
 4405|       |           cm->global_motion[frame].wmmat[2],
 4406|       |           cm->global_motion[frame].wmmat[3]);
 4407|       |           */
 4408|  55.0k|  }
 4409|  7.91k|  memcpy(cm->cur_frame->global_motion, cm->global_motion,
 4410|  7.91k|         REF_FRAMES * sizeof(WarpedMotionParams));
 4411|  7.91k|}
decodeframe.c:read_global_motion_params:
 4303|  55.0k|                                     int allow_hp) {
 4304|  55.0k|  TransformationType type = aom_rb_read_bit(rb);
 4305|  55.0k|  if (type != IDENTITY) {
  ------------------
  |  Branch (4305:7): [True: 2.50k, False: 52.5k]
  ------------------
 4306|  2.50k|    if (aom_rb_read_bit(rb))
  ------------------
  |  Branch (4306:9): [True: 1.46k, False: 1.04k]
  ------------------
 4307|  1.46k|      type = ROTZOOM;
 4308|  1.04k|    else
 4309|  1.04k|      type = aom_rb_read_bit(rb) ? TRANSLATION : AFFINE;
  ------------------
  |  Branch (4309:14): [True: 661, False: 382]
  ------------------
 4310|  2.50k|  }
 4311|       |
 4312|  55.0k|  *params = default_warp_params;
 4313|  55.0k|  params->wmtype = type;
 4314|       |
 4315|  55.0k|  if (type >= ROTZOOM) {
  ------------------
  |  Branch (4315:7): [True: 1.83k, False: 53.2k]
  ------------------
 4316|  1.83k|    params->wmmat[2] = aom_rb_read_signed_primitive_refsubexpfin(
 4317|  1.83k|                           rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
  ------------------
  |  |  178|  1.83k|#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS)
  |  |  ------------------
  |  |  |  |  173|  1.83k|#define GM_ABS_ALPHA_BITS 12
  |  |  ------------------
  ------------------
                                         rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
  ------------------
  |  |  163|  1.83k|#define SUBEXPFIN_K 3
  ------------------
 4318|  1.83k|                           (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
  ------------------
  |  |  174|  1.83k|#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  1.83k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  ------------------
  |  |  |  |  172|  1.83k|#define GM_ALPHA_PREC_BITS 15
  |  |  ------------------
  ------------------
 4319|  1.83k|                               (1 << GM_ALPHA_PREC_BITS)) *
  ------------------
  |  |  172|  1.83k|#define GM_ALPHA_PREC_BITS 15
  ------------------
 4320|  1.83k|                           GM_ALPHA_DECODE_FACTOR +
  ------------------
  |  |  175|  1.83k|#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF)
  |  |  ------------------
  |  |  |  |  174|  1.83k|#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   96|  1.83k|#define WARPEDMODEL_PREC_BITS 16
  |  |  |  |  ------------------
  |  |  |  |               #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |  172|  1.83k|#define GM_ALPHA_PREC_BITS 15
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 4321|  1.83k|                       (1 << WARPEDMODEL_PREC_BITS);
  ------------------
  |  |   96|  1.83k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
 4322|  1.83k|    params->wmmat[3] = aom_rb_read_signed_primitive_refsubexpfin(
 4323|  1.83k|                           rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
  ------------------
  |  |  178|  1.83k|#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS)
  |  |  ------------------
  |  |  |  |  173|  1.83k|#define GM_ABS_ALPHA_BITS 12
  |  |  ------------------
  ------------------
                                         rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
  ------------------
  |  |  163|  1.83k|#define SUBEXPFIN_K 3
  ------------------
 4324|  1.83k|                           (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF)) *
  ------------------
  |  |  174|  1.83k|#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  1.83k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  ------------------
  |  |  |  |  172|  1.83k|#define GM_ALPHA_PREC_BITS 15
  |  |  ------------------
  ------------------
 4325|  1.83k|                       GM_ALPHA_DECODE_FACTOR;
  ------------------
  |  |  175|  1.83k|#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF)
  |  |  ------------------
  |  |  |  |  174|  1.83k|#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   96|  1.83k|#define WARPEDMODEL_PREC_BITS 16
  |  |  |  |  ------------------
  |  |  |  |               #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |  172|  1.83k|#define GM_ALPHA_PREC_BITS 15
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 4326|  1.83k|  }
 4327|       |
 4328|  55.0k|  if (type >= AFFINE) {
  ------------------
  |  Branch (4328:7): [True: 357, False: 54.7k]
  ------------------
 4329|    357|    params->wmmat[4] = aom_rb_read_signed_primitive_refsubexpfin(
 4330|    357|                           rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
  ------------------
  |  |  178|    357|#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS)
  |  |  ------------------
  |  |  |  |  173|    357|#define GM_ABS_ALPHA_BITS 12
  |  |  ------------------
  ------------------
                                         rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
  ------------------
  |  |  163|    357|#define SUBEXPFIN_K 3
  ------------------
 4331|    357|                           (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF)) *
  ------------------
  |  |  174|    357|#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|    357|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  ------------------
  |  |  |  |  172|    357|#define GM_ALPHA_PREC_BITS 15
  |  |  ------------------
  ------------------
 4332|    357|                       GM_ALPHA_DECODE_FACTOR;
  ------------------
  |  |  175|    357|#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF)
  |  |  ------------------
  |  |  |  |  174|    357|#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   96|    357|#define WARPEDMODEL_PREC_BITS 16
  |  |  |  |  ------------------
  |  |  |  |               #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |  172|    357|#define GM_ALPHA_PREC_BITS 15
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 4333|    357|    params->wmmat[5] = aom_rb_read_signed_primitive_refsubexpfin(
 4334|    357|                           rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
  ------------------
  |  |  178|    357|#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS)
  |  |  ------------------
  |  |  |  |  173|    357|#define GM_ABS_ALPHA_BITS 12
  |  |  ------------------
  ------------------
                                         rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
  ------------------
  |  |  163|    357|#define SUBEXPFIN_K 3
  ------------------
 4335|    357|                           (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
  ------------------
  |  |  174|    357|#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|    357|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  ------------------
  |  |  |  |  172|    357|#define GM_ALPHA_PREC_BITS 15
  |  |  ------------------
  ------------------
 4336|    357|                               (1 << GM_ALPHA_PREC_BITS)) *
  ------------------
  |  |  172|    357|#define GM_ALPHA_PREC_BITS 15
  ------------------
 4337|    357|                           GM_ALPHA_DECODE_FACTOR +
  ------------------
  |  |  175|    357|#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF)
  |  |  ------------------
  |  |  |  |  174|    357|#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   96|    357|#define WARPEDMODEL_PREC_BITS 16
  |  |  |  |  ------------------
  |  |  |  |               #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |  172|    357|#define GM_ALPHA_PREC_BITS 15
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 4338|    357|                       (1 << WARPEDMODEL_PREC_BITS);
  ------------------
  |  |   96|    357|#define WARPEDMODEL_PREC_BITS 16
  ------------------
 4339|  54.7k|  } else {
 4340|  54.7k|    params->wmmat[4] = -params->wmmat[3];
 4341|  54.7k|    params->wmmat[5] = params->wmmat[2];
 4342|  54.7k|  }
 4343|       |
 4344|  55.0k|  if (type >= TRANSLATION) {
  ------------------
  |  Branch (4344:7): [True: 2.44k, False: 52.6k]
  ------------------
 4345|  2.44k|    const int trans_bits = (type == TRANSLATION)
  ------------------
  |  Branch (4345:28): [True: 661, False: 1.77k]
  ------------------
 4346|  2.44k|                               ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
  ------------------
  |  |  166|    661|#define GM_ABS_TRANS_ONLY_BITS (GM_ABS_TRANS_BITS - GM_TRANS_PREC_BITS + 3)
  |  |  ------------------
  |  |  |  |  165|    661|#define GM_ABS_TRANS_BITS 12
  |  |  ------------------
  |  |               #define GM_ABS_TRANS_ONLY_BITS (GM_ABS_TRANS_BITS - GM_TRANS_PREC_BITS + 3)
  |  |  ------------------
  |  |  |  |  164|    661|#define GM_TRANS_PREC_BITS 6
  |  |  ------------------
  ------------------
 4347|  2.44k|                               : GM_ABS_TRANS_BITS;
  ------------------
  |  |  165|  1.77k|#define GM_ABS_TRANS_BITS 12
  ------------------
 4348|  2.44k|    const int trans_dec_factor =
 4349|  2.44k|        (type == TRANSLATION) ? GM_TRANS_ONLY_DECODE_FACTOR * (1 << !allow_hp)
  ------------------
  |  |  170|    661|#define GM_TRANS_ONLY_DECODE_FACTOR (1 << GM_TRANS_ONLY_PREC_DIFF)
  |  |  ------------------
  |  |  |  |  168|    661|#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
  |  |  |  |  ------------------
  |  |  |  |  |  |   96|    661|#define WARPEDMODEL_PREC_BITS 16
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (4349:9): [True: 661, False: 1.77k]
  ------------------
 4350|  2.44k|                              : GM_TRANS_DECODE_FACTOR;
  ------------------
  |  |  169|  1.77k|#define GM_TRANS_DECODE_FACTOR (1 << GM_TRANS_PREC_DIFF)
  |  |  ------------------
  |  |  |  |  167|  1.77k|#define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   96|  1.77k|#define WARPEDMODEL_PREC_BITS 16
  |  |  |  |  ------------------
  |  |  |  |               #define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |  164|  1.77k|#define GM_TRANS_PREC_BITS 6
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 4351|  2.44k|    const int trans_prec_diff = (type == TRANSLATION)
  ------------------
  |  Branch (4351:33): [True: 661, False: 1.77k]
  ------------------
 4352|  2.44k|                                    ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
  ------------------
  |  |  168|    661|#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
  |  |  ------------------
  |  |  |  |   96|    661|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
 4353|  2.44k|                                    : GM_TRANS_PREC_DIFF;
  ------------------
  |  |  167|  1.77k|#define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  1.77k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS)
  |  |  ------------------
  |  |  |  |  164|  1.77k|#define GM_TRANS_PREC_BITS 6
  |  |  ------------------
  ------------------
 4354|  2.44k|    params->wmmat[0] = aom_rb_read_signed_primitive_refsubexpfin(
 4355|  2.44k|                           rb, (1 << trans_bits) + 1, SUBEXPFIN_K,
  ------------------
  |  |  163|  2.44k|#define SUBEXPFIN_K 3
  ------------------
 4356|  2.44k|                           (ref_params->wmmat[0] >> trans_prec_diff)) *
 4357|  2.44k|                       trans_dec_factor;
 4358|  2.44k|    params->wmmat[1] = aom_rb_read_signed_primitive_refsubexpfin(
 4359|  2.44k|                           rb, (1 << trans_bits) + 1, SUBEXPFIN_K,
  ------------------
  |  |  163|  2.44k|#define SUBEXPFIN_K 3
  ------------------
 4360|  2.44k|                           (ref_params->wmmat[1] >> trans_prec_diff)) *
 4361|  2.44k|                       trans_dec_factor;
 4362|  2.44k|  }
 4363|       |
 4364|  55.0k|  int good_shear_params = av1_get_shear_params(params);
 4365|  55.0k|  if (!good_shear_params) return 0;
  ------------------
  |  Branch (4365:7): [True: 12, False: 55.0k]
  ------------------
 4366|       |
 4367|  55.0k|  return 1;
 4368|  55.0k|}
decodeframe.c:read_film_grain:
 4073|  26.2k|                                   struct aom_read_bit_buffer *rb) {
 4074|  26.2k|  if (cm->seq_params->film_grain_params_present &&
  ------------------
  |  Branch (4074:7): [True: 1.04k, False: 25.1k]
  ------------------
 4075|  1.04k|      (cm->show_frame || cm->showable_frame)) {
  ------------------
  |  Branch (4075:8): [True: 787, False: 256]
  |  Branch (4075:26): [True: 248, False: 8]
  ------------------
 4076|  1.03k|    read_film_grain_params(cm, rb);
 4077|  25.1k|  } else {
 4078|  25.1k|    memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
 4079|  25.1k|  }
 4080|  26.2k|  cm->film_grain_params.bit_depth = cm->seq_params->bit_depth;
 4081|  26.2k|  cm->cur_frame->film_grain_params = cm->film_grain_params;
 4082|  26.2k|}
decodeframe.c:read_film_grain_params:
 3908|  1.03k|                                   struct aom_read_bit_buffer *rb) {
 3909|  1.03k|  aom_film_grain_t *pars = &cm->film_grain_params;
 3910|  1.03k|  const SequenceHeader *const seq_params = cm->seq_params;
 3911|       |
 3912|  1.03k|  pars->apply_grain = aom_rb_read_bit(rb);
 3913|  1.03k|  if (!pars->apply_grain) {
  ------------------
  |  Branch (3913:7): [True: 919, False: 116]
  ------------------
 3914|    919|    memset(pars, 0, sizeof(*pars));
 3915|    919|    return;
 3916|    919|  }
 3917|       |
 3918|    116|  pars->random_seed = aom_rb_read_literal(rb, 16);
 3919|    116|  if (cm->current_frame.frame_type == INTER_FRAME)
  ------------------
  |  Branch (3919:7): [True: 33, False: 83]
  ------------------
 3920|     33|    pars->update_parameters = aom_rb_read_bit(rb);
 3921|     83|  else
 3922|     83|    pars->update_parameters = 1;
 3923|       |
 3924|    116|  pars->bit_depth = seq_params->bit_depth;
 3925|       |
 3926|    116|  if (!pars->update_parameters) {
  ------------------
  |  Branch (3926:7): [True: 21, False: 95]
  ------------------
 3927|       |    // inherit parameters from a previous reference frame
 3928|     21|    int film_grain_params_ref_idx = aom_rb_read_literal(rb, 3);
 3929|       |    // Section 6.8.20: It is a requirement of bitstream conformance that
 3930|       |    // film_grain_params_ref_idx is equal to ref_frame_idx[ j ] for some value
 3931|       |    // of j in the range 0 to REFS_PER_FRAME - 1.
 3932|     21|    int found = 0;
 3933|    104|    for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
  ------------------
  |  Branch (3933:21): [True: 99, False: 5]
  ------------------
 3934|     99|      if (film_grain_params_ref_idx == cm->remapped_ref_idx[i]) {
  ------------------
  |  Branch (3934:11): [True: 16, False: 83]
  ------------------
 3935|     16|        found = 1;
 3936|     16|        break;
 3937|     16|      }
 3938|     99|    }
 3939|     21|    if (!found) {
  ------------------
  |  Branch (3939:9): [True: 5, False: 16]
  ------------------
 3940|      5|      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 3941|      5|                         "Invalid film grain reference idx %d. ref_frame_idx = "
 3942|      5|                         "{%d, %d, %d, %d, %d, %d, %d}",
 3943|      5|                         film_grain_params_ref_idx, cm->remapped_ref_idx[0],
 3944|      5|                         cm->remapped_ref_idx[1], cm->remapped_ref_idx[2],
 3945|      5|                         cm->remapped_ref_idx[3], cm->remapped_ref_idx[4],
 3946|      5|                         cm->remapped_ref_idx[5], cm->remapped_ref_idx[6]);
 3947|      5|    }
 3948|     21|    RefCntBuffer *const buf = cm->ref_frame_map[film_grain_params_ref_idx];
 3949|     21|    if (buf == NULL) {
  ------------------
  |  Branch (3949:9): [True: 0, False: 21]
  ------------------
 3950|      0|      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 3951|      0|                         "Invalid Film grain reference idx");
 3952|      0|    }
 3953|     21|    if (!buf->film_grain_params_present) {
  ------------------
  |  Branch (3953:9): [True: 0, False: 21]
  ------------------
 3954|      0|      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 3955|      0|                         "Film grain reference parameters not available");
 3956|      0|    }
 3957|     21|    uint16_t random_seed = pars->random_seed;
 3958|     21|    *pars = buf->film_grain_params;   // inherit paramaters
 3959|     21|    pars->random_seed = random_seed;  // with new random seed
 3960|     21|    return;
 3961|     21|  }
 3962|       |
 3963|       |  // Scaling functions parameters
 3964|     95|  pars->num_y_points = aom_rb_read_literal(rb, 4);  // max 14
 3965|     95|  if (pars->num_y_points > 14)
  ------------------
  |  Branch (3965:7): [True: 7, False: 88]
  ------------------
 3966|      7|    aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 3967|      7|                       "Number of points for film grain luma scaling function "
 3968|      7|                       "exceeds the maximum value.");
 3969|    187|  for (int i = 0; i < pars->num_y_points; i++) {
  ------------------
  |  Branch (3969:19): [True: 92, False: 95]
  ------------------
 3970|     92|    pars->scaling_points_y[i][0] = aom_rb_read_literal(rb, 8);
 3971|     92|    if (i && pars->scaling_points_y[i - 1][0] >= pars->scaling_points_y[i][0])
  ------------------
  |  Branch (3971:9): [True: 37, False: 55]
  |  Branch (3971:14): [True: 11, False: 26]
  ------------------
 3972|     11|      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 3973|     11|                         "First coordinate of the scaling function points "
 3974|     11|                         "shall be increasing.");
 3975|     92|    pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8);
 3976|     92|  }
 3977|       |
 3978|     95|  if (!seq_params->monochrome)
  ------------------
  |  Branch (3978:7): [True: 64, False: 31]
  ------------------
 3979|     64|    pars->chroma_scaling_from_luma = aom_rb_read_bit(rb);
 3980|     31|  else
 3981|     31|    pars->chroma_scaling_from_luma = 0;
 3982|       |
 3983|     95|  if (seq_params->monochrome || pars->chroma_scaling_from_luma ||
  ------------------
  |  Branch (3983:7): [True: 31, False: 64]
  |  Branch (3983:33): [True: 34, False: 30]
  ------------------
 3984|     30|      ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
  ------------------
  |  Branch (3984:8): [True: 4, False: 26]
  |  Branch (3984:44): [True: 4, False: 0]
  ------------------
 3985|     42|       (pars->num_y_points == 0))) {
  ------------------
  |  Branch (3985:8): [True: 2, False: 2]
  ------------------
 3986|     42|    pars->num_cb_points = 0;
 3987|     42|    pars->num_cr_points = 0;
 3988|     53|  } else {
 3989|     53|    pars->num_cb_points = aom_rb_read_literal(rb, 4);  // max 10
 3990|     53|    if (pars->num_cb_points > 10)
  ------------------
  |  Branch (3990:9): [True: 3, False: 50]
  ------------------
 3991|      3|      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 3992|      3|                         "Number of points for film grain cb scaling function "
 3993|      3|                         "exceeds the maximum value.");
 3994|     95|    for (int i = 0; i < pars->num_cb_points; i++) {
  ------------------
  |  Branch (3994:21): [True: 42, False: 53]
  ------------------
 3995|     42|      pars->scaling_points_cb[i][0] = aom_rb_read_literal(rb, 8);
 3996|     42|      if (i &&
  ------------------
  |  Branch (3996:11): [True: 20, False: 22]
  ------------------
 3997|     20|          pars->scaling_points_cb[i - 1][0] >= pars->scaling_points_cb[i][0])
  ------------------
  |  Branch (3997:11): [True: 9, False: 11]
  ------------------
 3998|      9|        aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 3999|      9|                           "First coordinate of the scaling function points "
 4000|      9|                           "shall be increasing.");
 4001|     42|      pars->scaling_points_cb[i][1] = aom_rb_read_literal(rb, 8);
 4002|     42|    }
 4003|       |
 4004|     53|    pars->num_cr_points = aom_rb_read_literal(rb, 4);  // max 10
 4005|     53|    if (pars->num_cr_points > 10)
  ------------------
  |  Branch (4005:9): [True: 3, False: 50]
  ------------------
 4006|      3|      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 4007|      3|                         "Number of points for film grain cr scaling function "
 4008|      3|                         "exceeds the maximum value.");
 4009|     77|    for (int i = 0; i < pars->num_cr_points; i++) {
  ------------------
  |  Branch (4009:21): [True: 24, False: 53]
  ------------------
 4010|     24|      pars->scaling_points_cr[i][0] = aom_rb_read_literal(rb, 8);
 4011|     24|      if (i &&
  ------------------
  |  Branch (4011:11): [True: 14, False: 10]
  ------------------
 4012|     14|          pars->scaling_points_cr[i - 1][0] >= pars->scaling_points_cr[i][0])
  ------------------
  |  Branch (4012:11): [True: 4, False: 10]
  ------------------
 4013|      4|        aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 4014|      4|                           "First coordinate of the scaling function points "
 4015|      4|                           "shall be increasing.");
 4016|     24|      pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8);
 4017|     24|    }
 4018|       |
 4019|     53|    if ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
  ------------------
  |  Branch (4019:9): [True: 0, False: 53]
  |  Branch (4019:45): [True: 0, False: 0]
  ------------------
 4020|      0|        (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) ||
  ------------------
  |  Branch (4020:11): [True: 0, False: 0]
  |  Branch (4020:41): [True: 0, False: 0]
  ------------------
 4021|      0|         ((pars->num_cb_points != 0) && (pars->num_cr_points == 0))))
  ------------------
  |  Branch (4021:11): [True: 0, False: 0]
  |  Branch (4021:41): [True: 0, False: 0]
  ------------------
 4022|      0|      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 4023|      0|                         "In YCbCr 4:2:0, film grain shall be applied "
 4024|      0|                         "to both chroma components or neither.");
 4025|     53|  }
 4026|       |
 4027|     95|  pars->scaling_shift = aom_rb_read_literal(rb, 2) + 8;  // 8 + value
 4028|       |
 4029|       |  // AR coefficients
 4030|       |  // Only sent if the corresponsing scaling function has
 4031|       |  // more than 0 points
 4032|       |
 4033|     95|  pars->ar_coeff_lag = aom_rb_read_literal(rb, 2);
 4034|       |
 4035|     95|  int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
 4036|     95|  int num_pos_chroma = num_pos_luma;
 4037|     95|  if (pars->num_y_points > 0) ++num_pos_chroma;
  ------------------
  |  Branch (4037:7): [True: 27, False: 68]
  ------------------
 4038|       |
 4039|     95|  if (pars->num_y_points)
  ------------------
  |  Branch (4039:7): [True: 27, False: 68]
  ------------------
 4040|    205|    for (int i = 0; i < num_pos_luma; i++)
  ------------------
  |  Branch (4040:21): [True: 178, False: 27]
  ------------------
 4041|    178|      pars->ar_coeffs_y[i] = aom_rb_read_literal(rb, 8) - 128;
 4042|       |
 4043|     95|  if (pars->num_cb_points || pars->chroma_scaling_from_luma)
  ------------------
  |  Branch (4043:7): [True: 59, False: 36]
  |  Branch (4043:30): [True: 26, False: 10]
  ------------------
 4044|    270|    for (int i = 0; i < num_pos_chroma; i++)
  ------------------
  |  Branch (4044:21): [True: 239, False: 31]
  ------------------
 4045|    239|      pars->ar_coeffs_cb[i] = aom_rb_read_literal(rb, 8) - 128;
 4046|       |
 4047|     95|  if (pars->num_cr_points || pars->chroma_scaling_from_luma)
  ------------------
  |  Branch (4047:7): [True: 66, False: 29]
  |  Branch (4047:30): [True: 18, False: 11]
  ------------------
 4048|    147|    for (int i = 0; i < num_pos_chroma; i++)
  ------------------
  |  Branch (4048:21): [True: 127, False: 20]
  ------------------
 4049|    127|      pars->ar_coeffs_cr[i] = aom_rb_read_literal(rb, 8) - 128;
 4050|       |
 4051|     95|  pars->ar_coeff_shift = aom_rb_read_literal(rb, 2) + 6;  // 6 + value
 4052|       |
 4053|     95|  pars->grain_scale_shift = aom_rb_read_literal(rb, 2);
 4054|       |
 4055|     95|  if (pars->num_cb_points) {
  ------------------
  |  Branch (4055:7): [True: 3, False: 92]
  ------------------
 4056|      3|    pars->cb_mult = aom_rb_read_literal(rb, 8);
 4057|      3|    pars->cb_luma_mult = aom_rb_read_literal(rb, 8);
 4058|      3|    pars->cb_offset = aom_rb_read_literal(rb, 9);
 4059|      3|  }
 4060|       |
 4061|     95|  if (pars->num_cr_points) {
  ------------------
  |  Branch (4061:7): [True: 2, False: 93]
  ------------------
 4062|      2|    pars->cr_mult = aom_rb_read_literal(rb, 8);
 4063|      2|    pars->cr_luma_mult = aom_rb_read_literal(rb, 8);
 4064|      2|    pars->cr_offset = aom_rb_read_literal(rb, 9);
 4065|      2|  }
 4066|       |
 4067|     95|  pars->overlap_flag = aom_rb_read_bit(rb);
 4068|       |
 4069|     95|  pars->clip_to_restricted_range = aom_rb_read_bit(rb);
 4070|     95|}
decodeframe.c:setup_frame_info:
 5255|  25.8k|static inline void setup_frame_info(AV1Decoder *pbi) {
 5256|  25.8k|  AV1_COMMON *const cm = &pbi->common;
 5257|       |
 5258|  25.8k|  if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
  ------------------
  |  Branch (5258:7): [True: 2.83k, False: 23.0k]
  ------------------
 5259|  23.0k|      cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
  ------------------
  |  Branch (5259:7): [True: 336, False: 22.6k]
  ------------------
 5260|  22.6k|      cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
  ------------------
  |  Branch (5260:7): [True: 416, False: 22.2k]
  ------------------
 5261|  3.58k|    av1_alloc_restoration_buffers(cm, /*is_sgr_enabled =*/true);
 5262|  12.4k|    for (int p = 0; p < av1_num_planes(cm); p++) {
  ------------------
  |  Branch (5262:21): [True: 8.90k, False: 3.58k]
  ------------------
 5263|  8.90k|      av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0);
 5264|  8.90k|    }
 5265|  3.58k|  }
 5266|       |
 5267|  25.8k|  const int use_highbd = cm->seq_params->use_highbitdepth;
 5268|  25.8k|  const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
  ------------------
  |  |   84|  25.8k|  (((MAX_SB_SIZE) * 2 + (AOM_INTERP_EXTEND) * 2) * \
  |  |  ------------------
  |  |  |  |   32|  25.8k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  25.8k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((MAX_SB_SIZE) * 2 + (AOM_INTERP_EXTEND) * 2) * \
  |  |  ------------------
  |  |  |  |   31|  25.8k|#define AOM_INTERP_EXTEND 4
  |  |  ------------------
  |  |   85|  25.8k|   ((MAX_SB_SIZE) * 2 + (AOM_INTERP_EXTEND) * 2))
  |  |  ------------------
  |  |  |  |   32|  25.8k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  25.8k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                  ((MAX_SB_SIZE) * 2 + (AOM_INTERP_EXTEND) * 2))
  |  |  ------------------
  |  |  |  |   31|  25.8k|#define AOM_INTERP_EXTEND 4
  |  |  ------------------
  ------------------
 5269|  25.8k|  if (pbi->td.mc_buf_size != buf_size) {
  ------------------
  |  Branch (5269:7): [True: 17.2k, False: 8.57k]
  ------------------
 5270|  17.2k|    av1_free_mc_tmp_buf(&pbi->td);
 5271|  17.2k|    allocate_mc_tmp_buf(cm, &pbi->td, buf_size, use_highbd);
 5272|  17.2k|  }
 5273|  25.8k|}
decodeframe.c:allocate_mc_tmp_buf:
 3402|   298k|                                       int use_highbd) {
 3403|   895k|  for (int ref = 0; ref < 2; ref++) {
  ------------------
  |  Branch (3403:21): [True: 596k, False: 298k]
  ------------------
 3404|       |    // The mc_buf/hbd_mc_buf must be zeroed to fix a intermittent valgrind error
 3405|       |    // 'Conditional jump or move depends on uninitialised value' from the loop
 3406|       |    // filter. Uninitialized reads in convolve function (e.g. horiz_4tap path in
 3407|       |    // av1_convolve_2d_sr_avx2()) from mc_buf/hbd_mc_buf are seen to be the
 3408|       |    // potential reason for this issue.
 3409|   596k|    if (use_highbd) {
  ------------------
  |  Branch (3409:9): [True: 178k, False: 418k]
  ------------------
 3410|   178k|      uint16_t *hbd_mc_buf;
 3411|   178k|      CHECK_MEM_ERROR(cm, hbd_mc_buf, (uint16_t *)aom_memalign(16, buf_size));
  ------------------
  |  |   51|   178k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|   178k|  do {                                                    \
  |  |  |  |   69|   178k|    lval = (expr);                                        \
  |  |  |  |   70|   178k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 178k]
  |  |  |  |  ------------------
  |  |  |  |   71|   178k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|   178k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 178k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3412|   178k|      memset(hbd_mc_buf, 0, buf_size);
 3413|   178k|      thread_data->mc_buf[ref] = CONVERT_TO_BYTEPTR(hbd_mc_buf);
  ------------------
  |  |   76|   178k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
 3414|   418k|    } else {
 3415|   418k|      CHECK_MEM_ERROR(cm, thread_data->mc_buf[ref],
  ------------------
  |  |   51|   418k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|   418k|  do {                                                    \
  |  |  |  |   69|   418k|    lval = (expr);                                        \
  |  |  |  |   70|   418k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 418k]
  |  |  |  |  ------------------
  |  |  |  |   71|   418k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|   418k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 418k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3416|   418k|                      (uint8_t *)aom_memalign(16, buf_size));
 3417|   418k|      memset(thread_data->mc_buf[ref], 0, buf_size);
 3418|   418k|    }
 3419|   596k|  }
 3420|   298k|  thread_data->mc_buf_size = buf_size;
 3421|   298k|  thread_data->mc_buf_use_highbd = use_highbd;
 3422|       |
 3423|   298k|  CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst,
  ------------------
  |  |   51|   298k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|   298k|  do {                                                    \
  |  |  |  |   69|   298k|    lval = (expr);                                        \
  |  |  |  |   70|   298k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 298k]
  |  |  |  |  ------------------
  |  |  |  |   71|   298k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|   298k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 298k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3424|   298k|                  aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
 3425|   298k|                                       sizeof(*thread_data->tmp_conv_dst)));
 3426|   298k|  CHECK_MEM_ERROR(cm, thread_data->seg_mask,
  ------------------
  |  |   51|   298k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|   298k|  do {                                                    \
  |  |  |  |   69|   298k|    lval = (expr);                                        \
  |  |  |  |   70|   298k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 298k]
  |  |  |  |  ------------------
  |  |  |  |   71|   298k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|   298k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 298k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3427|   298k|                  (uint8_t *)aom_memalign(
 3428|   298k|                      16, 2 * MAX_SB_SQUARE * sizeof(*thread_data->seg_mask)));
 3429|       |
 3430|   895k|  for (int i = 0; i < 2; ++i) {
  ------------------
  |  Branch (3430:19): [True: 596k, False: 298k]
  ------------------
 3431|   596k|    CHECK_MEM_ERROR(
  ------------------
  |  |   51|   596k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|   596k|  do {                                                    \
  |  |  |  |   69|   596k|    lval = (expr);                                        \
  |  |  |  |   70|   596k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 596k]
  |  |  |  |  ------------------
  |  |  |  |   71|   596k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|   596k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 596k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3432|   596k|        cm, thread_data->tmp_obmc_bufs[i],
 3433|   596k|        aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
 3434|   596k|                             sizeof(*thread_data->tmp_obmc_bufs[i])));
 3435|   596k|  }
 3436|   298k|}
decodeframe.c:decode_tiles_row_mt:
 3750|  13.1k|                                          int start_tile, int end_tile) {
 3751|  13.1k|  AV1_COMMON *const cm = &pbi->common;
 3752|  13.1k|  CommonTileParams *const tiles = &cm->tiles;
 3753|  13.1k|  const int tile_cols = tiles->cols;
 3754|  13.1k|  const int tile_rows = tiles->rows;
 3755|  13.1k|  const int n_tiles = tile_cols * tile_rows;
 3756|  13.1k|  TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
 3757|  13.1k|  const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
  ------------------
  |  |   34|  13.1k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 13.1k, False: 0]
  |  |  ------------------
  ------------------
 3758|  13.1k|  const int single_row = pbi->dec_tile_row >= 0;
 3759|  13.1k|  const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
  ------------------
  |  |   34|  13.1k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 13.1k, False: 0]
  |  |  ------------------
  ------------------
 3760|  13.1k|  const int single_col = pbi->dec_tile_col >= 0;
 3761|  13.1k|  int tile_rows_start;
 3762|  13.1k|  int tile_rows_end;
 3763|  13.1k|  int tile_cols_start;
 3764|  13.1k|  int tile_cols_end;
 3765|  13.1k|  int tile_count_tg;
 3766|  13.1k|  int num_workers = 0;
 3767|  13.1k|  int max_threads;
 3768|  13.1k|  const uint8_t *raw_data_end = NULL;
 3769|  13.1k|  int max_sb_rows = 0;
 3770|       |
 3771|  13.1k|  if (tiles->large_scale) {
  ------------------
  |  Branch (3771:7): [True: 0, False: 13.1k]
  ------------------
 3772|      0|    tile_rows_start = single_row ? dec_tile_row : 0;
  ------------------
  |  Branch (3772:23): [True: 0, False: 0]
  ------------------
 3773|      0|    tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
  ------------------
  |  Branch (3773:21): [True: 0, False: 0]
  ------------------
 3774|      0|    tile_cols_start = single_col ? dec_tile_col : 0;
  ------------------
  |  Branch (3774:23): [True: 0, False: 0]
  ------------------
 3775|      0|    tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
  ------------------
  |  Branch (3775:21): [True: 0, False: 0]
  ------------------
 3776|  13.1k|  } else {
 3777|  13.1k|    tile_rows_start = 0;
 3778|  13.1k|    tile_rows_end = tile_rows;
 3779|  13.1k|    tile_cols_start = 0;
 3780|  13.1k|    tile_cols_end = tile_cols;
 3781|  13.1k|  }
 3782|  13.1k|  tile_count_tg = end_tile - start_tile + 1;
 3783|  13.1k|  max_threads = pbi->max_threads;
 3784|       |
 3785|       |  // No tiles to decode.
 3786|  13.1k|  if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
  ------------------
  |  Branch (3786:7): [True: 0, False: 13.1k]
  |  Branch (3786:43): [True: 0, False: 13.1k]
  ------------------
 3787|       |      // First tile is larger than end_tile.
 3788|  13.1k|      tile_rows_start * tile_cols + tile_cols_start > end_tile ||
  ------------------
  |  Branch (3788:7): [True: 0, False: 13.1k]
  ------------------
 3789|       |      // Last tile is smaller than start_tile.
 3790|  13.1k|      (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile)
  ------------------
  |  Branch (3790:7): [True: 0, False: 13.1k]
  ------------------
 3791|      0|    return data;
 3792|       |
 3793|  13.1k|  assert(tile_rows <= MAX_TILE_ROWS);
 3794|  13.1k|  assert(tile_cols <= MAX_TILE_COLS);
 3795|  13.1k|  assert(tile_count_tg > 0);
 3796|  13.1k|  assert(max_threads > 0);
 3797|  13.1k|  assert(start_tile <= end_tile);
 3798|  13.1k|  assert(start_tile >= 0 && end_tile < n_tiles);
 3799|       |
 3800|  13.1k|  (void)tile_count_tg;
 3801|       |
 3802|  13.1k|  decode_mt_init(pbi);
 3803|       |
 3804|       |  // get tile size in tile group
 3805|  13.1k|#if EXT_TILE_DEBUG
 3806|  13.1k|  if (tiles->large_scale) assert(pbi->ext_tile_debug == 1);
  ------------------
  |  Branch (3806:7): [True: 0, False: 13.1k]
  ------------------
 3807|  13.1k|  if (tiles->large_scale)
  ------------------
  |  Branch (3807:7): [True: 0, False: 13.1k]
  ------------------
 3808|      0|    raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
 3809|  13.1k|  else
 3810|  13.1k|#endif  // EXT_TILE_DEBUG
 3811|  13.1k|    get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
 3812|       |
 3813|  13.1k|  if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
  ------------------
  |  Branch (3813:7): [True: 9.17k, False: 3.98k]
  |  Branch (3813:33): [True: 426, False: 3.55k]
  ------------------
 3814|  9.54k|    if (pbi->tile_data != NULL) {
  ------------------
  |  Branch (3814:9): [True: 426, False: 9.12k]
  ------------------
 3815|    852|      for (int i = 0; i < pbi->allocated_tiles; i++) {
  ------------------
  |  Branch (3815:23): [True: 426, False: 426]
  ------------------
 3816|    426|        TileDataDec *const tile_data = pbi->tile_data + i;
 3817|    426|        av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
 3818|    426|      }
 3819|    426|    }
 3820|  9.54k|    decoder_alloc_tile_data(pbi, n_tiles);
 3821|  9.54k|  }
 3822|  13.1k|  if (pbi->dcb.xd.seg_mask == NULL)
  ------------------
  |  Branch (3822:7): [True: 9.12k, False: 4.03k]
  ------------------
 3823|  13.1k|    CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask,
  ------------------
  |  |   51|  9.12k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  9.12k|  do {                                                    \
  |  |  |  |   69|  9.12k|    lval = (expr);                                        \
  |  |  |  |   70|  9.12k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 9.12k]
  |  |  |  |  ------------------
  |  |  |  |   71|  9.12k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  9.12k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 9.12k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3824|  13.1k|                    (uint8_t *)aom_memalign(
 3825|  13.1k|                        16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask)));
 3826|       |
 3827|  26.9k|  for (int row = 0; row < tile_rows; row++) {
  ------------------
  |  Branch (3827:21): [True: 13.8k, False: 13.1k]
  ------------------
 3828|  28.5k|    for (int col = 0; col < tile_cols; col++) {
  ------------------
  |  Branch (3828:23): [True: 14.6k, False: 13.8k]
  ------------------
 3829|  14.6k|      TileDataDec *tile_data = pbi->tile_data + row * tiles->cols + col;
 3830|  14.6k|      av1_tile_init(&tile_data->tile_info, cm, row, col);
 3831|       |
 3832|  14.6k|      max_sb_rows = AOMMAX(max_sb_rows,
  ------------------
  |  |   35|  14.6k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 734, False: 13.9k]
  |  |  ------------------
  ------------------
 3833|  14.6k|                           av1_get_sb_rows_in_tile(cm, &tile_data->tile_info));
 3834|  14.6k|      num_workers += get_max_row_mt_workers_per_tile(cm, &tile_data->tile_info);
 3835|  14.6k|    }
 3836|  13.8k|  }
 3837|  13.1k|  num_workers = AOMMIN(num_workers, max_threads);
  ------------------
  |  |   34|  13.1k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 12.8k, False: 285]
  |  |  ------------------
  ------------------
 3838|       |
 3839|  13.1k|  if (pbi->allocated_row_mt_sync_rows != max_sb_rows) {
  ------------------
  |  Branch (3839:7): [True: 9.55k, False: 3.60k]
  ------------------
 3840|  20.6k|    for (int i = 0; i < n_tiles; ++i) {
  ------------------
  |  Branch (3840:21): [True: 11.1k, False: 9.55k]
  ------------------
 3841|  11.1k|      TileDataDec *const tile_data = pbi->tile_data + i;
 3842|  11.1k|      av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
 3843|  11.1k|      dec_row_mt_alloc(&tile_data->dec_row_mt_sync, cm, max_sb_rows);
 3844|  11.1k|    }
 3845|  9.55k|    pbi->allocated_row_mt_sync_rows = max_sb_rows;
 3846|  9.55k|  }
 3847|       |
 3848|  13.1k|  tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end,
 3849|  13.1k|                tile_cols_start, tile_cols_end, start_tile, end_tile);
 3850|       |
 3851|  13.1k|  dec_alloc_cb_buf(pbi);
 3852|       |
 3853|  13.1k|  row_mt_frame_init(pbi, tile_rows_start, tile_rows_end, tile_cols_start,
 3854|  13.1k|                    tile_cols_end, start_tile, end_tile, max_sb_rows);
 3855|       |
 3856|  13.1k|  reset_dec_workers(pbi, row_mt_worker_hook, num_workers);
 3857|  13.1k|  launch_dec_workers(pbi, data_end, num_workers);
 3858|  13.1k|  sync_dec_workers(pbi, num_workers);
 3859|       |
 3860|  13.1k|  if (pbi->dcb.corrupted)
  ------------------
  |  Branch (3860:7): [True: 6.10k, False: 7.05k]
  ------------------
 3861|  6.10k|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 3862|  6.10k|                       "Failed to decode tile data");
 3863|       |
 3864|  13.1k|  if (tiles->large_scale) {
  ------------------
  |  Branch (3864:7): [True: 0, False: 13.1k]
  ------------------
 3865|      0|    if (n_tiles == 1) {
  ------------------
  |  Branch (3865:9): [True: 0, False: 0]
  ------------------
 3866|       |      // Find the end of the single tile buffer
 3867|      0|      return aom_reader_find_end(&pbi->tile_data->bit_reader);
 3868|      0|    }
 3869|       |    // Return the end of the last tile buffer
 3870|      0|    return raw_data_end;
 3871|      0|  }
 3872|  13.1k|  TileDataDec *const tile_data = pbi->tile_data + end_tile;
 3873|       |
 3874|  13.1k|  return aom_reader_find_end(&tile_data->bit_reader);
 3875|  13.1k|}
decodeframe.c:decode_mt_init:
 3501|  13.1k|static inline void decode_mt_init(AV1Decoder *pbi) {
 3502|  13.1k|  AV1_COMMON *const cm = &pbi->common;
 3503|  13.1k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 3504|  13.1k|  int worker_idx;
 3505|       |
 3506|       |  // Create workers and thread_data
 3507|  13.1k|  if (pbi->num_workers == 0) {
  ------------------
  |  Branch (3507:7): [True: 9.15k, False: 4.00k]
  ------------------
 3508|  9.15k|    const int num_threads = pbi->max_threads;
 3509|  9.15k|    CHECK_MEM_ERROR(cm, pbi->tile_workers,
  ------------------
  |  |   51|  9.15k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  9.15k|  do {                                                    \
  |  |  |  |   69|  9.15k|    lval = (expr);                                        \
  |  |  |  |   70|  9.15k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 9.15k]
  |  |  |  |  ------------------
  |  |  |  |   71|  9.15k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  9.15k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 9.15k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3510|  9.15k|                    aom_malloc(num_threads * sizeof(*pbi->tile_workers)));
 3511|  9.15k|    CHECK_MEM_ERROR(cm, pbi->thread_data,
  ------------------
  |  |   51|  9.15k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  9.15k|  do {                                                    \
  |  |  |  |   69|  9.15k|    lval = (expr);                                        \
  |  |  |  |   70|  9.15k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 9.15k]
  |  |  |  |  ------------------
  |  |  |  |   71|  9.15k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  9.15k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 9.15k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3512|  9.15k|                    aom_calloc(num_threads, sizeof(*pbi->thread_data)));
 3513|       |
 3514|   299k|    for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) {
  ------------------
  |  Branch (3514:26): [True: 289k, False: 9.15k]
  ------------------
 3515|   289k|      AVxWorker *const worker = &pbi->tile_workers[worker_idx];
 3516|   289k|      DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
 3517|       |
 3518|   289k|      winterface->init(worker);
 3519|   289k|      worker->thread_name = "aom tile worker";
 3520|   289k|      if (worker_idx != 0 && !winterface->reset(worker)) {
  ------------------
  |  Branch (3520:11): [True: 280k, False: 9.15k]
  |  Branch (3520:30): [True: 0, False: 280k]
  ------------------
 3521|      0|        aom_internal_error(&pbi->error, AOM_CODEC_ERROR,
 3522|      0|                           "Tile decoder thread creation failed");
 3523|      0|      }
 3524|   289k|      ++pbi->num_workers;
 3525|       |
 3526|   289k|      if (worker_idx != 0) {
  ------------------
  |  Branch (3526:11): [True: 280k, False: 9.15k]
  ------------------
 3527|       |        // Allocate thread data.
 3528|   280k|        CHECK_MEM_ERROR(cm, thread_data->td,
  ------------------
  |  |   51|   280k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|   280k|  do {                                                    \
  |  |  |  |   69|   280k|    lval = (expr);                                        \
  |  |  |  |   70|   280k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 280k]
  |  |  |  |  ------------------
  |  |  |  |   71|   280k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|   280k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 280k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3529|   280k|                        aom_memalign(32, sizeof(*thread_data->td)));
 3530|   280k|        av1_zero(*thread_data->td);
  ------------------
  |  |   43|   280k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 3531|   280k|      } else {
 3532|       |        // Main thread acts as a worker and uses the thread data in pbi
 3533|  9.15k|        thread_data->td = &pbi->td;
 3534|  9.15k|      }
 3535|   289k|      thread_data->error_info.error_code = AOM_CODEC_OK;
 3536|   289k|      thread_data->error_info.setjmp = 0;
 3537|   289k|    }
 3538|  9.15k|  }
 3539|  13.1k|  const int use_highbd = cm->seq_params->use_highbitdepth;
 3540|  13.1k|  const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
  ------------------
  |  |   84|  13.1k|  (((MAX_SB_SIZE) * 2 + (AOM_INTERP_EXTEND) * 2) * \
  |  |  ------------------
  |  |  |  |   32|  13.1k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  13.1k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((MAX_SB_SIZE) * 2 + (AOM_INTERP_EXTEND) * 2) * \
  |  |  ------------------
  |  |  |  |   31|  13.1k|#define AOM_INTERP_EXTEND 4
  |  |  ------------------
  |  |   85|  13.1k|   ((MAX_SB_SIZE) * 2 + (AOM_INTERP_EXTEND) * 2))
  |  |  ------------------
  |  |  |  |   32|  13.1k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  13.1k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                  ((MAX_SB_SIZE) * 2 + (AOM_INTERP_EXTEND) * 2))
  |  |  ------------------
  |  |  |  |   31|  13.1k|#define AOM_INTERP_EXTEND 4
  |  |  ------------------
  ------------------
 3541|   406k|  for (worker_idx = 1; worker_idx < pbi->max_threads; ++worker_idx) {
  ------------------
  |  Branch (3541:24): [True: 392k, False: 13.1k]
  ------------------
 3542|   392k|    DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
 3543|   392k|    if (thread_data->td->mc_buf_size != buf_size) {
  ------------------
  |  Branch (3543:9): [True: 281k, False: 111k]
  ------------------
 3544|   281k|      av1_free_mc_tmp_buf(thread_data->td);
 3545|   281k|      allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd);
 3546|   281k|    }
 3547|   392k|  }
 3548|  13.1k|}
decodeframe.c:mem_get_varsize:
 2219|  2.04k|static size_t mem_get_varsize(const uint8_t *src, int sz) {
 2220|  2.04k|  switch (sz) {
 2221|  1.52k|    case 1: return src[0];
  ------------------
  |  Branch (2221:5): [True: 1.52k, False: 518]
  ------------------
 2222|    474|    case 2: return mem_get_le16(src);
  ------------------
  |  |  101|    474|#define mem_get_le16 mem_ops_wrap_symbol(mem_get_le16)
  |  |  ------------------
  |  |  |  |   51|    474|#define mem_ops_wrap_symbol(fn) mem_ops_wrap_symbol2(fn, MEM_VALUE_T)
  |  |  |  |  ------------------
  |  |  |  |  |  |   53|    474|#define mem_ops_wrap_symbol2(fn, typ) mem_ops_wrap_symbol3(fn, typ)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   55|    474|#define mem_ops_wrap_symbol3(fn, typ) fn##_as_##typ
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (2222:5): [True: 474, False: 1.56k]
  ------------------
 2223|     16|    case 3: return mem_get_le24(src);
  ------------------
  |  |  112|     16|#define mem_get_le24 mem_ops_wrap_symbol(mem_get_le24)
  |  |  ------------------
  |  |  |  |   51|     16|#define mem_ops_wrap_symbol(fn) mem_ops_wrap_symbol2(fn, MEM_VALUE_T)
  |  |  |  |  ------------------
  |  |  |  |  |  |   53|     16|#define mem_ops_wrap_symbol2(fn, typ) mem_ops_wrap_symbol3(fn, typ)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   55|     16|#define mem_ops_wrap_symbol3(fn, typ) fn##_as_##typ
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (2223:5): [True: 16, False: 2.02k]
  ------------------
 2224|     28|    case 4: return mem_get_le32(src);
  ------------------
  |  |  124|     28|#define mem_get_le32 mem_ops_wrap_symbol(mem_get_le32)
  |  |  ------------------
  |  |  |  |   51|     28|#define mem_ops_wrap_symbol(fn) mem_ops_wrap_symbol2(fn, MEM_VALUE_T)
  |  |  |  |  ------------------
  |  |  |  |  |  |   53|     28|#define mem_ops_wrap_symbol2(fn, typ) mem_ops_wrap_symbol3(fn, typ)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   55|     28|#define mem_ops_wrap_symbol3(fn, typ) fn##_as_##typ
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (2224:5): [True: 28, False: 2.01k]
  ------------------
 2225|      0|    default: assert(0 && "Invalid size"); return -1;
  ------------------
  |  Branch (2225:5): [True: 0, False: 2.04k]
  ------------------
 2226|  2.04k|  }
 2227|  2.04k|}
decodeframe.c:read_is_valid:
  135|  29.3k|static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
  136|  29.3k|  return len != 0 && len <= (size_t)(end - start);
  ------------------
  |  Branch (136:10): [True: 29.3k, False: 0]
  |  Branch (136:22): [True: 29.3k, False: 2]
  ------------------
  137|  29.3k|}
decodeframe.c:get_tile_buffers:
 2429|  25.8k|    int end_tile) {
 2430|  25.8k|  AV1_COMMON *const cm = &pbi->common;
 2431|  25.8k|  const int tile_cols = cm->tiles.cols;
 2432|  25.8k|  const int tile_rows = cm->tiles.rows;
 2433|  25.8k|  int tc = 0;
 2434|       |
 2435|  52.7k|  for (int r = 0; r < tile_rows; ++r) {
  ------------------
  |  Branch (2435:19): [True: 26.9k, False: 25.8k]
  ------------------
 2436|  54.8k|    for (int c = 0; c < tile_cols; ++c, ++tc) {
  ------------------
  |  Branch (2436:21): [True: 27.9k, False: 26.9k]
  ------------------
 2437|  27.9k|      TileBufferDec *const buf = &tile_buffers[r][c];
 2438|       |
 2439|  27.9k|      const int is_last = (tc == end_tile);
 2440|  27.9k|      const size_t hdr_offset = 0;
 2441|       |
 2442|  27.9k|      if (tc < start_tile || tc > end_tile) continue;
  ------------------
  |  Branch (2442:11): [True: 16, False: 27.8k]
  |  Branch (2442:30): [True: 24, False: 27.8k]
  ------------------
 2443|       |
 2444|  27.8k|      if (data + hdr_offset >= data_end)
  ------------------
  |  Branch (2444:11): [True: 13, False: 27.8k]
  ------------------
 2445|     13|        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 2446|     13|                           "Data ended before all tiles were read.");
 2447|  27.8k|      data += hdr_offset;
 2448|  27.8k|      get_tile_buffer(data_end, pbi->tile_size_bytes, is_last, &pbi->error,
 2449|  27.8k|                      &data, buf);
 2450|  27.8k|    }
 2451|  26.9k|  }
 2452|  25.8k|}
decodeframe.c:get_tile_buffer:
 2402|  27.8k|                                   TileBufferDec *const buf) {
 2403|  27.8k|  size_t size;
 2404|       |
 2405|  27.8k|  if (!is_last) {
  ------------------
  |  Branch (2405:7): [True: 2.04k, False: 25.8k]
  ------------------
 2406|  2.04k|    if (!read_is_valid(*data, tile_size_bytes, data_end))
  ------------------
  |  Branch (2406:9): [True: 2, False: 2.04k]
  ------------------
 2407|      2|      aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
 2408|      2|                         "Not enough data to read tile size");
 2409|       |
 2410|  2.04k|    size = mem_get_varsize(*data, tile_size_bytes) + AV1_MIN_TILE_SIZE_BYTES;
  ------------------
  |  |   55|  2.04k|#define AV1_MIN_TILE_SIZE_BYTES 1
  ------------------
 2411|  2.04k|    *data += tile_size_bytes;
 2412|       |
 2413|  2.04k|    if (size > (size_t)(data_end - *data))
  ------------------
  |  Branch (2413:9): [True: 44, False: 2.00k]
  ------------------
 2414|     44|      aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
 2415|     44|                         "Truncated packet or corrupt tile size");
 2416|  25.8k|  } else {
 2417|  25.8k|    size = data_end - *data;
 2418|  25.8k|  }
 2419|       |
 2420|  27.8k|  buf->data = *data;
 2421|  27.8k|  buf->size = size;
 2422|       |
 2423|  27.8k|  *data += size;
 2424|  27.8k|}
decodeframe.c:decoder_alloc_tile_data:
 2476|  17.9k|static inline void decoder_alloc_tile_data(AV1Decoder *pbi, const int n_tiles) {
 2477|  17.9k|  AV1_COMMON *const cm = &pbi->common;
 2478|  17.9k|  aom_free(pbi->tile_data);
 2479|  17.9k|  pbi->allocated_tiles = 0;
 2480|  17.9k|  CHECK_MEM_ERROR(cm, pbi->tile_data,
  ------------------
  |  |   51|  17.9k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  17.9k|  do {                                                    \
  |  |  |  |   69|  17.9k|    lval = (expr);                                        \
  |  |  |  |   70|  17.9k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 17.9k]
  |  |  |  |  ------------------
  |  |  |  |   71|  17.9k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  17.9k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 17.9k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 2481|  17.9k|                  aom_memalign(32, n_tiles * sizeof(*pbi->tile_data)));
 2482|  17.9k|  pbi->allocated_tiles = n_tiles;
 2483|  37.8k|  for (int i = 0; i < n_tiles; i++) {
  ------------------
  |  Branch (2483:19): [True: 19.9k, False: 17.9k]
  ------------------
 2484|  19.9k|    TileDataDec *const tile_data = pbi->tile_data + i;
 2485|  19.9k|    av1_zero(tile_data->dec_row_mt_sync);
  ------------------
  |  |   43|  19.9k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 2486|  19.9k|  }
 2487|  17.9k|  pbi->allocated_row_mt_sync_rows = 0;
 2488|  17.9k|}
decodeframe.c:get_max_row_mt_workers_per_tile:
 3013|  36.0k|                                                  const TileInfo *tile) {
 3014|       |  // NOTE: Currently value of max workers is calculated based
 3015|       |  // on the parse and decode time. As per the theoretical estimate
 3016|       |  // when percentage of parse time is equal to percentage of decode
 3017|       |  // time, number of workers needed to parse + decode a tile can not
 3018|       |  // exceed more than 2.
 3019|       |  // TODO(any): Modify this value if parsing is optimized in future.
 3020|  36.0k|  int sb_rows = av1_get_sb_rows_in_tile(cm, tile);
 3021|  36.0k|  int max_workers =
 3022|  36.0k|      sb_rows == 1 ? AOM_MIN_THREADS_PER_TILE : AOM_MAX_THREADS_PER_TILE;
  ------------------
  |  |   78|  9.42k|#define AOM_MIN_THREADS_PER_TILE 1
  ------------------
                    sb_rows == 1 ? AOM_MIN_THREADS_PER_TILE : AOM_MAX_THREADS_PER_TILE;
  ------------------
  |  |   79|  26.5k|#define AOM_MAX_THREADS_PER_TILE 2
  ------------------
  |  Branch (3022:7): [True: 9.42k, False: 26.5k]
  ------------------
 3023|  36.0k|  return max_workers;
 3024|  36.0k|}
decodeframe.c:dec_row_mt_alloc:
 2510|  11.1k|                                    AV1_COMMON *cm, int rows) {
 2511|  11.1k|  dec_row_mt_sync->allocated_sb_rows = rows;
 2512|  11.1k|#if CONFIG_MULTITHREAD
 2513|  11.1k|  {
 2514|  11.1k|    int i;
 2515|       |
 2516|  11.1k|    CHECK_MEM_ERROR(cm, dec_row_mt_sync->mutex_,
  ------------------
  |  |   51|  11.1k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  11.1k|  do {                                                    \
  |  |  |  |   69|  11.1k|    lval = (expr);                                        \
  |  |  |  |   70|  11.1k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 11.1k]
  |  |  |  |  ------------------
  |  |  |  |   71|  11.1k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  11.1k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 11.1k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 2517|  11.1k|                    aom_malloc(sizeof(*(dec_row_mt_sync->mutex_)) * rows));
 2518|  11.1k|    if (dec_row_mt_sync->mutex_) {
  ------------------
  |  Branch (2518:9): [True: 11.1k, False: 0]
  ------------------
 2519|  36.5k|      for (i = 0; i < rows; ++i) {
  ------------------
  |  Branch (2519:19): [True: 25.4k, False: 11.1k]
  ------------------
 2520|  25.4k|        pthread_mutex_init(&dec_row_mt_sync->mutex_[i], NULL);
 2521|  25.4k|      }
 2522|  11.1k|    }
 2523|       |
 2524|  11.1k|    CHECK_MEM_ERROR(cm, dec_row_mt_sync->cond_,
  ------------------
  |  |   51|  11.1k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  11.1k|  do {                                                    \
  |  |  |  |   69|  11.1k|    lval = (expr);                                        \
  |  |  |  |   70|  11.1k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 11.1k]
  |  |  |  |  ------------------
  |  |  |  |   71|  11.1k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  11.1k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 11.1k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 2525|  11.1k|                    aom_malloc(sizeof(*(dec_row_mt_sync->cond_)) * rows));
 2526|  11.1k|    if (dec_row_mt_sync->cond_) {
  ------------------
  |  Branch (2526:9): [True: 11.1k, False: 0]
  ------------------
 2527|  36.5k|      for (i = 0; i < rows; ++i) {
  ------------------
  |  Branch (2527:19): [True: 25.4k, False: 11.1k]
  ------------------
 2528|  25.4k|        pthread_cond_init(&dec_row_mt_sync->cond_[i], NULL);
 2529|  25.4k|      }
 2530|  11.1k|    }
 2531|  11.1k|  }
 2532|  11.1k|#endif  // CONFIG_MULTITHREAD
 2533|       |
 2534|  11.1k|  CHECK_MEM_ERROR(cm, dec_row_mt_sync->cur_sb_col,
  ------------------
  |  |   51|  11.1k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  11.1k|  do {                                                    \
  |  |  |  |   69|  11.1k|    lval = (expr);                                        \
  |  |  |  |   70|  11.1k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 11.1k]
  |  |  |  |  ------------------
  |  |  |  |   71|  11.1k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  11.1k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 11.1k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 2535|  11.1k|                  aom_malloc(sizeof(*(dec_row_mt_sync->cur_sb_col)) * rows));
 2536|       |
 2537|       |  // Set up nsync.
 2538|  11.1k|  dec_row_mt_sync->sync_range = get_sync_range(cm->width);
 2539|  11.1k|}
decodeframe.c:get_sync_range:
 2491|  11.1k|static inline int get_sync_range(int width) {
 2492|       |// nsync numbers are picked by testing.
 2493|       |#if 0
 2494|       |  if (width < 640)
 2495|       |    return 1;
 2496|       |  else if (width <= 1280)
 2497|       |    return 2;
 2498|       |  else if (width <= 4096)
 2499|       |    return 4;
 2500|       |  else
 2501|       |    return 8;
 2502|       |#else
 2503|  11.1k|  (void)width;
 2504|  11.1k|#endif
 2505|  11.1k|  return 1;
 2506|  11.1k|}
decodeframe.c:tile_mt_queue:
 3553|  13.1k|                                 int start_tile, int end_tile) {
 3554|  13.1k|  AV1_COMMON *const cm = &pbi->common;
 3555|  13.1k|  if (pbi->tile_mt_info.alloc_tile_cols != tile_cols ||
  ------------------
  |  Branch (3555:7): [True: 9.13k, False: 3.96k]
  ------------------
 3556|  9.54k|      pbi->tile_mt_info.alloc_tile_rows != tile_rows) {
  ------------------
  |  Branch (3556:7): [True: 410, False: 3.55k]
  ------------------
 3557|  9.54k|    av1_dealloc_dec_jobs(&pbi->tile_mt_info);
 3558|  9.54k|    alloc_dec_jobs(&pbi->tile_mt_info, cm, tile_rows, tile_cols);
 3559|  9.54k|  }
 3560|  13.1k|  enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start,
 3561|  13.1k|                    tile_cols_end, start_tile, end_tile);
 3562|  13.1k|  qsort(pbi->tile_mt_info.job_queue, pbi->tile_mt_info.jobs_enqueued,
 3563|  13.1k|        sizeof(pbi->tile_mt_info.job_queue[0]), compare_tile_buffers);
 3564|  13.1k|}
decodeframe.c:alloc_dec_jobs:
 3360|  9.54k|                                  int tile_rows, int tile_cols) {
 3361|  9.54k|  tile_mt_info->alloc_tile_rows = tile_rows;
 3362|  9.54k|  tile_mt_info->alloc_tile_cols = tile_cols;
 3363|  9.54k|  int num_tiles = tile_rows * tile_cols;
 3364|  9.54k|#if CONFIG_MULTITHREAD
 3365|  9.54k|  {
 3366|  9.54k|    CHECK_MEM_ERROR(cm, tile_mt_info->job_mutex,
  ------------------
  |  |   51|  9.54k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  9.54k|  do {                                                    \
  |  |  |  |   69|  9.54k|    lval = (expr);                                        \
  |  |  |  |   70|  9.54k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 9.54k]
  |  |  |  |  ------------------
  |  |  |  |   71|  9.54k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  9.54k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 9.54k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3367|  9.54k|                    aom_malloc(sizeof(*tile_mt_info->job_mutex) * num_tiles));
 3368|       |
 3369|  20.6k|    for (int i = 0; i < num_tiles; i++) {
  ------------------
  |  Branch (3369:21): [True: 11.1k, False: 9.54k]
  ------------------
 3370|  11.1k|      pthread_mutex_init(&tile_mt_info->job_mutex[i], NULL);
 3371|  11.1k|    }
 3372|  9.54k|  }
 3373|  9.54k|#endif
 3374|  9.54k|  CHECK_MEM_ERROR(cm, tile_mt_info->job_queue,
  ------------------
  |  |   51|  9.54k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  9.54k|  do {                                                    \
  |  |  |  |   69|  9.54k|    lval = (expr);                                        \
  |  |  |  |   70|  9.54k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 9.54k]
  |  |  |  |  ------------------
  |  |  |  |   71|  9.54k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  9.54k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 9.54k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3375|  9.54k|                  aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles));
 3376|  9.54k|}
decodeframe.c:enqueue_tile_jobs:
 3340|  13.1k|                                     int start_tile, int end_tile) {
 3341|  13.1k|  AV1DecTileMT *tile_mt_info = &pbi->tile_mt_info;
 3342|  13.1k|  TileJobsDec *tile_job_queue = tile_mt_info->job_queue;
 3343|  13.1k|  tile_mt_info->jobs_enqueued = 0;
 3344|  13.1k|  tile_mt_info->jobs_dequeued = 0;
 3345|       |
 3346|  26.9k|  for (int row = tile_rows_start; row < tile_rows_end; row++) {
  ------------------
  |  Branch (3346:35): [True: 13.8k, False: 13.1k]
  ------------------
 3347|  28.5k|    for (int col = tile_cols_start; col < tile_cols_end; col++) {
  ------------------
  |  Branch (3347:37): [True: 14.6k, False: 13.8k]
  ------------------
 3348|  14.6k|      if (row * cm->tiles.cols + col < start_tile ||
  ------------------
  |  Branch (3348:11): [True: 12, False: 14.6k]
  ------------------
 3349|  14.6k|          row * cm->tiles.cols + col > end_tile)
  ------------------
  |  Branch (3349:11): [True: 18, False: 14.6k]
  ------------------
 3350|     30|        continue;
 3351|  14.6k|      tile_job_queue->tile_buffer = &pbi->tile_buffers[row][col];
 3352|  14.6k|      tile_job_queue->tile_data = pbi->tile_data + row * cm->tiles.cols + col;
 3353|  14.6k|      tile_job_queue++;
 3354|  14.6k|      tile_mt_info->jobs_enqueued++;
 3355|  14.6k|    }
 3356|  13.8k|  }
 3357|  13.1k|}
decodeframe.c:compare_tile_buffers:
 3331|  3.01k|static int compare_tile_buffers(const void *a, const void *b) {
 3332|  3.01k|  const TileJobsDec *const buf1 = (const TileJobsDec *)a;
 3333|  3.01k|  const TileJobsDec *const buf2 = (const TileJobsDec *)b;
 3334|  3.01k|  return (((int)buf2->tile_buffer->size) - ((int)buf1->tile_buffer->size));
 3335|  3.01k|}
decodeframe.c:dec_alloc_cb_buf:
 3666|  13.1k|static inline void dec_alloc_cb_buf(AV1Decoder *pbi) {
 3667|  13.1k|  AV1_COMMON *const cm = &pbi->common;
 3668|  13.1k|  int size = ((cm->mi_params.mi_rows >> cm->seq_params->mib_size_log2) + 1) *
 3669|  13.1k|             ((cm->mi_params.mi_cols >> cm->seq_params->mib_size_log2) + 1);
 3670|       |
 3671|  13.1k|  if (pbi->cb_buffer_alloc_size < size) {
  ------------------
  |  Branch (3671:7): [True: 9.13k, False: 3.97k]
  ------------------
 3672|  9.13k|    av1_dec_free_cb_buf(pbi);
 3673|  9.13k|    CHECK_MEM_ERROR(cm, pbi->cb_buffer_base,
  ------------------
  |  |   51|  9.13k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  9.13k|  do {                                                    \
  |  |  |  |   69|  9.13k|    lval = (expr);                                        \
  |  |  |  |   70|  9.13k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 9.13k]
  |  |  |  |  ------------------
  |  |  |  |   71|  9.13k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  9.13k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 9.13k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3674|  9.13k|                    aom_memalign(32, sizeof(*pbi->cb_buffer_base) * size));
 3675|  9.13k|    memset(pbi->cb_buffer_base, 0, sizeof(*pbi->cb_buffer_base) * size);
 3676|  9.13k|    pbi->cb_buffer_alloc_size = size;
 3677|  9.13k|  }
 3678|  13.1k|}
decodeframe.c:row_mt_frame_init:
 3683|  13.1k|                                     int end_tile, int max_sb_rows) {
 3684|  13.1k|  AV1_COMMON *const cm = &pbi->common;
 3685|  13.1k|  AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
 3686|       |
 3687|  13.1k|  frame_row_mt_info->tile_rows_start = tile_rows_start;
 3688|  13.1k|  frame_row_mt_info->tile_rows_end = tile_rows_end;
 3689|  13.1k|  frame_row_mt_info->tile_cols_start = tile_cols_start;
 3690|  13.1k|  frame_row_mt_info->tile_cols_end = tile_cols_end;
 3691|  13.1k|  frame_row_mt_info->start_tile = start_tile;
 3692|  13.1k|  frame_row_mt_info->end_tile = end_tile;
 3693|  13.1k|  frame_row_mt_info->mi_rows_to_decode = 0;
 3694|  13.1k|  frame_row_mt_info->mi_rows_parse_done = 0;
 3695|  13.1k|  frame_row_mt_info->mi_rows_decode_started = 0;
 3696|  13.1k|  frame_row_mt_info->row_mt_exit = 0;
 3697|       |
 3698|  26.9k|  for (int tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
  ------------------
  |  Branch (3698:40): [True: 13.8k, False: 13.1k]
  ------------------
 3699|  28.5k|    for (int tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
  ------------------
  |  Branch (3699:42): [True: 14.6k, False: 13.8k]
  ------------------
 3700|  14.6k|      if (tile_row * cm->tiles.cols + tile_col < start_tile ||
  ------------------
  |  Branch (3700:11): [True: 12, False: 14.6k]
  ------------------
 3701|  14.6k|          tile_row * cm->tiles.cols + tile_col > end_tile)
  ------------------
  |  Branch (3701:11): [True: 18, False: 14.6k]
  ------------------
 3702|     30|        continue;
 3703|       |
 3704|  14.6k|      TileDataDec *const tile_data =
 3705|  14.6k|          pbi->tile_data + tile_row * cm->tiles.cols + tile_col;
 3706|  14.6k|      const TileInfo *const tile_info = &tile_data->tile_info;
 3707|       |
 3708|  14.6k|      tile_data->dec_row_mt_sync.mi_rows_parse_done = 0;
 3709|  14.6k|      tile_data->dec_row_mt_sync.mi_rows_decode_started = 0;
 3710|  14.6k|      tile_data->dec_row_mt_sync.num_threads_working = 0;
 3711|  14.6k|      tile_data->dec_row_mt_sync.mi_rows =
 3712|  14.6k|          ALIGN_POWER_OF_TWO(tile_info->mi_row_end - tile_info->mi_row_start,
  ------------------
  |  |   69|  14.6k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
 3713|  14.6k|                             cm->seq_params->mib_size_log2);
 3714|  14.6k|      tile_data->dec_row_mt_sync.mi_cols =
 3715|  14.6k|          ALIGN_POWER_OF_TWO(tile_info->mi_col_end - tile_info->mi_col_start,
  ------------------
  |  |   69|  14.6k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
 3716|  14.6k|                             cm->seq_params->mib_size_log2);
 3717|  14.6k|      tile_data->dec_row_mt_sync.intrabc_extra_top_right_sb_delay =
 3718|  14.6k|          av1_get_intrabc_extra_top_right_sb_delay(cm);
 3719|       |
 3720|  14.6k|      frame_row_mt_info->mi_rows_to_decode +=
 3721|  14.6k|          tile_data->dec_row_mt_sync.mi_rows;
 3722|       |
 3723|       |      // Initialize cur_sb_col to -1 for all SB rows.
 3724|  14.6k|      memset(tile_data->dec_row_mt_sync.cur_sb_col, -1,
 3725|  14.6k|             sizeof(*tile_data->dec_row_mt_sync.cur_sb_col) * max_sb_rows);
 3726|  14.6k|    }
 3727|  13.8k|  }
 3728|       |
 3729|  13.1k|#if CONFIG_MULTITHREAD
 3730|  13.1k|  if (pbi->row_mt_mutex_ == NULL) {
  ------------------
  |  Branch (3730:7): [True: 9.12k, False: 3.98k]
  ------------------
 3731|  9.12k|    CHECK_MEM_ERROR(cm, pbi->row_mt_mutex_,
  ------------------
  |  |   51|  9.12k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  9.12k|  do {                                                    \
  |  |  |  |   69|  9.12k|    lval = (expr);                                        \
  |  |  |  |   70|  9.12k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 9.12k]
  |  |  |  |  ------------------
  |  |  |  |   71|  9.12k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  9.12k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 9.12k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3732|  9.12k|                    aom_malloc(sizeof(*(pbi->row_mt_mutex_))));
 3733|  9.12k|    if (pbi->row_mt_mutex_) {
  ------------------
  |  Branch (3733:9): [True: 9.12k, False: 0]
  ------------------
 3734|  9.12k|      pthread_mutex_init(pbi->row_mt_mutex_, NULL);
 3735|  9.12k|    }
 3736|  9.12k|  }
 3737|       |
 3738|  13.1k|  if (pbi->row_mt_cond_ == NULL) {
  ------------------
  |  Branch (3738:7): [True: 9.12k, False: 3.98k]
  ------------------
 3739|  9.12k|    CHECK_MEM_ERROR(cm, pbi->row_mt_cond_,
  ------------------
  |  |   51|  9.12k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  9.12k|  do {                                                    \
  |  |  |  |   69|  9.12k|    lval = (expr);                                        \
  |  |  |  |   70|  9.12k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 9.12k]
  |  |  |  |  ------------------
  |  |  |  |   71|  9.12k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  9.12k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 9.12k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3740|  9.12k|                    aom_malloc(sizeof(*(pbi->row_mt_cond_))));
 3741|  9.12k|    if (pbi->row_mt_cond_) {
  ------------------
  |  Branch (3741:9): [True: 9.12k, False: 0]
  ------------------
 3742|       |      pthread_cond_init(pbi->row_mt_cond_, NULL);
 3743|  9.12k|    }
 3744|  9.12k|  }
 3745|  13.1k|#endif
 3746|  13.1k|}
decodeframe.c:reset_dec_workers:
 3439|  13.1k|                                     int num_workers) {
 3440|  13.1k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 3441|       |
 3442|       |  // Reset tile decoding hook
 3443|  36.1k|  for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
  ------------------
  |  Branch (3443:28): [True: 22.9k, False: 13.1k]
  ------------------
 3444|  22.9k|    AVxWorker *const worker = &pbi->tile_workers[worker_idx];
 3445|  22.9k|    DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
 3446|  22.9k|    thread_data->td->dcb = pbi->dcb;
 3447|  22.9k|    thread_data->td->dcb.corrupted = 0;
 3448|  22.9k|    thread_data->td->dcb.mc_buf[0] = thread_data->td->mc_buf[0];
 3449|  22.9k|    thread_data->td->dcb.mc_buf[1] = thread_data->td->mc_buf[1];
 3450|  22.9k|    thread_data->td->dcb.xd.tmp_conv_dst = thread_data->td->tmp_conv_dst;
 3451|  22.9k|    if (worker_idx)
  ------------------
  |  Branch (3451:9): [True: 9.89k, False: 13.1k]
  ------------------
 3452|  9.89k|      thread_data->td->dcb.xd.seg_mask = thread_data->td->seg_mask;
 3453|  68.9k|    for (int j = 0; j < 2; ++j) {
  ------------------
  |  Branch (3453:21): [True: 45.9k, False: 22.9k]
  ------------------
 3454|  45.9k|      thread_data->td->dcb.xd.tmp_obmc_bufs[j] =
 3455|  45.9k|          thread_data->td->tmp_obmc_bufs[j];
 3456|  45.9k|    }
 3457|  22.9k|    winterface->sync(worker);
 3458|       |
 3459|  22.9k|    worker->hook = worker_hook;
 3460|  22.9k|    worker->data1 = thread_data;
 3461|  22.9k|    worker->data2 = pbi;
 3462|  22.9k|  }
 3463|       |#if CONFIG_ACCOUNTING
 3464|       |  if (pbi->acct_enabled) {
 3465|       |    aom_accounting_reset(&pbi->accounting);
 3466|       |  }
 3467|       |#endif
 3468|  13.1k|}
decodeframe.c:row_mt_worker_hook:
 3203|  22.9k|static int row_mt_worker_hook(void *arg1, void *arg2) {
 3204|  22.9k|  DecWorkerData *const thread_data = (DecWorkerData *)arg1;
 3205|  22.9k|  AV1Decoder *const pbi = (AV1Decoder *)arg2;
 3206|  22.9k|  ThreadData *const td = thread_data->td;
 3207|  22.9k|  uint8_t allow_update_cdf;
 3208|  22.9k|  AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
 3209|  22.9k|  td->dcb.corrupted = 0;
 3210|       |
 3211|       |  // The jmp_buf is valid only for the duration of the function that calls
 3212|       |  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
 3213|       |  // before it returns.
 3214|  22.9k|  if (setjmp(thread_data->error_info.jmp)) {
  ------------------
  |  Branch (3214:7): [True: 712, False: 22.2k]
  ------------------
 3215|    712|    thread_data->error_info.setjmp = 0;
 3216|    712|    thread_data->td->dcb.corrupted = 1;
 3217|    712|#if CONFIG_MULTITHREAD
 3218|    712|    pthread_mutex_lock(pbi->row_mt_mutex_);
 3219|    712|#endif
 3220|    712|    frame_row_mt_info->row_mt_exit = 1;
 3221|    712|#if CONFIG_MULTITHREAD
 3222|    712|    pthread_cond_broadcast(pbi->row_mt_cond_);
 3223|    712|    pthread_mutex_unlock(pbi->row_mt_mutex_);
 3224|    712|#endif
 3225|       |    // If any SB row (erroneous row) processed by a thread encounters an
 3226|       |    // internal error, there is a need to indicate other threads that decoding
 3227|       |    // of the erroneous row is complete. This ensures that other threads which
 3228|       |    // wait upon the completion of SB's present in erroneous row are not waiting
 3229|       |    // indefinitely.
 3230|    712|    signal_decoding_done_for_erroneous_row(pbi, &thread_data->td->dcb.xd);
 3231|    712|    return 0;
 3232|    712|  }
 3233|  22.2k|  thread_data->error_info.setjmp = 1;
 3234|       |
 3235|  22.2k|  AV1_COMMON *cm = &pbi->common;
 3236|  22.2k|  allow_update_cdf = cm->tiles.large_scale ? 0 : 1;
  ------------------
  |  Branch (3236:22): [True: 0, False: 22.2k]
  ------------------
 3237|  22.9k|  allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update;
  ------------------
  |  Branch (3237:22): [True: 22.9k, False: 18.4E]
  |  Branch (3237:42): [True: 20.8k, False: 2.11k]
  ------------------
 3238|       |
 3239|  22.2k|  set_decode_func_pointers(td, 0x1);
 3240|       |
 3241|  22.2k|  assert(cm->tiles.cols > 0);
 3242|  36.9k|  while (!td->dcb.corrupted) {
  ------------------
  |  Branch (3242:10): [True: 30.0k, False: 6.84k]
  ------------------
 3243|  30.0k|    TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
 3244|       |
 3245|  30.0k|    if (cur_job_info != NULL) {
  ------------------
  |  Branch (3245:9): [True: 14.6k, False: 15.4k]
  ------------------
 3246|  14.6k|      const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer;
 3247|  14.6k|      TileDataDec *const tile_data = cur_job_info->tile_data;
 3248|  14.6k|      tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data,
 3249|  14.6k|                            allow_update_cdf);
 3250|  14.6k|#if CONFIG_MULTITHREAD
 3251|  14.6k|      pthread_mutex_lock(pbi->row_mt_mutex_);
 3252|  14.6k|#endif
 3253|  14.6k|      tile_data->dec_row_mt_sync.num_threads_working++;
 3254|  14.6k|#if CONFIG_MULTITHREAD
 3255|  14.6k|      pthread_mutex_unlock(pbi->row_mt_mutex_);
 3256|  14.6k|#endif
 3257|       |      // decode tile
 3258|  14.6k|      parse_tile_row_mt(pbi, td, tile_data);
 3259|  14.6k|#if CONFIG_MULTITHREAD
 3260|  14.6k|      pthread_mutex_lock(pbi->row_mt_mutex_);
 3261|  14.6k|#endif
 3262|  14.6k|      tile_data->dec_row_mt_sync.num_threads_working--;
 3263|  14.6k|#if CONFIG_MULTITHREAD
 3264|  14.6k|      pthread_mutex_unlock(pbi->row_mt_mutex_);
 3265|  14.6k|#endif
 3266|  15.4k|    } else {
 3267|  15.4k|      break;
 3268|  15.4k|    }
 3269|  30.0k|  }
 3270|       |
 3271|  22.2k|  if (td->dcb.corrupted) {
  ------------------
  |  Branch (3271:7): [True: 6.81k, False: 15.4k]
  ------------------
 3272|  6.81k|    thread_data->error_info.setjmp = 0;
 3273|  6.81k|#if CONFIG_MULTITHREAD
 3274|  6.81k|    pthread_mutex_lock(pbi->row_mt_mutex_);
 3275|  6.81k|#endif
 3276|  6.81k|    frame_row_mt_info->row_mt_exit = 1;
 3277|  6.81k|#if CONFIG_MULTITHREAD
 3278|  6.81k|    pthread_cond_broadcast(pbi->row_mt_cond_);
 3279|  6.81k|    pthread_mutex_unlock(pbi->row_mt_mutex_);
 3280|  6.81k|#endif
 3281|  6.81k|    return 0;
 3282|  6.81k|  }
 3283|       |
 3284|  15.4k|  set_decode_func_pointers(td, 0x2);
 3285|       |
 3286|  36.7k|  while (1) {
  ------------------
  |  Branch (3286:10): [True: 36.7k, Folded]
  ------------------
 3287|  36.7k|    AV1DecRowMTJobInfo next_job_info;
 3288|  36.7k|    int end_of_frame = 0;
 3289|       |
 3290|  36.7k|#if CONFIG_MULTITHREAD
 3291|  36.7k|    pthread_mutex_lock(pbi->row_mt_mutex_);
 3292|  36.7k|#endif
 3293|  47.2k|    while (!get_next_job_info(pbi, &next_job_info, &end_of_frame)) {
  ------------------
  |  Branch (3293:12): [True: 10.5k, False: 36.7k]
  ------------------
 3294|  10.5k|#if CONFIG_MULTITHREAD
 3295|  10.5k|      pthread_cond_wait(pbi->row_mt_cond_, pbi->row_mt_mutex_);
 3296|  10.5k|#endif
 3297|  10.5k|    }
 3298|  36.7k|#if CONFIG_MULTITHREAD
 3299|  36.7k|    pthread_mutex_unlock(pbi->row_mt_mutex_);
 3300|  36.7k|#endif
 3301|       |
 3302|  36.7k|    if (end_of_frame) break;
  ------------------
  |  Branch (3302:9): [True: 15.4k, False: 21.2k]
  ------------------
 3303|       |
 3304|  21.2k|    int tile_row = next_job_info.tile_row;
 3305|  21.2k|    int tile_col = next_job_info.tile_col;
 3306|  21.2k|    int mi_row = next_job_info.mi_row;
 3307|       |
 3308|  21.2k|    TileDataDec *tile_data =
 3309|  21.2k|        pbi->tile_data + tile_row * cm->tiles.cols + tile_col;
 3310|  21.2k|    AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync;
 3311|       |
 3312|  21.2k|    av1_tile_init(&td->dcb.xd.tile, cm, tile_row, tile_col);
 3313|  21.2k|    av1_init_macroblockd(cm, &td->dcb.xd);
 3314|  21.2k|    td->dcb.xd.error_info = &thread_data->error_info;
 3315|       |
 3316|  21.2k|    decode_tile_sb_row(pbi, td, &tile_data->tile_info, mi_row);
 3317|       |
 3318|  21.2k|#if CONFIG_MULTITHREAD
 3319|  21.2k|    pthread_mutex_lock(pbi->row_mt_mutex_);
 3320|  21.2k|#endif
 3321|  21.2k|    dec_row_mt_sync->num_threads_working--;
 3322|  21.2k|#if CONFIG_MULTITHREAD
 3323|  21.2k|    pthread_mutex_unlock(pbi->row_mt_mutex_);
 3324|  21.2k|#endif
 3325|  21.2k|  }
 3326|  15.4k|  thread_data->error_info.setjmp = 0;
 3327|  15.4k|  return !td->dcb.corrupted;
 3328|  22.2k|}
decodeframe.c:signal_decoding_done_for_erroneous_row:
 2620|    712|    AV1Decoder *const pbi, const MACROBLOCKD *const xd) {
 2621|    712|  AV1_COMMON *const cm = &pbi->common;
 2622|    712|  const TileInfo *const tile = &xd->tile;
 2623|    712|  const int sb_row_in_tile =
 2624|    712|      ((xd->mi_row - tile->mi_row_start) >> cm->seq_params->mib_size_log2);
 2625|    712|  const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile);
 2626|    712|  TileDataDec *const tile_data =
 2627|    712|      pbi->tile_data + tile->tile_row * cm->tiles.cols + tile->tile_col;
 2628|    712|  AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync;
 2629|       |
 2630|    712|  sync_write(dec_row_mt_sync, sb_row_in_tile, sb_cols_in_tile - 1,
 2631|    712|             sb_cols_in_tile);
 2632|    712|}
decodeframe.c:sync_write:
 2590|  71.8k|                              int c, const int sb_cols) {
 2591|  71.8k|#if CONFIG_MULTITHREAD
 2592|  71.8k|  const int nsync = dec_row_mt_sync->sync_range;
 2593|  71.8k|  int cur;
 2594|  71.8k|  int sig = 1;
 2595|       |
 2596|  71.8k|  if (c < sb_cols - 1) {
  ------------------
  |  Branch (2596:7): [True: 49.8k, False: 21.9k]
  ------------------
 2597|  49.8k|    cur = c;
 2598|  49.8k|    if (c % nsync) sig = 0;
  ------------------
  |  Branch (2598:9): [True: 0, False: 49.8k]
  ------------------
 2599|  49.8k|  } else {
 2600|  21.9k|    cur = sb_cols + nsync + dec_row_mt_sync->intrabc_extra_top_right_sb_delay;
 2601|  21.9k|  }
 2602|       |
 2603|  71.8k|  if (sig) {
  ------------------
  |  Branch (2603:7): [True: 71.8k, False: 0]
  ------------------
 2604|  71.8k|    pthread_mutex_lock(&dec_row_mt_sync->mutex_[r]);
 2605|       |
 2606|  71.8k|    dec_row_mt_sync->cur_sb_col[r] = cur;
 2607|       |
 2608|  71.8k|    pthread_cond_signal(&dec_row_mt_sync->cond_[r]);
 2609|  71.8k|    pthread_mutex_unlock(&dec_row_mt_sync->mutex_[r]);
 2610|  71.8k|  }
 2611|       |#else
 2612|       |  (void)dec_row_mt_sync;
 2613|       |  (void)r;
 2614|       |  (void)c;
 2615|       |  (void)sb_cols;
 2616|       |#endif  // CONFIG_MULTITHREAD
 2617|  71.8k|}
decodeframe.c:set_decode_func_pointers:
 2698|  51.1k|                                            int parse_decode_flag) {
 2699|  51.1k|  td->read_coeffs_tx_intra_block_visit = decode_block_void;
 2700|  51.1k|  td->predict_and_recon_intra_block_visit = decode_block_void;
 2701|  51.1k|  td->read_coeffs_tx_inter_block_visit = decode_block_void;
 2702|  51.1k|  td->inverse_tx_inter_block_visit = decode_block_void;
 2703|  51.1k|  td->predict_inter_block_visit = predict_inter_block_void;
 2704|  51.1k|  td->cfl_store_inter_block_visit = cfl_store_inter_block_void;
 2705|       |
 2706|  51.1k|  if (parse_decode_flag & 0x1) {
  ------------------
  |  Branch (2706:7): [True: 35.6k, False: 15.4k]
  ------------------
 2707|  35.6k|    td->read_coeffs_tx_intra_block_visit = read_coeffs_tx_intra_block;
 2708|  35.6k|    td->read_coeffs_tx_inter_block_visit = av1_read_coeffs_txb;
 2709|  35.6k|  }
 2710|  51.1k|  if (parse_decode_flag & 0x2) {
  ------------------
  |  Branch (2710:7): [True: 28.1k, False: 22.9k]
  ------------------
 2711|  28.1k|    td->predict_and_recon_intra_block_visit =
 2712|  28.1k|        predict_and_reconstruct_intra_block;
 2713|  28.1k|    td->inverse_tx_inter_block_visit = inverse_transform_inter_block;
 2714|  28.1k|    td->predict_inter_block_visit = predict_inter_block;
 2715|  28.1k|    td->cfl_store_inter_block_visit = cfl_store_inter_block;
 2716|  28.1k|  }
 2717|  51.1k|}
decodeframe.c:decode_block_void:
  190|  6.93M|                                     const TX_SIZE tx_size) {
  191|  6.93M|  (void)cm;
  192|  6.93M|  (void)dcb;
  193|  6.93M|  (void)r;
  194|  6.93M|  (void)plane;
  195|  6.93M|  (void)row;
  196|  6.93M|  (void)col;
  197|  6.93M|  (void)tx_size;
  198|  6.93M|}
decodeframe.c:predict_inter_block_void:
  202|  42.3k|                                            BLOCK_SIZE bsize) {
  203|  42.3k|  (void)cm;
  204|  42.3k|  (void)dcb;
  205|  42.3k|  (void)bsize;
  206|  42.3k|}
decodeframe.c:cfl_store_inter_block_void:
  209|  42.3k|                                              MACROBLOCKD *const xd) {
  210|  42.3k|  (void)cm;
  211|  42.3k|  (void)xd;
  212|  42.3k|}
decodeframe.c:read_coeffs_tx_intra_block:
  169|  5.84M|    const int plane, const int row, const int col, const TX_SIZE tx_size) {
  170|  5.84M|  MB_MODE_INFO *mbmi = dcb->xd.mi[0];
  171|  5.84M|  if (!mbmi->skip_txfm) {
  ------------------
  |  Branch (171:7): [True: 4.43M, False: 1.40M]
  ------------------
  172|       |#if TXCOEFF_TIMER
  173|       |    struct aom_usec_timer timer;
  174|       |    aom_usec_timer_start(&timer);
  175|       |#endif
  176|  4.43M|    av1_read_coeffs_txb(cm, dcb, r, plane, row, col, tx_size);
  177|       |#if TXCOEFF_TIMER
  178|       |    aom_usec_timer_mark(&timer);
  179|       |    const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
  180|       |    cm->txcoeff_timer += elapsed_time;
  181|       |    ++cm->txb_count;
  182|       |#endif
  183|  4.43M|  }
  184|  5.84M|}
decodeframe.c:predict_and_reconstruct_intra_block:
  216|  4.52M|    const int plane, const int row, const int col, const TX_SIZE tx_size) {
  217|  4.52M|  (void)r;
  218|  4.52M|  MACROBLOCKD *const xd = &dcb->xd;
  219|  4.52M|  MB_MODE_INFO *mbmi = xd->mi[0];
  220|  4.52M|  PLANE_TYPE plane_type = get_plane_type(plane);
  221|       |
  222|  4.52M|  av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
  223|       |
  224|  4.52M|  if (!mbmi->skip_txfm) {
  ------------------
  |  Branch (224:7): [True: 3.45M, False: 1.07M]
  ------------------
  225|  3.45M|    eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
  226|  3.45M|    if (eob_data->eob) {
  ------------------
  |  Branch (226:9): [True: 1.92M, False: 1.52M]
  ------------------
  227|  1.92M|      const bool reduced_tx_set_used = cm->features.reduced_tx_set_used;
  228|       |      // tx_type was read out in av1_read_coeffs_txb.
  229|  1.92M|      const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size,
  230|  1.92M|                                              reduced_tx_set_used);
  231|  1.92M|      struct macroblockd_plane *const pd = &xd->plane[plane];
  232|  1.92M|      uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2];
  ------------------
  |  |   39|  1.92M|#define MI_SIZE_LOG2 2
  ------------------
  233|  1.92M|      inverse_transform_block(dcb, plane, tx_type, tx_size, dst, pd->dst.stride,
  234|  1.92M|                              reduced_tx_set_used);
  235|  1.92M|    }
  236|  3.45M|  }
  237|  4.52M|  if (plane == AOM_PLANE_Y && store_cfl_required(cm, xd)) {
  ------------------
  |  |  210|  9.05M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (237:7): [True: 2.40M, False: 2.12M]
  |  Branch (237:31): [True: 212k, False: 2.18M]
  ------------------
  238|   212k|    cfl_store_tx(xd, row, col, tx_size, mbmi->bsize);
  239|   212k|  }
  240|  4.52M|}
decodeframe.c:inverse_transform_block:
  157|  2.09M|                                           int stride, int reduced_tx_set) {
  158|  2.09M|  tran_low_t *const dqcoeff = dcb->dqcoeff_block[plane] + dcb->cb_offset[plane];
  159|  2.09M|  eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
  160|  2.09M|  uint16_t scan_line = eob_data->max_scan_line;
  161|  2.09M|  uint16_t eob = eob_data->eob;
  162|  2.09M|  av1_inverse_transform_block(&dcb->xd, dqcoeff, plane, tx_type, tx_size, dst,
  163|  2.09M|                              stride, eob, reduced_tx_set);
  164|  2.09M|  memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0]));
  165|  2.09M|}
decodeframe.c:inverse_transform_inter_block:
  245|   161k|    const TX_SIZE tx_size) {
  246|   161k|  (void)r;
  247|   161k|  MACROBLOCKD *const xd = &dcb->xd;
  248|   161k|  PLANE_TYPE plane_type = get_plane_type(plane);
  249|   161k|  const struct macroblockd_plane *const pd = &xd->plane[plane];
  250|   161k|  const bool reduced_tx_set_used = cm->features.reduced_tx_set_used;
  251|       |  // tx_type was read out in av1_read_coeffs_txb.
  252|   161k|  const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col,
  253|   161k|                                          tx_size, reduced_tx_set_used);
  254|       |
  255|   161k|  uint8_t *dst =
  256|   161k|      &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2];
  ------------------
  |  |   39|   161k|#define MI_SIZE_LOG2 2
  ------------------
  257|   161k|  inverse_transform_block(dcb, plane, tx_type, tx_size, dst, pd->dst.stride,
  258|   161k|                          reduced_tx_set_used);
  259|       |#if CONFIG_MISMATCH_DEBUG
  260|       |  int pixel_c, pixel_r;
  261|       |  BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
  262|       |  int blk_w = block_size_wide[bsize];
  263|       |  int blk_h = block_size_high[bsize];
  264|       |  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
  265|       |  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
  266|       |  mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row,
  267|       |                  pd->subsampling_x, pd->subsampling_y);
  268|       |  mismatch_check_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint,
  269|       |                          plane, pixel_c, pixel_r, blk_w, blk_h,
  270|       |                          xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
  271|       |#endif
  272|   161k|}
decodeframe.c:predict_inter_block:
  848|  51.3k|                                       BLOCK_SIZE bsize) {
  849|  51.3k|  MACROBLOCKD *const xd = &dcb->xd;
  850|  51.3k|  MB_MODE_INFO *mbmi = xd->mi[0];
  851|  51.3k|  const int num_planes = av1_num_planes(cm);
  852|  51.3k|  const int mi_row = xd->mi_row;
  853|  51.3k|  const int mi_col = xd->mi_col;
  854|   108k|  for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
  ------------------
  |  Branch (854:21): [True: 57.4k, False: 51.3k]
  ------------------
  855|  57.4k|    const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
  856|  57.4k|    if (frame < LAST_FRAME) {
  ------------------
  |  Branch (856:9): [True: 6.18k, False: 51.2k]
  ------------------
  857|  6.18k|      assert(is_intrabc_block(mbmi));
  858|  6.18k|      assert(frame == INTRA_FRAME);
  859|  6.18k|      assert(ref == 0);
  860|  51.2k|    } else {
  861|  51.2k|      const RefCntBuffer *ref_buf = get_ref_frame_buf(cm, frame);
  862|  51.2k|      const struct scale_factors *ref_scale_factors =
  863|  51.2k|          get_ref_scale_factors_const(cm, frame);
  864|       |
  865|  51.2k|      xd->block_ref_scale_factors[ref] = ref_scale_factors;
  866|  51.2k|      av1_setup_pre_planes(xd, ref, &ref_buf->buf, mi_row, mi_col,
  867|  51.2k|                           ref_scale_factors, num_planes);
  868|  51.2k|    }
  869|  57.4k|  }
  870|       |
  871|  51.3k|  dec_build_inter_predictor(cm, dcb, mi_row, mi_col, bsize);
  872|  51.3k|  if (mbmi->motion_mode == OBMC_CAUSAL) {
  ------------------
  |  Branch (872:7): [True: 7.49k, False: 43.8k]
  ------------------
  873|  7.49k|    dec_build_obmc_inter_predictors_sb(cm, dcb);
  874|  7.49k|  }
  875|       |#if CONFIG_MISMATCH_DEBUG
  876|       |  for (int plane = 0; plane < num_planes; ++plane) {
  877|       |    const struct macroblockd_plane *pd = &xd->plane[plane];
  878|       |    int pixel_c, pixel_r;
  879|       |    mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, pd->subsampling_x,
  880|       |                    pd->subsampling_y);
  881|       |    if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
  882|       |                             pd->subsampling_y))
  883|       |      continue;
  884|       |    mismatch_check_block_pre(pd->dst.buf, pd->dst.stride,
  885|       |                             cm->current_frame.order_hint, plane, pixel_c,
  886|       |                             pixel_r, pd->width, pd->height,
  887|       |                             xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
  888|       |  }
  889|       |#endif
  890|  51.3k|}
decodeframe.c:dec_build_inter_predictor:
  678|  51.3k|                                             BLOCK_SIZE bsize) {
  679|  51.3k|  MACROBLOCKD *const xd = &dcb->xd;
  680|  51.3k|  const int num_planes = av1_num_planes(cm);
  681|   182k|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (681:23): [True: 137k, False: 44.8k]
  ------------------
  682|   137k|    if (plane && !xd->is_chroma_ref) break;
  ------------------
  |  Branch (682:9): [True: 86.5k, False: 51.3k]
  |  Branch (682:18): [True: 6.53k, False: 80.0k]
  ------------------
  683|   131k|    const int mi_x = mi_col * MI_SIZE;
  ------------------
  |  |   40|   131k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   131k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  684|   131k|    const int mi_y = mi_row * MI_SIZE;
  ------------------
  |  |   40|   131k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   131k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  685|   131k|    dec_build_inter_predictors(cm, dcb, plane, xd->mi[0], 0,
  686|   131k|                               xd->plane[plane].width, xd->plane[plane].height,
  687|   131k|                               mi_x, mi_y);
  688|   131k|    if (is_interintra_pred(xd->mi[0])) {
  ------------------
  |  Branch (688:9): [True: 4.89k, False: 126k]
  ------------------
  689|  4.89k|      BUFFER_SET ctx = { { xd->plane[0].dst.buf, xd->plane[1].dst.buf,
  690|  4.89k|                           xd->plane[2].dst.buf },
  691|  4.89k|                         { xd->plane[0].dst.stride, xd->plane[1].dst.stride,
  692|  4.89k|                           xd->plane[2].dst.stride } };
  693|  4.89k|      av1_build_interintra_predictor(cm, xd, xd->plane[plane].dst.buf,
  694|  4.89k|                                     xd->plane[plane].dst.stride, &ctx, plane,
  695|  4.89k|                                     bsize);
  696|  4.89k|    }
  697|   131k|  }
  698|  51.3k|}
decodeframe.c:dec_build_inter_predictors:
  670|   163k|                                       int mi_x, int mi_y) {
  671|   163k|  build_inter_predictors(cm, &dcb->xd, plane, mi, build_for_obmc, bw, bh, mi_x,
  672|   163k|                         mi_y, dcb->mc_buf);
  673|   163k|}
decodeframe.c:dec_calc_subpel_params_and_extend:
  648|   193k|    uint8_t **pre, SubpelParams *subpel_params, int *src_stride) {
  649|   193k|  PadBlock block;
  650|   193k|  MV32 scaled_mv;
  651|   193k|  int subpel_x_mv, subpel_y_mv;
  652|   193k|  dec_calc_subpel_params(src_mv, inter_pred_params, xd, mi_x, mi_y, pre,
  653|   193k|                         subpel_params, src_stride, &block, &scaled_mv,
  654|   193k|                         &subpel_x_mv, &subpel_y_mv);
  655|   193k|  extend_mc_border(
  656|   193k|      inter_pred_params->scale_factors, &inter_pred_params->ref_frame_buf,
  657|   193k|      scaled_mv, block, subpel_x_mv, subpel_y_mv,
  658|   193k|      inter_pred_params->mode == WARP_PRED, inter_pred_params->is_intrabc,
  659|   193k|      inter_pred_params->use_hbd_buf, mc_buf[ref], pre, src_stride);
  660|   193k|}
decodeframe.c:dec_calc_subpel_params:
  563|   193k|    MV32 *scaled_mv, int *subpel_x_mv, int *subpel_y_mv) {
  564|   193k|  const struct scale_factors *sf = inter_pred_params->scale_factors;
  565|   193k|  struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf;
  566|   193k|  const int bw = inter_pred_params->block_width;
  567|   193k|  const int bh = inter_pred_params->block_height;
  568|   193k|  const int is_scaled = av1_is_scaled(sf);
  569|   193k|  if (is_scaled) {
  ------------------
  |  Branch (569:7): [True: 24, False: 193k]
  ------------------
  570|     24|    int ssx = inter_pred_params->subsampling_x;
  571|     24|    int ssy = inter_pred_params->subsampling_y;
  572|     24|    int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS;
  ------------------
  |  |   23|     24|#define SUBPEL_BITS 4
  ------------------
  573|     24|    orig_pos_y += src_mv->row * (1 << (1 - ssy));
  574|     24|    int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
  ------------------
  |  |   23|     24|#define SUBPEL_BITS 4
  ------------------
  575|     24|    orig_pos_x += src_mv->col * (1 << (1 - ssx));
  576|     24|    int pos_y = av1_scaled_y(orig_pos_y, sf);
  577|     24|    int pos_x = av1_scaled_x(orig_pos_x, sf);
  578|     24|    pos_x += SCALE_EXTRA_OFF;
  ------------------
  |  |   32|     24|#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)
  |  |  ------------------
  |  |  |  |   31|     24|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|     24|#define SUBPEL_BITS 4
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  579|     24|    pos_y += SCALE_EXTRA_OFF;
  ------------------
  |  |   32|     24|#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)
  |  |  ------------------
  |  |  |  |   31|     24|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|     24|#define SUBPEL_BITS 4
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  580|       |
  581|     24|    const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
  ------------------
  |  |   32|     24|  (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   30|     24|  ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
  |  |  |  |  ------------------
  |  |  |  |  |  |   32|     24|#define AOM_BORDER_IN_PIXELS 288
  |  |  |  |  ------------------
  |  |  |  |                 ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|     24|#define AOM_INTERP_EXTEND 4
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  ------------------
  582|     24|    const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
  ------------------
  |  |   32|     24|  (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   30|     24|  ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
  |  |  |  |  ------------------
  |  |  |  |  |  |   32|     24|#define AOM_BORDER_IN_PIXELS 288
  |  |  |  |  ------------------
  |  |  |  |                 ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|     24|#define AOM_INTERP_EXTEND 4
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  ------------------
  583|     24|    const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
  ------------------
  |  |   31|     24|#define AOM_INTERP_EXTEND 4
  ------------------
  584|     24|                       << SCALE_SUBPEL_BITS;
  ------------------
  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  ------------------
  585|     24|    const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
  ------------------
  |  |   31|     24|#define AOM_INTERP_EXTEND 4
  ------------------
                  const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
  ------------------
  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  ------------------
  586|     24|    pos_y = clamp(pos_y, top, bottom);
  587|     24|    pos_x = clamp(pos_x, left, right);
  588|       |
  589|     24|    subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
  ------------------
  |  |   30|     24|#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
  |  |  ------------------
  |  |  |  |   29|     24|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  590|     24|    subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
  ------------------
  |  |   30|     24|#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
  |  |  ------------------
  |  |  |  |   29|     24|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  591|     24|    subpel_params->xs = sf->x_step_q4;
  592|     24|    subpel_params->ys = sf->y_step_q4;
  593|       |
  594|       |    // Get reference block top left coordinate.
  595|     24|    block->x0 = pos_x >> SCALE_SUBPEL_BITS;
  ------------------
  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  ------------------
  596|     24|    block->y0 = pos_y >> SCALE_SUBPEL_BITS;
  ------------------
  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  ------------------
  597|       |
  598|       |    // Get reference block bottom right coordinate.
  599|     24|    block->x1 =
  600|     24|        ((pos_x + (bw - 1) * subpel_params->xs) >> SCALE_SUBPEL_BITS) + 1;
  ------------------
  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  ------------------
  601|     24|    block->y1 =
  602|     24|        ((pos_y + (bh - 1) * subpel_params->ys) >> SCALE_SUBPEL_BITS) + 1;
  ------------------
  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  ------------------
  603|       |
  604|     24|    MV temp_mv;
  605|     24|    temp_mv = clamp_mv_to_umv_border_sb(xd, src_mv, bw, bh,
  606|     24|                                        inter_pred_params->subsampling_x,
  607|     24|                                        inter_pred_params->subsampling_y);
  608|     24|    *scaled_mv = av1_scale_mv(&temp_mv, mi_x, mi_y, sf);
  609|     24|    scaled_mv->row += SCALE_EXTRA_OFF;
  ------------------
  |  |   32|     24|#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)
  |  |  ------------------
  |  |  |  |   31|     24|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|     24|#define SUBPEL_BITS 4
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  610|     24|    scaled_mv->col += SCALE_EXTRA_OFF;
  ------------------
  |  |   32|     24|#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)
  |  |  ------------------
  |  |  |  |   31|     24|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|     24|#define SUBPEL_BITS 4
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  611|       |
  612|     24|    *subpel_x_mv = scaled_mv->col & SCALE_SUBPEL_MASK;
  ------------------
  |  |   30|     24|#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
  |  |  ------------------
  |  |  |  |   29|     24|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  613|     24|    *subpel_y_mv = scaled_mv->row & SCALE_SUBPEL_MASK;
  ------------------
  |  |   30|     24|#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
  |  |  ------------------
  |  |  |  |   29|     24|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|     24|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  614|   193k|  } else {
  615|       |    // Get block position in current frame.
  616|   193k|    int pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
  ------------------
  |  |   23|   193k|#define SUBPEL_BITS 4
  ------------------
  617|   193k|    int pos_y = inter_pred_params->pix_row << SUBPEL_BITS;
  ------------------
  |  |   23|   193k|#define SUBPEL_BITS 4
  ------------------
  618|       |
  619|   193k|    const MV mv_q4 = clamp_mv_to_umv_border_sb(
  620|   193k|        xd, src_mv, bw, bh, inter_pred_params->subsampling_x,
  621|   193k|        inter_pred_params->subsampling_y);
  622|   193k|    subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
  ------------------
  |  |   29|   193k|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|   193k|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  ------------------
  623|   193k|    subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
  ------------------
  |  |   24|   193k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|   193k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
                  subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
  ------------------
  |  |   31|   193k|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|   193k|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|   193k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  624|   193k|    subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
  ------------------
  |  |   24|   193k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|   193k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
                  subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
  ------------------
  |  |   31|   193k|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|   193k|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|   193k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  625|       |
  626|       |    // Get reference block top left coordinate.
  627|   193k|    pos_x += mv_q4.col;
  628|   193k|    pos_y += mv_q4.row;
  629|   193k|    block->x0 = pos_x >> SUBPEL_BITS;
  ------------------
  |  |   23|   193k|#define SUBPEL_BITS 4
  ------------------
  630|   193k|    block->y0 = pos_y >> SUBPEL_BITS;
  ------------------
  |  |   23|   193k|#define SUBPEL_BITS 4
  ------------------
  631|       |
  632|       |    // Get reference block bottom right coordinate.
  633|   193k|    block->x1 = (pos_x >> SUBPEL_BITS) + (bw - 1) + 1;
  ------------------
  |  |   23|   193k|#define SUBPEL_BITS 4
  ------------------
  634|   193k|    block->y1 = (pos_y >> SUBPEL_BITS) + (bh - 1) + 1;
  ------------------
  |  |   23|   193k|#define SUBPEL_BITS 4
  ------------------
  635|       |
  636|   193k|    scaled_mv->row = mv_q4.row;
  637|   193k|    scaled_mv->col = mv_q4.col;
  638|   193k|    *subpel_x_mv = scaled_mv->col & SUBPEL_MASK;
  ------------------
  |  |   24|   193k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|   193k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  639|   193k|    *subpel_y_mv = scaled_mv->row & SUBPEL_MASK;
  ------------------
  |  |   24|   193k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|   193k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  640|   193k|  }
  641|   193k|  *pre = pre_buf->buf0 + block->y0 * pre_buf->stride + block->x0;
  642|   193k|  *src_stride = pre_buf->stride;
  643|   193k|}
decodeframe.c:extend_mc_border:
  526|   193k|                                    int *src_stride) {
  527|   193k|  int x_pad = 0, y_pad = 0;
  528|   193k|  if (update_extend_mc_border_params(sf, pre_buf, scaled_mv, &block,
  ------------------
  |  Branch (528:7): [True: 60.9k, False: 133k]
  ------------------
  529|   193k|                                     subpel_x_mv, subpel_y_mv, do_warp,
  530|   193k|                                     is_intrabc, &x_pad, &y_pad)) {
  531|       |    // Get reference block pointer.
  532|  60.9k|    const uint8_t *const buf_ptr =
  533|  60.9k|        pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
  534|  60.9k|    int buf_stride = pre_buf->stride;
  535|  60.9k|    const int b_w = block.x1 - block.x0;
  536|  60.9k|    const int b_h = block.y1 - block.y0;
  537|       |
  538|  60.9k|#if CONFIG_AV1_HIGHBITDEPTH
  539|       |    // Extend the border.
  540|  60.9k|    if (highbd) {
  ------------------
  |  Branch (540:9): [True: 10.1k, False: 50.7k]
  ------------------
  541|  10.1k|      highbd_build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0,
  542|  10.1k|                             block.y0, b_w, b_h, pre_buf->width,
  543|  10.1k|                             pre_buf->height);
  544|  50.7k|    } else {
  545|  50.7k|      build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w,
  546|  50.7k|                      b_h, pre_buf->width, pre_buf->height);
  547|  50.7k|    }
  548|       |#else
  549|       |    (void)highbd;
  550|       |    build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w,
  551|       |                    b_h, pre_buf->width, pre_buf->height);
  552|       |#endif
  553|  60.9k|    *src_stride = b_w;
  554|  60.9k|    *pre = mc_buf + y_pad * (AOM_INTERP_EXTEND - 1) * b_w +
  ------------------
  |  |   31|  60.9k|#define AOM_INTERP_EXTEND 4
  ------------------
  555|  60.9k|           x_pad * (AOM_INTERP_EXTEND - 1);
  ------------------
  |  |   31|  60.9k|#define AOM_INTERP_EXTEND 4
  ------------------
  556|  60.9k|  }
  557|   193k|}
decodeframe.c:update_extend_mc_border_params:
  488|   193k|    int do_warp, int is_intrabc, int *x_pad, int *y_pad) {
  489|   193k|  const int is_scaled = av1_is_scaled(sf);
  490|       |  // Get reference width and height.
  491|   193k|  int frame_width = pre_buf->width;
  492|   193k|  int frame_height = pre_buf->height;
  493|       |
  494|       |  // Do border extension if there is motion or
  495|       |  // width/height is not a multiple of 8 pixels.
  496|   193k|  if ((!is_intrabc) && (!do_warp) &&
  ------------------
  |  Branch (496:7): [True: 180k, False: 13.1k]
  |  Branch (496:24): [True: 176k, False: 4.17k]
  ------------------
  497|   176k|      (is_scaled || scaled_mv.col || scaled_mv.row || (frame_width & 0x7) ||
  ------------------
  |  Branch (497:8): [True: 25, False: 176k]
  |  Branch (497:21): [True: 103k, False: 72.6k]
  |  Branch (497:38): [True: 17.0k, False: 55.6k]
  |  Branch (497:55): [True: 54.5k, False: 1.11k]
  ------------------
  498|   175k|       (frame_height & 0x7))) {
  ------------------
  |  Branch (498:8): [True: 185, False: 928]
  ------------------
  499|   175k|    if (subpel_x_mv || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
  ------------------
  |  |   25|  81.6k|#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  81.6k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  |  Branch (499:9): [True: 94.1k, False: 81.6k]
  |  Branch (499:24): [True: 81.6k, False: 0]
  ------------------
  500|   175k|      block->x0 -= AOM_INTERP_EXTEND - 1;
  ------------------
  |  |   31|   175k|#define AOM_INTERP_EXTEND 4
  ------------------
  501|   175k|      block->x1 += AOM_INTERP_EXTEND;
  ------------------
  |  |   31|   175k|#define AOM_INTERP_EXTEND 4
  ------------------
  502|   175k|      *x_pad = 1;
  503|   175k|    }
  504|       |
  505|   175k|    if (subpel_y_mv || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
  ------------------
  |  |   25|  84.7k|#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  84.7k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  |  Branch (505:9): [True: 90.9k, False: 84.7k]
  |  Branch (505:24): [True: 84.7k, False: 0]
  ------------------
  506|   175k|      block->y0 -= AOM_INTERP_EXTEND - 1;
  ------------------
  |  |   31|   175k|#define AOM_INTERP_EXTEND 4
  ------------------
  507|   175k|      block->y1 += AOM_INTERP_EXTEND;
  ------------------
  |  |   31|   175k|#define AOM_INTERP_EXTEND 4
  ------------------
  508|   175k|      *y_pad = 1;
  509|   175k|    }
  510|       |
  511|       |    // Skip border extension if block is inside the frame.
  512|   175k|    if (block->x0 < 0 || block->x1 > frame_width - 1 || block->y0 < 0 ||
  ------------------
  |  Branch (512:9): [True: 23.1k, False: 152k]
  |  Branch (512:26): [True: 14.9k, False: 137k]
  |  Branch (512:57): [True: 18.4k, False: 119k]
  ------------------
  513|   119k|        block->y1 > frame_height - 1) {
  ------------------
  |  Branch (513:9): [True: 4.35k, False: 114k]
  ------------------
  514|  60.9k|      return 1;
  515|  60.9k|    }
  516|   175k|  }
  517|   133k|  return 0;
  518|   193k|}
decodeframe.c:highbd_build_mc_border:
  412|  10.1k|                                          int h) {
  413|       |  // Get a pointer to the start of the real data for this row.
  414|  10.1k|  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  ------------------
  |  |   75|  10.1k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  415|  10.1k|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  ------------------
  |  |   75|  10.1k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  416|  10.1k|  const uint16_t *ref_row = src - x - y * src_stride;
  417|       |
  418|  10.1k|  if (y >= h)
  ------------------
  |  Branch (418:7): [True: 38, False: 10.1k]
  ------------------
  419|     38|    ref_row += (h - 1) * src_stride;
  420|  10.1k|  else if (y > 0)
  ------------------
  |  Branch (420:12): [True: 4.83k, False: 5.29k]
  ------------------
  421|  4.83k|    ref_row += y * src_stride;
  422|       |
  423|   240k|  do {
  424|   240k|    int right = 0, copy;
  425|   240k|    int left = x < 0 ? -x : 0;
  ------------------
  |  Branch (425:16): [True: 97.6k, False: 142k]
  ------------------
  426|       |
  427|   240k|    if (left > b_w) left = b_w;
  ------------------
  |  Branch (427:9): [True: 72, False: 240k]
  ------------------
  428|       |
  429|   240k|    if (x + b_w > w) right = x + b_w - w;
  ------------------
  |  Branch (429:9): [True: 65.1k, False: 175k]
  ------------------
  430|       |
  431|   240k|    if (right > b_w) right = b_w;
  ------------------
  |  Branch (431:9): [True: 1.40k, False: 238k]
  ------------------
  432|       |
  433|   240k|    copy = b_w - left - right;
  434|       |
  435|   240k|    if (left) aom_memset16(dst, ref_row[0], left);
  ------------------
  |  Branch (435:9): [True: 97.6k, False: 142k]
  ------------------
  436|       |
  437|   240k|    if (copy) memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
  ------------------
  |  Branch (437:9): [True: 237k, False: 3.02k]
  ------------------
  438|       |
  439|   240k|    if (right) aom_memset16(dst + left + copy, ref_row[w - 1], right);
  ------------------
  |  Branch (439:9): [True: 65.1k, False: 175k]
  ------------------
  440|       |
  441|   240k|    dst += dst_stride;
  442|   240k|    ++y;
  443|       |
  444|   240k|    if (y > 0 && y < h) ref_row += src_stride;
  ------------------
  |  Branch (444:9): [True: 222k, False: 18.2k]
  |  Branch (444:18): [True: 215k, False: 6.62k]
  ------------------
  445|   240k|  } while (--b_h);
  ------------------
  |  Branch (445:12): [True: 230k, False: 10.1k]
  ------------------
  446|  10.1k|}
decodeframe.c:build_mc_border:
  451|  50.7k|                                   int b_w, int b_h, int w, int h) {
  452|       |  // Get a pointer to the start of the real data for this row.
  453|  50.7k|  const uint8_t *ref_row = src - x - y * src_stride;
  454|       |
  455|  50.7k|  if (y >= h)
  ------------------
  |  Branch (455:7): [True: 505, False: 50.2k]
  ------------------
  456|    505|    ref_row += (h - 1) * src_stride;
  457|  50.2k|  else if (y > 0)
  ------------------
  |  Branch (457:12): [True: 21.2k, False: 29.0k]
  ------------------
  458|  21.2k|    ref_row += y * src_stride;
  459|       |
  460|  1.10M|  do {
  461|  1.10M|    int right = 0, copy;
  462|  1.10M|    int left = x < 0 ? -x : 0;
  ------------------
  |  Branch (462:16): [True: 467k, False: 642k]
  ------------------
  463|       |
  464|  1.10M|    if (left > b_w) left = b_w;
  ------------------
  |  Branch (464:9): [True: 120, False: 1.10M]
  ------------------
  465|       |
  466|  1.10M|    if (x + b_w > w) right = x + b_w - w;
  ------------------
  |  Branch (466:9): [True: 248k, False: 860k]
  ------------------
  467|       |
  468|  1.10M|    if (right > b_w) right = b_w;
  ------------------
  |  Branch (468:9): [True: 8.64k, False: 1.10M]
  ------------------
  469|       |
  470|  1.10M|    copy = b_w - left - right;
  471|       |
  472|  1.10M|    if (left) memset(dst, ref_row[0], left);
  ------------------
  |  Branch (472:9): [True: 467k, False: 642k]
  ------------------
  473|       |
  474|  1.10M|    if (copy) memcpy(dst + left, ref_row + x + left, copy);
  ------------------
  |  Branch (474:9): [True: 1.08M, False: 19.3k]
  ------------------
  475|       |
  476|  1.10M|    if (right) memset(dst + left + copy, ref_row[w - 1], right);
  ------------------
  |  Branch (476:9): [True: 248k, False: 860k]
  ------------------
  477|       |
  478|  1.10M|    dst += dst_stride;
  479|  1.10M|    ++y;
  480|       |
  481|  1.10M|    if (y > 0 && y < h) ref_row += src_stride;
  ------------------
  |  Branch (481:9): [True: 1.00M, False: 102k]
  |  Branch (481:18): [True: 899k, False: 107k]
  ------------------
  482|  1.10M|  } while (--b_h);
  ------------------
  |  Branch (482:12): [True: 1.05M, False: 50.7k]
  ------------------
  483|  50.7k|}
decodeframe.c:dec_build_obmc_inter_predictors_sb:
  813|  7.49k|                                                      DecoderCodingBlock *dcb) {
  814|  7.49k|  const int num_planes = av1_num_planes(cm);
  815|  7.49k|  uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
  816|  7.49k|  int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  817|  7.49k|  int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  818|  7.49k|  int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  819|  7.49k|  int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  820|  7.49k|  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  821|  7.49k|  int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|  7.49k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  7.49k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  822|       |
  823|  7.49k|  MACROBLOCKD *const xd = &dcb->xd;
  824|  7.49k|  av1_setup_obmc_dst_bufs(xd, dst_buf1, dst_buf2);
  825|       |
  826|  7.49k|  dec_build_prediction_by_above_preds(cm, dcb, dst_buf1, dst_width1,
  827|  7.49k|                                      dst_height1, dst_stride1);
  828|  7.49k|  dec_build_prediction_by_left_preds(cm, dcb, dst_buf2, dst_width2, dst_height2,
  829|  7.49k|                                     dst_stride2);
  830|  7.49k|  const int mi_row = xd->mi_row;
  831|  7.49k|  const int mi_col = xd->mi_col;
  832|  7.49k|  av1_setup_dst_planes(xd->plane, xd->mi[0]->bsize, &cm->cur_frame->buf, mi_row,
  833|  7.49k|                       mi_col, 0, num_planes);
  834|  7.49k|  av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2,
  835|  7.49k|                                  dst_stride2);
  836|  7.49k|}
decodeframe.c:dec_build_prediction_by_above_preds:
  733|  7.49k|    int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
  734|  7.49k|  MACROBLOCKD *const xd = &dcb->xd;
  735|  7.49k|  if (!xd->up_available) return;
  ------------------
  |  Branch (735:7): [True: 1.13k, False: 6.36k]
  ------------------
  736|       |
  737|       |  // Adjust mb_to_bottom_edge to have the correct value for the OBMC
  738|       |  // prediction block. This is half the height of the original block,
  739|       |  // except for 128-wide blocks, where we only use a height of 32.
  740|  6.36k|  const int this_height = xd->height * MI_SIZE;
  ------------------
  |  |   40|  6.36k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  6.36k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  741|  6.36k|  const int pred_height = AOMMIN(this_height / 2, 32);
  ------------------
  |  |   34|  6.36k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 6.34k, False: 24]
  |  |  ------------------
  ------------------
  742|  6.36k|  xd->mb_to_bottom_edge += GET_MV_SUBPEL(this_height - pred_height);
  ------------------
  |  |   29|  6.36k|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
  743|  6.36k|  struct build_prediction_ctxt ctxt = {
  744|  6.36k|    cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_right_edge, dcb
  745|  6.36k|  };
  746|  6.36k|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
  747|  6.36k|  foreach_overlappable_nb_above(cm, xd,
  748|  6.36k|                                max_neighbor_obmc[mi_size_wide_log2[bsize]],
  749|  6.36k|                                dec_build_prediction_by_above_pred, &ctxt);
  750|       |
  751|  6.36k|  xd->mb_to_left_edge = -GET_MV_SUBPEL(xd->mi_col * MI_SIZE);
  ------------------
  |  |   29|  6.36k|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
  752|  6.36k|  xd->mb_to_right_edge = ctxt.mb_to_far_edge;
  753|  6.36k|  xd->mb_to_bottom_edge -= GET_MV_SUBPEL(this_height - pred_height);
  ------------------
  |  |   29|  6.36k|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
  754|  6.36k|}
decodeframe.c:dec_build_prediction_by_above_pred:
  702|  6.80k|    int dir, MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
  703|  6.80k|  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
  704|  6.80k|  const int above_mi_col = xd->mi_col + rel_mi_col;
  705|  6.80k|  int mi_x, mi_y;
  706|  6.80k|  MB_MODE_INFO backup_mbmi = *above_mbmi;
  707|       |
  708|  6.80k|  (void)rel_mi_row;
  709|  6.80k|  (void)dir;
  710|       |
  711|  6.80k|  av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, op_mi_size,
  712|  6.80k|                                           &backup_mbmi, ctxt, num_planes);
  713|  6.80k|  mi_x = above_mi_col << MI_SIZE_LOG2;
  ------------------
  |  |   39|  6.80k|#define MI_SIZE_LOG2 2
  ------------------
  714|  6.80k|  mi_y = xd->mi_row << MI_SIZE_LOG2;
  ------------------
  |  |   39|  6.80k|#define MI_SIZE_LOG2 2
  ------------------
  715|       |
  716|  6.80k|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
  717|       |
  718|  27.0k|  for (int j = 0; j < num_planes; ++j) {
  ------------------
  |  Branch (718:19): [True: 20.2k, False: 6.80k]
  ------------------
  719|  20.2k|    const struct macroblockd_plane *pd = &xd->plane[j];
  720|  20.2k|    int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x;
  ------------------
  |  |   40|  20.2k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  20.2k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  721|  20.2k|    int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
  722|  20.2k|                   block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
  723|       |
  724|  20.2k|    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
  ------------------
  |  Branch (724:9): [True: 9.26k, False: 10.9k]
  ------------------
  725|  10.9k|    dec_build_inter_predictors(ctxt->cm, (DecoderCodingBlock *)ctxt->dcb, j,
  726|  10.9k|                               &backup_mbmi, 1, bw, bh, mi_x, mi_y);
  727|  10.9k|  }
  728|  6.80k|}
decodeframe.c:dec_build_prediction_by_left_preds:
  788|  7.49k|    int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
  789|  7.49k|  MACROBLOCKD *const xd = &dcb->xd;
  790|  7.49k|  if (!xd->left_available) return;
  ------------------
  |  Branch (790:7): [True: 662, False: 6.83k]
  ------------------
  791|       |
  792|       |  // Adjust mb_to_right_edge to have the correct value for the OBMC
  793|       |  // prediction block. This is half the width of the original block,
  794|       |  // except for 128-wide blocks, where we only use a width of 32.
  795|  6.83k|  const int this_width = xd->width * MI_SIZE;
  ------------------
  |  |   40|  6.83k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  6.83k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  796|  6.83k|  const int pred_width = AOMMIN(this_width / 2, 32);
  ------------------
  |  |   34|  6.83k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 6.77k, False: 61]
  |  |  ------------------
  ------------------
  797|  6.83k|  xd->mb_to_right_edge += GET_MV_SUBPEL(this_width - pred_width);
  ------------------
  |  |   29|  6.83k|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
  798|       |
  799|  6.83k|  struct build_prediction_ctxt ctxt = {
  800|  6.83k|    cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_bottom_edge, dcb
  801|  6.83k|  };
  802|  6.83k|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
  803|  6.83k|  foreach_overlappable_nb_left(cm, xd,
  804|  6.83k|                               max_neighbor_obmc[mi_size_high_log2[bsize]],
  805|  6.83k|                               dec_build_prediction_by_left_pred, &ctxt);
  806|       |
  807|  6.83k|  xd->mb_to_top_edge = -GET_MV_SUBPEL(xd->mi_row * MI_SIZE);
  ------------------
  |  |   29|  6.83k|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
  808|  6.83k|  xd->mb_to_right_edge -= GET_MV_SUBPEL(this_width - pred_width);
  ------------------
  |  |   29|  6.83k|#define GET_MV_SUBPEL(x) ((x) * 8)
  ------------------
  809|  6.83k|  xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
  810|  6.83k|}
decodeframe.c:dec_build_prediction_by_left_pred:
  758|  7.21k|    int dir, MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
  759|  7.21k|  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
  760|  7.21k|  const int left_mi_row = xd->mi_row + rel_mi_row;
  761|  7.21k|  int mi_x, mi_y;
  762|  7.21k|  MB_MODE_INFO backup_mbmi = *left_mbmi;
  763|       |
  764|  7.21k|  (void)rel_mi_col;
  765|  7.21k|  (void)dir;
  766|       |
  767|  7.21k|  av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, op_mi_size,
  768|  7.21k|                                          &backup_mbmi, ctxt, num_planes);
  769|  7.21k|  mi_x = xd->mi_col << MI_SIZE_LOG2;
  ------------------
  |  |   39|  7.21k|#define MI_SIZE_LOG2 2
  ------------------
  770|  7.21k|  mi_y = left_mi_row << MI_SIZE_LOG2;
  ------------------
  |  |   39|  7.21k|#define MI_SIZE_LOG2 2
  ------------------
  771|  7.21k|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
  772|       |
  773|  28.5k|  for (int j = 0; j < num_planes; ++j) {
  ------------------
  |  Branch (773:19): [True: 21.3k, False: 7.21k]
  ------------------
  774|  21.3k|    const struct macroblockd_plane *pd = &xd->plane[j];
  775|  21.3k|    int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
  776|  21.3k|                   block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
  777|  21.3k|    int bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y;
  ------------------
  |  |   39|  21.3k|#define MI_SIZE_LOG2 2
  ------------------
  778|       |
  779|  21.3k|    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
  ------------------
  |  Branch (779:9): [True: 0, False: 21.3k]
  ------------------
  780|  21.3k|    dec_build_inter_predictors(ctxt->cm, (DecoderCodingBlock *)ctxt->dcb, j,
  781|  21.3k|                               &backup_mbmi, 1, bw, bh, mi_x, mi_y);
  782|  21.3k|  }
  783|  7.21k|}
decodeframe.c:cfl_store_inter_block:
  839|  51.3k|                                         MACROBLOCKD *const xd) {
  840|  51.3k|  MB_MODE_INFO *mbmi = xd->mi[0];
  841|  51.3k|  if (store_cfl_required(cm, xd)) {
  ------------------
  |  Branch (841:7): [True: 6.53k, False: 44.8k]
  ------------------
  842|  6.53k|    cfl_store_block(xd, mbmi->bsize, mbmi->tx_size);
  843|  6.53k|  }
  844|  51.3k|}
decodeframe.c:get_dec_job_info:
 2908|  30.0k|static TileJobsDec *get_dec_job_info(AV1DecTileMT *tile_mt_info) {
 2909|  30.0k|  TileJobsDec *cur_job_info = NULL;
 2910|  30.0k|#if CONFIG_MULTITHREAD
 2911|  30.0k|  pthread_mutex_lock(tile_mt_info->job_mutex);
 2912|       |
 2913|  30.0k|  if (tile_mt_info->jobs_dequeued < tile_mt_info->jobs_enqueued) {
  ------------------
  |  Branch (2913:7): [True: 14.6k, False: 15.4k]
  ------------------
 2914|  14.6k|    cur_job_info = tile_mt_info->job_queue + tile_mt_info->jobs_dequeued;
 2915|  14.6k|    tile_mt_info->jobs_dequeued++;
 2916|  14.6k|  }
 2917|       |
 2918|  30.0k|  pthread_mutex_unlock(tile_mt_info->job_mutex);
 2919|       |#else
 2920|       |  (void)tile_mt_info;
 2921|       |#endif
 2922|  30.0k|  return cur_job_info;
 2923|  30.0k|}
decodeframe.c:tile_worker_hook_init:
 2929|  14.6k|                                         uint8_t allow_update_cdf) {
 2930|  14.6k|  AV1_COMMON *cm = &pbi->common;
 2931|  14.6k|  ThreadData *const td = thread_data->td;
 2932|  14.6k|  int tile_row = tile_data->tile_info.tile_row;
 2933|  14.6k|  int tile_col = tile_data->tile_info.tile_col;
 2934|       |
 2935|  14.6k|  td->bit_reader = &tile_data->bit_reader;
 2936|  14.6k|  av1_zero(td->cb_buffer_base.dqcoeff);
  ------------------
  |  |   43|  14.6k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 2937|       |
 2938|  14.6k|  MACROBLOCKD *const xd = &td->dcb.xd;
 2939|  14.6k|  av1_tile_init(&xd->tile, cm, tile_row, tile_col);
 2940|  14.6k|  xd->current_base_qindex = cm->quant_params.base_qindex;
 2941|       |
 2942|  14.6k|  setup_bool_decoder(xd, tile_buffer->data, thread_data->data_end,
 2943|  14.6k|                     tile_buffer->size, &thread_data->error_info,
 2944|  14.6k|                     td->bit_reader, allow_update_cdf);
 2945|       |#if CONFIG_ACCOUNTING
 2946|       |  if (pbi->acct_enabled) {
 2947|       |    td->bit_reader->accounting = &pbi->accounting;
 2948|       |    td->bit_reader->accounting->last_tell_frac =
 2949|       |        aom_reader_tell_frac(td->bit_reader);
 2950|       |  } else {
 2951|       |    td->bit_reader->accounting = NULL;
 2952|       |  }
 2953|       |#endif
 2954|  14.6k|  av1_init_macroblockd(cm, xd);
 2955|  14.6k|  xd->error_info = &thread_data->error_info;
 2956|  14.6k|  av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, xd);
 2957|       |
 2958|       |  // Initialise the tile context from the frame context
 2959|  14.6k|  tile_data->tctx = *cm->fc;
 2960|  14.6k|  xd->tile_ctx = &tile_data->tctx;
 2961|       |#if CONFIG_ACCOUNTING
 2962|       |  if (pbi->acct_enabled) {
 2963|       |    tile_data->bit_reader.accounting->last_tell_frac =
 2964|       |        aom_reader_tell_frac(&tile_data->bit_reader);
 2965|       |  }
 2966|       |#endif
 2967|  14.6k|}
decodeframe.c:setup_bool_decoder:
 1405|  27.3k|    aom_reader *r, uint8_t allow_update_cdf) {
 1406|       |  // Validate the calculated partition length. If the buffer
 1407|       |  // described by the partition can't be fully read, then restrict
 1408|       |  // it to the portion that can be (for EC mode) or throw an error.
 1409|  27.3k|  if (!read_is_valid(data, read_size, data_end)) {
  ------------------
  |  Branch (1409:7): [True: 0, False: 27.3k]
  ------------------
 1410|       |    // When internal error occurs ensure that xd->mi_row is set appropriately
 1411|       |    // w.r.t. current tile, which is used to signal processing of current row is
 1412|       |    // done in row-mt decoding.
 1413|      0|    xd->mi_row = xd->tile.mi_row_start;
 1414|       |
 1415|      0|    aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
 1416|      0|                       "Truncated packet or corrupt tile length");
 1417|      0|  }
 1418|  27.3k|  if (aom_reader_init(r, data, read_size)) {
  ------------------
  |  Branch (1418:7): [True: 0, False: 27.3k]
  ------------------
 1419|       |    // When internal error occurs ensure that xd->mi_row is set appropriately
 1420|       |    // w.r.t. current tile, which is used to signal processing of current row is
 1421|       |    // done in row-mt decoding.
 1422|      0|    xd->mi_row = xd->tile.mi_row_start;
 1423|       |
 1424|      0|    aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
 1425|      0|                       "Failed to allocate bool decoder %d", 1);
 1426|      0|  }
 1427|       |
 1428|  27.3k|  r->allow_update_cdf = allow_update_cdf;
 1429|  27.3k|}
decodeframe.c:parse_tile_row_mt:
 3164|  14.6k|                                     TileDataDec *const tile_data) {
 3165|  14.6k|  AV1_COMMON *const cm = &pbi->common;
 3166|  14.6k|  const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
 3167|  14.6k|  const int num_planes = av1_num_planes(cm);
 3168|  14.6k|  const TileInfo *const tile_info = &tile_data->tile_info;
 3169|  14.6k|  int tile_row = tile_info->tile_row;
 3170|  14.6k|  DecoderCodingBlock *const dcb = &td->dcb;
 3171|  14.6k|  MACROBLOCKD *const xd = &dcb->xd;
 3172|       |
 3173|  14.6k|  av1_zero_above_context(cm, xd, tile_info->mi_col_start, tile_info->mi_col_end,
 3174|  14.6k|                         tile_row);
 3175|  14.6k|  av1_reset_loop_filter_delta(xd, num_planes);
 3176|  14.6k|  av1_reset_loop_restoration(xd, num_planes);
 3177|       |
 3178|  40.7k|  for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
  ------------------
  |  Branch (3178:46): [True: 30.2k, False: 10.5k]
  ------------------
 3179|  30.2k|       mi_row += cm->seq_params->mib_size) {
 3180|  30.2k|    av1_zero_left_context(xd);
 3181|       |
 3182|   119k|    for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
  ------------------
  |  Branch (3182:48): [True: 93.8k, False: 26.1k]
  ------------------
 3183|  93.8k|         mi_col += cm->seq_params->mib_size) {
 3184|  93.8k|      set_cb_buffer(pbi, dcb, pbi->cb_buffer_base, num_planes, mi_row, mi_col);
 3185|       |
 3186|       |      // Bit-stream parsing of the superblock
 3187|  93.8k|      decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
 3188|  93.8k|                       cm->seq_params->sb_size, 0x1);
 3189|       |
 3190|  93.8k|      if (aom_reader_has_overflowed(td->bit_reader)) {
  ------------------
  |  Branch (3190:11): [True: 4.07k, False: 89.7k]
  ------------------
 3191|  4.07k|        aom_merge_corrupted_flag(&dcb->corrupted, 1);
 3192|  4.07k|        return;
 3193|  4.07k|      }
 3194|  93.8k|    }
 3195|  26.1k|    signal_parse_sb_row_done(pbi, tile_data, sb_mi_size);
 3196|  26.1k|  }
 3197|       |
 3198|  10.5k|  int corrupted =
 3199|  10.5k|      (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0;
  ------------------
  |  Branch (3199:7): [True: 2.74k, False: 7.80k]
  ------------------
 3200|  10.5k|  aom_merge_corrupted_flag(&dcb->corrupted, corrupted);
 3201|  10.5k|}
decodeframe.c:set_cb_buffer:
 2456|   224k|                                 const int num_planes, int mi_row, int mi_col) {
 2457|   224k|  AV1_COMMON *const cm = &pbi->common;
 2458|   224k|  int mib_size_log2 = cm->seq_params->mib_size_log2;
 2459|   224k|  int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1;
 2460|   224k|  int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
 2461|   224k|  CB_BUFFER *cb_buffer = cb_buffer_base + offset;
 2462|       |
 2463|   762k|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (2463:23): [True: 538k, False: 224k]
  ------------------
 2464|   538k|    dcb->dqcoeff_block[plane] = cb_buffer->dqcoeff[plane];
 2465|   538k|    dcb->eob_data[plane] = cb_buffer->eob_data[plane];
 2466|   538k|    dcb->cb_offset[plane] = 0;
 2467|   538k|    dcb->txb_offset[plane] = 0;
 2468|   538k|  }
 2469|   224k|  MACROBLOCKD *const xd = &dcb->xd;
 2470|   224k|  xd->plane[0].color_index_map = cb_buffer->color_index_map[0];
 2471|   224k|  xd->plane[1].color_index_map = cb_buffer->color_index_map[1];
 2472|   224k|  xd->color_index_map_offset[0] = 0;
 2473|   224k|  xd->color_index_map_offset[1] = 0;
 2474|   224k|}
decodeframe.c:decode_partition:
 1256|  2.30M|                                    BLOCK_SIZE bsize, int parse_decode_flag) {
 1257|  2.30M|  assert(bsize < BLOCK_SIZES_ALL);
 1258|  2.30M|  AV1_COMMON *const cm = &pbi->common;
 1259|  2.30M|  DecoderCodingBlock *const dcb = &td->dcb;
 1260|  2.30M|  MACROBLOCKD *const xd = &dcb->xd;
 1261|  2.30M|  const int bw = mi_size_wide[bsize];
 1262|  2.30M|  const int hbs = bw >> 1;
 1263|  2.30M|  PARTITION_TYPE partition;
 1264|  2.30M|  BLOCK_SIZE subsize;
 1265|  2.30M|  const int quarter_step = bw / 4;
 1266|  2.30M|  BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
 1267|  2.30M|  const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows;
 1268|  2.30M|  const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols;
 1269|       |
 1270|  2.30M|  if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
  ------------------
  |  Branch (1270:7): [True: 115k, False: 2.18M]
  |  Branch (1270:42): [True: 101k, False: 2.08M]
  ------------------
 1271|   214k|    return;
 1272|       |
 1273|       |  // parse_decode_flag takes the following values :
 1274|       |  // 01 - do parse only
 1275|       |  // 10 - do decode only
 1276|       |  // 11 - do parse and decode
 1277|  2.09M|  static const block_visitor_fn_t block_visit[4] = { NULL, parse_decode_block,
 1278|  2.09M|                                                     decode_block,
 1279|  2.09M|                                                     parse_decode_block };
 1280|       |
 1281|  2.09M|  if (parse_decode_flag & 1) {
  ------------------
  |  Branch (1281:7): [True: 1.45M, False: 639k]
  ------------------
 1282|  1.45M|    const int num_planes = av1_num_planes(cm);
 1283|  4.75M|    for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (1283:25): [True: 3.30M, False: 1.45M]
  ------------------
 1284|  3.30M|      int rcol0, rcol1, rrow0, rrow1;
 1285|       |
 1286|       |      // Skip some unnecessary work if loop restoration is disabled
 1287|  3.30M|      if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
  ------------------
  |  Branch (1287:11): [True: 3.07M, False: 232k]
  ------------------
 1288|       |
 1289|   232k|      if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
  ------------------
  |  Branch (1289:11): [True: 18.1k, False: 214k]
  ------------------
 1290|   232k|                                             &rcol0, &rcol1, &rrow0, &rrow1)) {
 1291|  18.1k|        const int rstride = cm->rst_info[plane].horz_units;
 1292|  36.2k|        for (int rrow = rrow0; rrow < rrow1; ++rrow) {
  ------------------
  |  Branch (1292:32): [True: 18.1k, False: 18.1k]
  ------------------
 1293|  36.6k|          for (int rcol = rcol0; rcol < rcol1; ++rcol) {
  ------------------
  |  Branch (1293:34): [True: 18.5k, False: 18.1k]
  ------------------
 1294|  18.5k|            const int runit_idx = rcol + rrow * rstride;
 1295|  18.5k|            loop_restoration_read_sb_coeffs(cm, xd, reader, plane, runit_idx);
 1296|  18.5k|          }
 1297|  18.1k|        }
 1298|  18.1k|      }
 1299|   232k|    }
 1300|       |
 1301|  1.45M|    partition = (bsize < BLOCK_8X8) ? PARTITION_NONE
  ------------------
  |  Branch (1301:17): [True: 133k, False: 1.31M]
  ------------------
 1302|  1.45M|                                    : read_partition(xd, mi_row, mi_col, reader,
 1303|  1.31M|                                                     has_rows, has_cols, bsize);
 1304|  1.45M|  } else {
 1305|   639k|    partition = get_partition(cm, mi_row, mi_col, bsize);
 1306|   639k|  }
 1307|  2.09M|  subsize = get_partition_subsize(bsize, partition);
 1308|  2.09M|  if (subsize == BLOCK_INVALID) {
  ------------------
  |  Branch (1308:7): [True: 0, False: 2.09M]
  ------------------
 1309|       |    // When an internal error occurs ensure that xd->mi_row is set appropriately
 1310|       |    // w.r.t. current tile, which is used to signal processing of current row is
 1311|       |    // done.
 1312|      0|    xd->mi_row = mi_row;
 1313|      0|    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
 1314|      0|                       "Partition is invalid for block size %dx%d",
 1315|      0|                       block_size_wide[bsize], block_size_high[bsize]);
 1316|      0|  }
 1317|       |  // Check the bitstream is conformant: if there is subsampling on the
 1318|       |  // chroma planes, subsize must subsample to a valid block size.
 1319|  2.09M|  const struct macroblockd_plane *const pd_u = &xd->plane[1];
 1320|  2.09M|  if (get_plane_block_size(subsize, pd_u->subsampling_x, pd_u->subsampling_y) ==
  ------------------
  |  Branch (1320:7): [True: 156, False: 2.09M]
  ------------------
 1321|  2.09M|      BLOCK_INVALID) {
 1322|       |    // When an internal error occurs ensure that xd->mi_row is set appropriately
 1323|       |    // w.r.t. current tile, which is used to signal processing of current row is
 1324|       |    // done.
 1325|    156|    xd->mi_row = mi_row;
 1326|    156|    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
 1327|    156|                       "Block size %dx%d invalid with this subsampling mode",
 1328|    156|                       block_size_wide[subsize], block_size_high[subsize]);
 1329|    156|  }
 1330|       |
 1331|  2.09M|#define DEC_BLOCK_STX_ARG
 1332|  2.09M|#define DEC_BLOCK_EPT_ARG partition,
 1333|  2.09M|#define DEC_BLOCK(db_r, db_c, db_subsize)                                  \
 1334|  2.09M|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
 1335|  2.09M|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
 1336|  2.09M|#define DEC_PARTITION(db_r, db_c, db_subsize)                        \
 1337|  2.09M|  decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \
 1338|  2.09M|                   (db_subsize), parse_decode_flag)
 1339|       |
 1340|  2.09M|  switch (partition) {
 1341|   910k|    case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break;
  ------------------
  |  | 1334|   910k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   910k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   910k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
  |  Branch (1341:5): [True: 910k, False: 1.18M]
  ------------------
 1342|   254k|    case PARTITION_HORZ:
  ------------------
  |  Branch (1342:5): [True: 254k, False: 1.83M]
  ------------------
 1343|   254k|      DEC_BLOCK(mi_row, mi_col, subsize);
  ------------------
  |  | 1334|   254k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   254k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   254k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1344|   254k|      if (has_rows) DEC_BLOCK(mi_row + hbs, mi_col, subsize);
  ------------------
  |  | 1334|   225k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   225k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   225k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
  |  Branch (1344:11): [True: 225k, False: 28.6k]
  ------------------
 1345|   254k|      break;
 1346|   173k|    case PARTITION_VERT:
  ------------------
  |  Branch (1346:5): [True: 173k, False: 1.91M]
  ------------------
 1347|   173k|      DEC_BLOCK(mi_row, mi_col, subsize);
  ------------------
  |  | 1334|   173k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   173k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   173k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1348|   173k|      if (has_cols) DEC_BLOCK(mi_row, mi_col + hbs, subsize);
  ------------------
  |  | 1334|   145k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   145k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   145k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
  |  Branch (1348:11): [True: 145k, False: 27.8k]
  ------------------
 1349|   173k|      break;
 1350|   521k|    case PARTITION_SPLIT:
  ------------------
  |  Branch (1350:5): [True: 521k, False: 1.56M]
  ------------------
 1351|   521k|      DEC_PARTITION(mi_row, mi_col, subsize);
  ------------------
  |  | 1337|   521k|  decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \
  |  | 1338|   521k|                   (db_subsize), parse_decode_flag)
  ------------------
 1352|   521k|      DEC_PARTITION(mi_row, mi_col + hbs, subsize);
  ------------------
  |  | 1337|   521k|  decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \
  |  | 1338|   521k|                   (db_subsize), parse_decode_flag)
  ------------------
 1353|   521k|      DEC_PARTITION(mi_row + hbs, mi_col, subsize);
  ------------------
  |  | 1337|   521k|  decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \
  |  | 1338|   521k|                   (db_subsize), parse_decode_flag)
  ------------------
 1354|   521k|      DEC_PARTITION(mi_row + hbs, mi_col + hbs, subsize);
  ------------------
  |  | 1337|   521k|  decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \
  |  | 1338|   521k|                   (db_subsize), parse_decode_flag)
  ------------------
 1355|   521k|      break;
 1356|  30.7k|    case PARTITION_HORZ_A:
  ------------------
  |  Branch (1356:5): [True: 30.7k, False: 2.05M]
  ------------------
 1357|  30.7k|      DEC_BLOCK(mi_row, mi_col, bsize2);
  ------------------
  |  | 1334|  30.7k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  30.7k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  30.7k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1358|  30.7k|      DEC_BLOCK(mi_row, mi_col + hbs, bsize2);
  ------------------
  |  | 1334|  30.7k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  30.7k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  30.7k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1359|  30.7k|      DEC_BLOCK(mi_row + hbs, mi_col, subsize);
  ------------------
  |  | 1334|  30.7k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  30.7k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  30.7k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1360|  30.7k|      break;
 1361|  36.9k|    case PARTITION_HORZ_B:
  ------------------
  |  Branch (1361:5): [True: 36.9k, False: 2.05M]
  ------------------
 1362|  36.9k|      DEC_BLOCK(mi_row, mi_col, subsize);
  ------------------
  |  | 1334|  36.9k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  36.9k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  36.9k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1363|  36.9k|      DEC_BLOCK(mi_row + hbs, mi_col, bsize2);
  ------------------
  |  | 1334|  36.9k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  36.9k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  36.9k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1364|  36.9k|      DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2);
  ------------------
  |  | 1334|  36.9k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  36.9k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  36.9k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1365|  36.9k|      break;
 1366|  22.1k|    case PARTITION_VERT_A:
  ------------------
  |  Branch (1366:5): [True: 22.1k, False: 2.06M]
  ------------------
 1367|  22.1k|      DEC_BLOCK(mi_row, mi_col, bsize2);
  ------------------
  |  | 1334|  22.1k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  22.1k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  22.1k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1368|  22.1k|      DEC_BLOCK(mi_row + hbs, mi_col, bsize2);
  ------------------
  |  | 1334|  22.1k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  22.1k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  22.1k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1369|  22.1k|      DEC_BLOCK(mi_row, mi_col + hbs, subsize);
  ------------------
  |  | 1334|  22.1k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  22.1k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  22.1k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1370|  22.1k|      break;
 1371|  26.6k|    case PARTITION_VERT_B:
  ------------------
  |  Branch (1371:5): [True: 26.6k, False: 2.06M]
  ------------------
 1372|  26.6k|      DEC_BLOCK(mi_row, mi_col, subsize);
  ------------------
  |  | 1334|  26.6k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  26.6k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  26.6k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1373|  26.6k|      DEC_BLOCK(mi_row, mi_col + hbs, bsize2);
  ------------------
  |  | 1334|  26.6k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  26.6k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  26.6k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1374|  26.6k|      DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2);
  ------------------
  |  | 1334|  26.6k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  26.6k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  26.6k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1375|  26.6k|      break;
 1376|  74.7k|    case PARTITION_HORZ_4:
  ------------------
  |  Branch (1376:5): [True: 74.7k, False: 2.01M]
  ------------------
 1377|   373k|      for (int i = 0; i < 4; ++i) {
  ------------------
  |  Branch (1377:23): [True: 298k, False: 74.6k]
  ------------------
 1378|   298k|        int this_mi_row = mi_row + i * quarter_step;
 1379|   298k|        if (i > 0 && this_mi_row >= cm->mi_params.mi_rows) break;
  ------------------
  |  Branch (1379:13): [True: 224k, False: 74.7k]
  |  Branch (1379:22): [True: 122, False: 223k]
  ------------------
 1380|   298k|        DEC_BLOCK(this_mi_row, mi_col, subsize);
  ------------------
  |  | 1334|   298k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   298k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   298k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1381|   298k|      }
 1382|  74.7k|      break;
 1383|  40.7k|    case PARTITION_VERT_4:
  ------------------
  |  Branch (1383:5): [True: 40.7k, False: 2.04M]
  ------------------
 1384|   203k|      for (int i = 0; i < 4; ++i) {
  ------------------
  |  Branch (1384:23): [True: 162k, False: 40.4k]
  ------------------
 1385|   162k|        int this_mi_col = mi_col + i * quarter_step;
 1386|   162k|        if (i > 0 && this_mi_col >= cm->mi_params.mi_cols) break;
  ------------------
  |  Branch (1386:13): [True: 122k, False: 40.7k]
  |  Branch (1386:22): [True: 311, False: 121k]
  ------------------
 1387|   162k|        DEC_BLOCK(mi_row, this_mi_col, subsize);
  ------------------
  |  | 1334|   162k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   162k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   162k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1388|   162k|      }
 1389|  40.7k|      break;
 1390|      0|    default: assert(0 && "Invalid partition type");
  ------------------
  |  Branch (1390:5): [True: 0, False: 2.09M]
  ------------------
 1391|  2.09M|  }
 1392|       |
 1393|  2.08M|#undef DEC_PARTITION
 1394|  2.08M|#undef DEC_BLOCK
 1395|  2.08M|#undef DEC_BLOCK_EPT_ARG
 1396|  2.08M|#undef DEC_BLOCK_STX_ARG
 1397|       |
 1398|  2.08M|  if (parse_decode_flag & 1)
  ------------------
  |  Branch (1398:7): [True: 1.44M, False: 640k]
  ------------------
 1399|  1.44M|    update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
 1400|  2.08M|}
decodeframe.c:parse_decode_block:
 1130|  1.75M|                                      BLOCK_SIZE bsize) {
 1131|  1.75M|  DecoderCodingBlock *const dcb = &td->dcb;
 1132|  1.75M|  MACROBLOCKD *const xd = &dcb->xd;
 1133|  1.75M|  decode_mbmi_block(pbi, dcb, mi_row, mi_col, r, partition, bsize);
 1134|       |
 1135|  1.75M|  av1_visit_palette(pbi, xd, r, av1_decode_palette_tokens);
 1136|       |
 1137|  1.75M|  AV1_COMMON *cm = &pbi->common;
 1138|  1.75M|  const int num_planes = av1_num_planes(cm);
 1139|  1.75M|  MB_MODE_INFO *mbmi = xd->mi[0];
 1140|  1.75M|  int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi);
  ------------------
  |  Branch (1140:24): [True: 88.4k, False: 1.66M]
  |  Branch (1140:48): [True: 27, False: 1.66M]
  ------------------
 1141|  1.75M|  if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
  ------------------
  |  Branch (1141:7): [True: 748k, False: 1.00M]
  |  Branch (1141:49): [True: 687k, False: 61.3k]
  ------------------
 1142|   687k|      !mbmi->skip_txfm && inter_block_tx && !xd->lossless[mbmi->segment_id]) {
  ------------------
  |  Branch (1142:7): [True: 557k, False: 130k]
  |  Branch (1142:27): [True: 31.1k, False: 525k]
  |  Branch (1142:45): [True: 31.1k, False: 14]
  ------------------
 1143|  31.1k|    const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
 1144|  31.1k|    const int bh = tx_size_high_unit[max_tx_size];
 1145|  31.1k|    const int bw = tx_size_wide_unit[max_tx_size];
 1146|  31.1k|    const int width = mi_size_wide[bsize];
 1147|  31.1k|    const int height = mi_size_high[bsize];
 1148|       |
 1149|  62.3k|    for (int idy = 0; idy < height; idy += bh)
  ------------------
  |  Branch (1149:23): [True: 31.2k, False: 31.1k]
  ------------------
 1150|  62.5k|      for (int idx = 0; idx < width; idx += bw)
  ------------------
  |  Branch (1150:25): [True: 31.3k, False: 31.2k]
  ------------------
 1151|  31.3k|        read_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, r);
 1152|  1.72M|  } else {
 1153|  1.72M|    mbmi->tx_size = read_tx_size(xd, cm->features.tx_mode, inter_block_tx,
 1154|  1.72M|                                 !mbmi->skip_txfm, r);
 1155|  1.72M|    if (inter_block_tx)
  ------------------
  |  Branch (1155:9): [True: 56.4k, False: 1.66M]
  ------------------
 1156|  56.4k|      memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
 1157|  1.72M|    set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height,
 1158|  1.72M|                  mbmi->skip_txfm && is_inter_block(mbmi), xd);
  ------------------
  |  Branch (1158:19): [True: 452k, False: 1.27M]
  |  Branch (1158:38): [True: 15.2k, False: 437k]
  ------------------
 1159|  1.72M|  }
 1160|       |
 1161|  1.75M|  if (cm->delta_q_info.delta_q_present_flag) {
  ------------------
  |  Branch (1161:7): [True: 818k, False: 937k]
  ------------------
 1162|  7.36M|    for (int i = 0; i < MAX_SEGMENTS; i++) {
  ------------------
  |  |   21|  7.36M|#define MAX_SEGMENTS 8
  ------------------
  |  Branch (1162:21): [True: 6.55M, False: 818k]
  ------------------
 1163|  6.55M|      const int current_qindex =
 1164|  6.55M|          av1_get_qindex(&cm->seg, i, xd->current_base_qindex);
 1165|  6.55M|      const CommonQuantParams *const quant_params = &cm->quant_params;
 1166|  20.0M|      for (int j = 0; j < num_planes; ++j) {
  ------------------
  |  Branch (1166:23): [True: 13.5M, False: 6.55M]
  ------------------
 1167|  13.5M|        const int dc_delta_q = j == 0 ? quant_params->y_dc_delta_q
  ------------------
  |  Branch (1167:32): [True: 6.55M, False: 6.99M]
  ------------------
 1168|  13.5M|                                      : (j == 1 ? quant_params->u_dc_delta_q
  ------------------
  |  Branch (1168:42): [True: 3.49M, False: 3.49M]
  ------------------
 1169|  6.99M|                                                : quant_params->v_dc_delta_q);
 1170|  13.5M|        const int ac_delta_q = j == 0 ? 0
  ------------------
  |  Branch (1170:32): [True: 6.55M, False: 6.99M]
  ------------------
 1171|  13.5M|                                      : (j == 1 ? quant_params->u_ac_delta_q
  ------------------
  |  Branch (1171:42): [True: 3.49M, False: 3.49M]
  ------------------
 1172|  6.99M|                                                : quant_params->v_ac_delta_q);
 1173|  13.5M|        xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX(
 1174|  13.5M|            current_qindex, dc_delta_q, cm->seq_params->bit_depth);
 1175|  13.5M|        xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX(
 1176|  13.5M|            current_qindex, ac_delta_q, cm->seq_params->bit_depth);
 1177|  13.5M|      }
 1178|  6.55M|    }
 1179|   818k|  }
 1180|  1.75M|  if (mbmi->skip_txfm) av1_reset_entropy_context(xd, bsize, num_planes);
  ------------------
  |  Branch (1180:7): [True: 452k, False: 1.30M]
  ------------------
 1181|       |
 1182|  1.75M|  decode_token_recon_block(pbi, td, r, bsize);
 1183|  1.75M|}
decodeframe.c:decode_mbmi_block:
  375|  1.75M|                                     BLOCK_SIZE bsize) {
  376|  1.75M|  AV1_COMMON *const cm = &pbi->common;
  377|  1.75M|  const SequenceHeader *const seq_params = cm->seq_params;
  378|  1.75M|  const int bw = mi_size_wide[bsize];
  379|  1.75M|  const int bh = mi_size_high[bsize];
  380|  1.75M|  const int x_mis = AOMMIN(bw, cm->mi_params.mi_cols - mi_col);
  ------------------
  |  |   34|  1.75M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.63M, False: 122k]
  |  |  ------------------
  ------------------
  381|  1.75M|  const int y_mis = AOMMIN(bh, cm->mi_params.mi_rows - mi_row);
  ------------------
  |  |   34|  1.75M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.62M, False: 132k]
  |  |  ------------------
  ------------------
  382|  1.75M|  MACROBLOCKD *const xd = &dcb->xd;
  383|       |
  384|       |#if CONFIG_ACCOUNTING
  385|       |  aom_accounting_set_context(&pbi->accounting, mi_col, mi_row);
  386|       |#endif
  387|  1.75M|  set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
  388|  1.75M|  xd->mi[0]->partition = partition;
  389|  1.75M|  av1_read_mode_info(pbi, dcb, r, x_mis, y_mis);
  390|  1.75M|  if (bsize >= BLOCK_8X8 &&
  ------------------
  |  Branch (390:7): [True: 1.36M, False: 389k]
  ------------------
  391|  1.36M|      (seq_params->subsampling_x || seq_params->subsampling_y)) {
  ------------------
  |  Branch (391:8): [True: 689k, False: 677k]
  |  Branch (391:37): [True: 18.4E, False: 677k]
  ------------------
  392|   689k|    const BLOCK_SIZE uv_subsize =
  393|   689k|        av1_ss_size_lookup[bsize][seq_params->subsampling_x]
  394|   689k|                          [seq_params->subsampling_y];
  395|   689k|    if (uv_subsize == BLOCK_INVALID)
  ------------------
  |  Branch (395:9): [True: 0, False: 689k]
  ------------------
  396|      0|      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
  397|      0|                         "Invalid block size.");
  398|   689k|  }
  399|  1.75M|}
decodeframe.c:set_offsets:
  339|  1.75M|                               int bh, int x_mis, int y_mis) {
  340|  1.75M|  const int num_planes = av1_num_planes(cm);
  341|  1.75M|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
  342|  1.75M|  const TileInfo *const tile = &xd->tile;
  343|       |
  344|  1.75M|  set_mi_offsets(mi_params, xd, mi_row, mi_col);
  345|  1.75M|  xd->mi[0]->bsize = bsize;
  346|       |#if CONFIG_RD_DEBUG
  347|       |  xd->mi[0]->mi_row = mi_row;
  348|       |  xd->mi[0]->mi_col = mi_col;
  349|       |#endif
  350|       |
  351|  1.75M|  assert(x_mis && y_mis);
  352|  6.28M|  for (int x = 1; x < x_mis; ++x) xd->mi[x] = xd->mi[0];
  ------------------
  |  Branch (352:19): [True: 4.52M, False: 1.75M]
  ------------------
  353|  1.75M|  int idx = mi_params->mi_stride;
  354|  5.72M|  for (int y = 1; y < y_mis; ++y) {
  ------------------
  |  Branch (354:19): [True: 3.96M, False: 1.75M]
  ------------------
  355|  3.96M|    memcpy(&xd->mi[idx], &xd->mi[0], x_mis * sizeof(xd->mi[0]));
  356|  3.96M|    idx += mi_params->mi_stride;
  357|  3.96M|  }
  358|       |
  359|  1.75M|  set_plane_n4(xd, bw, bh, num_planes);
  360|  1.75M|  set_entropy_context(xd, mi_row, mi_col, num_planes);
  361|       |
  362|       |  // Distance of Mb to the various image edges. These are specified to 8th pel
  363|       |  // as they are always compared to values that are in 1/8th pel units
  364|  1.75M|  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
  365|  1.75M|                 mi_params->mi_cols);
  366|       |
  367|  1.75M|  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
  368|  1.75M|                       num_planes);
  369|  1.75M|}
decodeframe.c:read_tx_size_vartx:
 1030|  56.8k|                                      int blk_col, aom_reader *r) {
 1031|  56.8k|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 1032|  56.8k|  int is_split = 0;
 1033|  56.8k|  const BLOCK_SIZE bsize = mbmi->bsize;
 1034|  56.8k|  const int max_blocks_high = max_block_high(xd, bsize, 0);
 1035|  56.8k|  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
 1036|  56.8k|  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
  ------------------
  |  Branch (1036:7): [True: 140, False: 56.6k]
  |  Branch (1036:37): [True: 66, False: 56.6k]
  ------------------
 1037|  56.8k|  assert(tx_size > TX_4X4);
 1038|  56.6k|  TX_SIZE txs = max_txsize_rect_lookup[bsize];
 1039|   113k|  for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level)
  ------------------
  |  |   56|   113k|#define MAX_VARTX_DEPTH 2
  ------------------
  |  Branch (1039:23): [True: 56.6k, False: 56.6k]
  ------------------
 1040|  56.6k|    txs = sub_tx_size_map[txs];
 1041|  56.6k|  const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
  ------------------
  |  |   39|  56.6k|#define MI_SIZE_LOG2 2
  ------------------
 1042|  56.6k|  const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
  ------------------
  |  |   39|  56.6k|#define MI_SIZE_LOG2 2
  ------------------
 1043|  56.6k|  const int bw_log2 = mi_size_wide_log2[bsize];
 1044|  56.6k|  const int stride_log2 = bw_log2 - tx_w_log2;
 1045|       |
 1046|  56.6k|  if (depth == MAX_VARTX_DEPTH) {
  ------------------
  |  |   56|  56.6k|#define MAX_VARTX_DEPTH 2
  ------------------
  |  Branch (1046:7): [True: 6.77k, False: 49.8k]
  ------------------
 1047|  6.77k|    set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size,
 1048|  6.77k|                      tx_size, blk_row, blk_col);
 1049|  6.77k|    mbmi->tx_size = tx_size;
 1050|  6.77k|    txfm_partition_update(xd->above_txfm_context + blk_col,
 1051|  6.77k|                          xd->left_txfm_context + blk_row, tx_size, tx_size);
 1052|  6.77k|    return;
 1053|  6.77k|  }
 1054|       |
 1055|  49.8k|  const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
 1056|  49.8k|                                         xd->left_txfm_context + blk_row,
 1057|  49.8k|                                         mbmi->bsize, tx_size);
 1058|  49.8k|  is_split = aom_read_symbol(r, ec_ctx->txfm_partition_cdf[ctx], 2, ACCT_STR);
  ------------------
  |  |   51|  49.8k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1059|       |
 1060|  49.8k|  if (is_split) {
  ------------------
  |  Branch (1060:7): [True: 13.0k, False: 36.7k]
  ------------------
 1061|  13.0k|    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
 1062|  13.0k|    const int bsw = tx_size_wide_unit[sub_txs];
 1063|  13.0k|    const int bsh = tx_size_high_unit[sub_txs];
 1064|       |
 1065|  13.0k|    if (sub_txs == TX_4X4) {
  ------------------
  |  Branch (1065:9): [True: 3.97k, False: 9.07k]
  ------------------
 1066|  3.97k|      set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size,
 1067|  3.97k|                        sub_txs, blk_row, blk_col);
 1068|  3.97k|      mbmi->tx_size = sub_txs;
 1069|  3.97k|      txfm_partition_update(xd->above_txfm_context + blk_col,
 1070|  3.97k|                            xd->left_txfm_context + blk_row, sub_txs, tx_size);
 1071|  3.97k|      return;
 1072|  3.97k|    }
 1073|       |
 1074|  13.0k|    assert(bsw > 0 && bsh > 0);
 1075|  23.8k|    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
  ------------------
  |  Branch (1075:23): [True: 14.7k, False: 9.07k]
  ------------------
 1076|  40.2k|      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
  ------------------
  |  Branch (1076:25): [True: 25.4k, False: 14.7k]
  ------------------
 1077|  25.4k|        int offsetr = blk_row + row;
 1078|  25.4k|        int offsetc = blk_col + col;
 1079|  25.4k|        read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, r);
 1080|  25.4k|      }
 1081|  14.7k|    }
 1082|  36.7k|  } else {
 1083|  36.7k|    set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size,
 1084|  36.7k|                      tx_size, blk_row, blk_col);
 1085|  36.7k|    mbmi->tx_size = tx_size;
 1086|  36.7k|    txfm_partition_update(xd->above_txfm_context + blk_col,
 1087|  36.7k|                          xd->left_txfm_context + blk_row, tx_size, tx_size);
 1088|  36.7k|  }
 1089|  49.8k|}
decodeframe.c:set_inter_tx_size:
 1016|  47.5k|                                     int blk_col) {
 1017|   110k|  for (int idy = 0; idy < tx_size_high_unit[split_size];
  ------------------
  |  Branch (1017:21): [True: 63.0k, False: 47.5k]
  ------------------
 1018|  63.0k|       idy += tx_size_high_unit[min_txs]) {
 1019|   155k|    for (int idx = 0; idx < tx_size_wide_unit[split_size];
  ------------------
  |  Branch (1019:23): [True: 92.1k, False: 63.0k]
  ------------------
 1020|  92.1k|         idx += tx_size_wide_unit[min_txs]) {
 1021|  92.1k|      const int index = (((blk_row + idy) >> tx_h_log2) << stride_log2) +
 1022|  92.1k|                        ((blk_col + idx) >> tx_w_log2);
 1023|  92.1k|      mbmi->inter_tx_size[index] = txs;
 1024|  92.1k|    }
 1025|  63.0k|  }
 1026|  47.5k|}
decodeframe.c:read_tx_size:
 1109|  1.72M|                            aom_reader *r) {
 1110|  1.72M|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
 1111|  1.72M|  if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;
  ------------------
  |  Branch (1111:7): [True: 51.4k, False: 1.67M]
  ------------------
 1112|       |
 1113|  1.67M|  if (block_signals_txsize(bsize)) {
  ------------------
  |  Branch (1113:7): [True: 1.54M, False: 128k]
  ------------------
 1114|  1.54M|    if ((!is_inter || allow_select_inter) && tx_mode == TX_MODE_SELECT) {
  ------------------
  |  Branch (1114:10): [True: 1.50M, False: 42.3k]
  |  Branch (1114:23): [True: 30.3k, False: 11.9k]
  |  Branch (1114:46): [True: 646k, False: 885k]
  ------------------
 1115|   646k|      const TX_SIZE coded_tx_size = read_selected_tx_size(xd, r);
 1116|   646k|      return coded_tx_size;
 1117|   897k|    } else {
 1118|   897k|      return tx_size_from_tx_mode(bsize, tx_mode);
 1119|   897k|    }
 1120|  1.54M|  } else {
 1121|       |    assert(IMPLIES(tx_mode == ONLY_4X4, bsize == BLOCK_4X4));
 1122|   128k|    return max_txsize_rect_lookup[bsize];
 1123|   128k|  }
 1124|  1.67M|}
decodeframe.c:read_selected_tx_size:
 1092|   646k|                                     aom_reader *r) {
 1093|       |  // TODO(debargha): Clean up the logic here. This function should only
 1094|       |  // be called for intra.
 1095|   646k|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
 1096|   646k|  const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
 1097|   646k|  const int max_depths = bsize_to_max_depth(bsize);
 1098|   646k|  const int ctx = get_tx_size_context(xd);
 1099|   646k|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 1100|   646k|  const int depth = aom_read_symbol(r, ec_ctx->tx_size_cdf[tx_size_cat][ctx],
  ------------------
  |  |   51|   646k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1101|   646k|                                    max_depths + 1, ACCT_STR);
 1102|       |  assert(depth >= 0 && depth <= max_depths);
 1103|   646k|  const TX_SIZE tx_size = depth_to_tx_size(depth, bsize);
 1104|   646k|  return tx_size;
 1105|   646k|}
decodeframe.c:decode_token_recon_block:
  904|  2.51M|                                            BLOCK_SIZE bsize) {
  905|  2.51M|  AV1_COMMON *const cm = &pbi->common;
  906|  2.51M|  DecoderCodingBlock *const dcb = &td->dcb;
  907|  2.51M|  MACROBLOCKD *const xd = &dcb->xd;
  908|  2.51M|  const int num_planes = av1_num_planes(cm);
  909|  2.51M|  MB_MODE_INFO *mbmi = xd->mi[0];
  910|       |
  911|  2.51M|  if (!is_inter_block(mbmi)) {
  ------------------
  |  Branch (911:7): [True: 2.42M, False: 95.3k]
  ------------------
  912|  2.42M|    int row, col;
  913|  2.42M|    assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
  914|  2.42M|                                         xd->plane[0].subsampling_y));
  915|  2.42M|    const int max_blocks_wide = max_block_wide(xd, bsize, 0);
  916|  2.42M|    const int max_blocks_high = max_block_high(xd, bsize, 0);
  917|  2.42M|    const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
  918|  2.42M|    int mu_blocks_wide = mi_size_wide[max_unit_bsize];
  919|  2.42M|    int mu_blocks_high = mi_size_high[max_unit_bsize];
  920|  2.42M|    mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
  ------------------
  |  |   34|  2.42M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.32M, False: 95.8k]
  |  |  ------------------
  ------------------
  921|  2.42M|    mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
  ------------------
  |  |   34|  2.42M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.33M, False: 87.6k]
  |  |  ------------------
  ------------------
  922|       |
  923|  4.84M|    for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
  ------------------
  |  Branch (923:19): [True: 2.42M, False: 2.42M]
  ------------------
  924|  4.86M|      for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
  ------------------
  |  Branch (924:21): [True: 2.43M, False: 2.42M]
  ------------------
  925|  7.67M|        for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (925:29): [True: 5.24M, False: 2.43M]
  ------------------
  926|  5.24M|          if (plane && !xd->is_chroma_ref) break;
  ------------------
  |  Branch (926:15): [True: 2.81M, False: 2.42M]
  |  Branch (926:24): [True: 2.71k, False: 2.81M]
  ------------------
  927|  5.24M|          const struct macroblockd_plane *const pd = &xd->plane[plane];
  928|  5.24M|          const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
  929|  5.24M|          const int stepr = tx_size_high_unit[tx_size];
  930|  5.24M|          const int stepc = tx_size_wide_unit[tx_size];
  931|       |
  932|  5.24M|          const int unit_height = ROUND_POWER_OF_TWO(
  ------------------
  |  |   41|  10.4M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (41:41): [True: 13.0k, False: 5.23M]
  |  |  ------------------
  ------------------
  933|  5.24M|              AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y);
  934|  5.24M|          const int unit_width = ROUND_POWER_OF_TWO(
  ------------------
  |  |   41|  10.4M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (41:41): [True: 13.4k, False: 5.23M]
  |  |  ------------------
  ------------------
  935|  5.24M|              AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x);
  936|       |
  937|  11.2M|          for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height;
  ------------------
  |  Branch (937:56): [True: 5.97M, False: 5.24M]
  ------------------
  938|  5.97M|               blk_row += stepr) {
  939|  14.5M|            for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
  ------------------
  |  Branch (939:58): [True: 8.56M, False: 5.97M]
  ------------------
  940|  8.56M|                 blk_col += stepc) {
  941|  8.56M|              td->read_coeffs_tx_intra_block_visit(cm, dcb, r, plane, blk_row,
  942|  8.56M|                                                   blk_col, tx_size);
  943|  8.56M|              td->predict_and_recon_intra_block_visit(
  944|  8.56M|                  cm, dcb, r, plane, blk_row, blk_col, tx_size);
  945|  8.56M|              set_cb_buffer_offsets(dcb, tx_size, plane);
  946|  8.56M|            }
  947|  5.97M|          }
  948|  5.24M|        }
  949|  2.43M|      }
  950|  2.42M|    }
  951|  2.42M|  } else {
  952|  95.3k|    td->predict_inter_block_visit(cm, dcb, bsize);
  953|       |    // Reconstruction
  954|  95.3k|    if (!mbmi->skip_txfm) {
  ------------------
  |  Branch (954:9): [True: 76.0k, False: 19.2k]
  ------------------
  955|  76.0k|      int eobtotal = 0;
  956|       |
  957|  76.0k|      const int max_blocks_wide = max_block_wide(xd, bsize, 0);
  958|  76.0k|      const int max_blocks_high = max_block_high(xd, bsize, 0);
  959|  76.0k|      int row, col;
  960|       |
  961|  76.0k|      const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
  962|  76.0k|      assert(max_unit_bsize ==
  963|  76.0k|             get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x,
  964|  76.0k|                                  xd->plane[0].subsampling_y));
  965|  76.0k|      int mu_blocks_wide = mi_size_wide[max_unit_bsize];
  966|  76.0k|      int mu_blocks_high = mi_size_high[max_unit_bsize];
  967|       |
  968|  76.0k|      mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
  ------------------
  |  |   34|  76.0k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 72.7k, False: 3.28k]
  |  |  ------------------
  ------------------
  969|  76.0k|      mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
  ------------------
  |  |   34|  76.0k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 72.8k, False: 3.21k]
  |  |  ------------------
  ------------------
  970|       |
  971|   152k|      for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
  ------------------
  |  Branch (971:21): [True: 76.2k, False: 76.0k]
  ------------------
  972|   152k|        for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
  ------------------
  |  Branch (972:23): [True: 76.5k, False: 76.2k]
  ------------------
  973|   270k|          for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (973:31): [True: 204k, False: 66.6k]
  ------------------
  974|   204k|            if (plane && !xd->is_chroma_ref) break;
  ------------------
  |  Branch (974:17): [True: 127k, False: 76.5k]
  |  Branch (974:26): [True: 9.87k, False: 117k]
  ------------------
  975|   194k|            const struct macroblockd_plane *const pd = &xd->plane[plane];
  976|   194k|            const int ss_x = pd->subsampling_x;
  977|   194k|            const int ss_y = pd->subsampling_y;
  978|   194k|            const BLOCK_SIZE plane_bsize =
  979|   194k|                get_plane_block_size(bsize, ss_x, ss_y);
  980|   194k|            const TX_SIZE max_tx_size =
  981|   194k|                get_vartx_max_txsize(xd, plane_bsize, plane);
  982|   194k|            const int bh_var_tx = tx_size_high_unit[max_tx_size];
  983|   194k|            const int bw_var_tx = tx_size_wide_unit[max_tx_size];
  984|   194k|            int block = 0;
  985|   194k|            int step =
  986|   194k|                tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
  987|   194k|            int blk_row, blk_col;
  988|   194k|            const int unit_height = ROUND_POWER_OF_TWO(
  ------------------
  |  |   41|   388k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (41:41): [True: 720, False: 193k]
  |  |  ------------------
  ------------------
  989|   194k|                AOMMIN(mu_blocks_high + row, max_blocks_high), ss_y);
  990|   194k|            const int unit_width = ROUND_POWER_OF_TWO(
  ------------------
  |  |   41|   388k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (41:41): [True: 678, False: 193k]
  |  |  ------------------
  ------------------
  991|   194k|                AOMMIN(mu_blocks_wide + col, max_blocks_wide), ss_x);
  992|       |
  993|   404k|            for (blk_row = row >> ss_y; blk_row < unit_height;
  ------------------
  |  Branch (993:41): [True: 210k, False: 194k]
  ------------------
  994|   210k|                 blk_row += bh_var_tx) {
  995|   473k|              for (blk_col = col >> ss_x; blk_col < unit_width;
  ------------------
  |  Branch (995:43): [True: 263k, False: 210k]
  ------------------
  996|   263k|                   blk_col += bw_var_tx) {
  997|   263k|                decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize,
  998|   263k|                                      blk_row, blk_col, block, max_tx_size,
  999|   263k|                                      &eobtotal);
 1000|   263k|                block += step;
 1001|   263k|              }
 1002|   210k|            }
 1003|   194k|          }
 1004|  76.5k|        }
 1005|  76.2k|      }
 1006|  76.0k|    }
 1007|  95.3k|    td->cfl_store_inter_block_visit(cm, xd);
 1008|  95.3k|  }
 1009|       |
 1010|  2.51M|  av1_visit_palette(pbi, xd, r, set_color_index_map_offset);
 1011|  2.51M|}
decodeframe.c:set_cb_buffer_offsets:
  275|  8.85M|                                         TX_SIZE tx_size, int plane) {
  276|  8.85M|  dcb->cb_offset[plane] += tx_size_wide[tx_size] * tx_size_high[tx_size];
  277|  8.85M|  dcb->txb_offset[plane] =
  278|  8.85M|      dcb->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
  ------------------
  |  |  231|  8.85M|#define TX_SIZE_W_MIN 4
  ------------------
                    dcb->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
  ------------------
  |  |  238|  8.85M|#define TX_SIZE_H_MIN 4
  ------------------
  279|  8.85M|}
decodeframe.c:decode_reconstruct_tx:
  286|   304k|                                         TX_SIZE tx_size, int *eob_total) {
  287|   304k|  DecoderCodingBlock *const dcb = &td->dcb;
  288|   304k|  MACROBLOCKD *const xd = &dcb->xd;
  289|   304k|  const struct macroblockd_plane *const pd = &xd->plane[plane];
  290|   304k|  const TX_SIZE plane_tx_size =
  291|   304k|      plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
  ------------------
  |  Branch (291:7): [True: 158k, False: 146k]
  ------------------
  292|   158k|                                    pd->subsampling_y)
  293|   304k|            : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
  294|   146k|                                                         blk_col)];
  295|       |  // Scale to match transform block unit.
  296|   304k|  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
  297|   304k|  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
  298|       |
  299|   304k|  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
  ------------------
  |  Branch (299:7): [True: 18.4E, False: 304k]
  |  Branch (299:37): [True: 18.4E, False: 304k]
  ------------------
  300|       |
  301|   304k|  if (tx_size == plane_tx_size || plane) {
  ------------------
  |  Branch (301:7): [True: 241k, False: 62.7k]
  |  Branch (301:35): [True: 48.9k, False: 13.7k]
  ------------------
  302|   290k|    td->read_coeffs_tx_inter_block_visit(cm, dcb, r, plane, blk_row, blk_col,
  303|   290k|                                         tx_size);
  304|       |
  305|   290k|    td->inverse_tx_inter_block_visit(cm, dcb, r, plane, blk_row, blk_col,
  306|   290k|                                     tx_size);
  307|   290k|    eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
  308|   290k|    *eob_total += eob_data->eob;
  309|   290k|    set_cb_buffer_offsets(dcb, tx_size, plane);
  310|   290k|  } else {
  311|  13.7k|    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
  312|  13.7k|    assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size));
  313|  13.7k|    assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size));
  314|  13.7k|    const int bsw = tx_size_wide_unit[sub_txs];
  315|  13.7k|    const int bsh = tx_size_high_unit[sub_txs];
  316|  13.7k|    const int sub_step = bsw * bsh;
  317|  13.7k|    const int row_end =
  318|  13.7k|        AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
  ------------------
  |  |   34|  13.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.45k, False: 12.3k]
  |  |  ------------------
  ------------------
  319|  13.7k|    const int col_end =
  320|  13.7k|        AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
  ------------------
  |  |   34|  13.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.67k, False: 12.0k]
  |  |  ------------------
  ------------------
  321|       |
  322|  13.7k|    assert(bsw > 0 && bsh > 0);
  323|       |
  324|  36.9k|    for (int row = 0; row < row_end; row += bsh) {
  ------------------
  |  Branch (324:23): [True: 23.2k, False: 13.7k]
  ------------------
  325|  23.2k|      const int offsetr = blk_row + row;
  326|  64.6k|      for (int col = 0; col < col_end; col += bsw) {
  ------------------
  |  Branch (326:25): [True: 41.4k, False: 23.2k]
  ------------------
  327|  41.4k|        const int offsetc = blk_col + col;
  328|       |
  329|  41.4k|        decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, offsetr,
  330|  41.4k|                              offsetc, block, sub_txs, eob_total);
  331|  41.4k|        block += sub_step;
  332|  41.4k|      }
  333|  23.2k|    }
  334|  13.7k|  }
  335|   304k|}
decodeframe.c:set_color_index_map_offset:
  893|  93.0k|                                              aom_reader *r) {
  894|  93.0k|  (void)r;
  895|  93.0k|  Av1ColorMapParam params;
  896|  93.0k|  const MB_MODE_INFO *const mbmi = xd->mi[0];
  897|  93.0k|  av1_get_block_dimensions(mbmi->bsize, plane, xd, &params.plane_width,
  898|  93.0k|                           &params.plane_height, NULL, NULL);
  899|  93.0k|  xd->color_index_map_offset[plane] += params.plane_width * params.plane_height;
  900|  93.0k|}
decodeframe.c:decode_block:
 1218|   764k|                                PARTITION_TYPE partition, BLOCK_SIZE bsize) {
 1219|   764k|  (void)partition;
 1220|   764k|  set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize);
 1221|   764k|  decode_token_recon_block(pbi, td, r, bsize);
 1222|   764k|}
decodeframe.c:set_offsets_for_pred_and_recon:
 1188|   764k|                                                  BLOCK_SIZE bsize) {
 1189|   764k|  AV1_COMMON *const cm = &pbi->common;
 1190|   764k|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
 1191|   764k|  DecoderCodingBlock *const dcb = &td->dcb;
 1192|   764k|  MACROBLOCKD *const xd = &dcb->xd;
 1193|   764k|  const int bw = mi_size_wide[bsize];
 1194|   764k|  const int bh = mi_size_high[bsize];
 1195|   764k|  const int num_planes = av1_num_planes(cm);
 1196|       |
 1197|   764k|  const int offset = mi_row * mi_params->mi_stride + mi_col;
 1198|   764k|  const TileInfo *const tile = &xd->tile;
 1199|       |
 1200|   764k|  xd->mi = mi_params->mi_grid_base + offset;
 1201|   764k|  xd->tx_type_map =
 1202|   764k|      &mi_params->tx_type_map[mi_row * mi_params->mi_stride + mi_col];
 1203|   764k|  xd->tx_type_map_stride = mi_params->mi_stride;
 1204|       |
 1205|   764k|  set_plane_n4(xd, bw, bh, num_planes);
 1206|       |
 1207|       |  // Distance of Mb to the various image edges. These are specified to 8th pel
 1208|       |  // as they are always compared to values that are in 1/8th pel units
 1209|   764k|  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
 1210|   764k|                 mi_params->mi_cols);
 1211|       |
 1212|   764k|  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
 1213|   764k|                       num_planes);
 1214|   764k|}
decodeframe.c:loop_restoration_read_sb_coeffs:
 1663|  18.5k|                                                   int plane, int runit_idx) {
 1664|  18.5k|  const RestorationInfo *rsi = &cm->rst_info[plane];
 1665|  18.5k|  RestorationUnitInfo *rui = &rsi->unit_info[runit_idx];
 1666|  18.5k|  assert(rsi->frame_restoration_type != RESTORE_NONE);
 1667|       |
 1668|  18.5k|  assert(!cm->features.all_lossless);
 1669|       |
 1670|  18.5k|  const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
  ------------------
  |  |  128|  11.8k|#define WIENER_WIN_CHROMA (WIENER_WIN - 2)
  |  |  ------------------
  |  |  |  |  121|  11.8k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  |  |  ------------------
  |  |  |  |  |  |   43|  11.8k|#define WIENER_HALFWIN 3
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
                const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
  ------------------
  |  |  121|  6.70k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  6.70k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
  |  Branch (1670:26): [True: 11.8k, False: 6.70k]
  ------------------
 1671|  18.5k|  WienerInfo *wiener_info = xd->wiener_info + plane;
 1672|  18.5k|  SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane;
 1673|       |
 1674|  18.5k|  if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) {
  ------------------
  |  Branch (1674:7): [True: 7.09k, False: 11.4k]
  ------------------
 1675|  7.09k|    rui->restoration_type =
 1676|  7.09k|        aom_read_symbol(r, xd->tile_ctx->switchable_restore_cdf,
  ------------------
  |  |   51|  7.09k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1677|  7.09k|                        RESTORE_SWITCHABLE_TYPES, ACCT_STR);
 1678|  7.09k|    switch (rui->restoration_type) {
 1679|  2.89k|      case RESTORE_WIENER:
  ------------------
  |  Branch (1679:7): [True: 2.89k, False: 4.20k]
  ------------------
 1680|  2.89k|        read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r);
 1681|  2.89k|        break;
 1682|  2.38k|      case RESTORE_SGRPROJ:
  ------------------
  |  Branch (1682:7): [True: 2.38k, False: 4.71k]
  ------------------
 1683|  2.38k|        read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r);
 1684|  2.38k|        break;
 1685|  1.82k|      default: assert(rui->restoration_type == RESTORE_NONE); break;
  ------------------
  |  Branch (1685:7): [True: 1.82k, False: 5.27k]
  ------------------
 1686|  7.09k|    }
 1687|  11.4k|  } else if (rsi->frame_restoration_type == RESTORE_WIENER) {
  ------------------
  |  Branch (1687:14): [True: 6.47k, False: 4.94k]
  ------------------
 1688|  6.47k|    if (aom_read_symbol(r, xd->tile_ctx->wiener_restore_cdf, 2, ACCT_STR)) {
  ------------------
  |  |   51|  6.47k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (51:3): [True: 4.17k, False: 2.29k]
  |  |  ------------------
  ------------------
 1689|  4.17k|      rui->restoration_type = RESTORE_WIENER;
 1690|  4.17k|      read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r);
 1691|  4.17k|    } else {
 1692|  2.29k|      rui->restoration_type = RESTORE_NONE;
 1693|  2.29k|    }
 1694|  6.47k|  } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) {
  ------------------
  |  Branch (1694:14): [True: 4.94k, False: 0]
  ------------------
 1695|  4.94k|    if (aom_read_symbol(r, xd->tile_ctx->sgrproj_restore_cdf, 2, ACCT_STR)) {
  ------------------
  |  |   51|  4.94k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (51:3): [True: 2.98k, False: 1.96k]
  |  |  ------------------
  ------------------
 1696|  2.98k|      rui->restoration_type = RESTORE_SGRPROJ;
 1697|  2.98k|      read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r);
 1698|  2.98k|    } else {
 1699|  1.96k|      rui->restoration_type = RESTORE_NONE;
 1700|  1.96k|    }
 1701|  4.94k|  }
 1702|  18.5k|}
decodeframe.c:read_wiener_filter:
 1565|  7.06k|                                      aom_reader *rb) {
 1566|  7.06k|  memset(wiener_info->vfilter, 0, sizeof(wiener_info->vfilter));
 1567|  7.06k|  memset(wiener_info->hfilter, 0, sizeof(wiener_info->hfilter));
 1568|       |
 1569|  7.06k|  if (wiener_win == WIENER_WIN)
  ------------------
  |  |  121|  7.06k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  7.06k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
  |  Branch (1569:7): [True: 2.55k, False: 4.50k]
  ------------------
 1570|  2.55k|    wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] =
  ------------------
  |  |  121|  2.55k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  2.55k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
 1571|  2.55k|        aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  2.55k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1572|  2.55k|            rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
 1573|  2.55k|            WIENER_FILT_TAP0_SUBEXP_K,
 1574|  2.55k|            ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, ACCT_STR) +
 1575|  2.55k|        WIENER_FILT_TAP0_MINV;
  ------------------
  |  |  152|  2.55k|  (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2)
  |  |  ------------------
  |  |  |  |  137|  2.55k|#define WIENER_FILT_TAP0_MIDV (3)
  |  |  ------------------
  |  |                 (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2)
  |  |  ------------------
  |  |  |  |  144|  2.55k|#define WIENER_FILT_TAP0_BITS 4
  |  |  ------------------
  ------------------
 1576|  4.50k|  else
 1577|  4.50k|    wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] = 0;
  ------------------
  |  |  121|  4.50k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  4.50k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
 1578|  7.06k|  wiener_info->vfilter[1] = wiener_info->vfilter[WIENER_WIN - 2] =
  ------------------
  |  |  121|  7.06k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  7.06k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
 1579|  7.06k|      aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  7.06k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1580|  7.06k|          rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
 1581|  7.06k|          WIENER_FILT_TAP1_SUBEXP_K,
 1582|  7.06k|          ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) +
 1583|  7.06k|      WIENER_FILT_TAP1_MINV;
  ------------------
  |  |  154|  7.06k|  (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2)
  |  |  ------------------
  |  |  |  |  138|  7.06k|#define WIENER_FILT_TAP1_MIDV (-7)
  |  |  ------------------
  |  |                 (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2)
  |  |  ------------------
  |  |  |  |  145|  7.06k|#define WIENER_FILT_TAP1_BITS 5
  |  |  ------------------
  ------------------
 1584|  7.06k|  wiener_info->vfilter[2] = wiener_info->vfilter[WIENER_WIN - 3] =
  ------------------
  |  |  121|  7.06k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  7.06k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
 1585|  7.06k|      aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  7.06k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1586|  7.06k|          rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
 1587|  7.06k|          WIENER_FILT_TAP2_SUBEXP_K,
 1588|  7.06k|          ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) +
 1589|  7.06k|      WIENER_FILT_TAP2_MINV;
  ------------------
  |  |  156|  7.06k|  (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2)
  |  |  ------------------
  |  |  |  |  139|  7.06k|#define WIENER_FILT_TAP2_MIDV (15)
  |  |  ------------------
  |  |                 (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2)
  |  |  ------------------
  |  |  |  |  146|  7.06k|#define WIENER_FILT_TAP2_BITS 6
  |  |  ------------------
  ------------------
 1590|       |  // The central element has an implicit +WIENER_FILT_STEP
 1591|  7.06k|  wiener_info->vfilter[WIENER_HALFWIN] =
  ------------------
  |  |   43|  7.06k|#define WIENER_HALFWIN 3
  ------------------
 1592|  7.06k|      -2 * (wiener_info->vfilter[0] + wiener_info->vfilter[1] +
 1593|  7.06k|            wiener_info->vfilter[2]);
 1594|       |
 1595|  7.06k|  if (wiener_win == WIENER_WIN)
  ------------------
  |  |  121|  7.06k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  7.06k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
  |  Branch (1595:7): [True: 2.55k, False: 4.50k]
  ------------------
 1596|  2.55k|    wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] =
  ------------------
  |  |  121|  2.55k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  2.55k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
 1597|  2.55k|        aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  2.55k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1598|  2.55k|            rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
 1599|  2.55k|            WIENER_FILT_TAP0_SUBEXP_K,
 1600|  2.55k|            ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, ACCT_STR) +
 1601|  2.55k|        WIENER_FILT_TAP0_MINV;
  ------------------
  |  |  152|  2.55k|  (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2)
  |  |  ------------------
  |  |  |  |  137|  2.55k|#define WIENER_FILT_TAP0_MIDV (3)
  |  |  ------------------
  |  |                 (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2)
  |  |  ------------------
  |  |  |  |  144|  2.55k|#define WIENER_FILT_TAP0_BITS 4
  |  |  ------------------
  ------------------
 1602|  4.50k|  else
 1603|  4.50k|    wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] = 0;
  ------------------
  |  |  121|  4.50k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  4.50k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
 1604|  7.06k|  wiener_info->hfilter[1] = wiener_info->hfilter[WIENER_WIN - 2] =
  ------------------
  |  |  121|  7.06k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  7.06k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
 1605|  7.06k|      aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  7.06k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1606|  7.06k|          rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
 1607|  7.06k|          WIENER_FILT_TAP1_SUBEXP_K,
 1608|  7.06k|          ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) +
 1609|  7.06k|      WIENER_FILT_TAP1_MINV;
  ------------------
  |  |  154|  7.06k|  (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2)
  |  |  ------------------
  |  |  |  |  138|  7.06k|#define WIENER_FILT_TAP1_MIDV (-7)
  |  |  ------------------
  |  |                 (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2)
  |  |  ------------------
  |  |  |  |  145|  7.06k|#define WIENER_FILT_TAP1_BITS 5
  |  |  ------------------
  ------------------
 1610|  7.06k|  wiener_info->hfilter[2] = wiener_info->hfilter[WIENER_WIN - 3] =
  ------------------
  |  |  121|  7.06k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  7.06k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
 1611|  7.06k|      aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  7.06k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1612|  7.06k|          rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
 1613|  7.06k|          WIENER_FILT_TAP2_SUBEXP_K,
 1614|  7.06k|          ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) +
 1615|  7.06k|      WIENER_FILT_TAP2_MINV;
  ------------------
  |  |  156|  7.06k|  (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2)
  |  |  ------------------
  |  |  |  |  139|  7.06k|#define WIENER_FILT_TAP2_MIDV (15)
  |  |  ------------------
  |  |                 (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2)
  |  |  ------------------
  |  |  |  |  146|  7.06k|#define WIENER_FILT_TAP2_BITS 6
  |  |  ------------------
  ------------------
 1616|       |  // The central element has an implicit +WIENER_FILT_STEP
 1617|  7.06k|  wiener_info->hfilter[WIENER_HALFWIN] =
  ------------------
  |  |   43|  7.06k|#define WIENER_HALFWIN 3
  ------------------
 1618|  7.06k|      -2 * (wiener_info->hfilter[0] + wiener_info->hfilter[1] +
 1619|  7.06k|            wiener_info->hfilter[2]);
 1620|  7.06k|  *ref_wiener_info = *wiener_info;
 1621|  7.06k|}
decodeframe.c:read_sgrproj_filter:
 1625|  5.36k|                                       aom_reader *rb) {
 1626|  5.36k|  sgrproj_info->ep = aom_read_literal(rb, SGRPROJ_PARAMS_BITS, ACCT_STR);
  ------------------
  |  |   47|  5.36k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1627|  5.36k|  const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep];
 1628|       |
 1629|  5.36k|  if (params->r[0] == 0) {
  ------------------
  |  Branch (1629:7): [True: 1.14k, False: 4.22k]
  ------------------
 1630|  1.14k|    sgrproj_info->xqd[0] = 0;
 1631|  1.14k|    sgrproj_info->xqd[1] =
 1632|  1.14k|        aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  1.14k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1633|  1.14k|            rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
 1634|  1.14k|            ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) +
 1635|  1.14k|        SGRPROJ_PRJ_MIN1;
  ------------------
  |  |  108|  1.14k|#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
  |  |  ------------------
  |  |  |  |   99|  1.14k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
 1636|  4.22k|  } else if (params->r[1] == 0) {
  ------------------
  |  Branch (1636:14): [True: 1.13k, False: 3.08k]
  ------------------
 1637|  1.13k|    sgrproj_info->xqd[0] =
 1638|  1.13k|        aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  1.13k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1639|  1.13k|            rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
 1640|  1.13k|            ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) +
 1641|  1.13k|        SGRPROJ_PRJ_MIN0;
  ------------------
  |  |  106|  1.13k|#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4)
  |  |  ------------------
  |  |  |  |   99|  1.13k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
 1642|  1.13k|    sgrproj_info->xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - sgrproj_info->xqd[0],
  ------------------
  |  |   99|  1.13k|#define SGRPROJ_PRJ_BITS 7
  ------------------
 1643|  1.13k|                                 SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1);
  ------------------
  |  |  108|  1.13k|#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
  |  |  ------------------
  |  |  |  |   99|  1.13k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
                                               SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1);
  ------------------
  |  |  109|  1.13k|#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
  |  |  ------------------
  |  |  |  |  108|  1.13k|#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
  |  |  |  |  ------------------
  |  |  |  |  |  |   99|  1.13k|#define SGRPROJ_PRJ_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |               #define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
  |  |  ------------------
  |  |  |  |   99|  1.13k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
 1644|  3.08k|  } else {
 1645|  3.08k|    sgrproj_info->xqd[0] =
 1646|  3.08k|        aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  3.08k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1647|  3.08k|            rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
 1648|  3.08k|            ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) +
 1649|  3.08k|        SGRPROJ_PRJ_MIN0;
  ------------------
  |  |  106|  3.08k|#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4)
  |  |  ------------------
  |  |  |  |   99|  3.08k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
 1650|  3.08k|    sgrproj_info->xqd[1] =
 1651|  3.08k|        aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  3.08k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1652|  3.08k|            rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
 1653|  3.08k|            ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) +
 1654|  3.08k|        SGRPROJ_PRJ_MIN1;
  ------------------
  |  |  108|  3.08k|#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
  |  |  ------------------
  |  |  |  |   99|  3.08k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
 1655|  3.08k|  }
 1656|       |
 1657|  5.36k|  *ref_sgrproj_info = *sgrproj_info;
 1658|  5.36k|}
decodeframe.c:read_partition:
 1226|  1.31M|                                     BLOCK_SIZE bsize) {
 1227|  1.31M|  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
 1228|  1.31M|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 1229|       |
 1230|  1.31M|  if (!has_rows && !has_cols) return PARTITION_SPLIT;
  ------------------
  |  Branch (1230:7): [True: 63.4k, False: 1.25M]
  |  Branch (1230:20): [True: 21.2k, False: 42.1k]
  ------------------
 1231|       |
 1232|  1.31M|  assert(ctx >= 0);
 1233|  1.29M|  aom_cdf_prob *partition_cdf = ec_ctx->partition_cdf[ctx];
 1234|  1.29M|  if (has_rows && has_cols) {
  ------------------
  |  Branch (1234:7): [True: 1.25M, False: 42.1k]
  |  Branch (1234:19): [True: 1.20M, False: 45.5k]
  ------------------
 1235|  1.20M|    return (PARTITION_TYPE)aom_read_symbol(
  ------------------
  |  |   51|  1.20M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1236|  1.20M|        r, partition_cdf, partition_cdf_length(bsize), ACCT_STR);
 1237|  1.20M|  } else if (!has_rows && has_cols) {
  ------------------
  |  Branch (1237:14): [True: 42.1k, False: 45.5k]
  |  Branch (1237:27): [True: 42.1k, False: 0]
  ------------------
 1238|  42.1k|    assert(bsize > BLOCK_8X8);
 1239|  42.1k|    aom_cdf_prob cdf[2];
 1240|  42.1k|    partition_gather_vert_alike(cdf, partition_cdf, bsize);
 1241|  42.1k|    assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP));
 1242|  42.1k|    return aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_HORZ;
  ------------------
  |  |   49|  42.1k|  aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (49:3): [True: 21.2k, False: 20.9k]
  |  |  ------------------
  ------------------
 1243|  45.5k|  } else {
 1244|  45.5k|    assert(has_rows && !has_cols);
 1245|  45.5k|    assert(bsize > BLOCK_8X8);
 1246|  45.5k|    aom_cdf_prob cdf[2];
 1247|  45.5k|    partition_gather_horz_alike(cdf, partition_cdf, bsize);
 1248|  45.5k|    assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP));
 1249|  45.5k|    return aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_VERT;
  ------------------
  |  |   49|  45.5k|  aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (49:3): [True: 26.3k, False: 19.2k]
  |  |  ------------------
  ------------------
 1250|  45.5k|  }
 1251|  1.29M|}
decodeframe.c:signal_parse_sb_row_done:
 3142|  25.4k|                                            const int sb_mi_size) {
 3143|  25.4k|  AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
 3144|  25.4k|#if CONFIG_MULTITHREAD
 3145|  25.4k|  pthread_mutex_lock(pbi->row_mt_mutex_);
 3146|  25.4k|#endif
 3147|  25.4k|  assert(frame_row_mt_info->mi_rows_parse_done >=
 3148|  25.4k|         frame_row_mt_info->mi_rows_decode_started);
 3149|  25.4k|  tile_data->dec_row_mt_sync.mi_rows_parse_done += sb_mi_size;
 3150|  25.4k|  frame_row_mt_info->mi_rows_parse_done += sb_mi_size;
 3151|  25.4k|#if CONFIG_MULTITHREAD
 3152|       |  // A new decode job is available. Wake up one worker thread to handle the
 3153|       |  // new decode job.
 3154|       |  // NOTE: This assumes we bump mi_rows_parse_done and mi_rows_decode_started
 3155|       |  // by the same increment (sb_mi_size).
 3156|  25.4k|  pthread_cond_signal(pbi->row_mt_cond_);
 3157|  25.4k|  pthread_mutex_unlock(pbi->row_mt_mutex_);
 3158|  25.4k|#endif
 3159|  25.4k|}
decodeframe.c:check_trailing_bits_after_symbol_coder:
 2674|  17.6k|static int check_trailing_bits_after_symbol_coder(aom_reader *r) {
 2675|  17.6k|  if (aom_reader_has_overflowed(r)) return -1;
  ------------------
  |  Branch (2675:7): [True: 0, False: 17.6k]
  ------------------
 2676|       |
 2677|  17.6k|  uint32_t nb_bits = aom_reader_tell(r);
 2678|  17.6k|  uint32_t nb_bytes = (nb_bits + 7) >> 3;
 2679|  17.6k|  const uint8_t *p = aom_reader_find_begin(r) + nb_bytes;
 2680|       |
 2681|       |  // aom_reader_tell() returns 1 for a newly initialized decoder, and the
 2682|       |  // return value only increases as values are decoded. So nb_bits > 0, and
 2683|       |  // thus p > p_begin. Therefore accessing p[-1] is safe.
 2684|  17.6k|  uint8_t last_byte = p[-1];
 2685|  17.6k|  uint8_t pattern = 128 >> ((nb_bits - 1) & 7);
 2686|  17.6k|  if ((last_byte & (2 * pattern - 1)) != pattern) return -1;
  ------------------
  |  Branch (2686:7): [True: 4.40k, False: 13.2k]
  ------------------
 2687|       |
 2688|       |  // Make sure that all padding bytes are zero as required by the spec.
 2689|  13.2k|  const uint8_t *p_end = aom_reader_find_end(r);
 2690|  13.8k|  while (p < p_end) {
  ------------------
  |  Branch (2690:10): [True: 1.20k, False: 12.6k]
  ------------------
 2691|  1.20k|    if (*p != 0) return -1;
  ------------------
  |  Branch (2691:9): [True: 647, False: 558]
  ------------------
 2692|    558|    p++;
 2693|    558|  }
 2694|  12.6k|  return 0;
 2695|  13.2k|}
decodeframe.c:get_next_job_info:
 3038|  47.2k|                             int *end_of_frame) {
 3039|  47.2k|  AV1_COMMON *cm = &pbi->common;
 3040|  47.2k|  TileDataDec *tile_data;
 3041|  47.2k|  AV1DecRowMTSync *dec_row_mt_sync;
 3042|  47.2k|  AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
 3043|  47.2k|  const int tile_rows_start = frame_row_mt_info->tile_rows_start;
 3044|  47.2k|  const int tile_rows_end = frame_row_mt_info->tile_rows_end;
 3045|  47.2k|  const int tile_cols_start = frame_row_mt_info->tile_cols_start;
 3046|  47.2k|  const int tile_cols_end = frame_row_mt_info->tile_cols_end;
 3047|  47.2k|  const int start_tile = frame_row_mt_info->start_tile;
 3048|  47.2k|  const int end_tile = frame_row_mt_info->end_tile;
 3049|  47.2k|  const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
 3050|  47.2k|  int num_mis_to_decode, num_threads_working;
 3051|  47.2k|  int num_mis_waiting_for_decode;
 3052|  47.2k|  int min_threads_working = INT_MAX;
 3053|  47.2k|  int max_mis_to_decode = 0;
 3054|  47.2k|  int tile_row_idx, tile_col_idx;
 3055|  47.2k|  int tile_row = -1;
 3056|  47.2k|  int tile_col = -1;
 3057|       |
 3058|  47.2k|  memset(next_job_info, 0, sizeof(*next_job_info));
 3059|       |
 3060|       |  // Frame decode is completed or error is encountered.
 3061|  47.2k|  *end_of_frame = (frame_row_mt_info->mi_rows_decode_started ==
  ------------------
  |  Branch (3061:19): [True: 10.9k, False: 36.3k]
  ------------------
 3062|  47.2k|                   frame_row_mt_info->mi_rows_to_decode) ||
 3063|  36.3k|                  (frame_row_mt_info->row_mt_exit == 1);
  ------------------
  |  Branch (3063:19): [True: 4.52k, False: 31.7k]
  ------------------
 3064|  47.2k|  if (*end_of_frame) {
  ------------------
  |  Branch (3064:7): [True: 15.4k, False: 31.7k]
  ------------------
 3065|  15.4k|    return 1;
 3066|  15.4k|  }
 3067|       |
 3068|       |  // Decoding cannot start as bit-stream parsing is not complete.
 3069|  47.2k|  assert(frame_row_mt_info->mi_rows_parse_done >=
 3070|  31.7k|         frame_row_mt_info->mi_rows_decode_started);
 3071|  31.7k|  if (frame_row_mt_info->mi_rows_parse_done ==
  ------------------
  |  Branch (3071:7): [True: 10.5k, False: 21.2k]
  ------------------
 3072|  31.7k|      frame_row_mt_info->mi_rows_decode_started)
 3073|  10.5k|    return 0;
 3074|       |
 3075|       |  // Choose the tile to decode.
 3076|  43.4k|  for (tile_row_idx = tile_rows_start; tile_row_idx < tile_rows_end;
  ------------------
  |  Branch (3076:40): [True: 22.1k, False: 21.2k]
  ------------------
 3077|  22.1k|       ++tile_row_idx) {
 3078|  45.9k|    for (tile_col_idx = tile_cols_start; tile_col_idx < tile_cols_end;
  ------------------
  |  Branch (3078:42): [True: 23.7k, False: 22.1k]
  ------------------
 3079|  23.7k|         ++tile_col_idx) {
 3080|  23.7k|      if (tile_row_idx * cm->tiles.cols + tile_col_idx < start_tile ||
  ------------------
  |  Branch (3080:11): [True: 13, False: 23.7k]
  ------------------
 3081|  23.7k|          tile_row_idx * cm->tiles.cols + tile_col_idx > end_tile)
  ------------------
  |  Branch (3081:11): [True: 17, False: 23.7k]
  ------------------
 3082|     30|        continue;
 3083|       |
 3084|  23.7k|      tile_data = pbi->tile_data + tile_row_idx * cm->tiles.cols + tile_col_idx;
 3085|  23.7k|      dec_row_mt_sync = &tile_data->dec_row_mt_sync;
 3086|       |
 3087|  23.7k|      num_threads_working = dec_row_mt_sync->num_threads_working;
 3088|  23.7k|      num_mis_waiting_for_decode = (dec_row_mt_sync->mi_rows_parse_done -
 3089|  23.7k|                                    dec_row_mt_sync->mi_rows_decode_started) *
 3090|  23.7k|                                   dec_row_mt_sync->mi_cols;
 3091|  23.7k|      num_mis_to_decode =
 3092|  23.7k|          (dec_row_mt_sync->mi_rows - dec_row_mt_sync->mi_rows_decode_started) *
 3093|  23.7k|          dec_row_mt_sync->mi_cols;
 3094|       |
 3095|  23.7k|      assert(num_mis_to_decode >= num_mis_waiting_for_decode);
 3096|       |
 3097|       |      // Pick the tile which has minimum number of threads working on it.
 3098|  23.7k|      if (num_mis_waiting_for_decode > 0) {
  ------------------
  |  Branch (3098:11): [True: 21.4k, False: 2.31k]
  ------------------
 3099|  21.4k|        if (num_threads_working < min_threads_working) {
  ------------------
  |  Branch (3099:13): [True: 21.3k, False: 140]
  ------------------
 3100|  21.3k|          min_threads_working = num_threads_working;
 3101|  21.3k|          max_mis_to_decode = 0;
 3102|  21.3k|        }
 3103|  21.4k|        if (num_threads_working == min_threads_working &&
  ------------------
  |  Branch (3103:13): [True: 21.4k, False: 47]
  ------------------
 3104|  21.4k|            num_mis_to_decode > max_mis_to_decode &&
  ------------------
  |  Branch (3104:13): [True: 21.3k, False: 84]
  ------------------
 3105|  21.3k|            num_threads_working <
  ------------------
  |  Branch (3105:13): [True: 21.3k, False: 5]
  ------------------
 3106|  21.3k|                get_max_row_mt_workers_per_tile(cm, &tile_data->tile_info)) {
 3107|  21.3k|          max_mis_to_decode = num_mis_to_decode;
 3108|  21.3k|          tile_row = tile_row_idx;
 3109|  21.3k|          tile_col = tile_col_idx;
 3110|  21.3k|        }
 3111|  21.4k|      }
 3112|  23.7k|    }
 3113|  22.1k|  }
 3114|       |  // No job found to process
 3115|  21.2k|  if (tile_row == -1 || tile_col == -1) return 0;
  ------------------
  |  Branch (3115:7): [True: 5, False: 21.2k]
  |  Branch (3115:25): [True: 0, False: 21.2k]
  ------------------
 3116|       |
 3117|  21.2k|  tile_data = pbi->tile_data + tile_row * cm->tiles.cols + tile_col;
 3118|  21.2k|  dec_row_mt_sync = &tile_data->dec_row_mt_sync;
 3119|       |
 3120|  21.2k|  next_job_info->tile_row = tile_row;
 3121|  21.2k|  next_job_info->tile_col = tile_col;
 3122|  21.2k|  next_job_info->mi_row = dec_row_mt_sync->mi_rows_decode_started +
 3123|  21.2k|                          tile_data->tile_info.mi_row_start;
 3124|       |
 3125|  21.2k|  dec_row_mt_sync->num_threads_working++;
 3126|  21.2k|  dec_row_mt_sync->mi_rows_decode_started += sb_mi_size;
 3127|  21.2k|  frame_row_mt_info->mi_rows_decode_started += sb_mi_size;
 3128|  21.2k|  assert(frame_row_mt_info->mi_rows_parse_done >=
 3129|  21.2k|         frame_row_mt_info->mi_rows_decode_started);
 3130|  21.2k|#if CONFIG_MULTITHREAD
 3131|  21.2k|  if (frame_row_mt_info->mi_rows_decode_started ==
  ------------------
  |  Branch (3131:7): [True: 7.01k, False: 14.2k]
  ------------------
 3132|  21.2k|      frame_row_mt_info->mi_rows_to_decode) {
 3133|  7.01k|    pthread_cond_broadcast(pbi->row_mt_cond_);
 3134|  7.01k|  }
 3135|  21.2k|#endif
 3136|       |
 3137|  21.2k|  return 1;
 3138|  21.2k|}
decodeframe.c:decode_tile_sb_row:
 2636|  21.2k|                                      const int mi_row) {
 2637|  21.2k|  AV1_COMMON *const cm = &pbi->common;
 2638|  21.2k|  const int num_planes = av1_num_planes(cm);
 2639|  21.2k|  TileDataDec *const tile_data = pbi->tile_data +
 2640|  21.2k|                                 tile_info->tile_row * cm->tiles.cols +
 2641|  21.2k|                                 tile_info->tile_col;
 2642|  21.2k|  const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
 2643|  21.2k|  const int sb_row_in_tile =
 2644|  21.2k|      (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2;
 2645|  21.2k|  int sb_col_in_tile = 0;
 2646|  21.2k|  int row_mt_exit = 0;
 2647|       |
 2648|  92.4k|  for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
  ------------------
  |  Branch (2648:46): [True: 71.1k, False: 21.2k]
  ------------------
 2649|  71.1k|       mi_col += cm->seq_params->mib_size, sb_col_in_tile++) {
 2650|  71.1k|    set_cb_buffer(pbi, &td->dcb, pbi->cb_buffer_base, num_planes, mi_row,
 2651|  71.1k|                  mi_col);
 2652|       |
 2653|  71.1k|    sync_read(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile);
 2654|       |
 2655|  71.1k|#if CONFIG_MULTITHREAD
 2656|  71.1k|    pthread_mutex_lock(pbi->row_mt_mutex_);
 2657|  71.1k|#endif
 2658|  71.1k|    row_mt_exit = pbi->frame_row_mt_info.row_mt_exit;
 2659|  71.1k|#if CONFIG_MULTITHREAD
 2660|  71.1k|    pthread_mutex_unlock(pbi->row_mt_mutex_);
 2661|  71.1k|#endif
 2662|       |
 2663|  71.1k|    if (!row_mt_exit) {
  ------------------
  |  Branch (2663:9): [True: 68.8k, False: 2.30k]
  ------------------
 2664|       |      // Decoding of the super-block
 2665|  68.8k|      decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
 2666|  68.8k|                       cm->seq_params->sb_size, 0x2);
 2667|  68.8k|    }
 2668|       |
 2669|  71.1k|    sync_write(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile,
 2670|  71.1k|               sb_cols_in_tile);
 2671|  71.1k|  }
 2672|  21.2k|}
decodeframe.c:sync_read:
 2568|  71.1k|                             int c) {
 2569|  71.1k|#if CONFIG_MULTITHREAD
 2570|  71.1k|  const int nsync = dec_row_mt_sync->sync_range;
 2571|       |
 2572|  71.1k|  if (r && !(c & (nsync - 1))) {
  ------------------
  |  Branch (2572:7): [True: 42.2k, False: 28.9k]
  |  Branch (2572:12): [True: 42.2k, False: 0]
  ------------------
 2573|  42.2k|    pthread_mutex_t *const mutex = &dec_row_mt_sync->mutex_[r - 1];
 2574|  42.2k|    pthread_mutex_lock(mutex);
 2575|       |
 2576|  48.3k|    while (c > dec_row_mt_sync->cur_sb_col[r - 1] - nsync -
  ------------------
  |  Branch (2576:12): [True: 6.10k, False: 42.2k]
  ------------------
 2577|  48.3k|                   dec_row_mt_sync->intrabc_extra_top_right_sb_delay) {
 2578|  6.10k|      pthread_cond_wait(&dec_row_mt_sync->cond_[r - 1], mutex);
 2579|  6.10k|    }
 2580|  42.2k|    pthread_mutex_unlock(mutex);
 2581|  42.2k|  }
 2582|       |#else
 2583|       |  (void)dec_row_mt_sync;
 2584|       |  (void)r;
 2585|       |  (void)c;
 2586|       |#endif  // CONFIG_MULTITHREAD
 2587|  71.1k|}
decodeframe.c:launch_dec_workers:
 3471|  13.1k|                                      int num_workers) {
 3472|  13.1k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 3473|       |
 3474|  36.1k|  for (int worker_idx = num_workers - 1; worker_idx >= 0; --worker_idx) {
  ------------------
  |  Branch (3474:42): [True: 22.9k, False: 13.1k]
  ------------------
 3475|  22.9k|    AVxWorker *const worker = &pbi->tile_workers[worker_idx];
 3476|  22.9k|    DecWorkerData *const thread_data = (DecWorkerData *)worker->data1;
 3477|       |
 3478|  22.9k|    thread_data->data_end = data_end;
 3479|       |
 3480|  22.9k|    worker->had_error = 0;
 3481|  22.9k|    if (worker_idx == 0) {
  ------------------
  |  Branch (3481:9): [True: 13.1k, False: 9.89k]
  ------------------
 3482|  13.1k|      winterface->execute(worker);
 3483|  13.1k|    } else {
 3484|  9.89k|      winterface->launch(worker);
 3485|  9.89k|    }
 3486|  22.9k|  }
 3487|  13.1k|}
decodeframe.c:sync_dec_workers:
 3489|  13.1k|static inline void sync_dec_workers(AV1Decoder *pbi, int num_workers) {
 3490|  13.1k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 3491|  13.1k|  int corrupted = 0;
 3492|       |
 3493|  36.1k|  for (int worker_idx = num_workers; worker_idx > 0; --worker_idx) {
  ------------------
  |  Branch (3493:38): [True: 22.9k, False: 13.1k]
  ------------------
 3494|  22.9k|    AVxWorker *const worker = &pbi->tile_workers[worker_idx - 1];
 3495|  22.9k|    aom_merge_corrupted_flag(&corrupted, !winterface->sync(worker));
 3496|  22.9k|  }
 3497|       |
 3498|  13.1k|  pbi->dcb.corrupted = corrupted;
 3499|  13.1k|}
decodeframe.c:decode_tile:
 2720|  12.7k|                               int tile_row, int tile_col) {
 2721|  12.7k|  TileInfo tile_info;
 2722|       |
 2723|  12.7k|  AV1_COMMON *const cm = &pbi->common;
 2724|  12.7k|  const int num_planes = av1_num_planes(cm);
 2725|       |
 2726|  12.7k|  av1_tile_set_row(&tile_info, cm, tile_row);
 2727|  12.7k|  av1_tile_set_col(&tile_info, cm, tile_col);
 2728|  12.7k|  DecoderCodingBlock *const dcb = &td->dcb;
 2729|  12.7k|  MACROBLOCKD *const xd = &dcb->xd;
 2730|       |
 2731|  12.7k|  av1_zero_above_context(cm, xd, tile_info.mi_col_start, tile_info.mi_col_end,
 2732|  12.7k|                         tile_row);
 2733|  12.7k|  av1_reset_loop_filter_delta(xd, num_planes);
 2734|  12.7k|  av1_reset_loop_restoration(xd, num_planes);
 2735|       |
 2736|  30.9k|  for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
  ------------------
  |  Branch (2736:45): [True: 22.8k, False: 8.05k]
  ------------------
 2737|  22.8k|       mi_row += cm->seq_params->mib_size) {
 2738|  22.8k|    av1_zero_left_context(xd);
 2739|       |
 2740|  77.7k|    for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
  ------------------
  |  Branch (2740:47): [True: 59.5k, False: 18.1k]
  ------------------
 2741|  59.5k|         mi_col += cm->seq_params->mib_size) {
 2742|  59.5k|      set_cb_buffer(pbi, dcb, &td->cb_buffer_base, num_planes, 0, 0);
 2743|       |
 2744|       |      // Bit-stream parsing and decoding of the superblock
 2745|  59.5k|      decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
 2746|  59.5k|                       cm->seq_params->sb_size, 0x3);
 2747|       |
 2748|  59.5k|      if (aom_reader_has_overflowed(td->bit_reader)) {
  ------------------
  |  Branch (2748:11): [True: 4.66k, False: 54.9k]
  ------------------
 2749|  4.66k|        aom_merge_corrupted_flag(&dcb->corrupted, 1);
 2750|  4.66k|        return;
 2751|  4.66k|      }
 2752|  59.5k|    }
 2753|  22.8k|  }
 2754|       |
 2755|  8.05k|  int corrupted =
 2756|  8.05k|      (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0;
  ------------------
  |  Branch (2756:7): [True: 2.31k, False: 5.73k]
  ------------------
 2757|  8.05k|  aom_merge_corrupted_flag(&dcb->corrupted, corrupted);
 2758|  8.05k|}
decodeframe.c:decode_tiles:
 2762|  12.7k|                                   int end_tile) {
 2763|  12.7k|  AV1_COMMON *const cm = &pbi->common;
 2764|  12.7k|  ThreadData *const td = &pbi->td;
 2765|  12.7k|  CommonTileParams *const tiles = &cm->tiles;
 2766|  12.7k|  const int tile_cols = tiles->cols;
 2767|  12.7k|  const int tile_rows = tiles->rows;
 2768|  12.7k|  const int n_tiles = tile_cols * tile_rows;
 2769|  12.7k|  TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
 2770|  12.7k|  const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
  ------------------
  |  |   34|  12.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 12.7k, False: 0]
  |  |  ------------------
  ------------------
 2771|  12.7k|  const int single_row = pbi->dec_tile_row >= 0;
 2772|  12.7k|  const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
  ------------------
  |  |   34|  12.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 12.7k, False: 0]
  |  |  ------------------
  ------------------
 2773|  12.7k|  const int single_col = pbi->dec_tile_col >= 0;
 2774|  12.7k|  int tile_rows_start;
 2775|  12.7k|  int tile_rows_end;
 2776|  12.7k|  int tile_cols_start;
 2777|  12.7k|  int tile_cols_end;
 2778|  12.7k|  int inv_col_order;
 2779|  12.7k|  int inv_row_order;
 2780|  12.7k|  int tile_row, tile_col;
 2781|  12.7k|  uint8_t allow_update_cdf;
 2782|  12.7k|  const uint8_t *raw_data_end = NULL;
 2783|       |
 2784|  12.7k|  if (tiles->large_scale) {
  ------------------
  |  Branch (2784:7): [True: 0, False: 12.7k]
  ------------------
 2785|      0|    tile_rows_start = single_row ? dec_tile_row : 0;
  ------------------
  |  Branch (2785:23): [True: 0, False: 0]
  ------------------
 2786|      0|    tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
  ------------------
  |  Branch (2786:21): [True: 0, False: 0]
  ------------------
 2787|      0|    tile_cols_start = single_col ? dec_tile_col : 0;
  ------------------
  |  Branch (2787:23): [True: 0, False: 0]
  ------------------
 2788|      0|    tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
  ------------------
  |  Branch (2788:21): [True: 0, False: 0]
  ------------------
 2789|      0|    inv_col_order = pbi->inv_tile_order && !single_col;
  ------------------
  |  Branch (2789:21): [True: 0, False: 0]
  |  Branch (2789:44): [True: 0, False: 0]
  ------------------
 2790|      0|    inv_row_order = pbi->inv_tile_order && !single_row;
  ------------------
  |  Branch (2790:21): [True: 0, False: 0]
  |  Branch (2790:44): [True: 0, False: 0]
  ------------------
 2791|      0|    allow_update_cdf = 0;
 2792|  12.7k|  } else {
 2793|  12.7k|    tile_rows_start = 0;
 2794|  12.7k|    tile_rows_end = tile_rows;
 2795|  12.7k|    tile_cols_start = 0;
 2796|  12.7k|    tile_cols_end = tile_cols;
 2797|  12.7k|    inv_col_order = pbi->inv_tile_order;
 2798|  12.7k|    inv_row_order = pbi->inv_tile_order;
 2799|  12.7k|    allow_update_cdf = 1;
 2800|  12.7k|  }
 2801|       |
 2802|       |  // No tiles to decode.
 2803|  12.7k|  if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
  ------------------
  |  Branch (2803:7): [True: 0, False: 12.7k]
  |  Branch (2803:43): [True: 0, False: 12.7k]
  ------------------
 2804|       |      // First tile is larger than end_tile.
 2805|  12.7k|      tile_rows_start * tiles->cols + tile_cols_start > end_tile ||
  ------------------
  |  Branch (2805:7): [True: 0, False: 12.7k]
  ------------------
 2806|       |      // Last tile is smaller than start_tile.
 2807|  12.7k|      (tile_rows_end - 1) * tiles->cols + tile_cols_end - 1 < start_tile)
  ------------------
  |  Branch (2807:7): [True: 0, False: 12.7k]
  ------------------
 2808|      0|    return data;
 2809|       |
 2810|  12.7k|  allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update;
  ------------------
  |  Branch (2810:22): [True: 12.7k, False: 0]
  |  Branch (2810:42): [True: 12.1k, False: 566]
  ------------------
 2811|       |
 2812|  12.7k|  assert(tile_rows <= MAX_TILE_ROWS);
 2813|  12.7k|  assert(tile_cols <= MAX_TILE_COLS);
 2814|       |
 2815|  12.7k|#if EXT_TILE_DEBUG
 2816|  12.7k|  if (tiles->large_scale && !pbi->ext_tile_debug)
  ------------------
  |  Branch (2816:7): [True: 0, False: 12.7k]
  |  Branch (2816:29): [True: 0, False: 0]
  ------------------
 2817|      0|    raw_data_end = get_ls_single_tile_buffer(pbi, data, tile_buffers);
 2818|  12.7k|  else if (tiles->large_scale && pbi->ext_tile_debug)
  ------------------
  |  Branch (2818:12): [True: 0, False: 12.7k]
  |  Branch (2818:34): [True: 0, False: 0]
  ------------------
 2819|      0|    raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
 2820|  12.7k|  else
 2821|  12.7k|#endif  // EXT_TILE_DEBUG
 2822|  12.7k|    get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
 2823|       |
 2824|  12.7k|  if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
  ------------------
  |  Branch (2824:7): [True: 8.11k, False: 4.59k]
  |  Branch (2824:33): [True: 260, False: 4.33k]
  ------------------
 2825|  8.36k|    decoder_alloc_tile_data(pbi, n_tiles);
 2826|  8.36k|  }
 2827|  12.7k|  if (pbi->dcb.xd.seg_mask == NULL)
  ------------------
  |  Branch (2827:7): [True: 8.10k, False: 4.60k]
  ------------------
 2828|  12.7k|    CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask,
  ------------------
  |  |   51|  8.10k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  8.10k|  do {                                                    \
  |  |  |  |   69|  8.10k|    lval = (expr);                                        \
  |  |  |  |   70|  8.10k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 8.10k]
  |  |  |  |  ------------------
  |  |  |  |   71|  8.10k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  8.10k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 8.10k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 2829|  12.7k|                    (uint8_t *)aom_memalign(
 2830|  12.7k|                        16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask)));
 2831|       |#if CONFIG_ACCOUNTING
 2832|       |  if (pbi->acct_enabled) {
 2833|       |    aom_accounting_reset(&pbi->accounting);
 2834|       |  }
 2835|       |#endif
 2836|       |
 2837|  12.7k|  set_decode_func_pointers(&pbi->td, 0x3);
 2838|       |
 2839|       |  // Load all tile information into thread_data.
 2840|  12.7k|  td->dcb = pbi->dcb;
 2841|       |
 2842|  12.7k|  td->dcb.corrupted = 0;
 2843|  12.7k|  td->dcb.mc_buf[0] = td->mc_buf[0];
 2844|  12.7k|  td->dcb.mc_buf[1] = td->mc_buf[1];
 2845|  12.7k|  td->dcb.xd.tmp_conv_dst = td->tmp_conv_dst;
 2846|  38.1k|  for (int j = 0; j < 2; ++j) {
  ------------------
  |  Branch (2846:19): [True: 25.4k, False: 12.7k]
  ------------------
 2847|  25.4k|    td->dcb.xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j];
 2848|  25.4k|  }
 2849|       |
 2850|  25.4k|  for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
  ------------------
  |  Branch (2850:36): [True: 12.7k, False: 12.7k]
  ------------------
 2851|  12.7k|    const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row;
  ------------------
  |  Branch (2851:21): [True: 0, False: 12.7k]
  ------------------
 2852|       |
 2853|  25.4k|    for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
  ------------------
  |  Branch (2853:38): [True: 12.7k, False: 12.7k]
  ------------------
 2854|  12.7k|      const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col;
  ------------------
  |  Branch (2854:23): [True: 0, False: 12.7k]
  ------------------
 2855|  12.7k|      TileDataDec *const tile_data = pbi->tile_data + row * tiles->cols + col;
 2856|  12.7k|      const TileBufferDec *const tile_bs_buf = &tile_buffers[row][col];
 2857|       |
 2858|  12.7k|      if (row * tiles->cols + col < start_tile ||
  ------------------
  |  Branch (2858:11): [True: 4, False: 12.7k]
  ------------------
 2859|  12.7k|          row * tiles->cols + col > end_tile)
  ------------------
  |  Branch (2859:11): [True: 6, False: 12.7k]
  ------------------
 2860|     10|        continue;
 2861|       |
 2862|  12.7k|      td->bit_reader = &tile_data->bit_reader;
 2863|  12.7k|      av1_zero(td->cb_buffer_base.dqcoeff);
  ------------------
  |  |   43|  12.7k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 2864|  12.7k|      av1_tile_init(&td->dcb.xd.tile, cm, row, col);
 2865|  12.7k|      td->dcb.xd.current_base_qindex = cm->quant_params.base_qindex;
 2866|  12.7k|      setup_bool_decoder(&td->dcb.xd, tile_bs_buf->data, data_end,
 2867|  12.7k|                         tile_bs_buf->size, &pbi->error, td->bit_reader,
 2868|  12.7k|                         allow_update_cdf);
 2869|       |#if CONFIG_ACCOUNTING
 2870|       |      if (pbi->acct_enabled) {
 2871|       |        td->bit_reader->accounting = &pbi->accounting;
 2872|       |        td->bit_reader->accounting->last_tell_frac =
 2873|       |            aom_reader_tell_frac(td->bit_reader);
 2874|       |      } else {
 2875|       |        td->bit_reader->accounting = NULL;
 2876|       |      }
 2877|       |#endif
 2878|  12.7k|      av1_init_macroblockd(cm, &td->dcb.xd);
 2879|  12.7k|      av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), row,
 2880|  12.7k|                             &td->dcb.xd);
 2881|       |
 2882|       |      // Initialise the tile context from the frame context
 2883|  12.7k|      tile_data->tctx = *cm->fc;
 2884|  12.7k|      td->dcb.xd.tile_ctx = &tile_data->tctx;
 2885|       |
 2886|       |      // decode tile
 2887|  12.7k|      decode_tile(pbi, td, row, col);
 2888|  12.7k|      aom_merge_corrupted_flag(&pbi->dcb.corrupted, td->dcb.corrupted);
 2889|  12.7k|      if (pbi->dcb.corrupted)
  ------------------
  |  Branch (2889:11): [True: 6.97k, False: 5.73k]
  ------------------
 2890|  6.97k|        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 2891|  6.97k|                           "Failed to decode tile data");
 2892|  12.7k|    }
 2893|  12.7k|  }
 2894|       |
 2895|  12.7k|  if (tiles->large_scale) {
  ------------------
  |  Branch (2895:7): [True: 0, False: 12.7k]
  ------------------
 2896|      0|    if (n_tiles == 1) {
  ------------------
  |  Branch (2896:9): [True: 0, False: 0]
  ------------------
 2897|       |      // Find the end of the single tile buffer
 2898|      0|      return aom_reader_find_end(&pbi->tile_data->bit_reader);
 2899|      0|    }
 2900|       |    // Return the end of the last tile buffer
 2901|      0|    return raw_data_end;
 2902|      0|  }
 2903|  12.7k|  TileDataDec *const tile_data = pbi->tile_data + end_tile;
 2904|       |
 2905|  12.7k|  return aom_reader_find_end(&tile_data->bit_reader);
 2906|  12.7k|}
decodeframe.c:set_planes_to_neutral_grey:
  103|  3.51k|    int only_chroma) {
  104|  3.51k|  if (seq_params->use_highbitdepth) {
  ------------------
  |  Branch (104:7): [True: 910, False: 2.60k]
  ------------------
  105|    910|    const int val = 1 << (seq_params->bit_depth - 1);
  106|  3.08k|    for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
  ------------------
  |  |   36|  3.08k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (106:35): [True: 2.17k, False: 910]
  ------------------
  107|  2.17k|      const int is_uv = plane > 0;
  108|  2.17k|      uint16_t *const base = CONVERT_TO_SHORTPTR(buf->buffers[plane]);
  ------------------
  |  |   75|  2.17k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  109|       |      // Set the first row to neutral grey. Then copy the first row to all
  110|       |      // subsequent rows.
  111|  2.17k|      if (buf->crop_heights[is_uv] > 0) {
  ------------------
  |  Branch (111:11): [True: 2.17k, False: 0]
  ------------------
  112|  2.17k|        aom_memset16(base, val, buf->crop_widths[is_uv]);
  113|  76.9k|        for (int row_idx = 1; row_idx < buf->crop_heights[is_uv]; row_idx++) {
  ------------------
  |  Branch (113:31): [True: 74.7k, False: 2.17k]
  ------------------
  114|  74.7k|          memcpy(&base[row_idx * buf->strides[is_uv]], base,
  115|  74.7k|                 sizeof(*base) * buf->crop_widths[is_uv]);
  116|  74.7k|        }
  117|  2.17k|      }
  118|  2.17k|    }
  119|  2.60k|  } else {
  120|  8.72k|    for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
  ------------------
  |  |   36|  8.72k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (120:35): [True: 6.11k, False: 2.60k]
  ------------------
  121|  6.11k|      const int is_uv = plane > 0;
  122|   189k|      for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) {
  ------------------
  |  Branch (122:29): [True: 183k, False: 6.11k]
  ------------------
  123|   183k|        memset(&buf->buffers[plane][row_idx * buf->strides[is_uv]], 1 << 7,
  124|   183k|               buf->crop_widths[is_uv]);
  125|   183k|      }
  126|  6.11k|    }
  127|  2.60k|  }
  128|  3.51k|}
decodeframe.c:superres_post_decode:
 5164|  2.85k|static inline void superres_post_decode(AV1Decoder *pbi) {
 5165|  2.85k|  AV1_COMMON *const cm = &pbi->common;
 5166|  2.85k|  BufferPool *const pool = cm->buffer_pool;
 5167|       |
 5168|  2.85k|  if (!av1_superres_scaled(cm)) return;
  ------------------
  |  Branch (5168:7): [True: 2.82k, False: 32]
  ------------------
 5169|  2.85k|  assert(!cm->features.all_lossless);
 5170|       |
 5171|     32|  av1_superres_upscale(cm, pool, 0);
 5172|     32|}

av1_neg_deinterleave:
  258|   443k|int av1_neg_deinterleave(int diff, int ref, int max) {
  259|   443k|  if (!ref) return diff;
  ------------------
  |  Branch (259:7): [True: 55.2k, False: 388k]
  ------------------
  260|   388k|  if (ref >= (max - 1)) return max - diff - 1;
  ------------------
  |  Branch (260:7): [True: 17.8k, False: 370k]
  ------------------
  261|   370k|  if (2 * ref < max) {
  ------------------
  |  Branch (261:7): [True: 213k, False: 156k]
  ------------------
  262|   213k|    if (diff <= 2 * ref) {
  ------------------
  |  Branch (262:9): [True: 185k, False: 28.4k]
  ------------------
  263|   185k|      if (diff & 1)
  ------------------
  |  Branch (263:11): [True: 27.9k, False: 157k]
  ------------------
  264|  27.9k|        return ref + ((diff + 1) >> 1);
  265|   157k|      else
  266|   157k|        return ref - (diff >> 1);
  267|   185k|    }
  268|  28.4k|    return diff;
  269|   213k|  } else {
  270|   156k|    if (diff <= 2 * (max - ref - 1)) {
  ------------------
  |  Branch (270:9): [True: 139k, False: 17.3k]
  ------------------
  271|   139k|      if (diff & 1)
  ------------------
  |  Branch (271:11): [True: 20.9k, False: 118k]
  ------------------
  272|  20.9k|        return ref + ((diff + 1) >> 1);
  273|   118k|      else
  274|   118k|        return ref - (diff >> 1);
  275|   139k|    }
  276|  17.3k|    return max - (diff + 1);
  277|   156k|  }
  278|   370k|}
av1_read_tx_type:
  627|  1.76M|                      int blk_col, TX_SIZE tx_size, aom_reader *r) {
  628|  1.76M|  MB_MODE_INFO *mbmi = xd->mi[0];
  629|  1.76M|  uint8_t *tx_type =
  630|  1.76M|      &xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
  631|  1.76M|  *tx_type = DCT_DCT;
  632|       |
  633|       |  // No need to read transform type if block is skipped.
  634|  1.76M|  if (mbmi->skip_txfm ||
  ------------------
  |  Branch (634:7): [True: 69, False: 1.76M]
  ------------------
  635|  1.76M|      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
  ------------------
  |  Branch (635:7): [True: 18.4E, False: 1.76M]
  ------------------
  636|      0|    return;
  637|       |
  638|       |  // No need to read transform type for lossless mode(qindex==0).
  639|  1.76M|  const int qindex = xd->qindex[mbmi->segment_id];
  640|  1.76M|  if (qindex == 0) return;
  ------------------
  |  Branch (640:7): [True: 293k, False: 1.46M]
  ------------------
  641|       |
  642|  1.46M|  const int inter_block = is_inter_block(mbmi);
  643|  1.46M|  if (get_ext_tx_types(tx_size, inter_block, cm->features.reduced_tx_set_used) >
  ------------------
  |  Branch (643:7): [True: 1.33M, False: 134k]
  ------------------
  644|  1.46M|      1) {
  645|  1.33M|    const TxSetType tx_set_type = av1_get_ext_tx_set_type(
  646|  1.33M|        tx_size, inter_block, cm->features.reduced_tx_set_used);
  647|  1.33M|    const int eset =
  648|  1.33M|        get_ext_tx_set(tx_size, inter_block, cm->features.reduced_tx_set_used);
  649|       |    // eset == 0 should correspond to a set with only DCT_DCT and
  650|       |    // there is no need to read the tx_type
  651|  1.33M|    assert(eset != 0);
  652|       |
  653|  1.33M|    const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
  654|  1.33M|    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
  655|  1.33M|    if (inter_block) {
  ------------------
  |  Branch (655:9): [True: 75.2k, False: 1.25M]
  ------------------
  656|  75.2k|      *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
  ------------------
  |  |   51|  75.2k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  657|  75.2k|          r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
  658|  75.2k|          av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
  659|  1.25M|    } else {
  660|  1.25M|      const PREDICTION_MODE intra_mode =
  661|  1.25M|          mbmi->filter_intra_mode_info.use_filter_intra
  ------------------
  |  Branch (661:11): [True: 247k, False: 1.01M]
  ------------------
  662|  1.25M|              ? fimode_to_intradir[mbmi->filter_intra_mode_info
  663|   247k|                                       .filter_intra_mode]
  664|  1.25M|              : mbmi->mode;
  665|  1.25M|      *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
  ------------------
  |  |   51|  1.25M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  666|  1.25M|          r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_mode],
  667|  1.25M|          av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
  668|  1.25M|    }
  669|  1.33M|  }
  670|  1.46M|}
av1_read_mode_info:
 1572|  1.75M|                        aom_reader *r, int x_mis, int y_mis) {
 1573|  1.75M|  AV1_COMMON *const cm = &pbi->common;
 1574|  1.75M|  MACROBLOCKD *const xd = &dcb->xd;
 1575|  1.75M|  MB_MODE_INFO *const mi = xd->mi[0];
 1576|  1.75M|  mi->use_intrabc = 0;
 1577|       |
 1578|  1.75M|  if (frame_is_intra_only(cm)) {
  ------------------
  |  Branch (1578:7): [True: 1.61M, False: 139k]
  ------------------
 1579|  1.61M|    read_intra_frame_mode_info(cm, dcb, r);
 1580|  1.61M|    if (cm->seq_params->order_hint_info.enable_ref_frame_mvs)
  ------------------
  |  Branch (1580:9): [True: 50.6k, False: 1.56M]
  ------------------
 1581|  50.6k|      intra_copy_frame_mvs(cm, xd->mi_row, xd->mi_col, x_mis, y_mis);
 1582|  1.61M|  } else {
 1583|   139k|    read_inter_frame_mode_info(pbi, dcb, r);
 1584|   139k|    if (cm->seq_params->order_hint_info.enable_ref_frame_mvs)
  ------------------
  |  Branch (1584:9): [True: 138k, False: 874]
  ------------------
 1585|   138k|      av1_copy_frame_mvs(cm, mi, xd->mi_row, xd->mi_col, x_mis, y_mis);
 1586|   139k|  }
 1587|  1.75M|}
decodemv.c:read_intra_frame_mode_info:
  774|  1.61M|                                       DecoderCodingBlock *dcb, aom_reader *r) {
  775|  1.61M|  MACROBLOCKD *const xd = &dcb->xd;
  776|  1.61M|  MB_MODE_INFO *const mbmi = xd->mi[0];
  777|  1.61M|  const MB_MODE_INFO *above_mi = xd->above_mbmi;
  778|  1.61M|  const MB_MODE_INFO *left_mi = xd->left_mbmi;
  779|  1.61M|  const BLOCK_SIZE bsize = mbmi->bsize;
  780|  1.61M|  struct segmentation *const seg = &cm->seg;
  781|       |
  782|  1.61M|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
  783|       |
  784|  1.61M|  if (seg->segid_preskip)
  ------------------
  |  Branch (784:7): [True: 435k, False: 1.18M]
  ------------------
  785|   435k|    mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, 0);
  786|       |
  787|  1.61M|  mbmi->skip_txfm = read_skip_txfm(cm, xd, mbmi->segment_id, r);
  788|       |
  789|  1.61M|  if (!seg->segid_preskip)
  ------------------
  |  Branch (789:7): [True: 1.18M, False: 435k]
  ------------------
  790|  1.18M|    mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, mbmi->skip_txfm);
  791|       |
  792|  1.61M|  read_cdef(cm, r, xd);
  793|       |
  794|  1.61M|  read_delta_q_params(cm, xd, r);
  795|       |
  796|  1.61M|  mbmi->current_qindex = xd->current_base_qindex;
  797|       |
  798|  1.61M|  mbmi->ref_frame[0] = INTRA_FRAME;
  799|  1.61M|  mbmi->ref_frame[1] = NONE_FRAME;
  800|  1.61M|  mbmi->palette_mode_info.palette_size[0] = 0;
  801|  1.61M|  mbmi->palette_mode_info.palette_size[1] = 0;
  802|  1.61M|  mbmi->filter_intra_mode_info.use_filter_intra = 0;
  803|       |
  804|  1.61M|  const int mi_row = xd->mi_row;
  805|  1.61M|  const int mi_col = xd->mi_col;
  806|  1.61M|  xd->above_txfm_context = cm->above_contexts.txfm[xd->tile.tile_row] + mi_col;
  807|  1.61M|  xd->left_txfm_context =
  808|  1.61M|      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
  ------------------
  |  |   50|  1.61M|#define MAX_MIB_MASK (MAX_MIB_SIZE - 1)
  |  |  ------------------
  |  |  |  |   44|  1.61M|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   43|  1.61M|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   31|  1.61M|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   39|  1.61M|#define MI_SIZE_LOG2 2
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  809|       |
  810|  1.61M|  if (av1_allow_intrabc(cm)) {
  ------------------
  |  Branch (810:7): [True: 353k, False: 1.26M]
  ------------------
  811|   353k|    read_intrabc_info(cm, dcb, r);
  812|   353k|    if (is_intrabc_block(mbmi)) return;
  ------------------
  |  Branch (812:9): [True: 10.8k, False: 343k]
  ------------------
  813|   353k|  }
  814|       |
  815|  1.60M|  mbmi->mode = read_intra_mode(r, get_y_mode_cdf(ec_ctx, above_mi, left_mi));
  816|       |
  817|  1.60M|  const int use_angle_delta = av1_use_angle_delta(bsize);
  818|  1.60M|  mbmi->angle_delta[PLANE_TYPE_Y] =
  819|  1.60M|      (use_angle_delta && av1_is_directional_mode(mbmi->mode))
  ------------------
  |  Branch (819:8): [True: 1.23M, False: 368k]
  |  Branch (819:27): [True: 439k, False: 798k]
  ------------------
  820|  1.60M|          ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED])
  821|  1.60M|          : 0;
  822|       |
  823|  1.60M|  if (!cm->seq_params->monochrome && xd->is_chroma_ref) {
  ------------------
  |  Branch (823:7): [True: 919k, False: 686k]
  |  Branch (823:38): [True: 919k, False: 970]
  ------------------
  824|   919k|    mbmi->uv_mode =
  825|   919k|        read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
  826|   919k|    if (mbmi->uv_mode == UV_CFL_PRED) {
  ------------------
  |  Branch (826:9): [True: 171k, False: 747k]
  ------------------
  827|   171k|      mbmi->cfl_alpha_idx = read_cfl_alphas(ec_ctx, r, &mbmi->cfl_alpha_signs);
  828|   171k|    }
  829|   919k|    const PREDICTION_MODE intra_mode = get_uv_mode(mbmi->uv_mode);
  830|   919k|    mbmi->angle_delta[PLANE_TYPE_UV] =
  831|   919k|        (use_angle_delta && av1_is_directional_mode(intra_mode))
  ------------------
  |  Branch (831:10): [True: 700k, False: 218k]
  |  Branch (831:29): [True: 162k, False: 538k]
  ------------------
  832|   919k|            ? read_angle_delta(r, ec_ctx->angle_delta_cdf[intra_mode - V_PRED])
  833|   919k|            : 0;
  834|   919k|  } else {
  835|       |    // Avoid decoding angle_info if there is no chroma prediction
  836|   687k|    mbmi->uv_mode = UV_DC_PRED;
  837|   687k|  }
  838|  1.60M|  xd->cfl.store_y = store_cfl_required(cm, xd);
  839|       |
  840|  1.60M|  if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize))
  ------------------
  |  Branch (840:7): [True: 600k, False: 1.00M]
  ------------------
  841|   600k|    read_palette_mode_info(cm, xd, r);
  842|       |
  843|  1.60M|  read_filter_intra_mode_info(cm, xd, r);
  844|  1.60M|}
decodemv.c:read_intra_segment_id:
  316|  1.61M|                                 aom_reader *r, int skip) {
  317|  1.61M|  struct segmentation *const seg = &cm->seg;
  318|  1.61M|  if (!seg->enabled) return 0;  // Default for disabled segmentation
  ------------------
  |  Branch (318:7): [True: 1.17M, False: 443k]
  ------------------
  319|  1.61M|  assert(seg->update_map && !seg->temporal_update);
  320|       |
  321|   443k|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
  322|   443k|  const int mi_row = xd->mi_row;
  323|   443k|  const int mi_col = xd->mi_col;
  324|   443k|  const int mi_stride = cm->mi_params.mi_cols;
  325|   443k|  const int mi_offset = mi_row * mi_stride + mi_col;
  326|   443k|  const int bw = mi_size_wide[bsize];
  327|   443k|  const int bh = mi_size_high[bsize];
  328|   443k|  const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw);
  ------------------
  |  |   34|   443k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.00k, False: 442k]
  |  |  ------------------
  ------------------
  329|   443k|  const int y_mis = AOMMIN(mi_params->mi_rows - mi_row, bh);
  ------------------
  |  |   34|   443k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.20k, False: 442k]
  |  |  ------------------
  ------------------
  330|   443k|  const int segment_id = read_segment_id(cm, xd, r, skip);
  331|   443k|  set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride,
  332|   443k|                 segment_id);
  333|   443k|  return segment_id;
  334|  1.61M|}
decodemv.c:read_segment_id:
  281|   444k|                           aom_reader *r, int skip) {
  282|   444k|  int cdf_num;
  283|   444k|  const uint8_t pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num, 0);
  284|   444k|  if (skip) return pred;
  ------------------
  |  Branch (284:7): [True: 752, False: 443k]
  ------------------
  285|       |
  286|   443k|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
  287|   443k|  struct segmentation *const seg = &cm->seg;
  288|   443k|  struct segmentation_probs *const segp = &ec_ctx->seg;
  289|   443k|  aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num];
  290|   443k|  const int coded_id = aom_read_symbol(r, pred_cdf, MAX_SEGMENTS, ACCT_STR);
  ------------------
  |  |   51|   443k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  291|   443k|  const int segment_id =
  292|   443k|      av1_neg_deinterleave(coded_id, pred, seg->last_active_segid + 1);
  293|       |
  294|   443k|  if (segment_id < 0 || segment_id > seg->last_active_segid) {
  ------------------
  |  Branch (294:7): [True: 42, False: 443k]
  |  Branch (294:25): [True: 156, False: 443k]
  ------------------
  295|    196|    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
  296|    196|                       "Corrupted segment_ids");
  297|    196|  }
  298|   443k|  return segment_id;
  299|   444k|}
decodemv.c:read_skip_txfm:
  447|  1.75M|                          aom_reader *r) {
  448|  1.75M|  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
  ------------------
  |  Branch (448:7): [True: 192k, False: 1.56M]
  ------------------
  449|   192k|    return 1;
  450|  1.56M|  } else {
  451|  1.56M|    const int ctx = av1_get_skip_txfm_context(xd);
  452|  1.56M|    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
  453|  1.56M|    const int skip_txfm =
  454|  1.56M|        aom_read_symbol(r, ec_ctx->skip_txfm_cdfs[ctx], 2, ACCT_STR);
  ------------------
  |  |   51|  1.56M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  455|  1.56M|    return skip_txfm;
  456|  1.56M|  }
  457|  1.75M|}
decodemv.c:read_cdef:
   40|  1.75M|static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) {
   41|  1.75M|  const int skip_txfm = xd->mi[0]->skip_txfm;
   42|  1.75M|  if (cm->features.coded_lossless) return;
  ------------------
  |  Branch (42:7): [True: 39.7k, False: 1.71M]
  ------------------
   43|  1.71M|  if (cm->features.allow_intrabc) {
  ------------------
  |  Branch (43:7): [True: 353k, False: 1.36M]
  ------------------
   44|   353k|    assert(cm->cdef_info.cdef_bits == 0);
   45|   353k|    return;
   46|   353k|  }
   47|       |
   48|       |  // At the start of a superblock, mark that we haven't yet read CDEF strengths
   49|       |  // for any of the CDEF units contained in this superblock.
   50|  1.36M|  const int sb_mask = (cm->seq_params->mib_size - 1);
   51|  1.36M|  const int mi_row_in_sb = (xd->mi_row & sb_mask);
   52|  1.36M|  const int mi_col_in_sb = (xd->mi_col & sb_mask);
   53|  1.36M|  if (mi_row_in_sb == 0 && mi_col_in_sb == 0) {
  ------------------
  |  Branch (53:7): [True: 294k, False: 1.06M]
  |  Branch (53:28): [True: 127k, False: 167k]
  ------------------
   54|   127k|    xd->cdef_transmitted[0] = xd->cdef_transmitted[1] =
   55|   127k|        xd->cdef_transmitted[2] = xd->cdef_transmitted[3] = false;
   56|   127k|  }
   57|       |
   58|       |  // CDEF unit size is 64x64 irrespective of the superblock size.
   59|  1.36M|  const int cdef_size = 1 << (6 - MI_SIZE_LOG2);
  ------------------
  |  |   39|  1.36M|#define MI_SIZE_LOG2 2
  ------------------
   60|       |
   61|       |  // Find index of this CDEF unit in this superblock.
   62|  1.36M|  const int index_mask = cdef_size;
   63|  1.36M|  const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0);
   64|  1.36M|  const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0);
   65|  1.36M|  const int index = (cm->seq_params->sb_size == BLOCK_128X128)
  ------------------
  |  Branch (65:21): [True: 76.6k, False: 1.28M]
  ------------------
   66|  1.36M|                        ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb
   67|  1.36M|                        : 0;
   68|       |
   69|       |  // Read CDEF strength from the first non-skip coding block in this CDEF unit.
   70|  1.36M|  if (!xd->cdef_transmitted[index] && !skip_txfm) {
  ------------------
  |  Branch (70:7): [True: 232k, False: 1.13M]
  |  Branch (70:39): [True: 123k, False: 109k]
  ------------------
   71|       |    // CDEF strength for this CDEF unit needs to be read into the MB_MODE_INFO
   72|       |    // of the 1st block in this CDEF unit.
   73|   123k|    const int first_block_mask = ~(cdef_size - 1);
   74|   123k|    CommonModeInfoParams *const mi_params = &cm->mi_params;
   75|   123k|    const int grid_idx =
   76|   123k|        get_mi_grid_idx(mi_params, xd->mi_row & first_block_mask,
   77|   123k|                        xd->mi_col & first_block_mask);
   78|   123k|    MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx];
   79|   123k|    mbmi->cdef_strength =
   80|   123k|        aom_read_literal(r, cm->cdef_info.cdef_bits, ACCT_STR);
  ------------------
  |  |   47|   123k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   81|       |    xd->cdef_transmitted[index] = true;
   82|   123k|  }
   83|  1.36M|}
decodemv.c:read_delta_q_params:
  736|  1.75M|                                aom_reader *r) {
  737|  1.75M|  DeltaQInfo *const delta_q_info = &cm->delta_q_info;
  738|       |
  739|  1.75M|  if (delta_q_info->delta_q_present_flag) {
  ------------------
  |  Branch (739:7): [True: 819k, False: 937k]
  ------------------
  740|   819k|    MB_MODE_INFO *const mbmi = xd->mi[0];
  741|   819k|    xd->current_base_qindex +=
  742|   819k|        read_delta_qindex(cm, xd, r, mbmi) * delta_q_info->delta_q_res;
  743|       |    /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
  744|   819k|    xd->current_base_qindex = clamp(xd->current_base_qindex, 1, MAXQ);
  ------------------
  |  |   26|   819k|#define MAXQ 255
  ------------------
  745|   819k|    FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
  746|   819k|    if (delta_q_info->delta_lf_present_flag) {
  ------------------
  |  Branch (746:9): [True: 292k, False: 526k]
  ------------------
  747|   292k|      const int mi_row = xd->mi_row;
  748|   292k|      const int mi_col = xd->mi_col;
  749|   292k|      if (delta_q_info->delta_lf_multi) {
  ------------------
  |  Branch (749:11): [True: 262k, False: 30.2k]
  ------------------
  750|   262k|        const int frame_lf_count =
  751|   262k|            av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
  ------------------
  |  |   72|   174k|#define FRAME_LF_COUNT 4
  ------------------
                          av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
  ------------------
  |  |   72|  87.1k|#define FRAME_LF_COUNT 4
  ------------------
  |  Branch (751:13): [True: 174k, False: 87.1k]
  ------------------
  752|  1.13M|        for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
  ------------------
  |  Branch (752:29): [True: 874k, False: 262k]
  ------------------
  753|   874k|          const int tmp_lvl =
  754|   874k|              xd->delta_lf[lf_id] +
  755|   874k|              read_delta_lflevel(cm, r, ec_ctx->delta_lf_multi_cdf[lf_id], mbmi,
  756|   874k|                                 mi_col, mi_row) *
  757|   874k|                  delta_q_info->delta_lf_res;
  758|   874k|          mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] =
  759|   874k|              clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
  ------------------
  |  |   27|   874k|#define MAX_LOOP_FILTER 63
  ------------------
                            clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
  ------------------
  |  |   27|   874k|#define MAX_LOOP_FILTER 63
  ------------------
  760|   874k|        }
  761|   262k|      } else {
  762|  30.2k|        const int tmp_lvl = xd->delta_lf_from_base +
  763|  30.2k|                            read_delta_lflevel(cm, r, ec_ctx->delta_lf_cdf,
  764|  30.2k|                                               mbmi, mi_col, mi_row) *
  765|  30.2k|                                delta_q_info->delta_lf_res;
  766|  30.2k|        mbmi->delta_lf_from_base = xd->delta_lf_from_base =
  767|  30.2k|            clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
  ------------------
  |  |   27|  30.2k|#define MAX_LOOP_FILTER 63
  ------------------
                          clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
  ------------------
  |  |   27|  30.2k|#define MAX_LOOP_FILTER 63
  ------------------
  768|  30.2k|      }
  769|   292k|    }
  770|   819k|  }
  771|  1.75M|}
decodemv.c:read_delta_qindex:
   86|   819k|                             aom_reader *r, MB_MODE_INFO *const mbmi) {
   87|   819k|  int sign, abs, reduced_delta_qindex = 0;
   88|   819k|  BLOCK_SIZE bsize = mbmi->bsize;
   89|   819k|  const int b_col = xd->mi_col & (cm->seq_params->mib_size - 1);
   90|   819k|  const int b_row = xd->mi_row & (cm->seq_params->mib_size - 1);
   91|   819k|  const int read_delta_q_flag = (b_col == 0 && b_row == 0);
  ------------------
  |  Branch (91:34): [True: 167k, False: 651k]
  |  Branch (91:48): [True: 45.8k, False: 121k]
  ------------------
   92|   819k|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   93|       |
   94|   819k|  if ((bsize != cm->seq_params->sb_size || mbmi->skip_txfm == 0) &&
  ------------------
  |  Branch (94:8): [True: 811k, False: 8.08k]
  |  Branch (94:44): [True: 6.24k, False: 1.84k]
  ------------------
   95|   817k|      read_delta_q_flag) {
  ------------------
  |  Branch (95:7): [True: 44.0k, False: 773k]
  ------------------
   96|  44.0k|    abs = aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR);
  ------------------
  |  |   51|  44.0k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   97|  44.0k|    const int smallval = (abs < DELTA_Q_SMALL);
  ------------------
  |  |  497|  44.0k|#define DELTA_Q_SMALL 3
  ------------------
   98|       |
   99|  44.0k|    if (!smallval) {
  ------------------
  |  Branch (99:9): [True: 908, False: 43.1k]
  ------------------
  100|    908|      const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
  ------------------
  |  |   47|    908|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  101|    908|      const int thr = (1 << rem_bits) + 1;
  102|    908|      abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
  ------------------
  |  |   47|    908|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  103|    908|    }
  104|       |
  105|  44.0k|    if (abs) {
  ------------------
  |  Branch (105:9): [True: 7.48k, False: 36.5k]
  ------------------
  106|  7.48k|      sign = aom_read_bit(r, ACCT_STR);
  ------------------
  |  |   43|  7.48k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  107|  36.5k|    } else {
  108|  36.5k|      sign = 1;
  109|  36.5k|    }
  110|       |
  111|  44.0k|    reduced_delta_qindex = sign ? -abs : abs;
  ------------------
  |  Branch (111:28): [True: 40.4k, False: 3.65k]
  ------------------
  112|  44.0k|  }
  113|   819k|  return reduced_delta_qindex;
  114|   819k|}
decodemv.c:read_delta_lflevel:
  118|   904k|                              int mi_row) {
  119|   904k|  int reduced_delta_lflevel = 0;
  120|   904k|  const BLOCK_SIZE bsize = mbmi->bsize;
  121|   904k|  const int b_col = mi_col & (cm->seq_params->mib_size - 1);
  122|   904k|  const int b_row = mi_row & (cm->seq_params->mib_size - 1);
  123|   904k|  const int read_delta_lf_flag = (b_col == 0 && b_row == 0);
  ------------------
  |  Branch (123:35): [True: 187k, False: 716k]
  |  Branch (123:49): [True: 49.9k, False: 137k]
  ------------------
  124|       |
  125|   904k|  if ((bsize != cm->seq_params->sb_size || mbmi->skip_txfm == 0) &&
  ------------------
  |  Branch (125:8): [True: 897k, False: 6.62k]
  |  Branch (125:44): [True: 5.04k, False: 1.57k]
  ------------------
  126|   902k|      read_delta_lf_flag) {
  ------------------
  |  Branch (126:7): [True: 48.3k, False: 854k]
  ------------------
  127|  48.3k|    int abs = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR);
  ------------------
  |  |   51|  48.3k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  128|  48.3k|    const int smallval = (abs < DELTA_LF_SMALL);
  ------------------
  |  |  503|  48.3k|#define DELTA_LF_SMALL 3
  ------------------
  129|  48.3k|    if (!smallval) {
  ------------------
  |  Branch (129:9): [True: 602, False: 47.7k]
  ------------------
  130|    602|      const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
  ------------------
  |  |   47|    602|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  131|    602|      const int thr = (1 << rem_bits) + 1;
  132|    602|      abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
  ------------------
  |  |   47|    602|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  133|    602|    }
  134|  48.3k|    const int sign = abs ? aom_read_bit(r, ACCT_STR) : 1;
  ------------------
  |  |   43|  7.06k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  |  Branch (134:22): [True: 7.06k, False: 41.2k]
  ------------------
  135|  48.3k|    reduced_delta_lflevel = sign ? -abs : abs;
  ------------------
  |  Branch (135:29): [True: 44.8k, False: 3.49k]
  ------------------
  136|  48.3k|  }
  137|   904k|  return reduced_delta_lflevel;
  138|   904k|}
decodemv.c:read_intrabc_info:
  694|   353k|                              aom_reader *r) {
  695|   353k|  MACROBLOCKD *const xd = &dcb->xd;
  696|   353k|  MB_MODE_INFO *const mbmi = xd->mi[0];
  697|   353k|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
  698|   353k|  mbmi->use_intrabc = aom_read_symbol(r, ec_ctx->intrabc_cdf, 2, ACCT_STR);
  ------------------
  |  |   51|   353k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  699|   353k|  if (mbmi->use_intrabc) {
  ------------------
  |  Branch (699:7): [True: 11.3k, False: 342k]
  ------------------
  700|  11.3k|    BLOCK_SIZE bsize = mbmi->bsize;
  701|  11.3k|    mbmi->mode = DC_PRED;
  702|  11.3k|    mbmi->uv_mode = UV_DC_PRED;
  703|  11.3k|    mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
  704|  11.3k|    mbmi->motion_mode = SIMPLE_TRANSLATION;
  705|       |
  706|  11.3k|    int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
  707|  11.3k|    int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES];
  708|       |
  709|  11.3k|    av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, dcb->ref_mv_count,
  710|  11.3k|                     xd->ref_mv_stack, xd->weight, ref_mvs, /*global_mvs=*/NULL,
  711|  11.3k|                     inter_mode_ctx);
  712|       |
  713|  11.3k|    int_mv nearestmv, nearmv;
  714|       |
  715|  11.3k|    av1_find_best_ref_mvs(0, ref_mvs[INTRA_FRAME], &nearestmv, &nearmv, 0);
  716|  11.3k|    int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
  ------------------
  |  Branch (716:21): [True: 5.17k, False: 6.20k]
  ------------------
  717|  11.3k|    if (dv_ref.as_int == 0)
  ------------------
  |  Branch (717:9): [True: 5.17k, False: 6.20k]
  ------------------
  718|  5.17k|      av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params->mib_size, xd->mi_row);
  719|       |    // Ref DV should not have sub-pel.
  720|  11.3k|    int valid_dv = (dv_ref.as_mv.col & 7) == 0 && (dv_ref.as_mv.row & 7) == 0;
  ------------------
  |  Branch (720:20): [True: 11.3k, False: 0]
  |  Branch (720:51): [True: 11.3k, False: 0]
  ------------------
  721|  11.3k|    dv_ref.as_mv.col = (dv_ref.as_mv.col >> 3) * 8;
  722|  11.3k|    dv_ref.as_mv.row = (dv_ref.as_mv.row >> 3) * 8;
  723|  11.3k|    valid_dv = valid_dv && assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, xd->mi_row,
  ------------------
  |  Branch (723:16): [True: 11.3k, False: 0]
  |  Branch (723:28): [True: 10.8k, False: 528]
  ------------------
  724|  11.3k|                                     xd->mi_col, bsize, r);
  725|  11.3k|    if (!valid_dv) {
  ------------------
  |  Branch (725:9): [True: 528, False: 10.8k]
  ------------------
  726|       |      // Intra bc motion vectors are not valid - signal corrupt frame
  727|    528|      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
  728|    528|                         "Invalid intrabc dv");
  729|    528|    }
  730|  11.3k|  }
  731|   353k|}
decodemv.c:assign_dv:
  679|  11.3k|                            BLOCK_SIZE bsize, aom_reader *r) {
  680|  11.3k|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
  681|  11.3k|  read_mv(r, &mv->as_mv, &ref_mv->as_mv, &ec_ctx->ndvc, MV_SUBPEL_NONE);
  682|       |  // DV should not have sub-pel.
  683|  11.3k|  assert((mv->as_mv.col & 7) == 0);
  684|  11.3k|  assert((mv->as_mv.row & 7) == 0);
  685|  11.3k|  mv->as_mv.col = (mv->as_mv.col >> 3) * 8;
  686|  11.3k|  mv->as_mv.row = (mv->as_mv.row >> 3) * 8;
  687|  11.3k|  int valid = is_mv_valid(&mv->as_mv) &&
  ------------------
  |  Branch (687:15): [True: 11.3k, False: 2]
  ------------------
  688|  11.3k|              av1_is_dv_valid(mv->as_mv, cm, xd, mi_row, mi_col, bsize,
  ------------------
  |  Branch (688:15): [True: 10.8k, False: 526]
  ------------------
  689|  11.3k|                              cm->seq_params->mib_size_log2);
  690|  11.3k|  return valid;
  691|  11.3k|}
decodemv.c:read_mv:
  887|  41.2k|                           nmv_context *ctx, MvSubpelPrecision precision) {
  888|  41.2k|  MV diff = kZeroMv;
  889|  41.2k|  const MV_JOINT_TYPE joint_type =
  890|  41.2k|      (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joints_cdf, MV_JOINTS, ACCT_STR);
  ------------------
  |  |   51|  41.2k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  891|       |
  892|  41.2k|  if (mv_joint_vertical(joint_type))
  ------------------
  |  Branch (892:7): [True: 26.1k, False: 15.0k]
  ------------------
  893|  26.1k|    diff.row = read_mv_component(r, &ctx->comps[0], precision > MV_SUBPEL_NONE,
  894|  26.1k|                                 precision > MV_SUBPEL_LOW_PRECISION);
  895|       |
  896|  41.2k|  if (mv_joint_horizontal(joint_type))
  ------------------
  |  Branch (896:7): [True: 25.5k, False: 15.6k]
  ------------------
  897|  25.5k|    diff.col = read_mv_component(r, &ctx->comps[1], precision > MV_SUBPEL_NONE,
  898|  25.5k|                                 precision > MV_SUBPEL_LOW_PRECISION);
  899|       |
  900|  41.2k|  mv->row = ref->row + diff.row;
  901|  41.2k|  mv->col = ref->col + diff.col;
  902|  41.2k|}
decodemv.c:read_mv_component:
  847|  51.7k|                             int use_subpel, int usehp) {
  848|  51.7k|  int mag, d, fr, hp;
  849|  51.7k|  const int sign = aom_read_symbol(r, mvcomp->sign_cdf, 2, ACCT_STR);
  ------------------
  |  |   51|  51.7k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  850|  51.7k|  const int mv_class =
  851|  51.7k|      aom_read_symbol(r, mvcomp->classes_cdf, MV_CLASSES, ACCT_STR);
  ------------------
  |  |   51|  51.7k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  852|  51.7k|  const int class0 = mv_class == MV_CLASS_0;
  853|       |
  854|       |  // Integer part
  855|  51.7k|  if (class0) {
  ------------------
  |  Branch (855:7): [True: 44.8k, False: 6.89k]
  ------------------
  856|  44.8k|    d = aom_read_symbol(r, mvcomp->class0_cdf, CLASS0_SIZE, ACCT_STR);
  ------------------
  |  |   51|  44.8k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  857|  44.8k|    mag = 0;
  858|  44.8k|  } else {
  859|  6.89k|    const int n = mv_class + CLASS0_BITS - 1;  // number of bits
  ------------------
  |  |   64|  6.89k|#define CLASS0_BITS 1 /* bits at integer precision for class 0 */
  ------------------
  860|  6.89k|    d = 0;
  861|  20.7k|    for (int i = 0; i < n; ++i)
  ------------------
  |  Branch (861:21): [True: 13.8k, False: 6.89k]
  ------------------
  862|  13.8k|      d |= aom_read_symbol(r, mvcomp->bits_cdf[i], 2, ACCT_STR) << i;
  ------------------
  |  |   51|  13.8k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  863|  6.89k|    mag = CLASS0_SIZE << (mv_class + 2);
  ------------------
  |  |   65|  6.89k|#define CLASS0_SIZE (1 << CLASS0_BITS)
  |  |  ------------------
  |  |  |  |   64|  6.89k|#define CLASS0_BITS 1 /* bits at integer precision for class 0 */
  |  |  ------------------
  ------------------
  864|  6.89k|  }
  865|       |
  866|  51.7k|  if (use_subpel) {
  ------------------
  |  Branch (866:7): [True: 35.7k, False: 16.0k]
  ------------------
  867|       |    // Fractional part
  868|  35.7k|    fr = aom_read_symbol(r, class0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
  ------------------
  |  |   51|  71.4k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (51:23): [True: 30.9k, False: 4.73k]
  |  |  ------------------
  ------------------
  869|  35.7k|                         MV_FP_SIZE, ACCT_STR);
  870|       |
  871|       |    // High precision part (if hp is not used, the default value of the hp is 1)
  872|  35.7k|    hp = usehp ? aom_read_symbol(
  ------------------
  |  |   51|  50.4k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (51:23): [True: 21.9k, False: 3.33k]
  |  |  ------------------
  ------------------
  |  Branch (872:10): [True: 25.2k, False: 10.4k]
  ------------------
  873|  35.7k|                     r, class0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, 2,
  874|  35.7k|                     ACCT_STR)
  875|  35.7k|               : 1;
  876|  35.7k|  } else {
  877|  16.0k|    fr = 3;
  878|  16.0k|    hp = 1;
  879|  16.0k|  }
  880|       |
  881|       |  // Result
  882|  51.7k|  mag += ((d << 3) | (fr << 1) | hp) + 1;
  883|  51.7k|  return sign ? -mag : mag;
  ------------------
  |  Branch (883:10): [True: 27.9k, False: 23.8k]
  ------------------
  884|  51.7k|}
decodemv.c:is_mv_valid:
 1109|   100k|static inline int is_mv_valid(const MV *mv) {
 1110|   100k|  return mv->row > MV_LOW && mv->row < MV_UPP && mv->col > MV_LOW &&
  ------------------
  |  |   76|   200k|#define MV_LOW (-(1 << MV_IN_USE_BITS))
  |  |  ------------------
  |  |  |  |   74|   100k|#define MV_IN_USE_BITS 14
  |  |  ------------------
  ------------------
                return mv->row > MV_LOW && mv->row < MV_UPP && mv->col > MV_LOW &&
  ------------------
  |  |   75|   200k|#define MV_UPP (1 << MV_IN_USE_BITS)
  |  |  ------------------
  |  |  |  |   74|   100k|#define MV_IN_USE_BITS 14
  |  |  ------------------
  ------------------
                return mv->row > MV_LOW && mv->row < MV_UPP && mv->col > MV_LOW &&
  ------------------
  |  |   76|   200k|#define MV_LOW (-(1 << MV_IN_USE_BITS))
  |  |  ------------------
  |  |  |  |   74|   100k|#define MV_IN_USE_BITS 14
  |  |  ------------------
  ------------------
  |  Branch (1110:10): [True: 100k, False: 8]
  |  Branch (1110:30): [True: 100k, False: 0]
  |  Branch (1110:50): [True: 100k, False: 0]
  ------------------
 1111|   100k|         mv->col < MV_UPP;
  ------------------
  |  |   75|   100k|#define MV_UPP (1 << MV_IN_USE_BITS)
  |  |  ------------------
  |  |  |  |   74|   100k|#define MV_IN_USE_BITS 14
  |  |  ------------------
  ------------------
  |  Branch (1111:10): [True: 100k, False: 0]
  ------------------
 1112|   100k|}
decodemv.c:read_intra_mode:
   36|  1.66M|static PREDICTION_MODE read_intra_mode(aom_reader *r, aom_cdf_prob *cdf) {
   37|  1.66M|  return (PREDICTION_MODE)aom_read_symbol(r, cdf, INTRA_MODES, ACCT_STR);
  ------------------
  |  |   51|  1.66M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   38|  1.66M|}
decodemv.c:read_angle_delta:
  603|   604k|static int read_angle_delta(aom_reader *r, aom_cdf_prob *cdf) {
  604|   604k|  const int sym = aom_read_symbol(r, cdf, 2 * MAX_ANGLE_DELTA + 1, ACCT_STR);
  ------------------
  |  |   51|   604k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  605|   604k|  return sym - MAX_ANGLE_DELTA;
  ------------------
  |  |  467|   604k|#define MAX_ANGLE_DELTA 3
  ------------------
  606|   604k|}
decodemv.c:read_intra_mode_uv:
  143|   977k|                                             PREDICTION_MODE y_mode) {
  144|   977k|  const UV_PREDICTION_MODE uv_mode =
  145|   977k|      aom_read_symbol(r, ec_ctx->uv_mode_cdf[cfl_allowed][y_mode],
  ------------------
  |  |   51|   977k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  146|   977k|                      UV_INTRA_MODES - !cfl_allowed, ACCT_STR);
  147|   977k|  return uv_mode;
  148|   977k|}
decodemv.c:read_cfl_alphas:
  151|   172k|                               int8_t *signs_out) {
  152|   172k|  const int8_t joint_sign =
  153|   172k|      aom_read_symbol(r, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS, "cfl:signs");
  ------------------
  |  |   51|   172k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  154|   172k|  uint8_t idx = 0;
  155|       |  // Magnitudes are only coded for nonzero values
  156|   172k|  if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
  ------------------
  |  |  281|   172k|#define CFL_SIGN_U(js) (((js + 1) * 11) >> 5)
  ------------------
  |  Branch (156:7): [True: 161k, False: 11.1k]
  ------------------
  157|   161k|    aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
  ------------------
  |  |  288|   161k|#define CFL_CONTEXT_U(js) (js + 1 - CFL_SIGNS)
  ------------------
  158|   161k|    idx = (uint8_t)aom_read_symbol(r, cdf_u, CFL_ALPHABET_SIZE, "cfl:alpha_u")
  ------------------
  |  |   51|   161k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  159|   161k|          << CFL_ALPHABET_SIZE_LOG2;
  ------------------
  |  |  256|   161k|#define CFL_ALPHABET_SIZE_LOG2 4
  ------------------
  160|   161k|  }
  161|   172k|  if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
  ------------------
  |  |  283|   172k|#define CFL_SIGN_V(js) ((js + 1) - CFL_SIGNS * CFL_SIGN_U(js))
  |  |  ------------------
  |  |  |  |  281|   172k|#define CFL_SIGN_U(js) (((js + 1) * 11) >> 5)
  |  |  ------------------
  ------------------
  |  Branch (161:7): [True: 109k, False: 62.9k]
  ------------------
  162|   109k|    aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
  ------------------
  |  |  291|   109k|  (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS)
  |  |  ------------------
  |  |  |  |  283|   109k|#define CFL_SIGN_V(js) ((js + 1) - CFL_SIGNS * CFL_SIGN_U(js))
  |  |  |  |  ------------------
  |  |  |  |  |  |  281|   109k|#define CFL_SIGN_U(js) (((js + 1) * 11) >> 5)
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS)
  |  |  ------------------
  |  |  |  |  281|   109k|#define CFL_SIGN_U(js) (((js + 1) * 11) >> 5)
  |  |  ------------------
  ------------------
  163|   109k|    idx += (uint8_t)aom_read_symbol(r, cdf_v, CFL_ALPHABET_SIZE, "cfl:alpha_v");
  ------------------
  |  |   51|   109k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  164|   109k|  }
  165|   172k|  *signs_out = joint_sign;
  166|   172k|  return idx;
  167|   172k|}
decodemv.c:read_palette_mode_info:
  568|   601k|                                   aom_reader *r) {
  569|   601k|  const int num_planes = av1_num_planes(cm);
  570|   601k|  MB_MODE_INFO *const mbmi = xd->mi[0];
  571|   601k|  const BLOCK_SIZE bsize = mbmi->bsize;
  572|   601k|  assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize));
  573|   601k|  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
  574|   601k|  const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
  575|       |
  576|   601k|  if (mbmi->mode == DC_PRED) {
  ------------------
  |  Branch (576:7): [True: 264k, False: 336k]
  ------------------
  577|   264k|    const int palette_mode_ctx = av1_get_palette_mode_ctx(xd);
  578|   264k|    const int modev = aom_read_symbol(
  ------------------
  |  |   51|   264k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  579|   264k|        r, xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_mode_ctx], 2,
  580|   264k|        ACCT_STR);
  581|   264k|    if (modev) {
  ------------------
  |  Branch (581:9): [True: 47.9k, False: 216k]
  ------------------
  582|  47.9k|      pmi->palette_size[0] =
  583|  47.9k|          aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
  ------------------
  |  |   51|  47.9k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  584|  47.9k|                          PALETTE_SIZES, ACCT_STR) +
  585|  47.9k|          2;
  586|  47.9k|      read_palette_colors_y(xd, cm->seq_params->bit_depth, pmi, r);
  587|  47.9k|    }
  588|   264k|  }
  589|   601k|  if (num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref) {
  ------------------
  |  Branch (589:7): [True: 329k, False: 271k]
  |  Branch (589:25): [True: 130k, False: 199k]
  |  Branch (589:56): [True: 130k, False: 79]
  ------------------
  590|   130k|    const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
  591|   130k|    const int modev = aom_read_symbol(
  ------------------
  |  |   51|   130k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  592|   130k|        r, xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2, ACCT_STR);
  593|   130k|    if (modev) {
  ------------------
  |  Branch (593:9): [True: 14.5k, False: 116k]
  ------------------
  594|  14.5k|      pmi->palette_size[1] =
  595|  14.5k|          aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
  ------------------
  |  |   51|  14.5k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  596|  14.5k|                          PALETTE_SIZES, ACCT_STR) +
  597|  14.5k|          2;
  598|  14.5k|      read_palette_colors_uv(xd, cm->seq_params->bit_depth, pmi, r);
  599|  14.5k|    }
  600|   130k|  }
  601|   601k|}
decodemv.c:read_palette_colors_y:
  479|  47.9k|                                  PALETTE_MODE_INFO *const pmi, aom_reader *r) {
  480|  47.9k|  uint16_t color_cache[2 * PALETTE_MAX_SIZE];
  481|  47.9k|  uint16_t cached_colors[PALETTE_MAX_SIZE];
  482|  47.9k|  const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
  483|  47.9k|  const int n = pmi->palette_size[0];
  484|  47.9k|  int idx = 0;
  485|   179k|  for (int i = 0; i < n_cache && idx < n; ++i)
  ------------------
  |  Branch (485:19): [True: 137k, False: 42.0k]
  |  Branch (485:34): [True: 131k, False: 5.87k]
  ------------------
  486|   131k|    if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i];
  ------------------
  |  |   43|   131k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (43:3): [True: 67.3k, False: 64.1k]
  |  |  ------------------
  ------------------
  487|  47.9k|  if (idx < n) {
  ------------------
  |  Branch (487:7): [True: 39.0k, False: 8.87k]
  ------------------
  488|  39.0k|    const int n_cached_colors = idx;
  489|  39.0k|    pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR);
  ------------------
  |  |   47|  39.0k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  490|  39.0k|    if (idx < n) {
  ------------------
  |  Branch (490:9): [True: 32.4k, False: 6.58k]
  ------------------
  491|  32.4k|      const int min_bits = bit_depth - 3;
  492|  32.4k|      int bits = min_bits + aom_read_literal(r, 2, ACCT_STR);
  ------------------
  |  |   47|  32.4k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  493|  32.4k|      int range = (1 << bit_depth) - pmi->palette_colors[idx - 1] - 1;
  494|   128k|      for (; idx < n; ++idx) {
  ------------------
  |  Branch (494:14): [True: 95.5k, False: 32.4k]
  ------------------
  495|  95.5k|        assert(range >= 0);
  496|  95.5k|        const int delta = aom_read_literal(r, bits, ACCT_STR) + 1;
  ------------------
  |  |   47|  95.5k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  497|  95.5k|        pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta,
  498|  95.5k|                                         0, (1 << bit_depth) - 1);
  499|  95.5k|        range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]);
  500|  95.5k|        bits = AOMMIN(bits, aom_ceil_log2(range));
  ------------------
  |  |   34|  95.5k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 32.3k, False: 63.2k]
  |  |  ------------------
  ------------------
  501|  95.5k|      }
  502|  32.4k|    }
  503|  39.0k|    merge_colors(pmi->palette_colors, cached_colors, n, n_cached_colors);
  504|  39.0k|  } else {
  505|  8.87k|    memcpy(pmi->palette_colors, cached_colors, n * sizeof(cached_colors[0]));
  506|  8.87k|  }
  507|  47.9k|}
decodemv.c:merge_colors:
  463|  52.9k|                         int n_colors, int n_cached_colors) {
  464|  52.9k|  if (n_cached_colors == 0) return;
  ------------------
  |  Branch (464:7): [True: 27.8k, False: 25.0k]
  ------------------
  465|  25.0k|  int cache_idx = 0, trans_idx = n_cached_colors;
  466|   143k|  for (int i = 0; i < n_colors; ++i) {
  ------------------
  |  Branch (466:19): [True: 118k, False: 25.0k]
  ------------------
  467|   118k|    if (cache_idx < n_cached_colors &&
  ------------------
  |  Branch (467:9): [True: 82.9k, False: 35.9k]
  ------------------
  468|  82.9k|        (trans_idx >= n_colors ||
  ------------------
  |  Branch (468:10): [True: 18.5k, False: 64.3k]
  ------------------
  469|  64.3k|         cached_colors[cache_idx] <= colors[trans_idx])) {
  ------------------
  |  Branch (469:10): [True: 29.4k, False: 34.9k]
  ------------------
  470|  48.0k|      colors[i] = cached_colors[cache_idx++];
  471|  70.8k|    } else {
  472|       |      assert(trans_idx < n_colors);
  473|  70.8k|      colors[i] = colors[trans_idx++];
  474|  70.8k|    }
  475|   118k|  }
  476|  25.0k|}
decodemv.c:read_palette_colors_uv:
  511|  14.5k|                                   aom_reader *r) {
  512|  14.5k|  const int n = pmi->palette_size[1];
  513|       |  // U channel colors.
  514|  14.5k|  uint16_t color_cache[2 * PALETTE_MAX_SIZE];
  515|  14.5k|  uint16_t cached_colors[PALETTE_MAX_SIZE];
  516|  14.5k|  const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
  517|  14.5k|  int idx = 0;
  518|  28.8k|  for (int i = 0; i < n_cache && idx < n; ++i)
  ------------------
  |  Branch (518:19): [True: 14.6k, False: 14.1k]
  |  Branch (518:34): [True: 14.2k, False: 429]
  ------------------
  519|  14.2k|    if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i];
  ------------------
  |  |   43|  14.2k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (43:3): [True: 7.00k, False: 7.24k]
  |  |  ------------------
  ------------------
  520|  14.5k|  if (idx < n) {
  ------------------
  |  Branch (520:7): [True: 13.8k, False: 720]
  ------------------
  521|  13.8k|    const int n_cached_colors = idx;
  522|  13.8k|    idx += PALETTE_MAX_SIZE;
  ------------------
  |  |   63|  13.8k|#define PALETTE_MAX_SIZE 8
  ------------------
  523|  13.8k|    pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR);
  ------------------
  |  |   47|  13.8k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  524|  13.8k|    if (idx < PALETTE_MAX_SIZE + n) {
  ------------------
  |  |   63|  13.8k|#define PALETTE_MAX_SIZE 8
  ------------------
  |  Branch (524:9): [True: 11.3k, False: 2.52k]
  ------------------
  525|  11.3k|      const int min_bits = bit_depth - 3;
  526|  11.3k|      int bits = min_bits + aom_read_literal(r, 2, ACCT_STR);
  ------------------
  |  |   47|  11.3k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  527|  11.3k|      int range = (1 << bit_depth) - pmi->palette_colors[idx - 1];
  528|  40.3k|      for (; idx < PALETTE_MAX_SIZE + n; ++idx) {
  ------------------
  |  |   63|  40.3k|#define PALETTE_MAX_SIZE 8
  ------------------
  |  Branch (528:14): [True: 28.9k, False: 11.3k]
  ------------------
  529|  28.9k|        assert(range >= 0);
  530|  28.9k|        const int delta = aom_read_literal(r, bits, ACCT_STR);
  ------------------
  |  |   47|  28.9k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  531|  28.9k|        pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta,
  532|  28.9k|                                         0, (1 << bit_depth) - 1);
  533|  28.9k|        range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]);
  534|  28.9k|        bits = AOMMIN(bits, aom_ceil_log2(range));
  ------------------
  |  |   34|  28.9k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 8.73k, False: 20.2k]
  |  |  ------------------
  ------------------
  535|  28.9k|      }
  536|  11.3k|    }
  537|  13.8k|    merge_colors(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors, n,
  ------------------
  |  |   63|  13.8k|#define PALETTE_MAX_SIZE 8
  ------------------
  538|  13.8k|                 n_cached_colors);
  539|  13.8k|  } else {
  540|    720|    memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors,
  ------------------
  |  |   63|    720|#define PALETTE_MAX_SIZE 8
  ------------------
  541|    720|           n * sizeof(cached_colors[0]));
  542|    720|  }
  543|       |
  544|       |  // V channel colors.
  545|  14.5k|  if (aom_read_bit(r, ACCT_STR)) {  // Delta encoding.
  ------------------
  |  |   43|  14.5k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (43:3): [True: 6.09k, False: 8.48k]
  |  |  ------------------
  ------------------
  546|  6.09k|    const int min_bits_v = bit_depth - 4;
  547|  6.09k|    const int max_val = 1 << bit_depth;
  548|  6.09k|    int bits = min_bits_v + aom_read_literal(r, 2, ACCT_STR);
  ------------------
  |  |   47|  6.09k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  549|  6.09k|    pmi->palette_colors[2 * PALETTE_MAX_SIZE] =
  ------------------
  |  |   63|  6.09k|#define PALETTE_MAX_SIZE 8
  ------------------
  550|  6.09k|        aom_read_literal(r, bit_depth, ACCT_STR);
  ------------------
  |  |   47|  6.09k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  551|  23.1k|    for (int i = 1; i < n; ++i) {
  ------------------
  |  Branch (551:21): [True: 17.0k, False: 6.09k]
  ------------------
  552|  17.0k|      int delta = aom_read_literal(r, bits, ACCT_STR);
  ------------------
  |  |   47|  17.0k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  553|  17.0k|      if (delta && aom_read_bit(r, ACCT_STR)) delta = -delta;
  ------------------
  |  |   43|  16.6k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (43:3): [True: 8.64k, False: 7.96k]
  |  |  ------------------
  ------------------
  |  Branch (553:11): [True: 16.6k, False: 431]
  ------------------
  554|  17.0k|      int val = (int)pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1] + delta;
  ------------------
  |  |   63|  17.0k|#define PALETTE_MAX_SIZE 8
  ------------------
  555|  17.0k|      if (val < 0) val += max_val;
  ------------------
  |  Branch (555:11): [True: 1.06k, False: 15.9k]
  ------------------
  556|  17.0k|      if (val >= max_val) val -= max_val;
  ------------------
  |  Branch (556:11): [True: 961, False: 16.0k]
  ------------------
  557|  17.0k|      pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] = val;
  ------------------
  |  |   63|  17.0k|#define PALETTE_MAX_SIZE 8
  ------------------
  558|  17.0k|    }
  559|  8.48k|  } else {
  560|  35.1k|    for (int i = 0; i < n; ++i) {
  ------------------
  |  Branch (560:21): [True: 26.7k, False: 8.48k]
  ------------------
  561|  26.7k|      pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] =
  ------------------
  |  |   63|  26.7k|#define PALETTE_MAX_SIZE 8
  ------------------
  562|  26.7k|          aom_read_literal(r, bit_depth, ACCT_STR);
  ------------------
  |  |   47|  26.7k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  563|  26.7k|    }
  564|  8.48k|  }
  565|  14.5k|}
decodemv.c:read_filter_intra_mode_info:
  609|  1.66M|                                        MACROBLOCKD *const xd, aom_reader *r) {
  610|  1.66M|  MB_MODE_INFO *const mbmi = xd->mi[0];
  611|  1.66M|  FILTER_INTRA_MODE_INFO *filter_intra_mode_info =
  612|  1.66M|      &mbmi->filter_intra_mode_info;
  613|       |
  614|  1.66M|  if (av1_filter_intra_allowed(cm, mbmi)) {
  ------------------
  |  Branch (614:7): [True: 607k, False: 1.06M]
  ------------------
  615|   607k|    filter_intra_mode_info->use_filter_intra = aom_read_symbol(
  ------------------
  |  |   51|   607k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  616|   607k|        r, xd->tile_ctx->filter_intra_cdfs[mbmi->bsize], 2, ACCT_STR);
  617|   607k|    if (filter_intra_mode_info->use_filter_intra) {
  ------------------
  |  Branch (617:9): [True: 336k, False: 270k]
  ------------------
  618|   336k|      filter_intra_mode_info->filter_intra_mode = aom_read_symbol(
  ------------------
  |  |   51|   336k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  619|   336k|          r, xd->tile_ctx->filter_intra_mode_cdf, FILTER_INTRA_MODES, ACCT_STR);
  620|   336k|    }
  621|  1.06M|  } else {
  622|  1.06M|    filter_intra_mode_info->use_filter_intra = 0;
  623|  1.06M|  }
  624|  1.66M|}
decodemv.c:intra_copy_frame_mvs:
 1554|  50.6k|                                 int x_mis, int y_mis) {
 1555|  50.6k|  const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1);
  ------------------
  |  |   41|  50.6k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1556|  50.6k|  MV_REF *frame_mvs =
 1557|  50.6k|      cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
 1558|  50.6k|  x_mis = ROUND_POWER_OF_TWO(x_mis, 1);
  ------------------
  |  |   41|  50.6k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1559|  50.6k|  y_mis = ROUND_POWER_OF_TWO(y_mis, 1);
  ------------------
  |  |   41|  50.6k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1560|       |
 1561|   253k|  for (int h = 0; h < y_mis; h++) {
  ------------------
  |  Branch (1561:19): [True: 202k, False: 50.6k]
  ------------------
 1562|   202k|    MV_REF *mv = frame_mvs;
 1563|  1.40M|    for (int w = 0; w < x_mis; w++) {
  ------------------
  |  Branch (1563:21): [True: 1.20M, False: 202k]
  ------------------
 1564|  1.20M|      mv->ref_frame = NONE_FRAME;
 1565|  1.20M|      mv++;
 1566|  1.20M|    }
 1567|   202k|    frame_mvs += frame_mvs_stride;
 1568|   202k|  }
 1569|  50.6k|}
decodemv.c:read_inter_frame_mode_info:
 1513|   139k|                                       DecoderCodingBlock *dcb, aom_reader *r) {
 1514|   139k|  AV1_COMMON *const cm = &pbi->common;
 1515|   139k|  MACROBLOCKD *const xd = &dcb->xd;
 1516|   139k|  MB_MODE_INFO *const mbmi = xd->mi[0];
 1517|   139k|  int inter_block = 1;
 1518|       |
 1519|   139k|  mbmi->mv[0].as_int = 0;
 1520|   139k|  mbmi->mv[1].as_int = 0;
 1521|   139k|  mbmi->segment_id = read_inter_segment_id(cm, xd, 1, r);
 1522|       |
 1523|   139k|  mbmi->skip_mode = read_skip_mode(cm, xd, mbmi->segment_id, r);
 1524|       |
 1525|   139k|  if (mbmi->skip_mode)
  ------------------
  |  Branch (1525:7): [True: 110, False: 139k]
  ------------------
 1526|    110|    mbmi->skip_txfm = 1;
 1527|   139k|  else
 1528|   139k|    mbmi->skip_txfm = read_skip_txfm(cm, xd, mbmi->segment_id, r);
 1529|       |
 1530|   139k|  if (!cm->seg.segid_preskip)
  ------------------
  |  Branch (1530:7): [True: 138k, False: 1.17k]
  ------------------
 1531|   138k|    mbmi->segment_id = read_inter_segment_id(cm, xd, 0, r);
 1532|       |
 1533|   139k|  read_cdef(cm, r, xd);
 1534|       |
 1535|   139k|  read_delta_q_params(cm, xd, r);
 1536|       |
 1537|   139k|  if (!mbmi->skip_mode)
  ------------------
  |  Branch (1537:7): [True: 139k, False: 140]
  ------------------
 1538|   139k|    inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
 1539|       |
 1540|   139k|  mbmi->current_qindex = xd->current_base_qindex;
 1541|       |
 1542|   139k|  xd->above_txfm_context =
 1543|   139k|      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
 1544|   139k|  xd->left_txfm_context =
 1545|   139k|      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
  ------------------
  |  |   50|   139k|#define MAX_MIB_MASK (MAX_MIB_SIZE - 1)
  |  |  ------------------
  |  |  |  |   44|   139k|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   43|   139k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   31|   139k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   39|   139k|#define MI_SIZE_LOG2 2
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1546|       |
 1547|   139k|  if (inter_block)
  ------------------
  |  Branch (1547:7): [True: 76.7k, False: 62.8k]
  ------------------
 1548|  76.7k|    read_inter_block_mode_info(pbi, dcb, mbmi, r);
 1549|  62.8k|  else
 1550|  62.8k|    read_intra_block_mode_info(cm, xd, mbmi, r);
 1551|   139k|}
decodemv.c:read_inter_segment_id:
  364|   277k|                                 int preskip, aom_reader *r) {
  365|   277k|  struct segmentation *const seg = &cm->seg;
  366|   277k|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
  367|   277k|  MB_MODE_INFO *const mbmi = xd->mi[0];
  368|   277k|  const int mi_row = xd->mi_row;
  369|   277k|  const int mi_col = xd->mi_col;
  370|   277k|  const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
  371|   277k|  const int bw = mi_size_wide[mbmi->bsize];
  372|   277k|  const int bh = mi_size_high[mbmi->bsize];
  373|       |
  374|       |  // TODO(slavarnway): move x_mis, y_mis into xd ?????
  375|   277k|  const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw);
  ------------------
  |  |   34|   277k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 35.0k, False: 242k]
  |  |  ------------------
  ------------------
  376|   277k|  const int y_mis = AOMMIN(mi_params->mi_rows - mi_row, bh);
  ------------------
  |  |   34|   277k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 22.6k, False: 255k]
  |  |  ------------------
  ------------------
  377|       |
  378|   277k|  if (!seg->enabled) return 0;  // Default for disabled segmentation
  ------------------
  |  Branch (378:7): [True: 272k, False: 5.31k]
  ------------------
  379|       |
  380|  5.31k|  if (!seg->update_map) {
  ------------------
  |  Branch (380:7): [True: 3.91k, False: 1.40k]
  ------------------
  381|  3.91k|    copy_segment_id(mi_params, cm->last_frame_seg_map, cm->cur_frame->seg_map,
  382|  3.91k|                    mi_offset, x_mis, y_mis);
  383|  3.91k|    return get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
  384|  3.91k|  }
  385|       |
  386|  1.40k|  uint8_t segment_id;
  387|  1.40k|  const int mi_stride = cm->mi_params.mi_cols;
  388|  1.40k|  if (preskip) {
  ------------------
  |  Branch (388:7): [True: 1.29k, False: 112]
  ------------------
  389|  1.29k|    if (!seg->segid_preskip) return 0;
  ------------------
  |  Branch (389:9): [True: 112, False: 1.18k]
  ------------------
  390|  1.29k|  } else {
  391|    112|    if (mbmi->skip_txfm) {
  ------------------
  |  Branch (391:9): [True: 28, False: 84]
  ------------------
  392|     28|      if (seg->temporal_update) {
  ------------------
  |  Branch (392:11): [True: 8, False: 20]
  ------------------
  393|      8|        mbmi->seg_id_predicted = 0;
  394|      8|      }
  395|     28|      segment_id = read_segment_id(cm, xd, r, 1);
  396|     28|      set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride,
  397|     28|                     segment_id);
  398|     28|      return segment_id;
  399|     28|    }
  400|    112|  }
  401|       |
  402|  1.26k|  if (seg->temporal_update) {
  ------------------
  |  Branch (402:7): [True: 1.03k, False: 226]
  ------------------
  403|  1.03k|    const uint8_t ctx = av1_get_pred_context_seg_id(xd);
  404|  1.03k|    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
  405|  1.03k|    struct segmentation_probs *const segp = &ec_ctx->seg;
  406|  1.03k|    aom_cdf_prob *pred_cdf = segp->pred_cdf[ctx];
  407|  1.03k|    mbmi->seg_id_predicted = aom_read_symbol(r, pred_cdf, 2, ACCT_STR);
  ------------------
  |  |   51|  1.03k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  408|  1.03k|    if (mbmi->seg_id_predicted) {
  ------------------
  |  Branch (408:9): [True: 572, False: 466]
  ------------------
  409|    572|      segment_id = get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
  410|    572|    } else {
  411|    466|      segment_id = read_segment_id(cm, xd, r, 0);
  412|    466|    }
  413|  1.03k|  } else {
  414|    226|    segment_id = read_segment_id(cm, xd, r, 0);
  415|    226|  }
  416|  1.26k|  set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride,
  417|  1.26k|                 segment_id);
  418|  1.26k|  return segment_id;
  419|  1.40k|}
decodemv.c:copy_segment_id:
  339|  3.91k|                            int x_mis, int y_mis) {
  340|  3.91k|  const int stride = mi_params->mi_cols;
  341|  3.91k|  if (last_segment_ids) {
  ------------------
  |  Branch (341:7): [True: 3.91k, False: 0]
  ------------------
  342|  3.91k|    assert(last_segment_ids != current_segment_ids);
  343|  16.0k|    for (int y = 0; y < y_mis; y++) {
  ------------------
  |  Branch (343:21): [True: 12.1k, False: 3.91k]
  ------------------
  344|  12.1k|      memcpy(&current_segment_ids[mi_offset + y * stride],
  345|  12.1k|             &last_segment_ids[mi_offset + y * stride],
  346|  12.1k|             sizeof(current_segment_ids[0]) * x_mis);
  347|  12.1k|    }
  348|  3.91k|  } else {
  349|      0|    for (int y = 0; y < y_mis; y++) {
  ------------------
  |  Branch (349:21): [True: 0, False: 0]
  ------------------
  350|      0|      memset(&current_segment_ids[mi_offset + y * stride], 0,
  351|      0|             sizeof(current_segment_ids[0]) * x_mis);
  352|      0|    }
  353|      0|  }
  354|  3.91k|}
decodemv.c:get_predicted_segment_id:
  357|  4.48k|                                    int x_mis, int y_mis) {
  358|  4.48k|  return cm->last_frame_seg_map ? dec_get_segment_id(cm, cm->last_frame_seg_map,
  ------------------
  |  Branch (358:10): [True: 4.48k, False: 0]
  ------------------
  359|  4.48k|                                                     mi_offset, x_mis, y_mis)
  360|  4.48k|                                : 0;
  361|  4.48k|}
decodemv.c:dec_get_segment_id:
  302|  4.48k|                              int mi_offset, int x_mis, int y_mis) {
  303|  4.48k|  int segment_id = INT_MAX;
  304|       |
  305|  19.3k|  for (int y = 0; y < y_mis; y++)
  ------------------
  |  Branch (305:19): [True: 14.8k, False: 4.48k]
  ------------------
  306|  80.2k|    for (int x = 0; x < x_mis; x++)
  ------------------
  |  Branch (306:21): [True: 65.3k, False: 14.8k]
  ------------------
  307|  65.3k|      segment_id = AOMMIN(
  ------------------
  |  |   34|  65.3k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 65.3k]
  |  |  ------------------
  ------------------
  308|  4.48k|          segment_id, segment_ids[mi_offset + y * cm->mi_params.mi_cols + x]);
  309|       |
  310|       |  assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
  311|  4.48k|  return segment_id;
  312|  4.48k|}
decodemv.c:read_skip_mode:
  422|   139k|                          aom_reader *r) {
  423|   139k|  if (!cm->current_frame.skip_mode_info.skip_mode_flag) return 0;
  ------------------
  |  Branch (423:7): [True: 131k, False: 7.72k]
  ------------------
  424|       |
  425|  7.72k|  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
  ------------------
  |  Branch (425:7): [True: 0, False: 7.72k]
  ------------------
  426|      0|    return 0;
  427|      0|  }
  428|       |
  429|  7.72k|  if (!is_comp_ref_allowed(xd->mi[0]->bsize)) return 0;
  ------------------
  |  Branch (429:7): [True: 2.18k, False: 5.54k]
  ------------------
  430|       |
  431|  5.54k|  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ||
  ------------------
  |  Branch (431:7): [True: 0, False: 5.54k]
  ------------------
  432|  5.54k|      segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
  ------------------
  |  Branch (432:7): [True: 0, False: 5.54k]
  ------------------
  433|       |    // These features imply single-reference mode, while skip mode implies
  434|       |    // compound reference. Hence, the two are mutually exclusive.
  435|       |    // In other words, skip_mode is implicitly 0 here.
  436|      0|    return 0;
  437|      0|  }
  438|       |
  439|  5.54k|  const int ctx = av1_get_skip_mode_context(xd);
  440|  5.54k|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
  441|  5.54k|  const int skip_mode =
  442|  5.54k|      aom_read_symbol(r, ec_ctx->skip_mode_cdfs[ctx], 2, ACCT_STR);
  ------------------
  |  |   51|  5.54k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  443|  5.54k|  return skip_mode;
  444|  5.54k|}
decodemv.c:read_is_inter_block:
 1224|   139k|                               int segment_id, aom_reader *r) {
 1225|   139k|  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
  ------------------
  |  Branch (1225:7): [True: 88, False: 139k]
  ------------------
 1226|     88|    const int frame = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
 1227|     88|    if (frame < LAST_FRAME) return 0;
  ------------------
  |  Branch (1227:9): [True: 4, False: 84]
  ------------------
 1228|     84|    return frame != INTRA_FRAME;
 1229|     88|  }
 1230|   139k|  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
  ------------------
  |  Branch (1230:7): [True: 88, False: 139k]
  ------------------
 1231|     88|    return 1;
 1232|     88|  }
 1233|   139k|  const int ctx = av1_get_intra_inter_context(xd);
 1234|   139k|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 1235|   139k|  const int is_inter =
 1236|   139k|      aom_read_symbol(r, ec_ctx->intra_inter_cdf[ctx], 2, ACCT_STR);
  ------------------
  |  |   51|   139k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1237|   139k|  return is_inter;
 1238|   139k|}
decodemv.c:read_inter_block_mode_info:
 1276|  76.7k|                                       aom_reader *r) {
 1277|  76.7k|  AV1_COMMON *const cm = &pbi->common;
 1278|  76.7k|  FeatureFlags *const features = &cm->features;
 1279|  76.7k|  const BLOCK_SIZE bsize = mbmi->bsize;
 1280|  76.7k|  const int allow_hp = features->allow_high_precision_mv;
 1281|  76.7k|  int_mv nearestmv[2], nearmv[2];
 1282|  76.7k|  int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } };
 1283|  76.7k|  int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
 1284|  76.7k|  int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
 1285|  76.7k|  MACROBLOCKD *const xd = &dcb->xd;
 1286|  76.7k|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 1287|       |
 1288|  76.7k|  mbmi->uv_mode = UV_DC_PRED;
 1289|  76.7k|  mbmi->palette_mode_info.palette_size[0] = 0;
 1290|  76.7k|  mbmi->palette_mode_info.palette_size[1] = 0;
 1291|       |
 1292|  76.7k|  av1_collect_neighbors_ref_counts(xd);
 1293|       |
 1294|  76.7k|  read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
 1295|  76.7k|  const int is_compound = has_second_ref(mbmi);
 1296|       |
 1297|  76.7k|  const MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame);
 1298|  76.7k|  av1_find_mv_refs(cm, xd, mbmi, ref_frame, dcb->ref_mv_count, xd->ref_mv_stack,
 1299|  76.7k|                   xd->weight, ref_mvs, /*global_mvs=*/NULL, inter_mode_ctx);
 1300|       |
 1301|  76.7k|  mbmi->ref_mv_idx = 0;
 1302|       |
 1303|  76.7k|  if (mbmi->skip_mode) {
  ------------------
  |  Branch (1303:7): [True: 110, False: 76.5k]
  ------------------
 1304|    110|    assert(is_compound);
 1305|    110|    mbmi->mode = NEAREST_NEARESTMV;
 1306|  76.5k|  } else {
 1307|  76.5k|    if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
  ------------------
  |  Branch (1307:9): [True: 843, False: 75.7k]
  ------------------
 1308|  75.7k|        segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_GLOBALMV)) {
  ------------------
  |  Branch (1308:9): [True: 113, False: 75.6k]
  ------------------
 1309|    938|      mbmi->mode = GLOBALMV;
 1310|  75.6k|    } else {
 1311|  75.6k|      const int mode_ctx =
 1312|  75.6k|          av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame);
 1313|  75.6k|      if (is_compound)
  ------------------
  |  Branch (1313:11): [True: 11.8k, False: 63.7k]
  ------------------
 1314|  11.8k|        mbmi->mode = read_inter_compound_mode(xd, r, mode_ctx);
 1315|  63.7k|      else
 1316|  63.7k|        mbmi->mode = read_inter_mode(ec_ctx, r, mode_ctx);
 1317|  75.6k|      if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV ||
  ------------------
  |  Branch (1317:11): [True: 23.9k, False: 51.6k]
  |  Branch (1317:34): [True: 1.73k, False: 49.9k]
  ------------------
 1318|  49.9k|          have_nearmv_in_inter_mode(mbmi->mode))
  ------------------
  |  Branch (1318:11): [True: 12.5k, False: 37.4k]
  ------------------
 1319|  38.2k|        read_drl_idx(ec_ctx, dcb, mbmi, r);
 1320|  75.6k|    }
 1321|  76.5k|  }
 1322|       |
 1323|  76.7k|  if (is_compound != is_inter_compound_mode(mbmi->mode)) {
  ------------------
  |  Branch (1323:7): [True: 0, False: 76.7k]
  ------------------
 1324|      0|    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
 1325|      0|                       "Prediction mode %d invalid with ref frame %d %d",
 1326|      0|                       mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
 1327|      0|  }
 1328|       |
 1329|  76.7k|  if (!is_compound && mbmi->mode != GLOBALMV) {
  ------------------
  |  Branch (1329:7): [True: 64.7k, False: 11.9k]
  |  Branch (1329:23): [True: 62.3k, False: 2.38k]
  ------------------
 1330|  62.3k|    av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[0]], &nearestmv[0],
 1331|  62.3k|                          &nearmv[0], features->cur_frame_force_integer_mv);
 1332|  62.3k|  }
 1333|       |
 1334|  76.7k|  if (is_compound && mbmi->mode != GLOBAL_GLOBALMV) {
  ------------------
  |  Branch (1334:7): [True: 12.0k, False: 64.7k]
  |  Branch (1334:22): [True: 10.6k, False: 1.38k]
  ------------------
 1335|  10.6k|    const int ref_mv_idx = mbmi->ref_mv_idx + 1;
 1336|  10.6k|    nearestmv[0] = xd->ref_mv_stack[ref_frame][0].this_mv;
 1337|  10.6k|    nearestmv[1] = xd->ref_mv_stack[ref_frame][0].comp_mv;
 1338|  10.6k|    nearmv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv;
 1339|  10.6k|    nearmv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv;
 1340|  10.6k|    lower_mv_precision(&nearestmv[0].as_mv, allow_hp,
 1341|  10.6k|                       features->cur_frame_force_integer_mv);
 1342|  10.6k|    lower_mv_precision(&nearestmv[1].as_mv, allow_hp,
 1343|  10.6k|                       features->cur_frame_force_integer_mv);
 1344|  10.6k|    lower_mv_precision(&nearmv[0].as_mv, allow_hp,
 1345|  10.6k|                       features->cur_frame_force_integer_mv);
 1346|  10.6k|    lower_mv_precision(&nearmv[1].as_mv, allow_hp,
 1347|  10.6k|                       features->cur_frame_force_integer_mv);
 1348|  66.0k|  } else if (mbmi->ref_mv_idx > 0 && mbmi->mode == NEARMV) {
  ------------------
  |  Branch (1348:14): [True: 7.23k, False: 58.8k]
  |  Branch (1348:38): [True: 1.04k, False: 6.18k]
  ------------------
 1349|  1.04k|    nearmv[0] =
 1350|  1.04k|        xd->ref_mv_stack[mbmi->ref_frame[0]][1 + mbmi->ref_mv_idx].this_mv;
 1351|  1.04k|  }
 1352|       |
 1353|  76.7k|  int_mv ref_mv[2] = { nearestmv[0], nearestmv[1] };
 1354|       |
 1355|  76.7k|  if (is_compound) {
  ------------------
  |  Branch (1355:7): [True: 12.0k, False: 64.7k]
  ------------------
 1356|  12.0k|    int ref_mv_idx = mbmi->ref_mv_idx;
 1357|       |    // Special case: NEAR_NEWMV and NEW_NEARMV modes use
 1358|       |    // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
 1359|       |    // mbmi->ref_mv_idx (like NEWMV)
 1360|  12.0k|    if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
  ------------------
  |  Branch (1360:9): [True: 440, False: 11.5k]
  |  Branch (1360:37): [True: 352, False: 11.2k]
  ------------------
 1361|    792|      ref_mv_idx = 1 + mbmi->ref_mv_idx;
 1362|       |
 1363|       |    // TODO(jingning, yunqing): Do we need a lower_mv_precision() call here?
 1364|  12.0k|    if (compound_ref0_mode(mbmi->mode) == NEWMV)
  ------------------
  |  Branch (1364:9): [True: 2.88k, False: 9.11k]
  ------------------
 1365|  2.88k|      ref_mv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv;
 1366|       |
 1367|  12.0k|    if (compound_ref1_mode(mbmi->mode) == NEWMV)
  ------------------
  |  Branch (1367:9): [True: 2.96k, False: 9.03k]
  ------------------
 1368|  2.96k|      ref_mv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv;
 1369|  64.7k|  } else {
 1370|  64.7k|    if (mbmi->mode == NEWMV) {
  ------------------
  |  Branch (1370:9): [True: 23.9k, False: 40.7k]
  ------------------
 1371|  23.9k|      if (dcb->ref_mv_count[ref_frame] > 1)
  ------------------
  |  Branch (1371:11): [True: 15.0k, False: 8.88k]
  ------------------
 1372|  15.0k|        ref_mv[0] = xd->ref_mv_stack[ref_frame][mbmi->ref_mv_idx].this_mv;
 1373|  23.9k|    }
 1374|  64.7k|  }
 1375|       |
 1376|  76.7k|  if (mbmi->skip_mode) assert(mbmi->mode == NEAREST_NEARESTMV);
  ------------------
  |  Branch (1376:7): [True: 110, False: 76.5k]
  ------------------
 1377|       |
 1378|  76.7k|  const int mv_corrupted_flag =
 1379|  76.7k|      !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, mbmi->mv, ref_mv,
 1380|  76.7k|                 nearestmv, nearmv, is_compound, allow_hp, r);
 1381|  76.7k|  aom_merge_corrupted_flag(&dcb->corrupted, mv_corrupted_flag);
 1382|       |
 1383|  76.7k|  mbmi->use_wedge_interintra = 0;
 1384|  76.7k|  if (cm->seq_params->enable_interintra_compound && !mbmi->skip_mode &&
  ------------------
  |  Branch (1384:7): [True: 28.7k, False: 47.9k]
  |  Branch (1384:53): [True: 28.7k, False: 0]
  ------------------
 1385|  28.7k|      is_interintra_allowed(mbmi)) {
  ------------------
  |  Branch (1385:7): [True: 14.2k, False: 14.5k]
  ------------------
 1386|  14.2k|    const int bsize_group = size_group_lookup[bsize];
 1387|  14.2k|    const int interintra =
 1388|  14.2k|        aom_read_symbol(r, ec_ctx->interintra_cdf[bsize_group], 2, ACCT_STR);
  ------------------
  |  |   51|  14.2k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1389|  14.2k|    assert(mbmi->ref_frame[1] == NONE_FRAME);
 1390|  14.2k|    if (interintra) {
  ------------------
  |  Branch (1390:9): [True: 2.50k, False: 11.7k]
  ------------------
 1391|  2.50k|      const INTERINTRA_MODE interintra_mode =
 1392|  2.50k|          read_interintra_mode(xd, r, bsize_group);
 1393|  2.50k|      mbmi->ref_frame[1] = INTRA_FRAME;
 1394|  2.50k|      mbmi->interintra_mode = interintra_mode;
 1395|  2.50k|      mbmi->angle_delta[PLANE_TYPE_Y] = 0;
 1396|  2.50k|      mbmi->angle_delta[PLANE_TYPE_UV] = 0;
 1397|  2.50k|      mbmi->filter_intra_mode_info.use_filter_intra = 0;
 1398|  2.50k|      if (av1_is_wedge_used(bsize)) {
  ------------------
  |  Branch (1398:11): [True: 2.50k, False: 0]
  ------------------
 1399|  2.50k|        mbmi->use_wedge_interintra = aom_read_symbol(
  ------------------
  |  |   51|  2.50k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1400|  2.50k|            r, ec_ctx->wedge_interintra_cdf[bsize], 2, ACCT_STR);
 1401|  2.50k|        if (mbmi->use_wedge_interintra) {
  ------------------
  |  Branch (1401:13): [True: 723, False: 1.78k]
  ------------------
 1402|    723|          mbmi->interintra_wedge_index = (int8_t)aom_read_symbol(
  ------------------
  |  |   51|    723|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1403|    723|              r, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES, ACCT_STR);
 1404|    723|        }
 1405|  2.50k|      }
 1406|  2.50k|    }
 1407|  14.2k|  }
 1408|       |
 1409|   165k|  for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
  ------------------
  |  Branch (1409:21): [True: 88.6k, False: 76.7k]
  ------------------
 1410|  88.6k|    const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
 1411|  88.6k|    xd->block_ref_scale_factors[ref] = get_ref_scale_factors_const(cm, frame);
 1412|  88.6k|  }
 1413|       |
 1414|  76.7k|  mbmi->motion_mode = SIMPLE_TRANSLATION;
 1415|  76.7k|  if (is_motion_variation_allowed_bsize(mbmi->bsize) && !mbmi->skip_mode &&
  ------------------
  |  Branch (1415:7): [True: 52.1k, False: 24.5k]
  |  Branch (1415:57): [True: 52.0k, False: 111]
  ------------------
 1416|  52.0k|      !has_second_ref(mbmi)) {
  ------------------
  |  Branch (1416:7): [True: 40.1k, False: 11.8k]
  ------------------
 1417|  40.1k|    mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref);
 1418|  40.1k|  }
 1419|  76.7k|  av1_count_overlappable_neighbors(cm, xd);
 1420|       |
 1421|  76.7k|  if (mbmi->ref_frame[1] != INTRA_FRAME)
  ------------------
  |  Branch (1421:7): [True: 74.1k, False: 2.51k]
  ------------------
 1422|  74.1k|    mbmi->motion_mode = read_motion_mode(cm, xd, mbmi, r);
 1423|       |
 1424|       |  // init
 1425|  76.7k|  mbmi->comp_group_idx = 0;
 1426|  76.7k|  mbmi->compound_idx = 1;
 1427|  76.7k|  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
 1428|       |
 1429|  76.7k|  if (has_second_ref(mbmi) && !mbmi->skip_mode) {
  ------------------
  |  Branch (1429:7): [True: 12.0k, False: 64.7k]
  |  Branch (1429:31): [True: 11.8k, False: 110]
  ------------------
 1430|       |    // Read idx to indicate current compound inter prediction mode group
 1431|  11.8k|    const int masked_compound_used = is_any_masked_compound_used(bsize) &&
  ------------------
  |  Branch (1431:38): [True: 11.8k, False: 12]
  ------------------
 1432|  11.8k|                                     cm->seq_params->enable_masked_compound;
  ------------------
  |  Branch (1432:38): [True: 7.69k, False: 4.19k]
  ------------------
 1433|       |
 1434|  11.8k|    if (masked_compound_used) {
  ------------------
  |  Branch (1434:9): [True: 7.69k, False: 4.20k]
  ------------------
 1435|  7.69k|      const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
 1436|  7.69k|      mbmi->comp_group_idx = (uint8_t)aom_read_symbol(
  ------------------
  |  |   51|  7.69k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1437|  7.69k|          r, ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2, ACCT_STR);
 1438|  7.69k|    }
 1439|       |
 1440|  11.8k|    if (mbmi->comp_group_idx == 0) {
  ------------------
  |  Branch (1440:9): [True: 10.2k, False: 1.65k]
  ------------------
 1441|  10.2k|      if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) {
  ------------------
  |  Branch (1441:11): [True: 6.20k, False: 4.03k]
  ------------------
 1442|  6.20k|        const int comp_index_ctx = get_comp_index_context(cm, xd);
 1443|  6.20k|        mbmi->compound_idx = (uint8_t)aom_read_symbol(
  ------------------
  |  |   51|  6.20k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1444|  6.20k|            r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR);
 1445|  6.20k|        mbmi->interinter_comp.type =
 1446|  6.20k|            mbmi->compound_idx ? COMPOUND_AVERAGE : COMPOUND_DISTWTD;
  ------------------
  |  Branch (1446:13): [True: 4.15k, False: 2.04k]
  ------------------
 1447|  6.20k|      } else {
 1448|       |        // Distance-weighted compound is disabled, so always use average
 1449|  4.03k|        mbmi->compound_idx = 1;
 1450|  4.03k|        mbmi->interinter_comp.type = COMPOUND_AVERAGE;
 1451|  4.03k|      }
 1452|  10.2k|    } else {
 1453|  1.65k|      assert(cm->current_frame.reference_mode != SINGLE_REFERENCE &&
 1454|  1.65k|             is_inter_compound_mode(mbmi->mode) &&
 1455|  1.65k|             mbmi->motion_mode == SIMPLE_TRANSLATION);
 1456|  1.65k|      assert(masked_compound_used);
 1457|       |
 1458|       |      // compound_diffwtd, wedge
 1459|  1.65k|      if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
  ------------------
  |  Branch (1459:11): [True: 1.51k, False: 139]
  ------------------
 1460|  1.51k|        mbmi->interinter_comp.type =
 1461|  1.51k|            COMPOUND_WEDGE + aom_read_symbol(r,
  ------------------
  |  |   51|  1.51k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1462|  1.51k|                                             ec_ctx->compound_type_cdf[bsize],
 1463|  1.51k|                                             MASKED_COMPOUND_TYPES, ACCT_STR);
 1464|  1.51k|      } else {
 1465|    139|        mbmi->interinter_comp.type = COMPOUND_DIFFWTD;
 1466|    139|      }
 1467|       |
 1468|  1.65k|      if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
  ------------------
  |  Branch (1468:11): [True: 760, False: 897]
  ------------------
 1469|    760|        assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
 1470|    760|        mbmi->interinter_comp.wedge_index = (int8_t)aom_read_symbol(
  ------------------
  |  |   51|    760|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1471|    760|            r, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES, ACCT_STR);
 1472|    760|        mbmi->interinter_comp.wedge_sign = (int8_t)aom_read_bit(r, ACCT_STR);
  ------------------
  |  |   43|    760|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1473|    897|      } else {
 1474|    897|        assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
 1475|    897|        mbmi->interinter_comp.mask_type =
 1476|    897|            aom_read_literal(r, MAX_DIFFWTD_MASK_BITS, ACCT_STR);
  ------------------
  |  |   47|    897|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1477|    897|      }
 1478|  1.65k|    }
 1479|  11.8k|  }
 1480|       |
 1481|  76.7k|  read_mb_interp_filter(xd, features->interp_filter,
 1482|  76.7k|                        cm->seq_params->enable_dual_filter, mbmi, r);
 1483|       |
 1484|  76.7k|  if (mbmi->motion_mode == WARPED_CAUSAL) {
  ------------------
  |  Branch (1484:7): [True: 3.45k, False: 73.2k]
  ------------------
 1485|  3.45k|    const int mi_row = xd->mi_row;
 1486|  3.45k|    const int mi_col = xd->mi_col;
 1487|  3.45k|    mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
  ------------------
  |  |   32|  3.45k|#define DEFAULT_WMTYPE AFFINE
  ------------------
 1488|  3.45k|    mbmi->wm_params.invalid = 0;
 1489|       |
 1490|  3.45k|    if (mbmi->num_proj_ref > 1) {
  ------------------
  |  Branch (1490:9): [True: 2.13k, False: 1.32k]
  ------------------
 1491|  2.13k|      mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
 1492|  2.13k|                                             mbmi->num_proj_ref, bsize);
 1493|  2.13k|    }
 1494|       |
 1495|  3.45k|    if (av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
  ------------------
  |  Branch (1495:9): [True: 218, False: 3.23k]
  ------------------
 1496|  3.45k|                            mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
 1497|  3.45k|                            &mbmi->wm_params, mi_row, mi_col)) {
 1498|       |#if WARPED_MOTION_DEBUG
 1499|       |      printf("Warning: unexpected warped model from aomenc\n");
 1500|       |#endif
 1501|    218|      mbmi->wm_params.invalid = 1;
 1502|    218|    }
 1503|  3.45k|  }
 1504|       |
 1505|  76.7k|  xd->cfl.store_y = store_cfl_required(cm, xd);
 1506|       |
 1507|       |#if DEC_MISMATCH_DEBUG
 1508|       |  dec_dump_logs(cm, mi, mi_row, mi_col, mode_ctx);
 1509|       |#endif  // DEC_MISMATCH_DEBUG
 1510|  76.7k|}
decodemv.c:read_ref_frames:
  940|  76.7k|                            MV_REFERENCE_FRAME ref_frame[2]) {
  941|  76.7k|  if (xd->mi[0]->skip_mode) {
  ------------------
  |  Branch (941:7): [True: 110, False: 76.6k]
  ------------------
  942|    110|    set_ref_frames_for_skip_mode(cm, ref_frame);
  943|    110|    return;
  944|    110|  }
  945|       |
  946|  76.6k|  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
  ------------------
  |  Branch (946:7): [True: 84, False: 76.5k]
  ------------------
  947|     84|    ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id,
  948|     84|                                                   SEG_LVL_REF_FRAME);
  949|     84|    ref_frame[1] = NONE_FRAME;
  950|  76.5k|  } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) ||
  ------------------
  |  Branch (950:14): [True: 787, False: 75.7k]
  ------------------
  951|  75.7k|             segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
  ------------------
  |  Branch (951:14): [True: 104, False: 75.6k]
  ------------------
  952|    864|    ref_frame[0] = LAST_FRAME;
  953|    864|    ref_frame[1] = NONE_FRAME;
  954|  75.6k|  } else {
  955|  75.6k|    const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
  956|       |
  957|  75.6k|    if (mode == COMPOUND_REFERENCE) {
  ------------------
  |  Branch (957:9): [True: 11.8k, False: 63.7k]
  ------------------
  958|  11.8k|      const COMP_REFERENCE_TYPE comp_ref_type = read_comp_reference_type(xd, r);
  959|       |
  960|  11.8k|      if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
  ------------------
  |  Branch (960:11): [True: 2.75k, False: 9.13k]
  ------------------
  961|  2.75k|        const int bit = READ_REF_BIT(uni_comp_ref_p);
  ------------------
  |  |  920|  2.75k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  2.75k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
  962|  2.75k|        if (bit) {
  ------------------
  |  Branch (962:13): [True: 998, False: 1.75k]
  ------------------
  963|    998|          ref_frame[0] = BWDREF_FRAME;
  964|    998|          ref_frame[1] = ALTREF_FRAME;
  965|  1.75k|        } else {
  966|  1.75k|          const int bit1 = READ_REF_BIT(uni_comp_ref_p1);
  ------------------
  |  |  920|  1.75k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  1.75k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
  967|  1.75k|          if (bit1) {
  ------------------
  |  Branch (967:15): [True: 1.23k, False: 526]
  ------------------
  968|  1.23k|            const int bit2 = READ_REF_BIT(uni_comp_ref_p2);
  ------------------
  |  |  920|  1.23k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  1.23k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
  969|  1.23k|            if (bit2) {
  ------------------
  |  Branch (969:17): [True: 547, False: 683]
  ------------------
  970|    547|              ref_frame[0] = LAST_FRAME;
  971|    547|              ref_frame[1] = GOLDEN_FRAME;
  972|    683|            } else {
  973|    683|              ref_frame[0] = LAST_FRAME;
  974|    683|              ref_frame[1] = LAST3_FRAME;
  975|    683|            }
  976|  1.23k|          } else {
  977|    526|            ref_frame[0] = LAST_FRAME;
  978|    526|            ref_frame[1] = LAST2_FRAME;
  979|    526|          }
  980|  1.75k|        }
  981|       |
  982|  2.75k|        return;
  983|  2.75k|      }
  984|       |
  985|  11.8k|      assert(comp_ref_type == BIDIR_COMP_REFERENCE);
  986|       |
  987|  9.13k|      const int idx = 1;
  988|  9.13k|      const int bit = READ_REF_BIT(comp_ref_p);
  ------------------
  |  |  920|  9.13k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  9.13k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
  989|       |      // Decode forward references.
  990|  9.13k|      if (!bit) {
  ------------------
  |  Branch (990:11): [True: 6.19k, False: 2.94k]
  ------------------
  991|  6.19k|        const int bit1 = READ_REF_BIT(comp_ref_p1);
  ------------------
  |  |  920|  6.19k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  6.19k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
  992|  6.19k|        ref_frame[!idx] = bit1 ? LAST2_FRAME : LAST_FRAME;
  ------------------
  |  Branch (992:27): [True: 1.16k, False: 5.02k]
  ------------------
  993|  6.19k|      } else {
  994|  2.94k|        const int bit2 = READ_REF_BIT(comp_ref_p2);
  ------------------
  |  |  920|  2.94k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  2.94k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
  995|  2.94k|        ref_frame[!idx] = bit2 ? GOLDEN_FRAME : LAST3_FRAME;
  ------------------
  |  Branch (995:27): [True: 1.55k, False: 1.38k]
  ------------------
  996|  2.94k|      }
  997|       |
  998|       |      // Decode backward references.
  999|  9.13k|      const int bit_bwd = READ_REF_BIT(comp_bwdref_p);
  ------------------
  |  |  920|  9.13k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  9.13k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
 1000|  9.13k|      if (!bit_bwd) {
  ------------------
  |  Branch (1000:11): [True: 4.75k, False: 4.38k]
  ------------------
 1001|  4.75k|        const int bit1_bwd = READ_REF_BIT(comp_bwdref_p1);
  ------------------
  |  |  920|  4.75k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  4.75k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
 1002|  4.75k|        ref_frame[idx] = bit1_bwd ? ALTREF2_FRAME : BWDREF_FRAME;
  ------------------
  |  Branch (1002:26): [True: 2.62k, False: 2.12k]
  ------------------
 1003|  4.75k|      } else {
 1004|  4.38k|        ref_frame[idx] = ALTREF_FRAME;
 1005|  4.38k|      }
 1006|  63.7k|    } else if (mode == SINGLE_REFERENCE) {
  ------------------
  |  Branch (1006:16): [True: 63.7k, False: 9]
  ------------------
 1007|  63.7k|      const int bit0 = READ_REF_BIT(single_ref_p1);
  ------------------
  |  |  920|  63.7k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  63.7k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
 1008|  63.7k|      if (bit0) {
  ------------------
  |  Branch (1008:11): [True: 22.8k, False: 40.8k]
  ------------------
 1009|  22.8k|        const int bit1 = READ_REF_BIT(single_ref_p2);
  ------------------
  |  |  920|  22.8k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  22.8k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
 1010|  22.8k|        if (!bit1) {
  ------------------
  |  Branch (1010:13): [True: 10.2k, False: 12.6k]
  ------------------
 1011|  10.2k|          const int bit5 = READ_REF_BIT(single_ref_p6);
  ------------------
  |  |  920|  10.2k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  10.2k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
 1012|  10.2k|          ref_frame[0] = bit5 ? ALTREF2_FRAME : BWDREF_FRAME;
  ------------------
  |  Branch (1012:26): [True: 5.54k, False: 4.71k]
  ------------------
 1013|  12.6k|        } else {
 1014|  12.6k|          ref_frame[0] = ALTREF_FRAME;
 1015|  12.6k|        }
 1016|  40.8k|      } else {
 1017|  40.8k|        const int bit2 = READ_REF_BIT(single_ref_p3);
  ------------------
  |  |  920|  40.8k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  40.8k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
 1018|  40.8k|        if (bit2) {
  ------------------
  |  Branch (1018:13): [True: 9.40k, False: 31.4k]
  ------------------
 1019|  9.40k|          const int bit4 = READ_REF_BIT(single_ref_p5);
  ------------------
  |  |  920|  9.40k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  9.40k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
 1020|  9.40k|          ref_frame[0] = bit4 ? GOLDEN_FRAME : LAST3_FRAME;
  ------------------
  |  Branch (1020:26): [True: 6.35k, False: 3.05k]
  ------------------
 1021|  31.4k|        } else {
 1022|  31.4k|          const int bit3 = READ_REF_BIT(single_ref_p4);
  ------------------
  |  |  920|  31.4k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  31.4k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
 1023|  31.4k|          ref_frame[0] = bit3 ? LAST2_FRAME : LAST_FRAME;
  ------------------
  |  Branch (1023:26): [True: 2.75k, False: 28.7k]
  ------------------
 1024|  31.4k|        }
 1025|  40.8k|      }
 1026|       |
 1027|  63.7k|      ref_frame[1] = NONE_FRAME;
 1028|  63.7k|    } else {
 1029|       |      assert(0 && "Invalid prediction mode.");
 1030|      9|    }
 1031|  75.6k|  }
 1032|  76.6k|}
decodemv.c:set_ref_frames_for_skip_mode:
  932|    110|                                         MV_REFERENCE_FRAME ref_frame[2]) {
  933|    110|  ref_frame[0] = LAST_FRAME + cm->current_frame.skip_mode_info.ref_frame_idx_0;
  934|    110|  ref_frame[1] = LAST_FRAME + cm->current_frame.skip_mode_info.ref_frame_idx_1;
  935|    110|}
decodemv.c:read_block_reference_mode:
  906|  75.6k|                                                aom_reader *r) {
  907|  75.6k|  if (!is_comp_ref_allowed(xd->mi[0]->bsize)) return SINGLE_REFERENCE;
  ------------------
  |  Branch (907:7): [True: 24.2k, False: 51.3k]
  ------------------
  908|  51.3k|  if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
  ------------------
  |  Branch (908:7): [True: 22.6k, False: 28.7k]
  ------------------
  909|  22.6k|    const int ctx = av1_get_reference_mode_context(xd);
  910|  22.6k|    const REFERENCE_MODE mode = (REFERENCE_MODE)aom_read_symbol(
  ------------------
  |  |   51|  22.6k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  911|  22.6k|        r, xd->tile_ctx->comp_inter_cdf[ctx], 2, ACCT_STR);
  912|  22.6k|    return mode;  // SINGLE_REFERENCE or COMPOUND_REFERENCE
  913|  28.7k|  } else {
  914|       |    assert(cm->current_frame.reference_mode == SINGLE_REFERENCE);
  915|  28.7k|    return cm->current_frame.reference_mode;
  916|  28.7k|  }
  917|  51.3k|}
decodemv.c:read_comp_reference_type:
  923|  11.8k|                                                    aom_reader *r) {
  924|  11.8k|  const int ctx = av1_get_comp_reference_type_context(xd);
  925|  11.8k|  const COMP_REFERENCE_TYPE comp_ref_type =
  926|  11.8k|      (COMP_REFERENCE_TYPE)aom_read_symbol(
  ------------------
  |  |   51|  11.8k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  927|  11.8k|          r, xd->tile_ctx->comp_ref_type_cdf[ctx], 2, ACCT_STR);
  928|  11.8k|  return comp_ref_type;  // UNIDIR_COMP_REFERENCE or BIDIR_COMP_REFERENCE
  929|  11.8k|}
decodemv.c:read_inter_compound_mode:
  250|  11.8k|                                                int16_t ctx) {
  251|  11.8k|  const int mode =
  252|  11.8k|      aom_read_symbol(r, xd->tile_ctx->inter_compound_mode_cdf[ctx],
  ------------------
  |  |   51|  11.8k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  253|  11.8k|                      INTER_COMPOUND_MODES, ACCT_STR);
  254|       |  assert(is_inter_compound_mode(NEAREST_NEARESTMV + mode));
  255|  11.8k|  return NEAREST_NEARESTMV + mode;
  256|  11.8k|}
decodemv.c:read_inter_mode:
  178|  63.7k|                                       int16_t ctx) {
  179|  63.7k|  int16_t mode_ctx = ctx & NEWMV_CTX_MASK;
  ------------------
  |  |  490|  63.7k|#define NEWMV_CTX_MASK ((1 << GLOBALMV_OFFSET) - 1)
  |  |  ------------------
  |  |  |  |  487|  63.7k|#define GLOBALMV_OFFSET 3
  |  |  ------------------
  ------------------
  180|  63.7k|  int is_newmv, is_zeromv, is_refmv;
  181|  63.7k|  is_newmv = aom_read_symbol(r, ec_ctx->newmv_cdf[mode_ctx], 2, ACCT_STR) == 0;
  ------------------
  |  |   51|  63.7k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  182|  63.7k|  if (is_newmv) return NEWMV;
  ------------------
  |  Branch (182:7): [True: 23.9k, False: 39.7k]
  ------------------
  183|       |
  184|  39.7k|  mode_ctx = (ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
  ------------------
  |  |  487|  39.7k|#define GLOBALMV_OFFSET 3
  ------------------
                mode_ctx = (ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
  ------------------
  |  |  491|  39.7k|#define GLOBALMV_CTX_MASK ((1 << (REFMV_OFFSET - GLOBALMV_OFFSET)) - 1)
  |  |  ------------------
  |  |  |  |  488|  39.7k|#define REFMV_OFFSET 4
  |  |  ------------------
  |  |               #define GLOBALMV_CTX_MASK ((1 << (REFMV_OFFSET - GLOBALMV_OFFSET)) - 1)
  |  |  ------------------
  |  |  |  |  487|  39.7k|#define GLOBALMV_OFFSET 3
  |  |  ------------------
  ------------------
  185|  39.7k|  is_zeromv =
  186|  39.7k|      aom_read_symbol(r, ec_ctx->zeromv_cdf[mode_ctx], 2, ACCT_STR) == 0;
  ------------------
  |  |   51|  39.7k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  187|  39.7k|  if (is_zeromv) return GLOBALMV;
  ------------------
  |  Branch (187:7): [True: 1.44k, False: 38.3k]
  ------------------
  188|       |
  189|  38.3k|  mode_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
  ------------------
  |  |  488|  38.3k|#define REFMV_OFFSET 4
  ------------------
                mode_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
  ------------------
  |  |  492|  38.3k|#define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1)
  |  |  ------------------
  |  |  |  |  488|  38.3k|#define REFMV_OFFSET 4
  |  |  ------------------
  ------------------
  190|  38.3k|  is_refmv = aom_read_symbol(r, ec_ctx->refmv_cdf[mode_ctx], 2, ACCT_STR) == 0;
  ------------------
  |  |   51|  38.3k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  191|  38.3k|  if (is_refmv)
  ------------------
  |  Branch (191:7): [True: 28.7k, False: 9.56k]
  ------------------
  192|  28.7k|    return NEARESTMV;
  193|  9.56k|  else
  194|  9.56k|    return NEARMV;
  195|  38.3k|}
decodemv.c:read_drl_idx:
  198|  38.2k|                         MB_MODE_INFO *mbmi, aom_reader *r) {
  199|  38.2k|  MACROBLOCKD *const xd = &dcb->xd;
  200|  38.2k|  uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
  201|  38.2k|  mbmi->ref_mv_idx = 0;
  202|  38.2k|  if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
  ------------------
  |  Branch (202:7): [True: 23.9k, False: 14.2k]
  |  Branch (202:30): [True: 1.73k, False: 12.5k]
  ------------------
  203|  55.4k|    for (int idx = 0; idx < 2; ++idx) {
  ------------------
  |  Branch (203:23): [True: 41.3k, False: 14.0k]
  ------------------
  204|  41.3k|      if (dcb->ref_mv_count[ref_frame_type] > idx + 1) {
  ------------------
  |  Branch (204:11): [True: 19.4k, False: 21.9k]
  ------------------
  205|  19.4k|        uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx);
  206|  19.4k|        int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
  ------------------
  |  |   51|  19.4k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  207|  19.4k|        mbmi->ref_mv_idx = idx + drl_idx;
  208|  19.4k|        if (!drl_idx) return;
  ------------------
  |  Branch (208:13): [True: 11.6k, False: 7.81k]
  ------------------
  209|  19.4k|      }
  210|  41.3k|    }
  211|  25.7k|  }
  212|  26.6k|  if (have_nearmv_in_inter_mode(mbmi->mode)) {
  ------------------
  |  Branch (212:7): [True: 12.5k, False: 14.0k]
  ------------------
  213|       |    // Offset the NEARESTMV mode.
  214|       |    // TODO(jingning): Unify the two syntax decoding loops after the NEARESTMV
  215|       |    // mode is factored in.
  216|  33.4k|    for (int idx = 1; idx < 3; ++idx) {
  ------------------
  |  Branch (216:23): [True: 23.1k, False: 10.2k]
  ------------------
  217|  23.1k|      if (dcb->ref_mv_count[ref_frame_type] > idx + 1) {
  ------------------
  |  Branch (217:11): [True: 3.51k, False: 19.6k]
  ------------------
  218|  3.51k|        uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx);
  219|  3.51k|        int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
  ------------------
  |  |   51|  3.51k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  220|  3.51k|        mbmi->ref_mv_idx = idx + drl_idx - 1;
  221|  3.51k|        if (!drl_idx) return;
  ------------------
  |  Branch (221:13): [True: 2.26k, False: 1.25k]
  ------------------
  222|  3.51k|      }
  223|  23.1k|    }
  224|  12.5k|  }
  225|  26.6k|}
decodemv.c:assign_mv:
 1119|  76.7k|                            aom_reader *r) {
 1120|  76.7k|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 1121|  76.7k|  MB_MODE_INFO *mbmi = xd->mi[0];
 1122|  76.7k|  BLOCK_SIZE bsize = mbmi->bsize;
 1123|  76.7k|  FeatureFlags *const features = &cm->features;
 1124|  76.7k|  if (features->cur_frame_force_integer_mv) {
  ------------------
  |  Branch (1124:7): [True: 5.68k, False: 71.0k]
  ------------------
 1125|  5.68k|    allow_hp = MV_SUBPEL_NONE;
 1126|  5.68k|  }
 1127|  76.7k|  switch (mode) {
 1128|  23.9k|    case NEWMV: {
  ------------------
  |  Branch (1128:5): [True: 23.9k, False: 52.7k]
  ------------------
 1129|  23.9k|      nmv_context *const nmvc = &ec_ctx->nmvc;
 1130|  23.9k|      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp);
 1131|  23.9k|      break;
 1132|      0|    }
 1133|  28.7k|    case NEARESTMV: {
  ------------------
  |  Branch (1133:5): [True: 28.7k, False: 47.9k]
  ------------------
 1134|  28.7k|      mv[0].as_int = nearest_mv[0].as_int;
 1135|  28.7k|      break;
 1136|      0|    }
 1137|  9.58k|    case NEARMV: {
  ------------------
  |  Branch (1137:5): [True: 9.58k, False: 67.1k]
  ------------------
 1138|  9.58k|      mv[0].as_int = near_mv[0].as_int;
 1139|  9.58k|      break;
 1140|      0|    }
 1141|  2.38k|    case GLOBALMV: {
  ------------------
  |  Branch (1141:5): [True: 2.38k, False: 74.3k]
  ------------------
 1142|  2.38k|      mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
 1143|  2.38k|                                          features->allow_high_precision_mv,
 1144|  2.38k|                                          bsize, xd->mi_col, xd->mi_row,
 1145|  2.38k|                                          features->cur_frame_force_integer_mv)
 1146|  2.38k|                         .as_int;
 1147|  2.38k|      break;
 1148|      0|    }
 1149|  1.73k|    case NEW_NEWMV: {
  ------------------
  |  Branch (1149:5): [True: 1.73k, False: 74.9k]
  ------------------
 1150|  1.73k|      assert(is_compound);
 1151|  5.21k|      for (int i = 0; i < 2; ++i) {
  ------------------
  |  Branch (1151:23): [True: 3.47k, False: 1.73k]
  ------------------
 1152|  3.47k|        nmv_context *const nmvc = &ec_ctx->nmvc;
 1153|  3.47k|        read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, nmvc, allow_hp);
 1154|  3.47k|      }
 1155|  1.73k|      break;
 1156|      0|    }
 1157|  4.33k|    case NEAREST_NEARESTMV: {
  ------------------
  |  Branch (1157:5): [True: 4.33k, False: 72.3k]
  ------------------
 1158|  4.33k|      assert(is_compound);
 1159|  4.33k|      mv[0].as_int = nearest_mv[0].as_int;
 1160|  4.33k|      mv[1].as_int = nearest_mv[1].as_int;
 1161|  4.33k|      break;
 1162|      0|    }
 1163|  2.16k|    case NEAR_NEARMV: {
  ------------------
  |  Branch (1163:5): [True: 2.16k, False: 74.5k]
  ------------------
 1164|  2.16k|      assert(is_compound);
 1165|  2.16k|      mv[0].as_int = near_mv[0].as_int;
 1166|  2.16k|      mv[1].as_int = near_mv[1].as_int;
 1167|  2.16k|      break;
 1168|      0|    }
 1169|    795|    case NEW_NEARESTMV: {
  ------------------
  |  Branch (1169:5): [True: 795, False: 75.9k]
  ------------------
 1170|    795|      nmv_context *const nmvc = &ec_ctx->nmvc;
 1171|    795|      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp);
 1172|    795|      assert(is_compound);
 1173|    795|      mv[1].as_int = nearest_mv[1].as_int;
 1174|    795|      break;
 1175|      0|    }
 1176|    792|    case NEAREST_NEWMV: {
  ------------------
  |  Branch (1176:5): [True: 792, False: 75.9k]
  ------------------
 1177|    792|      nmv_context *const nmvc = &ec_ctx->nmvc;
 1178|    792|      mv[0].as_int = nearest_mv[0].as_int;
 1179|    792|      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp);
 1180|    792|      assert(is_compound);
 1181|    792|      break;
 1182|      0|    }
 1183|    440|    case NEAR_NEWMV: {
  ------------------
  |  Branch (1183:5): [True: 440, False: 76.2k]
  ------------------
 1184|    440|      nmv_context *const nmvc = &ec_ctx->nmvc;
 1185|    440|      mv[0].as_int = near_mv[0].as_int;
 1186|    440|      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp);
 1187|    440|      assert(is_compound);
 1188|    440|      break;
 1189|      0|    }
 1190|    352|    case NEW_NEARMV: {
  ------------------
  |  Branch (1190:5): [True: 352, False: 76.3k]
  ------------------
 1191|    352|      nmv_context *const nmvc = &ec_ctx->nmvc;
 1192|    352|      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp);
 1193|    352|      assert(is_compound);
 1194|    352|      mv[1].as_int = near_mv[1].as_int;
 1195|    352|      break;
 1196|      0|    }
 1197|  1.38k|    case GLOBAL_GLOBALMV: {
  ------------------
  |  Branch (1197:5): [True: 1.38k, False: 75.3k]
  ------------------
 1198|  1.38k|      assert(is_compound);
 1199|  1.38k|      mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
 1200|  1.38k|                                          features->allow_high_precision_mv,
 1201|  1.38k|                                          bsize, xd->mi_col, xd->mi_row,
 1202|  1.38k|                                          features->cur_frame_force_integer_mv)
 1203|  1.38k|                         .as_int;
 1204|  1.38k|      mv[1].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[1]],
 1205|  1.38k|                                          features->allow_high_precision_mv,
 1206|  1.38k|                                          bsize, xd->mi_col, xd->mi_row,
 1207|  1.38k|                                          features->cur_frame_force_integer_mv)
 1208|  1.38k|                         .as_int;
 1209|  1.38k|      break;
 1210|      0|    }
 1211|      0|    default: {
  ------------------
  |  Branch (1211:5): [True: 0, False: 76.7k]
  ------------------
 1212|      0|      return 0;
 1213|      0|    }
 1214|  76.7k|  }
 1215|       |
 1216|  76.7k|  int ret = is_mv_valid(&mv[0].as_mv);
 1217|  76.7k|  if (is_compound) {
  ------------------
  |  Branch (1217:7): [True: 12.0k, False: 64.7k]
  ------------------
 1218|  12.0k|    ret = ret && is_mv_valid(&mv[1].as_mv);
  ------------------
  |  Branch (1218:11): [True: 12.0k, False: 0]
  |  Branch (1218:18): [True: 12.0k, False: 2]
  ------------------
 1219|  12.0k|  }
 1220|  76.7k|  return ret;
 1221|  76.7k|}
decodemv.c:read_interintra_mode:
  170|  2.50k|                                            int size_group) {
  171|  2.50k|  const INTERINTRA_MODE ii_mode = (INTERINTRA_MODE)aom_read_symbol(
  ------------------
  |  |   51|  2.50k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  172|  2.50k|      r, xd->tile_ctx->interintra_mode_cdf[size_group], INTERINTRA_MODES,
  173|  2.50k|      ACCT_STR);
  174|  2.50k|  return ii_mode;
  175|  2.50k|}
decodemv.c:read_motion_mode:
  228|  74.1k|                                    MB_MODE_INFO *mbmi, aom_reader *r) {
  229|  74.1k|  if (cm->features.switchable_motion_mode == 0) return SIMPLE_TRANSLATION;
  ------------------
  |  Branch (229:7): [True: 11.1k, False: 63.0k]
  ------------------
  230|  63.0k|  if (mbmi->skip_mode) return SIMPLE_TRANSLATION;
  ------------------
  |  Branch (230:7): [True: 106, False: 62.9k]
  ------------------
  231|       |
  232|  62.9k|  const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed(
  233|  62.9k|      xd->global_motion, xd, mbmi, cm->features.allow_warped_motion);
  234|  62.9k|  int motion_mode;
  235|       |
  236|  62.9k|  if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return SIMPLE_TRANSLATION;
  ------------------
  |  Branch (236:7): [True: 33.8k, False: 29.0k]
  ------------------
  237|       |
  238|  29.0k|  if (last_motion_mode_allowed == OBMC_CAUSAL) {
  ------------------
  |  Branch (238:7): [True: 13.4k, False: 15.6k]
  ------------------
  239|  13.4k|    motion_mode =
  240|  13.4k|        aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->bsize], 2, ACCT_STR);
  ------------------
  |  |   51|  13.4k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  241|  13.4k|    return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
  242|  15.6k|  } else {
  243|  15.6k|    motion_mode = aom_read_symbol(r, xd->tile_ctx->motion_mode_cdf[mbmi->bsize],
  ------------------
  |  |   51|  15.6k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  244|  15.6k|                                  MOTION_MODES, ACCT_STR);
  245|  15.6k|    return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
  246|  15.6k|  }
  247|  29.0k|}
decodemv.c:read_mb_interp_filter:
 1038|  76.6k|                                         aom_reader *r) {
 1039|  76.6k|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 1040|       |
 1041|  76.6k|  if (!av1_is_interp_needed(xd)) {
  ------------------
  |  Branch (1041:7): [True: 6.14k, False: 70.5k]
  ------------------
 1042|  6.14k|    set_default_interp_filters(mbmi, interp_filter);
 1043|  6.14k|    return;
 1044|  6.14k|  }
 1045|       |
 1046|  70.5k|  if (interp_filter != SWITCHABLE) {
  ------------------
  |  Branch (1046:7): [True: 53.9k, False: 16.5k]
  ------------------
 1047|  53.9k|    mbmi->interp_filters = av1_broadcast_interp_filter(interp_filter);
 1048|  53.9k|  } else {
 1049|  16.5k|    InterpFilter ref0_filter[2] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR };
 1050|  47.0k|    for (int dir = 0; dir < 2; ++dir) {
  ------------------
  |  Branch (1050:23): [True: 31.7k, False: 15.2k]
  ------------------
 1051|  31.7k|      const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
 1052|  31.7k|      ref0_filter[dir] = (InterpFilter)aom_read_symbol(
  ------------------
  |  |   51|  31.7k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1053|  31.7k|          r, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS, ACCT_STR);
 1054|  31.7k|      if (!enable_dual_filter) {
  ------------------
  |  Branch (1054:11): [True: 1.35k, False: 30.4k]
  ------------------
 1055|  1.35k|        ref0_filter[1] = ref0_filter[0];
 1056|  1.35k|        break;
 1057|  1.35k|      }
 1058|  31.7k|    }
 1059|       |    // The index system works as: (0, 1) -> (vertical, horizontal) filter types
 1060|  16.5k|    mbmi->interp_filters.as_filters.x_filter = ref0_filter[1];
 1061|  16.5k|    mbmi->interp_filters.as_filters.y_filter = ref0_filter[0];
 1062|  16.5k|  }
 1063|  70.5k|}
decodemv.c:read_intra_block_mode_info:
 1068|  62.8k|                                       aom_reader *r) {
 1069|  62.8k|  const BLOCK_SIZE bsize = mbmi->bsize;
 1070|  62.8k|  const int use_angle_delta = av1_use_angle_delta(bsize);
 1071|       |
 1072|  62.8k|  mbmi->ref_frame[0] = INTRA_FRAME;
 1073|  62.8k|  mbmi->ref_frame[1] = NONE_FRAME;
 1074|       |
 1075|  62.8k|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 1076|       |
 1077|  62.8k|  mbmi->mode = read_intra_mode(r, ec_ctx->y_mode_cdf[size_group_lookup[bsize]]);
 1078|       |
 1079|  62.8k|  mbmi->angle_delta[PLANE_TYPE_Y] =
 1080|  62.8k|      use_angle_delta && av1_is_directional_mode(mbmi->mode)
  ------------------
  |  Branch (1080:7): [True: 60.6k, False: 2.20k]
  |  Branch (1080:26): [True: 1.21k, False: 59.4k]
  ------------------
 1081|  62.8k|          ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED])
 1082|  62.8k|          : 0;
 1083|  62.8k|  if (!cm->seq_params->monochrome && xd->is_chroma_ref) {
  ------------------
  |  Branch (1083:7): [True: 60.4k, False: 2.34k]
  |  Branch (1083:38): [True: 58.9k, False: 1.54k]
  ------------------
 1084|  58.9k|    mbmi->uv_mode =
 1085|  58.9k|        read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
 1086|  58.9k|    if (mbmi->uv_mode == UV_CFL_PRED) {
  ------------------
  |  Branch (1086:9): [True: 1.01k, False: 57.9k]
  ------------------
 1087|  1.01k|      mbmi->cfl_alpha_idx =
 1088|  1.01k|          read_cfl_alphas(xd->tile_ctx, r, &mbmi->cfl_alpha_signs);
 1089|  1.01k|    }
 1090|  58.9k|    const PREDICTION_MODE intra_mode = get_uv_mode(mbmi->uv_mode);
 1091|  58.9k|    mbmi->angle_delta[PLANE_TYPE_UV] =
 1092|  58.9k|        use_angle_delta && av1_is_directional_mode(intra_mode)
  ------------------
  |  Branch (1092:9): [True: 57.8k, False: 1.07k]
  |  Branch (1092:28): [True: 1.00k, False: 56.8k]
  ------------------
 1093|  58.9k|            ? read_angle_delta(r, ec_ctx->angle_delta_cdf[intra_mode - V_PRED])
 1094|  58.9k|            : 0;
 1095|  58.9k|  } else {
 1096|       |    // Avoid decoding angle_info if there is no chroma prediction
 1097|  3.88k|    mbmi->uv_mode = UV_DC_PRED;
 1098|  3.88k|  }
 1099|  62.8k|  xd->cfl.store_y = store_cfl_required(cm, xd);
 1100|       |
 1101|  62.8k|  mbmi->palette_mode_info.palette_size[0] = 0;
 1102|  62.8k|  mbmi->palette_mode_info.palette_size[1] = 0;
 1103|  62.8k|  if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize))
  ------------------
  |  Branch (1103:7): [True: 601, False: 62.2k]
  ------------------
 1104|    601|    read_palette_mode_info(cm, xd, r);
 1105|       |
 1106|  62.8k|  read_filter_intra_mode_info(cm, xd, r);
 1107|  62.8k|}

av1_decoder_create:
   90|  17.9k|AV1Decoder *av1_decoder_create(BufferPool *const pool) {
   91|  17.9k|  AV1Decoder *volatile const pbi = aom_memalign(32, sizeof(*pbi));
   92|  17.9k|  if (!pbi) return NULL;
  ------------------
  |  Branch (92:7): [True: 0, False: 17.9k]
  ------------------
   93|  17.9k|  av1_zero(*pbi);
  ------------------
  |  |   43|  17.9k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
   94|       |
   95|  17.9k|  AV1_COMMON *volatile const cm = &pbi->common;
   96|  17.9k|  cm->seq_params = &pbi->seq_params;
   97|  17.9k|  cm->error = &pbi->error;
   98|       |
   99|       |  // The jmp_buf is valid only for the duration of the function that calls
  100|       |  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  101|       |  // before it returns.
  102|  17.9k|  if (setjmp(pbi->error.jmp)) {
  ------------------
  |  Branch (102:7): [True: 0, False: 17.9k]
  ------------------
  103|      0|    pbi->error.setjmp = 0;
  104|      0|    av1_decoder_remove(pbi);
  105|      0|    return NULL;
  106|      0|  }
  107|       |
  108|  17.9k|  pbi->error.setjmp = 1;
  109|       |
  110|  17.9k|  CHECK_MEM_ERROR(cm, cm->fc,
  ------------------
  |  |   51|  17.9k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  17.9k|  do {                                                    \
  |  |  |  |   69|  17.9k|    lval = (expr);                                        \
  |  |  |  |   70|  17.9k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 17.9k]
  |  |  |  |  ------------------
  |  |  |  |   71|  17.9k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  17.9k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 17.9k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  111|  17.9k|                  (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
  112|  17.9k|  CHECK_MEM_ERROR(
  ------------------
  |  |   51|  17.9k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  17.9k|  do {                                                    \
  |  |  |  |   69|  17.9k|    lval = (expr);                                        \
  |  |  |  |   70|  17.9k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 17.9k]
  |  |  |  |  ------------------
  |  |  |  |   71|  17.9k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  17.9k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded, False: 17.9k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  113|  17.9k|      cm, cm->default_frame_context,
  114|  17.9k|      (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context)));
  115|  17.9k|  memset(cm->fc, 0, sizeof(*cm->fc));
  116|  17.9k|  memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context));
  117|       |
  118|  17.9k|  pbi->need_resync = 1;
  119|  17.9k|  initialize_dec();
  120|       |
  121|       |  // Initialize the references to not point to any frame buffers.
  122|   161k|  for (int i = 0; i < REF_FRAMES; i++) {
  ------------------
  |  Branch (122:19): [True: 143k, False: 17.9k]
  ------------------
  123|   143k|    cm->ref_frame_map[i] = NULL;
  124|   143k|  }
  125|       |
  126|  17.9k|  cm->current_frame.frame_number = 0;
  127|  17.9k|  pbi->decoding_first_frame = 1;
  128|  17.9k|  pbi->common.buffer_pool = pool;
  129|       |
  130|  17.9k|  cm->seq_params->bit_depth = AOM_BITS_8;
  131|       |
  132|  17.9k|  cm->mi_params.free_mi = dec_free_mi;
  133|  17.9k|  cm->mi_params.setup_mi = dec_setup_mi;
  134|  17.9k|  cm->mi_params.set_mb_mi = dec_set_mb_mi;
  135|       |
  136|  17.9k|  av1_loop_filter_init(cm);
  137|       |
  138|  17.9k|  av1_qm_init(&cm->quant_params, av1_num_planes(cm));
  139|  17.9k|  av1_loop_restoration_precal();
  140|       |
  141|       |#if CONFIG_ACCOUNTING
  142|       |  pbi->acct_enabled = 1;
  143|       |  aom_accounting_init(&pbi->accounting);
  144|       |#endif
  145|       |
  146|  17.9k|  pbi->error.setjmp = 0;
  147|       |
  148|  17.9k|  aom_get_worker_interface()->init(&pbi->lf_worker);
  149|  17.9k|  pbi->lf_worker.thread_name = "aom lf worker";
  150|       |
  151|  17.9k|  return pbi;
  152|  17.9k|}
av1_dealloc_dec_jobs:
  154|  18.7k|void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info) {
  155|  18.7k|  if (tile_mt_info != NULL) {
  ------------------
  |  Branch (155:7): [True: 18.7k, False: 0]
  ------------------
  156|  18.7k|#if CONFIG_MULTITHREAD
  157|  18.7k|    if (tile_mt_info->job_mutex != NULL) {
  ------------------
  |  Branch (157:9): [True: 9.54k, False: 9.15k]
  ------------------
  158|  9.54k|      pthread_mutex_destroy(tile_mt_info->job_mutex);
  159|  9.54k|      aom_free(tile_mt_info->job_mutex);
  160|  9.54k|    }
  161|  18.7k|#endif
  162|  18.7k|    aom_free(tile_mt_info->job_queue);
  163|       |    // clear the structure as the source of this call may be a resize in which
  164|       |    // case this call will be followed by an _alloc() which may fail.
  165|  18.7k|    av1_zero(*tile_mt_info);
  ------------------
  |  |   43|  18.7k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
  166|  18.7k|  }
  167|  18.7k|}
av1_dec_free_cb_buf:
  169|  27.0k|void av1_dec_free_cb_buf(AV1Decoder *pbi) {
  170|  27.0k|  aom_free(pbi->cb_buffer_base);
  171|       |  pbi->cb_buffer_base = NULL;
  172|  27.0k|  pbi->cb_buffer_alloc_size = 0;
  173|  27.0k|}
av1_decoder_remove:
  175|  17.9k|void av1_decoder_remove(AV1Decoder *pbi) {
  176|  17.9k|  int i;
  177|       |
  178|  17.9k|  if (!pbi) return;
  ------------------
  |  Branch (178:7): [True: 0, False: 17.9k]
  ------------------
  179|       |
  180|       |  // Free the tile list output buffer.
  181|  17.9k|  aom_free_frame_buffer(&pbi->tile_list_outbuf);
  182|       |
  183|  17.9k|  aom_get_worker_interface()->end(&pbi->lf_worker);
  184|  17.9k|  aom_free(pbi->lf_worker.data1);
  185|       |
  186|  17.9k|  if (pbi->thread_data) {
  ------------------
  |  Branch (186:7): [True: 9.15k, False: 8.78k]
  ------------------
  187|   289k|    for (int worker_idx = 1; worker_idx < pbi->num_workers; worker_idx++) {
  ------------------
  |  Branch (187:30): [True: 280k, False: 9.15k]
  ------------------
  188|   280k|      DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
  189|   280k|      if (thread_data->td != NULL) {
  ------------------
  |  Branch (189:11): [True: 280k, False: 0]
  ------------------
  190|   280k|        av1_free_mc_tmp_buf(thread_data->td);
  191|   280k|        aom_free(thread_data->td);
  192|   280k|      }
  193|   280k|    }
  194|  9.15k|    aom_free(pbi->thread_data);
  195|  9.15k|  }
  196|  17.9k|  aom_free(pbi->dcb.xd.seg_mask);
  197|       |
  198|   307k|  for (i = 0; i < pbi->num_workers; ++i) {
  ------------------
  |  Branch (198:15): [True: 289k, False: 17.9k]
  ------------------
  199|   289k|    AVxWorker *const worker = &pbi->tile_workers[i];
  200|   289k|    aom_get_worker_interface()->end(worker);
  201|   289k|  }
  202|  17.9k|#if CONFIG_MULTITHREAD
  203|  17.9k|  if (pbi->row_mt_mutex_ != NULL) {
  ------------------
  |  Branch (203:7): [True: 9.12k, False: 8.82k]
  ------------------
  204|  9.12k|    pthread_mutex_destroy(pbi->row_mt_mutex_);
  205|  9.12k|    aom_free(pbi->row_mt_mutex_);
  206|  9.12k|  }
  207|  17.9k|  if (pbi->row_mt_cond_ != NULL) {
  ------------------
  |  Branch (207:7): [True: 9.12k, False: 8.82k]
  ------------------
  208|  9.12k|    pthread_cond_destroy(pbi->row_mt_cond_);
  209|  9.12k|    aom_free(pbi->row_mt_cond_);
  210|  9.12k|  }
  211|  17.9k|#endif
  212|  37.1k|  for (i = 0; i < pbi->allocated_tiles; i++) {
  ------------------
  |  Branch (212:15): [True: 19.2k, False: 17.9k]
  ------------------
  213|  19.2k|    TileDataDec *const tile_data = pbi->tile_data + i;
  214|  19.2k|    av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
  215|  19.2k|  }
  216|  17.9k|  aom_free(pbi->tile_data);
  217|  17.9k|  aom_free(pbi->tile_workers);
  218|       |
  219|  17.9k|  if (pbi->num_workers > 0) {
  ------------------
  |  Branch (219:7): [True: 9.15k, False: 8.78k]
  ------------------
  220|  9.15k|    av1_loop_filter_dealloc(&pbi->lf_row_sync);
  221|  9.15k|    av1_loop_restoration_dealloc(&pbi->lr_row_sync);
  222|  9.15k|    av1_dealloc_dec_jobs(&pbi->tile_mt_info);
  223|  9.15k|  }
  224|       |
  225|  17.9k|  av1_dec_free_cb_buf(pbi);
  226|       |#if CONFIG_ACCOUNTING
  227|       |  aom_accounting_clear(&pbi->accounting);
  228|       |#endif
  229|  17.9k|  av1_free_mc_tmp_buf(&pbi->td);
  230|  17.9k|  aom_img_metadata_array_free(pbi->metadata);
  231|  17.9k|  av1_remove_common(&pbi->common);
  232|  17.9k|  aom_free(pbi);
  233|  17.9k|}
av1_visit_palette:
  236|  4.27M|                       aom_reader *r, palette_visitor_fn_t visit) {
  237|  4.27M|  if (!is_inter_block(xd->mi[0])) {
  ------------------
  |  Branch (237:7): [True: 4.08M, False: 180k]
  ------------------
  238|  10.5M|    for (int plane = 0; plane < AOMMIN(2, av1_num_planes(&pbi->common));
  ------------------
  |  |   34|  10.5M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 7.15M, False: 3.40M]
  |  |  ------------------
  ------------------
  |  Branch (238:25): [True: 6.47M, False: 4.08M]
  ------------------
  239|  6.47M|         ++plane) {
  240|  6.47M|      if (plane == 0 || xd->is_chroma_ref) {
  ------------------
  |  Branch (240:11): [True: 4.08M, False: 2.38M]
  |  Branch (240:25): [True: 2.37M, False: 5.22k]
  ------------------
  241|  6.46M|        if (xd->mi[0]->palette_mode_info.palette_size[plane])
  ------------------
  |  Branch (241:13): [True: 155k, False: 6.30M]
  ------------------
  242|   155k|          visit(xd, plane, r);
  243|  6.46M|      } else {
  244|       |        assert(xd->mi[0]->palette_mode_info.palette_size[plane] == 0);
  245|  5.15k|      }
  246|  6.47M|    }
  247|  4.08M|  }
  248|  4.27M|}
av1_receive_compressed_data:
  426|  29.0k|                                const uint8_t **psource) {
  427|  29.0k|  AV1_COMMON *volatile const cm = &pbi->common;
  428|  29.0k|  const uint8_t *source = *psource;
  429|  29.0k|  pbi->error.error_code = AOM_CODEC_OK;
  430|  29.0k|  pbi->error.has_detail = 0;
  431|       |
  432|  29.0k|  if (size == 0) {
  ------------------
  |  Branch (432:7): [True: 0, False: 29.0k]
  ------------------
  433|       |    // This is used to signal that we are missing frames.
  434|       |    // We do not know if the missing frame(s) was supposed to update
  435|       |    // any of the reference buffers, but we act conservative and
  436|       |    // mark only the last buffer as corrupted.
  437|       |    //
  438|       |    // TODO(jkoleszar): Error concealment is undefined and non-normative
  439|       |    // at this point, but if it becomes so, [0] may not always be the correct
  440|       |    // thing to do here.
  441|      0|    RefCntBuffer *ref_buf = get_ref_frame_buf(cm, LAST_FRAME);
  442|      0|    if (ref_buf != NULL) ref_buf->buf.corrupted = 1;
  ------------------
  |  Branch (442:9): [True: 0, False: 0]
  ------------------
  443|      0|  }
  444|       |
  445|  29.0k|  if (assign_cur_frame_new_fb(cm) == NULL) {
  ------------------
  |  Branch (445:7): [True: 0, False: 29.0k]
  ------------------
  446|      0|    pbi->error.error_code = AOM_CODEC_MEM_ERROR;
  447|      0|    return 1;
  448|      0|  }
  449|       |
  450|       |  // The jmp_buf is valid only for the duration of the function that calls
  451|       |  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  452|       |  // before it returns.
  453|  29.0k|  if (setjmp(pbi->error.jmp)) {
  ------------------
  |  Branch (453:7): [True: 13.9k, False: 15.0k]
  ------------------
  454|  13.9k|    const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  455|  13.9k|    int i;
  456|       |
  457|  13.9k|    pbi->error.setjmp = 0;
  458|       |
  459|       |    // Synchronize all threads immediately as a subsequent decode call may
  460|       |    // cause a resize invalidating some allocations.
  461|  13.9k|    winterface->sync(&pbi->lf_worker);
  462|   220k|    for (i = 0; i < pbi->num_workers; ++i) {
  ------------------
  |  Branch (462:17): [True: 206k, False: 13.9k]
  ------------------
  463|   206k|      winterface->sync(&pbi->tile_workers[i]);
  464|   206k|    }
  465|       |
  466|  13.9k|    release_current_frame(pbi);
  467|  13.9k|    return -1;
  468|  13.9k|  }
  469|       |
  470|  15.0k|  pbi->error.setjmp = 1;
  471|       |
  472|  15.0k|  int frame_decoded =
  473|  15.0k|      aom_decode_frame_from_obus(pbi, source, source + size, psource);
  474|       |
  475|  15.0k|  if (frame_decoded < 0) {
  ------------------
  |  Branch (475:7): [True: 1.28k, False: 13.7k]
  ------------------
  476|  1.28k|    assert(pbi->error.error_code != AOM_CODEC_OK);
  477|  1.28k|    release_current_frame(pbi);
  478|  1.28k|    pbi->error.setjmp = 0;
  479|  1.28k|    return 1;
  480|  1.28k|  }
  481|       |
  482|       |#if TXCOEFF_TIMER
  483|       |  cm->cum_txcoeff_timer += cm->txcoeff_timer;
  484|       |  fprintf(stderr,
  485|       |          "txb coeff block number: %d, frame time: %ld, cum time %ld in us\n",
  486|       |          cm->txb_count, cm->txcoeff_timer, cm->cum_txcoeff_timer);
  487|       |  cm->txcoeff_timer = 0;
  488|       |  cm->txb_count = 0;
  489|       |#endif
  490|       |
  491|       |  // Note: At this point, this function holds a reference to cm->cur_frame
  492|       |  // in the buffer pool. This reference is consumed by update_frame_buffers().
  493|  13.7k|  update_frame_buffers(pbi, frame_decoded);
  494|       |
  495|  13.7k|  if (frame_decoded) {
  ------------------
  |  Branch (495:7): [True: 12.7k, False: 984]
  ------------------
  496|  12.7k|    pbi->decoding_first_frame = 0;
  497|  12.7k|  }
  498|       |
  499|  13.7k|  if (pbi->error.error_code != AOM_CODEC_OK) {
  ------------------
  |  Branch (499:7): [True: 0, False: 13.7k]
  ------------------
  500|      0|    pbi->error.setjmp = 0;
  501|      0|    return 1;
  502|      0|  }
  503|       |
  504|  13.7k|  if (!cm->show_existing_frame) {
  ------------------
  |  Branch (504:7): [True: 13.4k, False: 252]
  ------------------
  505|  13.4k|    if (cm->seg.enabled) {
  ------------------
  |  Branch (505:9): [True: 504, False: 12.9k]
  ------------------
  506|    504|      if (cm->prev_frame &&
  ------------------
  |  Branch (506:11): [True: 7, False: 497]
  ------------------
  507|      7|          (cm->mi_params.mi_rows == cm->prev_frame->mi_rows) &&
  ------------------
  |  Branch (507:11): [True: 7, False: 0]
  ------------------
  508|      7|          (cm->mi_params.mi_cols == cm->prev_frame->mi_cols)) {
  ------------------
  |  Branch (508:11): [True: 7, False: 0]
  ------------------
  509|      7|        cm->last_frame_seg_map = cm->prev_frame->seg_map;
  510|    497|      } else {
  511|    497|        cm->last_frame_seg_map = NULL;
  512|    497|      }
  513|    504|    }
  514|  13.4k|  }
  515|       |
  516|       |  // Update progress in frame parallel decode.
  517|  13.7k|  pbi->error.setjmp = 0;
  518|       |
  519|  13.7k|  return 0;
  520|  13.7k|}
av1_get_raw_frame:
  524|  11.6k|                      aom_film_grain_t **grain_params) {
  525|  11.6k|  if (index >= pbi->num_output_frames) return -1;
  ------------------
  |  Branch (525:7): [True: 4.61k, False: 7.04k]
  ------------------
  526|  7.04k|  *sd = &pbi->output_frames[index]->buf;
  527|  7.04k|  *grain_params = &pbi->output_frames[index]->film_grain_params;
  528|  7.04k|  return 0;
  529|  11.6k|}
decoder.c:initialize_dec:
   38|  17.9k|static void initialize_dec(void) {
   39|  17.9k|  av1_rtcd();
   40|  17.9k|  aom_dsp_rtcd();
   41|  17.9k|  aom_scale_rtcd();
   42|  17.9k|  av1_init_intra_predictors();
   43|  17.9k|  av1_init_wedge_masks();
   44|  17.9k|}
decoder.c:dec_free_mi:
   79|  53.5k|static void dec_free_mi(CommonModeInfoParams *mi_params) {
   80|  53.5k|  aom_free(mi_params->mi_alloc);
   81|  53.5k|  mi_params->mi_alloc = NULL;
   82|  53.5k|  mi_params->mi_alloc_size = 0;
   83|  53.5k|  aom_free(mi_params->mi_grid_base);
   84|  53.5k|  mi_params->mi_grid_base = NULL;
   85|  53.5k|  mi_params->mi_grid_size = 0;
   86|  53.5k|  aom_free(mi_params->tx_type_map);
   87|       |  mi_params->tx_type_map = NULL;
   88|  53.5k|}
decoder.c:dec_setup_mi:
   72|  43.9k|static void dec_setup_mi(CommonModeInfoParams *mi_params) {
   73|  43.9k|  const int mi_grid_size =
   74|  43.9k|      mi_params->mi_stride * calc_mi_size(mi_params->mi_rows);
   75|  43.9k|  memset(mi_params->mi_grid_base, 0,
   76|  43.9k|         mi_grid_size * sizeof(*mi_params->mi_grid_base));
   77|  43.9k|}
decoder.c:dec_set_mb_mi:
   47|  17.8k|                          int height, BLOCK_SIZE min_partition_size) {
   48|  17.8k|  (void)min_partition_size;
   49|       |  // Ensure that the decoded width and height are both multiples of
   50|       |  // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
   51|       |  // subsampling is used).
   52|       |  // This simplifies the implementation of various experiments,
   53|       |  // eg. cdef, which operates on units of 8x8 luma pixels.
   54|  17.8k|  const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
  ------------------
  |  |   69|  17.8k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
   55|  17.8k|  const int aligned_height = ALIGN_POWER_OF_TWO(height, 3);
  ------------------
  |  |   69|  17.8k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
   56|       |
   57|  17.8k|  mi_params->mi_cols = aligned_width >> MI_SIZE_LOG2;
  ------------------
  |  |   39|  17.8k|#define MI_SIZE_LOG2 2
  ------------------
   58|  17.8k|  mi_params->mi_rows = aligned_height >> MI_SIZE_LOG2;
  ------------------
  |  |   39|  17.8k|#define MI_SIZE_LOG2 2
  ------------------
   59|  17.8k|  mi_params->mi_stride = calc_mi_size(mi_params->mi_cols);
   60|       |
   61|  17.8k|  mi_params->mb_cols = ROUND_POWER_OF_TWO(mi_params->mi_cols, 2);
  ------------------
  |  |   41|  17.8k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   62|  17.8k|  mi_params->mb_rows = ROUND_POWER_OF_TWO(mi_params->mi_rows, 2);
  ------------------
  |  |   41|  17.8k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   63|  17.8k|  mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols;
   64|       |
   65|  17.8k|  mi_params->mi_alloc_bsize = BLOCK_4X4;
   66|  17.8k|  mi_params->mi_alloc_stride = mi_params->mi_stride;
   67|       |
   68|       |  assert(mi_size_wide[mi_params->mi_alloc_bsize] ==
   69|  17.8k|         mi_size_high[mi_params->mi_alloc_bsize]);
   70|  17.8k|}
decoder.c:release_current_frame:
  342|  15.2k|static void release_current_frame(AV1Decoder *pbi) {
  343|  15.2k|  AV1_COMMON *const cm = &pbi->common;
  344|  15.2k|  BufferPool *const pool = cm->buffer_pool;
  345|       |
  346|  15.2k|  cm->cur_frame->buf.corrupted = 1;
  347|  15.2k|  lock_buffer_pool(pool);
  348|  15.2k|  decrease_ref_count(cm->cur_frame, pool);
  349|  15.2k|  unlock_buffer_pool(pool);
  350|       |  cm->cur_frame = NULL;
  351|  15.2k|}
decoder.c:update_frame_buffers:
  358|  13.7k|static void update_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
  359|  13.7k|  int ref_index = 0, mask;
  360|  13.7k|  AV1_COMMON *const cm = &pbi->common;
  361|  13.7k|  BufferPool *const pool = cm->buffer_pool;
  362|       |
  363|  13.7k|  if (frame_decoded) {
  ------------------
  |  Branch (363:7): [True: 12.7k, False: 984]
  ------------------
  364|  12.7k|    lock_buffer_pool(pool);
  365|       |
  366|       |    // In ext-tile decoding, the camera frame header is only decoded once. So,
  367|       |    // we don't update the references here.
  368|  12.7k|    if (!pbi->camera_frame_header_ready) {
  ------------------
  |  Branch (368:9): [True: 12.7k, False: 0]
  ------------------
  369|       |      // The following for loop needs to release the reference stored in
  370|       |      // cm->ref_frame_map[ref_index] before storing a reference to
  371|       |      // cm->cur_frame in cm->ref_frame_map[ref_index].
  372|  89.9k|      for (mask = cm->current_frame.refresh_frame_flags; mask; mask >>= 1) {
  ------------------
  |  Branch (372:58): [True: 77.1k, False: 12.7k]
  ------------------
  373|  77.1k|        if (mask & 1) {
  ------------------
  |  Branch (373:13): [True: 70.5k, False: 6.58k]
  ------------------
  374|  70.5k|          decrease_ref_count(cm->ref_frame_map[ref_index], pool);
  375|  70.5k|          cm->ref_frame_map[ref_index] = cm->cur_frame;
  376|  70.5k|          ++cm->cur_frame->ref_count;
  377|  70.5k|        }
  378|  77.1k|        ++ref_index;
  379|  77.1k|      }
  380|  12.7k|    }
  381|       |
  382|  12.7k|    if (cm->show_existing_frame || cm->show_frame) {
  ------------------
  |  Branch (382:9): [True: 248, False: 12.5k]
  |  Branch (382:36): [True: 10.0k, False: 2.50k]
  ------------------
  383|  10.2k|      if (pbi->output_all_layers) {
  ------------------
  |  Branch (383:11): [True: 1.75k, False: 8.50k]
  ------------------
  384|       |        // Append this frame to the output queue
  385|  1.75k|        if (pbi->num_output_frames >= MAX_NUM_SPATIAL_LAYERS) {
  ------------------
  |  |   71|  1.75k|#define MAX_NUM_SPATIAL_LAYERS 4
  ------------------
  |  Branch (385:13): [True: 0, False: 1.75k]
  ------------------
  386|       |          // We can't store the new frame anywhere, so drop it and return an
  387|       |          // error
  388|      0|          cm->cur_frame->buf.corrupted = 1;
  389|      0|          decrease_ref_count(cm->cur_frame, pool);
  390|      0|          pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
  391|  1.75k|        } else {
  392|  1.75k|          pbi->output_frames[pbi->num_output_frames] = cm->cur_frame;
  393|  1.75k|          pbi->num_output_frames++;
  394|  1.75k|        }
  395|  8.50k|      } else {
  396|       |        // Replace any existing output frame
  397|  8.50k|        assert(pbi->num_output_frames == 0 || pbi->num_output_frames == 1);
  398|  8.50k|        if (pbi->num_output_frames > 0) {
  ------------------
  |  Branch (398:13): [True: 495, False: 8.00k]
  ------------------
  399|    495|          decrease_ref_count(pbi->output_frames[0], pool);
  400|    495|        }
  401|  8.50k|        pbi->output_frames[0] = cm->cur_frame;
  402|  8.50k|        pbi->num_output_frames = 1;
  403|  8.50k|      }
  404|  10.2k|    } else {
  405|  2.50k|      decrease_ref_count(cm->cur_frame, pool);
  406|  2.50k|    }
  407|       |
  408|  12.7k|    unlock_buffer_pool(pool);
  409|  12.7k|  } else {
  410|       |    // Nothing was decoded, so just drop this frame buffer
  411|    984|    lock_buffer_pool(pool);
  412|    984|    decrease_ref_count(cm->cur_frame, pool);
  413|    984|    unlock_buffer_pool(pool);
  414|    984|  }
  415|  13.7k|  cm->cur_frame = NULL;
  416|       |
  417|  13.7k|  if (!pbi->camera_frame_header_ready) {
  ------------------
  |  Branch (417:7): [True: 13.7k, False: 0]
  ------------------
  418|       |    // Invalidate these references until the next frame starts.
  419|   109k|    for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) {
  ------------------
  |  Branch (419:25): [True: 96.1k, False: 13.7k]
  ------------------
  420|  96.1k|      cm->remapped_ref_idx[ref_index] = INVALID_IDX;
  ------------------
  |  |   15|  96.1k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  421|  96.1k|    }
  422|  13.7k|  }
  423|  13.7k|}

av1_dx_iface.c:decrease_ref_count:
  405|  4.48k|                                      BufferPool *const pool) {
  406|  4.48k|  if (buf != NULL) {
  ------------------
  |  Branch (406:7): [True: 4.48k, False: 0]
  ------------------
  407|  4.48k|    --buf->ref_count;
  408|       |    // Reference counts should never become negative. If this assertion fails,
  409|       |    // there is a bug in our reference count management.
  410|  4.48k|    assert(buf->ref_count >= 0);
  411|       |    // A worker may only get a free framebuffer index when calling get_free_fb.
  412|       |    // But the raw frame buffer is not set up until we finish decoding header.
  413|       |    // So if any error happens during decoding header, frame_bufs[idx] will not
  414|       |    // have a valid raw frame buffer.
  415|  4.48k|    if (buf->ref_count == 0 && buf->raw_frame_buffer.data) {
  ------------------
  |  Branch (415:9): [True: 8, False: 4.48k]
  |  Branch (415:32): [True: 8, False: 0]
  ------------------
  416|      8|      pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer);
  417|      8|      buf->raw_frame_buffer.data = NULL;
  418|      8|      buf->raw_frame_buffer.size = 0;
  419|       |      buf->raw_frame_buffer.priv = NULL;
  420|      8|    }
  421|  4.48k|  }
  422|  4.48k|}
decodeframe.c:decrease_ref_count:
  405|   143k|                                      BufferPool *const pool) {
  406|   143k|  if (buf != NULL) {
  ------------------
  |  Branch (406:7): [True: 2.47k, False: 141k]
  ------------------
  407|  2.47k|    --buf->ref_count;
  408|       |    // Reference counts should never become negative. If this assertion fails,
  409|       |    // there is a bug in our reference count management.
  410|  2.47k|    assert(buf->ref_count >= 0);
  411|       |    // A worker may only get a free framebuffer index when calling get_free_fb.
  412|       |    // But the raw frame buffer is not set up until we finish decoding header.
  413|       |    // So if any error happens during decoding header, frame_bufs[idx] will not
  414|       |    // have a valid raw frame buffer.
  415|  2.47k|    if (buf->ref_count == 0 && buf->raw_frame_buffer.data) {
  ------------------
  |  Branch (415:9): [True: 332, False: 2.14k]
  |  Branch (415:32): [True: 332, False: 0]
  ------------------
  416|    332|      pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer);
  417|    332|      buf->raw_frame_buffer.data = NULL;
  418|    332|      buf->raw_frame_buffer.size = 0;
  419|       |      buf->raw_frame_buffer.priv = NULL;
  420|    332|    }
  421|  2.47k|  }
  422|   143k|}
decoder.c:decrease_ref_count:
  405|  89.8k|                                      BufferPool *const pool) {
  406|  89.8k|  if (buf != NULL) {
  ------------------
  |  Branch (406:7): [True: 26.9k, False: 62.8k]
  ------------------
  407|  26.9k|    --buf->ref_count;
  408|       |    // Reference counts should never become negative. If this assertion fails,
  409|       |    // there is a bug in our reference count management.
  410|  26.9k|    assert(buf->ref_count >= 0);
  411|       |    // A worker may only get a free framebuffer index when calling get_free_fb.
  412|       |    // But the raw frame buffer is not set up until we finish decoding header.
  413|       |    // So if any error happens during decoding header, frame_bufs[idx] will not
  414|       |    // have a valid raw frame buffer.
  415|  26.9k|    if (buf->ref_count == 0 && buf->raw_frame_buffer.data) {
  ------------------
  |  Branch (415:9): [True: 17.2k, False: 9.72k]
  |  Branch (415:32): [True: 15.0k, False: 2.22k]
  ------------------
  416|  15.0k|      pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer);
  417|  15.0k|      buf->raw_frame_buffer.data = NULL;
  418|  15.0k|      buf->raw_frame_buffer.size = 0;
  419|       |      buf->raw_frame_buffer.priv = NULL;
  420|  15.0k|    }
  421|  26.9k|  }
  422|  89.8k|}
detokenize.c:av1_read_uniform:
  425|  62.4k|static inline int av1_read_uniform(aom_reader *r, int n) {
  426|  62.4k|  const int l = get_unsigned_bits(n);
  427|  62.4k|  const int m = (1 << l) - n;
  428|  62.4k|  const int v = aom_read_literal(r, l - 1, ACCT_STR);
  ------------------
  |  |   47|  62.4k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  429|  62.4k|  assert(l != 0);
  430|  62.4k|  if (v < m)
  ------------------
  |  Branch (430:7): [True: 48.7k, False: 13.7k]
  ------------------
  431|  48.7k|    return v;
  432|  13.7k|  else
  433|  13.7k|    return (v << 1) - m + aom_read_literal(r, 1, ACCT_STR);
  ------------------
  |  |   47|  13.7k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  434|  62.4k|}

av1_read_coeffs_txb:
  327|  4.71M|                         const int col, const TX_SIZE tx_size) {
  328|       |#if TXCOEFF_TIMER
  329|       |  struct aom_usec_timer timer;
  330|       |  aom_usec_timer_start(&timer);
  331|       |#endif
  332|  4.71M|  MACROBLOCKD *const xd = &dcb->xd;
  333|  4.71M|  MB_MODE_INFO *const mbmi = xd->mi[0];
  334|  4.71M|  struct macroblockd_plane *const pd = &xd->plane[plane];
  335|       |
  336|  4.71M|  const BLOCK_SIZE bsize = mbmi->bsize;
  337|  4.71M|  assert(bsize < BLOCK_SIZES_ALL);
  338|  4.71M|  const BLOCK_SIZE plane_bsize =
  339|  4.71M|      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
  340|       |
  341|  4.71M|  TXB_CTX txb_ctx;
  342|  4.71M|  get_txb_ctx(plane_bsize, tx_size, plane, pd->above_entropy_context + col,
  343|  4.71M|              pd->left_entropy_context + row, &txb_ctx);
  344|  4.71M|  const uint8_t cul_level =
  345|  4.71M|      read_coeffs_txb(cm, dcb, r, row, col, plane, &txb_ctx, tx_size);
  346|  4.71M|  av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col,
  347|  4.71M|                           row);
  348|       |
  349|  4.71M|  if (is_inter_block(mbmi)) {
  ------------------
  |  Branch (349:7): [True: 281k, False: 4.43M]
  ------------------
  350|   281k|    const PLANE_TYPE plane_type = get_plane_type(plane);
  351|       |    // tx_type will be read out in av1_read_coeffs_txb_facade
  352|   281k|    const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size,
  353|   281k|                                            cm->features.reduced_tx_set_used);
  354|       |
  355|   281k|    if (plane == 0) {
  ------------------
  |  Branch (355:9): [True: 127k, False: 154k]
  ------------------
  356|   127k|      const int txw = tx_size_wide_unit[tx_size];
  357|   127k|      const int txh = tx_size_high_unit[tx_size];
  358|       |      // The 16x16 unit is due to the constraint from tx_64x64 which sets the
  359|       |      // maximum tx size for chroma as 32x32. Coupled with 4x1 transform block
  360|       |      // size, the constraint takes effect in 32x16 / 16x32 size too. To solve
  361|       |      // the intricacy, cover all the 16x16 units inside a 64 level transform.
  362|   127k|      if (txw == tx_size_wide_unit[TX_64X64] ||
  ------------------
  |  Branch (362:11): [True: 3.06k, False: 124k]
  ------------------
  363|   124k|          txh == tx_size_high_unit[TX_64X64]) {
  ------------------
  |  Branch (363:11): [True: 751, False: 123k]
  ------------------
  364|  3.81k|        const int tx_unit = tx_size_wide_unit[TX_16X16];
  365|  3.81k|        const int stride = xd->tx_type_map_stride;
  366|  16.9k|        for (int idy = 0; idy < txh; idy += tx_unit) {
  ------------------
  |  Branch (366:27): [True: 13.1k, False: 3.81k]
  ------------------
  367|  58.7k|          for (int idx = 0; idx < txw; idx += tx_unit) {
  ------------------
  |  Branch (367:29): [True: 45.6k, False: 13.1k]
  ------------------
  368|  45.6k|            xd->tx_type_map[(row + idy) * stride + col + idx] = tx_type;
  369|  45.6k|          }
  370|  13.1k|        }
  371|  3.81k|      }
  372|   127k|    }
  373|   281k|  }
  374|       |
  375|       |#if TXCOEFF_TIMER
  376|       |  aom_usec_timer_mark(&timer);
  377|       |  const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
  378|       |  cm->txcoeff_timer += elapsed_time;
  379|       |  ++cm->txb_count;
  380|       |#endif
  381|  4.71M|}
decodetxb.c:read_coeffs_txb:
  114|  4.72M|                               const TX_SIZE tx_size) {
  115|  4.72M|  MACROBLOCKD *const xd = &dcb->xd;
  116|  4.72M|  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
  117|  4.72M|  const int32_t max_value = (1 << (7 + xd->bd)) - 1;
  118|  4.72M|  const int32_t min_value = -(1 << (7 + xd->bd));
  119|  4.72M|  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
  120|  4.72M|  const PLANE_TYPE plane_type = get_plane_type(plane);
  121|  4.72M|  MB_MODE_INFO *const mbmi = xd->mi[0];
  122|  4.72M|  struct macroblockd_plane *const pd = &xd->plane[plane];
  123|  4.72M|  const int16_t *const dequant = pd->seg_dequant_QTX[mbmi->segment_id];
  124|  4.72M|  tran_low_t *const tcoeffs = dcb->dqcoeff_block[plane] + dcb->cb_offset[plane];
  125|  4.72M|  const int shift = av1_get_tx_scale(tx_size);
  126|  4.72M|  const int bhl = get_txb_bhl(tx_size);
  127|  4.72M|  const int width = get_txb_wide(tx_size);
  128|  4.72M|  const int height = get_txb_high(tx_size);
  129|  4.72M|  int cul_level = 0;
  130|  4.72M|  int dc_val = 0;
  131|  4.72M|  uint8_t levels_buf[TX_PAD_2D];
  132|  4.72M|  uint8_t *const levels = set_levels(levels_buf, height);
  133|  4.72M|  const int all_zero = aom_read_symbol(
  ------------------
  |  |   51|  4.72M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  134|  4.72M|      r, ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2, ACCT_STR);
  135|  4.72M|  eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
  136|  4.72M|  uint16_t *const eob = &(eob_data->eob);
  137|  4.72M|  uint16_t *const max_scan_line = &(eob_data->max_scan_line);
  138|  4.72M|  *max_scan_line = 0;
  139|  4.72M|  *eob = 0;
  140|       |
  141|       |#if CONFIG_INSPECTION
  142|       |  if (plane == 0) {
  143|       |    const int txk_type_idx =
  144|       |        av1_get_txk_type_index(mbmi->bsize, blk_row, blk_col);
  145|       |    mbmi->tx_skip[txk_type_idx] = all_zero;
  146|       |  }
  147|       |#endif
  148|       |
  149|  4.72M|  if (all_zero) {
  ------------------
  |  Branch (149:7): [True: 1.94M, False: 2.77M]
  ------------------
  150|  1.94M|    *max_scan_line = 0;
  151|  1.94M|    if (plane == 0) {
  ------------------
  |  Branch (151:9): [True: 778k, False: 1.16M]
  ------------------
  152|   778k|      xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] = DCT_DCT;
  153|   778k|    }
  154|  1.94M|    return 0;
  155|  1.94M|  }
  156|       |
  157|  2.77M|  if (plane == AOM_PLANE_Y) {
  ------------------
  |  |  210|  2.77M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (157:7): [True: 1.76M, False: 1.01M]
  ------------------
  158|       |    // only y plane's tx_type is transmitted
  159|  1.76M|    av1_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r);
  160|  1.76M|  }
  161|  2.77M|  const TX_TYPE tx_type =
  162|  2.77M|      av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
  163|  2.77M|                      cm->features.reduced_tx_set_used);
  164|  2.77M|  const TX_CLASS tx_class = tx_type_to_class[tx_type];
  165|  2.77M|  const qm_val_t *iqmatrix =
  166|  2.77M|      av1_get_iqmatrix(&cm->quant_params, xd, plane, tx_size, tx_type);
  167|  2.77M|  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
  168|  2.77M|  const int16_t *const scan = scan_order->scan;
  169|  2.77M|  int eob_extra = 0;
  170|  2.77M|  int eob_pt = 1;
  171|       |
  172|  2.77M|  const int eob_multi_size = txsize_log2_minus4[tx_size];
  173|  2.77M|  const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
  ------------------
  |  Branch (173:29): [True: 2.64M, False: 136k]
  ------------------
  174|  2.77M|  switch (eob_multi_size) {
  175|  1.03M|    case 0:
  ------------------
  |  Branch (175:5): [True: 1.03M, False: 1.74M]
  ------------------
  176|  1.03M|      eob_pt =
  177|  1.03M|          aom_read_symbol(r, ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx],
  ------------------
  |  |   51|  1.03M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  178|  1.03M|                          5, ACCT_STR) +
  179|  1.03M|          1;
  180|  1.03M|      break;
  181|   269k|    case 1:
  ------------------
  |  Branch (181:5): [True: 269k, False: 2.50M]
  ------------------
  182|   269k|      eob_pt =
  183|   269k|          aom_read_symbol(r, ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx],
  ------------------
  |  |   51|   269k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  184|   269k|                          6, ACCT_STR) +
  185|   269k|          1;
  186|   269k|      break;
  187|   754k|    case 2:
  ------------------
  |  Branch (187:5): [True: 754k, False: 2.02M]
  ------------------
  188|   754k|      eob_pt =
  189|   754k|          aom_read_symbol(r, ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx],
  ------------------
  |  |   51|   754k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  190|   754k|                          7, ACCT_STR) +
  191|   754k|          1;
  192|   754k|      break;
  193|   237k|    case 3:
  ------------------
  |  Branch (193:5): [True: 237k, False: 2.54M]
  ------------------
  194|   237k|      eob_pt =
  195|   237k|          aom_read_symbol(r, ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx],
  ------------------
  |  |   51|   237k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  196|   237k|                          8, ACCT_STR) +
  197|   237k|          1;
  198|   237k|      break;
  199|   314k|    case 4:
  ------------------
  |  Branch (199:5): [True: 314k, False: 2.46M]
  ------------------
  200|   314k|      eob_pt =
  201|   314k|          aom_read_symbol(r, ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx],
  ------------------
  |  |   51|   314k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  202|   314k|                          9, ACCT_STR) +
  203|   314k|          1;
  204|   314k|      break;
  205|  65.0k|    case 5:
  ------------------
  |  Branch (205:5): [True: 65.0k, False: 2.71M]
  ------------------
  206|  65.0k|      eob_pt =
  207|  65.0k|          aom_read_symbol(r, ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx],
  ------------------
  |  |   51|  65.0k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  208|  65.0k|                          10, ACCT_STR) +
  209|  65.0k|          1;
  210|  65.0k|      break;
  211|   103k|    case 6:
  ------------------
  |  Branch (211:5): [True: 103k, False: 2.67M]
  ------------------
  212|   103k|    default:
  ------------------
  |  Branch (212:5): [True: 0, False: 2.77M]
  ------------------
  213|   103k|      eob_pt = aom_read_symbol(
  ------------------
  |  |   51|   103k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  214|   103k|                   r, ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11,
  215|   103k|                   ACCT_STR) +
  216|   103k|               1;
  217|   103k|      break;
  218|  2.77M|  }
  219|       |
  220|  2.77M|  const int eob_offset_bits = av1_eob_offset_bits[eob_pt];
  221|  2.77M|  if (eob_offset_bits > 0) {
  ------------------
  |  Branch (221:7): [True: 2.21M, False: 567k]
  ------------------
  222|  2.21M|    const int eob_ctx = eob_pt - 3;
  223|  2.21M|    int bit = aom_read_symbol(
  ------------------
  |  |   51|  2.21M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  224|  2.21M|        r, ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2, ACCT_STR);
  225|  2.21M|    if (bit) {
  ------------------
  |  Branch (225:9): [True: 1.06M, False: 1.14M]
  ------------------
  226|  1.06M|      eob_extra += (1 << (eob_offset_bits - 1));
  227|  1.06M|    }
  228|       |
  229|  7.31M|    for (int i = 1; i < eob_offset_bits; i++) {
  ------------------
  |  Branch (229:21): [True: 5.09M, False: 2.21M]
  ------------------
  230|  5.09M|      bit = aom_read_bit(r, ACCT_STR);
  ------------------
  |  |   43|  5.09M|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  231|  5.09M|      if (bit) {
  ------------------
  |  Branch (231:11): [True: 2.60M, False: 2.49M]
  ------------------
  232|  2.60M|        eob_extra += (1 << (eob_offset_bits - 1 - i));
  233|  2.60M|      }
  234|  5.09M|    }
  235|  2.21M|  }
  236|  2.77M|  *eob = rec_eob_pos(eob_pt, eob_extra);
  237|       |
  238|  2.77M|  if (*eob > 1) {
  ------------------
  |  Branch (238:7): [True: 2.30M, False: 473k]
  ------------------
  239|  2.30M|    memset(levels_buf, 0,
  240|  2.30M|           sizeof(*levels_buf) *
  241|  2.30M|               ((height + TX_PAD_HOR) * (width + TX_PAD_VER) + TX_PAD_END));
  ------------------
  |  |  190|  2.30M|#define TX_PAD_HOR 4
  ------------------
                             ((height + TX_PAD_HOR) * (width + TX_PAD_VER) + TX_PAD_END));
  ------------------
  |  |  195|  2.30M|#define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM)
  |  |  ------------------
  |  |  |  |  193|  2.30M|#define TX_PAD_TOP 0
  |  |  ------------------
  |  |               #define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM)
  |  |  ------------------
  |  |  |  |  194|  2.30M|#define TX_PAD_BOTTOM 4
  |  |  ------------------
  ------------------
                             ((height + TX_PAD_HOR) * (width + TX_PAD_VER) + TX_PAD_END));
  ------------------
  |  |  197|  2.30M|#define TX_PAD_END 16
  ------------------
  242|  2.30M|  }
  243|       |
  244|  2.77M|  {
  245|       |    // Read the non-zero coefficient with scan index eob-1
  246|       |    // TODO(angiebird): Put this into a function
  247|  2.77M|    const int c = *eob - 1;
  248|  2.77M|    const int pos = scan[c];
  249|  2.77M|    const int coeff_ctx = get_lower_levels_ctx_eob(bhl, width, c);
  250|  2.77M|    const int nsymbs = 3;
  251|  2.77M|    aom_cdf_prob *cdf =
  252|  2.77M|        ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx];
  253|  2.77M|    int level = aom_read_symbol(r, cdf, nsymbs, ACCT_STR) + 1;
  ------------------
  |  |   51|  2.77M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  254|  2.77M|    if (level > NUM_BASE_LEVELS) {
  ------------------
  |  |   46|  2.77M|#define NUM_BASE_LEVELS 2
  ------------------
  |  Branch (254:9): [True: 207k, False: 2.57M]
  ------------------
  255|   207k|      const int br_ctx = get_br_ctx_eob(pos, bhl, tx_class);
  256|   207k|      cdf = ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx];
  ------------------
  |  |   34|   207k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 185k, False: 22.1k]
  |  |  ------------------
  ------------------
  257|   372k|      for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
  ------------------
  |  |   49|   372k|#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1))
  |  |  ------------------
  |  |  |  |   48|   372k|#define BR_CDF_SIZE (4)
  |  |  ------------------
  ------------------
                    for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
  ------------------
  |  |   48|   164k|#define BR_CDF_SIZE (4)
  ------------------
  |  Branch (257:25): [True: 347k, False: 24.3k]
  ------------------
  258|   347k|        const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR);
  ------------------
  |  |   51|   347k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  259|   347k|        level += k;
  260|   347k|        if (k < BR_CDF_SIZE - 1) break;
  ------------------
  |  |   48|   347k|#define BR_CDF_SIZE (4)
  ------------------
  |  Branch (260:13): [True: 183k, False: 164k]
  ------------------
  261|   347k|      }
  262|   207k|    }
  263|  2.77M|    levels[get_padded_idx(pos, bhl)] = level;
  264|  2.77M|  }
  265|  2.77M|  if (*eob > 1) {
  ------------------
  |  Branch (265:7): [True: 2.30M, False: 474k]
  ------------------
  266|  2.30M|    base_cdf_arr base_cdf = ec_ctx->coeff_base_cdf[txs_ctx][plane_type];
  267|  2.30M|    br_cdf_arr br_cdf =
  268|  2.30M|        ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type];
  ------------------
  |  |   34|  2.30M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.18M, False: 121k]
  |  |  ------------------
  ------------------
  269|  2.30M|    if (tx_class == TX_CLASS_2D) {
  ------------------
  |  Branch (269:9): [True: 2.17M, False: 132k]
  ------------------
  270|  2.17M|      read_coeffs_reverse_2d(r, tx_size, 1, *eob - 1 - 1, scan, bhl, levels,
  271|  2.17M|                             base_cdf, br_cdf);
  272|  2.17M|      read_coeffs_reverse(r, tx_size, tx_class, 0, 0, scan, bhl, levels,
  273|  2.17M|                          base_cdf, br_cdf);
  274|  2.17M|    } else {
  275|   132k|      read_coeffs_reverse(r, tx_size, tx_class, 0, *eob - 1 - 1, scan, bhl,
  276|   132k|                          levels, base_cdf, br_cdf);
  277|   132k|    }
  278|  2.30M|  }
  279|       |
  280|  64.9M|  for (int c = 0; c < *eob; ++c) {
  ------------------
  |  Branch (280:19): [True: 62.1M, False: 2.77M]
  ------------------
  281|  62.1M|    const int pos = scan[c];
  282|  62.1M|    uint8_t sign;
  283|  62.1M|    tran_low_t level = levels[get_padded_idx(pos, bhl)];
  284|  62.1M|    if (level) {
  ------------------
  |  Branch (284:9): [True: 26.2M, False: 35.9M]
  ------------------
  285|  26.2M|      *max_scan_line = AOMMAX(*max_scan_line, pos);
  ------------------
  |  |   35|  26.2M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 12.6M, False: 13.6M]
  |  |  ------------------
  ------------------
  286|  26.2M|      if (c == 0) {
  ------------------
  |  Branch (286:11): [True: 2.25M, False: 23.9M]
  ------------------
  287|  2.25M|        const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
  288|  2.25M|        sign = aom_read_symbol(r, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx],
  ------------------
  |  |   51|  2.25M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  289|  2.25M|                               2, ACCT_STR);
  290|  23.9M|      } else {
  291|  23.9M|        sign = aom_read_bit(r, ACCT_STR);
  ------------------
  |  |   43|  23.9M|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  292|  23.9M|      }
  293|  26.2M|      if (level >= MAX_BASE_BR_RANGE) {
  ------------------
  |  |   53|  26.2M|#define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1)
  |  |  ------------------
  |  |  |  |   49|  26.2M|#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1))
  |  |  |  |  ------------------
  |  |  |  |  |  |   48|  26.2M|#define BR_CDF_SIZE (4)
  |  |  |  |  ------------------
  |  |  ------------------
  |  |               #define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1)
  |  |  ------------------
  |  |  |  |   46|  26.2M|#define NUM_BASE_LEVELS 2
  |  |  ------------------
  ------------------
  |  Branch (293:11): [True: 559k, False: 25.6M]
  ------------------
  294|   559k|        level += read_golomb(xd, r);
  295|   559k|      }
  296|       |
  297|  26.2M|      if (c == 0) dc_val = sign ? -level : level;
  ------------------
  |  Branch (297:11): [True: 2.25M, False: 23.9M]
  |  Branch (297:28): [True: 1.17M, False: 1.08M]
  ------------------
  298|       |
  299|       |      // Bitmasking to clamp level to valid range:
  300|       |      //   The valid range for 8/10/12 bit vdieo is at most 14/16/18 bit
  301|  26.2M|      level &= 0xfffff;
  302|  26.2M|      cul_level += level;
  303|  26.2M|      tran_low_t dq_coeff;
  304|       |      // Bitmasking to clamp dq_coeff to valid range:
  305|       |      //   The valid range for 8/10/12 bit video is at most 17/19/21 bit
  306|  26.2M|      dq_coeff =
  307|  26.2M|          (tran_low_t)((int64_t)level * get_dqv(dequant, scan[c], iqmatrix) &
  308|  26.2M|                       0xffffff);
  309|  26.2M|      dq_coeff = dq_coeff >> shift;
  310|  26.2M|      if (sign) {
  ------------------
  |  Branch (310:11): [True: 13.2M, False: 12.9M]
  ------------------
  311|  13.2M|        dq_coeff = -dq_coeff;
  312|  13.2M|      }
  313|  26.2M|      tcoeffs[pos] = clamp(dq_coeff, min_value, max_value);
  314|  26.2M|    }
  315|  62.1M|  }
  316|       |
  317|  2.77M|  cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
  ------------------
  |  |   34|  2.77M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.43M, False: 1.34M]
  |  |  ------------------
  ------------------
  318|       |
  319|       |  // DC value
  320|  2.77M|  set_dc_sign(&cul_level, dc_val);
  321|       |
  322|  2.77M|  return cul_level;
  323|  2.77M|}
decodetxb.c:rec_eob_pos:
   45|  2.77M|static inline int rec_eob_pos(const int eob_token, const int extra) {
   46|  2.77M|  int eob = av1_eob_group_start[eob_token];
   47|  2.77M|  if (eob > 2) {
  ------------------
  |  Branch (47:7): [True: 2.21M, False: 567k]
  ------------------
   48|  2.21M|    eob += extra;
   49|  2.21M|  }
   50|  2.77M|  return eob;
   51|  2.77M|}
decodetxb.c:read_coeffs_reverse_2d:
   67|  2.17M|                                          br_cdf_arr br_cdf) {
   68|  55.6M|  for (int c = end_si; c >= start_si; --c) {
  ------------------
  |  Branch (68:24): [True: 53.4M, False: 2.17M]
  ------------------
   69|  53.4M|    const int pos = scan[c];
   70|  53.4M|    const int coeff_ctx = get_lower_levels_ctx_2d(levels, pos, bhl, tx_size);
   71|  53.4M|    const int nsymbs = 4;
   72|  53.4M|    int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR);
  ------------------
  |  |   51|  53.4M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   73|  53.4M|    if (level > NUM_BASE_LEVELS) {
  ------------------
  |  |   46|  53.4M|#define NUM_BASE_LEVELS 2
  ------------------
  |  Branch (73:9): [True: 5.18M, False: 48.2M]
  ------------------
   74|  5.18M|      const int br_ctx = get_br_ctx_2d(levels, pos, bhl);
   75|  5.18M|      aom_cdf_prob *cdf = br_cdf[br_ctx];
   76|  9.19M|      for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
  ------------------
  |  |   49|  9.19M|#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1))
  |  |  ------------------
  |  |  |  |   48|  9.19M|#define BR_CDF_SIZE (4)
  |  |  ------------------
  ------------------
                    for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
  ------------------
  |  |   48|  4.00M|#define BR_CDF_SIZE (4)
  ------------------
  |  Branch (76:25): [True: 8.80M, False: 383k]
  ------------------
   77|  8.80M|        const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR);
  ------------------
  |  |   51|  8.80M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   78|  8.80M|        level += k;
   79|  8.80M|        if (k < BR_CDF_SIZE - 1) break;
  ------------------
  |  |   48|  8.80M|#define BR_CDF_SIZE (4)
  ------------------
  |  Branch (79:13): [True: 4.80M, False: 4.00M]
  ------------------
   80|  8.80M|      }
   81|  5.18M|    }
   82|  53.4M|    levels[get_padded_idx(pos, bhl)] = level;
   83|  53.4M|  }
   84|  2.17M|}
decodetxb.c:read_coeffs_reverse:
   90|  2.30M|                                       br_cdf_arr br_cdf) {
   91|  7.59M|  for (int c = end_si; c >= start_si; --c) {
  ------------------
  |  Branch (91:24): [True: 5.29M, False: 2.30M]
  ------------------
   92|  5.29M|    const int pos = scan[c];
   93|  5.29M|    const int coeff_ctx =
   94|  5.29M|        get_lower_levels_ctx(levels, pos, bhl, tx_size, tx_class);
   95|  5.29M|    const int nsymbs = 4;
   96|  5.29M|    int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR);
  ------------------
  |  |   51|  5.29M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   97|  5.29M|    if (level > NUM_BASE_LEVELS) {
  ------------------
  |  |   46|  5.29M|#define NUM_BASE_LEVELS 2
  ------------------
  |  Branch (97:9): [True: 1.11M, False: 4.18M]
  ------------------
   98|  1.11M|      const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class);
   99|  1.11M|      aom_cdf_prob *cdf = br_cdf[br_ctx];
  100|  2.23M|      for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
  ------------------
  |  |   49|  2.23M|#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1))
  |  |  ------------------
  |  |  |  |   48|  2.23M|#define BR_CDF_SIZE (4)
  |  |  ------------------
  ------------------
                    for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
  ------------------
  |  |   48|  1.12M|#define BR_CDF_SIZE (4)
  ------------------
  |  Branch (100:25): [True: 2.09M, False: 140k]
  ------------------
  101|  2.09M|        const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR);
  ------------------
  |  |   51|  2.09M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  102|  2.09M|        level += k;
  103|  2.09M|        if (k < BR_CDF_SIZE - 1) break;
  ------------------
  |  |   48|  2.09M|#define BR_CDF_SIZE (4)
  ------------------
  |  Branch (103:13): [True: 970k, False: 1.12M]
  ------------------
  104|  2.09M|      }
  105|  1.11M|    }
  106|  5.29M|    levels[get_padded_idx(pos, bhl)] = level;
  107|  5.29M|  }
  108|  2.30M|}
decodetxb.c:read_golomb:
   22|   559k|static int read_golomb(MACROBLOCKD *xd, aom_reader *r) {
   23|   559k|  int x = 1;
   24|   559k|  int length = 0;
   25|   559k|  int i = 0;
   26|       |
   27|  2.08M|  while (!i) {
  ------------------
  |  Branch (27:10): [True: 1.52M, False: 559k]
  ------------------
   28|  1.52M|    i = aom_read_bit(r, ACCT_STR);
  ------------------
  |  |   43|  1.52M|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   29|  1.52M|    ++length;
   30|  1.52M|    if (length > 20) {
  ------------------
  |  Branch (30:9): [True: 35, False: 1.52M]
  ------------------
   31|     35|      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
   32|     35|                         "Invalid length in read_golomb");
   33|     35|      break;
   34|     35|    }
   35|  1.52M|  }
   36|       |
   37|  1.52M|  for (i = 0; i < length - 1; ++i) {
  ------------------
  |  Branch (37:15): [True: 963k, False: 559k]
  ------------------
   38|   963k|    x <<= 1;
   39|   963k|    x += aom_read_bit(r, ACCT_STR);
  ------------------
  |  |   43|   963k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   40|   963k|  }
   41|       |
   42|   559k|  return x - 1;
   43|   559k|}
decodetxb.c:get_dqv:
   54|  26.2M|                          const qm_val_t *iqmatrix) {
   55|  26.2M|  int dqv = dequant[!!coeff_idx];
   56|  26.2M|  if (iqmatrix != NULL)
  ------------------
  |  Branch (56:7): [True: 5.52M, False: 20.7M]
  ------------------
   57|  5.52M|    dqv =
   58|  5.52M|        ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
  ------------------
  |  |   62|  5.52M|#define AOM_QM_BITS 5
  ------------------
                      ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
  ------------------
  |  |   62|  5.52M|#define AOM_QM_BITS 5
  ------------------
   59|  26.2M|  return dqv;
   60|  26.2M|}

av1_decode_palette_tokens:
   66|  62.4k|                               aom_reader *r) {
   67|  62.4k|  assert(plane == 0 || plane == 1);
   68|  62.4k|  Av1ColorMapParam params;
   69|  62.4k|  params.color_map =
   70|  62.4k|      xd->plane[plane].color_index_map + xd->color_index_map_offset[plane];
   71|  62.4k|  params.map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf
  ------------------
  |  Branch (71:20): [True: 14.5k, False: 47.9k]
  ------------------
   72|  62.4k|                         : xd->tile_ctx->palette_y_color_index_cdf;
   73|  62.4k|  const MB_MODE_INFO *const mbmi = xd->mi[0];
   74|  62.4k|  params.n_colors = mbmi->palette_mode_info.palette_size[plane];
   75|  62.4k|  av1_get_block_dimensions(mbmi->bsize, plane, xd, &params.plane_width,
   76|  62.4k|                           &params.plane_height, &params.rows, &params.cols);
   77|  62.4k|  decode_color_map_tokens(&params, r);
   78|  62.4k|}
detokenize.c:decode_color_map_tokens:
   25|  62.4k|static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) {
   26|  62.4k|  uint8_t color_order[PALETTE_MAX_SIZE];
   27|  62.4k|  const int n = param->n_colors;
   28|  62.4k|  uint8_t *const color_map = param->color_map;
   29|  62.4k|  MapCdf color_map_cdf = param->map_cdf;
   30|  62.4k|  int plane_block_width = param->plane_width;
   31|  62.4k|  int plane_block_height = param->plane_height;
   32|  62.4k|  int rows = param->rows;
   33|  62.4k|  int cols = param->cols;
   34|       |
   35|       |  // The first color index.
   36|  62.4k|  color_map[0] = av1_read_uniform(r, n);
   37|  62.4k|  assert(color_map[0] < n);
   38|       |
   39|       |  // Run wavefront on the palette map index decoding.
   40|  1.53M|  for (int i = 1; i < rows + cols - 1; ++i) {
  ------------------
  |  Branch (40:19): [True: 1.47M, False: 62.4k]
  ------------------
   41|  12.2M|    for (int j = AOMMIN(i, cols - 1); j >= AOMMAX(0, i - rows + 1); --j) {
  ------------------
  |  |   34|  1.47M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 758k, False: 717k]
  |  |  ------------------
  ------------------
                  for (int j = AOMMIN(i, cols - 1); j >= AOMMAX(0, i - rows + 1); --j) {
  ------------------
  |  |   35|  12.2M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 5.05M, False: 7.22M]
  |  |  ------------------
  ------------------
  |  Branch (41:39): [True: 10.8M, False: 1.47M]
  ------------------
   42|  10.8M|      const int color_ctx = av1_get_palette_color_index_context(
   43|  10.8M|          color_map, plane_block_width, (i - j), j, n, color_order, NULL);
   44|  10.8M|      const int color_idx = aom_read_symbol(
  ------------------
  |  |   51|  10.8M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   45|  10.8M|          r, color_map_cdf[n - PALETTE_MIN_SIZE][color_ctx], n, ACCT_STR);
   46|  10.8M|      assert(color_idx >= 0 && color_idx < n);
   47|  10.8M|      color_map[(i - j) * plane_block_width + j] = color_order[color_idx];
   48|  10.8M|    }
   49|  1.47M|  }
   50|       |  // Copy last column to extra columns.
   51|  62.4k|  if (cols < plane_block_width) {
  ------------------
  |  Branch (51:7): [True: 365, False: 62.1k]
  ------------------
   52|  12.8k|    for (int i = 0; i < rows; ++i) {
  ------------------
  |  Branch (52:21): [True: 12.4k, False: 365]
  ------------------
   53|  12.4k|      memset(color_map + i * plane_block_width + cols,
   54|  12.4k|             color_map[i * plane_block_width + cols - 1],
   55|  12.4k|             (plane_block_width - cols));
   56|  12.4k|    }
   57|    365|  }
   58|       |  // Copy last row to extra rows.
   59|  65.9k|  for (int i = rows; i < plane_block_height; ++i) {
  ------------------
  |  Branch (59:22): [True: 3.48k, False: 62.4k]
  ------------------
   60|  3.48k|    memcpy(color_map + i * plane_block_width,
   61|  3.48k|           color_map + (rows - 1) * plane_block_width, plane_block_width);
   62|  3.48k|  }
   63|  62.4k|}

aom_get_num_layers_from_operating_point_idc:
   31|  55.7k|    unsigned int *number_temporal_layers) {
   32|       |  // derive number of spatial/temporal layers from operating_point_idc
   33|       |
   34|  55.7k|  if (!number_spatial_layers || !number_temporal_layers)
  ------------------
  |  Branch (34:7): [True: 0, False: 55.7k]
  |  Branch (34:33): [True: 0, False: 55.7k]
  ------------------
   35|      0|    return AOM_CODEC_INVALID_PARAM;
   36|       |
   37|  55.7k|  if (operating_point_idc == 0) {
  ------------------
  |  Branch (37:7): [True: 46.3k, False: 9.42k]
  ------------------
   38|  46.3k|    *number_temporal_layers = 1;
   39|  46.3k|    *number_spatial_layers = 1;
   40|  46.3k|  } else {
   41|  9.42k|    *number_spatial_layers = 0;
   42|  9.42k|    *number_temporal_layers = 0;
   43|  47.1k|    for (int j = 0; j < MAX_NUM_SPATIAL_LAYERS; j++) {
  ------------------
  |  |   71|  47.1k|#define MAX_NUM_SPATIAL_LAYERS 4
  ------------------
  |  Branch (43:21): [True: 37.7k, False: 9.42k]
  ------------------
   44|  37.7k|      *number_spatial_layers +=
   45|  37.7k|          (operating_point_idc >> (j + MAX_NUM_TEMPORAL_LAYERS)) & 0x1;
  ------------------
  |  |   70|  37.7k|#define MAX_NUM_TEMPORAL_LAYERS 8
  ------------------
   46|  37.7k|    }
   47|  84.8k|    for (int j = 0; j < MAX_NUM_TEMPORAL_LAYERS; j++) {
  ------------------
  |  |   70|  84.8k|#define MAX_NUM_TEMPORAL_LAYERS 8
  ------------------
  |  Branch (47:21): [True: 75.4k, False: 9.42k]
  ------------------
   48|  75.4k|      *number_temporal_layers += (operating_point_idc >> j) & 0x1;
   49|  75.4k|    }
   50|  9.42k|  }
   51|       |
   52|  55.7k|  return AOM_CODEC_OK;
   53|  55.7k|}
aom_decode_frame_from_obus:
  867|  29.0k|                               const uint8_t **p_data_end) {
  868|  29.0k|  AV1_COMMON *const cm = &pbi->common;
  869|  29.0k|  int frame_decoding_finished = 0;
  870|  29.0k|  int is_first_tg_obu_received = 1;
  871|       |  // Whenever pbi->seen_frame_header is set to 1, frame_header is set to the
  872|       |  // beginning of the frame_header_obu and frame_header_size is set to its
  873|       |  // size. This allows us to check if a redundant frame_header_obu is a copy
  874|       |  // of the previous frame_header_obu.
  875|       |  //
  876|       |  // Initialize frame_header to a dummy nonnull pointer, otherwise the Clang
  877|       |  // Static Analyzer in clang 7.0.1 will falsely warn that a null pointer is
  878|       |  // passed as an argument to a 'nonnull' parameter of memcmp(). The initial
  879|       |  // value will not be used.
  880|  29.0k|  const uint8_t *frame_header = data;
  881|  29.0k|  uint32_t frame_header_size = 0;
  882|  29.0k|  ObuHeader obu_header;
  883|  29.0k|  memset(&obu_header, 0, sizeof(obu_header));
  884|  29.0k|  pbi->seen_frame_header = 0;
  885|  29.0k|  pbi->next_start_tile = 0;
  886|  29.0k|  pbi->num_tile_groups = 0;
  887|       |
  888|  29.0k|  if (data_end < data) {
  ------------------
  |  Branch (888:7): [True: 0, False: 29.0k]
  ------------------
  889|      0|    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  890|      0|    return -1;
  891|      0|  }
  892|       |
  893|       |  // Reset pbi->camera_frame_header_ready to 0 if cm->tiles.large_scale = 0.
  894|  29.0k|  if (!cm->tiles.large_scale) pbi->camera_frame_header_ready = 0;
  ------------------
  |  Branch (894:7): [True: 29.0k, False: 0]
  ------------------
  895|       |
  896|       |  // decode frame as a series of OBUs
  897|  78.0k|  while (!frame_decoding_finished && pbi->error.error_code == AOM_CODEC_OK) {
  ------------------
  |  Branch (897:10): [True: 65.2k, False: 12.7k]
  |  Branch (897:38): [True: 65.2k, False: 3]
  ------------------
  898|  65.2k|    struct aom_read_bit_buffer rb;
  899|  65.2k|    size_t payload_size = 0;
  900|  65.2k|    size_t decoded_payload_size = 0;
  901|  65.2k|    size_t obu_payload_offset = 0;
  902|  65.2k|    size_t bytes_read = 0;
  903|  65.2k|    const size_t bytes_available = data_end - data;
  904|       |
  905|  65.2k|    if (bytes_available == 0 && !pbi->seen_frame_header) {
  ------------------
  |  Branch (905:9): [True: 990, False: 64.2k]
  |  Branch (905:33): [True: 984, False: 6]
  ------------------
  906|    984|      *p_data_end = data;
  907|    984|      pbi->error.error_code = AOM_CODEC_OK;
  908|    984|      break;
  909|    984|    }
  910|       |
  911|  64.2k|    aom_codec_err_t status =
  912|  64.2k|        aom_read_obu_header_and_size(data, bytes_available, pbi->is_annexb,
  913|  64.2k|                                     &obu_header, &payload_size, &bytes_read);
  914|       |
  915|  64.2k|    if (status != AOM_CODEC_OK) {
  ------------------
  |  Branch (915:9): [True: 700, False: 63.5k]
  ------------------
  916|    700|      pbi->error.error_code = status;
  917|    700|      return -1;
  918|    700|    }
  919|       |
  920|       |    // Record obu size header information.
  921|  63.5k|    pbi->obu_size_hdr.data = data + obu_header.size;
  922|  63.5k|    pbi->obu_size_hdr.size = bytes_read - obu_header.size;
  923|       |
  924|       |    // Note: aom_read_obu_header_and_size() takes care of checking that this
  925|       |    // doesn't cause 'data' to advance past 'data_end'.
  926|  63.5k|    data += bytes_read;
  927|       |
  928|  63.5k|    if ((size_t)(data_end - data) < payload_size) {
  ------------------
  |  Branch (928:9): [True: 73, False: 63.4k]
  ------------------
  929|     73|      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  930|     73|      return -1;
  931|     73|    }
  932|       |
  933|  63.4k|    cm->temporal_layer_id = obu_header.temporal_layer_id;
  934|  63.4k|    cm->spatial_layer_id = obu_header.spatial_layer_id;
  935|       |
  936|  63.4k|    if (obu_header.type != OBU_TEMPORAL_DELIMITER &&
  ------------------
  |  Branch (936:9): [True: 47.0k, False: 16.4k]
  ------------------
  937|  47.0k|        obu_header.type != OBU_SEQUENCE_HEADER) {
  ------------------
  |  Branch (937:9): [True: 28.2k, False: 18.8k]
  ------------------
  938|       |      // don't decode obu if it's not in current operating mode
  939|  28.2k|      if (!is_obu_in_current_operating_point(pbi, &obu_header)) {
  ------------------
  |  Branch (939:11): [True: 279, False: 27.9k]
  ------------------
  940|    279|        data += payload_size;
  941|    279|        continue;
  942|    279|      }
  943|  28.2k|    }
  944|       |
  945|  63.2k|    av1_init_read_bit_buffer(pbi, &rb, data, data + payload_size);
  946|       |
  947|  63.2k|    switch (obu_header.type) {
  948|  16.4k|      case OBU_TEMPORAL_DELIMITER:
  ------------------
  |  Branch (948:7): [True: 16.4k, False: 46.7k]
  ------------------
  949|  16.4k|        decoded_payload_size = read_temporal_delimiter_obu();
  950|  16.4k|        if (pbi->seen_frame_header) {
  ------------------
  |  Branch (950:13): [True: 0, False: 16.4k]
  ------------------
  951|       |          // A new temporal unit has started, but the frame in the previous
  952|       |          // temporal unit is incomplete.
  953|      0|          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  954|      0|          return -1;
  955|      0|        }
  956|  16.4k|        break;
  957|  18.8k|      case OBU_SEQUENCE_HEADER:
  ------------------
  |  Branch (957:7): [True: 18.8k, False: 44.3k]
  ------------------
  958|  18.8k|        decoded_payload_size = read_sequence_header_obu(pbi, &rb);
  959|  18.8k|        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
  ------------------
  |  Branch (959:13): [True: 175, False: 18.6k]
  ------------------
  960|       |        // The sequence header should not change in the middle of a frame.
  961|  18.6k|        if (pbi->sequence_header_changed && pbi->seen_frame_header) {
  ------------------
  |  Branch (961:13): [True: 269, False: 18.3k]
  |  Branch (961:45): [True: 0, False: 269]
  ------------------
  962|      0|          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  963|      0|          return -1;
  964|      0|        }
  965|  18.6k|        break;
  966|  18.6k|      case OBU_FRAME_HEADER:
  ------------------
  |  Branch (966:7): [True: 302, False: 62.9k]
  ------------------
  967|    305|      case OBU_REDUNDANT_FRAME_HEADER:
  ------------------
  |  Branch (967:7): [True: 3, False: 63.2k]
  ------------------
  968|  26.8k|      case OBU_FRAME:
  ------------------
  |  Branch (968:7): [True: 26.5k, False: 36.6k]
  ------------------
  969|  26.8k|        if (obu_header.type == OBU_REDUNDANT_FRAME_HEADER) {
  ------------------
  |  Branch (969:13): [True: 3, False: 26.8k]
  ------------------
  970|      3|          if (!pbi->seen_frame_header) {
  ------------------
  |  Branch (970:15): [True: 3, False: 0]
  ------------------
  971|      3|            pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  972|      3|            return -1;
  973|      3|          }
  974|  26.8k|        } else {
  975|       |          // OBU_FRAME_HEADER or OBU_FRAME.
  976|  26.8k|          if (pbi->seen_frame_header) {
  ------------------
  |  Branch (976:15): [True: 0, False: 26.8k]
  ------------------
  977|      0|            pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  978|      0|            return -1;
  979|      0|          }
  980|  26.8k|        }
  981|       |        // Only decode first frame header received
  982|  26.8k|        if (!pbi->seen_frame_header ||
  ------------------
  |  Branch (982:13): [True: 26.8k, False: 0]
  ------------------
  983|  26.8k|            (cm->tiles.large_scale && !pbi->camera_frame_header_ready)) {
  ------------------
  |  Branch (983:14): [True: 0, False: 0]
  |  Branch (983:39): [True: 0, False: 0]
  ------------------
  984|  26.8k|          frame_header_size = read_frame_header_obu(
  985|  26.8k|              pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME);
  986|  26.8k|          frame_header = data;
  987|  26.8k|          pbi->seen_frame_header = 1;
  988|  26.8k|          if (!pbi->ext_tile_debug && cm->tiles.large_scale)
  ------------------
  |  Branch (988:15): [True: 26.3k, False: 501]
  |  Branch (988:39): [True: 0, False: 26.3k]
  ------------------
  989|      0|            pbi->camera_frame_header_ready = 1;
  990|  26.8k|        } else {
  991|       |          // Verify that the frame_header_obu is identical to the original
  992|       |          // frame_header_obu.
  993|      0|          if (frame_header_size > payload_size ||
  ------------------
  |  Branch (993:15): [True: 0, False: 0]
  ------------------
  994|      0|              memcmp(data, frame_header, frame_header_size) != 0) {
  ------------------
  |  Branch (994:15): [True: 0, False: 0]
  ------------------
  995|      0|            pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  996|      0|            return -1;
  997|      0|          }
  998|      0|          assert(rb.bit_offset == 0);
  999|      0|          rb.bit_offset = 8 * frame_header_size;
 1000|      0|        }
 1001|       |
 1002|  26.8k|        decoded_payload_size = frame_header_size;
 1003|  26.8k|        pbi->frame_header_size = frame_header_size;
 1004|  26.8k|        cm->cur_frame->temporal_id = obu_header.temporal_layer_id;
 1005|  26.8k|        cm->cur_frame->spatial_id = obu_header.spatial_layer_id;
 1006|       |
 1007|  26.8k|        if (cm->show_existing_frame) {
  ------------------
  |  Branch (1007:13): [True: 252, False: 26.6k]
  ------------------
 1008|    252|          if (obu_header.type == OBU_FRAME) {
  ------------------
  |  Branch (1008:15): [True: 2, False: 250]
  ------------------
 1009|      2|            pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
 1010|      2|            return -1;
 1011|      2|          }
 1012|    250|          frame_decoding_finished = 1;
 1013|    250|          pbi->seen_frame_header = 0;
 1014|       |
 1015|    250|          if (cm->show_frame &&
  ------------------
  |  Branch (1015:15): [True: 250, False: 0]
  ------------------
 1016|    250|              !cm->seq_params->order_hint_info.enable_order_hint) {
  ------------------
  |  Branch (1016:15): [True: 0, False: 250]
  ------------------
 1017|      0|            ++cm->current_frame.frame_number;
 1018|      0|          }
 1019|    250|          break;
 1020|    252|        }
 1021|       |
 1022|       |        // In large scale tile coding, decode the common camera frame header
 1023|       |        // before any tile list OBU.
 1024|  26.6k|        if (!pbi->ext_tile_debug && pbi->camera_frame_header_ready) {
  ------------------
  |  Branch (1024:13): [True: 26.1k, False: 501]
  |  Branch (1024:37): [True: 0, False: 26.1k]
  ------------------
 1025|      0|          frame_decoding_finished = 1;
 1026|       |          // Skip the rest of the frame data.
 1027|      0|          decoded_payload_size = payload_size;
 1028|       |          // Update data_end.
 1029|      0|          *p_data_end = data_end;
 1030|      0|          break;
 1031|      0|        }
 1032|       |
 1033|  26.6k|        if (obu_header.type != OBU_FRAME) break;
  ------------------
  |  Branch (1033:13): [True: 42, False: 26.5k]
  ------------------
 1034|  26.5k|        obu_payload_offset = frame_header_size;
 1035|       |        // Byte align the reader before reading the tile group.
 1036|       |        // byte_alignment() has set pbi->error.error_code if it returns -1.
 1037|  26.5k|        if (byte_alignment(cm, &rb)) return -1;
  ------------------
  |  Branch (1037:13): [True: 211, False: 26.3k]
  ------------------
 1038|  26.3k|        AOM_FALLTHROUGH_INTENDED;  // fall through to read tile group.
  ------------------
  |  |   52|  26.3k|  do {                           \
  |  |   53|  26.3k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (53:12): [Folded, False: 26.3k]
  |  |  ------------------
  ------------------
 1039|  26.4k|      case OBU_TILE_GROUP:
  ------------------
  |  Branch (1039:7): [True: 47, False: 63.1k]
  ------------------
 1040|  26.4k|        if (!pbi->seen_frame_header) {
  ------------------
  |  Branch (1040:13): [True: 5, False: 26.4k]
  ------------------
 1041|      5|          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
 1042|      5|          return -1;
 1043|      5|        }
 1044|  26.4k|        if (obu_payload_offset > payload_size) {
  ------------------
  |  Branch (1044:13): [True: 0, False: 26.4k]
  ------------------
 1045|      0|          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
 1046|      0|          return -1;
 1047|      0|        }
 1048|  26.4k|        decoded_payload_size += read_one_tile_group_obu(
 1049|  26.4k|            pbi, &rb, is_first_tg_obu_received, data + obu_payload_offset,
 1050|  26.4k|            data + payload_size, p_data_end, &frame_decoding_finished,
 1051|  26.4k|            obu_header.type == OBU_FRAME);
 1052|  26.4k|        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
  ------------------
  |  Branch (1052:13): [True: 30, False: 26.3k]
  ------------------
 1053|  26.3k|        is_first_tg_obu_received = 0;
 1054|  26.3k|        if (frame_decoding_finished) {
  ------------------
  |  Branch (1054:13): [True: 12.5k, False: 13.8k]
  ------------------
 1055|  12.5k|          pbi->seen_frame_header = 0;
 1056|  12.5k|          pbi->next_start_tile = 0;
 1057|  12.5k|        }
 1058|  26.3k|        pbi->num_tile_groups++;
 1059|  26.3k|        break;
 1060|     54|      case OBU_METADATA:
  ------------------
  |  Branch (1060:7): [True: 54, False: 63.1k]
  ------------------
 1061|     54|        decoded_payload_size = read_metadata(pbi, data, payload_size);
 1062|     54|        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
  ------------------
  |  Branch (1062:13): [True: 22, False: 32]
  ------------------
 1063|     32|        break;
 1064|     32|      case OBU_TILE_LIST:
  ------------------
  |  Branch (1064:7): [True: 3, False: 63.2k]
  ------------------
 1065|      3|        if (CONFIG_NORMAL_TILE_MODE) {
  ------------------
  |  |   54|      3|#define CONFIG_NORMAL_TILE_MODE 0
  |  |  ------------------
  |  |  |  Branch (54:33): [Folded, False: 3]
  |  |  ------------------
  ------------------
 1066|      0|          pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
 1067|      0|          return -1;
 1068|      0|        }
 1069|       |
 1070|       |        // This OBU type is purely for the large scale tile coding mode.
 1071|       |        // The common camera frame header has to be already decoded.
 1072|      3|        if (!pbi->camera_frame_header_ready) {
  ------------------
  |  Branch (1072:13): [True: 3, False: 0]
  ------------------
 1073|      3|          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
 1074|      3|          return -1;
 1075|      3|        }
 1076|       |
 1077|      0|        cm->tiles.large_scale = 1;
 1078|      0|        av1_set_single_tile_decoding_mode(cm);
 1079|      0|        decoded_payload_size =
 1080|      0|            read_and_decode_one_tile_list(pbi, &rb, data, data + payload_size,
 1081|      0|                                          p_data_end, &frame_decoding_finished);
 1082|      0|        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
  ------------------
  |  Branch (1082:13): [True: 0, False: 0]
  ------------------
 1083|      0|        break;
 1084|     24|      case OBU_PADDING:
  ------------------
  |  Branch (1084:7): [True: 24, False: 63.1k]
  ------------------
 1085|     24|        decoded_payload_size = read_padding(cm, data, payload_size);
 1086|     24|        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
  ------------------
  |  Branch (1086:13): [True: 15, False: 9]
  ------------------
 1087|      9|        break;
 1088|    967|      default:
  ------------------
  |  Branch (1088:7): [True: 967, False: 62.2k]
  ------------------
 1089|       |        // Skip unrecognized OBUs
 1090|    967|        if (payload_size > 0 &&
  ------------------
  |  Branch (1090:13): [True: 919, False: 48]
  ------------------
 1091|    919|            get_last_nonzero_byte(data, payload_size) == 0) {
  ------------------
  |  Branch (1091:13): [True: 3, False: 916]
  ------------------
 1092|      3|          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
 1093|      3|          return -1;
 1094|      3|        }
 1095|    964|        decoded_payload_size = payload_size;
 1096|    964|        break;
 1097|  63.2k|    }
 1098|       |
 1099|       |    // Check that the signalled OBU size matches the actual amount of data read
 1100|  48.7k|    if (decoded_payload_size > payload_size) {
  ------------------
  |  Branch (1100:9): [True: 0, False: 48.7k]
  ------------------
 1101|      0|      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
 1102|      0|      return -1;
 1103|      0|    }
 1104|       |
 1105|       |    // If there are extra padding bytes, they should all be zero
 1106|  48.7k|    while (decoded_payload_size < payload_size) {
  ------------------
  |  Branch (1106:12): [True: 43, False: 48.7k]
  ------------------
 1107|     43|      uint8_t padding_byte = data[decoded_payload_size++];
 1108|     43|      if (padding_byte != 0) {
  ------------------
  |  Branch (1108:11): [True: 37, False: 6]
  ------------------
 1109|     37|        pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
 1110|     37|        return -1;
 1111|     37|      }
 1112|     43|    }
 1113|       |
 1114|  48.7k|    data += payload_size;
 1115|  48.7k|  }
 1116|       |
 1117|  13.7k|  if (pbi->error.error_code != AOM_CODEC_OK) return -1;
  ------------------
  |  Branch (1117:7): [True: 5, False: 13.7k]
  ------------------
 1118|  13.7k|  return frame_decoding_finished;
 1119|  13.7k|}
obu.c:is_obu_in_current_operating_point:
   56|  28.2k|                                             const ObuHeader *obu_header) {
   57|  28.2k|  if (!pbi->current_operating_point || !obu_header->has_extension) {
  ------------------
  |  Branch (57:7): [True: 23.1k, False: 5.12k]
  |  Branch (57:40): [True: 3.28k, False: 1.84k]
  ------------------
   58|  26.4k|    return 1;
   59|  26.4k|  }
   60|       |
   61|  1.84k|  if ((pbi->current_operating_point >> obu_header->temporal_layer_id) & 0x1 &&
  ------------------
  |  Branch (61:7): [True: 1.59k, False: 252]
  ------------------
   62|  1.59k|      (pbi->current_operating_point >> (obu_header->spatial_layer_id + 8)) &
  ------------------
  |  Branch (62:7): [True: 1.56k, False: 27]
  ------------------
   63|  1.59k|          0x1) {
   64|  1.56k|    return 1;
   65|  1.56k|  }
   66|    279|  return 0;
   67|  1.84k|}
obu.c:read_temporal_delimiter_obu:
   80|  16.4k|static uint32_t read_temporal_delimiter_obu(void) { return 0; }
obu.c:read_sequence_header_obu:
  105|  18.8k|                                         struct aom_read_bit_buffer *rb) {
  106|  18.8k|  AV1_COMMON *const cm = &pbi->common;
  107|  18.8k|  const uint32_t saved_bit_offset = rb->bit_offset;
  108|       |
  109|       |  // Verify rb has been configured to report errors.
  110|  18.8k|  assert(rb->error_handler);
  111|       |
  112|       |  // Use a local variable to store the information as we decode. At the end,
  113|       |  // if no errors have occurred, cm->seq_params is updated.
  114|  18.8k|  SequenceHeader sh = *cm->seq_params;
  115|  18.8k|  SequenceHeader *const seq_params = &sh;
  116|       |
  117|  18.8k|  seq_params->profile = av1_read_profile(rb);
  118|  18.8k|  if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) {
  ------------------
  |  |   50|  18.8k|#define CONFIG_MAX_DECODE_PROFILE 2
  ------------------
  |  Branch (118:7): [True: 14, False: 18.7k]
  ------------------
  119|     14|    pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
  120|     14|    return 0;
  121|     14|  }
  122|       |
  123|       |  // Still picture or not
  124|  18.7k|  seq_params->still_picture = aom_rb_read_bit(rb);
  125|  18.7k|  seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb);
  126|       |  // Video must have reduced_still_picture_hdr = 0
  127|  18.7k|  if (!seq_params->still_picture && seq_params->reduced_still_picture_hdr) {
  ------------------
  |  Branch (127:7): [True: 9.57k, False: 9.22k]
  |  Branch (127:37): [True: 3, False: 9.57k]
  ------------------
  128|      3|    pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
  129|      3|    return 0;
  130|      3|  }
  131|       |
  132|  18.7k|  if (seq_params->reduced_still_picture_hdr) {
  ------------------
  |  Branch (132:7): [True: 9.12k, False: 9.67k]
  ------------------
  133|  9.12k|    seq_params->timing_info_present = 0;
  134|  9.12k|    seq_params->decoder_model_info_present_flag = 0;
  135|  9.12k|    seq_params->display_model_info_present_flag = 0;
  136|  9.12k|    seq_params->operating_points_cnt_minus_1 = 0;
  137|  9.12k|    seq_params->operating_point_idc[0] = 0;
  138|  9.12k|    seq_params->has_nonzero_operating_point_idc = false;
  139|  9.12k|    if (!read_bitstream_level(&seq_params->seq_level_idx[0], rb)) {
  ------------------
  |  Branch (139:9): [True: 45, False: 9.08k]
  ------------------
  140|     45|      pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
  141|     45|      return 0;
  142|     45|    }
  143|  9.08k|    seq_params->tier[0] = 0;
  144|  9.08k|    seq_params->op_params[0].decoder_model_param_present_flag = 0;
  145|  9.08k|    seq_params->op_params[0].display_model_param_present_flag = 0;
  146|  9.67k|  } else {
  147|  9.67k|    seq_params->timing_info_present = aom_rb_read_bit(rb);
  148|  9.67k|    if (seq_params->timing_info_present) {
  ------------------
  |  Branch (148:9): [True: 84, False: 9.58k]
  ------------------
  149|     84|      av1_read_timing_info_header(&seq_params->timing_info, &pbi->error, rb);
  150|       |
  151|     84|      seq_params->decoder_model_info_present_flag = aom_rb_read_bit(rb);
  152|     84|      if (seq_params->decoder_model_info_present_flag)
  ------------------
  |  Branch (152:11): [True: 21, False: 63]
  ------------------
  153|     21|        av1_read_decoder_model_info(&seq_params->decoder_model_info, rb);
  154|  9.58k|    } else {
  155|  9.58k|      seq_params->decoder_model_info_present_flag = 0;
  156|  9.58k|    }
  157|  9.67k|    seq_params->display_model_info_present_flag = aom_rb_read_bit(rb);
  158|  9.67k|    seq_params->operating_points_cnt_minus_1 =
  159|  9.67k|        aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS);
  ------------------
  |  |   93|  9.67k|#define OP_POINTS_CNT_MINUS_1_BITS 5
  ------------------
  160|  9.67k|    seq_params->has_nonzero_operating_point_idc = false;
  161|  22.2k|    for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
  ------------------
  |  Branch (161:21): [True: 12.6k, False: 9.59k]
  ------------------
  162|  12.6k|      seq_params->operating_point_idc[i] =
  163|  12.6k|          aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
  ------------------
  |  |   94|  12.6k|#define OP_POINTS_IDC_BITS 12
  ------------------
  164|  12.6k|      if (seq_params->operating_point_idc[i] != 0)
  ------------------
  |  Branch (164:11): [True: 6.01k, False: 6.61k]
  ------------------
  165|  6.01k|        seq_params->has_nonzero_operating_point_idc = true;
  166|  12.6k|      if (!read_bitstream_level(&seq_params->seq_level_idx[i], rb)) {
  ------------------
  |  Branch (166:11): [True: 72, False: 12.5k]
  ------------------
  167|     72|        pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
  168|     72|        return 0;
  169|     72|      }
  170|       |      // This is the seq_level_idx[i] > 7 check in the spec. seq_level_idx 7
  171|       |      // is equivalent to level 3.3.
  172|  12.5k|      if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0)
  ------------------
  |  Branch (172:11): [True: 100, False: 12.4k]
  ------------------
  173|    100|        seq_params->tier[i] = aom_rb_read_bit(rb);
  174|  12.4k|      else
  175|  12.4k|        seq_params->tier[i] = 0;
  176|  12.5k|      if (seq_params->decoder_model_info_present_flag) {
  ------------------
  |  Branch (176:11): [True: 2, False: 12.5k]
  ------------------
  177|      2|        seq_params->op_params[i].decoder_model_param_present_flag =
  178|      2|            aom_rb_read_bit(rb);
  179|      2|        if (seq_params->op_params[i].decoder_model_param_present_flag)
  ------------------
  |  Branch (179:13): [True: 0, False: 2]
  ------------------
  180|      0|          av1_read_op_parameters_info(&seq_params->op_params[i],
  181|      0|                                      seq_params->decoder_model_info
  182|      0|                                          .encoder_decoder_buffer_delay_length,
  183|      0|                                      rb);
  184|  12.5k|      } else {
  185|  12.5k|        seq_params->op_params[i].decoder_model_param_present_flag = 0;
  186|  12.5k|      }
  187|  12.5k|      if (seq_params->timing_info_present &&
  ------------------
  |  Branch (187:11): [True: 88, False: 12.4k]
  ------------------
  188|     88|          (seq_params->timing_info.equal_picture_interval ||
  ------------------
  |  Branch (188:12): [True: 73, False: 15]
  ------------------
  189|     73|           seq_params->op_params[i].decoder_model_param_present_flag)) {
  ------------------
  |  Branch (189:12): [True: 0, False: 15]
  ------------------
  190|     73|        seq_params->op_params[i].bitrate = av1_max_level_bitrate(
  191|     73|            seq_params->profile, seq_params->seq_level_idx[i],
  192|     73|            seq_params->tier[i]);
  193|       |        // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass
  194|       |        // the check
  195|     73|        if (seq_params->op_params[i].bitrate == 0)
  ------------------
  |  Branch (195:13): [True: 0, False: 73]
  ------------------
  196|      0|          aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
  197|      0|                             "AV1 does not support this combination of "
  198|      0|                             "profile, level, and tier.");
  199|       |        // Buffer size in bits/s is bitrate in bits/s * 1 s
  200|     73|        seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate;
  201|     73|      }
  202|  12.5k|      if (seq_params->timing_info_present &&
  ------------------
  |  Branch (202:11): [True: 88, False: 12.4k]
  ------------------
  203|     88|          seq_params->timing_info.equal_picture_interval &&
  ------------------
  |  Branch (203:11): [True: 73, False: 15]
  ------------------
  204|     73|          !seq_params->op_params[i].decoder_model_param_present_flag) {
  ------------------
  |  Branch (204:11): [True: 73, False: 0]
  ------------------
  205|       |        // When the decoder_model_parameters are not sent for this op, set
  206|       |        // the default ones that can be used with the resource availability mode
  207|     73|        seq_params->op_params[i].decoder_buffer_delay = 70000;
  208|     73|        seq_params->op_params[i].encoder_buffer_delay = 20000;
  209|     73|        seq_params->op_params[i].low_delay_mode_flag = 0;
  210|     73|      }
  211|       |
  212|  12.5k|      if (seq_params->display_model_info_present_flag) {
  ------------------
  |  Branch (212:11): [True: 149, False: 12.4k]
  ------------------
  213|    149|        seq_params->op_params[i].display_model_param_present_flag =
  214|    149|            aom_rb_read_bit(rb);
  215|    149|        if (seq_params->op_params[i].display_model_param_present_flag) {
  ------------------
  |  Branch (215:13): [True: 19, False: 130]
  ------------------
  216|     19|          seq_params->op_params[i].initial_display_delay =
  217|     19|              aom_rb_read_literal(rb, 4) + 1;
  218|     19|          if (seq_params->op_params[i].initial_display_delay > 10)
  ------------------
  |  Branch (218:15): [True: 5, False: 14]
  ------------------
  219|      5|            aom_internal_error(
  220|      5|                &pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
  221|      5|                "AV1 does not support more than 10 decoded frames delay");
  222|    130|        } else {
  223|    130|          seq_params->op_params[i].initial_display_delay = 10;
  224|    130|        }
  225|  12.4k|      } else {
  226|  12.4k|        seq_params->op_params[i].display_model_param_present_flag = 0;
  227|  12.4k|        seq_params->op_params[i].initial_display_delay = 10;
  228|  12.4k|      }
  229|  12.5k|    }
  230|  9.67k|  }
  231|       |  // This decoder supports all levels.  Choose operating point provided by
  232|       |  // external means
  233|  18.6k|  int operating_point = pbi->operating_point;
  234|  18.6k|  if (operating_point < 0 ||
  ------------------
  |  Branch (234:7): [True: 83, False: 18.5k]
  ------------------
  235|  18.5k|      operating_point > seq_params->operating_points_cnt_minus_1)
  ------------------
  |  Branch (235:7): [True: 0, False: 18.5k]
  ------------------
  236|      0|    operating_point = 0;
  237|  18.6k|  pbi->current_operating_point =
  238|  18.6k|      seq_params->operating_point_idc[operating_point];
  239|  18.6k|  if (aom_get_num_layers_from_operating_point_idc(
  ------------------
  |  Branch (239:7): [True: 0, False: 18.6k]
  ------------------
  240|  18.6k|          pbi->current_operating_point, &pbi->number_spatial_layers,
  241|  18.6k|          &pbi->number_temporal_layers) != AOM_CODEC_OK) {
  242|      0|    pbi->error.error_code = AOM_CODEC_ERROR;
  243|      0|    return 0;
  244|      0|  }
  245|       |
  246|  18.6k|  av1_read_sequence_header(cm, rb, seq_params);
  247|       |
  248|  18.6k|  av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &pbi->error);
  249|  18.6k|  if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) &&
  ------------------
  |  Branch (249:9): [True: 9.98k, False: 8.69k]
  |  Branch (249:43): [True: 9.98k, False: 0]
  ------------------
  250|  8.58k|      !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) &&
  ------------------
  |  Branch (250:9): [True: 8.58k, False: 0]
  |  Branch (250:43): [True: 7.37k, False: 1.20k]
  ------------------
  251|  1.20k|      !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) {
  ------------------
  |  Branch (251:9): [True: 1.20k, False: 0]
  |  Branch (251:43): [True: 1.20k, False: 0]
  ------------------
  252|      0|    aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
  253|      0|                       "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported, "
  254|      0|                       "%d %d subsampling is not supported.\n",
  255|      0|                       seq_params->subsampling_x, seq_params->subsampling_y);
  256|      0|  }
  257|       |
  258|  18.6k|  seq_params->film_grain_params_present = aom_rb_read_bit(rb);
  259|       |
  260|  18.6k|  if (av1_check_trailing_bits(pbi, rb) != 0) {
  ------------------
  |  Branch (260:7): [True: 41, False: 18.6k]
  ------------------
  261|       |    // pbi->error.error_code is already set.
  262|     41|    return 0;
  263|     41|  }
  264|       |
  265|       |  // If a sequence header has been decoded before, we check if the new
  266|       |  // one is consistent with the old one.
  267|  18.6k|  if (pbi->sequence_header_ready) {
  ------------------
  |  Branch (267:7): [True: 867, False: 17.7k]
  ------------------
  268|    867|    if (!are_seq_headers_consistent(cm->seq_params, seq_params))
  ------------------
  |  Branch (268:9): [True: 269, False: 598]
  ------------------
  269|    269|      pbi->sequence_header_changed = 1;
  270|    867|  }
  271|       |
  272|  18.6k|  *cm->seq_params = *seq_params;
  273|  18.6k|  pbi->sequence_header_ready = 1;
  274|       |
  275|  18.6k|  return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
  276|  18.6k|}
obu.c:read_bitstream_level:
   84|  21.7k|                                struct aom_read_bit_buffer *rb) {
   85|  21.7k|  *seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS);
  ------------------
  |  |  464|  21.7k|#define LEVEL_BITS 5
  ------------------
   86|  21.7k|  if (!is_valid_seq_level_idx(*seq_level_idx)) return 0;
  ------------------
  |  Branch (86:7): [True: 117, False: 21.6k]
  ------------------
   87|  21.6k|  return 1;
   88|  21.7k|}
obu.c:are_seq_headers_consistent:
   96|    867|                                      const SequenceHeader *seq_params_new) {
   97|    867|  return !memcmp(seq_params_old, seq_params_new,
   98|       |                 offsetof(SequenceHeader, op_params));
   99|    867|}
obu.c:read_frame_header_obu:
  285|  26.8k|                                      int trailing_bits_present) {
  286|  26.8k|  const uint32_t hdr_size =
  287|  26.8k|      av1_decode_frame_headers_and_setup(pbi, rb, trailing_bits_present);
  288|  26.8k|  const AV1_COMMON *cm = &pbi->common;
  289|  26.8k|  if (cm->show_existing_frame) {
  ------------------
  |  Branch (289:7): [True: 252, False: 26.6k]
  ------------------
  290|    252|    *p_data_end = data + hdr_size;
  291|    252|  }
  292|  26.8k|  return hdr_size;
  293|  26.8k|}
obu.c:byte_alignment:
   70|  51.9k|                          struct aom_read_bit_buffer *const rb) {
   71|   125k|  while (rb->bit_offset & 7) {
  ------------------
  |  Branch (71:10): [True: 74.2k, False: 51.7k]
  ------------------
   72|  74.2k|    if (aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (72:9): [True: 241, False: 73.9k]
  ------------------
   73|    241|      cm->error->error_code = AOM_CODEC_CORRUPT_FRAME;
   74|    241|      return -1;
   75|    241|    }
   76|  74.2k|  }
   77|  51.7k|  return 0;
   78|  51.9k|}
obu.c:read_one_tile_group_obu:
  354|  25.9k|    int *is_last_tg, int tile_start_implicit) {
  355|  25.9k|  AV1_COMMON *const cm = &pbi->common;
  356|  25.9k|  int start_tile, end_tile;
  357|  25.9k|  int32_t header_size, tg_payload_size;
  358|       |
  359|  25.9k|  assert((rb->bit_offset & 7) == 0);
  360|  25.9k|  assert(rb->bit_buffer + aom_rb_bytes_read(rb) == data);
  361|       |
  362|  25.9k|  header_size = read_tile_group_header(pbi, rb, &start_tile, &end_tile,
  363|  25.9k|                                       tile_start_implicit);
  364|  25.9k|  if (header_size == -1 || byte_alignment(cm, rb)) return 0;
  ------------------
  |  Branch (364:7): [True: 16, False: 25.9k]
  |  Branch (364:28): [True: 30, False: 25.8k]
  ------------------
  365|  25.8k|  data += header_size;
  366|  25.8k|  av1_decode_tg_tiles_and_wrapup(pbi, data, data_end, p_data_end, start_tile,
  367|  25.8k|                                 end_tile, is_first_tg);
  368|       |
  369|  25.8k|  tg_payload_size = (uint32_t)(*p_data_end - data);
  370|       |
  371|  25.8k|  *is_last_tg = end_tile == cm->tiles.rows * cm->tiles.cols - 1;
  372|  25.8k|  return header_size + tg_payload_size;
  373|  25.9k|}
obu.c:read_tile_group_header:
  300|  25.9k|                                      int tile_start_implicit) {
  301|  25.9k|  AV1_COMMON *const cm = &pbi->common;
  302|  25.9k|  CommonTileParams *const tiles = &cm->tiles;
  303|  25.9k|  uint32_t saved_bit_offset = rb->bit_offset;
  304|  25.9k|  int tile_start_and_end_present_flag = 0;
  305|  25.9k|  const int num_tiles = tiles->rows * tiles->cols;
  306|       |
  307|  25.9k|  if (!tiles->large_scale && num_tiles > 1) {
  ------------------
  |  Branch (307:7): [True: 25.9k, False: 0]
  |  Branch (307:30): [True: 1.09k, False: 24.8k]
  ------------------
  308|  1.09k|    tile_start_and_end_present_flag = aom_rb_read_bit(rb);
  309|  1.09k|    if (tile_start_implicit && tile_start_and_end_present_flag) {
  ------------------
  |  Branch (309:9): [True: 1.04k, False: 44]
  |  Branch (309:32): [True: 14, False: 1.03k]
  ------------------
  310|     14|      aom_internal_error(
  311|     14|          &pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
  312|     14|          "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0");
  313|     14|      return -1;
  314|     14|    }
  315|  1.09k|  }
  316|  25.9k|  if (tiles->large_scale || num_tiles == 1 ||
  ------------------
  |  Branch (316:7): [True: 2, False: 25.9k]
  |  Branch (316:29): [True: 24.8k, False: 1.07k]
  ------------------
  317|  25.8k|      !tile_start_and_end_present_flag) {
  ------------------
  |  Branch (317:7): [True: 1.03k, False: 40]
  ------------------
  318|  25.8k|    *start_tile = 0;
  319|  25.8k|    *end_tile = num_tiles - 1;
  320|  25.8k|  } else {
  321|     42|    int tile_bits = tiles->log2_rows + tiles->log2_cols;
  322|     42|    *start_tile = aom_rb_read_literal(rb, tile_bits);
  323|     42|    *end_tile = aom_rb_read_literal(rb, tile_bits);
  324|     42|  }
  325|  25.9k|  if (*start_tile != pbi->next_start_tile) {
  ------------------
  |  Branch (325:7): [True: 0, False: 25.9k]
  ------------------
  326|      0|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
  327|      0|                       "tg_start (%d) must be equal to %d", *start_tile,
  328|      0|                       pbi->next_start_tile);
  329|      0|    return -1;
  330|      0|  }
  331|  25.9k|  if (*start_tile > *end_tile) {
  ------------------
  |  Branch (331:7): [True: 0, False: 25.9k]
  ------------------
  332|      0|    aom_internal_error(
  333|      0|        &pbi->error, AOM_CODEC_CORRUPT_FRAME,
  334|      0|        "tg_end (%d) must be greater than or equal to tg_start (%d)", *end_tile,
  335|      0|        *start_tile);
  336|      0|    return -1;
  337|      0|  }
  338|  25.9k|  if (*end_tile >= num_tiles) {
  ------------------
  |  Branch (338:7): [True: 0, False: 25.9k]
  ------------------
  339|      0|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
  340|      0|                       "tg_end (%d) must be less than NumTiles (%d)", *end_tile,
  341|      0|                       num_tiles);
  342|      0|    return -1;
  343|      0|  }
  344|  25.9k|  pbi->next_start_tile = (*end_tile == num_tiles - 1) ? 0 : *end_tile + 1;
  ------------------
  |  Branch (344:26): [True: 25.8k, False: 26]
  ------------------
  345|       |
  346|  25.9k|  return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
  347|  25.9k|}
obu.c:read_metadata:
  787|     54|static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) {
  788|     54|  size_t type_length;
  789|     54|  uint64_t type_value;
  790|     54|  if (aom_uleb_decode(data, sz, &type_value, &type_length) < 0) {
  ------------------
  |  Branch (790:7): [True: 5, False: 49]
  ------------------
  791|      5|    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  792|      5|    return 0;
  793|      5|  }
  794|     49|  const OBU_METADATA_TYPE metadata_type = (OBU_METADATA_TYPE)type_value;
  795|     49|  if (metadata_type == 0 || metadata_type >= 6) {
  ------------------
  |  Branch (795:7): [True: 5, False: 44]
  |  Branch (795:29): [True: 11, False: 33]
  ------------------
  796|       |    // If metadata_type is reserved for future use or a user private value,
  797|       |    // ignore the entire OBU and just check trailing bits.
  798|     16|    if (get_last_nonzero_byte(data + type_length, sz - type_length) == 0) {
  ------------------
  |  Branch (798:9): [True: 2, False: 14]
  ------------------
  799|      2|      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  800|      2|      return 0;
  801|      2|    }
  802|     14|    return sz;
  803|     16|  }
  804|     33|  if (metadata_type == OBU_METADATA_TYPE_ITUT_T35) {
  ------------------
  |  Branch (804:7): [True: 4, False: 29]
  ------------------
  805|       |    // read_metadata_itut_t35() checks trailing bits.
  806|      4|    read_metadata_itut_t35(pbi, data + type_length, sz - type_length);
  807|      4|    return sz;
  808|     29|  } else if (metadata_type == OBU_METADATA_TYPE_HDR_CLL) {
  ------------------
  |  Branch (808:14): [True: 4, False: 25]
  ------------------
  809|      4|    size_t bytes_read =
  810|      4|        type_length +
  811|      4|        read_metadata_hdr_cll(pbi, data + type_length, sz - type_length);
  812|      4|    if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {
  ------------------
  |  Branch (812:9): [True: 4, False: 0]
  ------------------
  813|      4|      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  814|      4|      return 0;
  815|      4|    }
  816|      0|    return sz;
  817|     25|  } else if (metadata_type == OBU_METADATA_TYPE_HDR_MDCV) {
  ------------------
  |  Branch (817:14): [True: 3, False: 22]
  ------------------
  818|      3|    size_t bytes_read =
  819|      3|        type_length +
  820|      3|        read_metadata_hdr_mdcv(pbi, data + type_length, sz - type_length);
  821|      3|    if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {
  ------------------
  |  Branch (821:9): [True: 0, False: 3]
  ------------------
  822|      0|      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  823|      0|      return 0;
  824|      0|    }
  825|      3|    return sz;
  826|      3|  }
  827|       |
  828|     22|  struct aom_read_bit_buffer rb;
  829|     22|  av1_init_read_bit_buffer(pbi, &rb, data + type_length, data + sz);
  830|     22|  if (metadata_type == OBU_METADATA_TYPE_SCALABILITY) {
  ------------------
  |  Branch (830:7): [True: 2, False: 20]
  ------------------
  831|      2|    read_metadata_scalability(&rb);
  832|     20|  } else {
  833|     20|    assert(metadata_type == OBU_METADATA_TYPE_TIMECODE);
  834|     20|    read_metadata_timecode(&rb);
  835|     20|  }
  836|     22|  if (av1_check_trailing_bits(pbi, &rb) != 0) {
  ------------------
  |  Branch (836:7): [True: 11, False: 11]
  ------------------
  837|       |    // pbi->error.error_code is already set.
  838|     11|    return 0;
  839|     11|  }
  840|     22|  assert((rb.bit_offset & 7) == 0);
  841|     11|  return type_length + (rb.bit_offset >> 3);
  842|     22|}
obu.c:read_metadata_itut_t35:
  631|      4|                                   size_t sz) {
  632|      4|  if (sz == 0) {
  ------------------
  |  Branch (632:7): [True: 0, False: 4]
  ------------------
  633|      0|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
  634|      0|                       "itu_t_t35_country_code is missing");
  635|      0|  }
  636|      4|  int country_code_size = 1;
  637|      4|  if (*data == 0xFF) {
  ------------------
  |  Branch (637:7): [True: 0, False: 4]
  ------------------
  638|      0|    if (sz == 1) {
  ------------------
  |  Branch (638:9): [True: 0, False: 0]
  ------------------
  639|      0|      aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
  640|      0|                         "itu_t_t35_country_code_extension_byte is missing");
  641|      0|    }
  642|      0|    ++country_code_size;
  643|      0|  }
  644|      4|  int end_index = get_last_nonzero_byte_index(data, sz);
  645|      4|  if (end_index < country_code_size) {
  ------------------
  |  Branch (645:7): [True: 0, False: 4]
  ------------------
  646|      0|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
  647|      0|                       "No trailing bits found in ITU-T T.35 metadata OBU");
  648|      0|  }
  649|       |  // itu_t_t35_payload_bytes is byte aligned. Section 6.7.2 of the spec says:
  650|       |  //   itu_t_t35_payload_bytes shall be bytes containing data registered as
  651|       |  //   specified in Recommendation ITU-T T.35.
  652|       |  // Therefore the first trailing byte should be 0x80.
  653|      4|  if (data[end_index] != 0x80) {
  ------------------
  |  Branch (653:7): [True: 2, False: 2]
  ------------------
  654|      2|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
  655|      2|                       "The last nonzero byte of the ITU-T T.35 metadata OBU "
  656|      2|                       "is 0x%02x, should be 0x80.",
  657|      2|                       data[end_index]);
  658|      2|  }
  659|      4|  alloc_read_metadata(pbi, OBU_METADATA_TYPE_ITUT_T35, data, end_index,
  660|      4|                      AOM_MIF_ANY_FRAME);
  661|      4|}
obu.c:get_last_nonzero_byte_index:
  589|      4|static int get_last_nonzero_byte_index(const uint8_t *data, size_t sz) {
  590|       |  // Scan backward and return on the first nonzero byte.
  591|      4|  int i = (int)sz - 1;
  592|     10|  while (i >= 0 && data[i] == 0) {
  ------------------
  |  Branch (592:10): [True: 10, False: 0]
  |  Branch (592:20): [True: 6, False: 4]
  ------------------
  593|      6|    --i;
  594|      6|  }
  595|      4|  return i;
  596|      4|}
obu.c:alloc_read_metadata:
  602|      6|                                aom_metadata_insert_flags_t insert_flag) {
  603|      6|  if (!pbi->metadata) {
  ------------------
  |  Branch (603:7): [True: 6, False: 0]
  ------------------
  604|      6|    pbi->metadata = aom_img_metadata_array_alloc(0);
  605|      6|    if (!pbi->metadata) {
  ------------------
  |  Branch (605:9): [True: 0, False: 6]
  ------------------
  606|      0|      aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
  607|      0|                         "Failed to allocate metadata array");
  608|      0|    }
  609|      6|  }
  610|      6|  aom_metadata_t *metadata =
  611|      6|      aom_img_metadata_alloc(metadata_type, data, sz, insert_flag);
  612|      6|  if (!metadata) {
  ------------------
  |  Branch (612:7): [True: 0, False: 6]
  ------------------
  613|      0|    aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
  614|      0|                       "Error allocating metadata");
  615|      0|  }
  616|      6|  aom_metadata_t **metadata_array = (aom_metadata_t **)realloc(
  617|      6|      pbi->metadata->metadata_array,
  618|      6|      (pbi->metadata->sz + 1) * sizeof(*metadata_array));
  619|      6|  if (!metadata_array) {
  ------------------
  |  Branch (619:7): [True: 0, False: 6]
  ------------------
  620|      0|    aom_img_metadata_free(metadata);
  621|      0|    aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
  622|      0|                       "Error growing metadata array");
  623|      0|  }
  624|      6|  pbi->metadata->metadata_array = metadata_array;
  625|      6|  pbi->metadata->metadata_array[pbi->metadata->sz] = metadata;
  626|      6|  pbi->metadata->sz++;
  627|      6|}
obu.c:read_metadata_hdr_cll:
  666|      4|                                    size_t sz) {
  667|      4|  const size_t kHdrCllPayloadSize = 4;
  668|      4|  if (sz < kHdrCllPayloadSize) {
  ------------------
  |  Branch (668:7): [True: 0, False: 4]
  ------------------
  669|      0|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
  670|      0|                       "Incorrect HDR CLL metadata payload size");
  671|      0|  }
  672|      4|  alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_CLL, data, kHdrCllPayloadSize,
  673|      4|                      AOM_MIF_ANY_FRAME);
  674|      4|  return kHdrCllPayloadSize;
  675|      4|}
obu.c:read_metadata_hdr_mdcv:
  680|      3|                                     size_t sz) {
  681|      3|  const size_t kMdcvPayloadSize = 24;
  682|      3|  if (sz < kMdcvPayloadSize) {
  ------------------
  |  Branch (682:7): [True: 3, False: 0]
  ------------------
  683|      3|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
  684|      3|                       "Incorrect HDR MDCV metadata payload size");
  685|      3|  }
  686|      3|  alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_MDCV, data, kMdcvPayloadSize,
  687|      3|                      AOM_MIF_ANY_FRAME);
  688|      3|  return kMdcvPayloadSize;
  689|      3|}
obu.c:read_metadata_scalability:
  725|      2|static void read_metadata_scalability(struct aom_read_bit_buffer *rb) {
  726|      2|  const int scalability_mode_idc = aom_rb_read_literal(rb, 8);
  727|      2|  if (scalability_mode_idc == SCALABILITY_SS) {
  ------------------
  |  Branch (727:7): [True: 0, False: 2]
  ------------------
  728|      0|    scalability_structure(rb);
  729|      0|  }
  730|      2|}
obu.c:read_metadata_timecode:
  732|     20|static void read_metadata_timecode(struct aom_read_bit_buffer *rb) {
  733|     20|  aom_rb_read_literal(rb, 5);  // counting_type f(5)
  734|     20|  const int full_timestamp_flag =
  735|     20|      aom_rb_read_bit(rb);     // full_timestamp_flag f(1)
  736|     20|  aom_rb_read_bit(rb);         // discontinuity_flag (f1)
  737|     20|  aom_rb_read_bit(rb);         // cnt_dropped_flag f(1)
  738|     20|  aom_rb_read_literal(rb, 9);  // n_frames f(9)
  739|     20|  if (full_timestamp_flag) {
  ------------------
  |  Branch (739:7): [True: 3, False: 17]
  ------------------
  740|      3|    aom_rb_read_literal(rb, 6);  // seconds_value f(6)
  741|      3|    aom_rb_read_literal(rb, 6);  // minutes_value f(6)
  742|      3|    aom_rb_read_literal(rb, 5);  // hours_value f(5)
  743|     17|  } else {
  744|     17|    const int seconds_flag = aom_rb_read_bit(rb);  // seconds_flag f(1)
  745|     17|    if (seconds_flag) {
  ------------------
  |  Branch (745:9): [True: 9, False: 8]
  ------------------
  746|      9|      aom_rb_read_literal(rb, 6);                    // seconds_value f(6)
  747|      9|      const int minutes_flag = aom_rb_read_bit(rb);  // minutes_flag f(1)
  748|      9|      if (minutes_flag) {
  ------------------
  |  Branch (748:11): [True: 6, False: 3]
  ------------------
  749|      6|        aom_rb_read_literal(rb, 6);                  // minutes_value f(6)
  750|      6|        const int hours_flag = aom_rb_read_bit(rb);  // hours_flag f(1)
  751|      6|        if (hours_flag) {
  ------------------
  |  Branch (751:13): [True: 3, False: 3]
  ------------------
  752|      3|          aom_rb_read_literal(rb, 5);  // hours_value f(5)
  753|      3|        }
  754|      6|      }
  755|      9|    }
  756|     17|  }
  757|       |  // time_offset_length f(5)
  758|     20|  const int time_offset_length = aom_rb_read_literal(rb, 5);
  759|     20|  if (time_offset_length) {
  ------------------
  |  Branch (759:7): [True: 17, False: 3]
  ------------------
  760|       |    // time_offset_value f(time_offset_length)
  761|     17|    aom_rb_read_literal(rb, time_offset_length);
  762|     17|  }
  763|     20|}
obu.c:read_padding:
  847|     24|                           size_t sz) {
  848|       |  // The spec allows a padding OBU to be header-only (i.e., obu_size = 0). So
  849|       |  // check trailing bits only if sz > 0.
  850|     24|  if (sz > 0) {
  ------------------
  |  Branch (850:7): [True: 21, False: 3]
  ------------------
  851|       |    // The payload of a padding OBU is byte aligned. Therefore the first
  852|       |    // trailing byte should be 0x80. See https://crbug.com/aomedia/2393.
  853|     21|    const uint8_t last_nonzero_byte = get_last_nonzero_byte(data, sz);
  854|     21|    if (last_nonzero_byte != 0x80) {
  ------------------
  |  Branch (854:9): [True: 15, False: 6]
  ------------------
  855|     15|      cm->error->error_code = AOM_CODEC_CORRUPT_FRAME;
  856|     15|      return 0;
  857|     15|    }
  858|     21|  }
  859|      9|  return sz;
  860|     24|}
obu.c:get_last_nonzero_byte:
  772|    960|static uint8_t get_last_nonzero_byte(const uint8_t *data, size_t sz) {
  773|       |  // Scan backward and return on the first nonzero byte.
  774|    960|  size_t i = sz;
  775|  38.5k|  while (i != 0) {
  ------------------
  |  Branch (775:10): [True: 38.5k, False: 7]
  ------------------
  776|  38.5k|    --i;
  777|  38.5k|    if (data[i] != 0) return data[i];
  ------------------
  |  Branch (777:9): [True: 953, False: 37.5k]
  ------------------
  778|  38.5k|  }
  779|      7|  return 0;
  780|    960|}

aom_dsp_rtcd.c:setup_rtcd_internal:
 5629|      1|{
 5630|      1|    int flags = x86_simd_caps();
 5631|       |
 5632|      1|    (void)flags;
 5633|       |
 5634|      1|    aom_avg_8x8_quad = aom_avg_8x8_quad_sse2;
 5635|      1|    if (flags & HAS_AVX2) aom_avg_8x8_quad = aom_avg_8x8_quad_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5635:9): [True: 1, False: 0]
  ------------------
 5636|      1|    aom_blend_a64_hmask = aom_blend_a64_hmask_c;
 5637|      1|    if (flags & HAS_SSE4_1) aom_blend_a64_hmask = aom_blend_a64_hmask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5637:9): [True: 1, False: 0]
  ------------------
 5638|      1|    aom_blend_a64_mask = aom_blend_a64_mask_c;
 5639|      1|    if (flags & HAS_SSE4_1) aom_blend_a64_mask = aom_blend_a64_mask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5639:9): [True: 1, False: 0]
  ------------------
 5640|      1|    if (flags & HAS_AVX2) aom_blend_a64_mask = aom_blend_a64_mask_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5640:9): [True: 1, False: 0]
  ------------------
 5641|      1|    aom_blend_a64_vmask = aom_blend_a64_vmask_c;
 5642|      1|    if (flags & HAS_SSE4_1) aom_blend_a64_vmask = aom_blend_a64_vmask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5642:9): [True: 1, False: 0]
  ------------------
 5643|      1|    aom_comp_avg_pred = aom_comp_avg_pred_c;
 5644|      1|    if (flags & HAS_AVX2) aom_comp_avg_pred = aom_comp_avg_pred_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5644:9): [True: 1, False: 0]
  ------------------
 5645|      1|    aom_comp_mask_pred = aom_comp_mask_pred_c;
 5646|      1|    if (flags & HAS_SSSE3) aom_comp_mask_pred = aom_comp_mask_pred_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5646:9): [True: 1, False: 0]
  ------------------
 5647|      1|    if (flags & HAS_AVX2) aom_comp_mask_pred = aom_comp_mask_pred_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5647:9): [True: 1, False: 0]
  ------------------
 5648|      1|    aom_compute_correlation = aom_compute_correlation_c;
 5649|      1|    if (flags & HAS_SSE4_1) aom_compute_correlation = aom_compute_correlation_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5649:9): [True: 1, False: 0]
  ------------------
 5650|      1|    if (flags & HAS_AVX2) aom_compute_correlation = aom_compute_correlation_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5650:9): [True: 1, False: 0]
  ------------------
 5651|      1|    aom_compute_flow_at_point = aom_compute_flow_at_point_c;
 5652|      1|    if (flags & HAS_SSE4_1) aom_compute_flow_at_point = aom_compute_flow_at_point_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5652:9): [True: 1, False: 0]
  ------------------
 5653|      1|    if (flags & HAS_AVX2) aom_compute_flow_at_point = aom_compute_flow_at_point_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5653:9): [True: 1, False: 0]
  ------------------
 5654|      1|    aom_compute_mean_stddev = aom_compute_mean_stddev_c;
 5655|      1|    if (flags & HAS_SSE4_1) aom_compute_mean_stddev = aom_compute_mean_stddev_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5655:9): [True: 1, False: 0]
  ------------------
 5656|      1|    if (flags & HAS_AVX2) aom_compute_mean_stddev = aom_compute_mean_stddev_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5656:9): [True: 1, False: 0]
  ------------------
 5657|      1|    aom_convolve8_horiz = aom_convolve8_horiz_c;
 5658|      1|    if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5658:9): [True: 1, False: 0]
  ------------------
 5659|      1|    if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5659:9): [True: 1, False: 0]
  ------------------
 5660|      1|    aom_convolve8_vert = aom_convolve8_vert_c;
 5661|      1|    if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5661:9): [True: 1, False: 0]
  ------------------
 5662|      1|    if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5662:9): [True: 1, False: 0]
  ------------------
 5663|      1|    aom_convolve_copy = aom_convolve_copy_sse2;
 5664|      1|    if (flags & HAS_AVX2) aom_convolve_copy = aom_convolve_copy_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5664:9): [True: 1, False: 0]
  ------------------
 5665|      1|    aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_sse2;
 5666|      1|    if (flags & HAS_AVX2) aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5666:9): [True: 1, False: 0]
  ------------------
 5667|      1|    aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_sse2;
 5668|      1|    if (flags & HAS_AVX2) aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5668:9): [True: 1, False: 0]
  ------------------
 5669|      1|    aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_sse2;
 5670|      1|    if (flags & HAS_AVX2) aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5670:9): [True: 1, False: 0]
  ------------------
 5671|      1|    aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_sse2;
 5672|      1|    if (flags & HAS_AVX2) aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5672:9): [True: 1, False: 0]
  ------------------
 5673|      1|    aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_sse2;
 5674|      1|    if (flags & HAS_AVX2) aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5674:9): [True: 1, False: 0]
  ------------------
 5675|      1|    aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_sse2;
 5676|      1|    if (flags & HAS_AVX2) aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5676:9): [True: 1, False: 0]
  ------------------
 5677|      1|    aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_sse2;
 5678|      1|    if (flags & HAS_AVX2) aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5678:9): [True: 1, False: 0]
  ------------------
 5679|      1|    aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_sse2;
 5680|      1|    if (flags & HAS_AVX2) aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5680:9): [True: 1, False: 0]
  ------------------
 5681|      1|    aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_sse2;
 5682|      1|    if (flags & HAS_AVX2) aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5682:9): [True: 1, False: 0]
  ------------------
 5683|      1|    aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_sse2;
 5684|      1|    if (flags & HAS_AVX2) aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5684:9): [True: 1, False: 0]
  ------------------
 5685|      1|    aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_sse2;
 5686|      1|    if (flags & HAS_AVX2) aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5686:9): [True: 1, False: 0]
  ------------------
 5687|      1|    aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_sse2;
 5688|      1|    if (flags & HAS_AVX2) aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5688:9): [True: 1, False: 0]
  ------------------
 5689|      1|    aom_dc_predictor_32x16 = aom_dc_predictor_32x16_sse2;
 5690|      1|    if (flags & HAS_AVX2) aom_dc_predictor_32x16 = aom_dc_predictor_32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5690:9): [True: 1, False: 0]
  ------------------
 5691|      1|    aom_dc_predictor_32x32 = aom_dc_predictor_32x32_sse2;
 5692|      1|    if (flags & HAS_AVX2) aom_dc_predictor_32x32 = aom_dc_predictor_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5692:9): [True: 1, False: 0]
  ------------------
 5693|      1|    aom_dc_predictor_32x64 = aom_dc_predictor_32x64_sse2;
 5694|      1|    if (flags & HAS_AVX2) aom_dc_predictor_32x64 = aom_dc_predictor_32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5694:9): [True: 1, False: 0]
  ------------------
 5695|      1|    aom_dc_predictor_64x16 = aom_dc_predictor_64x16_sse2;
 5696|      1|    if (flags & HAS_AVX2) aom_dc_predictor_64x16 = aom_dc_predictor_64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5696:9): [True: 1, False: 0]
  ------------------
 5697|      1|    aom_dc_predictor_64x32 = aom_dc_predictor_64x32_sse2;
 5698|      1|    if (flags & HAS_AVX2) aom_dc_predictor_64x32 = aom_dc_predictor_64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5698:9): [True: 1, False: 0]
  ------------------
 5699|      1|    aom_dc_predictor_64x64 = aom_dc_predictor_64x64_sse2;
 5700|      1|    if (flags & HAS_AVX2) aom_dc_predictor_64x64 = aom_dc_predictor_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5700:9): [True: 1, False: 0]
  ------------------
 5701|      1|    aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_sse2;
 5702|      1|    if (flags & HAS_AVX2) aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5702:9): [True: 1, False: 0]
  ------------------
 5703|      1|    aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_sse2;
 5704|      1|    if (flags & HAS_AVX2) aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5704:9): [True: 1, False: 0]
  ------------------
 5705|      1|    aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_sse2;
 5706|      1|    if (flags & HAS_AVX2) aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5706:9): [True: 1, False: 0]
  ------------------
 5707|      1|    aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_sse2;
 5708|      1|    if (flags & HAS_AVX2) aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5708:9): [True: 1, False: 0]
  ------------------
 5709|      1|    aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_sse2;
 5710|      1|    if (flags & HAS_AVX2) aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5710:9): [True: 1, False: 0]
  ------------------
 5711|      1|    aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_sse2;
 5712|      1|    if (flags & HAS_AVX2) aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5712:9): [True: 1, False: 0]
  ------------------
 5713|      1|    aom_fft16x16_float = aom_fft16x16_float_sse2;
 5714|      1|    if (flags & HAS_AVX2) aom_fft16x16_float = aom_fft16x16_float_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5714:9): [True: 1, False: 0]
  ------------------
 5715|      1|    aom_fft32x32_float = aom_fft32x32_float_sse2;
 5716|      1|    if (flags & HAS_AVX2) aom_fft32x32_float = aom_fft32x32_float_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5716:9): [True: 1, False: 0]
  ------------------
 5717|      1|    aom_fft8x8_float = aom_fft8x8_float_sse2;
 5718|      1|    if (flags & HAS_AVX2) aom_fft8x8_float = aom_fft8x8_float_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5718:9): [True: 1, False: 0]
  ------------------
 5719|      1|    aom_get_blk_sse_sum = aom_get_blk_sse_sum_sse2;
 5720|      1|    if (flags & HAS_AVX2) aom_get_blk_sse_sum = aom_get_blk_sse_sum_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5720:9): [True: 1, False: 0]
  ------------------
 5721|      1|    aom_get_var_sse_sum_16x16_dual = aom_get_var_sse_sum_16x16_dual_sse2;
 5722|      1|    if (flags & HAS_AVX2) aom_get_var_sse_sum_16x16_dual = aom_get_var_sse_sum_16x16_dual_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5722:9): [True: 1, False: 0]
  ------------------
 5723|      1|    aom_get_var_sse_sum_8x8_quad = aom_get_var_sse_sum_8x8_quad_sse2;
 5724|      1|    if (flags & HAS_AVX2) aom_get_var_sse_sum_8x8_quad = aom_get_var_sse_sum_8x8_quad_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5724:9): [True: 1, False: 0]
  ------------------
 5725|      1|    aom_h_predictor_32x32 = aom_h_predictor_32x32_sse2;
 5726|      1|    if (flags & HAS_AVX2) aom_h_predictor_32x32 = aom_h_predictor_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5726:9): [True: 1, False: 0]
  ------------------
 5727|      1|    aom_hadamard_16x16 = aom_hadamard_16x16_sse2;
 5728|      1|    if (flags & HAS_AVX2) aom_hadamard_16x16 = aom_hadamard_16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5728:9): [True: 1, False: 0]
  ------------------
 5729|      1|    aom_hadamard_32x32 = aom_hadamard_32x32_sse2;
 5730|      1|    if (flags & HAS_AVX2) aom_hadamard_32x32 = aom_hadamard_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5730:9): [True: 1, False: 0]
  ------------------
 5731|      1|    aom_hadamard_lp_16x16 = aom_hadamard_lp_16x16_sse2;
 5732|      1|    if (flags & HAS_AVX2) aom_hadamard_lp_16x16 = aom_hadamard_lp_16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5732:9): [True: 1, False: 0]
  ------------------
 5733|      1|    aom_hadamard_lp_8x8_dual = aom_hadamard_lp_8x8_dual_sse2;
 5734|      1|    if (flags & HAS_AVX2) aom_hadamard_lp_8x8_dual = aom_hadamard_lp_8x8_dual_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5734:9): [True: 1, False: 0]
  ------------------
 5735|      1|    aom_highbd_10_masked_sub_pixel_variance128x128 = aom_highbd_10_masked_sub_pixel_variance128x128_c;
 5736|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance128x128 = aom_highbd_10_masked_sub_pixel_variance128x128_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5736:9): [True: 1, False: 0]
  ------------------
 5737|      1|    aom_highbd_10_masked_sub_pixel_variance128x64 = aom_highbd_10_masked_sub_pixel_variance128x64_c;
 5738|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance128x64 = aom_highbd_10_masked_sub_pixel_variance128x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5738:9): [True: 1, False: 0]
  ------------------
 5739|      1|    aom_highbd_10_masked_sub_pixel_variance16x16 = aom_highbd_10_masked_sub_pixel_variance16x16_c;
 5740|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance16x16 = aom_highbd_10_masked_sub_pixel_variance16x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5740:9): [True: 1, False: 0]
  ------------------
 5741|      1|    aom_highbd_10_masked_sub_pixel_variance16x32 = aom_highbd_10_masked_sub_pixel_variance16x32_c;
 5742|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance16x32 = aom_highbd_10_masked_sub_pixel_variance16x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5742:9): [True: 1, False: 0]
  ------------------
 5743|      1|    aom_highbd_10_masked_sub_pixel_variance16x4 = aom_highbd_10_masked_sub_pixel_variance16x4_c;
 5744|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance16x4 = aom_highbd_10_masked_sub_pixel_variance16x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5744:9): [True: 1, False: 0]
  ------------------
 5745|      1|    aom_highbd_10_masked_sub_pixel_variance16x64 = aom_highbd_10_masked_sub_pixel_variance16x64_c;
 5746|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance16x64 = aom_highbd_10_masked_sub_pixel_variance16x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5746:9): [True: 1, False: 0]
  ------------------
 5747|      1|    aom_highbd_10_masked_sub_pixel_variance16x8 = aom_highbd_10_masked_sub_pixel_variance16x8_c;
 5748|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance16x8 = aom_highbd_10_masked_sub_pixel_variance16x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5748:9): [True: 1, False: 0]
  ------------------
 5749|      1|    aom_highbd_10_masked_sub_pixel_variance32x16 = aom_highbd_10_masked_sub_pixel_variance32x16_c;
 5750|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance32x16 = aom_highbd_10_masked_sub_pixel_variance32x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5750:9): [True: 1, False: 0]
  ------------------
 5751|      1|    aom_highbd_10_masked_sub_pixel_variance32x32 = aom_highbd_10_masked_sub_pixel_variance32x32_c;
 5752|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance32x32 = aom_highbd_10_masked_sub_pixel_variance32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5752:9): [True: 1, False: 0]
  ------------------
 5753|      1|    aom_highbd_10_masked_sub_pixel_variance32x64 = aom_highbd_10_masked_sub_pixel_variance32x64_c;
 5754|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance32x64 = aom_highbd_10_masked_sub_pixel_variance32x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5754:9): [True: 1, False: 0]
  ------------------
 5755|      1|    aom_highbd_10_masked_sub_pixel_variance32x8 = aom_highbd_10_masked_sub_pixel_variance32x8_c;
 5756|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance32x8 = aom_highbd_10_masked_sub_pixel_variance32x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5756:9): [True: 1, False: 0]
  ------------------
 5757|      1|    aom_highbd_10_masked_sub_pixel_variance4x16 = aom_highbd_10_masked_sub_pixel_variance4x16_c;
 5758|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance4x16 = aom_highbd_10_masked_sub_pixel_variance4x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5758:9): [True: 1, False: 0]
  ------------------
 5759|      1|    aom_highbd_10_masked_sub_pixel_variance4x4 = aom_highbd_10_masked_sub_pixel_variance4x4_c;
 5760|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance4x4 = aom_highbd_10_masked_sub_pixel_variance4x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5760:9): [True: 1, False: 0]
  ------------------
 5761|      1|    aom_highbd_10_masked_sub_pixel_variance4x8 = aom_highbd_10_masked_sub_pixel_variance4x8_c;
 5762|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance4x8 = aom_highbd_10_masked_sub_pixel_variance4x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5762:9): [True: 1, False: 0]
  ------------------
 5763|      1|    aom_highbd_10_masked_sub_pixel_variance64x128 = aom_highbd_10_masked_sub_pixel_variance64x128_c;
 5764|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance64x128 = aom_highbd_10_masked_sub_pixel_variance64x128_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5764:9): [True: 1, False: 0]
  ------------------
 5765|      1|    aom_highbd_10_masked_sub_pixel_variance64x16 = aom_highbd_10_masked_sub_pixel_variance64x16_c;
 5766|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance64x16 = aom_highbd_10_masked_sub_pixel_variance64x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5766:9): [True: 1, False: 0]
  ------------------
 5767|      1|    aom_highbd_10_masked_sub_pixel_variance64x32 = aom_highbd_10_masked_sub_pixel_variance64x32_c;
 5768|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance64x32 = aom_highbd_10_masked_sub_pixel_variance64x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5768:9): [True: 1, False: 0]
  ------------------
 5769|      1|    aom_highbd_10_masked_sub_pixel_variance64x64 = aom_highbd_10_masked_sub_pixel_variance64x64_c;
 5770|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance64x64 = aom_highbd_10_masked_sub_pixel_variance64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5770:9): [True: 1, False: 0]
  ------------------
 5771|      1|    aom_highbd_10_masked_sub_pixel_variance8x16 = aom_highbd_10_masked_sub_pixel_variance8x16_c;
 5772|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x16 = aom_highbd_10_masked_sub_pixel_variance8x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5772:9): [True: 1, False: 0]
  ------------------
 5773|      1|    aom_highbd_10_masked_sub_pixel_variance8x32 = aom_highbd_10_masked_sub_pixel_variance8x32_c;
 5774|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x32 = aom_highbd_10_masked_sub_pixel_variance8x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5774:9): [True: 1, False: 0]
  ------------------
 5775|      1|    aom_highbd_10_masked_sub_pixel_variance8x4 = aom_highbd_10_masked_sub_pixel_variance8x4_c;
 5776|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x4 = aom_highbd_10_masked_sub_pixel_variance8x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5776:9): [True: 1, False: 0]
  ------------------
 5777|      1|    aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_c;
 5778|      1|    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5778:9): [True: 1, False: 0]
  ------------------
 5779|      1|    aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_sse2;
 5780|      1|    if (flags & HAS_AVX2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5780:9): [True: 1, False: 0]
  ------------------
 5781|      1|    aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_c;
 5782|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5782:9): [True: 1, False: 0]
  ------------------
 5783|      1|    aom_highbd_10_obmc_variance128x64 = aom_highbd_10_obmc_variance128x64_c;
 5784|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance128x64 = aom_highbd_10_obmc_variance128x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5784:9): [True: 1, False: 0]
  ------------------
 5785|      1|    aom_highbd_10_obmc_variance16x16 = aom_highbd_10_obmc_variance16x16_c;
 5786|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance16x16 = aom_highbd_10_obmc_variance16x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5786:9): [True: 1, False: 0]
  ------------------
 5787|      1|    aom_highbd_10_obmc_variance16x32 = aom_highbd_10_obmc_variance16x32_c;
 5788|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance16x32 = aom_highbd_10_obmc_variance16x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5788:9): [True: 1, False: 0]
  ------------------
 5789|      1|    aom_highbd_10_obmc_variance16x4 = aom_highbd_10_obmc_variance16x4_c;
 5790|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance16x4 = aom_highbd_10_obmc_variance16x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5790:9): [True: 1, False: 0]
  ------------------
 5791|      1|    aom_highbd_10_obmc_variance16x64 = aom_highbd_10_obmc_variance16x64_c;
 5792|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance16x64 = aom_highbd_10_obmc_variance16x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5792:9): [True: 1, False: 0]
  ------------------
 5793|      1|    aom_highbd_10_obmc_variance16x8 = aom_highbd_10_obmc_variance16x8_c;
 5794|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance16x8 = aom_highbd_10_obmc_variance16x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5794:9): [True: 1, False: 0]
  ------------------
 5795|      1|    aom_highbd_10_obmc_variance32x16 = aom_highbd_10_obmc_variance32x16_c;
 5796|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance32x16 = aom_highbd_10_obmc_variance32x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5796:9): [True: 1, False: 0]
  ------------------
 5797|      1|    aom_highbd_10_obmc_variance32x32 = aom_highbd_10_obmc_variance32x32_c;
 5798|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance32x32 = aom_highbd_10_obmc_variance32x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5798:9): [True: 1, False: 0]
  ------------------
 5799|      1|    aom_highbd_10_obmc_variance32x64 = aom_highbd_10_obmc_variance32x64_c;
 5800|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance32x64 = aom_highbd_10_obmc_variance32x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5800:9): [True: 1, False: 0]
  ------------------
 5801|      1|    aom_highbd_10_obmc_variance32x8 = aom_highbd_10_obmc_variance32x8_c;
 5802|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance32x8 = aom_highbd_10_obmc_variance32x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5802:9): [True: 1, False: 0]
  ------------------
 5803|      1|    aom_highbd_10_obmc_variance4x16 = aom_highbd_10_obmc_variance4x16_c;
 5804|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance4x16 = aom_highbd_10_obmc_variance4x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5804:9): [True: 1, False: 0]
  ------------------
 5805|      1|    aom_highbd_10_obmc_variance4x4 = aom_highbd_10_obmc_variance4x4_c;
 5806|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance4x4 = aom_highbd_10_obmc_variance4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5806:9): [True: 1, False: 0]
  ------------------
 5807|      1|    aom_highbd_10_obmc_variance4x8 = aom_highbd_10_obmc_variance4x8_c;
 5808|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance4x8 = aom_highbd_10_obmc_variance4x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5808:9): [True: 1, False: 0]
  ------------------
 5809|      1|    aom_highbd_10_obmc_variance64x128 = aom_highbd_10_obmc_variance64x128_c;
 5810|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance64x128 = aom_highbd_10_obmc_variance64x128_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5810:9): [True: 1, False: 0]
  ------------------
 5811|      1|    aom_highbd_10_obmc_variance64x16 = aom_highbd_10_obmc_variance64x16_c;
 5812|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance64x16 = aom_highbd_10_obmc_variance64x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5812:9): [True: 1, False: 0]
  ------------------
 5813|      1|    aom_highbd_10_obmc_variance64x32 = aom_highbd_10_obmc_variance64x32_c;
 5814|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance64x32 = aom_highbd_10_obmc_variance64x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5814:9): [True: 1, False: 0]
  ------------------
 5815|      1|    aom_highbd_10_obmc_variance64x64 = aom_highbd_10_obmc_variance64x64_c;
 5816|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance64x64 = aom_highbd_10_obmc_variance64x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5816:9): [True: 1, False: 0]
  ------------------
 5817|      1|    aom_highbd_10_obmc_variance8x16 = aom_highbd_10_obmc_variance8x16_c;
 5818|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance8x16 = aom_highbd_10_obmc_variance8x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5818:9): [True: 1, False: 0]
  ------------------
 5819|      1|    aom_highbd_10_obmc_variance8x32 = aom_highbd_10_obmc_variance8x32_c;
 5820|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance8x32 = aom_highbd_10_obmc_variance8x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5820:9): [True: 1, False: 0]
  ------------------
 5821|      1|    aom_highbd_10_obmc_variance8x4 = aom_highbd_10_obmc_variance8x4_c;
 5822|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance8x4 = aom_highbd_10_obmc_variance8x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5822:9): [True: 1, False: 0]
  ------------------
 5823|      1|    aom_highbd_10_obmc_variance8x8 = aom_highbd_10_obmc_variance8x8_c;
 5824|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance8x8 = aom_highbd_10_obmc_variance8x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5824:9): [True: 1, False: 0]
  ------------------
 5825|      1|    aom_highbd_10_sub_pixel_avg_variance4x4 = aom_highbd_10_sub_pixel_avg_variance4x4_c;
 5826|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_sub_pixel_avg_variance4x4 = aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5826:9): [True: 1, False: 0]
  ------------------
 5827|      1|    aom_highbd_10_sub_pixel_variance128x128 = aom_highbd_10_sub_pixel_variance128x128_sse2;
 5828|      1|    if (flags & HAS_AVX2) aom_highbd_10_sub_pixel_variance128x128 = aom_highbd_10_sub_pixel_variance128x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5828:9): [True: 1, False: 0]
  ------------------
 5829|      1|    aom_highbd_10_sub_pixel_variance128x64 = aom_highbd_10_sub_pixel_variance128x64_sse2;
 5830|      1|    if (flags & HAS_AVX2) aom_highbd_10_sub_pixel_variance128x64 = aom_highbd_10_sub_pixel_variance128x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5830:9): [True: 1, False: 0]
  ------------------
 5831|      1|    aom_highbd_10_sub_pixel_variance16x16 = aom_highbd_10_sub_pixel_variance16x16_sse2;
 5832|      1|    if (flags & HAS_AVX2) aom_highbd_10_sub_pixel_variance16x16 = aom_highbd_10_sub_pixel_variance16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5832:9): [True: 1, False: 0]
  ------------------
 5833|      1|    aom_highbd_10_sub_pixel_variance16x32 = aom_highbd_10_sub_pixel_variance16x32_sse2;
 5834|      1|    if (flags & HAS_AVX2) aom_highbd_10_sub_pixel_variance16x32 = aom_highbd_10_sub_pixel_variance16x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5834:9): [True: 1, False: 0]
  ------------------
 5835|      1|    aom_highbd_10_sub_pixel_variance16x8 = aom_highbd_10_sub_pixel_variance16x8_sse2;
 5836|      1|    if (flags & HAS_AVX2) aom_highbd_10_sub_pixel_variance16x8 = aom_highbd_10_sub_pixel_variance16x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5836:9): [True: 1, False: 0]
  ------------------
 5837|      1|    aom_highbd_10_sub_pixel_variance32x16 = aom_highbd_10_sub_pixel_variance32x16_sse2;
 5838|      1|    if (flags & HAS_AVX2) aom_highbd_10_sub_pixel_variance32x16 = aom_highbd_10_sub_pixel_variance32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5838:9): [True: 1, False: 0]
  ------------------
 5839|      1|    aom_highbd_10_sub_pixel_variance32x32 = aom_highbd_10_sub_pixel_variance32x32_sse2;
 5840|      1|    if (flags & HAS_AVX2) aom_highbd_10_sub_pixel_variance32x32 = aom_highbd_10_sub_pixel_variance32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5840:9): [True: 1, False: 0]
  ------------------
 5841|      1|    aom_highbd_10_sub_pixel_variance32x64 = aom_highbd_10_sub_pixel_variance32x64_sse2;
 5842|      1|    if (flags & HAS_AVX2) aom_highbd_10_sub_pixel_variance32x64 = aom_highbd_10_sub_pixel_variance32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5842:9): [True: 1, False: 0]
  ------------------
 5843|      1|    aom_highbd_10_sub_pixel_variance4x4 = aom_highbd_10_sub_pixel_variance4x4_c;
 5844|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_sub_pixel_variance4x4 = aom_highbd_10_sub_pixel_variance4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5844:9): [True: 1, False: 0]
  ------------------
 5845|      1|    aom_highbd_10_sub_pixel_variance64x128 = aom_highbd_10_sub_pixel_variance64x128_sse2;
 5846|      1|    if (flags & HAS_AVX2) aom_highbd_10_sub_pixel_variance64x128 = aom_highbd_10_sub_pixel_variance64x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5846:9): [True: 1, False: 0]
  ------------------
 5847|      1|    aom_highbd_10_sub_pixel_variance64x32 = aom_highbd_10_sub_pixel_variance64x32_sse2;
 5848|      1|    if (flags & HAS_AVX2) aom_highbd_10_sub_pixel_variance64x32 = aom_highbd_10_sub_pixel_variance64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5848:9): [True: 1, False: 0]
  ------------------
 5849|      1|    aom_highbd_10_sub_pixel_variance64x64 = aom_highbd_10_sub_pixel_variance64x64_sse2;
 5850|      1|    if (flags & HAS_AVX2) aom_highbd_10_sub_pixel_variance64x64 = aom_highbd_10_sub_pixel_variance64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5850:9): [True: 1, False: 0]
  ------------------
 5851|      1|    aom_highbd_10_sub_pixel_variance8x16 = aom_highbd_10_sub_pixel_variance8x16_sse2;
 5852|      1|    if (flags & HAS_AVX2) aom_highbd_10_sub_pixel_variance8x16 = aom_highbd_10_sub_pixel_variance8x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5852:9): [True: 1, False: 0]
  ------------------
 5853|      1|    aom_highbd_10_sub_pixel_variance8x8 = aom_highbd_10_sub_pixel_variance8x8_sse2;
 5854|      1|    if (flags & HAS_AVX2) aom_highbd_10_sub_pixel_variance8x8 = aom_highbd_10_sub_pixel_variance8x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5854:9): [True: 1, False: 0]
  ------------------
 5855|      1|    aom_highbd_10_variance128x128 = aom_highbd_10_variance128x128_sse2;
 5856|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance128x128 = aom_highbd_10_variance128x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5856:9): [True: 1, False: 0]
  ------------------
 5857|      1|    aom_highbd_10_variance128x64 = aom_highbd_10_variance128x64_sse2;
 5858|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance128x64 = aom_highbd_10_variance128x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5858:9): [True: 1, False: 0]
  ------------------
 5859|      1|    aom_highbd_10_variance16x16 = aom_highbd_10_variance16x16_sse2;
 5860|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance16x16 = aom_highbd_10_variance16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5860:9): [True: 1, False: 0]
  ------------------
 5861|      1|    aom_highbd_10_variance16x32 = aom_highbd_10_variance16x32_sse2;
 5862|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance16x32 = aom_highbd_10_variance16x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5862:9): [True: 1, False: 0]
  ------------------
 5863|      1|    aom_highbd_10_variance16x64 = aom_highbd_10_variance16x64_sse2;
 5864|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance16x64 = aom_highbd_10_variance16x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5864:9): [True: 1, False: 0]
  ------------------
 5865|      1|    aom_highbd_10_variance16x8 = aom_highbd_10_variance16x8_sse2;
 5866|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance16x8 = aom_highbd_10_variance16x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5866:9): [True: 1, False: 0]
  ------------------
 5867|      1|    aom_highbd_10_variance32x16 = aom_highbd_10_variance32x16_sse2;
 5868|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance32x16 = aom_highbd_10_variance32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5868:9): [True: 1, False: 0]
  ------------------
 5869|      1|    aom_highbd_10_variance32x32 = aom_highbd_10_variance32x32_sse2;
 5870|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance32x32 = aom_highbd_10_variance32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5870:9): [True: 1, False: 0]
  ------------------
 5871|      1|    aom_highbd_10_variance32x64 = aom_highbd_10_variance32x64_sse2;
 5872|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance32x64 = aom_highbd_10_variance32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5872:9): [True: 1, False: 0]
  ------------------
 5873|      1|    aom_highbd_10_variance32x8 = aom_highbd_10_variance32x8_sse2;
 5874|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance32x8 = aom_highbd_10_variance32x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5874:9): [True: 1, False: 0]
  ------------------
 5875|      1|    aom_highbd_10_variance4x4 = aom_highbd_10_variance4x4_c;
 5876|      1|    if (flags & HAS_SSE4_1) aom_highbd_10_variance4x4 = aom_highbd_10_variance4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5876:9): [True: 1, False: 0]
  ------------------
 5877|      1|    aom_highbd_10_variance64x128 = aom_highbd_10_variance64x128_sse2;
 5878|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance64x128 = aom_highbd_10_variance64x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5878:9): [True: 1, False: 0]
  ------------------
 5879|      1|    aom_highbd_10_variance64x16 = aom_highbd_10_variance64x16_sse2;
 5880|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance64x16 = aom_highbd_10_variance64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5880:9): [True: 1, False: 0]
  ------------------
 5881|      1|    aom_highbd_10_variance64x32 = aom_highbd_10_variance64x32_sse2;
 5882|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance64x32 = aom_highbd_10_variance64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5882:9): [True: 1, False: 0]
  ------------------
 5883|      1|    aom_highbd_10_variance64x64 = aom_highbd_10_variance64x64_sse2;
 5884|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance64x64 = aom_highbd_10_variance64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5884:9): [True: 1, False: 0]
  ------------------
 5885|      1|    aom_highbd_10_variance8x16 = aom_highbd_10_variance8x16_sse2;
 5886|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance8x16 = aom_highbd_10_variance8x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5886:9): [True: 1, False: 0]
  ------------------
 5887|      1|    aom_highbd_10_variance8x32 = aom_highbd_10_variance8x32_sse2;
 5888|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance8x32 = aom_highbd_10_variance8x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5888:9): [True: 1, False: 0]
  ------------------
 5889|      1|    aom_highbd_10_variance8x8 = aom_highbd_10_variance8x8_sse2;
 5890|      1|    if (flags & HAS_AVX2) aom_highbd_10_variance8x8 = aom_highbd_10_variance8x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (5890:9): [True: 1, False: 0]
  ------------------
 5891|      1|    aom_highbd_12_masked_sub_pixel_variance128x128 = aom_highbd_12_masked_sub_pixel_variance128x128_c;
 5892|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance128x128 = aom_highbd_12_masked_sub_pixel_variance128x128_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5892:9): [True: 1, False: 0]
  ------------------
 5893|      1|    aom_highbd_12_masked_sub_pixel_variance128x64 = aom_highbd_12_masked_sub_pixel_variance128x64_c;
 5894|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance128x64 = aom_highbd_12_masked_sub_pixel_variance128x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5894:9): [True: 1, False: 0]
  ------------------
 5895|      1|    aom_highbd_12_masked_sub_pixel_variance16x16 = aom_highbd_12_masked_sub_pixel_variance16x16_c;
 5896|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance16x16 = aom_highbd_12_masked_sub_pixel_variance16x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5896:9): [True: 1, False: 0]
  ------------------
 5897|      1|    aom_highbd_12_masked_sub_pixel_variance16x32 = aom_highbd_12_masked_sub_pixel_variance16x32_c;
 5898|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance16x32 = aom_highbd_12_masked_sub_pixel_variance16x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5898:9): [True: 1, False: 0]
  ------------------
 5899|      1|    aom_highbd_12_masked_sub_pixel_variance16x4 = aom_highbd_12_masked_sub_pixel_variance16x4_c;
 5900|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance16x4 = aom_highbd_12_masked_sub_pixel_variance16x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5900:9): [True: 1, False: 0]
  ------------------
 5901|      1|    aom_highbd_12_masked_sub_pixel_variance16x64 = aom_highbd_12_masked_sub_pixel_variance16x64_c;
 5902|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance16x64 = aom_highbd_12_masked_sub_pixel_variance16x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5902:9): [True: 1, False: 0]
  ------------------
 5903|      1|    aom_highbd_12_masked_sub_pixel_variance16x8 = aom_highbd_12_masked_sub_pixel_variance16x8_c;
 5904|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance16x8 = aom_highbd_12_masked_sub_pixel_variance16x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5904:9): [True: 1, False: 0]
  ------------------
 5905|      1|    aom_highbd_12_masked_sub_pixel_variance32x16 = aom_highbd_12_masked_sub_pixel_variance32x16_c;
 5906|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance32x16 = aom_highbd_12_masked_sub_pixel_variance32x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5906:9): [True: 1, False: 0]
  ------------------
 5907|      1|    aom_highbd_12_masked_sub_pixel_variance32x32 = aom_highbd_12_masked_sub_pixel_variance32x32_c;
 5908|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance32x32 = aom_highbd_12_masked_sub_pixel_variance32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5908:9): [True: 1, False: 0]
  ------------------
 5909|      1|    aom_highbd_12_masked_sub_pixel_variance32x64 = aom_highbd_12_masked_sub_pixel_variance32x64_c;
 5910|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance32x64 = aom_highbd_12_masked_sub_pixel_variance32x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5910:9): [True: 1, False: 0]
  ------------------
 5911|      1|    aom_highbd_12_masked_sub_pixel_variance32x8 = aom_highbd_12_masked_sub_pixel_variance32x8_c;
 5912|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance32x8 = aom_highbd_12_masked_sub_pixel_variance32x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5912:9): [True: 1, False: 0]
  ------------------
 5913|      1|    aom_highbd_12_masked_sub_pixel_variance4x16 = aom_highbd_12_masked_sub_pixel_variance4x16_c;
 5914|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance4x16 = aom_highbd_12_masked_sub_pixel_variance4x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5914:9): [True: 1, False: 0]
  ------------------
 5915|      1|    aom_highbd_12_masked_sub_pixel_variance4x4 = aom_highbd_12_masked_sub_pixel_variance4x4_c;
 5916|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance4x4 = aom_highbd_12_masked_sub_pixel_variance4x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5916:9): [True: 1, False: 0]
  ------------------
 5917|      1|    aom_highbd_12_masked_sub_pixel_variance4x8 = aom_highbd_12_masked_sub_pixel_variance4x8_c;
 5918|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance4x8 = aom_highbd_12_masked_sub_pixel_variance4x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5918:9): [True: 1, False: 0]
  ------------------
 5919|      1|    aom_highbd_12_masked_sub_pixel_variance64x128 = aom_highbd_12_masked_sub_pixel_variance64x128_c;
 5920|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance64x128 = aom_highbd_12_masked_sub_pixel_variance64x128_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5920:9): [True: 1, False: 0]
  ------------------
 5921|      1|    aom_highbd_12_masked_sub_pixel_variance64x16 = aom_highbd_12_masked_sub_pixel_variance64x16_c;
 5922|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance64x16 = aom_highbd_12_masked_sub_pixel_variance64x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5922:9): [True: 1, False: 0]
  ------------------
 5923|      1|    aom_highbd_12_masked_sub_pixel_variance64x32 = aom_highbd_12_masked_sub_pixel_variance64x32_c;
 5924|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance64x32 = aom_highbd_12_masked_sub_pixel_variance64x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5924:9): [True: 1, False: 0]
  ------------------
 5925|      1|    aom_highbd_12_masked_sub_pixel_variance64x64 = aom_highbd_12_masked_sub_pixel_variance64x64_c;
 5926|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance64x64 = aom_highbd_12_masked_sub_pixel_variance64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5926:9): [True: 1, False: 0]
  ------------------
 5927|      1|    aom_highbd_12_masked_sub_pixel_variance8x16 = aom_highbd_12_masked_sub_pixel_variance8x16_c;
 5928|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance8x16 = aom_highbd_12_masked_sub_pixel_variance8x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5928:9): [True: 1, False: 0]
  ------------------
 5929|      1|    aom_highbd_12_masked_sub_pixel_variance8x32 = aom_highbd_12_masked_sub_pixel_variance8x32_c;
 5930|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance8x32 = aom_highbd_12_masked_sub_pixel_variance8x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5930:9): [True: 1, False: 0]
  ------------------
 5931|      1|    aom_highbd_12_masked_sub_pixel_variance8x4 = aom_highbd_12_masked_sub_pixel_variance8x4_c;
 5932|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance8x4 = aom_highbd_12_masked_sub_pixel_variance8x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5932:9): [True: 1, False: 0]
  ------------------
 5933|      1|    aom_highbd_12_masked_sub_pixel_variance8x8 = aom_highbd_12_masked_sub_pixel_variance8x8_c;
 5934|      1|    if (flags & HAS_SSSE3) aom_highbd_12_masked_sub_pixel_variance8x8 = aom_highbd_12_masked_sub_pixel_variance8x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5934:9): [True: 1, False: 0]
  ------------------
 5935|      1|    aom_highbd_12_obmc_variance128x128 = aom_highbd_12_obmc_variance128x128_c;
 5936|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance128x128 = aom_highbd_12_obmc_variance128x128_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5936:9): [True: 1, False: 0]
  ------------------
 5937|      1|    aom_highbd_12_obmc_variance128x64 = aom_highbd_12_obmc_variance128x64_c;
 5938|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance128x64 = aom_highbd_12_obmc_variance128x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5938:9): [True: 1, False: 0]
  ------------------
 5939|      1|    aom_highbd_12_obmc_variance16x16 = aom_highbd_12_obmc_variance16x16_c;
 5940|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance16x16 = aom_highbd_12_obmc_variance16x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5940:9): [True: 1, False: 0]
  ------------------
 5941|      1|    aom_highbd_12_obmc_variance16x32 = aom_highbd_12_obmc_variance16x32_c;
 5942|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance16x32 = aom_highbd_12_obmc_variance16x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5942:9): [True: 1, False: 0]
  ------------------
 5943|      1|    aom_highbd_12_obmc_variance16x4 = aom_highbd_12_obmc_variance16x4_c;
 5944|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance16x4 = aom_highbd_12_obmc_variance16x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5944:9): [True: 1, False: 0]
  ------------------
 5945|      1|    aom_highbd_12_obmc_variance16x64 = aom_highbd_12_obmc_variance16x64_c;
 5946|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance16x64 = aom_highbd_12_obmc_variance16x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5946:9): [True: 1, False: 0]
  ------------------
 5947|      1|    aom_highbd_12_obmc_variance16x8 = aom_highbd_12_obmc_variance16x8_c;
 5948|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance16x8 = aom_highbd_12_obmc_variance16x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5948:9): [True: 1, False: 0]
  ------------------
 5949|      1|    aom_highbd_12_obmc_variance32x16 = aom_highbd_12_obmc_variance32x16_c;
 5950|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance32x16 = aom_highbd_12_obmc_variance32x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5950:9): [True: 1, False: 0]
  ------------------
 5951|      1|    aom_highbd_12_obmc_variance32x32 = aom_highbd_12_obmc_variance32x32_c;
 5952|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance32x32 = aom_highbd_12_obmc_variance32x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5952:9): [True: 1, False: 0]
  ------------------
 5953|      1|    aom_highbd_12_obmc_variance32x64 = aom_highbd_12_obmc_variance32x64_c;
 5954|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance32x64 = aom_highbd_12_obmc_variance32x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5954:9): [True: 1, False: 0]
  ------------------
 5955|      1|    aom_highbd_12_obmc_variance32x8 = aom_highbd_12_obmc_variance32x8_c;
 5956|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance32x8 = aom_highbd_12_obmc_variance32x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5956:9): [True: 1, False: 0]
  ------------------
 5957|      1|    aom_highbd_12_obmc_variance4x16 = aom_highbd_12_obmc_variance4x16_c;
 5958|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance4x16 = aom_highbd_12_obmc_variance4x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5958:9): [True: 1, False: 0]
  ------------------
 5959|      1|    aom_highbd_12_obmc_variance4x4 = aom_highbd_12_obmc_variance4x4_c;
 5960|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance4x4 = aom_highbd_12_obmc_variance4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5960:9): [True: 1, False: 0]
  ------------------
 5961|      1|    aom_highbd_12_obmc_variance4x8 = aom_highbd_12_obmc_variance4x8_c;
 5962|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance4x8 = aom_highbd_12_obmc_variance4x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5962:9): [True: 1, False: 0]
  ------------------
 5963|      1|    aom_highbd_12_obmc_variance64x128 = aom_highbd_12_obmc_variance64x128_c;
 5964|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance64x128 = aom_highbd_12_obmc_variance64x128_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5964:9): [True: 1, False: 0]
  ------------------
 5965|      1|    aom_highbd_12_obmc_variance64x16 = aom_highbd_12_obmc_variance64x16_c;
 5966|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance64x16 = aom_highbd_12_obmc_variance64x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5966:9): [True: 1, False: 0]
  ------------------
 5967|      1|    aom_highbd_12_obmc_variance64x32 = aom_highbd_12_obmc_variance64x32_c;
 5968|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance64x32 = aom_highbd_12_obmc_variance64x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5968:9): [True: 1, False: 0]
  ------------------
 5969|      1|    aom_highbd_12_obmc_variance64x64 = aom_highbd_12_obmc_variance64x64_c;
 5970|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance64x64 = aom_highbd_12_obmc_variance64x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5970:9): [True: 1, False: 0]
  ------------------
 5971|      1|    aom_highbd_12_obmc_variance8x16 = aom_highbd_12_obmc_variance8x16_c;
 5972|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance8x16 = aom_highbd_12_obmc_variance8x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5972:9): [True: 1, False: 0]
  ------------------
 5973|      1|    aom_highbd_12_obmc_variance8x32 = aom_highbd_12_obmc_variance8x32_c;
 5974|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance8x32 = aom_highbd_12_obmc_variance8x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5974:9): [True: 1, False: 0]
  ------------------
 5975|      1|    aom_highbd_12_obmc_variance8x4 = aom_highbd_12_obmc_variance8x4_c;
 5976|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance8x4 = aom_highbd_12_obmc_variance8x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5976:9): [True: 1, False: 0]
  ------------------
 5977|      1|    aom_highbd_12_obmc_variance8x8 = aom_highbd_12_obmc_variance8x8_c;
 5978|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_obmc_variance8x8 = aom_highbd_12_obmc_variance8x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5978:9): [True: 1, False: 0]
  ------------------
 5979|      1|    aom_highbd_12_sub_pixel_avg_variance4x4 = aom_highbd_12_sub_pixel_avg_variance4x4_c;
 5980|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_sub_pixel_avg_variance4x4 = aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5980:9): [True: 1, False: 0]
  ------------------
 5981|      1|    aom_highbd_12_sub_pixel_variance4x4 = aom_highbd_12_sub_pixel_variance4x4_c;
 5982|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_sub_pixel_variance4x4 = aom_highbd_12_sub_pixel_variance4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5982:9): [True: 1, False: 0]
  ------------------
 5983|      1|    aom_highbd_12_variance4x4 = aom_highbd_12_variance4x4_c;
 5984|      1|    if (flags & HAS_SSE4_1) aom_highbd_12_variance4x4 = aom_highbd_12_variance4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (5984:9): [True: 1, False: 0]
  ------------------
 5985|      1|    aom_highbd_8_masked_sub_pixel_variance128x128 = aom_highbd_8_masked_sub_pixel_variance128x128_c;
 5986|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance128x128 = aom_highbd_8_masked_sub_pixel_variance128x128_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5986:9): [True: 1, False: 0]
  ------------------
 5987|      1|    aom_highbd_8_masked_sub_pixel_variance128x64 = aom_highbd_8_masked_sub_pixel_variance128x64_c;
 5988|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance128x64 = aom_highbd_8_masked_sub_pixel_variance128x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5988:9): [True: 1, False: 0]
  ------------------
 5989|      1|    aom_highbd_8_masked_sub_pixel_variance16x16 = aom_highbd_8_masked_sub_pixel_variance16x16_c;
 5990|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance16x16 = aom_highbd_8_masked_sub_pixel_variance16x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5990:9): [True: 1, False: 0]
  ------------------
 5991|      1|    aom_highbd_8_masked_sub_pixel_variance16x32 = aom_highbd_8_masked_sub_pixel_variance16x32_c;
 5992|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance16x32 = aom_highbd_8_masked_sub_pixel_variance16x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5992:9): [True: 1, False: 0]
  ------------------
 5993|      1|    aom_highbd_8_masked_sub_pixel_variance16x4 = aom_highbd_8_masked_sub_pixel_variance16x4_c;
 5994|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance16x4 = aom_highbd_8_masked_sub_pixel_variance16x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5994:9): [True: 1, False: 0]
  ------------------
 5995|      1|    aom_highbd_8_masked_sub_pixel_variance16x64 = aom_highbd_8_masked_sub_pixel_variance16x64_c;
 5996|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance16x64 = aom_highbd_8_masked_sub_pixel_variance16x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5996:9): [True: 1, False: 0]
  ------------------
 5997|      1|    aom_highbd_8_masked_sub_pixel_variance16x8 = aom_highbd_8_masked_sub_pixel_variance16x8_c;
 5998|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance16x8 = aom_highbd_8_masked_sub_pixel_variance16x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (5998:9): [True: 1, False: 0]
  ------------------
 5999|      1|    aom_highbd_8_masked_sub_pixel_variance32x16 = aom_highbd_8_masked_sub_pixel_variance32x16_c;
 6000|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance32x16 = aom_highbd_8_masked_sub_pixel_variance32x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6000:9): [True: 1, False: 0]
  ------------------
 6001|      1|    aom_highbd_8_masked_sub_pixel_variance32x32 = aom_highbd_8_masked_sub_pixel_variance32x32_c;
 6002|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance32x32 = aom_highbd_8_masked_sub_pixel_variance32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6002:9): [True: 1, False: 0]
  ------------------
 6003|      1|    aom_highbd_8_masked_sub_pixel_variance32x64 = aom_highbd_8_masked_sub_pixel_variance32x64_c;
 6004|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance32x64 = aom_highbd_8_masked_sub_pixel_variance32x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6004:9): [True: 1, False: 0]
  ------------------
 6005|      1|    aom_highbd_8_masked_sub_pixel_variance32x8 = aom_highbd_8_masked_sub_pixel_variance32x8_c;
 6006|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance32x8 = aom_highbd_8_masked_sub_pixel_variance32x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6006:9): [True: 1, False: 0]
  ------------------
 6007|      1|    aom_highbd_8_masked_sub_pixel_variance4x16 = aom_highbd_8_masked_sub_pixel_variance4x16_c;
 6008|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance4x16 = aom_highbd_8_masked_sub_pixel_variance4x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6008:9): [True: 1, False: 0]
  ------------------
 6009|      1|    aom_highbd_8_masked_sub_pixel_variance4x4 = aom_highbd_8_masked_sub_pixel_variance4x4_c;
 6010|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance4x4 = aom_highbd_8_masked_sub_pixel_variance4x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6010:9): [True: 1, False: 0]
  ------------------
 6011|      1|    aom_highbd_8_masked_sub_pixel_variance4x8 = aom_highbd_8_masked_sub_pixel_variance4x8_c;
 6012|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance4x8 = aom_highbd_8_masked_sub_pixel_variance4x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6012:9): [True: 1, False: 0]
  ------------------
 6013|      1|    aom_highbd_8_masked_sub_pixel_variance64x128 = aom_highbd_8_masked_sub_pixel_variance64x128_c;
 6014|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance64x128 = aom_highbd_8_masked_sub_pixel_variance64x128_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6014:9): [True: 1, False: 0]
  ------------------
 6015|      1|    aom_highbd_8_masked_sub_pixel_variance64x16 = aom_highbd_8_masked_sub_pixel_variance64x16_c;
 6016|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance64x16 = aom_highbd_8_masked_sub_pixel_variance64x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6016:9): [True: 1, False: 0]
  ------------------
 6017|      1|    aom_highbd_8_masked_sub_pixel_variance64x32 = aom_highbd_8_masked_sub_pixel_variance64x32_c;
 6018|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance64x32 = aom_highbd_8_masked_sub_pixel_variance64x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6018:9): [True: 1, False: 0]
  ------------------
 6019|      1|    aom_highbd_8_masked_sub_pixel_variance64x64 = aom_highbd_8_masked_sub_pixel_variance64x64_c;
 6020|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance64x64 = aom_highbd_8_masked_sub_pixel_variance64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6020:9): [True: 1, False: 0]
  ------------------
 6021|      1|    aom_highbd_8_masked_sub_pixel_variance8x16 = aom_highbd_8_masked_sub_pixel_variance8x16_c;
 6022|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance8x16 = aom_highbd_8_masked_sub_pixel_variance8x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6022:9): [True: 1, False: 0]
  ------------------
 6023|      1|    aom_highbd_8_masked_sub_pixel_variance8x32 = aom_highbd_8_masked_sub_pixel_variance8x32_c;
 6024|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance8x32 = aom_highbd_8_masked_sub_pixel_variance8x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6024:9): [True: 1, False: 0]
  ------------------
 6025|      1|    aom_highbd_8_masked_sub_pixel_variance8x4 = aom_highbd_8_masked_sub_pixel_variance8x4_c;
 6026|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance8x4 = aom_highbd_8_masked_sub_pixel_variance8x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6026:9): [True: 1, False: 0]
  ------------------
 6027|      1|    aom_highbd_8_masked_sub_pixel_variance8x8 = aom_highbd_8_masked_sub_pixel_variance8x8_c;
 6028|      1|    if (flags & HAS_SSSE3) aom_highbd_8_masked_sub_pixel_variance8x8 = aom_highbd_8_masked_sub_pixel_variance8x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6028:9): [True: 1, False: 0]
  ------------------
 6029|      1|    aom_highbd_8_obmc_variance128x128 = aom_highbd_8_obmc_variance128x128_c;
 6030|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance128x128 = aom_highbd_8_obmc_variance128x128_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6030:9): [True: 1, False: 0]
  ------------------
 6031|      1|    aom_highbd_8_obmc_variance128x64 = aom_highbd_8_obmc_variance128x64_c;
 6032|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance128x64 = aom_highbd_8_obmc_variance128x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6032:9): [True: 1, False: 0]
  ------------------
 6033|      1|    aom_highbd_8_obmc_variance16x16 = aom_highbd_8_obmc_variance16x16_c;
 6034|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance16x16 = aom_highbd_8_obmc_variance16x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6034:9): [True: 1, False: 0]
  ------------------
 6035|      1|    aom_highbd_8_obmc_variance16x32 = aom_highbd_8_obmc_variance16x32_c;
 6036|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance16x32 = aom_highbd_8_obmc_variance16x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6036:9): [True: 1, False: 0]
  ------------------
 6037|      1|    aom_highbd_8_obmc_variance16x4 = aom_highbd_8_obmc_variance16x4_c;
 6038|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance16x4 = aom_highbd_8_obmc_variance16x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6038:9): [True: 1, False: 0]
  ------------------
 6039|      1|    aom_highbd_8_obmc_variance16x64 = aom_highbd_8_obmc_variance16x64_c;
 6040|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance16x64 = aom_highbd_8_obmc_variance16x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6040:9): [True: 1, False: 0]
  ------------------
 6041|      1|    aom_highbd_8_obmc_variance16x8 = aom_highbd_8_obmc_variance16x8_c;
 6042|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance16x8 = aom_highbd_8_obmc_variance16x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6042:9): [True: 1, False: 0]
  ------------------
 6043|      1|    aom_highbd_8_obmc_variance32x16 = aom_highbd_8_obmc_variance32x16_c;
 6044|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance32x16 = aom_highbd_8_obmc_variance32x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6044:9): [True: 1, False: 0]
  ------------------
 6045|      1|    aom_highbd_8_obmc_variance32x32 = aom_highbd_8_obmc_variance32x32_c;
 6046|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance32x32 = aom_highbd_8_obmc_variance32x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6046:9): [True: 1, False: 0]
  ------------------
 6047|      1|    aom_highbd_8_obmc_variance32x64 = aom_highbd_8_obmc_variance32x64_c;
 6048|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance32x64 = aom_highbd_8_obmc_variance32x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6048:9): [True: 1, False: 0]
  ------------------
 6049|      1|    aom_highbd_8_obmc_variance32x8 = aom_highbd_8_obmc_variance32x8_c;
 6050|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance32x8 = aom_highbd_8_obmc_variance32x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6050:9): [True: 1, False: 0]
  ------------------
 6051|      1|    aom_highbd_8_obmc_variance4x16 = aom_highbd_8_obmc_variance4x16_c;
 6052|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance4x16 = aom_highbd_8_obmc_variance4x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6052:9): [True: 1, False: 0]
  ------------------
 6053|      1|    aom_highbd_8_obmc_variance4x4 = aom_highbd_8_obmc_variance4x4_c;
 6054|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance4x4 = aom_highbd_8_obmc_variance4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6054:9): [True: 1, False: 0]
  ------------------
 6055|      1|    aom_highbd_8_obmc_variance4x8 = aom_highbd_8_obmc_variance4x8_c;
 6056|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance4x8 = aom_highbd_8_obmc_variance4x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6056:9): [True: 1, False: 0]
  ------------------
 6057|      1|    aom_highbd_8_obmc_variance64x128 = aom_highbd_8_obmc_variance64x128_c;
 6058|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance64x128 = aom_highbd_8_obmc_variance64x128_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6058:9): [True: 1, False: 0]
  ------------------
 6059|      1|    aom_highbd_8_obmc_variance64x16 = aom_highbd_8_obmc_variance64x16_c;
 6060|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance64x16 = aom_highbd_8_obmc_variance64x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6060:9): [True: 1, False: 0]
  ------------------
 6061|      1|    aom_highbd_8_obmc_variance64x32 = aom_highbd_8_obmc_variance64x32_c;
 6062|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance64x32 = aom_highbd_8_obmc_variance64x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6062:9): [True: 1, False: 0]
  ------------------
 6063|      1|    aom_highbd_8_obmc_variance64x64 = aom_highbd_8_obmc_variance64x64_c;
 6064|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance64x64 = aom_highbd_8_obmc_variance64x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6064:9): [True: 1, False: 0]
  ------------------
 6065|      1|    aom_highbd_8_obmc_variance8x16 = aom_highbd_8_obmc_variance8x16_c;
 6066|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance8x16 = aom_highbd_8_obmc_variance8x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6066:9): [True: 1, False: 0]
  ------------------
 6067|      1|    aom_highbd_8_obmc_variance8x32 = aom_highbd_8_obmc_variance8x32_c;
 6068|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance8x32 = aom_highbd_8_obmc_variance8x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6068:9): [True: 1, False: 0]
  ------------------
 6069|      1|    aom_highbd_8_obmc_variance8x4 = aom_highbd_8_obmc_variance8x4_c;
 6070|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance8x4 = aom_highbd_8_obmc_variance8x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6070:9): [True: 1, False: 0]
  ------------------
 6071|      1|    aom_highbd_8_obmc_variance8x8 = aom_highbd_8_obmc_variance8x8_c;
 6072|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_obmc_variance8x8 = aom_highbd_8_obmc_variance8x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6072:9): [True: 1, False: 0]
  ------------------
 6073|      1|    aom_highbd_8_sub_pixel_avg_variance4x4 = aom_highbd_8_sub_pixel_avg_variance4x4_c;
 6074|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_sub_pixel_avg_variance4x4 = aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6074:9): [True: 1, False: 0]
  ------------------
 6075|      1|    aom_highbd_8_sub_pixel_variance4x4 = aom_highbd_8_sub_pixel_variance4x4_c;
 6076|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_sub_pixel_variance4x4 = aom_highbd_8_sub_pixel_variance4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6076:9): [True: 1, False: 0]
  ------------------
 6077|      1|    aom_highbd_8_variance4x4 = aom_highbd_8_variance4x4_c;
 6078|      1|    if (flags & HAS_SSE4_1) aom_highbd_8_variance4x4 = aom_highbd_8_variance4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6078:9): [True: 1, False: 0]
  ------------------
 6079|      1|    aom_highbd_blend_a64_d16_mask = aom_highbd_blend_a64_d16_mask_c;
 6080|      1|    if (flags & HAS_SSE4_1) aom_highbd_blend_a64_d16_mask = aom_highbd_blend_a64_d16_mask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6080:9): [True: 1, False: 0]
  ------------------
 6081|      1|    if (flags & HAS_AVX2) aom_highbd_blend_a64_d16_mask = aom_highbd_blend_a64_d16_mask_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6081:9): [True: 1, False: 0]
  ------------------
 6082|      1|    aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_c;
 6083|      1|    if (flags & HAS_SSE4_1) aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6083:9): [True: 1, False: 0]
  ------------------
 6084|      1|    aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_c;
 6085|      1|    if (flags & HAS_SSE4_1) aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6085:9): [True: 1, False: 0]
  ------------------
 6086|      1|    aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_c;
 6087|      1|    if (flags & HAS_SSE4_1) aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6087:9): [True: 1, False: 0]
  ------------------
 6088|      1|    aom_highbd_comp_mask_pred = aom_highbd_comp_mask_pred_sse2;
 6089|      1|    if (flags & HAS_AVX2) aom_highbd_comp_mask_pred = aom_highbd_comp_mask_pred_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6089:9): [True: 1, False: 0]
  ------------------
 6090|      1|    aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_sse2;
 6091|      1|    if (flags & HAS_AVX2) aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6091:9): [True: 1, False: 0]
  ------------------
 6092|      1|    aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_sse2;
 6093|      1|    if (flags & HAS_AVX2) aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6093:9): [True: 1, False: 0]
  ------------------
 6094|      1|    aom_highbd_convolve_copy = aom_highbd_convolve_copy_sse2;
 6095|      1|    if (flags & HAS_AVX2) aom_highbd_convolve_copy = aom_highbd_convolve_copy_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6095:9): [True: 1, False: 0]
  ------------------
 6096|      1|    aom_highbd_hadamard_16x16 = aom_highbd_hadamard_16x16_c;
 6097|      1|    if (flags & HAS_AVX2) aom_highbd_hadamard_16x16 = aom_highbd_hadamard_16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6097:9): [True: 1, False: 0]
  ------------------
 6098|      1|    aom_highbd_hadamard_32x32 = aom_highbd_hadamard_32x32_c;
 6099|      1|    if (flags & HAS_AVX2) aom_highbd_hadamard_32x32 = aom_highbd_hadamard_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6099:9): [True: 1, False: 0]
  ------------------
 6100|      1|    aom_highbd_hadamard_8x8 = aom_highbd_hadamard_8x8_c;
 6101|      1|    if (flags & HAS_AVX2) aom_highbd_hadamard_8x8 = aom_highbd_hadamard_8x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6101:9): [True: 1, False: 0]
  ------------------
 6102|      1|    aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_sse2;
 6103|      1|    if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6103:9): [True: 1, False: 0]
  ------------------
 6104|      1|    aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_sse2;
 6105|      1|    if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6105:9): [True: 1, False: 0]
  ------------------
 6106|      1|    aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_sse2;
 6107|      1|    if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6107:9): [True: 1, False: 0]
  ------------------
 6108|      1|    aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_sse2;
 6109|      1|    if (flags & HAS_AVX2) aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6109:9): [True: 1, False: 0]
  ------------------
 6110|      1|    aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_sse2;
 6111|      1|    if (flags & HAS_AVX2) aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6111:9): [True: 1, False: 0]
  ------------------
 6112|      1|    aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_sse2;
 6113|      1|    if (flags & HAS_AVX2) aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6113:9): [True: 1, False: 0]
  ------------------
 6114|      1|    aom_highbd_masked_sad128x128 = aom_highbd_masked_sad128x128_c;
 6115|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad128x128 = aom_highbd_masked_sad128x128_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6115:9): [True: 1, False: 0]
  ------------------
 6116|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad128x128 = aom_highbd_masked_sad128x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6116:9): [True: 1, False: 0]
  ------------------
 6117|      1|    aom_highbd_masked_sad128x64 = aom_highbd_masked_sad128x64_c;
 6118|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad128x64 = aom_highbd_masked_sad128x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6118:9): [True: 1, False: 0]
  ------------------
 6119|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad128x64 = aom_highbd_masked_sad128x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6119:9): [True: 1, False: 0]
  ------------------
 6120|      1|    aom_highbd_masked_sad16x16 = aom_highbd_masked_sad16x16_c;
 6121|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad16x16 = aom_highbd_masked_sad16x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6121:9): [True: 1, False: 0]
  ------------------
 6122|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad16x16 = aom_highbd_masked_sad16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6122:9): [True: 1, False: 0]
  ------------------
 6123|      1|    aom_highbd_masked_sad16x32 = aom_highbd_masked_sad16x32_c;
 6124|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad16x32 = aom_highbd_masked_sad16x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6124:9): [True: 1, False: 0]
  ------------------
 6125|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad16x32 = aom_highbd_masked_sad16x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6125:9): [True: 1, False: 0]
  ------------------
 6126|      1|    aom_highbd_masked_sad16x4 = aom_highbd_masked_sad16x4_c;
 6127|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad16x4 = aom_highbd_masked_sad16x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6127:9): [True: 1, False: 0]
  ------------------
 6128|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad16x4 = aom_highbd_masked_sad16x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6128:9): [True: 1, False: 0]
  ------------------
 6129|      1|    aom_highbd_masked_sad16x64 = aom_highbd_masked_sad16x64_c;
 6130|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad16x64 = aom_highbd_masked_sad16x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6130:9): [True: 1, False: 0]
  ------------------
 6131|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad16x64 = aom_highbd_masked_sad16x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6131:9): [True: 1, False: 0]
  ------------------
 6132|      1|    aom_highbd_masked_sad16x8 = aom_highbd_masked_sad16x8_c;
 6133|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad16x8 = aom_highbd_masked_sad16x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6133:9): [True: 1, False: 0]
  ------------------
 6134|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad16x8 = aom_highbd_masked_sad16x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6134:9): [True: 1, False: 0]
  ------------------
 6135|      1|    aom_highbd_masked_sad32x16 = aom_highbd_masked_sad32x16_c;
 6136|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad32x16 = aom_highbd_masked_sad32x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6136:9): [True: 1, False: 0]
  ------------------
 6137|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad32x16 = aom_highbd_masked_sad32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6137:9): [True: 1, False: 0]
  ------------------
 6138|      1|    aom_highbd_masked_sad32x32 = aom_highbd_masked_sad32x32_c;
 6139|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad32x32 = aom_highbd_masked_sad32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6139:9): [True: 1, False: 0]
  ------------------
 6140|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad32x32 = aom_highbd_masked_sad32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6140:9): [True: 1, False: 0]
  ------------------
 6141|      1|    aom_highbd_masked_sad32x64 = aom_highbd_masked_sad32x64_c;
 6142|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad32x64 = aom_highbd_masked_sad32x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6142:9): [True: 1, False: 0]
  ------------------
 6143|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad32x64 = aom_highbd_masked_sad32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6143:9): [True: 1, False: 0]
  ------------------
 6144|      1|    aom_highbd_masked_sad32x8 = aom_highbd_masked_sad32x8_c;
 6145|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad32x8 = aom_highbd_masked_sad32x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6145:9): [True: 1, False: 0]
  ------------------
 6146|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad32x8 = aom_highbd_masked_sad32x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6146:9): [True: 1, False: 0]
  ------------------
 6147|      1|    aom_highbd_masked_sad4x16 = aom_highbd_masked_sad4x16_c;
 6148|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad4x16 = aom_highbd_masked_sad4x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6148:9): [True: 1, False: 0]
  ------------------
 6149|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad4x16 = aom_highbd_masked_sad4x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6149:9): [True: 1, False: 0]
  ------------------
 6150|      1|    aom_highbd_masked_sad4x4 = aom_highbd_masked_sad4x4_c;
 6151|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad4x4 = aom_highbd_masked_sad4x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6151:9): [True: 1, False: 0]
  ------------------
 6152|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad4x4 = aom_highbd_masked_sad4x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6152:9): [True: 1, False: 0]
  ------------------
 6153|      1|    aom_highbd_masked_sad4x8 = aom_highbd_masked_sad4x8_c;
 6154|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad4x8 = aom_highbd_masked_sad4x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6154:9): [True: 1, False: 0]
  ------------------
 6155|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad4x8 = aom_highbd_masked_sad4x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6155:9): [True: 1, False: 0]
  ------------------
 6156|      1|    aom_highbd_masked_sad64x128 = aom_highbd_masked_sad64x128_c;
 6157|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad64x128 = aom_highbd_masked_sad64x128_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6157:9): [True: 1, False: 0]
  ------------------
 6158|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad64x128 = aom_highbd_masked_sad64x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6158:9): [True: 1, False: 0]
  ------------------
 6159|      1|    aom_highbd_masked_sad64x16 = aom_highbd_masked_sad64x16_c;
 6160|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad64x16 = aom_highbd_masked_sad64x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6160:9): [True: 1, False: 0]
  ------------------
 6161|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad64x16 = aom_highbd_masked_sad64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6161:9): [True: 1, False: 0]
  ------------------
 6162|      1|    aom_highbd_masked_sad64x32 = aom_highbd_masked_sad64x32_c;
 6163|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad64x32 = aom_highbd_masked_sad64x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6163:9): [True: 1, False: 0]
  ------------------
 6164|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad64x32 = aom_highbd_masked_sad64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6164:9): [True: 1, False: 0]
  ------------------
 6165|      1|    aom_highbd_masked_sad64x64 = aom_highbd_masked_sad64x64_c;
 6166|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad64x64 = aom_highbd_masked_sad64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6166:9): [True: 1, False: 0]
  ------------------
 6167|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad64x64 = aom_highbd_masked_sad64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6167:9): [True: 1, False: 0]
  ------------------
 6168|      1|    aom_highbd_masked_sad8x16 = aom_highbd_masked_sad8x16_c;
 6169|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad8x16 = aom_highbd_masked_sad8x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6169:9): [True: 1, False: 0]
  ------------------
 6170|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad8x16 = aom_highbd_masked_sad8x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6170:9): [True: 1, False: 0]
  ------------------
 6171|      1|    aom_highbd_masked_sad8x32 = aom_highbd_masked_sad8x32_c;
 6172|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad8x32 = aom_highbd_masked_sad8x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6172:9): [True: 1, False: 0]
  ------------------
 6173|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad8x32 = aom_highbd_masked_sad8x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6173:9): [True: 1, False: 0]
  ------------------
 6174|      1|    aom_highbd_masked_sad8x4 = aom_highbd_masked_sad8x4_c;
 6175|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad8x4 = aom_highbd_masked_sad8x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6175:9): [True: 1, False: 0]
  ------------------
 6176|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad8x4 = aom_highbd_masked_sad8x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6176:9): [True: 1, False: 0]
  ------------------
 6177|      1|    aom_highbd_masked_sad8x8 = aom_highbd_masked_sad8x8_c;
 6178|      1|    if (flags & HAS_SSSE3) aom_highbd_masked_sad8x8 = aom_highbd_masked_sad8x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6178:9): [True: 1, False: 0]
  ------------------
 6179|      1|    if (flags & HAS_AVX2) aom_highbd_masked_sad8x8 = aom_highbd_masked_sad8x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6179:9): [True: 1, False: 0]
  ------------------
 6180|      1|    aom_highbd_obmc_sad128x128 = aom_highbd_obmc_sad128x128_c;
 6181|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad128x128 = aom_highbd_obmc_sad128x128_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6181:9): [True: 1, False: 0]
  ------------------
 6182|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad128x128 = aom_highbd_obmc_sad128x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6182:9): [True: 1, False: 0]
  ------------------
 6183|      1|    aom_highbd_obmc_sad128x64 = aom_highbd_obmc_sad128x64_c;
 6184|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad128x64 = aom_highbd_obmc_sad128x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6184:9): [True: 1, False: 0]
  ------------------
 6185|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad128x64 = aom_highbd_obmc_sad128x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6185:9): [True: 1, False: 0]
  ------------------
 6186|      1|    aom_highbd_obmc_sad16x16 = aom_highbd_obmc_sad16x16_c;
 6187|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad16x16 = aom_highbd_obmc_sad16x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6187:9): [True: 1, False: 0]
  ------------------
 6188|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad16x16 = aom_highbd_obmc_sad16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6188:9): [True: 1, False: 0]
  ------------------
 6189|      1|    aom_highbd_obmc_sad16x32 = aom_highbd_obmc_sad16x32_c;
 6190|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad16x32 = aom_highbd_obmc_sad16x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6190:9): [True: 1, False: 0]
  ------------------
 6191|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad16x32 = aom_highbd_obmc_sad16x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6191:9): [True: 1, False: 0]
  ------------------
 6192|      1|    aom_highbd_obmc_sad16x4 = aom_highbd_obmc_sad16x4_c;
 6193|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad16x4 = aom_highbd_obmc_sad16x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6193:9): [True: 1, False: 0]
  ------------------
 6194|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad16x4 = aom_highbd_obmc_sad16x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6194:9): [True: 1, False: 0]
  ------------------
 6195|      1|    aom_highbd_obmc_sad16x64 = aom_highbd_obmc_sad16x64_c;
 6196|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad16x64 = aom_highbd_obmc_sad16x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6196:9): [True: 1, False: 0]
  ------------------
 6197|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad16x64 = aom_highbd_obmc_sad16x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6197:9): [True: 1, False: 0]
  ------------------
 6198|      1|    aom_highbd_obmc_sad16x8 = aom_highbd_obmc_sad16x8_c;
 6199|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad16x8 = aom_highbd_obmc_sad16x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6199:9): [True: 1, False: 0]
  ------------------
 6200|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad16x8 = aom_highbd_obmc_sad16x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6200:9): [True: 1, False: 0]
  ------------------
 6201|      1|    aom_highbd_obmc_sad32x16 = aom_highbd_obmc_sad32x16_c;
 6202|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad32x16 = aom_highbd_obmc_sad32x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6202:9): [True: 1, False: 0]
  ------------------
 6203|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad32x16 = aom_highbd_obmc_sad32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6203:9): [True: 1, False: 0]
  ------------------
 6204|      1|    aom_highbd_obmc_sad32x32 = aom_highbd_obmc_sad32x32_c;
 6205|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad32x32 = aom_highbd_obmc_sad32x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6205:9): [True: 1, False: 0]
  ------------------
 6206|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad32x32 = aom_highbd_obmc_sad32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6206:9): [True: 1, False: 0]
  ------------------
 6207|      1|    aom_highbd_obmc_sad32x64 = aom_highbd_obmc_sad32x64_c;
 6208|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad32x64 = aom_highbd_obmc_sad32x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6208:9): [True: 1, False: 0]
  ------------------
 6209|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad32x64 = aom_highbd_obmc_sad32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6209:9): [True: 1, False: 0]
  ------------------
 6210|      1|    aom_highbd_obmc_sad32x8 = aom_highbd_obmc_sad32x8_c;
 6211|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad32x8 = aom_highbd_obmc_sad32x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6211:9): [True: 1, False: 0]
  ------------------
 6212|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad32x8 = aom_highbd_obmc_sad32x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6212:9): [True: 1, False: 0]
  ------------------
 6213|      1|    aom_highbd_obmc_sad4x16 = aom_highbd_obmc_sad4x16_c;
 6214|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad4x16 = aom_highbd_obmc_sad4x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6214:9): [True: 1, False: 0]
  ------------------
 6215|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad4x16 = aom_highbd_obmc_sad4x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6215:9): [True: 1, False: 0]
  ------------------
 6216|      1|    aom_highbd_obmc_sad4x4 = aom_highbd_obmc_sad4x4_c;
 6217|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad4x4 = aom_highbd_obmc_sad4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6217:9): [True: 1, False: 0]
  ------------------
 6218|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad4x4 = aom_highbd_obmc_sad4x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6218:9): [True: 1, False: 0]
  ------------------
 6219|      1|    aom_highbd_obmc_sad4x8 = aom_highbd_obmc_sad4x8_c;
 6220|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad4x8 = aom_highbd_obmc_sad4x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6220:9): [True: 1, False: 0]
  ------------------
 6221|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad4x8 = aom_highbd_obmc_sad4x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6221:9): [True: 1, False: 0]
  ------------------
 6222|      1|    aom_highbd_obmc_sad64x128 = aom_highbd_obmc_sad64x128_c;
 6223|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad64x128 = aom_highbd_obmc_sad64x128_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6223:9): [True: 1, False: 0]
  ------------------
 6224|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad64x128 = aom_highbd_obmc_sad64x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6224:9): [True: 1, False: 0]
  ------------------
 6225|      1|    aom_highbd_obmc_sad64x16 = aom_highbd_obmc_sad64x16_c;
 6226|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad64x16 = aom_highbd_obmc_sad64x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6226:9): [True: 1, False: 0]
  ------------------
 6227|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad64x16 = aom_highbd_obmc_sad64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6227:9): [True: 1, False: 0]
  ------------------
 6228|      1|    aom_highbd_obmc_sad64x32 = aom_highbd_obmc_sad64x32_c;
 6229|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad64x32 = aom_highbd_obmc_sad64x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6229:9): [True: 1, False: 0]
  ------------------
 6230|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad64x32 = aom_highbd_obmc_sad64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6230:9): [True: 1, False: 0]
  ------------------
 6231|      1|    aom_highbd_obmc_sad64x64 = aom_highbd_obmc_sad64x64_c;
 6232|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad64x64 = aom_highbd_obmc_sad64x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6232:9): [True: 1, False: 0]
  ------------------
 6233|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad64x64 = aom_highbd_obmc_sad64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6233:9): [True: 1, False: 0]
  ------------------
 6234|      1|    aom_highbd_obmc_sad8x16 = aom_highbd_obmc_sad8x16_c;
 6235|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad8x16 = aom_highbd_obmc_sad8x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6235:9): [True: 1, False: 0]
  ------------------
 6236|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad8x16 = aom_highbd_obmc_sad8x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6236:9): [True: 1, False: 0]
  ------------------
 6237|      1|    aom_highbd_obmc_sad8x32 = aom_highbd_obmc_sad8x32_c;
 6238|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad8x32 = aom_highbd_obmc_sad8x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6238:9): [True: 1, False: 0]
  ------------------
 6239|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad8x32 = aom_highbd_obmc_sad8x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6239:9): [True: 1, False: 0]
  ------------------
 6240|      1|    aom_highbd_obmc_sad8x4 = aom_highbd_obmc_sad8x4_c;
 6241|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad8x4 = aom_highbd_obmc_sad8x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6241:9): [True: 1, False: 0]
  ------------------
 6242|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad8x4 = aom_highbd_obmc_sad8x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6242:9): [True: 1, False: 0]
  ------------------
 6243|      1|    aom_highbd_obmc_sad8x8 = aom_highbd_obmc_sad8x8_c;
 6244|      1|    if (flags & HAS_SSE4_1) aom_highbd_obmc_sad8x8 = aom_highbd_obmc_sad8x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6244:9): [True: 1, False: 0]
  ------------------
 6245|      1|    if (flags & HAS_AVX2) aom_highbd_obmc_sad8x8 = aom_highbd_obmc_sad8x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6245:9): [True: 1, False: 0]
  ------------------
 6246|      1|    aom_highbd_quantize_b = aom_highbd_quantize_b_sse2;
 6247|      1|    if (flags & HAS_AVX2) aom_highbd_quantize_b = aom_highbd_quantize_b_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6247:9): [True: 1, False: 0]
  ------------------
 6248|      1|    aom_highbd_quantize_b_32x32 = aom_highbd_quantize_b_32x32_sse2;
 6249|      1|    if (flags & HAS_AVX2) aom_highbd_quantize_b_32x32 = aom_highbd_quantize_b_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6249:9): [True: 1, False: 0]
  ------------------
 6250|      1|    aom_highbd_quantize_b_32x32_adaptive = aom_highbd_quantize_b_32x32_adaptive_sse2;
 6251|      1|    if (flags & HAS_AVX2) aom_highbd_quantize_b_32x32_adaptive = aom_highbd_quantize_b_32x32_adaptive_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6251:9): [True: 1, False: 0]
  ------------------
 6252|      1|    aom_highbd_quantize_b_64x64 = aom_highbd_quantize_b_64x64_sse2;
 6253|      1|    if (flags & HAS_AVX2) aom_highbd_quantize_b_64x64 = aom_highbd_quantize_b_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6253:9): [True: 1, False: 0]
  ------------------
 6254|      1|    aom_highbd_quantize_b_adaptive = aom_highbd_quantize_b_adaptive_sse2;
 6255|      1|    if (flags & HAS_AVX2) aom_highbd_quantize_b_adaptive = aom_highbd_quantize_b_adaptive_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6255:9): [True: 1, False: 0]
  ------------------
 6256|      1|    aom_highbd_sad128x128 = aom_highbd_sad128x128_c;
 6257|      1|    if (flags & HAS_AVX2) aom_highbd_sad128x128 = aom_highbd_sad128x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6257:9): [True: 1, False: 0]
  ------------------
 6258|      1|    aom_highbd_sad128x128_avg = aom_highbd_sad128x128_avg_c;
 6259|      1|    if (flags & HAS_AVX2) aom_highbd_sad128x128_avg = aom_highbd_sad128x128_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6259:9): [True: 1, False: 0]
  ------------------
 6260|      1|    aom_highbd_sad128x128x3d = aom_highbd_sad128x128x3d_c;
 6261|      1|    if (flags & HAS_AVX2) aom_highbd_sad128x128x3d = aom_highbd_sad128x128x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6261:9): [True: 1, False: 0]
  ------------------
 6262|      1|    aom_highbd_sad128x128x4d = aom_highbd_sad128x128x4d_c;
 6263|      1|    if (flags & HAS_AVX2) aom_highbd_sad128x128x4d = aom_highbd_sad128x128x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6263:9): [True: 1, False: 0]
  ------------------
 6264|      1|    aom_highbd_sad128x64 = aom_highbd_sad128x64_c;
 6265|      1|    if (flags & HAS_AVX2) aom_highbd_sad128x64 = aom_highbd_sad128x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6265:9): [True: 1, False: 0]
  ------------------
 6266|      1|    aom_highbd_sad128x64_avg = aom_highbd_sad128x64_avg_c;
 6267|      1|    if (flags & HAS_AVX2) aom_highbd_sad128x64_avg = aom_highbd_sad128x64_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6267:9): [True: 1, False: 0]
  ------------------
 6268|      1|    aom_highbd_sad128x64x3d = aom_highbd_sad128x64x3d_c;
 6269|      1|    if (flags & HAS_AVX2) aom_highbd_sad128x64x3d = aom_highbd_sad128x64x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6269:9): [True: 1, False: 0]
  ------------------
 6270|      1|    aom_highbd_sad128x64x4d = aom_highbd_sad128x64x4d_c;
 6271|      1|    if (flags & HAS_AVX2) aom_highbd_sad128x64x4d = aom_highbd_sad128x64x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6271:9): [True: 1, False: 0]
  ------------------
 6272|      1|    aom_highbd_sad16x16 = aom_highbd_sad16x16_sse2;
 6273|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x16 = aom_highbd_sad16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6273:9): [True: 1, False: 0]
  ------------------
 6274|      1|    aom_highbd_sad16x16_avg = aom_highbd_sad16x16_avg_sse2;
 6275|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x16_avg = aom_highbd_sad16x16_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6275:9): [True: 1, False: 0]
  ------------------
 6276|      1|    aom_highbd_sad16x16x3d = aom_highbd_sad16x16x3d_c;
 6277|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x16x3d = aom_highbd_sad16x16x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6277:9): [True: 1, False: 0]
  ------------------
 6278|      1|    aom_highbd_sad16x16x4d = aom_highbd_sad16x16x4d_sse2;
 6279|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x16x4d = aom_highbd_sad16x16x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6279:9): [True: 1, False: 0]
  ------------------
 6280|      1|    aom_highbd_sad16x32 = aom_highbd_sad16x32_sse2;
 6281|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x32 = aom_highbd_sad16x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6281:9): [True: 1, False: 0]
  ------------------
 6282|      1|    aom_highbd_sad16x32_avg = aom_highbd_sad16x32_avg_sse2;
 6283|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x32_avg = aom_highbd_sad16x32_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6283:9): [True: 1, False: 0]
  ------------------
 6284|      1|    aom_highbd_sad16x32x3d = aom_highbd_sad16x32x3d_c;
 6285|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x32x3d = aom_highbd_sad16x32x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6285:9): [True: 1, False: 0]
  ------------------
 6286|      1|    aom_highbd_sad16x32x4d = aom_highbd_sad16x32x4d_sse2;
 6287|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x32x4d = aom_highbd_sad16x32x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6287:9): [True: 1, False: 0]
  ------------------
 6288|      1|    aom_highbd_sad16x4 = aom_highbd_sad16x4_sse2;
 6289|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x4 = aom_highbd_sad16x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6289:9): [True: 1, False: 0]
  ------------------
 6290|      1|    aom_highbd_sad16x4x3d = aom_highbd_sad16x4x3d_c;
 6291|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x4x3d = aom_highbd_sad16x4x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6291:9): [True: 1, False: 0]
  ------------------
 6292|      1|    aom_highbd_sad16x4x4d = aom_highbd_sad16x4x4d_sse2;
 6293|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x4x4d = aom_highbd_sad16x4x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6293:9): [True: 1, False: 0]
  ------------------
 6294|      1|    aom_highbd_sad16x64 = aom_highbd_sad16x64_sse2;
 6295|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x64 = aom_highbd_sad16x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6295:9): [True: 1, False: 0]
  ------------------
 6296|      1|    aom_highbd_sad16x64_avg = aom_highbd_sad16x64_avg_sse2;
 6297|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x64_avg = aom_highbd_sad16x64_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6297:9): [True: 1, False: 0]
  ------------------
 6298|      1|    aom_highbd_sad16x64x3d = aom_highbd_sad16x64x3d_c;
 6299|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x64x3d = aom_highbd_sad16x64x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6299:9): [True: 1, False: 0]
  ------------------
 6300|      1|    aom_highbd_sad16x64x4d = aom_highbd_sad16x64x4d_sse2;
 6301|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x64x4d = aom_highbd_sad16x64x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6301:9): [True: 1, False: 0]
  ------------------
 6302|      1|    aom_highbd_sad16x8 = aom_highbd_sad16x8_sse2;
 6303|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x8 = aom_highbd_sad16x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6303:9): [True: 1, False: 0]
  ------------------
 6304|      1|    aom_highbd_sad16x8_avg = aom_highbd_sad16x8_avg_sse2;
 6305|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x8_avg = aom_highbd_sad16x8_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6305:9): [True: 1, False: 0]
  ------------------
 6306|      1|    aom_highbd_sad16x8x3d = aom_highbd_sad16x8x3d_c;
 6307|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x8x3d = aom_highbd_sad16x8x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6307:9): [True: 1, False: 0]
  ------------------
 6308|      1|    aom_highbd_sad16x8x4d = aom_highbd_sad16x8x4d_sse2;
 6309|      1|    if (flags & HAS_AVX2) aom_highbd_sad16x8x4d = aom_highbd_sad16x8x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6309:9): [True: 1, False: 0]
  ------------------
 6310|      1|    aom_highbd_sad32x16 = aom_highbd_sad32x16_sse2;
 6311|      1|    if (flags & HAS_AVX2) aom_highbd_sad32x16 = aom_highbd_sad32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6311:9): [True: 1, False: 0]
  ------------------
 6312|      1|    aom_highbd_sad32x16_avg = aom_highbd_sad32x16_avg_sse2;
 6313|      1|    if (flags & HAS_AVX2) aom_highbd_sad32x16_avg = aom_highbd_sad32x16_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6313:9): [True: 1, False: 0]
  ------------------
 6314|      1|    aom_highbd_sad32x16x3d = aom_highbd_sad32x16x3d_c;
 6315|      1|    if (flags & HAS_AVX2) aom_highbd_sad32x16x3d = aom_highbd_sad32x16x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6315:9): [True: 1, False: 0]
  ------------------
 6316|      1|    aom_highbd_sad32x16x4d = aom_highbd_sad32x16x4d_sse2;
 6317|      1|    if (flags & HAS_AVX2) aom_highbd_sad32x16x4d = aom_highbd_sad32x16x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6317:9): [True: 1, False: 0]
  ------------------
 6318|      1|    aom_highbd_sad32x32 = aom_highbd_sad32x32_sse2;
 6319|      1|    if (flags & HAS_AVX2) aom_highbd_sad32x32 = aom_highbd_sad32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6319:9): [True: 1, False: 0]
  ------------------
 6320|      1|    aom_highbd_sad32x32_avg = aom_highbd_sad32x32_avg_sse2;
 6321|      1|    if (flags & HAS_AVX2) aom_highbd_sad32x32_avg = aom_highbd_sad32x32_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6321:9): [True: 1, False: 0]
  ------------------
 6322|      1|    aom_highbd_sad32x32x3d = aom_highbd_sad32x32x3d_c;
 6323|      1|    if (flags & HAS_AVX2) aom_highbd_sad32x32x3d = aom_highbd_sad32x32x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6323:9): [True: 1, False: 0]
  ------------------
 6324|      1|    aom_highbd_sad32x32x4d = aom_highbd_sad32x32x4d_sse2;
 6325|      1|    if (flags & HAS_AVX2) aom_highbd_sad32x32x4d = aom_highbd_sad32x32x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6325:9): [True: 1, False: 0]
  ------------------
 6326|      1|    aom_highbd_sad32x64 = aom_highbd_sad32x64_sse2;
 6327|      1|    if (flags & HAS_AVX2) aom_highbd_sad32x64 = aom_highbd_sad32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6327:9): [True: 1, False: 0]
  ------------------
 6328|      1|    aom_highbd_sad32x64_avg = aom_highbd_sad32x64_avg_sse2;
 6329|      1|    if (flags & HAS_AVX2) aom_highbd_sad32x64_avg = aom_highbd_sad32x64_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6329:9): [True: 1, False: 0]
  ------------------
 6330|      1|    aom_highbd_sad32x64x3d = aom_highbd_sad32x64x3d_c;
 6331|      1|    if (flags & HAS_AVX2) aom_highbd_sad32x64x3d = aom_highbd_sad32x64x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6331:9): [True: 1, False: 0]
  ------------------
 6332|      1|    aom_highbd_sad32x64x4d = aom_highbd_sad32x64x4d_sse2;
 6333|      1|    if (flags & HAS_AVX2) aom_highbd_sad32x64x4d = aom_highbd_sad32x64x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6333:9): [True: 1, False: 0]
  ------------------
 6334|      1|    aom_highbd_sad32x8 = aom_highbd_sad32x8_sse2;
 6335|      1|    if (flags & HAS_AVX2) aom_highbd_sad32x8 = aom_highbd_sad32x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6335:9): [True: 1, False: 0]
  ------------------
 6336|      1|    aom_highbd_sad32x8_avg = aom_highbd_sad32x8_avg_sse2;
 6337|      1|    if (flags & HAS_AVX2) aom_highbd_sad32x8_avg = aom_highbd_sad32x8_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6337:9): [True: 1, False: 0]
  ------------------
 6338|      1|    aom_highbd_sad32x8x3d = aom_highbd_sad32x8x3d_c;
 6339|      1|    if (flags & HAS_AVX2) aom_highbd_sad32x8x3d = aom_highbd_sad32x8x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6339:9): [True: 1, False: 0]
  ------------------
 6340|      1|    aom_highbd_sad32x8x4d = aom_highbd_sad32x8x4d_sse2;
 6341|      1|    if (flags & HAS_AVX2) aom_highbd_sad32x8x4d = aom_highbd_sad32x8x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6341:9): [True: 1, False: 0]
  ------------------
 6342|      1|    aom_highbd_sad64x128 = aom_highbd_sad64x128_c;
 6343|      1|    if (flags & HAS_AVX2) aom_highbd_sad64x128 = aom_highbd_sad64x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6343:9): [True: 1, False: 0]
  ------------------
 6344|      1|    aom_highbd_sad64x128_avg = aom_highbd_sad64x128_avg_c;
 6345|      1|    if (flags & HAS_AVX2) aom_highbd_sad64x128_avg = aom_highbd_sad64x128_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6345:9): [True: 1, False: 0]
  ------------------
 6346|      1|    aom_highbd_sad64x128x3d = aom_highbd_sad64x128x3d_c;
 6347|      1|    if (flags & HAS_AVX2) aom_highbd_sad64x128x3d = aom_highbd_sad64x128x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6347:9): [True: 1, False: 0]
  ------------------
 6348|      1|    aom_highbd_sad64x128x4d = aom_highbd_sad64x128x4d_c;
 6349|      1|    if (flags & HAS_AVX2) aom_highbd_sad64x128x4d = aom_highbd_sad64x128x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6349:9): [True: 1, False: 0]
  ------------------
 6350|      1|    aom_highbd_sad64x16 = aom_highbd_sad64x16_sse2;
 6351|      1|    if (flags & HAS_AVX2) aom_highbd_sad64x16 = aom_highbd_sad64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6351:9): [True: 1, False: 0]
  ------------------
 6352|      1|    aom_highbd_sad64x16_avg = aom_highbd_sad64x16_avg_sse2;
 6353|      1|    if (flags & HAS_AVX2) aom_highbd_sad64x16_avg = aom_highbd_sad64x16_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6353:9): [True: 1, False: 0]
  ------------------
 6354|      1|    aom_highbd_sad64x16x3d = aom_highbd_sad64x16x3d_c;
 6355|      1|    if (flags & HAS_AVX2) aom_highbd_sad64x16x3d = aom_highbd_sad64x16x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6355:9): [True: 1, False: 0]
  ------------------
 6356|      1|    aom_highbd_sad64x16x4d = aom_highbd_sad64x16x4d_sse2;
 6357|      1|    if (flags & HAS_AVX2) aom_highbd_sad64x16x4d = aom_highbd_sad64x16x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6357:9): [True: 1, False: 0]
  ------------------
 6358|      1|    aom_highbd_sad64x32 = aom_highbd_sad64x32_sse2;
 6359|      1|    if (flags & HAS_AVX2) aom_highbd_sad64x32 = aom_highbd_sad64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6359:9): [True: 1, False: 0]
  ------------------
 6360|      1|    aom_highbd_sad64x32_avg = aom_highbd_sad64x32_avg_sse2;
 6361|      1|    if (flags & HAS_AVX2) aom_highbd_sad64x32_avg = aom_highbd_sad64x32_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6361:9): [True: 1, False: 0]
  ------------------
 6362|      1|    aom_highbd_sad64x32x3d = aom_highbd_sad64x32x3d_c;
 6363|      1|    if (flags & HAS_AVX2) aom_highbd_sad64x32x3d = aom_highbd_sad64x32x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6363:9): [True: 1, False: 0]
  ------------------
 6364|      1|    aom_highbd_sad64x32x4d = aom_highbd_sad64x32x4d_sse2;
 6365|      1|    if (flags & HAS_AVX2) aom_highbd_sad64x32x4d = aom_highbd_sad64x32x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6365:9): [True: 1, False: 0]
  ------------------
 6366|      1|    aom_highbd_sad64x64 = aom_highbd_sad64x64_sse2;
 6367|      1|    if (flags & HAS_AVX2) aom_highbd_sad64x64 = aom_highbd_sad64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6367:9): [True: 1, False: 0]
  ------------------
 6368|      1|    aom_highbd_sad64x64_avg = aom_highbd_sad64x64_avg_sse2;
 6369|      1|    if (flags & HAS_AVX2) aom_highbd_sad64x64_avg = aom_highbd_sad64x64_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6369:9): [True: 1, False: 0]
  ------------------
 6370|      1|    aom_highbd_sad64x64x3d = aom_highbd_sad64x64x3d_c;
 6371|      1|    if (flags & HAS_AVX2) aom_highbd_sad64x64x3d = aom_highbd_sad64x64x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6371:9): [True: 1, False: 0]
  ------------------
 6372|      1|    aom_highbd_sad64x64x4d = aom_highbd_sad64x64x4d_sse2;
 6373|      1|    if (flags & HAS_AVX2) aom_highbd_sad64x64x4d = aom_highbd_sad64x64x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6373:9): [True: 1, False: 0]
  ------------------
 6374|      1|    aom_highbd_sad_skip_128x128 = aom_highbd_sad_skip_128x128_c;
 6375|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_128x128 = aom_highbd_sad_skip_128x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6375:9): [True: 1, False: 0]
  ------------------
 6376|      1|    aom_highbd_sad_skip_128x128x4d = aom_highbd_sad_skip_128x128x4d_c;
 6377|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_128x128x4d = aom_highbd_sad_skip_128x128x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6377:9): [True: 1, False: 0]
  ------------------
 6378|      1|    aom_highbd_sad_skip_128x64 = aom_highbd_sad_skip_128x64_c;
 6379|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_128x64 = aom_highbd_sad_skip_128x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6379:9): [True: 1, False: 0]
  ------------------
 6380|      1|    aom_highbd_sad_skip_128x64x4d = aom_highbd_sad_skip_128x64x4d_c;
 6381|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_128x64x4d = aom_highbd_sad_skip_128x64x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6381:9): [True: 1, False: 0]
  ------------------
 6382|      1|    aom_highbd_sad_skip_16x16 = aom_highbd_sad_skip_16x16_sse2;
 6383|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_16x16 = aom_highbd_sad_skip_16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6383:9): [True: 1, False: 0]
  ------------------
 6384|      1|    aom_highbd_sad_skip_16x16x4d = aom_highbd_sad_skip_16x16x4d_sse2;
 6385|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_16x16x4d = aom_highbd_sad_skip_16x16x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6385:9): [True: 1, False: 0]
  ------------------
 6386|      1|    aom_highbd_sad_skip_16x32 = aom_highbd_sad_skip_16x32_sse2;
 6387|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_16x32 = aom_highbd_sad_skip_16x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6387:9): [True: 1, False: 0]
  ------------------
 6388|      1|    aom_highbd_sad_skip_16x32x4d = aom_highbd_sad_skip_16x32x4d_sse2;
 6389|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_16x32x4d = aom_highbd_sad_skip_16x32x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6389:9): [True: 1, False: 0]
  ------------------
 6390|      1|    aom_highbd_sad_skip_16x64 = aom_highbd_sad_skip_16x64_sse2;
 6391|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_16x64 = aom_highbd_sad_skip_16x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6391:9): [True: 1, False: 0]
  ------------------
 6392|      1|    aom_highbd_sad_skip_16x64x4d = aom_highbd_sad_skip_16x64x4d_sse2;
 6393|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_16x64x4d = aom_highbd_sad_skip_16x64x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6393:9): [True: 1, False: 0]
  ------------------
 6394|      1|    aom_highbd_sad_skip_32x16 = aom_highbd_sad_skip_32x16_sse2;
 6395|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_32x16 = aom_highbd_sad_skip_32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6395:9): [True: 1, False: 0]
  ------------------
 6396|      1|    aom_highbd_sad_skip_32x16x4d = aom_highbd_sad_skip_32x16x4d_sse2;
 6397|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_32x16x4d = aom_highbd_sad_skip_32x16x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6397:9): [True: 1, False: 0]
  ------------------
 6398|      1|    aom_highbd_sad_skip_32x32 = aom_highbd_sad_skip_32x32_sse2;
 6399|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_32x32 = aom_highbd_sad_skip_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6399:9): [True: 1, False: 0]
  ------------------
 6400|      1|    aom_highbd_sad_skip_32x32x4d = aom_highbd_sad_skip_32x32x4d_sse2;
 6401|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_32x32x4d = aom_highbd_sad_skip_32x32x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6401:9): [True: 1, False: 0]
  ------------------
 6402|      1|    aom_highbd_sad_skip_32x64 = aom_highbd_sad_skip_32x64_sse2;
 6403|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_32x64 = aom_highbd_sad_skip_32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6403:9): [True: 1, False: 0]
  ------------------
 6404|      1|    aom_highbd_sad_skip_32x64x4d = aom_highbd_sad_skip_32x64x4d_sse2;
 6405|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_32x64x4d = aom_highbd_sad_skip_32x64x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6405:9): [True: 1, False: 0]
  ------------------
 6406|      1|    aom_highbd_sad_skip_64x128 = aom_highbd_sad_skip_64x128_c;
 6407|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_64x128 = aom_highbd_sad_skip_64x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6407:9): [True: 1, False: 0]
  ------------------
 6408|      1|    aom_highbd_sad_skip_64x128x4d = aom_highbd_sad_skip_64x128x4d_c;
 6409|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_64x128x4d = aom_highbd_sad_skip_64x128x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6409:9): [True: 1, False: 0]
  ------------------
 6410|      1|    aom_highbd_sad_skip_64x16 = aom_highbd_sad_skip_64x16_sse2;
 6411|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_64x16 = aom_highbd_sad_skip_64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6411:9): [True: 1, False: 0]
  ------------------
 6412|      1|    aom_highbd_sad_skip_64x16x4d = aom_highbd_sad_skip_64x16x4d_sse2;
 6413|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_64x16x4d = aom_highbd_sad_skip_64x16x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6413:9): [True: 1, False: 0]
  ------------------
 6414|      1|    aom_highbd_sad_skip_64x32 = aom_highbd_sad_skip_64x32_sse2;
 6415|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_64x32 = aom_highbd_sad_skip_64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6415:9): [True: 1, False: 0]
  ------------------
 6416|      1|    aom_highbd_sad_skip_64x32x4d = aom_highbd_sad_skip_64x32x4d_sse2;
 6417|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_64x32x4d = aom_highbd_sad_skip_64x32x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6417:9): [True: 1, False: 0]
  ------------------
 6418|      1|    aom_highbd_sad_skip_64x64 = aom_highbd_sad_skip_64x64_sse2;
 6419|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_64x64 = aom_highbd_sad_skip_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6419:9): [True: 1, False: 0]
  ------------------
 6420|      1|    aom_highbd_sad_skip_64x64x4d = aom_highbd_sad_skip_64x64x4d_sse2;
 6421|      1|    if (flags & HAS_AVX2) aom_highbd_sad_skip_64x64x4d = aom_highbd_sad_skip_64x64x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6421:9): [True: 1, False: 0]
  ------------------
 6422|      1|    aom_highbd_sse = aom_highbd_sse_c;
 6423|      1|    if (flags & HAS_SSE4_1) aom_highbd_sse = aom_highbd_sse_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6423:9): [True: 1, False: 0]
  ------------------
 6424|      1|    if (flags & HAS_AVX2) aom_highbd_sse = aom_highbd_sse_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6424:9): [True: 1, False: 0]
  ------------------
 6425|      1|    aom_ifft16x16_float = aom_ifft16x16_float_sse2;
 6426|      1|    if (flags & HAS_AVX2) aom_ifft16x16_float = aom_ifft16x16_float_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6426:9): [True: 1, False: 0]
  ------------------
 6427|      1|    aom_ifft32x32_float = aom_ifft32x32_float_sse2;
 6428|      1|    if (flags & HAS_AVX2) aom_ifft32x32_float = aom_ifft32x32_float_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6428:9): [True: 1, False: 0]
  ------------------
 6429|      1|    aom_ifft8x8_float = aom_ifft8x8_float_sse2;
 6430|      1|    if (flags & HAS_AVX2) aom_ifft8x8_float = aom_ifft8x8_float_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6430:9): [True: 1, False: 0]
  ------------------
 6431|      1|    aom_int_pro_col = aom_int_pro_col_sse2;
 6432|      1|    if (flags & HAS_AVX2) aom_int_pro_col = aom_int_pro_col_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6432:9): [True: 1, False: 0]
  ------------------
 6433|      1|    aom_int_pro_row = aom_int_pro_row_sse2;
 6434|      1|    if (flags & HAS_AVX2) aom_int_pro_row = aom_int_pro_row_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6434:9): [True: 1, False: 0]
  ------------------
 6435|      1|    aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_c;
 6436|      1|    if (flags & HAS_SSE4_1) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6436:9): [True: 1, False: 0]
  ------------------
 6437|      1|    if (flags & HAS_AVX2) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6437:9): [True: 1, False: 0]
  ------------------
 6438|      1|    aom_lpf_horizontal_14_quad = aom_lpf_horizontal_14_quad_sse2;
 6439|      1|    if (flags & HAS_AVX2) aom_lpf_horizontal_14_quad = aom_lpf_horizontal_14_quad_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6439:9): [True: 1, False: 0]
  ------------------
 6440|      1|    aom_lpf_horizontal_6_quad = aom_lpf_horizontal_6_quad_sse2;
 6441|      1|    if (flags & HAS_AVX2) aom_lpf_horizontal_6_quad = aom_lpf_horizontal_6_quad_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6441:9): [True: 1, False: 0]
  ------------------
 6442|      1|    aom_lpf_horizontal_8_quad = aom_lpf_horizontal_8_quad_sse2;
 6443|      1|    if (flags & HAS_AVX2) aom_lpf_horizontal_8_quad = aom_lpf_horizontal_8_quad_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6443:9): [True: 1, False: 0]
  ------------------
 6444|      1|    aom_lpf_vertical_14_quad = aom_lpf_vertical_14_quad_sse2;
 6445|      1|    if (flags & HAS_AVX2) aom_lpf_vertical_14_quad = aom_lpf_vertical_14_quad_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6445:9): [True: 1, False: 0]
  ------------------
 6446|      1|    aom_masked_sad128x128 = aom_masked_sad128x128_c;
 6447|      1|    if (flags & HAS_SSSE3) aom_masked_sad128x128 = aom_masked_sad128x128_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6447:9): [True: 1, False: 0]
  ------------------
 6448|      1|    if (flags & HAS_AVX2) aom_masked_sad128x128 = aom_masked_sad128x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6448:9): [True: 1, False: 0]
  ------------------
 6449|      1|    aom_masked_sad128x64 = aom_masked_sad128x64_c;
 6450|      1|    if (flags & HAS_SSSE3) aom_masked_sad128x64 = aom_masked_sad128x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6450:9): [True: 1, False: 0]
  ------------------
 6451|      1|    if (flags & HAS_AVX2) aom_masked_sad128x64 = aom_masked_sad128x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6451:9): [True: 1, False: 0]
  ------------------
 6452|      1|    aom_masked_sad16x16 = aom_masked_sad16x16_c;
 6453|      1|    if (flags & HAS_SSSE3) aom_masked_sad16x16 = aom_masked_sad16x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6453:9): [True: 1, False: 0]
  ------------------
 6454|      1|    if (flags & HAS_AVX2) aom_masked_sad16x16 = aom_masked_sad16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6454:9): [True: 1, False: 0]
  ------------------
 6455|      1|    aom_masked_sad16x32 = aom_masked_sad16x32_c;
 6456|      1|    if (flags & HAS_SSSE3) aom_masked_sad16x32 = aom_masked_sad16x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6456:9): [True: 1, False: 0]
  ------------------
 6457|      1|    if (flags & HAS_AVX2) aom_masked_sad16x32 = aom_masked_sad16x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6457:9): [True: 1, False: 0]
  ------------------
 6458|      1|    aom_masked_sad16x4 = aom_masked_sad16x4_c;
 6459|      1|    if (flags & HAS_SSSE3) aom_masked_sad16x4 = aom_masked_sad16x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6459:9): [True: 1, False: 0]
  ------------------
 6460|      1|    if (flags & HAS_AVX2) aom_masked_sad16x4 = aom_masked_sad16x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6460:9): [True: 1, False: 0]
  ------------------
 6461|      1|    aom_masked_sad16x64 = aom_masked_sad16x64_c;
 6462|      1|    if (flags & HAS_SSSE3) aom_masked_sad16x64 = aom_masked_sad16x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6462:9): [True: 1, False: 0]
  ------------------
 6463|      1|    if (flags & HAS_AVX2) aom_masked_sad16x64 = aom_masked_sad16x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6463:9): [True: 1, False: 0]
  ------------------
 6464|      1|    aom_masked_sad16x8 = aom_masked_sad16x8_c;
 6465|      1|    if (flags & HAS_SSSE3) aom_masked_sad16x8 = aom_masked_sad16x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6465:9): [True: 1, False: 0]
  ------------------
 6466|      1|    if (flags & HAS_AVX2) aom_masked_sad16x8 = aom_masked_sad16x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6466:9): [True: 1, False: 0]
  ------------------
 6467|      1|    aom_masked_sad32x16 = aom_masked_sad32x16_c;
 6468|      1|    if (flags & HAS_SSSE3) aom_masked_sad32x16 = aom_masked_sad32x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6468:9): [True: 1, False: 0]
  ------------------
 6469|      1|    if (flags & HAS_AVX2) aom_masked_sad32x16 = aom_masked_sad32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6469:9): [True: 1, False: 0]
  ------------------
 6470|      1|    aom_masked_sad32x32 = aom_masked_sad32x32_c;
 6471|      1|    if (flags & HAS_SSSE3) aom_masked_sad32x32 = aom_masked_sad32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6471:9): [True: 1, False: 0]
  ------------------
 6472|      1|    if (flags & HAS_AVX2) aom_masked_sad32x32 = aom_masked_sad32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6472:9): [True: 1, False: 0]
  ------------------
 6473|      1|    aom_masked_sad32x64 = aom_masked_sad32x64_c;
 6474|      1|    if (flags & HAS_SSSE3) aom_masked_sad32x64 = aom_masked_sad32x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6474:9): [True: 1, False: 0]
  ------------------
 6475|      1|    if (flags & HAS_AVX2) aom_masked_sad32x64 = aom_masked_sad32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6475:9): [True: 1, False: 0]
  ------------------
 6476|      1|    aom_masked_sad32x8 = aom_masked_sad32x8_c;
 6477|      1|    if (flags & HAS_SSSE3) aom_masked_sad32x8 = aom_masked_sad32x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6477:9): [True: 1, False: 0]
  ------------------
 6478|      1|    if (flags & HAS_AVX2) aom_masked_sad32x8 = aom_masked_sad32x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6478:9): [True: 1, False: 0]
  ------------------
 6479|      1|    aom_masked_sad4x16 = aom_masked_sad4x16_c;
 6480|      1|    if (flags & HAS_SSSE3) aom_masked_sad4x16 = aom_masked_sad4x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6480:9): [True: 1, False: 0]
  ------------------
 6481|      1|    if (flags & HAS_AVX2) aom_masked_sad4x16 = aom_masked_sad4x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6481:9): [True: 1, False: 0]
  ------------------
 6482|      1|    aom_masked_sad4x4 = aom_masked_sad4x4_c;
 6483|      1|    if (flags & HAS_SSSE3) aom_masked_sad4x4 = aom_masked_sad4x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6483:9): [True: 1, False: 0]
  ------------------
 6484|      1|    if (flags & HAS_AVX2) aom_masked_sad4x4 = aom_masked_sad4x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6484:9): [True: 1, False: 0]
  ------------------
 6485|      1|    aom_masked_sad4x8 = aom_masked_sad4x8_c;
 6486|      1|    if (flags & HAS_SSSE3) aom_masked_sad4x8 = aom_masked_sad4x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6486:9): [True: 1, False: 0]
  ------------------
 6487|      1|    if (flags & HAS_AVX2) aom_masked_sad4x8 = aom_masked_sad4x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6487:9): [True: 1, False: 0]
  ------------------
 6488|      1|    aom_masked_sad64x128 = aom_masked_sad64x128_c;
 6489|      1|    if (flags & HAS_SSSE3) aom_masked_sad64x128 = aom_masked_sad64x128_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6489:9): [True: 1, False: 0]
  ------------------
 6490|      1|    if (flags & HAS_AVX2) aom_masked_sad64x128 = aom_masked_sad64x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6490:9): [True: 1, False: 0]
  ------------------
 6491|      1|    aom_masked_sad64x16 = aom_masked_sad64x16_c;
 6492|      1|    if (flags & HAS_SSSE3) aom_masked_sad64x16 = aom_masked_sad64x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6492:9): [True: 1, False: 0]
  ------------------
 6493|      1|    if (flags & HAS_AVX2) aom_masked_sad64x16 = aom_masked_sad64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6493:9): [True: 1, False: 0]
  ------------------
 6494|      1|    aom_masked_sad64x32 = aom_masked_sad64x32_c;
 6495|      1|    if (flags & HAS_SSSE3) aom_masked_sad64x32 = aom_masked_sad64x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6495:9): [True: 1, False: 0]
  ------------------
 6496|      1|    if (flags & HAS_AVX2) aom_masked_sad64x32 = aom_masked_sad64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6496:9): [True: 1, False: 0]
  ------------------
 6497|      1|    aom_masked_sad64x64 = aom_masked_sad64x64_c;
 6498|      1|    if (flags & HAS_SSSE3) aom_masked_sad64x64 = aom_masked_sad64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6498:9): [True: 1, False: 0]
  ------------------
 6499|      1|    if (flags & HAS_AVX2) aom_masked_sad64x64 = aom_masked_sad64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6499:9): [True: 1, False: 0]
  ------------------
 6500|      1|    aom_masked_sad8x16 = aom_masked_sad8x16_c;
 6501|      1|    if (flags & HAS_SSSE3) aom_masked_sad8x16 = aom_masked_sad8x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6501:9): [True: 1, False: 0]
  ------------------
 6502|      1|    if (flags & HAS_AVX2) aom_masked_sad8x16 = aom_masked_sad8x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6502:9): [True: 1, False: 0]
  ------------------
 6503|      1|    aom_masked_sad8x32 = aom_masked_sad8x32_c;
 6504|      1|    if (flags & HAS_SSSE3) aom_masked_sad8x32 = aom_masked_sad8x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6504:9): [True: 1, False: 0]
  ------------------
 6505|      1|    if (flags & HAS_AVX2) aom_masked_sad8x32 = aom_masked_sad8x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6505:9): [True: 1, False: 0]
  ------------------
 6506|      1|    aom_masked_sad8x4 = aom_masked_sad8x4_c;
 6507|      1|    if (flags & HAS_SSSE3) aom_masked_sad8x4 = aom_masked_sad8x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6507:9): [True: 1, False: 0]
  ------------------
 6508|      1|    if (flags & HAS_AVX2) aom_masked_sad8x4 = aom_masked_sad8x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6508:9): [True: 1, False: 0]
  ------------------
 6509|      1|    aom_masked_sad8x8 = aom_masked_sad8x8_c;
 6510|      1|    if (flags & HAS_SSSE3) aom_masked_sad8x8 = aom_masked_sad8x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6510:9): [True: 1, False: 0]
  ------------------
 6511|      1|    if (flags & HAS_AVX2) aom_masked_sad8x8 = aom_masked_sad8x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6511:9): [True: 1, False: 0]
  ------------------
 6512|      1|    aom_masked_sub_pixel_variance128x128 = aom_masked_sub_pixel_variance128x128_c;
 6513|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance128x128 = aom_masked_sub_pixel_variance128x128_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6513:9): [True: 1, False: 0]
  ------------------
 6514|      1|    aom_masked_sub_pixel_variance128x64 = aom_masked_sub_pixel_variance128x64_c;
 6515|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance128x64 = aom_masked_sub_pixel_variance128x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6515:9): [True: 1, False: 0]
  ------------------
 6516|      1|    aom_masked_sub_pixel_variance16x16 = aom_masked_sub_pixel_variance16x16_c;
 6517|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance16x16 = aom_masked_sub_pixel_variance16x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6517:9): [True: 1, False: 0]
  ------------------
 6518|      1|    aom_masked_sub_pixel_variance16x32 = aom_masked_sub_pixel_variance16x32_c;
 6519|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance16x32 = aom_masked_sub_pixel_variance16x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6519:9): [True: 1, False: 0]
  ------------------
 6520|      1|    aom_masked_sub_pixel_variance16x4 = aom_masked_sub_pixel_variance16x4_c;
 6521|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance16x4 = aom_masked_sub_pixel_variance16x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6521:9): [True: 1, False: 0]
  ------------------
 6522|      1|    aom_masked_sub_pixel_variance16x64 = aom_masked_sub_pixel_variance16x64_c;
 6523|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance16x64 = aom_masked_sub_pixel_variance16x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6523:9): [True: 1, False: 0]
  ------------------
 6524|      1|    aom_masked_sub_pixel_variance16x8 = aom_masked_sub_pixel_variance16x8_c;
 6525|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance16x8 = aom_masked_sub_pixel_variance16x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6525:9): [True: 1, False: 0]
  ------------------
 6526|      1|    aom_masked_sub_pixel_variance32x16 = aom_masked_sub_pixel_variance32x16_c;
 6527|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance32x16 = aom_masked_sub_pixel_variance32x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6527:9): [True: 1, False: 0]
  ------------------
 6528|      1|    aom_masked_sub_pixel_variance32x32 = aom_masked_sub_pixel_variance32x32_c;
 6529|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance32x32 = aom_masked_sub_pixel_variance32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6529:9): [True: 1, False: 0]
  ------------------
 6530|      1|    aom_masked_sub_pixel_variance32x64 = aom_masked_sub_pixel_variance32x64_c;
 6531|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance32x64 = aom_masked_sub_pixel_variance32x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6531:9): [True: 1, False: 0]
  ------------------
 6532|      1|    aom_masked_sub_pixel_variance32x8 = aom_masked_sub_pixel_variance32x8_c;
 6533|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance32x8 = aom_masked_sub_pixel_variance32x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6533:9): [True: 1, False: 0]
  ------------------
 6534|      1|    aom_masked_sub_pixel_variance4x16 = aom_masked_sub_pixel_variance4x16_c;
 6535|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance4x16 = aom_masked_sub_pixel_variance4x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6535:9): [True: 1, False: 0]
  ------------------
 6536|      1|    aom_masked_sub_pixel_variance4x4 = aom_masked_sub_pixel_variance4x4_c;
 6537|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance4x4 = aom_masked_sub_pixel_variance4x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6537:9): [True: 1, False: 0]
  ------------------
 6538|      1|    aom_masked_sub_pixel_variance4x8 = aom_masked_sub_pixel_variance4x8_c;
 6539|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance4x8 = aom_masked_sub_pixel_variance4x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6539:9): [True: 1, False: 0]
  ------------------
 6540|      1|    aom_masked_sub_pixel_variance64x128 = aom_masked_sub_pixel_variance64x128_c;
 6541|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance64x128 = aom_masked_sub_pixel_variance64x128_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6541:9): [True: 1, False: 0]
  ------------------
 6542|      1|    aom_masked_sub_pixel_variance64x16 = aom_masked_sub_pixel_variance64x16_c;
 6543|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance64x16 = aom_masked_sub_pixel_variance64x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6543:9): [True: 1, False: 0]
  ------------------
 6544|      1|    aom_masked_sub_pixel_variance64x32 = aom_masked_sub_pixel_variance64x32_c;
 6545|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance64x32 = aom_masked_sub_pixel_variance64x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6545:9): [True: 1, False: 0]
  ------------------
 6546|      1|    aom_masked_sub_pixel_variance64x64 = aom_masked_sub_pixel_variance64x64_c;
 6547|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance64x64 = aom_masked_sub_pixel_variance64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6547:9): [True: 1, False: 0]
  ------------------
 6548|      1|    aom_masked_sub_pixel_variance8x16 = aom_masked_sub_pixel_variance8x16_c;
 6549|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance8x16 = aom_masked_sub_pixel_variance8x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6549:9): [True: 1, False: 0]
  ------------------
 6550|      1|    aom_masked_sub_pixel_variance8x32 = aom_masked_sub_pixel_variance8x32_c;
 6551|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance8x32 = aom_masked_sub_pixel_variance8x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6551:9): [True: 1, False: 0]
  ------------------
 6552|      1|    aom_masked_sub_pixel_variance8x4 = aom_masked_sub_pixel_variance8x4_c;
 6553|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance8x4 = aom_masked_sub_pixel_variance8x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6553:9): [True: 1, False: 0]
  ------------------
 6554|      1|    aom_masked_sub_pixel_variance8x8 = aom_masked_sub_pixel_variance8x8_c;
 6555|      1|    if (flags & HAS_SSSE3) aom_masked_sub_pixel_variance8x8 = aom_masked_sub_pixel_variance8x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6555:9): [True: 1, False: 0]
  ------------------
 6556|      1|    aom_mse16x16 = aom_mse16x16_sse2;
 6557|      1|    if (flags & HAS_AVX2) aom_mse16x16 = aom_mse16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6557:9): [True: 1, False: 0]
  ------------------
 6558|      1|    aom_mse_16xh_16bit = aom_mse_16xh_16bit_sse2;
 6559|      1|    if (flags & HAS_AVX2) aom_mse_16xh_16bit = aom_mse_16xh_16bit_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6559:9): [True: 1, False: 0]
  ------------------
 6560|      1|    aom_mse_wxh_16bit = aom_mse_wxh_16bit_sse2;
 6561|      1|    if (flags & HAS_AVX2) aom_mse_wxh_16bit = aom_mse_wxh_16bit_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6561:9): [True: 1, False: 0]
  ------------------
 6562|      1|    aom_mse_wxh_16bit_highbd = aom_mse_wxh_16bit_highbd_sse2;
 6563|      1|    if (flags & HAS_AVX2) aom_mse_wxh_16bit_highbd = aom_mse_wxh_16bit_highbd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6563:9): [True: 1, False: 0]
  ------------------
 6564|      1|    aom_obmc_sad128x128 = aom_obmc_sad128x128_c;
 6565|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad128x128 = aom_obmc_sad128x128_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6565:9): [True: 1, False: 0]
  ------------------
 6566|      1|    if (flags & HAS_AVX2) aom_obmc_sad128x128 = aom_obmc_sad128x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6566:9): [True: 1, False: 0]
  ------------------
 6567|      1|    aom_obmc_sad128x64 = aom_obmc_sad128x64_c;
 6568|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad128x64 = aom_obmc_sad128x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6568:9): [True: 1, False: 0]
  ------------------
 6569|      1|    if (flags & HAS_AVX2) aom_obmc_sad128x64 = aom_obmc_sad128x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6569:9): [True: 1, False: 0]
  ------------------
 6570|      1|    aom_obmc_sad16x16 = aom_obmc_sad16x16_c;
 6571|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad16x16 = aom_obmc_sad16x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6571:9): [True: 1, False: 0]
  ------------------
 6572|      1|    if (flags & HAS_AVX2) aom_obmc_sad16x16 = aom_obmc_sad16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6572:9): [True: 1, False: 0]
  ------------------
 6573|      1|    aom_obmc_sad16x32 = aom_obmc_sad16x32_c;
 6574|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad16x32 = aom_obmc_sad16x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6574:9): [True: 1, False: 0]
  ------------------
 6575|      1|    if (flags & HAS_AVX2) aom_obmc_sad16x32 = aom_obmc_sad16x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6575:9): [True: 1, False: 0]
  ------------------
 6576|      1|    aom_obmc_sad16x4 = aom_obmc_sad16x4_c;
 6577|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad16x4 = aom_obmc_sad16x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6577:9): [True: 1, False: 0]
  ------------------
 6578|      1|    if (flags & HAS_AVX2) aom_obmc_sad16x4 = aom_obmc_sad16x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6578:9): [True: 1, False: 0]
  ------------------
 6579|      1|    aom_obmc_sad16x64 = aom_obmc_sad16x64_c;
 6580|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad16x64 = aom_obmc_sad16x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6580:9): [True: 1, False: 0]
  ------------------
 6581|      1|    if (flags & HAS_AVX2) aom_obmc_sad16x64 = aom_obmc_sad16x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6581:9): [True: 1, False: 0]
  ------------------
 6582|      1|    aom_obmc_sad16x8 = aom_obmc_sad16x8_c;
 6583|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad16x8 = aom_obmc_sad16x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6583:9): [True: 1, False: 0]
  ------------------
 6584|      1|    if (flags & HAS_AVX2) aom_obmc_sad16x8 = aom_obmc_sad16x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6584:9): [True: 1, False: 0]
  ------------------
 6585|      1|    aom_obmc_sad32x16 = aom_obmc_sad32x16_c;
 6586|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad32x16 = aom_obmc_sad32x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6586:9): [True: 1, False: 0]
  ------------------
 6587|      1|    if (flags & HAS_AVX2) aom_obmc_sad32x16 = aom_obmc_sad32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6587:9): [True: 1, False: 0]
  ------------------
 6588|      1|    aom_obmc_sad32x32 = aom_obmc_sad32x32_c;
 6589|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad32x32 = aom_obmc_sad32x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6589:9): [True: 1, False: 0]
  ------------------
 6590|      1|    if (flags & HAS_AVX2) aom_obmc_sad32x32 = aom_obmc_sad32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6590:9): [True: 1, False: 0]
  ------------------
 6591|      1|    aom_obmc_sad32x64 = aom_obmc_sad32x64_c;
 6592|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad32x64 = aom_obmc_sad32x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6592:9): [True: 1, False: 0]
  ------------------
 6593|      1|    if (flags & HAS_AVX2) aom_obmc_sad32x64 = aom_obmc_sad32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6593:9): [True: 1, False: 0]
  ------------------
 6594|      1|    aom_obmc_sad32x8 = aom_obmc_sad32x8_c;
 6595|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad32x8 = aom_obmc_sad32x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6595:9): [True: 1, False: 0]
  ------------------
 6596|      1|    if (flags & HAS_AVX2) aom_obmc_sad32x8 = aom_obmc_sad32x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6596:9): [True: 1, False: 0]
  ------------------
 6597|      1|    aom_obmc_sad4x16 = aom_obmc_sad4x16_c;
 6598|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad4x16 = aom_obmc_sad4x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6598:9): [True: 1, False: 0]
  ------------------
 6599|      1|    if (flags & HAS_AVX2) aom_obmc_sad4x16 = aom_obmc_sad4x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6599:9): [True: 1, False: 0]
  ------------------
 6600|      1|    aom_obmc_sad4x4 = aom_obmc_sad4x4_c;
 6601|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad4x4 = aom_obmc_sad4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6601:9): [True: 1, False: 0]
  ------------------
 6602|      1|    if (flags & HAS_AVX2) aom_obmc_sad4x4 = aom_obmc_sad4x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6602:9): [True: 1, False: 0]
  ------------------
 6603|      1|    aom_obmc_sad4x8 = aom_obmc_sad4x8_c;
 6604|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad4x8 = aom_obmc_sad4x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6604:9): [True: 1, False: 0]
  ------------------
 6605|      1|    if (flags & HAS_AVX2) aom_obmc_sad4x8 = aom_obmc_sad4x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6605:9): [True: 1, False: 0]
  ------------------
 6606|      1|    aom_obmc_sad64x128 = aom_obmc_sad64x128_c;
 6607|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad64x128 = aom_obmc_sad64x128_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6607:9): [True: 1, False: 0]
  ------------------
 6608|      1|    if (flags & HAS_AVX2) aom_obmc_sad64x128 = aom_obmc_sad64x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6608:9): [True: 1, False: 0]
  ------------------
 6609|      1|    aom_obmc_sad64x16 = aom_obmc_sad64x16_c;
 6610|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad64x16 = aom_obmc_sad64x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6610:9): [True: 1, False: 0]
  ------------------
 6611|      1|    if (flags & HAS_AVX2) aom_obmc_sad64x16 = aom_obmc_sad64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6611:9): [True: 1, False: 0]
  ------------------
 6612|      1|    aom_obmc_sad64x32 = aom_obmc_sad64x32_c;
 6613|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad64x32 = aom_obmc_sad64x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6613:9): [True: 1, False: 0]
  ------------------
 6614|      1|    if (flags & HAS_AVX2) aom_obmc_sad64x32 = aom_obmc_sad64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6614:9): [True: 1, False: 0]
  ------------------
 6615|      1|    aom_obmc_sad64x64 = aom_obmc_sad64x64_c;
 6616|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad64x64 = aom_obmc_sad64x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6616:9): [True: 1, False: 0]
  ------------------
 6617|      1|    if (flags & HAS_AVX2) aom_obmc_sad64x64 = aom_obmc_sad64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6617:9): [True: 1, False: 0]
  ------------------
 6618|      1|    aom_obmc_sad8x16 = aom_obmc_sad8x16_c;
 6619|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad8x16 = aom_obmc_sad8x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6619:9): [True: 1, False: 0]
  ------------------
 6620|      1|    if (flags & HAS_AVX2) aom_obmc_sad8x16 = aom_obmc_sad8x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6620:9): [True: 1, False: 0]
  ------------------
 6621|      1|    aom_obmc_sad8x32 = aom_obmc_sad8x32_c;
 6622|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad8x32 = aom_obmc_sad8x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6622:9): [True: 1, False: 0]
  ------------------
 6623|      1|    if (flags & HAS_AVX2) aom_obmc_sad8x32 = aom_obmc_sad8x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6623:9): [True: 1, False: 0]
  ------------------
 6624|      1|    aom_obmc_sad8x4 = aom_obmc_sad8x4_c;
 6625|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad8x4 = aom_obmc_sad8x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6625:9): [True: 1, False: 0]
  ------------------
 6626|      1|    if (flags & HAS_AVX2) aom_obmc_sad8x4 = aom_obmc_sad8x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6626:9): [True: 1, False: 0]
  ------------------
 6627|      1|    aom_obmc_sad8x8 = aom_obmc_sad8x8_c;
 6628|      1|    if (flags & HAS_SSE4_1) aom_obmc_sad8x8 = aom_obmc_sad8x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6628:9): [True: 1, False: 0]
  ------------------
 6629|      1|    if (flags & HAS_AVX2) aom_obmc_sad8x8 = aom_obmc_sad8x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6629:9): [True: 1, False: 0]
  ------------------
 6630|      1|    aom_obmc_sub_pixel_variance128x128 = aom_obmc_sub_pixel_variance128x128_c;
 6631|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance128x128 = aom_obmc_sub_pixel_variance128x128_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6631:9): [True: 1, False: 0]
  ------------------
 6632|      1|    aom_obmc_sub_pixel_variance128x64 = aom_obmc_sub_pixel_variance128x64_c;
 6633|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance128x64 = aom_obmc_sub_pixel_variance128x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6633:9): [True: 1, False: 0]
  ------------------
 6634|      1|    aom_obmc_sub_pixel_variance16x16 = aom_obmc_sub_pixel_variance16x16_c;
 6635|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance16x16 = aom_obmc_sub_pixel_variance16x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6635:9): [True: 1, False: 0]
  ------------------
 6636|      1|    aom_obmc_sub_pixel_variance16x32 = aom_obmc_sub_pixel_variance16x32_c;
 6637|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance16x32 = aom_obmc_sub_pixel_variance16x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6637:9): [True: 1, False: 0]
  ------------------
 6638|      1|    aom_obmc_sub_pixel_variance16x4 = aom_obmc_sub_pixel_variance16x4_c;
 6639|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance16x4 = aom_obmc_sub_pixel_variance16x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6639:9): [True: 1, False: 0]
  ------------------
 6640|      1|    aom_obmc_sub_pixel_variance16x64 = aom_obmc_sub_pixel_variance16x64_c;
 6641|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance16x64 = aom_obmc_sub_pixel_variance16x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6641:9): [True: 1, False: 0]
  ------------------
 6642|      1|    aom_obmc_sub_pixel_variance16x8 = aom_obmc_sub_pixel_variance16x8_c;
 6643|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance16x8 = aom_obmc_sub_pixel_variance16x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6643:9): [True: 1, False: 0]
  ------------------
 6644|      1|    aom_obmc_sub_pixel_variance32x16 = aom_obmc_sub_pixel_variance32x16_c;
 6645|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance32x16 = aom_obmc_sub_pixel_variance32x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6645:9): [True: 1, False: 0]
  ------------------
 6646|      1|    aom_obmc_sub_pixel_variance32x32 = aom_obmc_sub_pixel_variance32x32_c;
 6647|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance32x32 = aom_obmc_sub_pixel_variance32x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6647:9): [True: 1, False: 0]
  ------------------
 6648|      1|    aom_obmc_sub_pixel_variance32x64 = aom_obmc_sub_pixel_variance32x64_c;
 6649|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance32x64 = aom_obmc_sub_pixel_variance32x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6649:9): [True: 1, False: 0]
  ------------------
 6650|      1|    aom_obmc_sub_pixel_variance32x8 = aom_obmc_sub_pixel_variance32x8_c;
 6651|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance32x8 = aom_obmc_sub_pixel_variance32x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6651:9): [True: 1, False: 0]
  ------------------
 6652|      1|    aom_obmc_sub_pixel_variance4x16 = aom_obmc_sub_pixel_variance4x16_c;
 6653|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance4x16 = aom_obmc_sub_pixel_variance4x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6653:9): [True: 1, False: 0]
  ------------------
 6654|      1|    aom_obmc_sub_pixel_variance4x4 = aom_obmc_sub_pixel_variance4x4_c;
 6655|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance4x4 = aom_obmc_sub_pixel_variance4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6655:9): [True: 1, False: 0]
  ------------------
 6656|      1|    aom_obmc_sub_pixel_variance4x8 = aom_obmc_sub_pixel_variance4x8_c;
 6657|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance4x8 = aom_obmc_sub_pixel_variance4x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6657:9): [True: 1, False: 0]
  ------------------
 6658|      1|    aom_obmc_sub_pixel_variance64x128 = aom_obmc_sub_pixel_variance64x128_c;
 6659|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance64x128 = aom_obmc_sub_pixel_variance64x128_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6659:9): [True: 1, False: 0]
  ------------------
 6660|      1|    aom_obmc_sub_pixel_variance64x16 = aom_obmc_sub_pixel_variance64x16_c;
 6661|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance64x16 = aom_obmc_sub_pixel_variance64x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6661:9): [True: 1, False: 0]
  ------------------
 6662|      1|    aom_obmc_sub_pixel_variance64x32 = aom_obmc_sub_pixel_variance64x32_c;
 6663|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance64x32 = aom_obmc_sub_pixel_variance64x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6663:9): [True: 1, False: 0]
  ------------------
 6664|      1|    aom_obmc_sub_pixel_variance64x64 = aom_obmc_sub_pixel_variance64x64_c;
 6665|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance64x64 = aom_obmc_sub_pixel_variance64x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6665:9): [True: 1, False: 0]
  ------------------
 6666|      1|    aom_obmc_sub_pixel_variance8x16 = aom_obmc_sub_pixel_variance8x16_c;
 6667|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance8x16 = aom_obmc_sub_pixel_variance8x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6667:9): [True: 1, False: 0]
  ------------------
 6668|      1|    aom_obmc_sub_pixel_variance8x32 = aom_obmc_sub_pixel_variance8x32_c;
 6669|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance8x32 = aom_obmc_sub_pixel_variance8x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6669:9): [True: 1, False: 0]
  ------------------
 6670|      1|    aom_obmc_sub_pixel_variance8x4 = aom_obmc_sub_pixel_variance8x4_c;
 6671|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance8x4 = aom_obmc_sub_pixel_variance8x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6671:9): [True: 1, False: 0]
  ------------------
 6672|      1|    aom_obmc_sub_pixel_variance8x8 = aom_obmc_sub_pixel_variance8x8_c;
 6673|      1|    if (flags & HAS_SSE4_1) aom_obmc_sub_pixel_variance8x8 = aom_obmc_sub_pixel_variance8x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6673:9): [True: 1, False: 0]
  ------------------
 6674|      1|    aom_obmc_variance128x128 = aom_obmc_variance128x128_c;
 6675|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance128x128 = aom_obmc_variance128x128_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6675:9): [True: 1, False: 0]
  ------------------
 6676|      1|    if (flags & HAS_AVX2) aom_obmc_variance128x128 = aom_obmc_variance128x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6676:9): [True: 1, False: 0]
  ------------------
 6677|      1|    aom_obmc_variance128x64 = aom_obmc_variance128x64_c;
 6678|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance128x64 = aom_obmc_variance128x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6678:9): [True: 1, False: 0]
  ------------------
 6679|      1|    if (flags & HAS_AVX2) aom_obmc_variance128x64 = aom_obmc_variance128x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6679:9): [True: 1, False: 0]
  ------------------
 6680|      1|    aom_obmc_variance16x16 = aom_obmc_variance16x16_c;
 6681|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance16x16 = aom_obmc_variance16x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6681:9): [True: 1, False: 0]
  ------------------
 6682|      1|    if (flags & HAS_AVX2) aom_obmc_variance16x16 = aom_obmc_variance16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6682:9): [True: 1, False: 0]
  ------------------
 6683|      1|    aom_obmc_variance16x32 = aom_obmc_variance16x32_c;
 6684|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance16x32 = aom_obmc_variance16x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6684:9): [True: 1, False: 0]
  ------------------
 6685|      1|    if (flags & HAS_AVX2) aom_obmc_variance16x32 = aom_obmc_variance16x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6685:9): [True: 1, False: 0]
  ------------------
 6686|      1|    aom_obmc_variance16x4 = aom_obmc_variance16x4_c;
 6687|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance16x4 = aom_obmc_variance16x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6687:9): [True: 1, False: 0]
  ------------------
 6688|      1|    if (flags & HAS_AVX2) aom_obmc_variance16x4 = aom_obmc_variance16x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6688:9): [True: 1, False: 0]
  ------------------
 6689|      1|    aom_obmc_variance16x64 = aom_obmc_variance16x64_c;
 6690|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance16x64 = aom_obmc_variance16x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6690:9): [True: 1, False: 0]
  ------------------
 6691|      1|    if (flags & HAS_AVX2) aom_obmc_variance16x64 = aom_obmc_variance16x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6691:9): [True: 1, False: 0]
  ------------------
 6692|      1|    aom_obmc_variance16x8 = aom_obmc_variance16x8_c;
 6693|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance16x8 = aom_obmc_variance16x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6693:9): [True: 1, False: 0]
  ------------------
 6694|      1|    if (flags & HAS_AVX2) aom_obmc_variance16x8 = aom_obmc_variance16x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6694:9): [True: 1, False: 0]
  ------------------
 6695|      1|    aom_obmc_variance32x16 = aom_obmc_variance32x16_c;
 6696|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance32x16 = aom_obmc_variance32x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6696:9): [True: 1, False: 0]
  ------------------
 6697|      1|    if (flags & HAS_AVX2) aom_obmc_variance32x16 = aom_obmc_variance32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6697:9): [True: 1, False: 0]
  ------------------
 6698|      1|    aom_obmc_variance32x32 = aom_obmc_variance32x32_c;
 6699|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance32x32 = aom_obmc_variance32x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6699:9): [True: 1, False: 0]
  ------------------
 6700|      1|    if (flags & HAS_AVX2) aom_obmc_variance32x32 = aom_obmc_variance32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6700:9): [True: 1, False: 0]
  ------------------
 6701|      1|    aom_obmc_variance32x64 = aom_obmc_variance32x64_c;
 6702|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance32x64 = aom_obmc_variance32x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6702:9): [True: 1, False: 0]
  ------------------
 6703|      1|    if (flags & HAS_AVX2) aom_obmc_variance32x64 = aom_obmc_variance32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6703:9): [True: 1, False: 0]
  ------------------
 6704|      1|    aom_obmc_variance32x8 = aom_obmc_variance32x8_c;
 6705|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance32x8 = aom_obmc_variance32x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6705:9): [True: 1, False: 0]
  ------------------
 6706|      1|    if (flags & HAS_AVX2) aom_obmc_variance32x8 = aom_obmc_variance32x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6706:9): [True: 1, False: 0]
  ------------------
 6707|      1|    aom_obmc_variance4x16 = aom_obmc_variance4x16_c;
 6708|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance4x16 = aom_obmc_variance4x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6708:9): [True: 1, False: 0]
  ------------------
 6709|      1|    if (flags & HAS_AVX2) aom_obmc_variance4x16 = aom_obmc_variance4x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6709:9): [True: 1, False: 0]
  ------------------
 6710|      1|    aom_obmc_variance4x4 = aom_obmc_variance4x4_c;
 6711|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance4x4 = aom_obmc_variance4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6711:9): [True: 1, False: 0]
  ------------------
 6712|      1|    if (flags & HAS_AVX2) aom_obmc_variance4x4 = aom_obmc_variance4x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6712:9): [True: 1, False: 0]
  ------------------
 6713|      1|    aom_obmc_variance4x8 = aom_obmc_variance4x8_c;
 6714|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance4x8 = aom_obmc_variance4x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6714:9): [True: 1, False: 0]
  ------------------
 6715|      1|    if (flags & HAS_AVX2) aom_obmc_variance4x8 = aom_obmc_variance4x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6715:9): [True: 1, False: 0]
  ------------------
 6716|      1|    aom_obmc_variance64x128 = aom_obmc_variance64x128_c;
 6717|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance64x128 = aom_obmc_variance64x128_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6717:9): [True: 1, False: 0]
  ------------------
 6718|      1|    if (flags & HAS_AVX2) aom_obmc_variance64x128 = aom_obmc_variance64x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6718:9): [True: 1, False: 0]
  ------------------
 6719|      1|    aom_obmc_variance64x16 = aom_obmc_variance64x16_c;
 6720|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance64x16 = aom_obmc_variance64x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6720:9): [True: 1, False: 0]
  ------------------
 6721|      1|    if (flags & HAS_AVX2) aom_obmc_variance64x16 = aom_obmc_variance64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6721:9): [True: 1, False: 0]
  ------------------
 6722|      1|    aom_obmc_variance64x32 = aom_obmc_variance64x32_c;
 6723|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance64x32 = aom_obmc_variance64x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6723:9): [True: 1, False: 0]
  ------------------
 6724|      1|    if (flags & HAS_AVX2) aom_obmc_variance64x32 = aom_obmc_variance64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6724:9): [True: 1, False: 0]
  ------------------
 6725|      1|    aom_obmc_variance64x64 = aom_obmc_variance64x64_c;
 6726|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance64x64 = aom_obmc_variance64x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6726:9): [True: 1, False: 0]
  ------------------
 6727|      1|    if (flags & HAS_AVX2) aom_obmc_variance64x64 = aom_obmc_variance64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6727:9): [True: 1, False: 0]
  ------------------
 6728|      1|    aom_obmc_variance8x16 = aom_obmc_variance8x16_c;
 6729|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance8x16 = aom_obmc_variance8x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6729:9): [True: 1, False: 0]
  ------------------
 6730|      1|    if (flags & HAS_AVX2) aom_obmc_variance8x16 = aom_obmc_variance8x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6730:9): [True: 1, False: 0]
  ------------------
 6731|      1|    aom_obmc_variance8x32 = aom_obmc_variance8x32_c;
 6732|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance8x32 = aom_obmc_variance8x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6732:9): [True: 1, False: 0]
  ------------------
 6733|      1|    if (flags & HAS_AVX2) aom_obmc_variance8x32 = aom_obmc_variance8x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6733:9): [True: 1, False: 0]
  ------------------
 6734|      1|    aom_obmc_variance8x4 = aom_obmc_variance8x4_c;
 6735|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance8x4 = aom_obmc_variance8x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6735:9): [True: 1, False: 0]
  ------------------
 6736|      1|    if (flags & HAS_AVX2) aom_obmc_variance8x4 = aom_obmc_variance8x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6736:9): [True: 1, False: 0]
  ------------------
 6737|      1|    aom_obmc_variance8x8 = aom_obmc_variance8x8_c;
 6738|      1|    if (flags & HAS_SSE4_1) aom_obmc_variance8x8 = aom_obmc_variance8x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (6738:9): [True: 1, False: 0]
  ------------------
 6739|      1|    if (flags & HAS_AVX2) aom_obmc_variance8x8 = aom_obmc_variance8x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6739:9): [True: 1, False: 0]
  ------------------
 6740|      1|    aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_c;
 6741|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6741:9): [True: 1, False: 0]
  ------------------
 6742|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6742:9): [True: 1, False: 0]
  ------------------
 6743|      1|    aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_c;
 6744|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6744:9): [True: 1, False: 0]
  ------------------
 6745|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6745:9): [True: 1, False: 0]
  ------------------
 6746|      1|    aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_c;
 6747|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6747:9): [True: 1, False: 0]
  ------------------
 6748|      1|    aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_c;
 6749|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6749:9): [True: 1, False: 0]
  ------------------
 6750|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6750:9): [True: 1, False: 0]
  ------------------
 6751|      1|    aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_c;
 6752|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6752:9): [True: 1, False: 0]
  ------------------
 6753|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6753:9): [True: 1, False: 0]
  ------------------
 6754|      1|    aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_c;
 6755|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6755:9): [True: 1, False: 0]
  ------------------
 6756|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6756:9): [True: 1, False: 0]
  ------------------
 6757|      1|    aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_c;
 6758|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6758:9): [True: 1, False: 0]
  ------------------
 6759|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6759:9): [True: 1, False: 0]
  ------------------
 6760|      1|    aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_c;
 6761|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6761:9): [True: 1, False: 0]
  ------------------
 6762|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6762:9): [True: 1, False: 0]
  ------------------
 6763|      1|    aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_c;
 6764|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6764:9): [True: 1, False: 0]
  ------------------
 6765|      1|    aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_c;
 6766|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6766:9): [True: 1, False: 0]
  ------------------
 6767|      1|    aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_c;
 6768|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6768:9): [True: 1, False: 0]
  ------------------
 6769|      1|    aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_c;
 6770|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6770:9): [True: 1, False: 0]
  ------------------
 6771|      1|    aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_c;
 6772|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6772:9): [True: 1, False: 0]
  ------------------
 6773|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6773:9): [True: 1, False: 0]
  ------------------
 6774|      1|    aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_c;
 6775|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6775:9): [True: 1, False: 0]
  ------------------
 6776|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6776:9): [True: 1, False: 0]
  ------------------
 6777|      1|    aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_c;
 6778|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6778:9): [True: 1, False: 0]
  ------------------
 6779|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6779:9): [True: 1, False: 0]
  ------------------
 6780|      1|    aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_c;
 6781|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6781:9): [True: 1, False: 0]
  ------------------
 6782|      1|    aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_c;
 6783|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6783:9): [True: 1, False: 0]
  ------------------
 6784|      1|    aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_c;
 6785|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6785:9): [True: 1, False: 0]
  ------------------
 6786|      1|    aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_c;
 6787|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6787:9): [True: 1, False: 0]
  ------------------
 6788|      1|    aom_quantize_b = aom_quantize_b_sse2;
 6789|      1|    if (flags & HAS_SSSE3) aom_quantize_b = aom_quantize_b_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6789:9): [True: 1, False: 0]
  ------------------
 6790|      1|    if (flags & HAS_AVX) aom_quantize_b = aom_quantize_b_avx;
  ------------------
  |  |  167|      1|#define HAS_AVX 0x40
  ------------------
  |  Branch (6790:9): [True: 1, False: 0]
  ------------------
 6791|      1|    if (flags & HAS_AVX2) aom_quantize_b = aom_quantize_b_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6791:9): [True: 1, False: 0]
  ------------------
 6792|      1|    aom_quantize_b_32x32 = aom_quantize_b_32x32_c;
 6793|      1|    if (flags & HAS_SSSE3) aom_quantize_b_32x32 = aom_quantize_b_32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6793:9): [True: 1, False: 0]
  ------------------
 6794|      1|    if (flags & HAS_AVX) aom_quantize_b_32x32 = aom_quantize_b_32x32_avx;
  ------------------
  |  |  167|      1|#define HAS_AVX 0x40
  ------------------
  |  Branch (6794:9): [True: 1, False: 0]
  ------------------
 6795|      1|    if (flags & HAS_AVX2) aom_quantize_b_32x32 = aom_quantize_b_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6795:9): [True: 1, False: 0]
  ------------------
 6796|      1|    aom_quantize_b_64x64 = aom_quantize_b_64x64_c;
 6797|      1|    if (flags & HAS_SSSE3) aom_quantize_b_64x64 = aom_quantize_b_64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6797:9): [True: 1, False: 0]
  ------------------
 6798|      1|    if (flags & HAS_AVX2) aom_quantize_b_64x64 = aom_quantize_b_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6798:9): [True: 1, False: 0]
  ------------------
 6799|      1|    aom_quantize_b_adaptive = aom_quantize_b_adaptive_sse2;
 6800|      1|    if (flags & HAS_AVX2) aom_quantize_b_adaptive = aom_quantize_b_adaptive_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6800:9): [True: 1, False: 0]
  ------------------
 6801|      1|    aom_sad128x128 = aom_sad128x128_sse2;
 6802|      1|    if (flags & HAS_AVX2) aom_sad128x128 = aom_sad128x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6802:9): [True: 1, False: 0]
  ------------------
 6803|      1|    aom_sad128x128_avg = aom_sad128x128_avg_sse2;
 6804|      1|    if (flags & HAS_AVX2) aom_sad128x128_avg = aom_sad128x128_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6804:9): [True: 1, False: 0]
  ------------------
 6805|      1|    aom_sad128x128x3d = aom_sad128x128x3d_c;
 6806|      1|    if (flags & HAS_AVX2) aom_sad128x128x3d = aom_sad128x128x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6806:9): [True: 1, False: 0]
  ------------------
 6807|      1|    aom_sad128x128x4d = aom_sad128x128x4d_sse2;
 6808|      1|    if (flags & HAS_AVX2) aom_sad128x128x4d = aom_sad128x128x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6808:9): [True: 1, False: 0]
  ------------------
 6809|      1|    aom_sad128x64 = aom_sad128x64_sse2;
 6810|      1|    if (flags & HAS_AVX2) aom_sad128x64 = aom_sad128x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6810:9): [True: 1, False: 0]
  ------------------
 6811|      1|    aom_sad128x64_avg = aom_sad128x64_avg_sse2;
 6812|      1|    if (flags & HAS_AVX2) aom_sad128x64_avg = aom_sad128x64_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6812:9): [True: 1, False: 0]
  ------------------
 6813|      1|    aom_sad128x64x3d = aom_sad128x64x3d_c;
 6814|      1|    if (flags & HAS_AVX2) aom_sad128x64x3d = aom_sad128x64x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6814:9): [True: 1, False: 0]
  ------------------
 6815|      1|    aom_sad128x64x4d = aom_sad128x64x4d_sse2;
 6816|      1|    if (flags & HAS_AVX2) aom_sad128x64x4d = aom_sad128x64x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6816:9): [True: 1, False: 0]
  ------------------
 6817|      1|    aom_sad16x16x3d = aom_sad16x16x3d_c;
 6818|      1|    if (flags & HAS_AVX2) aom_sad16x16x3d = aom_sad16x16x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6818:9): [True: 1, False: 0]
  ------------------
 6819|      1|    aom_sad16x16x4d = aom_sad16x16x4d_sse2;
 6820|      1|    if (flags & HAS_AVX2) aom_sad16x16x4d = aom_sad16x16x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6820:9): [True: 1, False: 0]
  ------------------
 6821|      1|    aom_sad16x32x3d = aom_sad16x32x3d_c;
 6822|      1|    if (flags & HAS_AVX2) aom_sad16x32x3d = aom_sad16x32x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6822:9): [True: 1, False: 0]
  ------------------
 6823|      1|    aom_sad16x32x4d = aom_sad16x32x4d_sse2;
 6824|      1|    if (flags & HAS_AVX2) aom_sad16x32x4d = aom_sad16x32x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6824:9): [True: 1, False: 0]
  ------------------
 6825|      1|    aom_sad16x4x3d = aom_sad16x4x3d_c;
 6826|      1|    if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6826:9): [True: 1, False: 0]
  ------------------
 6827|      1|    aom_sad16x4x4d = aom_sad16x4x4d_sse2;
 6828|      1|    if (flags & HAS_AVX2) aom_sad16x4x4d = aom_sad16x4x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6828:9): [True: 1, False: 0]
  ------------------
 6829|      1|    aom_sad16x64x3d = aom_sad16x64x3d_c;
 6830|      1|    if (flags & HAS_AVX2) aom_sad16x64x3d = aom_sad16x64x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6830:9): [True: 1, False: 0]
  ------------------
 6831|      1|    aom_sad16x64x4d = aom_sad16x64x4d_sse2;
 6832|      1|    if (flags & HAS_AVX2) aom_sad16x64x4d = aom_sad16x64x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6832:9): [True: 1, False: 0]
  ------------------
 6833|      1|    aom_sad16x8x3d = aom_sad16x8x3d_c;
 6834|      1|    if (flags & HAS_AVX2) aom_sad16x8x3d = aom_sad16x8x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6834:9): [True: 1, False: 0]
  ------------------
 6835|      1|    aom_sad16x8x4d = aom_sad16x8x4d_sse2;
 6836|      1|    if (flags & HAS_AVX2) aom_sad16x8x4d = aom_sad16x8x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6836:9): [True: 1, False: 0]
  ------------------
 6837|      1|    aom_sad32x16 = aom_sad32x16_sse2;
 6838|      1|    if (flags & HAS_AVX2) aom_sad32x16 = aom_sad32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6838:9): [True: 1, False: 0]
  ------------------
 6839|      1|    aom_sad32x16_avg = aom_sad32x16_avg_sse2;
 6840|      1|    if (flags & HAS_AVX2) aom_sad32x16_avg = aom_sad32x16_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6840:9): [True: 1, False: 0]
  ------------------
 6841|      1|    aom_sad32x16x3d = aom_sad32x16x3d_c;
 6842|      1|    if (flags & HAS_AVX2) aom_sad32x16x3d = aom_sad32x16x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6842:9): [True: 1, False: 0]
  ------------------
 6843|      1|    aom_sad32x16x4d = aom_sad32x16x4d_sse2;
 6844|      1|    if (flags & HAS_AVX2) aom_sad32x16x4d = aom_sad32x16x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6844:9): [True: 1, False: 0]
  ------------------
 6845|      1|    aom_sad32x32 = aom_sad32x32_sse2;
 6846|      1|    if (flags & HAS_AVX2) aom_sad32x32 = aom_sad32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6846:9): [True: 1, False: 0]
  ------------------
 6847|      1|    aom_sad32x32_avg = aom_sad32x32_avg_sse2;
 6848|      1|    if (flags & HAS_AVX2) aom_sad32x32_avg = aom_sad32x32_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6848:9): [True: 1, False: 0]
  ------------------
 6849|      1|    aom_sad32x32x3d = aom_sad32x32x3d_c;
 6850|      1|    if (flags & HAS_AVX2) aom_sad32x32x3d = aom_sad32x32x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6850:9): [True: 1, False: 0]
  ------------------
 6851|      1|    aom_sad32x32x4d = aom_sad32x32x4d_sse2;
 6852|      1|    if (flags & HAS_AVX2) aom_sad32x32x4d = aom_sad32x32x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6852:9): [True: 1, False: 0]
  ------------------
 6853|      1|    aom_sad32x64 = aom_sad32x64_sse2;
 6854|      1|    if (flags & HAS_AVX2) aom_sad32x64 = aom_sad32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6854:9): [True: 1, False: 0]
  ------------------
 6855|      1|    aom_sad32x64_avg = aom_sad32x64_avg_sse2;
 6856|      1|    if (flags & HAS_AVX2) aom_sad32x64_avg = aom_sad32x64_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6856:9): [True: 1, False: 0]
  ------------------
 6857|      1|    aom_sad32x64x3d = aom_sad32x64x3d_c;
 6858|      1|    if (flags & HAS_AVX2) aom_sad32x64x3d = aom_sad32x64x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6858:9): [True: 1, False: 0]
  ------------------
 6859|      1|    aom_sad32x64x4d = aom_sad32x64x4d_sse2;
 6860|      1|    if (flags & HAS_AVX2) aom_sad32x64x4d = aom_sad32x64x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6860:9): [True: 1, False: 0]
  ------------------
 6861|      1|    aom_sad32x8x3d = aom_sad32x8x3d_c;
 6862|      1|    if (flags & HAS_AVX2) aom_sad32x8x3d = aom_sad32x8x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6862:9): [True: 1, False: 0]
  ------------------
 6863|      1|    aom_sad32x8x4d = aom_sad32x8x4d_sse2;
 6864|      1|    if (flags & HAS_AVX2) aom_sad32x8x4d = aom_sad32x8x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6864:9): [True: 1, False: 0]
  ------------------
 6865|      1|    aom_sad64x128 = aom_sad64x128_sse2;
 6866|      1|    if (flags & HAS_AVX2) aom_sad64x128 = aom_sad64x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6866:9): [True: 1, False: 0]
  ------------------
 6867|      1|    aom_sad64x128_avg = aom_sad64x128_avg_sse2;
 6868|      1|    if (flags & HAS_AVX2) aom_sad64x128_avg = aom_sad64x128_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6868:9): [True: 1, False: 0]
  ------------------
 6869|      1|    aom_sad64x128x3d = aom_sad64x128x3d_c;
 6870|      1|    if (flags & HAS_AVX2) aom_sad64x128x3d = aom_sad64x128x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6870:9): [True: 1, False: 0]
  ------------------
 6871|      1|    aom_sad64x128x4d = aom_sad64x128x4d_sse2;
 6872|      1|    if (flags & HAS_AVX2) aom_sad64x128x4d = aom_sad64x128x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6872:9): [True: 1, False: 0]
  ------------------
 6873|      1|    aom_sad64x16x3d = aom_sad64x16x3d_c;
 6874|      1|    if (flags & HAS_AVX2) aom_sad64x16x3d = aom_sad64x16x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6874:9): [True: 1, False: 0]
  ------------------
 6875|      1|    aom_sad64x16x4d = aom_sad64x16x4d_sse2;
 6876|      1|    if (flags & HAS_AVX2) aom_sad64x16x4d = aom_sad64x16x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6876:9): [True: 1, False: 0]
  ------------------
 6877|      1|    aom_sad64x32 = aom_sad64x32_sse2;
 6878|      1|    if (flags & HAS_AVX2) aom_sad64x32 = aom_sad64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6878:9): [True: 1, False: 0]
  ------------------
 6879|      1|    aom_sad64x32_avg = aom_sad64x32_avg_sse2;
 6880|      1|    if (flags & HAS_AVX2) aom_sad64x32_avg = aom_sad64x32_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6880:9): [True: 1, False: 0]
  ------------------
 6881|      1|    aom_sad64x32x3d = aom_sad64x32x3d_c;
 6882|      1|    if (flags & HAS_AVX2) aom_sad64x32x3d = aom_sad64x32x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6882:9): [True: 1, False: 0]
  ------------------
 6883|      1|    aom_sad64x32x4d = aom_sad64x32x4d_sse2;
 6884|      1|    if (flags & HAS_AVX2) aom_sad64x32x4d = aom_sad64x32x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6884:9): [True: 1, False: 0]
  ------------------
 6885|      1|    aom_sad64x64 = aom_sad64x64_sse2;
 6886|      1|    if (flags & HAS_AVX2) aom_sad64x64 = aom_sad64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6886:9): [True: 1, False: 0]
  ------------------
 6887|      1|    aom_sad64x64_avg = aom_sad64x64_avg_sse2;
 6888|      1|    if (flags & HAS_AVX2) aom_sad64x64_avg = aom_sad64x64_avg_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6888:9): [True: 1, False: 0]
  ------------------
 6889|      1|    aom_sad64x64x3d = aom_sad64x64x3d_c;
 6890|      1|    if (flags & HAS_AVX2) aom_sad64x64x3d = aom_sad64x64x3d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6890:9): [True: 1, False: 0]
  ------------------
 6891|      1|    aom_sad64x64x4d = aom_sad64x64x4d_sse2;
 6892|      1|    if (flags & HAS_AVX2) aom_sad64x64x4d = aom_sad64x64x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6892:9): [True: 1, False: 0]
  ------------------
 6893|      1|    aom_sad_skip_128x128 = aom_sad_skip_128x128_sse2;
 6894|      1|    if (flags & HAS_AVX2) aom_sad_skip_128x128 = aom_sad_skip_128x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6894:9): [True: 1, False: 0]
  ------------------
 6895|      1|    aom_sad_skip_128x128x4d = aom_sad_skip_128x128x4d_sse2;
 6896|      1|    if (flags & HAS_AVX2) aom_sad_skip_128x128x4d = aom_sad_skip_128x128x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6896:9): [True: 1, False: 0]
  ------------------
 6897|      1|    aom_sad_skip_128x64 = aom_sad_skip_128x64_sse2;
 6898|      1|    if (flags & HAS_AVX2) aom_sad_skip_128x64 = aom_sad_skip_128x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6898:9): [True: 1, False: 0]
  ------------------
 6899|      1|    aom_sad_skip_128x64x4d = aom_sad_skip_128x64x4d_sse2;
 6900|      1|    if (flags & HAS_AVX2) aom_sad_skip_128x64x4d = aom_sad_skip_128x64x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6900:9): [True: 1, False: 0]
  ------------------
 6901|      1|    aom_sad_skip_16x16x4d = aom_sad_skip_16x16x4d_sse2;
 6902|      1|    if (flags & HAS_AVX2) aom_sad_skip_16x16x4d = aom_sad_skip_16x16x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6902:9): [True: 1, False: 0]
  ------------------
 6903|      1|    aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_sse2;
 6904|      1|    if (flags & HAS_AVX2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6904:9): [True: 1, False: 0]
  ------------------
 6905|      1|    aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_sse2;
 6906|      1|    if (flags & HAS_AVX2) aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6906:9): [True: 1, False: 0]
  ------------------
 6907|      1|    aom_sad_skip_32x16 = aom_sad_skip_32x16_sse2;
 6908|      1|    if (flags & HAS_AVX2) aom_sad_skip_32x16 = aom_sad_skip_32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6908:9): [True: 1, False: 0]
  ------------------
 6909|      1|    aom_sad_skip_32x16x4d = aom_sad_skip_32x16x4d_sse2;
 6910|      1|    if (flags & HAS_AVX2) aom_sad_skip_32x16x4d = aom_sad_skip_32x16x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6910:9): [True: 1, False: 0]
  ------------------
 6911|      1|    aom_sad_skip_32x32 = aom_sad_skip_32x32_sse2;
 6912|      1|    if (flags & HAS_AVX2) aom_sad_skip_32x32 = aom_sad_skip_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6912:9): [True: 1, False: 0]
  ------------------
 6913|      1|    aom_sad_skip_32x32x4d = aom_sad_skip_32x32x4d_sse2;
 6914|      1|    if (flags & HAS_AVX2) aom_sad_skip_32x32x4d = aom_sad_skip_32x32x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6914:9): [True: 1, False: 0]
  ------------------
 6915|      1|    aom_sad_skip_32x64 = aom_sad_skip_32x64_sse2;
 6916|      1|    if (flags & HAS_AVX2) aom_sad_skip_32x64 = aom_sad_skip_32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6916:9): [True: 1, False: 0]
  ------------------
 6917|      1|    aom_sad_skip_32x64x4d = aom_sad_skip_32x64x4d_sse2;
 6918|      1|    if (flags & HAS_AVX2) aom_sad_skip_32x64x4d = aom_sad_skip_32x64x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6918:9): [True: 1, False: 0]
  ------------------
 6919|      1|    aom_sad_skip_64x128 = aom_sad_skip_64x128_sse2;
 6920|      1|    if (flags & HAS_AVX2) aom_sad_skip_64x128 = aom_sad_skip_64x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6920:9): [True: 1, False: 0]
  ------------------
 6921|      1|    aom_sad_skip_64x128x4d = aom_sad_skip_64x128x4d_sse2;
 6922|      1|    if (flags & HAS_AVX2) aom_sad_skip_64x128x4d = aom_sad_skip_64x128x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6922:9): [True: 1, False: 0]
  ------------------
 6923|      1|    aom_sad_skip_64x16x4d = aom_sad_skip_64x16x4d_sse2;
 6924|      1|    if (flags & HAS_AVX2) aom_sad_skip_64x16x4d = aom_sad_skip_64x16x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6924:9): [True: 1, False: 0]
  ------------------
 6925|      1|    aom_sad_skip_64x32 = aom_sad_skip_64x32_sse2;
 6926|      1|    if (flags & HAS_AVX2) aom_sad_skip_64x32 = aom_sad_skip_64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6926:9): [True: 1, False: 0]
  ------------------
 6927|      1|    aom_sad_skip_64x32x4d = aom_sad_skip_64x32x4d_sse2;
 6928|      1|    if (flags & HAS_AVX2) aom_sad_skip_64x32x4d = aom_sad_skip_64x32x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6928:9): [True: 1, False: 0]
  ------------------
 6929|      1|    aom_sad_skip_64x64 = aom_sad_skip_64x64_sse2;
 6930|      1|    if (flags & HAS_AVX2) aom_sad_skip_64x64 = aom_sad_skip_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6930:9): [True: 1, False: 0]
  ------------------
 6931|      1|    aom_sad_skip_64x64x4d = aom_sad_skip_64x64x4d_sse2;
 6932|      1|    if (flags & HAS_AVX2) aom_sad_skip_64x64x4d = aom_sad_skip_64x64x4d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6932:9): [True: 1, False: 0]
  ------------------
 6933|      1|    aom_satd = aom_satd_sse2;
 6934|      1|    if (flags & HAS_AVX2) aom_satd = aom_satd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6934:9): [True: 1, False: 0]
  ------------------
 6935|      1|    aom_satd_lp = aom_satd_lp_sse2;
 6936|      1|    if (flags & HAS_AVX2) aom_satd_lp = aom_satd_lp_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (6936:9): [True: 1, False: 0]
  ------------------
 6937|      1|    aom_scaled_2d = aom_scaled_2d_c;
 6938|      1|    if (flags & HAS_SSSE3) aom_scaled_2d = aom_scaled_2d_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6938:9): [True: 1, False: 0]
  ------------------
 6939|      1|    aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_c;
 6940|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6940:9): [True: 1, False: 0]
  ------------------
 6941|      1|    aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_c;
 6942|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6942:9): [True: 1, False: 0]
  ------------------
 6943|      1|    aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_c;
 6944|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6944:9): [True: 1, False: 0]
  ------------------
 6945|      1|    aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_c;
 6946|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6946:9): [True: 1, False: 0]
  ------------------
 6947|      1|    aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_c;
 6948|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6948:9): [True: 1, False: 0]
  ------------------
 6949|      1|    aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_c;
 6950|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6950:9): [True: 1, False: 0]
  ------------------
 6951|      1|    aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_c;
 6952|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6952:9): [True: 1, False: 0]
  ------------------
 6953|      1|    aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_c;
 6954|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6954:9): [True: 1, False: 0]
  ------------------
 6955|      1|    aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_c;
 6956|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6956:9): [True: 1, False: 0]
  ------------------
 6957|      1|    aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_c;
 6958|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6958:9): [True: 1, False: 0]
  ------------------
 6959|      1|    aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_c;
 6960|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6960:9): [True: 1, False: 0]
  ------------------
 6961|      1|    aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_c;
 6962|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6962:9): [True: 1, False: 0]
  ------------------
 6963|      1|    aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_c;
 6964|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6964:9): [True: 1, False: 0]
  ------------------
 6965|      1|    aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_c;
 6966|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6966:9): [True: 1, False: 0]
  ------------------
 6967|      1|    aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_c;
 6968|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6968:9): [True: 1, False: 0]
  ------------------
 6969|      1|    aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_c;
 6970|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6970:9): [True: 1, False: 0]
  ------------------
 6971|      1|    aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_c;
 6972|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6972:9): [True: 1, False: 0]
  ------------------
 6973|      1|    aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_c;
 6974|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6974:9): [True: 1, False: 0]
  ------------------
 6975|      1|    aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_c;
 6976|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6976:9): [True: 1, False: 0]
  ------------------
 6977|      1|    aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_c;
 6978|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6978:9): [True: 1, False: 0]
  ------------------
 6979|      1|    aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_c;
 6980|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6980:9): [True: 1, False: 0]
  ------------------
 6981|      1|    aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_c;
 6982|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6982:9): [True: 1, False: 0]
  ------------------
 6983|      1|    aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_c;
 6984|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6984:9): [True: 1, False: 0]
  ------------------
 6985|      1|    aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_c;
 6986|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6986:9): [True: 1, False: 0]
  ------------------
 6987|      1|    aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_c;
 6988|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6988:9): [True: 1, False: 0]
  ------------------
 6989|      1|    aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_c;
 6990|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6990:9): [True: 1, False: 0]
  ------------------
 6991|      1|    aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_c;
 6992|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6992:9): [True: 1, False: 0]
  ------------------
 6993|      1|    aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_c;
 6994|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6994:9): [True: 1, False: 0]
  ------------------
 6995|      1|    aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_c;
 6996|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6996:9): [True: 1, False: 0]
  ------------------
 6997|      1|    aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_c;
 6998|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (6998:9): [True: 1, False: 0]
  ------------------
 6999|      1|    aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_c;
 7000|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7000:9): [True: 1, False: 0]
  ------------------
 7001|      1|    aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_c;
 7002|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7002:9): [True: 1, False: 0]
  ------------------
 7003|      1|    aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_c;
 7004|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7004:9): [True: 1, False: 0]
  ------------------
 7005|      1|    aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_c;
 7006|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7006:9): [True: 1, False: 0]
  ------------------
 7007|      1|    aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_c;
 7008|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7008:9): [True: 1, False: 0]
  ------------------
 7009|      1|    aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_c;
 7010|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7010:9): [True: 1, False: 0]
  ------------------
 7011|      1|    aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_c;
 7012|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7012:9): [True: 1, False: 0]
  ------------------
 7013|      1|    aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_c;
 7014|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7014:9): [True: 1, False: 0]
  ------------------
 7015|      1|    aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_c;
 7016|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7016:9): [True: 1, False: 0]
  ------------------
 7017|      1|    aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_c;
 7018|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7018:9): [True: 1, False: 0]
  ------------------
 7019|      1|    aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_c;
 7020|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7020:9): [True: 1, False: 0]
  ------------------
 7021|      1|    aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_c;
 7022|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7022:9): [True: 1, False: 0]
  ------------------
 7023|      1|    aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_c;
 7024|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7024:9): [True: 1, False: 0]
  ------------------
 7025|      1|    aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_c;
 7026|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7026:9): [True: 1, False: 0]
  ------------------
 7027|      1|    aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_c;
 7028|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7028:9): [True: 1, False: 0]
  ------------------
 7029|      1|    aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_c;
 7030|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7030:9): [True: 1, False: 0]
  ------------------
 7031|      1|    aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_c;
 7032|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7032:9): [True: 1, False: 0]
  ------------------
 7033|      1|    aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_c;
 7034|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7034:9): [True: 1, False: 0]
  ------------------
 7035|      1|    aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_c;
 7036|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7036:9): [True: 1, False: 0]
  ------------------
 7037|      1|    aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_c;
 7038|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7038:9): [True: 1, False: 0]
  ------------------
 7039|      1|    aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_c;
 7040|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7040:9): [True: 1, False: 0]
  ------------------
 7041|      1|    aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_c;
 7042|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7042:9): [True: 1, False: 0]
  ------------------
 7043|      1|    aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_c;
 7044|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7044:9): [True: 1, False: 0]
  ------------------
 7045|      1|    aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_c;
 7046|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7046:9): [True: 1, False: 0]
  ------------------
 7047|      1|    aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_c;
 7048|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7048:9): [True: 1, False: 0]
  ------------------
 7049|      1|    aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_c;
 7050|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7050:9): [True: 1, False: 0]
  ------------------
 7051|      1|    aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_c;
 7052|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7052:9): [True: 1, False: 0]
  ------------------
 7053|      1|    aom_sse = aom_sse_c;
 7054|      1|    if (flags & HAS_SSE4_1) aom_sse = aom_sse_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (7054:9): [True: 1, False: 0]
  ------------------
 7055|      1|    if (flags & HAS_AVX2) aom_sse = aom_sse_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7055:9): [True: 1, False: 0]
  ------------------
 7056|      1|    aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_c;
 7057|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7057:9): [True: 1, False: 0]
  ------------------
 7058|      1|    if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7058:9): [True: 1, False: 0]
  ------------------
 7059|      1|    aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_c;
 7060|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7060:9): [True: 1, False: 0]
  ------------------
 7061|      1|    if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7061:9): [True: 1, False: 0]
  ------------------
 7062|      1|    aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_c;
 7063|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7063:9): [True: 1, False: 0]
  ------------------
 7064|      1|    aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_c;
 7065|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7065:9): [True: 1, False: 0]
  ------------------
 7066|      1|    aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_c;
 7067|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7067:9): [True: 1, False: 0]
  ------------------
 7068|      1|    aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_c;
 7069|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7069:9): [True: 1, False: 0]
  ------------------
 7070|      1|    aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_c;
 7071|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7071:9): [True: 1, False: 0]
  ------------------
 7072|      1|    aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_c;
 7073|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7073:9): [True: 1, False: 0]
  ------------------
 7074|      1|    if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7074:9): [True: 1, False: 0]
  ------------------
 7075|      1|    aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_c;
 7076|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7076:9): [True: 1, False: 0]
  ------------------
 7077|      1|    if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7077:9): [True: 1, False: 0]
  ------------------
 7078|      1|    aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_c;
 7079|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7079:9): [True: 1, False: 0]
  ------------------
 7080|      1|    if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7080:9): [True: 1, False: 0]
  ------------------
 7081|      1|    aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_c;
 7082|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7082:9): [True: 1, False: 0]
  ------------------
 7083|      1|    aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_c;
 7084|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7084:9): [True: 1, False: 0]
  ------------------
 7085|      1|    aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c;
 7086|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7086:9): [True: 1, False: 0]
  ------------------
 7087|      1|    aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_c;
 7088|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7088:9): [True: 1, False: 0]
  ------------------
 7089|      1|    aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_c;
 7090|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7090:9): [True: 1, False: 0]
  ------------------
 7091|      1|    if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7091:9): [True: 1, False: 0]
  ------------------
 7092|      1|    aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_c;
 7093|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7093:9): [True: 1, False: 0]
  ------------------
 7094|      1|    aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_c;
 7095|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7095:9): [True: 1, False: 0]
  ------------------
 7096|      1|    if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7096:9): [True: 1, False: 0]
  ------------------
 7097|      1|    aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_c;
 7098|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7098:9): [True: 1, False: 0]
  ------------------
 7099|      1|    if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7099:9): [True: 1, False: 0]
  ------------------
 7100|      1|    aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_c;
 7101|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7101:9): [True: 1, False: 0]
  ------------------
 7102|      1|    aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_c;
 7103|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7103:9): [True: 1, False: 0]
  ------------------
 7104|      1|    aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_c;
 7105|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7105:9): [True: 1, False: 0]
  ------------------
 7106|      1|    aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_c;
 7107|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7107:9): [True: 1, False: 0]
  ------------------
 7108|      1|    aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_c;
 7109|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7109:9): [True: 1, False: 0]
  ------------------
 7110|      1|    if (flags & HAS_AVX2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7110:9): [True: 1, False: 0]
  ------------------
 7111|      1|    aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_c;
 7112|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7112:9): [True: 1, False: 0]
  ------------------
 7113|      1|    if (flags & HAS_AVX2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7113:9): [True: 1, False: 0]
  ------------------
 7114|      1|    aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c;
 7115|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7115:9): [True: 1, False: 0]
  ------------------
 7116|      1|    if (flags & HAS_AVX2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7116:9): [True: 1, False: 0]
  ------------------
 7117|      1|    aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_c;
 7118|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7118:9): [True: 1, False: 0]
  ------------------
 7119|      1|    if (flags & HAS_AVX2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7119:9): [True: 1, False: 0]
  ------------------
 7120|      1|    aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_c;
 7121|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7121:9): [True: 1, False: 0]
  ------------------
 7122|      1|    if (flags & HAS_AVX2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7122:9): [True: 1, False: 0]
  ------------------
 7123|      1|    aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_c;
 7124|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7124:9): [True: 1, False: 0]
  ------------------
 7125|      1|    if (flags & HAS_AVX2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7125:9): [True: 1, False: 0]
  ------------------
 7126|      1|    aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_c;
 7127|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7127:9): [True: 1, False: 0]
  ------------------
 7128|      1|    if (flags & HAS_AVX2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7128:9): [True: 1, False: 0]
  ------------------
 7129|      1|    aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_c;
 7130|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7130:9): [True: 1, False: 0]
  ------------------
 7131|      1|    if (flags & HAS_AVX2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7131:9): [True: 1, False: 0]
  ------------------
 7132|      1|    aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_c;
 7133|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7133:9): [True: 1, False: 0]
  ------------------
 7134|      1|    if (flags & HAS_AVX2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7134:9): [True: 1, False: 0]
  ------------------
 7135|      1|    aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_c;
 7136|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7136:9): [True: 1, False: 0]
  ------------------
 7137|      1|    if (flags & HAS_AVX2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7137:9): [True: 1, False: 0]
  ------------------
 7138|      1|    aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_c;
 7139|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7139:9): [True: 1, False: 0]
  ------------------
 7140|      1|    aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_c;
 7141|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7141:9): [True: 1, False: 0]
  ------------------
 7142|      1|    aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_c;
 7143|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7143:9): [True: 1, False: 0]
  ------------------
 7144|      1|    aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_c;
 7145|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7145:9): [True: 1, False: 0]
  ------------------
 7146|      1|    aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_c;
 7147|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7147:9): [True: 1, False: 0]
  ------------------
 7148|      1|    if (flags & HAS_AVX2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7148:9): [True: 1, False: 0]
  ------------------
 7149|      1|    aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_c;
 7150|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7150:9): [True: 1, False: 0]
  ------------------
 7151|      1|    aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_c;
 7152|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7152:9): [True: 1, False: 0]
  ------------------
 7153|      1|    if (flags & HAS_AVX2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7153:9): [True: 1, False: 0]
  ------------------
 7154|      1|    aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_c;
 7155|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7155:9): [True: 1, False: 0]
  ------------------
 7156|      1|    if (flags & HAS_AVX2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7156:9): [True: 1, False: 0]
  ------------------
 7157|      1|    aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_c;
 7158|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7158:9): [True: 1, False: 0]
  ------------------
 7159|      1|    aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_c;
 7160|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7160:9): [True: 1, False: 0]
  ------------------
 7161|      1|    aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_c;
 7162|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7162:9): [True: 1, False: 0]
  ------------------
 7163|      1|    aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_c;
 7164|      1|    if (flags & HAS_SSSE3) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (7164:9): [True: 1, False: 0]
  ------------------
 7165|      1|    aom_subtract_block = aom_subtract_block_sse2;
 7166|      1|    if (flags & HAS_AVX2) aom_subtract_block = aom_subtract_block_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7166:9): [True: 1, False: 0]
  ------------------
 7167|      1|    aom_sum_squares_2d_i16 = aom_sum_squares_2d_i16_sse2;
 7168|      1|    if (flags & HAS_AVX2) aom_sum_squares_2d_i16 = aom_sum_squares_2d_i16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7168:9): [True: 1, False: 0]
  ------------------
 7169|      1|    aom_sum_sse_2d_i16 = aom_sum_sse_2d_i16_sse2;
 7170|      1|    if (flags & HAS_AVX2) aom_sum_sse_2d_i16 = aom_sum_sse_2d_i16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7170:9): [True: 1, False: 0]
  ------------------
 7171|      1|    aom_v_predictor_32x16 = aom_v_predictor_32x16_sse2;
 7172|      1|    if (flags & HAS_AVX2) aom_v_predictor_32x16 = aom_v_predictor_32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7172:9): [True: 1, False: 0]
  ------------------
 7173|      1|    aom_v_predictor_32x32 = aom_v_predictor_32x32_sse2;
 7174|      1|    if (flags & HAS_AVX2) aom_v_predictor_32x32 = aom_v_predictor_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7174:9): [True: 1, False: 0]
  ------------------
 7175|      1|    aom_v_predictor_32x64 = aom_v_predictor_32x64_sse2;
 7176|      1|    if (flags & HAS_AVX2) aom_v_predictor_32x64 = aom_v_predictor_32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7176:9): [True: 1, False: 0]
  ------------------
 7177|      1|    aom_v_predictor_64x16 = aom_v_predictor_64x16_sse2;
 7178|      1|    if (flags & HAS_AVX2) aom_v_predictor_64x16 = aom_v_predictor_64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7178:9): [True: 1, False: 0]
  ------------------
 7179|      1|    aom_v_predictor_64x32 = aom_v_predictor_64x32_sse2;
 7180|      1|    if (flags & HAS_AVX2) aom_v_predictor_64x32 = aom_v_predictor_64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7180:9): [True: 1, False: 0]
  ------------------
 7181|      1|    aom_v_predictor_64x64 = aom_v_predictor_64x64_sse2;
 7182|      1|    if (flags & HAS_AVX2) aom_v_predictor_64x64 = aom_v_predictor_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7182:9): [True: 1, False: 0]
  ------------------
 7183|      1|    aom_var_2d_u16 = aom_var_2d_u16_sse2;
 7184|      1|    if (flags & HAS_AVX2) aom_var_2d_u16 = aom_var_2d_u16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7184:9): [True: 1, False: 0]
  ------------------
 7185|      1|    aom_var_2d_u8 = aom_var_2d_u8_sse2;
 7186|      1|    if (flags & HAS_AVX2) aom_var_2d_u8 = aom_var_2d_u8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7186:9): [True: 1, False: 0]
  ------------------
 7187|      1|    aom_variance128x128 = aom_variance128x128_sse2;
 7188|      1|    if (flags & HAS_AVX2) aom_variance128x128 = aom_variance128x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7188:9): [True: 1, False: 0]
  ------------------
 7189|      1|    aom_variance128x64 = aom_variance128x64_sse2;
 7190|      1|    if (flags & HAS_AVX2) aom_variance128x64 = aom_variance128x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7190:9): [True: 1, False: 0]
  ------------------
 7191|      1|    aom_variance16x16 = aom_variance16x16_sse2;
 7192|      1|    if (flags & HAS_AVX2) aom_variance16x16 = aom_variance16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7192:9): [True: 1, False: 0]
  ------------------
 7193|      1|    aom_variance16x32 = aom_variance16x32_sse2;
 7194|      1|    if (flags & HAS_AVX2) aom_variance16x32 = aom_variance16x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7194:9): [True: 1, False: 0]
  ------------------
 7195|      1|    aom_variance16x4 = aom_variance16x4_sse2;
 7196|      1|    if (flags & HAS_AVX2) aom_variance16x4 = aom_variance16x4_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7196:9): [True: 1, False: 0]
  ------------------
 7197|      1|    aom_variance16x64 = aom_variance16x64_sse2;
 7198|      1|    if (flags & HAS_AVX2) aom_variance16x64 = aom_variance16x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7198:9): [True: 1, False: 0]
  ------------------
 7199|      1|    aom_variance16x8 = aom_variance16x8_sse2;
 7200|      1|    if (flags & HAS_AVX2) aom_variance16x8 = aom_variance16x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7200:9): [True: 1, False: 0]
  ------------------
 7201|      1|    aom_variance32x16 = aom_variance32x16_sse2;
 7202|      1|    if (flags & HAS_AVX2) aom_variance32x16 = aom_variance32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7202:9): [True: 1, False: 0]
  ------------------
 7203|      1|    aom_variance32x32 = aom_variance32x32_sse2;
 7204|      1|    if (flags & HAS_AVX2) aom_variance32x32 = aom_variance32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7204:9): [True: 1, False: 0]
  ------------------
 7205|      1|    aom_variance32x64 = aom_variance32x64_sse2;
 7206|      1|    if (flags & HAS_AVX2) aom_variance32x64 = aom_variance32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7206:9): [True: 1, False: 0]
  ------------------
 7207|      1|    aom_variance32x8 = aom_variance32x8_sse2;
 7208|      1|    if (flags & HAS_AVX2) aom_variance32x8 = aom_variance32x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7208:9): [True: 1, False: 0]
  ------------------
 7209|      1|    aom_variance64x128 = aom_variance64x128_sse2;
 7210|      1|    if (flags & HAS_AVX2) aom_variance64x128 = aom_variance64x128_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7210:9): [True: 1, False: 0]
  ------------------
 7211|      1|    aom_variance64x16 = aom_variance64x16_sse2;
 7212|      1|    if (flags & HAS_AVX2) aom_variance64x16 = aom_variance64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7212:9): [True: 1, False: 0]
  ------------------
 7213|      1|    aom_variance64x32 = aom_variance64x32_sse2;
 7214|      1|    if (flags & HAS_AVX2) aom_variance64x32 = aom_variance64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7214:9): [True: 1, False: 0]
  ------------------
 7215|      1|    aom_variance64x64 = aom_variance64x64_sse2;
 7216|      1|    if (flags & HAS_AVX2) aom_variance64x64 = aom_variance64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7216:9): [True: 1, False: 0]
  ------------------
 7217|      1|    aom_vector_var = aom_vector_var_c;
 7218|      1|    if (flags & HAS_SSE4_1) aom_vector_var = aom_vector_var_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (7218:9): [True: 1, False: 0]
  ------------------
 7219|      1|    if (flags & HAS_AVX2) aom_vector_var = aom_vector_var_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (7219:9): [True: 1, False: 0]
  ------------------
 7220|      1|}

aom_scale_rtcd.c:setup_rtcd_internal:
   77|      1|{
   78|      1|    int flags = x86_simd_caps();
   79|       |
   80|      1|    (void)flags;
   81|       |
   82|      1|}

av1_rtcd.c:setup_rtcd_internal:
  806|      1|{
  807|      1|    int flags = x86_simd_caps();
  808|       |
  809|      1|    (void)flags;
  810|       |
  811|      1|    av1_apply_selfguided_restoration = av1_apply_selfguided_restoration_c;
  812|      1|    if (flags & HAS_SSE4_1) av1_apply_selfguided_restoration = av1_apply_selfguided_restoration_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (812:9): [True: 1, False: 0]
  ------------------
  813|      1|    if (flags & HAS_AVX2) av1_apply_selfguided_restoration = av1_apply_selfguided_restoration_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (813:9): [True: 1, False: 0]
  ------------------
  814|      1|    av1_apply_temporal_filter = av1_apply_temporal_filter_sse2;
  815|      1|    if (flags & HAS_AVX2) av1_apply_temporal_filter = av1_apply_temporal_filter_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (815:9): [True: 1, False: 0]
  ------------------
  816|      1|    av1_block_error = av1_block_error_sse2;
  817|      1|    if (flags & HAS_AVX2) av1_block_error = av1_block_error_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (817:9): [True: 1, False: 0]
  ------------------
  818|      1|    av1_block_error_lp = av1_block_error_lp_sse2;
  819|      1|    if (flags & HAS_AVX2) av1_block_error_lp = av1_block_error_lp_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (819:9): [True: 1, False: 0]
  ------------------
  820|      1|    av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_c;
  821|      1|    if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (821:9): [True: 1, False: 0]
  ------------------
  822|      1|    if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (822:9): [True: 1, False: 0]
  ------------------
  823|      1|    av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_c;
  824|      1|    if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (824:9): [True: 1, False: 0]
  ------------------
  825|      1|    if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (825:9): [True: 1, False: 0]
  ------------------
  826|      1|    av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_c;
  827|      1|    if (flags & HAS_SSSE3) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (827:9): [True: 1, False: 0]
  ------------------
  828|      1|    if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (828:9): [True: 1, False: 0]
  ------------------
  829|      1|    av1_calc_indices_dim1 = av1_calc_indices_dim1_sse2;
  830|      1|    if (flags & HAS_AVX2) av1_calc_indices_dim1 = av1_calc_indices_dim1_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (830:9): [True: 1, False: 0]
  ------------------
  831|      1|    av1_calc_indices_dim2 = av1_calc_indices_dim2_sse2;
  832|      1|    if (flags & HAS_AVX2) av1_calc_indices_dim2 = av1_calc_indices_dim2_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (832:9): [True: 1, False: 0]
  ------------------
  833|      1|    av1_calc_proj_params = av1_calc_proj_params_c;
  834|      1|    if (flags & HAS_SSE4_1) av1_calc_proj_params = av1_calc_proj_params_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (834:9): [True: 1, False: 0]
  ------------------
  835|      1|    if (flags & HAS_AVX2) av1_calc_proj_params = av1_calc_proj_params_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (835:9): [True: 1, False: 0]
  ------------------
  836|      1|    av1_calc_proj_params_high_bd = av1_calc_proj_params_high_bd_c;
  837|      1|    if (flags & HAS_SSE4_1) av1_calc_proj_params_high_bd = av1_calc_proj_params_high_bd_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (837:9): [True: 1, False: 0]
  ------------------
  838|      1|    if (flags & HAS_AVX2) av1_calc_proj_params_high_bd = av1_calc_proj_params_high_bd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (838:9): [True: 1, False: 0]
  ------------------
  839|      1|    av1_cnn_convolve_no_maxpool_padding_valid = av1_cnn_convolve_no_maxpool_padding_valid_c;
  840|      1|    if (flags & HAS_AVX2) av1_cnn_convolve_no_maxpool_padding_valid = av1_cnn_convolve_no_maxpool_padding_valid_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (840:9): [True: 1, False: 0]
  ------------------
  841|      1|    av1_compute_stats = av1_compute_stats_c;
  842|      1|    if (flags & HAS_SSE4_1) av1_compute_stats = av1_compute_stats_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (842:9): [True: 1, False: 0]
  ------------------
  843|      1|    if (flags & HAS_AVX2) av1_compute_stats = av1_compute_stats_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (843:9): [True: 1, False: 0]
  ------------------
  844|      1|    av1_compute_stats_highbd = av1_compute_stats_highbd_c;
  845|      1|    if (flags & HAS_SSE4_1) av1_compute_stats_highbd = av1_compute_stats_highbd_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (845:9): [True: 1, False: 0]
  ------------------
  846|      1|    if (flags & HAS_AVX2) av1_compute_stats_highbd = av1_compute_stats_highbd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (846:9): [True: 1, False: 0]
  ------------------
  847|      1|    av1_convolve_2d_scale = av1_convolve_2d_scale_c;
  848|      1|    if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (848:9): [True: 1, False: 0]
  ------------------
  849|      1|    av1_convolve_2d_sr = av1_convolve_2d_sr_sse2;
  850|      1|    if (flags & HAS_AVX2) av1_convolve_2d_sr = av1_convolve_2d_sr_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (850:9): [True: 1, False: 0]
  ------------------
  851|      1|    av1_convolve_horiz_rs = av1_convolve_horiz_rs_c;
  852|      1|    if (flags & HAS_SSE4_1) av1_convolve_horiz_rs = av1_convolve_horiz_rs_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (852:9): [True: 1, False: 0]
  ------------------
  853|      1|    av1_convolve_x_sr = av1_convolve_x_sr_sse2;
  854|      1|    if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (854:9): [True: 1, False: 0]
  ------------------
  855|      1|    av1_convolve_y_sr = av1_convolve_y_sr_sse2;
  856|      1|    if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (856:9): [True: 1, False: 0]
  ------------------
  857|      1|    av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c;
  858|      1|    if (flags & HAS_SSSE3) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (858:9): [True: 1, False: 0]
  ------------------
  859|      1|    if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (859:9): [True: 1, False: 0]
  ------------------
  860|      1|    av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_sse2;
  861|      1|    if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (861:9): [True: 1, False: 0]
  ------------------
  862|      1|    av1_dist_wtd_convolve_x = av1_dist_wtd_convolve_x_sse2;
  863|      1|    if (flags & HAS_AVX2) av1_dist_wtd_convolve_x = av1_dist_wtd_convolve_x_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (863:9): [True: 1, False: 0]
  ------------------
  864|      1|    av1_dist_wtd_convolve_y = av1_dist_wtd_convolve_y_sse2;
  865|      1|    if (flags & HAS_AVX2) av1_dist_wtd_convolve_y = av1_dist_wtd_convolve_y_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (865:9): [True: 1, False: 0]
  ------------------
  866|      1|    av1_dr_prediction_z1 = av1_dr_prediction_z1_c;
  867|      1|    if (flags & HAS_SSE4_1) av1_dr_prediction_z1 = av1_dr_prediction_z1_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (867:9): [True: 1, False: 0]
  ------------------
  868|      1|    if (flags & HAS_AVX2) av1_dr_prediction_z1 = av1_dr_prediction_z1_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (868:9): [True: 1, False: 0]
  ------------------
  869|      1|    av1_dr_prediction_z2 = av1_dr_prediction_z2_c;
  870|      1|    if (flags & HAS_SSE4_1) av1_dr_prediction_z2 = av1_dr_prediction_z2_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (870:9): [True: 1, False: 0]
  ------------------
  871|      1|    if (flags & HAS_AVX2) av1_dr_prediction_z2 = av1_dr_prediction_z2_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (871:9): [True: 1, False: 0]
  ------------------
  872|      1|    av1_dr_prediction_z3 = av1_dr_prediction_z3_c;
  873|      1|    if (flags & HAS_SSE4_1) av1_dr_prediction_z3 = av1_dr_prediction_z3_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (873:9): [True: 1, False: 0]
  ------------------
  874|      1|    if (flags & HAS_AVX2) av1_dr_prediction_z3 = av1_dr_prediction_z3_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (874:9): [True: 1, False: 0]
  ------------------
  875|      1|    av1_estimate_noise_from_single_plane = av1_estimate_noise_from_single_plane_c;
  876|      1|    if (flags & HAS_AVX2) av1_estimate_noise_from_single_plane = av1_estimate_noise_from_single_plane_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (876:9): [True: 1, False: 0]
  ------------------
  877|      1|    av1_filter_intra_edge = av1_filter_intra_edge_c;
  878|      1|    if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (878:9): [True: 1, False: 0]
  ------------------
  879|      1|    av1_filter_intra_predictor = av1_filter_intra_predictor_c;
  880|      1|    if (flags & HAS_SSE4_1) av1_filter_intra_predictor = av1_filter_intra_predictor_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (880:9): [True: 1, False: 0]
  ------------------
  881|      1|    av1_fwd_txfm2d_16x16 = av1_fwd_txfm2d_16x16_c;
  882|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_16x16 = av1_fwd_txfm2d_16x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (882:9): [True: 1, False: 0]
  ------------------
  883|      1|    if (flags & HAS_AVX2) av1_fwd_txfm2d_16x16 = av1_fwd_txfm2d_16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (883:9): [True: 1, False: 0]
  ------------------
  884|      1|    av1_fwd_txfm2d_16x32 = av1_fwd_txfm2d_16x32_c;
  885|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_16x32 = av1_fwd_txfm2d_16x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (885:9): [True: 1, False: 0]
  ------------------
  886|      1|    av1_fwd_txfm2d_16x4 = av1_fwd_txfm2d_16x4_c;
  887|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_16x4 = av1_fwd_txfm2d_16x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (887:9): [True: 1, False: 0]
  ------------------
  888|      1|    av1_fwd_txfm2d_16x64 = av1_fwd_txfm2d_16x64_c;
  889|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_16x64 = av1_fwd_txfm2d_16x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (889:9): [True: 1, False: 0]
  ------------------
  890|      1|    av1_fwd_txfm2d_16x8 = av1_fwd_txfm2d_16x8_c;
  891|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_16x8 = av1_fwd_txfm2d_16x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (891:9): [True: 1, False: 0]
  ------------------
  892|      1|    if (flags & HAS_AVX2) av1_fwd_txfm2d_16x8 = av1_fwd_txfm2d_16x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (892:9): [True: 1, False: 0]
  ------------------
  893|      1|    av1_fwd_txfm2d_32x16 = av1_fwd_txfm2d_32x16_c;
  894|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_32x16 = av1_fwd_txfm2d_32x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (894:9): [True: 1, False: 0]
  ------------------
  895|      1|    av1_fwd_txfm2d_32x32 = av1_fwd_txfm2d_32x32_c;
  896|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_32x32 = av1_fwd_txfm2d_32x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (896:9): [True: 1, False: 0]
  ------------------
  897|      1|    if (flags & HAS_AVX2) av1_fwd_txfm2d_32x32 = av1_fwd_txfm2d_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (897:9): [True: 1, False: 0]
  ------------------
  898|      1|    av1_fwd_txfm2d_32x64 = av1_fwd_txfm2d_32x64_c;
  899|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_32x64 = av1_fwd_txfm2d_32x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (899:9): [True: 1, False: 0]
  ------------------
  900|      1|    av1_fwd_txfm2d_32x8 = av1_fwd_txfm2d_32x8_c;
  901|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_32x8 = av1_fwd_txfm2d_32x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (901:9): [True: 1, False: 0]
  ------------------
  902|      1|    av1_fwd_txfm2d_4x16 = av1_fwd_txfm2d_4x16_c;
  903|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_4x16 = av1_fwd_txfm2d_4x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (903:9): [True: 1, False: 0]
  ------------------
  904|      1|    av1_fwd_txfm2d_4x4 = av1_fwd_txfm2d_4x4_c;
  905|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_4x4 = av1_fwd_txfm2d_4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (905:9): [True: 1, False: 0]
  ------------------
  906|      1|    av1_fwd_txfm2d_4x8 = av1_fwd_txfm2d_4x8_c;
  907|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_4x8 = av1_fwd_txfm2d_4x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (907:9): [True: 1, False: 0]
  ------------------
  908|      1|    av1_fwd_txfm2d_64x16 = av1_fwd_txfm2d_64x16_c;
  909|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_64x16 = av1_fwd_txfm2d_64x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (909:9): [True: 1, False: 0]
  ------------------
  910|      1|    av1_fwd_txfm2d_64x32 = av1_fwd_txfm2d_64x32_c;
  911|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_64x32 = av1_fwd_txfm2d_64x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (911:9): [True: 1, False: 0]
  ------------------
  912|      1|    av1_fwd_txfm2d_64x64 = av1_fwd_txfm2d_64x64_c;
  913|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_64x64 = av1_fwd_txfm2d_64x64_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (913:9): [True: 1, False: 0]
  ------------------
  914|      1|    if (flags & HAS_AVX2) av1_fwd_txfm2d_64x64 = av1_fwd_txfm2d_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (914:9): [True: 1, False: 0]
  ------------------
  915|      1|    av1_fwd_txfm2d_8x16 = av1_fwd_txfm2d_8x16_c;
  916|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_8x16 = av1_fwd_txfm2d_8x16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (916:9): [True: 1, False: 0]
  ------------------
  917|      1|    if (flags & HAS_AVX2) av1_fwd_txfm2d_8x16 = av1_fwd_txfm2d_8x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (917:9): [True: 1, False: 0]
  ------------------
  918|      1|    av1_fwd_txfm2d_8x32 = av1_fwd_txfm2d_8x32_c;
  919|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_8x32 = av1_fwd_txfm2d_8x32_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (919:9): [True: 1, False: 0]
  ------------------
  920|      1|    av1_fwd_txfm2d_8x4 = av1_fwd_txfm2d_8x4_c;
  921|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_8x4 = av1_fwd_txfm2d_8x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (921:9): [True: 1, False: 0]
  ------------------
  922|      1|    av1_fwd_txfm2d_8x8 = av1_fwd_txfm2d_8x8_c;
  923|      1|    if (flags & HAS_SSE4_1) av1_fwd_txfm2d_8x8 = av1_fwd_txfm2d_8x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (923:9): [True: 1, False: 0]
  ------------------
  924|      1|    if (flags & HAS_AVX2) av1_fwd_txfm2d_8x8 = av1_fwd_txfm2d_8x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (924:9): [True: 1, False: 0]
  ------------------
  925|      1|    av1_fwht4x4 = av1_fwht4x4_c;
  926|      1|    if (flags & HAS_SSE4_1) av1_fwht4x4 = av1_fwht4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (926:9): [True: 1, False: 0]
  ------------------
  927|      1|    av1_get_crc32c_value = av1_get_crc32c_value_c;
  928|      1|    if (flags & HAS_SSE4_2) av1_get_crc32c_value = av1_get_crc32c_value_sse4_2;
  ------------------
  |  |  169|      1|#define HAS_SSE4_2 0x100
  ------------------
  |  Branch (928:9): [True: 1, False: 0]
  ------------------
  929|      1|    av1_get_horver_correlation_full = av1_get_horver_correlation_full_c;
  930|      1|    if (flags & HAS_SSE4_1) av1_get_horver_correlation_full = av1_get_horver_correlation_full_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (930:9): [True: 1, False: 0]
  ------------------
  931|      1|    if (flags & HAS_AVX2) av1_get_horver_correlation_full = av1_get_horver_correlation_full_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (931:9): [True: 1, False: 0]
  ------------------
  932|      1|    av1_highbd_apply_temporal_filter = av1_highbd_apply_temporal_filter_sse2;
  933|      1|    if (flags & HAS_AVX2) av1_highbd_apply_temporal_filter = av1_highbd_apply_temporal_filter_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (933:9): [True: 1, False: 0]
  ------------------
  934|      1|    av1_highbd_block_error = av1_highbd_block_error_sse2;
  935|      1|    if (flags & HAS_AVX2) av1_highbd_block_error = av1_highbd_block_error_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (935:9): [True: 1, False: 0]
  ------------------
  936|      1|    av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
  937|      1|    if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (937:9): [True: 1, False: 0]
  ------------------
  938|      1|    av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_c;
  939|      1|    if (flags & HAS_SSSE3) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (939:9): [True: 1, False: 0]
  ------------------
  940|      1|    if (flags & HAS_AVX2) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (940:9): [True: 1, False: 0]
  ------------------
  941|      1|    av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_c;
  942|      1|    if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (942:9): [True: 1, False: 0]
  ------------------
  943|      1|    av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_c;
  944|      1|    if (flags & HAS_SSSE3) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (944:9): [True: 1, False: 0]
  ------------------
  945|      1|    if (flags & HAS_AVX2) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (945:9): [True: 1, False: 0]
  ------------------
  946|      1|    av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_c;
  947|      1|    if (flags & HAS_SSSE3) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (947:9): [True: 1, False: 0]
  ------------------
  948|      1|    if (flags & HAS_AVX2) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (948:9): [True: 1, False: 0]
  ------------------
  949|      1|    av1_highbd_dist_wtd_convolve_2d = av1_highbd_dist_wtd_convolve_2d_c;
  950|      1|    if (flags & HAS_SSE4_1) av1_highbd_dist_wtd_convolve_2d = av1_highbd_dist_wtd_convolve_2d_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (950:9): [True: 1, False: 0]
  ------------------
  951|      1|    if (flags & HAS_AVX2) av1_highbd_dist_wtd_convolve_2d = av1_highbd_dist_wtd_convolve_2d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (951:9): [True: 1, False: 0]
  ------------------
  952|      1|    av1_highbd_dist_wtd_convolve_2d_copy = av1_highbd_dist_wtd_convolve_2d_copy_c;
  953|      1|    if (flags & HAS_SSE4_1) av1_highbd_dist_wtd_convolve_2d_copy = av1_highbd_dist_wtd_convolve_2d_copy_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (953:9): [True: 1, False: 0]
  ------------------
  954|      1|    if (flags & HAS_AVX2) av1_highbd_dist_wtd_convolve_2d_copy = av1_highbd_dist_wtd_convolve_2d_copy_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (954:9): [True: 1, False: 0]
  ------------------
  955|      1|    av1_highbd_dist_wtd_convolve_x = av1_highbd_dist_wtd_convolve_x_c;
  956|      1|    if (flags & HAS_SSE4_1) av1_highbd_dist_wtd_convolve_x = av1_highbd_dist_wtd_convolve_x_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (956:9): [True: 1, False: 0]
  ------------------
  957|      1|    if (flags & HAS_AVX2) av1_highbd_dist_wtd_convolve_x = av1_highbd_dist_wtd_convolve_x_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (957:9): [True: 1, False: 0]
  ------------------
  958|      1|    av1_highbd_dist_wtd_convolve_y = av1_highbd_dist_wtd_convolve_y_c;
  959|      1|    if (flags & HAS_SSE4_1) av1_highbd_dist_wtd_convolve_y = av1_highbd_dist_wtd_convolve_y_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (959:9): [True: 1, False: 0]
  ------------------
  960|      1|    if (flags & HAS_AVX2) av1_highbd_dist_wtd_convolve_y = av1_highbd_dist_wtd_convolve_y_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (960:9): [True: 1, False: 0]
  ------------------
  961|      1|    av1_highbd_dr_prediction_z1 = av1_highbd_dr_prediction_z1_c;
  962|      1|    if (flags & HAS_AVX2) av1_highbd_dr_prediction_z1 = av1_highbd_dr_prediction_z1_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (962:9): [True: 1, False: 0]
  ------------------
  963|      1|    av1_highbd_dr_prediction_z2 = av1_highbd_dr_prediction_z2_c;
  964|      1|    if (flags & HAS_AVX2) av1_highbd_dr_prediction_z2 = av1_highbd_dr_prediction_z2_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (964:9): [True: 1, False: 0]
  ------------------
  965|      1|    av1_highbd_dr_prediction_z3 = av1_highbd_dr_prediction_z3_c;
  966|      1|    if (flags & HAS_AVX2) av1_highbd_dr_prediction_z3 = av1_highbd_dr_prediction_z3_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (966:9): [True: 1, False: 0]
  ------------------
  967|      1|    av1_highbd_filter_intra_edge = av1_highbd_filter_intra_edge_c;
  968|      1|    if (flags & HAS_SSE4_1) av1_highbd_filter_intra_edge = av1_highbd_filter_intra_edge_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (968:9): [True: 1, False: 0]
  ------------------
  969|      1|    av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_c;
  970|      1|    if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (970:9): [True: 1, False: 0]
  ------------------
  971|      1|    if (flags & HAS_AVX2) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (971:9): [True: 1, False: 0]
  ------------------
  972|      1|    av1_highbd_iwht4x4_16_add = av1_highbd_iwht4x4_16_add_c;
  973|      1|    if (flags & HAS_SSE4_1) av1_highbd_iwht4x4_16_add = av1_highbd_iwht4x4_16_add_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (973:9): [True: 1, False: 0]
  ------------------
  974|      1|    av1_highbd_pixel_proj_error = av1_highbd_pixel_proj_error_c;
  975|      1|    if (flags & HAS_SSE4_1) av1_highbd_pixel_proj_error = av1_highbd_pixel_proj_error_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (975:9): [True: 1, False: 0]
  ------------------
  976|      1|    if (flags & HAS_AVX2) av1_highbd_pixel_proj_error = av1_highbd_pixel_proj_error_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (976:9): [True: 1, False: 0]
  ------------------
  977|      1|    av1_highbd_quantize_fp = av1_highbd_quantize_fp_c;
  978|      1|    if (flags & HAS_SSE4_1) av1_highbd_quantize_fp = av1_highbd_quantize_fp_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (978:9): [True: 1, False: 0]
  ------------------
  979|      1|    if (flags & HAS_AVX2) av1_highbd_quantize_fp = av1_highbd_quantize_fp_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (979:9): [True: 1, False: 0]
  ------------------
  980|      1|    av1_highbd_upsample_intra_edge = av1_highbd_upsample_intra_edge_c;
  981|      1|    if (flags & HAS_SSE4_1) av1_highbd_upsample_intra_edge = av1_highbd_upsample_intra_edge_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (981:9): [True: 1, False: 0]
  ------------------
  982|      1|    av1_highbd_warp_affine = av1_highbd_warp_affine_c;
  983|      1|    if (flags & HAS_SSE4_1) av1_highbd_warp_affine = av1_highbd_warp_affine_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (983:9): [True: 1, False: 0]
  ------------------
  984|      1|    if (flags & HAS_AVX2) av1_highbd_warp_affine = av1_highbd_warp_affine_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (984:9): [True: 1, False: 0]
  ------------------
  985|      1|    av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_c;
  986|      1|    if (flags & HAS_SSSE3) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (986:9): [True: 1, False: 0]
  ------------------
  987|      1|    if (flags & HAS_AVX2) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (987:9): [True: 1, False: 0]
  ------------------
  988|      1|    av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_c;
  989|      1|    if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (989:9): [True: 1, False: 0]
  ------------------
  990|      1|    av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_c;
  991|      1|    if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (991:9): [True: 1, False: 0]
  ------------------
  992|      1|    av1_inv_txfm_add = av1_inv_txfm_add_c;
  993|      1|    if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (993:9): [True: 1, False: 0]
  ------------------
  994|      1|    if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (994:9): [True: 1, False: 0]
  ------------------
  995|      1|    av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_c;
  996|      1|    if (flags & HAS_SSE4_1) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (996:9): [True: 1, False: 0]
  ------------------
  997|      1|    if (flags & HAS_AVX2) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (997:9): [True: 1, False: 0]
  ------------------
  998|      1|    av1_lowbd_pixel_proj_error = av1_lowbd_pixel_proj_error_c;
  999|      1|    if (flags & HAS_SSE4_1) av1_lowbd_pixel_proj_error = av1_lowbd_pixel_proj_error_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (999:9): [True: 1, False: 0]
  ------------------
 1000|      1|    if (flags & HAS_AVX2) av1_lowbd_pixel_proj_error = av1_lowbd_pixel_proj_error_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1000:9): [True: 1, False: 0]
  ------------------
 1001|      1|    av1_nn_fast_softmax_16 = av1_nn_fast_softmax_16_c;
 1002|      1|    if (flags & HAS_SSE3) av1_nn_fast_softmax_16 = av1_nn_fast_softmax_16_sse3;
  ------------------
  |  |  164|      1|#define HAS_SSE3 0x08
  ------------------
  |  Branch (1002:9): [True: 1, False: 0]
  ------------------
 1003|      1|    av1_nn_predict = av1_nn_predict_c;
 1004|      1|    if (flags & HAS_SSE3) av1_nn_predict = av1_nn_predict_sse3;
  ------------------
  |  |  164|      1|#define HAS_SSE3 0x08
  ------------------
  |  Branch (1004:9): [True: 1, False: 0]
  ------------------
 1005|      1|    if (flags & HAS_AVX2) av1_nn_predict = av1_nn_predict_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1005:9): [True: 1, False: 0]
  ------------------
 1006|      1|    av1_quantize_fp = av1_quantize_fp_sse2;
 1007|      1|    if (flags & HAS_AVX2) av1_quantize_fp = av1_quantize_fp_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1007:9): [True: 1, False: 0]
  ------------------
 1008|      1|    av1_quantize_fp_32x32 = av1_quantize_fp_32x32_c;
 1009|      1|    if (flags & HAS_AVX2) av1_quantize_fp_32x32 = av1_quantize_fp_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1009:9): [True: 1, False: 0]
  ------------------
 1010|      1|    av1_quantize_fp_64x64 = av1_quantize_fp_64x64_c;
 1011|      1|    if (flags & HAS_AVX2) av1_quantize_fp_64x64 = av1_quantize_fp_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1011:9): [True: 1, False: 0]
  ------------------
 1012|      1|    av1_quantize_lp = av1_quantize_lp_sse2;
 1013|      1|    if (flags & HAS_AVX2) av1_quantize_lp = av1_quantize_lp_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1013:9): [True: 1, False: 0]
  ------------------
 1014|      1|    av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
 1015|      1|    if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1015:9): [True: 1, False: 0]
  ------------------
 1016|      1|    av1_resize_horz_dir = av1_resize_horz_dir_sse2;
 1017|      1|    if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1017:9): [True: 1, False: 0]
  ------------------
 1018|      1|    av1_resize_vert_dir = av1_resize_vert_dir_sse2;
 1019|      1|    if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1019:9): [True: 1, False: 0]
  ------------------
 1020|      1|    av1_round_shift_array = av1_round_shift_array_c;
 1021|      1|    if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1021:9): [True: 1, False: 0]
  ------------------
 1022|      1|    av1_selfguided_restoration = av1_selfguided_restoration_c;
 1023|      1|    if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1023:9): [True: 1, False: 0]
  ------------------
 1024|      1|    if (flags & HAS_AVX2) av1_selfguided_restoration = av1_selfguided_restoration_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1024:9): [True: 1, False: 0]
  ------------------
 1025|      1|    av1_txb_init_levels = av1_txb_init_levels_c;
 1026|      1|    if (flags & HAS_SSE4_1) av1_txb_init_levels = av1_txb_init_levels_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1026:9): [True: 1, False: 0]
  ------------------
 1027|      1|    if (flags & HAS_AVX2) av1_txb_init_levels = av1_txb_init_levels_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1027:9): [True: 1, False: 0]
  ------------------
 1028|      1|    av1_upsample_intra_edge = av1_upsample_intra_edge_c;
 1029|      1|    if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1029:9): [True: 1, False: 0]
  ------------------
 1030|      1|    av1_warp_affine = av1_warp_affine_c;
 1031|      1|    if (flags & HAS_SSE4_1) av1_warp_affine = av1_warp_affine_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1031:9): [True: 1, False: 0]
  ------------------
 1032|      1|    if (flags & HAS_AVX2) av1_warp_affine = av1_warp_affine_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1032:9): [True: 1, False: 0]
  ------------------
 1033|      1|    av1_wedge_compute_delta_squares = av1_wedge_compute_delta_squares_sse2;
 1034|      1|    if (flags & HAS_AVX2) av1_wedge_compute_delta_squares = av1_wedge_compute_delta_squares_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1034:9): [True: 1, False: 0]
  ------------------
 1035|      1|    av1_wedge_sign_from_residuals = av1_wedge_sign_from_residuals_sse2;
 1036|      1|    if (flags & HAS_AVX2) av1_wedge_sign_from_residuals = av1_wedge_sign_from_residuals_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1036:9): [True: 1, False: 0]
  ------------------
 1037|      1|    av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_sse2;
 1038|      1|    if (flags & HAS_AVX2) av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1038:9): [True: 1, False: 0]
  ------------------
 1039|      1|    av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
 1040|      1|    if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1040:9): [True: 1, False: 0]
  ------------------
 1041|      1|    cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c;
 1042|      1|    if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1042:9): [True: 1, False: 0]
  ------------------
 1043|      1|    if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1043:9): [True: 1, False: 0]
  ------------------
 1044|      1|    cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c;
 1045|      1|    if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1045:9): [True: 1, False: 0]
  ------------------
 1046|      1|    if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1046:9): [True: 1, False: 0]
  ------------------
 1047|      1|    cdef_filter_16_0 = cdef_filter_16_0_c;
 1048|      1|    if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1048:9): [True: 1, False: 0]
  ------------------
 1049|      1|    if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1049:9): [True: 1, False: 0]
  ------------------
 1050|      1|    cdef_filter_16_1 = cdef_filter_16_1_c;
 1051|      1|    if (flags & HAS_SSE4_1) cdef_filter_16_1 = cdef_filter_16_1_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1051:9): [True: 1, False: 0]
  ------------------
 1052|      1|    if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1052:9): [True: 1, False: 0]
  ------------------
 1053|      1|    cdef_filter_16_2 = cdef_filter_16_2_c;
 1054|      1|    if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1054:9): [True: 1, False: 0]
  ------------------
 1055|      1|    if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1055:9): [True: 1, False: 0]
  ------------------
 1056|      1|    cdef_filter_16_3 = cdef_filter_16_3_c;
 1057|      1|    if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1057:9): [True: 1, False: 0]
  ------------------
 1058|      1|    if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1058:9): [True: 1, False: 0]
  ------------------
 1059|      1|    cdef_filter_8_0 = cdef_filter_8_0_c;
 1060|      1|    if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1060:9): [True: 1, False: 0]
  ------------------
 1061|      1|    if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1061:9): [True: 1, False: 0]
  ------------------
 1062|      1|    cdef_filter_8_1 = cdef_filter_8_1_c;
 1063|      1|    if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1063:9): [True: 1, False: 0]
  ------------------
 1064|      1|    if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1064:9): [True: 1, False: 0]
  ------------------
 1065|      1|    cdef_filter_8_2 = cdef_filter_8_2_c;
 1066|      1|    if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1066:9): [True: 1, False: 0]
  ------------------
 1067|      1|    if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1067:9): [True: 1, False: 0]
  ------------------
 1068|      1|    cdef_filter_8_3 = cdef_filter_8_3_c;
 1069|      1|    if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1069:9): [True: 1, False: 0]
  ------------------
 1070|      1|    if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1070:9): [True: 1, False: 0]
  ------------------
 1071|      1|    cdef_find_dir = cdef_find_dir_c;
 1072|      1|    if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1072:9): [True: 1, False: 0]
  ------------------
 1073|      1|    if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1073:9): [True: 1, False: 0]
  ------------------
 1074|      1|    cdef_find_dir_dual = cdef_find_dir_dual_c;
 1075|      1|    if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1075:9): [True: 1, False: 0]
  ------------------
 1076|      1|    if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1076:9): [True: 1, False: 0]
  ------------------
 1077|      1|    cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
 1078|      1|    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1078:9): [True: 1, False: 0]
  ------------------
 1079|      1|    if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1079:9): [True: 1, False: 0]
  ------------------
 1080|      1|    cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_c;
 1081|      1|    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1081:9): [True: 1, False: 0]
  ------------------
 1082|      1|    if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1082:9): [True: 1, False: 0]
  ------------------
 1083|      1|    cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_c;
 1084|      1|    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1084:9): [True: 1, False: 0]
  ------------------
 1085|      1|    if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1085:9): [True: 1, False: 0]
  ------------------
 1086|      1|    cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_c;
 1087|      1|    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1087:9): [True: 1, False: 0]
  ------------------
 1088|      1|    if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1088:9): [True: 1, False: 0]
  ------------------
 1089|      1|    cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_c;
 1090|      1|    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1090:9): [True: 1, False: 0]
  ------------------
 1091|      1|    if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1091:9): [True: 1, False: 0]
  ------------------
 1092|      1|    cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_c;
 1093|      1|    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1093:9): [True: 1, False: 0]
  ------------------
 1094|      1|    if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1094:9): [True: 1, False: 0]
  ------------------
 1095|      1|    cfl_get_predict_hbd_fn = cfl_get_predict_hbd_fn_c;
 1096|      1|    if (flags & HAS_SSSE3) cfl_get_predict_hbd_fn = cfl_get_predict_hbd_fn_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1096:9): [True: 1, False: 0]
  ------------------
 1097|      1|    if (flags & HAS_AVX2) cfl_get_predict_hbd_fn = cfl_get_predict_hbd_fn_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1097:9): [True: 1, False: 0]
  ------------------
 1098|      1|    cfl_get_predict_lbd_fn = cfl_get_predict_lbd_fn_c;
 1099|      1|    if (flags & HAS_SSSE3) cfl_get_predict_lbd_fn = cfl_get_predict_lbd_fn_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1099:9): [True: 1, False: 0]
  ------------------
 1100|      1|    if (flags & HAS_AVX2) cfl_get_predict_lbd_fn = cfl_get_predict_lbd_fn_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1100:9): [True: 1, False: 0]
  ------------------
 1101|      1|    cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
 1102|      1|    if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1102:9): [True: 1, False: 0]
  ------------------
 1103|      1|}

convolve_2d_avx2.c:loadu_8bit_16x2_avx2:
   70|   201k|                                           const ptrdiff_t strideInByte) {
   71|   201k|  const __m128i src0 = _mm_loadu_si128((__m128i *)src);
   72|   201k|  const __m128i src1 =
   73|   201k|      _mm_loadu_si128((__m128i *)((uint8_t *)src + strideInByte));
   74|       |  return _mm256_setr_m128i(src0, src1);
  ------------------
  |  |   29|   201k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   201k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
   75|   201k|}
convolve_2d_avx2.c:storeu_u8_16x2_avx2:
   96|  38.6k|                                       const ptrdiff_t stride) {
   97|  38.6k|  storeu_8bit_16x2_avx2(src, dst, sizeof(*dst) * stride);
   98|  38.6k|}
convolve_2d_avx2.c:storeu_8bit_16x2_avx2:
   88|  38.6k|                                         const ptrdiff_t strideInByte) {
   89|  38.6k|  const __m128i d0 = _mm256_castsi256_si128(src);
   90|       |  const __m128i d1 = _mm256_extracti128_si256(src, 1);
   91|  38.6k|  _mm_storeu_si128((__m128i *)dst, d0);
   92|  38.6k|  _mm_storeu_si128((__m128i *)((uint8_t *)dst + strideInByte), d1);
   93|  38.6k|}
convolve_avx2.c:storeu_u8_16x2_avx2:
   96|  25.1k|                                       const ptrdiff_t stride) {
   97|  25.1k|  storeu_8bit_16x2_avx2(src, dst, sizeof(*dst) * stride);
   98|  25.1k|}
convolve_avx2.c:storeu_8bit_16x2_avx2:
   88|  25.1k|                                         const ptrdiff_t strideInByte) {
   89|  25.1k|  const __m128i d0 = _mm256_castsi256_si128(src);
   90|       |  const __m128i d1 = _mm256_extracti128_si256(src, 1);
   91|  25.1k|  _mm_storeu_si128((__m128i *)dst, d0);
   92|  25.1k|  _mm_storeu_si128((__m128i *)((uint8_t *)dst + strideInByte), d1);
   93|  25.1k|}
convolve_avx2.c:loadu_8bit_16x2_avx2:
   70|  39.8k|                                           const ptrdiff_t strideInByte) {
   71|  39.8k|  const __m128i src0 = _mm_loadu_si128((__m128i *)src);
   72|  39.8k|  const __m128i src1 =
   73|  39.8k|      _mm_loadu_si128((__m128i *)((uint8_t *)src + strideInByte));
   74|       |  return _mm256_setr_m128i(src0, src1);
  ------------------
  |  |   29|  39.8k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  39.8k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
   75|  39.8k|}

convolve_2d_avx2.c:load_u8_4x2_sse4_1:
   30|    972|                                         const ptrdiff_t stride) {
   31|    972|  return load8bit_4x2_sse4_1(src, sizeof(*src) * stride);
   32|    972|}
convolve_2d_avx2.c:load8bit_4x2_sse4_1:
   24|    972|                                          const ptrdiff_t strideInByte) {
   25|    972|  const __m128i s = _mm_cvtsi32_si128(loadu_int32(src));
   26|       |  return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + strideInByte), 1);
   27|    972|}
convolve_avx2.c:load_u8_4x2_sse4_1:
   30|    292|                                         const ptrdiff_t stride) {
   31|    292|  return load8bit_4x2_sse4_1(src, sizeof(*src) * stride);
   32|    292|}
convolve_avx2.c:load8bit_4x2_sse4_1:
   24|    292|                                          const ptrdiff_t strideInByte) {
   25|    292|  const __m128i s = _mm_cvtsi32_si128(loadu_int32(src));
   26|       |  return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + strideInByte), 1);
   27|    292|}

convolve_2d_avx2.c:av1_convolve_2d_sr_specialized_avx2:
 1146|  49.0k|    const int32_t subpel_y_q4, ConvolveParams *conv_params) {
 1147|  49.0k|  static const Convolve2dSrHorTapFunc
 1148|  49.0k|      convolve_2d_sr_hor_tap_func_table[MAX_FILTER_TAP + 1] = {
 1149|  49.0k|        NULL,
 1150|  49.0k|        NULL,
 1151|  49.0k|        convolve_2d_sr_hor_2tap_avx2,
 1152|  49.0k|        NULL,
 1153|  49.0k|        convolve_2d_sr_hor_4tap_ssse3,
 1154|  49.0k|        NULL,
 1155|  49.0k|        convolve_2d_sr_hor_6tap_avx2,
 1156|  49.0k|        NULL,
 1157|  49.0k|        convolve_2d_sr_hor_8tap_avx2
 1158|  49.0k|      };
 1159|  49.0k|  static const Convolve2dSrVerTapFunc
 1160|  49.0k|      convolve_2d_sr_ver_tap_func_table[MAX_FILTER_TAP + 1] = {
 1161|  49.0k|        NULL,
 1162|  49.0k|        convolve_2d_sr_ver_2tap_half_avx2,
 1163|  49.0k|        convolve_2d_sr_ver_2tap_avx2,
 1164|  49.0k|        convolve_2d_sr_ver_4tap_avx2,
 1165|  49.0k|        convolve_2d_sr_ver_4tap_avx2,
 1166|  49.0k|        convolve_2d_sr_ver_6tap_avx2,
 1167|  49.0k|        convolve_2d_sr_ver_6tap_avx2,
 1168|  49.0k|        convolve_2d_sr_ver_8tap_avx2,
 1169|  49.0k|        convolve_2d_sr_ver_8tap_avx2
 1170|  49.0k|      };
 1171|  49.0k|  const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_q4);
 1172|  49.0k|  const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_q4);
 1173|       |
 1174|  49.0k|  assert(tap_x != 12 && tap_y != 12);
 1175|       |
 1176|  49.0k|  const uint8_t *src_ptr = src - ((tap_y >> 1) - 1) * src_stride;
 1177|       |  // Note: im_block is 8-pixel interlaced for width 32 and up, to avoid data
 1178|       |  //       permutation.
 1179|  49.0k|  DECLARE_ALIGNED(32, int16_t,
  ------------------
  |  |   19|  49.0k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1180|  49.0k|                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
 1181|       |
 1182|  49.0k|  (void)conv_params;
 1183|       |
 1184|  49.0k|  assert(conv_params->round_0 == 3);
 1185|  49.0k|  assert(conv_params->round_1 == 11);
 1186|       |
 1187|       |  // horizontal filter
 1188|  49.0k|  int32_t hh = h + tap_y;
 1189|  49.0k|  assert(!(hh % 2));
 1190|       |
 1191|  49.0k|  convolve_2d_sr_hor_tap_func_table[tap_x](
 1192|  49.0k|      src_ptr, src_stride, w, hh, filter_params_x, subpel_x_q4, im_block);
 1193|       |
 1194|       |  // vertical filter
 1195|  49.0k|  convolve_2d_sr_ver_tap_func_table[tap_y - (subpel_y_q4 == 8)](
 1196|  49.0k|      im_block, w, h, filter_params_y, subpel_y_q4, dst, dst_stride);
 1197|  49.0k|}
convolve_2d_avx2.c:convolve_2d_sr_hor_2tap_avx2:
   20|  3.03k|    const int32_t subpel_x_q4, int16_t *const im_block) {
   21|  3.03k|  const uint8_t *src_ptr = src;
   22|  3.03k|  int32_t y = h;
   23|  3.03k|  int16_t *im = im_block;
   24|       |
   25|  3.03k|  if (w <= 8) {
  ------------------
  |  Branch (25:7): [True: 2.09k, False: 945]
  ------------------
   26|  2.09k|    __m128i coeffs_128;
   27|       |
   28|  2.09k|    prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, &coeffs_128);
   29|       |
   30|  2.09k|    if (w == 2) {
  ------------------
  |  Branch (30:9): [True: 324, False: 1.77k]
  ------------------
   31|    972|      do {
   32|    972|        const __m128i r =
   33|    972|            x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, &coeffs_128);
   34|    972|        xy_x_round_store_2x2_sse2(r, im);
   35|    972|        src_ptr += 2 * src_stride;
   36|    972|        im += 2 * 2;
   37|    972|        y -= 2;
   38|    972|      } while (y);
  ------------------
  |  Branch (38:16): [True: 648, False: 324]
  ------------------
   39|  1.77k|    } else if (w == 4) {
  ------------------
  |  Branch (39:16): [True: 968, False: 802]
  ------------------
   40|  4.11k|      do {
   41|  4.11k|        const __m128i r =
   42|  4.11k|            x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, &coeffs_128);
   43|  4.11k|        xy_x_round_store_4x2_sse2(r, im);
   44|  4.11k|        src_ptr += 2 * src_stride;
   45|  4.11k|        im += 2 * 4;
   46|  4.11k|        y -= 2;
   47|  4.11k|      } while (y);
  ------------------
  |  Branch (47:16): [True: 3.14k, False: 968]
  ------------------
   48|    968|    } else {
   49|    802|      assert(w == 8);
   50|       |
   51|  4.18k|      do {
   52|  4.18k|        __m128i r[2];
   53|       |
   54|  4.18k|        x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, &coeffs_128, r);
   55|  4.18k|        xy_x_round_store_8x2_sse2(r, im);
   56|  4.18k|        src_ptr += 2 * src_stride;
   57|  4.18k|        im += 2 * 8;
   58|  4.18k|        y -= 2;
   59|  4.18k|      } while (y);
  ------------------
  |  Branch (59:16): [True: 3.38k, False: 802]
  ------------------
   60|    802|    }
   61|  2.09k|  } else {
   62|    945|    __m256i coeffs_256;
   63|       |
   64|    945|    prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, &coeffs_256);
   65|       |
   66|    945|    if (w == 16) {
  ------------------
  |  Branch (66:9): [True: 475, False: 470]
  ------------------
   67|  3.35k|      do {
   68|  3.35k|        __m256i r[2];
   69|       |
   70|  3.35k|        x_convolve_2tap_16x2_avx2(src_ptr, src_stride, &coeffs_256, r);
   71|  3.35k|        xy_x_round_store_32_avx2(r, im);
   72|  3.35k|        src_ptr += 2 * src_stride;
   73|  3.35k|        im += 2 * 16;
   74|  3.35k|        y -= 2;
   75|  3.35k|      } while (y);
  ------------------
  |  Branch (75:16): [True: 2.88k, False: 475]
  ------------------
   76|    475|    } else if (w == 32) {
  ------------------
  |  Branch (76:16): [True: 398, False: 72]
  ------------------
   77|  9.28k|      do {
   78|  9.28k|        xy_x_2tap_32_avx2(src_ptr, &coeffs_256, im);
   79|  9.28k|        src_ptr += src_stride;
   80|  9.28k|        im += 32;
   81|  9.28k|      } while (--y);
  ------------------
  |  Branch (81:16): [True: 8.88k, False: 398]
  ------------------
   82|    398|    } else if (w == 64) {
  ------------------
  |  Branch (82:16): [True: 72, False: 0]
  ------------------
   83|  2.73k|      do {
   84|  2.73k|        xy_x_2tap_32_avx2(src_ptr + 0 * 32, &coeffs_256, im + 0 * 32);
   85|  2.73k|        xy_x_2tap_32_avx2(src_ptr + 1 * 32, &coeffs_256, im + 1 * 32);
   86|  2.73k|        src_ptr += src_stride;
   87|  2.73k|        im += 64;
   88|  2.73k|      } while (--y);
  ------------------
  |  Branch (88:16): [True: 2.66k, False: 72]
  ------------------
   89|     72|    } else {
   90|      0|      assert(w == 128);
   91|       |
   92|      0|      do {
   93|      0|        xy_x_2tap_32_avx2(src_ptr + 0 * 32, &coeffs_256, im + 0 * 32);
   94|      0|        xy_x_2tap_32_avx2(src_ptr + 1 * 32, &coeffs_256, im + 1 * 32);
   95|      0|        xy_x_2tap_32_avx2(src_ptr + 2 * 32, &coeffs_256, im + 2 * 32);
   96|      0|        xy_x_2tap_32_avx2(src_ptr + 3 * 32, &coeffs_256, im + 3 * 32);
   97|      0|        src_ptr += src_stride;
   98|      0|        im += 128;
   99|      0|      } while (--y);
  ------------------
  |  Branch (99:16): [True: 0, False: 0]
  ------------------
  100|      0|    }
  101|    945|  }
  102|  3.03k|}
convolve_2d_avx2.c:convolve_2d_sr_hor_4tap_ssse3:
  107|  24.0k|    const int32_t subpel_x_q4, int16_t *const im_block) {
  108|  24.0k|  const uint8_t *src_ptr = src - 1;
  109|  24.0k|  int32_t y = h;
  110|  24.0k|  int16_t *im = im_block;
  111|       |
  112|  24.0k|  if (w <= 4) {
  ------------------
  |  Branch (112:7): [True: 22.6k, False: 1.43k]
  ------------------
  113|  22.6k|    __m128i coeffs_128[2];
  114|       |
  115|  22.6k|    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
  116|  22.6k|    if (w == 2) {
  ------------------
  |  Branch (116:9): [True: 4.99k, False: 17.6k]
  ------------------
  117|  21.6k|      do {
  118|  21.6k|        const __m128i r =
  119|  21.6k|            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
  120|  21.6k|        xy_x_round_store_2x2_sse2(r, im);
  121|  21.6k|        src_ptr += 2 * src_stride;
  122|  21.6k|        im += 2 * 2;
  123|  21.6k|        y -= 2;
  124|  21.6k|      } while (y);
  ------------------
  |  Branch (124:16): [True: 16.6k, False: 4.99k]
  ------------------
  125|  17.6k|    } else if (w == 4) {
  ------------------
  |  Branch (125:16): [True: 17.6k, False: 0]
  ------------------
  126|  94.1k|      do {
  127|  94.1k|        const __m128i r =
  128|  94.1k|            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
  129|  94.1k|        xy_x_round_store_4x2_sse2(r, im);
  130|  94.1k|        src_ptr += 2 * src_stride;
  131|  94.1k|        im += 2 * 4;
  132|  94.1k|        y -= 2;
  133|  94.1k|      } while (y);
  ------------------
  |  Branch (133:16): [True: 76.5k, False: 17.6k]
  ------------------
  134|  17.6k|    }
  135|  22.6k|  } else {
  136|       |    // TODO(chiyotsai@google.com): Add better optimization
  137|  1.43k|    __m256i coeffs_256[2], filt_256[2];
  138|       |
  139|  1.43k|    prepare_half_coeffs_4tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
  140|  1.43k|    filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  141|  1.43k|    filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  142|       |
  143|  1.43k|    if (w == 8) {
  ------------------
  |  Branch (143:9): [True: 811, False: 621]
  ------------------
  144|  4.33k|      do {
  145|  4.33k|        __m256i res =
  146|  4.33k|            x_convolve_4tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
  147|  4.33k|        xy_x_round_store_8x2_avx2(res, im);
  148|       |
  149|  4.33k|        src_ptr += 2 * src_stride;
  150|  4.33k|        im += 2 * 8;
  151|  4.33k|        y -= 2;
  152|  4.33k|      } while (y);
  ------------------
  |  Branch (152:16): [True: 3.52k, False: 811]
  ------------------
  153|    811|    } else if (w == 16) {
  ------------------
  |  Branch (153:16): [True: 416, False: 205]
  ------------------
  154|  3.28k|      do {
  155|  3.28k|        __m256i r[2];
  156|       |
  157|  3.28k|        x_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
  158|  3.28k|        xy_x_round_store_32_avx2(r, im);
  159|  3.28k|        src_ptr += 2 * src_stride;
  160|  3.28k|        im += 2 * 16;
  161|  3.28k|        y -= 2;
  162|  3.28k|      } while (y);
  ------------------
  |  Branch (162:16): [True: 2.86k, False: 416]
  ------------------
  163|    416|    } else if (w == 32) {
  ------------------
  |  Branch (163:16): [True: 145, False: 60]
  ------------------
  164|  3.68k|      do {
  165|  3.68k|        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  166|       |
  167|  3.68k|        src_ptr += src_stride;
  168|  3.68k|        im += 32;
  169|  3.68k|      } while (--y);
  ------------------
  |  Branch (169:16): [True: 3.53k, False: 145]
  ------------------
  170|    145|    } else if (w == 64) {
  ------------------
  |  Branch (170:16): [True: 50, False: 10]
  ------------------
  171|  1.97k|      do {
  172|  1.97k|        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  173|  1.97k|        xy_x_4tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
  174|  1.97k|        src_ptr += src_stride;
  175|  1.97k|        im += 64;
  176|  1.97k|      } while (--y);
  ------------------
  |  Branch (176:16): [True: 1.92k, False: 50]
  ------------------
  177|     50|    } else {
  178|     10|      assert(w == 128);
  179|       |
  180|  1.07k|      do {
  181|  1.07k|        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  182|  1.07k|        xy_x_4tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
  183|  1.07k|        xy_x_4tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
  184|  1.07k|        xy_x_4tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
  185|  1.07k|        src_ptr += src_stride;
  186|  1.07k|        im += 128;
  187|  1.07k|      } while (--y);
  ------------------
  |  Branch (187:16): [True: 1.06k, False: 10]
  ------------------
  188|     10|    }
  189|  1.43k|  }
  190|  24.0k|}
convolve_2d_avx2.c:convolve_2d_sr_hor_6tap_avx2:
  195|  20.2k|    const int32_t subpel_x_q4, int16_t *const im_block) {
  196|  20.2k|  const uint8_t *src_ptr = src - 2;
  197|  20.2k|  int32_t y = h;
  198|  20.2k|  int16_t *im = im_block;
  199|       |
  200|  20.2k|  if (w <= 4) {
  ------------------
  |  Branch (200:7): [True: 0, False: 20.2k]
  ------------------
  201|      0|    __m128i coeffs_128[3];
  202|       |
  203|      0|    prepare_half_coeffs_6tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
  204|      0|    if (w == 2) {
  ------------------
  |  Branch (204:9): [True: 0, False: 0]
  ------------------
  205|      0|      do {
  206|      0|        const __m128i r =
  207|      0|            x_convolve_6tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
  208|      0|        xy_x_round_store_2x2_sse2(r, im);
  209|      0|        src_ptr += 2 * src_stride;
  210|      0|        im += 2 * 2;
  211|      0|        y -= 2;
  212|      0|      } while (y);
  ------------------
  |  Branch (212:16): [True: 0, False: 0]
  ------------------
  213|      0|    } else if (w == 4) {
  ------------------
  |  Branch (213:16): [True: 0, False: 0]
  ------------------
  214|      0|      do {
  215|      0|        const __m128i r =
  216|      0|            x_convolve_6tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
  217|      0|        xy_x_round_store_4x2_sse2(r, im);
  218|      0|        src_ptr += 2 * src_stride;
  219|      0|        im += 2 * 4;
  220|      0|        y -= 2;
  221|      0|      } while (y);
  ------------------
  |  Branch (221:16): [True: 0, False: 0]
  ------------------
  222|      0|    }
  223|  20.2k|  } else {
  224|  20.2k|    __m256i coeffs_256[3], filt_256[3];
  225|       |
  226|  20.2k|    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
  227|  20.2k|    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
  228|  20.2k|    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
  229|       |
  230|  20.2k|    prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
  231|       |
  232|  20.2k|    if (w == 8) {
  ------------------
  |  Branch (232:9): [True: 12.6k, False: 7.57k]
  ------------------
  233|  75.8k|      do {
  234|  75.8k|        const __m256i res =
  235|  75.8k|            x_convolve_6tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
  236|  75.8k|        xy_x_round_store_8x2_avx2(res, im);
  237|       |
  238|  75.8k|        src_ptr += 2 * src_stride;
  239|  75.8k|        im += 2 * 8;
  240|  75.8k|        y -= 2;
  241|  75.8k|      } while (y);
  ------------------
  |  Branch (241:16): [True: 63.1k, False: 12.6k]
  ------------------
  242|  12.6k|    } else if (w == 16) {
  ------------------
  |  Branch (242:16): [True: 5.97k, False: 1.59k]
  ------------------
  243|  47.2k|      do {
  244|  47.2k|        __m256i r[2];
  245|       |
  246|  47.2k|        x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
  247|  47.2k|        xy_x_round_store_32_avx2(r, im);
  248|  47.2k|        src_ptr += 2 * src_stride;
  249|  47.2k|        im += 2 * 16;
  250|  47.2k|        y -= 2;
  251|  47.2k|      } while (y);
  ------------------
  |  Branch (251:16): [True: 41.2k, False: 5.97k]
  ------------------
  252|  5.97k|    } else if (w == 32) {
  ------------------
  |  Branch (252:16): [True: 1.39k, False: 199]
  ------------------
  253|  37.9k|      do {
  254|  37.9k|        xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  255|  37.9k|        src_ptr += src_stride;
  256|  37.9k|        im += 32;
  257|  37.9k|      } while (--y);
  ------------------
  |  Branch (257:16): [True: 36.5k, False: 1.39k]
  ------------------
  258|  1.39k|    } else if (w == 64) {
  ------------------
  |  Branch (258:16): [True: 189, False: 10]
  ------------------
  259|  8.90k|      do {
  260|  8.90k|        xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  261|  8.90k|        xy_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
  262|  8.90k|        src_ptr += src_stride;
  263|  8.90k|        im += 64;
  264|  8.90k|      } while (--y);
  ------------------
  |  Branch (264:16): [True: 8.71k, False: 189]
  ------------------
  265|    189|    } else {
  266|     10|      assert(w == 128);
  267|       |
  268|  1.08k|      do {
  269|  1.08k|        xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  270|  1.08k|        xy_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
  271|  1.08k|        xy_x_6tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
  272|  1.08k|        xy_x_6tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
  273|  1.08k|        src_ptr += src_stride;
  274|  1.08k|        im += 128;
  275|  1.08k|      } while (--y);
  ------------------
  |  Branch (275:16): [True: 1.07k, False: 10]
  ------------------
  276|     10|    }
  277|  20.2k|  }
  278|  20.2k|}
convolve_2d_avx2.c:convolve_2d_sr_hor_8tap_avx2:
  283|  1.71k|    const int32_t subpel_x_q4, int16_t *const im_block) {
  284|  1.71k|  const uint8_t *src_ptr = src - 3;
  285|  1.71k|  int32_t y = h;
  286|  1.71k|  int16_t *im = im_block;
  287|  1.71k|  __m256i coeffs_256[4], filt_256[4];
  288|       |
  289|  1.71k|  filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
  290|  1.71k|  filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
  291|  1.71k|  filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
  292|  1.71k|  filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
  293|       |
  294|  1.71k|  prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
  295|       |
  296|  1.71k|  if (w == 8) {
  ------------------
  |  Branch (296:7): [True: 1.08k, False: 627]
  ------------------
  297|  7.22k|    do {
  298|  7.22k|      const __m256i res =
  299|  7.22k|          x_convolve_8tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
  300|  7.22k|      xy_x_round_store_8x2_avx2(res, im);
  301|  7.22k|      src_ptr += 2 * src_stride;
  302|  7.22k|      im += 2 * 8;
  303|  7.22k|      y -= 2;
  304|  7.22k|    } while (y);
  ------------------
  |  Branch (304:14): [True: 6.13k, False: 1.08k]
  ------------------
  305|  1.08k|  } else if (w == 16) {
  ------------------
  |  Branch (305:14): [True: 408, False: 219]
  ------------------
  306|  3.00k|    do {
  307|  3.00k|      __m256i r[2];
  308|       |
  309|  3.00k|      x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
  310|  3.00k|      xy_x_round_store_32_avx2(r, im);
  311|  3.00k|      src_ptr += 2 * src_stride;
  312|  3.00k|      im += 2 * 16;
  313|  3.00k|      y -= 2;
  314|  3.00k|    } while (y);
  ------------------
  |  Branch (314:14): [True: 2.59k, False: 408]
  ------------------
  315|    408|  } else if (w == 32) {
  ------------------
  |  Branch (315:14): [True: 152, False: 67]
  ------------------
  316|  3.25k|    do {
  317|  3.25k|      xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  318|  3.25k|      src_ptr += src_stride;
  319|  3.25k|      im += 32;
  320|  3.25k|    } while (--y);
  ------------------
  |  Branch (320:14): [True: 3.10k, False: 152]
  ------------------
  321|    152|  } else if (w == 64) {
  ------------------
  |  Branch (321:14): [True: 63, False: 4]
  ------------------
  322|  2.37k|    do {
  323|  2.37k|      xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  324|  2.37k|      xy_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
  325|  2.37k|      src_ptr += src_stride;
  326|  2.37k|      im += 64;
  327|  2.37k|    } while (--y);
  ------------------
  |  Branch (327:14): [True: 2.31k, False: 63]
  ------------------
  328|     63|  } else {
  329|      4|    assert(w == 128);
  330|       |
  331|    544|    do {
  332|    544|      xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  333|    544|      xy_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
  334|    544|      xy_x_8tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
  335|    544|      xy_x_8tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
  336|    544|      src_ptr += src_stride;
  337|    544|      im += 128;
  338|    544|    } while (--y);
  ------------------
  |  Branch (338:14): [True: 540, False: 4]
  ------------------
  339|      4|  }
  340|  1.71k|}
convolve_2d_avx2.c:convolve_2d_sr_ver_2tap_half_avx2:
  485|    638|    uint8_t *dst, const int32_t dst_stride) {
  486|    638|  const int16_t *im = im_block;
  487|    638|  int32_t y = h;
  488|       |
  489|    638|  (void)filter_params_y;
  490|    638|  (void)subpel_y_q4;
  491|       |
  492|    638|  if (w == 2) {
  ------------------
  |  Branch (492:7): [True: 28, False: 610]
  ------------------
  493|     28|    __m128i s_32[2];
  494|       |
  495|     28|    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
  496|       |
  497|     60|    do {
  498|     60|      const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
  499|     60|      const __m128i r = xy_y_round_half_pel_sse2(res);
  500|     60|      pack_store_2x2_sse2(r, dst, dst_stride);
  501|     60|      im += 2 * 2;
  502|     60|      dst += 2 * dst_stride;
  503|     60|      y -= 2;
  504|     60|    } while (y);
  ------------------
  |  Branch (504:14): [True: 32, False: 28]
  ------------------
  505|    610|  } else if (w == 4) {
  ------------------
  |  Branch (505:14): [True: 153, False: 457]
  ------------------
  506|    153|    __m128i s_64[2];
  507|       |
  508|    153|    s_64[0] = _mm_loadl_epi64((__m128i *)im);
  509|       |
  510|    464|    do {
  511|    464|      const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
  512|    464|      const __m128i r = xy_y_round_half_pel_sse2(res);
  513|    464|      pack_store_4x2_sse2(r, dst, dst_stride);
  514|    464|      im += 2 * 4;
  515|    464|      dst += 2 * dst_stride;
  516|    464|      y -= 2;
  517|    464|    } while (y);
  ------------------
  |  Branch (517:14): [True: 311, False: 153]
  ------------------
  518|    457|  } else if (w == 8) {
  ------------------
  |  Branch (518:14): [True: 139, False: 318]
  ------------------
  519|    139|    __m128i s_128[2];
  520|       |
  521|    139|    s_128[0] = _mm_loadu_si128((__m128i *)im);
  522|       |
  523|    628|    do {
  524|    628|      const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
  525|    628|      const __m256i r = xy_y_round_half_pel_avx2(res);
  526|    628|      pack_store_8x2_avx2(r, dst, dst_stride);
  527|    628|      im += 2 * 8;
  528|    628|      dst += 2 * dst_stride;
  529|    628|      y -= 2;
  530|    628|    } while (y);
  ------------------
  |  Branch (530:14): [True: 489, False: 139]
  ------------------
  531|    318|  } else if (w == 16) {
  ------------------
  |  Branch (531:14): [True: 132, False: 186]
  ------------------
  532|    132|    __m256i s_256[2], r[2];
  533|       |
  534|    132|    s_256[0] = _mm256_loadu_si256((__m256i *)im);
  535|       |
  536|    984|    do {
  537|    984|      xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
  538|    984|      r[0] = xy_y_round_half_pel_avx2(r[0]);
  539|    984|      r[1] = xy_y_round_half_pel_avx2(r[1]);
  540|    984|      xy_y_pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
  541|    984|      im += 2 * 16;
  542|    984|      dst += 2 * dst_stride;
  543|    984|      y -= 2;
  544|    984|    } while (y);
  ------------------
  |  Branch (544:14): [True: 852, False: 132]
  ------------------
  545|    186|  } else if (w == 32) {
  ------------------
  |  Branch (545:14): [True: 152, False: 34]
  ------------------
  546|    152|    __m256i s_256[2][2];
  547|       |
  548|    152|    s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
  549|    152|    s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
  550|       |
  551|  1.98k|    do {
  552|  1.98k|      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 32, s_256[0], s_256[1], dst);
  553|  1.98k|      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 2 * 32, s_256[1], s_256[0],
  554|  1.98k|                                              dst + dst_stride);
  555|  1.98k|      im += 2 * 32;
  556|  1.98k|      dst += 2 * dst_stride;
  557|  1.98k|      y -= 2;
  558|  1.98k|    } while (y);
  ------------------
  |  Branch (558:14): [True: 1.83k, False: 152]
  ------------------
  559|    152|  } else if (w == 64) {
  ------------------
  |  Branch (559:14): [True: 34, False: 0]
  ------------------
  560|     34|    __m256i s_256[2][4];
  561|       |
  562|     34|    s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
  563|     34|    s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
  564|     34|    s_256[0][2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
  565|     34|    s_256[0][3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
  566|       |
  567|    512|    do {
  568|    512|      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 64, s_256[0] + 0,
  569|    512|                                              s_256[1] + 0, dst);
  570|    512|      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 96, s_256[0] + 2,
  571|    512|                                              s_256[1] + 2, dst + 32);
  572|    512|      im += 2 * 64;
  573|    512|      xy_y_convolve_2tap_half_pel_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
  574|    512|                                              dst + dst_stride);
  575|    512|      xy_y_convolve_2tap_half_pel_32_all_avx2(
  576|    512|          im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 32);
  577|    512|      dst += 2 * dst_stride;
  578|    512|      y -= 2;
  579|    512|    } while (y);
  ------------------
  |  Branch (579:14): [True: 478, False: 34]
  ------------------
  580|     34|  } else {
  581|      0|    __m256i s_256[2][8];
  582|       |
  583|      0|    assert(w == 128);
  584|       |
  585|      0|    load_16bit_8rows_avx2(im, 16, s_256[0]);
  586|       |
  587|      0|    do {
  588|      0|      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 128, s_256[0] + 0,
  589|      0|                                              s_256[1] + 0, dst);
  590|      0|      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 160, s_256[0] + 2,
  591|      0|                                              s_256[1] + 2, dst + 1 * 32);
  592|      0|      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 192, s_256[0] + 4,
  593|      0|                                              s_256[1] + 4, dst + 2 * 32);
  594|      0|      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 224, s_256[0] + 6,
  595|      0|                                              s_256[1] + 6, dst + 3 * 32);
  596|      0|      im += 2 * 128;
  597|      0|      xy_y_convolve_2tap_half_pel_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
  598|      0|                                              dst + dst_stride);
  599|      0|      xy_y_convolve_2tap_half_pel_32_all_avx2(
  600|      0|          im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 1 * 32);
  601|      0|      xy_y_convolve_2tap_half_pel_32_all_avx2(
  602|      0|          im + 64, s_256[1] + 4, s_256[0] + 4, dst + dst_stride + 2 * 32);
  603|      0|      xy_y_convolve_2tap_half_pel_32_all_avx2(
  604|      0|          im + 96, s_256[1] + 6, s_256[0] + 6, dst + dst_stride + 3 * 32);
  605|      0|      dst += 2 * dst_stride;
  606|      0|      y -= 2;
  607|      0|    } while (y);
  ------------------
  |  Branch (607:14): [True: 0, False: 0]
  ------------------
  608|      0|  }
  609|    638|}
convolve_2d_avx2.c:convolve_2d_sr_ver_2tap_avx2:
  345|  2.40k|    uint8_t *dst, const int32_t dst_stride) {
  346|  2.40k|  const int16_t *im = im_block;
  347|  2.40k|  int32_t y = h;
  348|       |
  349|  2.40k|  if (w <= 4) {
  ------------------
  |  Branch (349:7): [True: 1.11k, False: 1.29k]
  ------------------
  350|  1.11k|    __m128i coeffs_128;
  351|       |
  352|  1.11k|    prepare_coeffs_2tap_sse2(filter_params_y, subpel_y_q4, &coeffs_128);
  353|       |
  354|  1.11k|    if (w == 2) {
  ------------------
  |  Branch (354:9): [True: 296, False: 815]
  ------------------
  355|    296|      __m128i s_32[2];
  356|       |
  357|    296|      s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
  358|       |
  359|    588|      do {
  360|    588|        const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
  361|    588|        xy_y_round_store_2x2_sse2(res, dst, dst_stride);
  362|    588|        im += 2 * 2;
  363|    588|        dst += 2 * dst_stride;
  364|    588|        y -= 2;
  365|    588|      } while (y);
  ------------------
  |  Branch (365:16): [True: 292, False: 296]
  ------------------
  366|    815|    } else {
  367|    815|      __m128i s_64[2], r[2];
  368|       |
  369|    815|      assert(w == 4);
  370|       |
  371|    815|      s_64[0] = _mm_loadl_epi64((__m128i *)im);
  372|       |
  373|  2.68k|      do {
  374|  2.68k|        xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
  375|  2.68k|        r[0] = xy_y_round_sse2(r[0]);
  376|  2.68k|        r[1] = xy_y_round_sse2(r[1]);
  377|  2.68k|        const __m128i rr = _mm_packs_epi32(r[0], r[1]);
  378|  2.68k|        pack_store_4x2_sse2(rr, dst, dst_stride);
  379|  2.68k|        im += 2 * 4;
  380|  2.68k|        dst += 2 * dst_stride;
  381|  2.68k|        y -= 2;
  382|  2.68k|      } while (y);
  ------------------
  |  Branch (382:16): [True: 1.86k, False: 815]
  ------------------
  383|    815|    }
  384|  1.29k|  } else {
  385|  1.29k|    __m256i coeffs_256;
  386|       |
  387|  1.29k|    prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, &coeffs_256);
  388|       |
  389|  1.29k|    if (w == 8) {
  ------------------
  |  Branch (389:9): [True: 663, False: 627]
  ------------------
  390|    663|      __m128i s_128[2];
  391|    663|      __m256i r[2];
  392|       |
  393|    663|      s_128[0] = _mm_loadu_si128((__m128i *)im);
  394|       |
  395|  2.75k|      do {
  396|  2.75k|        xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
  397|  2.75k|        xy_y_round_store_8x2_avx2(r, dst, dst_stride);
  398|  2.75k|        im += 2 * 8;
  399|  2.75k|        dst += 2 * dst_stride;
  400|  2.75k|        y -= 2;
  401|  2.75k|      } while (y);
  ------------------
  |  Branch (401:16): [True: 2.08k, False: 663]
  ------------------
  402|    663|    } else if (w == 16) {
  ------------------
  |  Branch (402:16): [True: 343, False: 284]
  ------------------
  403|    343|      __m256i s_256[2], r[4];
  404|       |
  405|    343|      s_256[0] = _mm256_loadu_si256((__m256i *)im);
  406|       |
  407|  1.90k|      do {
  408|  1.90k|        xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
  409|  1.90k|        xy_y_round_store_16x2_avx2(r, dst, dst_stride);
  410|  1.90k|        im += 2 * 16;
  411|  1.90k|        dst += 2 * dst_stride;
  412|  1.90k|        y -= 2;
  413|  1.90k|      } while (y);
  ------------------
  |  Branch (413:16): [True: 1.55k, False: 343]
  ------------------
  414|    343|    } else if (w == 32) {
  ------------------
  |  Branch (414:16): [True: 246, False: 38]
  ------------------
  415|    246|      __m256i s_256[2][2];
  416|       |
  417|    246|      s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
  418|    246|      s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
  419|       |
  420|  2.26k|      do {
  421|  2.26k|        xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[0], s_256[1], &coeffs_256,
  422|  2.26k|                                       dst);
  423|  2.26k|        im += 2 * 32;
  424|  2.26k|        xy_y_convolve_2tap_32_all_avx2(im, s_256[1], s_256[0], &coeffs_256,
  425|  2.26k|                                       dst + dst_stride);
  426|  2.26k|        dst += 2 * dst_stride;
  427|  2.26k|        y -= 2;
  428|  2.26k|      } while (y);
  ------------------
  |  Branch (428:16): [True: 2.01k, False: 246]
  ------------------
  429|    246|    } else if (w == 64) {
  ------------------
  |  Branch (429:16): [True: 38, False: 0]
  ------------------
  430|     38|      __m256i s_256[2][4];
  431|       |
  432|     38|      s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
  433|     38|      s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
  434|     38|      s_256[0][2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
  435|     38|      s_256[0][3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
  436|       |
  437|    784|      do {
  438|    784|        xy_y_convolve_2tap_32_all_avx2(im + 64, s_256[0] + 0, s_256[1] + 0,
  439|    784|                                       &coeffs_256, dst);
  440|    784|        xy_y_convolve_2tap_32_all_avx2(im + 96, s_256[0] + 2, s_256[1] + 2,
  441|    784|                                       &coeffs_256, dst + 32);
  442|    784|        im += 2 * 64;
  443|    784|        xy_y_convolve_2tap_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
  444|    784|                                       &coeffs_256, dst + dst_stride);
  445|    784|        xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[1] + 2, s_256[0] + 2,
  446|    784|                                       &coeffs_256, dst + dst_stride + 32);
  447|    784|        dst += 2 * dst_stride;
  448|    784|        y -= 2;
  449|    784|      } while (y);
  ------------------
  |  Branch (449:16): [True: 746, False: 38]
  ------------------
  450|     38|    } else {
  451|      0|      __m256i s_256[2][8];
  452|       |
  453|      0|      assert(w == 128);
  454|       |
  455|      0|      load_16bit_8rows_avx2(im, 16, s_256[0]);
  456|       |
  457|      0|      do {
  458|      0|        xy_y_convolve_2tap_32_all_avx2(im + 128, s_256[0] + 0, s_256[1] + 0,
  459|      0|                                       &coeffs_256, dst);
  460|      0|        xy_y_convolve_2tap_32_all_avx2(im + 160, s_256[0] + 2, s_256[1] + 2,
  461|      0|                                       &coeffs_256, dst + 1 * 32);
  462|      0|        xy_y_convolve_2tap_32_all_avx2(im + 192, s_256[0] + 4, s_256[1] + 4,
  463|      0|                                       &coeffs_256, dst + 2 * 32);
  464|      0|        xy_y_convolve_2tap_32_all_avx2(im + 224, s_256[0] + 6, s_256[1] + 6,
  465|      0|                                       &coeffs_256, dst + 3 * 32);
  466|      0|        im += 2 * 128;
  467|      0|        xy_y_convolve_2tap_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
  468|      0|                                       &coeffs_256, dst + dst_stride);
  469|      0|        xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[1] + 2, s_256[0] + 2,
  470|      0|                                       &coeffs_256, dst + dst_stride + 1 * 32);
  471|      0|        xy_y_convolve_2tap_32_all_avx2(im + 64, s_256[1] + 4, s_256[0] + 4,
  472|      0|                                       &coeffs_256, dst + dst_stride + 2 * 32);
  473|      0|        xy_y_convolve_2tap_32_all_avx2(im + 96, s_256[1] + 6, s_256[0] + 6,
  474|      0|                                       &coeffs_256, dst + dst_stride + 3 * 32);
  475|      0|        dst += 2 * dst_stride;
  476|      0|        y -= 2;
  477|      0|      } while (y);
  ------------------
  |  Branch (477:16): [True: 0, False: 0]
  ------------------
  478|      0|    }
  479|  1.29k|  }
  480|  2.40k|}
convolve_2d_avx2.c:convolve_2d_sr_ver_4tap_avx2:
  614|  26.6k|    uint8_t *dst, const int32_t dst_stride) {
  615|  26.6k|  const int16_t *im = im_block;
  616|  26.6k|  int32_t y = h;
  617|       |
  618|  26.6k|  if (w == 2) {
  ------------------
  |  Branch (618:7): [True: 3.83k, False: 22.8k]
  ------------------
  619|  3.83k|    __m128i coeffs_128[2], s_32[4], ss_128[2];
  620|       |
  621|  3.83k|    prepare_coeffs_4tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
  622|       |
  623|  3.83k|    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
  624|  3.83k|    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
  625|  3.83k|    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
  626|       |
  627|  3.83k|    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
  628|  3.83k|    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
  629|       |
  630|  3.83k|    ss_128[0] = _mm_unpacklo_epi16(src01, src12);
  631|       |
  632|  5.76k|    do {
  633|  5.76k|      const __m128i res =
  634|  5.76k|          xy_y_convolve_4tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
  635|  5.76k|      xy_y_round_store_2x2_sse2(res, dst, dst_stride);
  636|  5.76k|      im += 2 * 2;
  637|  5.76k|      dst += 2 * dst_stride;
  638|  5.76k|      y -= 2;
  639|  5.76k|    } while (y);
  ------------------
  |  Branch (639:14): [True: 1.93k, False: 3.83k]
  ------------------
  640|  22.8k|  } else {
  641|  22.8k|    __m256i coeffs_256[2];
  642|       |
  643|  22.8k|    prepare_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
  644|       |
  645|  22.8k|    if (w == 4) {
  ------------------
  |  Branch (645:9): [True: 11.6k, False: 11.1k]
  ------------------
  646|  11.6k|      __m128i s_64[4];
  647|  11.6k|      __m256i s_256[2], ss_256[2];
  648|       |
  649|  11.6k|      s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
  650|  11.6k|      s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
  651|  11.6k|      s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
  652|       |
  653|       |      // Load lines a and b. Line a to lower 128, line b to upper 128
  654|  11.6k|      s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
  ------------------
  |  |   29|  11.6k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  11.6k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
  655|  11.6k|      s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
  ------------------
  |  |   29|  11.6k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  11.6k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
  656|       |
  657|  11.6k|      ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  658|       |
  659|  21.9k|      do {
  660|  21.9k|        const __m256i res =
  661|  21.9k|            xy_y_convolve_4tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
  662|  21.9k|        xy_y_round_store_4x2_avx2(res, dst, dst_stride);
  663|  21.9k|        im += 2 * 4;
  664|  21.9k|        dst += 2 * dst_stride;
  665|  21.9k|        y -= 2;
  666|  21.9k|      } while (y);
  ------------------
  |  Branch (666:16): [True: 10.2k, False: 11.6k]
  ------------------
  667|  11.6k|    } else if (w == 8) {
  ------------------
  |  Branch (667:16): [True: 8.02k, False: 3.15k]
  ------------------
  668|  8.02k|      __m256i s_256[4], r[2];
  669|       |
  670|  8.02k|      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
  671|  8.02k|      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
  672|       |
  673|  8.02k|      if (subpel_y_q4 != 8) {
  ------------------
  |  Branch (673:11): [True: 7.23k, False: 793]
  ------------------
  674|  7.23k|        __m256i ss_256[4];
  675|       |
  676|  7.23k|        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  677|  7.23k|        ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  678|       |
  679|  13.6k|        do {
  680|  13.6k|          xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
  681|  13.6k|          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
  682|  13.6k|          im += 2 * 8;
  683|  13.6k|          dst += 2 * dst_stride;
  684|  13.6k|          y -= 2;
  685|  13.6k|        } while (y);
  ------------------
  |  Branch (685:18): [True: 6.38k, False: 7.23k]
  ------------------
  686|  7.23k|      } else {
  687|  1.38k|        do {
  688|  1.38k|          xy_y_convolve_4tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
  689|  1.38k|          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
  690|  1.38k|          im += 2 * 8;
  691|  1.38k|          dst += 2 * dst_stride;
  692|  1.38k|          y -= 2;
  693|  1.38k|        } while (y);
  ------------------
  |  Branch (693:18): [True: 593, False: 793]
  ------------------
  694|    793|      }
  695|  8.02k|    } else if (w == 16) {
  ------------------
  |  Branch (695:16): [True: 2.88k, False: 267]
  ------------------
  696|  2.88k|      __m256i s_256[5];
  697|       |
  698|  2.88k|      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
  699|  2.88k|      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
  700|  2.88k|      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
  701|       |
  702|  2.88k|      if (subpel_y_q4 != 8) {
  ------------------
  |  Branch (702:11): [True: 2.50k, False: 375]
  ------------------
  703|  2.50k|        __m256i ss_256[4], tt_256[4], r[4];
  704|       |
  705|  2.50k|        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  706|  2.50k|        ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  707|       |
  708|  2.50k|        tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
  709|  2.50k|        tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
  710|       |
  711|  6.87k|        do {
  712|  6.87k|          xy_y_convolve_4tap_16x2_avx2(im, s_256, ss_256, tt_256, coeffs_256,
  713|  6.87k|                                       r);
  714|  6.87k|          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
  715|  6.87k|          im += 2 * 16;
  716|  6.87k|          dst += 2 * dst_stride;
  717|  6.87k|          y -= 2;
  718|  6.87k|        } while (y);
  ------------------
  |  Branch (718:18): [True: 4.36k, False: 2.50k]
  ------------------
  719|  2.50k|      } else {
  720|    375|        __m256i r[4];
  721|       |
  722|    750|        do {
  723|    750|          xy_y_convolve_4tap_16x2_half_pelavx2(im, s_256, coeffs_256, r);
  724|    750|          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
  725|    750|          im += 2 * 16;
  726|    750|          dst += 2 * dst_stride;
  727|    750|          y -= 2;
  728|    750|        } while (y);
  ------------------
  |  Branch (728:18): [True: 375, False: 375]
  ------------------
  729|    375|      }
  730|  2.88k|    } else {
  731|       |      /*It's a special condition for OBMC. A/c  to Av1 spec 4-tap won't
  732|       |      support for width(w)>16, but for OBMC while predicting above block
  733|       |      it reduces size block to Wx(h/2), for example, if above block size
  734|       |      is 32x8, we get block size as 32x4 for OBMC.*/
  735|    267|      int32_t x = 0;
  736|       |
  737|    267|      assert(!(w % 32));
  738|       |
  739|    267|      __m256i s_256[2][4], ss_256[2][4], tt_256[2][4], r0[4], r1[4];
  740|    332|      do {
  741|    332|        const int16_t *s = im + x;
  742|    332|        uint8_t *d = dst + x;
  743|       |
  744|    332|        loadu_unpack_16bit_3rows_avx2(s, w, s_256[0], ss_256[0], tt_256[0]);
  745|    332|        loadu_unpack_16bit_3rows_avx2(s + 16, w, s_256[1], ss_256[1],
  746|    332|                                      tt_256[1]);
  747|       |
  748|    332|        y = h;
  749|  4.54k|        do {
  750|  4.54k|          xy_y_convolve_4tap_32x2_avx2(s, w, s_256[0], ss_256[0], tt_256[0],
  751|  4.54k|                                       coeffs_256, r0);
  752|  4.54k|          xy_y_convolve_4tap_32x2_avx2(s + 16, w, s_256[1], ss_256[1],
  753|  4.54k|                                       tt_256[1], coeffs_256, r1);
  754|       |
  755|  4.54k|          xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
  756|  4.54k|          xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
  757|       |
  758|  4.54k|          s += 2 * w;
  759|  4.54k|          d += 2 * dst_stride;
  760|  4.54k|          y -= 2;
  761|  4.54k|        } while (y);
  ------------------
  |  Branch (761:18): [True: 4.21k, False: 332]
  ------------------
  762|       |
  763|    332|        x += 32;
  764|    332|      } while (x < w);
  ------------------
  |  Branch (764:16): [True: 65, False: 267]
  ------------------
  765|    267|    }
  766|  22.8k|  }
  767|  26.6k|}
convolve_2d_avx2.c:convolve_2d_sr_ver_6tap_avx2:
  772|  17.8k|    uint8_t *dst, const int32_t dst_stride) {
  773|  17.8k|  const int16_t *im = im_block;
  774|  17.8k|  int32_t y;
  775|       |
  776|  17.8k|  if (w == 2) {
  ------------------
  |  Branch (776:7): [True: 1.11k, False: 16.7k]
  ------------------
  777|  1.11k|    __m128i coeffs_128[3], s_32[6], ss_128[3];
  778|       |
  779|  1.11k|    prepare_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
  780|       |
  781|  1.11k|    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
  782|  1.11k|    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
  783|  1.11k|    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
  784|  1.11k|    s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
  785|  1.11k|    s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
  786|       |
  787|  1.11k|    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
  788|  1.11k|    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
  789|  1.11k|    const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
  790|  1.11k|    const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
  791|       |
  792|  1.11k|    ss_128[0] = _mm_unpacklo_epi16(src01, src12);
  793|  1.11k|    ss_128[1] = _mm_unpacklo_epi16(src23, src34);
  794|       |
  795|  1.11k|    y = h;
  796|  4.44k|    do {
  797|  4.44k|      const __m128i res =
  798|  4.44k|          xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
  799|  4.44k|      xy_y_round_store_2x2_sse2(res, dst, dst_stride);
  800|  4.44k|      im += 2 * 2;
  801|  4.44k|      dst += 2 * dst_stride;
  802|  4.44k|      y -= 2;
  803|  4.44k|    } while (y);
  ------------------
  |  Branch (803:14): [True: 3.33k, False: 1.11k]
  ------------------
  804|  16.7k|  } else {
  805|  16.7k|    __m256i coeffs_256[3];
  806|       |
  807|  16.7k|    prepare_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
  808|       |
  809|  16.7k|    if (w == 4) {
  ------------------
  |  Branch (809:9): [True: 5.39k, False: 11.3k]
  ------------------
  810|  5.39k|      __m128i s_64[6];
  811|  5.39k|      __m256i s_256[6], ss_256[3];
  812|       |
  813|  5.39k|      s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
  814|  5.39k|      s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
  815|  5.39k|      s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
  816|  5.39k|      s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
  817|  5.39k|      s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
  818|       |
  819|       |      // Load lines a and b. Line a to lower 128, line b to upper 128
  820|  5.39k|      s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
  ------------------
  |  |   29|  5.39k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  5.39k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
  821|  5.39k|      s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
  ------------------
  |  |   29|  5.39k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  5.39k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
  822|  5.39k|      s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
  ------------------
  |  |   29|  5.39k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  5.39k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
  823|  5.39k|      s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
  ------------------
  |  |   29|  5.39k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  5.39k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
  824|       |
  825|  5.39k|      ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  826|  5.39k|      ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
  827|       |
  828|  5.39k|      y = h;
  829|  27.5k|      do {
  830|  27.5k|        const __m256i res =
  831|  27.5k|            xy_y_convolve_6tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
  832|  27.5k|        xy_y_round_store_4x2_avx2(res, dst, dst_stride);
  833|  27.5k|        im += 2 * 4;
  834|  27.5k|        dst += 2 * dst_stride;
  835|  27.5k|        y -= 2;
  836|  27.5k|      } while (y);
  ------------------
  |  Branch (836:16): [True: 22.1k, False: 5.39k]
  ------------------
  837|  11.3k|    } else if (w == 8) {
  ------------------
  |  Branch (837:16): [True: 6.07k, False: 5.26k]
  ------------------
  838|  6.07k|      __m256i s_256[6], r[2];
  839|       |
  840|  6.07k|      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
  841|  6.07k|      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
  842|  6.07k|      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
  843|  6.07k|      s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
  844|  6.07k|      y = h;
  845|       |
  846|  6.07k|      if (subpel_y_q4 != 8) {
  ------------------
  |  Branch (846:11): [True: 5.35k, False: 724]
  ------------------
  847|  5.35k|        __m256i ss_256[6];
  848|       |
  849|  5.35k|        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  850|  5.35k|        ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
  851|       |
  852|  5.35k|        ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  853|  5.35k|        ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
  854|       |
  855|  29.4k|        do {
  856|  29.4k|          xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
  857|  29.4k|          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
  858|  29.4k|          im += 2 * 8;
  859|  29.4k|          dst += 2 * dst_stride;
  860|  29.4k|          y -= 2;
  861|  29.4k|        } while (y);
  ------------------
  |  Branch (861:18): [True: 24.0k, False: 5.35k]
  ------------------
  862|  5.35k|      } else {
  863|  4.00k|        do {
  864|  4.00k|          xy_y_convolve_6tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
  865|  4.00k|          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
  866|  4.00k|          im += 2 * 8;
  867|  4.00k|          dst += 2 * dst_stride;
  868|  4.00k|          y -= 2;
  869|  4.00k|        } while (y);
  ------------------
  |  Branch (869:18): [True: 3.28k, False: 724]
  ------------------
  870|    724|      }
  871|  6.07k|    } else if (w == 16) {
  ------------------
  |  Branch (871:16): [True: 3.71k, False: 1.55k]
  ------------------
  872|  3.71k|      __m256i s_256[6];
  873|       |
  874|  3.71k|      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
  875|  3.71k|      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
  876|  3.71k|      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
  877|  3.71k|      s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
  878|  3.71k|      s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 16));
  879|  3.71k|      y = h;
  880|       |
  881|  3.71k|      if (subpel_y_q4 != 8) {
  ------------------
  |  Branch (881:11): [True: 3.19k, False: 512]
  ------------------
  882|  3.19k|        __m256i ss_256[6], tt_256[6], r[4];
  883|       |
  884|  3.19k|        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  885|  3.19k|        ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
  886|  3.19k|        ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  887|  3.19k|        ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
  888|       |
  889|  3.19k|        tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
  890|  3.19k|        tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
  891|  3.19k|        tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
  892|  3.19k|        tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
  893|       |
  894|  23.4k|        do {
  895|  23.4k|          xy_y_convolve_6tap_16x2_avx2(im, 16, s_256, ss_256, tt_256,
  896|  23.4k|                                       coeffs_256, r);
  897|  23.4k|          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
  898|  23.4k|          im += 2 * 16;
  899|  23.4k|          dst += 2 * dst_stride;
  900|  23.4k|          y -= 2;
  901|  23.4k|        } while (y);
  ------------------
  |  Branch (901:18): [True: 20.2k, False: 3.19k]
  ------------------
  902|  3.19k|      } else {
  903|    512|        __m256i ss_256[4], r[4];
  904|       |
  905|  3.48k|        do {
  906|  3.48k|          xy_y_convolve_6tap_16x2_half_pel_avx2(im, 16, s_256, ss_256,
  907|  3.48k|                                                coeffs_256, r);
  908|  3.48k|          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
  909|       |
  910|  3.48k|          im += 2 * 16;
  911|  3.48k|          dst += 2 * dst_stride;
  912|  3.48k|          y -= 2;
  913|  3.48k|        } while (y);
  ------------------
  |  Branch (913:18): [True: 2.97k, False: 512]
  ------------------
  914|    512|      }
  915|  3.71k|    } else {
  916|  1.55k|      int32_t x = 0;
  917|       |
  918|  1.55k|      assert(!(w % 32));
  919|       |
  920|  1.55k|      __m256i s_256[2][6], ss_256[2][6], tt_256[2][6], r0[4], r1[4];
  921|       |
  922|  1.78k|      do {
  923|  1.78k|        const int16_t *s = im + x;
  924|  1.78k|        uint8_t *d = dst + x;
  925|       |
  926|  1.78k|        loadu_unpack_16bit_5rows_avx2(s, w, s_256[0], ss_256[0], tt_256[0]);
  927|  1.78k|        loadu_unpack_16bit_5rows_avx2(s + 16, w, s_256[1], ss_256[1],
  928|  1.78k|                                      tt_256[1]);
  929|       |
  930|  1.78k|        y = h;
  931|  25.5k|        do {
  932|  25.5k|          xy_y_convolve_6tap_16x2_avx2(s, w, s_256[0], ss_256[0], tt_256[0],
  933|  25.5k|                                       coeffs_256, r0);
  934|  25.5k|          xy_y_convolve_6tap_16x2_avx2(s + 16, w, s_256[1], ss_256[1],
  935|  25.5k|                                       tt_256[1], coeffs_256, r1);
  936|       |
  937|  25.5k|          xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
  938|  25.5k|          xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
  939|       |
  940|  25.5k|          s += 2 * w;
  941|  25.5k|          d += 2 * dst_stride;
  942|  25.5k|          y -= 2;
  943|  25.5k|        } while (y);
  ------------------
  |  Branch (943:18): [True: 23.7k, False: 1.78k]
  ------------------
  944|       |
  945|  1.78k|        x += 32;
  946|  1.78k|      } while (x < w);
  ------------------
  |  Branch (946:16): [True: 234, False: 1.55k]
  ------------------
  947|  1.55k|    }
  948|  16.7k|  }
  949|  17.8k|}
convolve_2d_avx2.c:convolve_2d_sr_ver_8tap_avx2:
  954|  1.47k|    uint8_t *dst, const int32_t dst_stride) {
  955|  1.47k|  const int16_t *im = im_block;
  956|  1.47k|  int32_t y;
  957|       |
  958|  1.47k|  if (w == 2) {
  ------------------
  |  Branch (958:7): [True: 52, False: 1.42k]
  ------------------
  959|     52|    __m128i coeffs_128[4], s_32[8], ss_128[4];
  960|       |
  961|     52|    prepare_coeffs_8tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
  962|       |
  963|     52|    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
  964|     52|    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
  965|     52|    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
  966|     52|    s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
  967|     52|    s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
  968|     52|    s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(im + 5 * 2));
  969|     52|    s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(im + 6 * 2));
  970|       |
  971|     52|    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
  972|     52|    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
  973|     52|    const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
  974|     52|    const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
  975|     52|    const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
  976|     52|    const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
  977|       |
  978|     52|    ss_128[0] = _mm_unpacklo_epi16(src01, src12);
  979|     52|    ss_128[1] = _mm_unpacklo_epi16(src23, src34);
  980|     52|    ss_128[2] = _mm_unpacklo_epi16(src45, src56);
  981|       |
  982|     52|    y = h;
  983|    208|    do {
  984|    208|      const __m128i res =
  985|    208|          xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
  986|    208|      xy_y_round_store_2x2_sse2(res, dst, dst_stride);
  987|    208|      im += 2 * 2;
  988|    208|      dst += 2 * dst_stride;
  989|    208|      y -= 2;
  990|    208|    } while (y);
  ------------------
  |  Branch (990:14): [True: 156, False: 52]
  ------------------
  991|  1.42k|  } else {
  992|  1.42k|    __m256i coeffs_256[4];
  993|       |
  994|  1.42k|    prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
  995|       |
  996|  1.42k|    if (w == 4) {
  ------------------
  |  Branch (996:9): [True: 542, False: 880]
  ------------------
  997|    542|      __m128i s_64[8];
  998|    542|      __m256i s_256[8], ss_256[4];
  999|       |
 1000|    542|      s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
 1001|    542|      s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
 1002|    542|      s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
 1003|    542|      s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
 1004|    542|      s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
 1005|    542|      s_64[5] = _mm_loadl_epi64((__m128i *)(im + 5 * 4));
 1006|    542|      s_64[6] = _mm_loadl_epi64((__m128i *)(im + 6 * 4));
 1007|       |
 1008|       |      // Load lines a and b. Line a to lower 128, line b to upper 128
 1009|    542|      s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
  ------------------
  |  |   29|    542|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    542|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1010|    542|      s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
  ------------------
  |  |   29|    542|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    542|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1011|    542|      s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
  ------------------
  |  |   29|    542|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    542|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1012|    542|      s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
  ------------------
  |  |   29|    542|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    542|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1013|    542|      s_256[4] = _mm256_setr_m128i(s_64[4], s_64[5]);
  ------------------
  |  |   29|    542|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    542|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1014|    542|      s_256[5] = _mm256_setr_m128i(s_64[5], s_64[6]);
  ------------------
  |  |   29|    542|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    542|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1015|       |
 1016|    542|      ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
 1017|    542|      ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
 1018|    542|      ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
 1019|       |
 1020|    542|      y = h;
 1021|  2.91k|      do {
 1022|  2.91k|        const __m256i res =
 1023|  2.91k|            xy_y_convolve_8tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
 1024|  2.91k|        xy_y_round_store_4x2_avx2(res, dst, dst_stride);
 1025|  2.91k|        im += 2 * 4;
 1026|  2.91k|        dst += 2 * dst_stride;
 1027|  2.91k|        y -= 2;
 1028|  2.91k|      } while (y);
  ------------------
  |  Branch (1028:16): [True: 2.37k, False: 542]
  ------------------
 1029|    880|    } else if (w == 8) {
  ------------------
  |  Branch (1029:16): [True: 473, False: 407]
  ------------------
 1030|    473|      __m256i s_256[8], r[2];
 1031|       |
 1032|    473|      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
 1033|    473|      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
 1034|    473|      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
 1035|    473|      s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
 1036|    473|      s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 8));
 1037|    473|      s_256[5] = _mm256_loadu_si256((__m256i *)(im + 5 * 8));
 1038|    473|      y = h;
 1039|       |
 1040|    473|      if (subpel_y_q4 != 8) {
  ------------------
  |  Branch (1040:11): [True: 394, False: 79]
  ------------------
 1041|    394|        __m256i ss_256[8];
 1042|       |
 1043|    394|        convolve_8tap_unpack_avx2(s_256, ss_256);
 1044|       |
 1045|  2.05k|        do {
 1046|  2.05k|          xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
 1047|  2.05k|          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
 1048|  2.05k|          im += 2 * 8;
 1049|  2.05k|          dst += 2 * dst_stride;
 1050|  2.05k|          y -= 2;
 1051|  2.05k|        } while (y);
  ------------------
  |  Branch (1051:18): [True: 1.65k, False: 394]
  ------------------
 1052|    394|      } else {
 1053|    792|        do {
 1054|    792|          xy_y_convolve_8tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
 1055|    792|          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
 1056|    792|          im += 2 * 8;
 1057|    792|          dst += 2 * dst_stride;
 1058|    792|          y -= 2;
 1059|    792|        } while (y);
  ------------------
  |  Branch (1059:18): [True: 713, False: 79]
  ------------------
 1060|     79|      }
 1061|    473|    } else if (w == 16) {
  ------------------
  |  Branch (1061:16): [True: 204, False: 203]
  ------------------
 1062|    204|      __m256i s_256[8], r[4];
 1063|       |
 1064|    204|      load_16bit_7rows_avx2(im, 16, s_256);
 1065|    204|      y = h;
 1066|       |
 1067|    204|      if (subpel_y_q4 != 8) {
  ------------------
  |  Branch (1067:11): [True: 175, False: 29]
  ------------------
 1068|    175|        __m256i ss_256[8], tt_256[8];
 1069|       |
 1070|    175|        convolve_8tap_unpack_avx2(s_256, ss_256);
 1071|    175|        convolve_8tap_unpack_avx2(s_256 + 1, tt_256);
 1072|       |
 1073|  1.08k|        do {
 1074|  1.08k|          xy_y_convolve_8tap_16x2_avx2(im, 16, coeffs_256, s_256, ss_256,
 1075|  1.08k|                                       tt_256, r);
 1076|  1.08k|          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
 1077|       |
 1078|  1.08k|          im += 2 * 16;
 1079|  1.08k|          dst += 2 * dst_stride;
 1080|  1.08k|          y -= 2;
 1081|  1.08k|        } while (y);
  ------------------
  |  Branch (1081:18): [True: 913, False: 175]
  ------------------
 1082|    175|      } else {
 1083|    188|        do {
 1084|    188|          xy_y_convolve_8tap_16x2_half_pel_avx2(im, 16, coeffs_256, s_256, r);
 1085|    188|          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
 1086|       |
 1087|    188|          im += 2 * 16;
 1088|    188|          dst += 2 * dst_stride;
 1089|    188|          y -= 2;
 1090|    188|        } while (y);
  ------------------
  |  Branch (1090:18): [True: 159, False: 29]
  ------------------
 1091|     29|      }
 1092|    204|    } else {
 1093|    203|      int32_t x = 0;
 1094|    203|      __m256i s_256[2][8], r0[4], r1[4];
 1095|       |
 1096|    203|      assert(!(w % 32));
 1097|       |
 1098|    203|      __m256i ss_256[2][8], tt_256[2][8];
 1099|       |
 1100|    278|      do {
 1101|    278|        const int16_t *s = im + x;
 1102|    278|        uint8_t *d = dst + x;
 1103|       |
 1104|    278|        load_16bit_7rows_avx2(s, w, s_256[0]);
 1105|    278|        convolve_8tap_unpack_avx2(s_256[0], ss_256[0]);
 1106|    278|        convolve_8tap_unpack_avx2(s_256[0] + 1, tt_256[0]);
 1107|       |
 1108|    278|        load_16bit_7rows_avx2(s + 16, w, s_256[1]);
 1109|    278|        convolve_8tap_unpack_avx2(s_256[1], ss_256[1]);
 1110|    278|        convolve_8tap_unpack_avx2(s_256[1] + 1, tt_256[1]);
 1111|       |
 1112|    278|        y = h;
 1113|  3.88k|        do {
 1114|  3.88k|          xy_y_convolve_8tap_16x2_avx2(s, w, coeffs_256, s_256[0], ss_256[0],
 1115|  3.88k|                                       tt_256[0], r0);
 1116|  3.88k|          xy_y_convolve_8tap_16x2_avx2(s + 16, w, coeffs_256, s_256[1],
 1117|  3.88k|                                       ss_256[1], tt_256[1], r1);
 1118|  3.88k|          xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
 1119|  3.88k|          xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
 1120|       |
 1121|  3.88k|          s += 2 * w;
 1122|  3.88k|          d += 2 * dst_stride;
 1123|  3.88k|          y -= 2;
 1124|  3.88k|        } while (y);
  ------------------
  |  Branch (1124:18): [True: 3.60k, False: 278]
  ------------------
 1125|       |
 1126|    278|        x += 32;
 1127|    278|      } while (x < w);
  ------------------
  |  Branch (1127:16): [True: 75, False: 203]
  ------------------
 1128|    203|    }
 1129|  1.42k|  }
 1130|  1.47k|}

convolve_2d_avx2.c:prepare_half_coeffs_2tap_ssse3:
   61|  2.09k|    __m128i *const coeffs /* [1] */) {
   62|  2.09k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
   63|  2.09k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  2.09k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  2.09k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
   64|  2.09k|  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
   65|       |
   66|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
   67|       |  // This extra right shift will be taken care of at the end while rounding
   68|       |  // the result.
   69|       |  // Since all filter co-efficients are even, this change will not affect the
   70|       |  // end result
   71|  2.09k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
   72|  2.09k|                            _mm_set1_epi16((short)0xffff)));
   73|       |
   74|  2.09k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
   75|       |
   76|       |  // coeffs 3 4 3 4 3 4 3 4
   77|  2.09k|  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
   78|  2.09k|}
convolve_2d_avx2.c:x_convolve_2tap_2x2_sse4_1:
  859|    972|                                                 const __m128i coeffs[1]) {
  860|    972|  const __m128i sfl =
  861|    972|      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
  862|    972|  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
  863|    972|  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
  864|    972|  return convolve_2tap_ssse3(&ss, coeffs);
  865|    972|}
convolve_2d_avx2.c:convolve_2tap_ssse3:
  433|  13.4k|                                          const __m128i coeffs[1]) {
  434|  13.4k|  return _mm_maddubs_epi16(ss[0], coeffs[0]);
  435|  13.4k|}
convolve_2d_avx2.c:xy_x_round_store_2x2_sse2:
  615|  22.6k|                                             int16_t *const dst) {
  616|  22.6k|  const __m128i d = xy_x_round_sse2(res);
  617|  22.6k|  _mm_storel_epi64((__m128i *)dst, d);
  618|  22.6k|}
convolve_2d_avx2.c:xy_x_round_sse2:
  602|   129k|static inline __m128i xy_x_round_sse2(const __m128i src) {
  603|   129k|  const __m128i round = _mm_set1_epi16(2);
  604|   129k|  const __m128i dst = _mm_add_epi16(src, round);
  605|   129k|  return _mm_srai_epi16(dst, 2);
  606|   129k|}
convolve_2d_avx2.c:x_convolve_2tap_4x2_ssse3:
  869|  4.11k|                                                const __m128i coeffs[1]) {
  870|  4.11k|  const __m128i sfl =
  871|  4.11k|      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
  872|  4.11k|  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
  873|  4.11k|  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
  874|  4.11k|  return convolve_2tap_ssse3(&ss, coeffs);
  875|  4.11k|}
convolve_2d_avx2.c:xy_x_round_store_4x2_sse2:
  621|  98.2k|                                             int16_t *const dst) {
  622|  98.2k|  const __m128i d = xy_x_round_sse2(res);
  623|  98.2k|  _mm_storeu_si128((__m128i *)dst, d);
  624|  98.2k|}
convolve_2d_avx2.c:x_convolve_2tap_8x2_ssse3:
  880|  4.18k|                                             __m128i r[2]) {
  881|  4.18k|  __m128i ss[2];
  882|  4.18k|  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
  883|  4.18k|  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
  884|  4.18k|  const __m128i s01 = _mm_srli_si128(s00, 1);
  885|  4.18k|  const __m128i s11 = _mm_srli_si128(s10, 1);
  886|  4.18k|  ss[0] = _mm_unpacklo_epi8(s00, s01);
  887|  4.18k|  ss[1] = _mm_unpacklo_epi8(s10, s11);
  888|       |
  889|  4.18k|  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
  890|  4.18k|  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
  891|  4.18k|}
convolve_2d_avx2.c:xy_x_round_store_8x2_sse2:
  627|  4.18k|                                             int16_t *const dst) {
  628|  4.18k|  __m128i r[2];
  629|       |
  630|  4.18k|  r[0] = xy_x_round_sse2(res[0]);
  631|  4.18k|  r[1] = xy_x_round_sse2(res[1]);
  632|  4.18k|  _mm_storeu_si128((__m128i *)dst, r[0]);
  633|  4.18k|  _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
  634|  4.18k|}
convolve_2d_avx2.c:prepare_half_coeffs_2tap_avx2:
  157|    945|    __m256i *const coeffs /* [1] */) {
  158|    945|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  159|    945|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|    945|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|    945|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  160|    945|  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
  161|    945|  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
  162|       |
  163|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  164|       |  // This extra right shift will be taken care of at the end while rounding
  165|       |  // the result.
  166|       |  // Since all filter co-efficients are even, this change will not affect the
  167|       |  // end result
  168|    945|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  169|    945|                            _mm_set1_epi16((short)0xffff)));
  170|       |
  171|    945|  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
  172|       |
  173|       |  // coeffs 3 4 3 4 3 4 3 4
  174|    945|  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
  175|    945|}
convolve_2d_avx2.c:x_convolve_2tap_16x2_avx2:
  912|  3.35k|                                             __m256i r[2]) {
  913|  3.35k|  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
  914|  3.35k|  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
  915|  3.35k|  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
  916|  3.35k|  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
  917|  3.35k|  r[0] = convolve_2tap_avx2(&s0, coeffs);
  918|  3.35k|  r[1] = convolve_2tap_avx2(&s1, coeffs);
  919|  3.35k|}
convolve_2d_avx2.c:convolve_2tap_avx2:
  465|  36.2k|                                         const __m256i coeffs[1]) {
  466|  36.2k|  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
  467|  36.2k|}
convolve_2d_avx2.c:xy_x_round_store_32_avx2:
  643|  56.8k|                                            int16_t *const dst) {
  644|  56.8k|  __m256i r[2];
  645|       |
  646|  56.8k|  r[0] = xy_x_round_avx2(res[0]);
  647|  56.8k|  r[1] = xy_x_round_avx2(res[1]);
  648|  56.8k|  const __m256i d0 =
  649|  56.8k|      _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
  650|  56.8k|  const __m256i d1 =
  651|       |      _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
  652|  56.8k|  _mm256_storeu_si256((__m256i *)dst, d0);
  653|  56.8k|  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
  654|  56.8k|}
convolve_2d_avx2.c:xy_x_round_avx2:
  608|   395k|static inline __m256i xy_x_round_avx2(const __m256i src) {
  609|   395k|  const __m256i round = _mm256_set1_epi16(2);
  610|   395k|  const __m256i dst = _mm256_add_epi16(src, round);
  611|   395k|  return _mm256_srai_epi16(dst, 2);
  612|   395k|}
convolve_2d_avx2.c:xy_x_2tap_32_avx2:
 1374|  14.7k|                                     int16_t *const dst) {
 1375|  14.7k|  __m256i r[2];
 1376|       |
 1377|  14.7k|  xy_x_convolve_2tap_32_avx2(src, coeffs, r);
 1378|  14.7k|  const __m256i d0 = xy_x_round_avx2(r[0]);
 1379|  14.7k|  const __m256i d1 = xy_x_round_avx2(r[1]);
 1380|  14.7k|  _mm256_storeu_si256((__m256i *)dst, d0);
 1381|  14.7k|  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
 1382|  14.7k|}
convolve_2d_avx2.c:xy_x_convolve_2tap_32_avx2:
 1362|  14.7k|                                              __m256i r[2]) {
 1363|  14.7k|  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
 1364|  14.7k|  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
 1365|  14.7k|  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
 1366|  14.7k|  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
 1367|       |
 1368|  14.7k|  r[0] = convolve_2tap_avx2(&ss0, coeffs);
 1369|  14.7k|  r[1] = convolve_2tap_avx2(&ss1, coeffs);
 1370|  14.7k|}
convolve_2d_avx2.c:prepare_half_coeffs_4tap_ssse3:
   82|  22.6k|    __m128i *const coeffs /* [2] */) {
   83|  22.6k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
   84|  22.6k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  22.6k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  22.6k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
   85|  22.6k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
   86|       |
   87|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
   88|       |  // This extra right shift will be taken care of at the end while rounding
   89|       |  // the result.
   90|       |  // Since all filter co-efficients are even, this change will not affect the
   91|       |  // end result
   92|  22.6k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
   93|  22.6k|                            _mm_set1_epi16((short)0xffff)));
   94|       |
   95|  22.6k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
   96|       |
   97|       |  // coeffs 2 3 2 3 2 3 2 3
   98|  22.6k|  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
   99|       |  // coeffs 4 5 4 5 4 5 4 5
  100|  22.6k|  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
  101|  22.6k|}
convolve_2d_avx2.c:x_convolve_4tap_2x2_ssse3:
  935|  21.6k|                                                const __m128i coeffs[2]) {
  936|  21.6k|  const __m128i sfl0 =
  937|  21.6k|      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
  938|  21.6k|  const __m128i sfl1 =
  939|  21.6k|      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
  940|  21.6k|  const __m128i s = load_u8_8x2_sse2(src, stride);
  941|  21.6k|  __m128i ss[2];
  942|       |
  943|  21.6k|  ss[0] = _mm_shuffle_epi8(s, sfl0);
  944|  21.6k|  ss[1] = _mm_shuffle_epi8(s, sfl1);
  945|  21.6k|  return convolve_4tap_ssse3(ss, coeffs);
  946|  21.6k|}
convolve_2d_avx2.c:convolve_4tap_ssse3:
  438|   115k|                                          const __m128i coeffs[2]) {
  439|   115k|  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  440|   115k|  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  441|   115k|  return _mm_add_epi16(res_23, res_45);
  442|   115k|}
convolve_2d_avx2.c:x_convolve_4tap_4x2_ssse3:
  950|  94.1k|                                                const __m128i coeffs[2]) {
  951|  94.1k|  const __m128i s = load_u8_8x2_sse2(src, stride);
  952|  94.1k|  const __m128i sfl0 =
  953|  94.1k|      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
  954|  94.1k|  const __m128i sfl1 =
  955|  94.1k|      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
  956|  94.1k|  __m128i ss[2];
  957|       |
  958|  94.1k|  ss[0] = _mm_shuffle_epi8(s, sfl0);
  959|  94.1k|  ss[1] = _mm_shuffle_epi8(s, sfl1);
  960|  94.1k|  return convolve_4tap_ssse3(ss, coeffs);
  961|  94.1k|}
convolve_2d_avx2.c:prepare_half_coeffs_4tap_avx2:
  179|  1.43k|    __m256i *const coeffs /* [2] */) {
  180|  1.43k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  181|  1.43k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  1.43k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  1.43k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  182|  1.43k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  183|       |
  184|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  185|       |  // This extra right shift will be taken care of at the end while rounding
  186|       |  // the result.
  187|       |  // Since all filter co-efficients are even, this change will not affect the
  188|       |  // end result
  189|       |  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  190|  1.43k|                            _mm_set1_epi16((short)0xffff)));
  191|  1.43k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  192|  1.43k|  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
  193|  1.43k|}
convolve_2d_avx2.c:populate_coeffs_4tap_avx2:
   24|  1.43k|                                             __m256i coeffs[2]) {
   25|  1.43k|  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
   26|       |
   27|       |  // coeffs 2 3 2 3 2 3 2 3
   28|  1.43k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
   29|       |  // coeffs 4 5 4 5 4 5 4 5
   30|  1.43k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
   31|  1.43k|}
convolve_2d_avx2.c:x_convolve_4tap_8x2_avx2:
  966|  10.9k|                                               const __m256i filt[2]) {
  967|  10.9k|  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
  968|  10.9k|  return x_convolve_4tap_avx2(s_256, coeffs, filt);
  969|  10.9k|}
convolve_2d_avx2.c:x_convolve_4tap_avx2:
  562|  34.7k|                                           const __m256i filt[2]) {
  563|  34.7k|  __m256i ss[2];
  564|       |
  565|  34.7k|  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  566|  34.7k|  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  567|       |
  568|  34.7k|  return convolve_4tap_avx2(ss, coeffs);
  569|  34.7k|}
convolve_2d_avx2.c:convolve_4tap_avx2:
  470|  34.7k|                                         const __m256i coeffs[2]) {
  471|  34.7k|  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  472|  34.7k|  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  473|  34.7k|  return _mm256_add_epi16(res_23, res_45);
  474|  34.7k|}
convolve_2d_avx2.c:xy_x_round_store_8x2_avx2:
  637|  87.4k|                                             int16_t *const dst) {
  638|  87.4k|  const __m256i d = xy_x_round_avx2(res);
  639|  87.4k|  _mm256_storeu_si256((__m256i *)dst, d);
  640|  87.4k|}
convolve_2d_avx2.c:x_convolve_4tap_16x2_avx2:
  975|  3.28k|                                             __m256i r[2]) {
  976|  3.28k|  r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
  977|  3.28k|  r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
  978|  3.28k|}
convolve_2d_avx2.c:xy_x_4tap_32_avx2:
 1387|  11.9k|                                     int16_t *const dst) {
 1388|  11.9k|  __m256i r[2];
 1389|       |
 1390|  11.9k|  x_convolve_4tap_32_avx2(src, coeffs, filt, r);
 1391|  11.9k|  const __m256i d0 = xy_x_round_avx2(r[0]);
 1392|  11.9k|  const __m256i d1 = xy_x_round_avx2(r[1]);
 1393|  11.9k|  _mm256_storeu_si256((__m256i *)dst, d0);
 1394|  11.9k|  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
 1395|  11.9k|}
convolve_2d_avx2.c:x_convolve_4tap_32_avx2:
  983|  11.9k|                                           __m256i r[2]) {
  984|  11.9k|  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
  985|  11.9k|  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
  986|       |
  987|  11.9k|  r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
  988|  11.9k|  r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
  989|  11.9k|}
convolve_2d_avx2.c:prepare_half_coeffs_6tap_avx2:
  197|  20.2k|    __m256i *const coeffs /* [3] */) {
  198|  20.2k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  199|  20.2k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  20.2k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  20.2k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  200|  20.2k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  201|       |
  202|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  203|       |  // This extra right shift will be taken care of at the end while rounding
  204|       |  // the result.
  205|       |  // Since all filter co-efficients are even, this change will not affect the
  206|       |  // end result
  207|       |  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  208|  20.2k|                            _mm_set1_epi16((short)0xffff)));
  209|  20.2k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  210|  20.2k|  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
  211|  20.2k|}
convolve_2d_avx2.c:populate_coeffs_6tap_avx2:
   34|  20.2k|                                             __m256i coeffs[3]) {
   35|  20.2k|  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
   36|       |
   37|       |  // coeffs 1 2 1 2 1 2 1 2
   38|  20.2k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
   39|       |  // coeffs 3 4 3 4 3 4 3 4
   40|  20.2k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
   41|       |  // coeffs 5 6 5 6 5 6 5 6
   42|  20.2k|  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
   43|  20.2k|}
convolve_2d_avx2.c:x_convolve_6tap_8x2_avx2:
 1031|   170k|                                               const __m256i filt[3]) {
 1032|   170k|  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
 1033|   170k|  return x_convolve_6tap_avx2(s_256, coeffs, filt);
 1034|   170k|}
convolve_2d_avx2.c:x_convolve_6tap_avx2:
  573|   290k|                                           const __m256i filt[3]) {
  574|   290k|  __m256i ss[3];
  575|       |
  576|   290k|  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  577|   290k|  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  578|   290k|  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
  579|       |
  580|   290k|  return convolve_6tap_avx2(ss, coeffs);
  581|   290k|}
convolve_2d_avx2.c:convolve_6tap_avx2:
  477|   290k|                                         const __m256i coeffs[3]) {
  478|   290k|  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  479|   290k|  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  480|   290k|  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
  481|   290k|  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
  482|   290k|  return _mm256_add_epi16(res_0145, res_23);
  483|   290k|}
convolve_2d_avx2.c:x_convolve_6tap_16x2_avx2:
 1040|  47.2k|                                             __m256i r[2]) {
 1041|  47.2k|  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
 1042|  47.2k|  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
 1043|  47.2k|}
convolve_2d_avx2.c:xy_x_6tap_32_avx2:
 1400|  60.1k|                                     int16_t *const dst) {
 1401|  60.1k|  __m256i r[2];
 1402|       |
 1403|  60.1k|  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
 1404|  60.1k|  const __m256i d0 = xy_x_round_avx2(r[0]);
 1405|  60.1k|  const __m256i d1 = xy_x_round_avx2(r[1]);
 1406|  60.1k|  _mm256_storeu_si256((__m256i *)dst, d0);
 1407|  60.1k|  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
 1408|  60.1k|}
convolve_2d_avx2.c:x_convolve_6tap_32_avx2:
 1048|  60.1k|                                           __m256i r[2]) {
 1049|  60.1k|  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
 1050|  60.1k|  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
 1051|       |
 1052|  60.1k|  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
 1053|  60.1k|  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
 1054|  60.1k|}
convolve_2d_avx2.c:prepare_half_coeffs_8tap_avx2:
  215|  1.71k|    __m256i *const coeffs /* [4] */) {
  216|  1.71k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  217|  1.71k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  1.71k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  1.71k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  218|  1.71k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  219|       |
  220|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  221|       |  // This extra right shift will be taken care of at the end while rounding
  222|       |  // the result.
  223|       |  // Since all filter co-efficients are even, this change will not affect the
  224|       |  // end result
  225|       |  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  226|  1.71k|                            _mm_set1_epi16((short)0xffff)));
  227|  1.71k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  228|  1.71k|  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
  229|  1.71k|}
convolve_2d_avx2.c:populate_coeffs_8tap_avx2:
   46|  1.71k|                                             __m256i coeffs[4]) {
   47|  1.71k|  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
   48|       |
   49|       |  // coeffs 0 1 0 1 0 1 0 1
   50|  1.71k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
   51|       |  // coeffs 2 3 2 3 2 3 2 3
   52|  1.71k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
   53|       |  // coeffs 4 5 4 5 4 5 4 5
   54|  1.71k|  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
   55|       |  // coeffs 6 7 6 7 6 7 6 7
   56|  1.71k|  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
   57|  1.71k|}
convolve_2d_avx2.c:x_convolve_8tap_8x2_avx2:
 1059|  13.2k|                                               const __m256i filt[4]) {
 1060|  13.2k|  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
 1061|  13.2k|  return x_convolve_8tap_avx2(s_256, coeffs, filt);
 1062|  13.2k|}
convolve_2d_avx2.c:x_convolve_8tap_avx2:
  585|  33.5k|                                           const __m256i filt[4]) {
  586|  33.5k|  __m256i ss[4];
  587|       |
  588|  33.5k|  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  589|  33.5k|  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  590|  33.5k|  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
  591|  33.5k|  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
  592|       |
  593|  33.5k|  return convolve_8tap_avx2(ss, coeffs);
  594|  33.5k|}
convolve_2d_avx2.c:convolve_8tap_avx2:
  486|  33.5k|                                         const __m256i coeffs[4]) {
  487|  33.5k|  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  488|  33.5k|  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  489|  33.5k|  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
  490|  33.5k|  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
  491|  33.5k|  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
  492|  33.5k|  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
  493|  33.5k|  return _mm256_add_epi16(res_0145, res_2367);
  494|  33.5k|}
convolve_2d_avx2.c:x_convolve_8tap_16x2_avx2:
 1068|  3.00k|                                                       __m256i r[2]) {
 1069|  3.00k|  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
 1070|  3.00k|  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
 1071|  3.00k|}
convolve_2d_avx2.c:xy_x_8tap_32_avx2:
 1413|  10.1k|                                     int16_t *const dst) {
 1414|  10.1k|  __m256i r[2];
 1415|       |
 1416|  10.1k|  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
 1417|  10.1k|  const __m256i d0 = xy_x_round_avx2(r[0]);
 1418|  10.1k|  const __m256i d1 = xy_x_round_avx2(r[1]);
 1419|  10.1k|  _mm256_storeu_si256((__m256i *)dst, d0);
 1420|  10.1k|  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
 1421|  10.1k|}
convolve_2d_avx2.c:x_convolve_8tap_32_avx2:
 1076|  10.1k|                                                     __m256i r[2]) {
 1077|  10.1k|  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
 1078|  10.1k|  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
 1079|       |
 1080|  10.1k|  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
 1081|  10.1k|  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
 1082|  10.1k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_2x2_half_pel_sse2:
 1437|     60|    const int16_t *const src, __m128i s_32[2]) {
 1438|     60|  __m128i s_128[2];
 1439|       |
 1440|     60|  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
 1441|     60|  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
 1442|     60|  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
 1443|     60|  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
 1444|     60|  return _mm_add_epi16(s_128[0], s_128[1]);
 1445|     60|}
convolve_2d_avx2.c:xy_y_round_half_pel_sse2:
  662|    524|static inline __m128i xy_y_round_half_pel_sse2(const __m128i src) {
  663|    524|  const __m128i round = _mm_set1_epi16(16);
  664|    524|  const __m128i dst = _mm_add_epi16(src, round);
  665|    524|  return _mm_srai_epi16(dst, 5);
  666|    524|}
convolve_2d_avx2.c:pack_store_2x2_sse2:
  687|  11.0k|                                       const ptrdiff_t stride) {
  688|  11.0k|  const __m128i d = _mm_packus_epi16(res, res);
  689|  11.0k|  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
  690|       |  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
  691|  11.0k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_4x2_half_pel_sse2:
 1464|    464|    const int16_t *const src, __m128i s_64[2]) {
 1465|    464|  __m128i s_128[2];
 1466|       |
 1467|    464|  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
 1468|    464|  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
 1469|    464|  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
 1470|    464|  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
 1471|    464|  return _mm_add_epi16(s_128[0], s_128[1]);
 1472|    464|}
convolve_2d_avx2.c:pack_store_4x2_sse2:
  694|  3.14k|                                       const ptrdiff_t stride) {
  695|  3.14k|  const __m128i d = _mm_packus_epi16(res, res);
  696|  3.14k|  store_u8_4x2_sse2(d, dst, stride);
  697|  3.14k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_8x2_half_pel_avx2:
 1497|    628|    const int16_t *const src, __m128i s_128[2]) {
 1498|    628|  __m256i s_256[2];
 1499|    628|  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
 1500|    628|  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
  ------------------
  |  |   29|    628|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    628|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1501|    628|  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
 1502|       |  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
  ------------------
  |  |   29|    628|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    628|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1503|    628|  return _mm256_add_epi16(s_256[0], s_256[1]);
 1504|    628|}
convolve_2d_avx2.c:xy_y_round_half_pel_avx2:
  680|  14.6k|static inline __m256i xy_y_round_half_pel_avx2(const __m256i src) {
  681|  14.6k|  const __m256i round = _mm256_set1_epi16(16);
  682|  14.6k|  const __m256i dst = _mm256_add_epi16(src, round);
  683|  14.6k|  return _mm256_srai_epi16(dst, 5);
  684|  14.6k|}
convolve_2d_avx2.c:pack_store_8x2_avx2:
  710|  54.6k|                                       const ptrdiff_t stride) {
  711|  54.6k|  const __m256i d = _mm256_packus_epi16(res, res);
  712|  54.6k|  const __m128i d0 = _mm256_castsi256_si128(d);
  713|       |  const __m128i d1 = _mm256_extracti128_si256(d, 1);
  714|  54.6k|  _mm_storel_epi64((__m128i *)dst, d0);
  715|  54.6k|  _mm_storel_epi64((__m128i *)(dst + stride), d1);
  716|  54.6k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_16x2_half_pel_avx2:
 1507|    984|    const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
 1508|    984|  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 16));
 1509|    984|  r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
 1510|    984|  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
 1511|    984|  r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
 1512|    984|}
convolve_2d_avx2.c:xy_y_pack_store_16x2_avx2:
  728|  38.6k|                                             const ptrdiff_t stride) {
  729|  38.6k|  const __m256i t = _mm256_packus_epi16(res0, res1);
  730|       |  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
  731|  38.6k|  storeu_u8_16x2_avx2(d, dst, stride);
  732|  38.6k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_half_pel_32_all_avx2:
 1565|  6.01k|    uint8_t *const dst) {
 1566|  6.01k|  __m256i r[2];
 1567|       |
 1568|  6.01k|  xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
 1569|  6.01k|  r[0] = xy_y_round_half_pel_avx2(r[0]);
 1570|  6.01k|  r[1] = xy_y_round_half_pel_avx2(r[1]);
 1571|  6.01k|  xy_y_pack_store_32_avx2(r[0], r[1], dst);
 1572|  6.01k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_half_pel_32_avx2:
 1556|  6.01k|                                                       __m256i r[2]) {
 1557|  6.01k|  s1[0] = _mm256_loadu_si256((__m256i *)src);
 1558|  6.01k|  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
 1559|  6.01k|  r[0] = _mm256_add_epi16(s0[0], s1[0]);
 1560|  6.01k|  r[1] = _mm256_add_epi16(s0[1], s1[1]);
 1561|  6.01k|}
convolve_2d_avx2.c:xy_y_pack_store_32_avx2:
  759|  81.6k|                                           uint8_t *const dst) {
  760|  81.6k|  const __m256i d = _mm256_packus_epi16(res0, res1);
  761|       |  // d = _mm256_permute4x64_epi64(d, 0xD8);
  762|  81.6k|  _mm256_storeu_si256((__m256i *)dst, d);
  763|  81.6k|}
convolve_2d_avx2.c:prepare_coeffs_2tap_sse2:
  233|  1.11k|    __m128i *const coeffs /* [1] */) {
  234|  1.11k|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  235|  1.11k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  1.11k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  1.11k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  236|       |
  237|  1.11k|  const __m128i coeff = _mm_cvtsi32_si128(loadu_int32(filter + 3));
  238|       |
  239|       |  // coeffs 3 4 3 4 3 4 3 4
  240|       |  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
  241|  1.11k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_2x2_sse2:
 1425|    588|                                                  const __m128i coeffs[1]) {
 1426|    588|  __m128i s_128[2];
 1427|       |
 1428|    588|  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
 1429|    588|  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
 1430|    588|  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
 1431|    588|  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
 1432|    588|  const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
 1433|    588|  return convolve16_2tap_sse2(&ss, coeffs);
 1434|    588|}
convolve_2d_avx2.c:convolve16_2tap_sse2:
  497|  5.94k|                                           const __m128i coeffs[1]) {
  498|  5.94k|  return _mm_madd_epi16(ss[0], coeffs[0]);
  499|  5.94k|}
convolve_2d_avx2.c:xy_y_round_store_2x2_sse2:
  743|  11.0k|                                             const ptrdiff_t stride) {
  744|  11.0k|  const __m128i r = xy_y_round_sse2(res);
  745|  11.0k|  const __m128i rr = _mm_packs_epi32(r, r);
  746|  11.0k|  pack_store_2x2_sse2(rr, dst, stride);
  747|  11.0k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_4x2_sse2:
 1450|  2.68k|                                               __m128i r[2]) {
 1451|  2.68k|  __m128i s_128[2];
 1452|       |
 1453|  2.68k|  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
 1454|  2.68k|  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
 1455|  2.68k|  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
 1456|  2.68k|  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
 1457|  2.68k|  const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
 1458|  2.68k|  const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
 1459|  2.68k|  r[0] = convolve16_2tap_sse2(&ss0, coeffs);
 1460|  2.68k|  r[1] = convolve16_2tap_sse2(&ss1, coeffs);
 1461|  2.68k|}
convolve_2d_avx2.c:xy_y_round_sse2:
  656|  16.3k|static inline __m128i xy_y_round_sse2(const __m128i src) {
  657|  16.3k|  const __m128i round = _mm_set1_epi32(1024);
  658|  16.3k|  const __m128i dst = _mm_add_epi32(src, round);
  659|  16.3k|  return _mm_srai_epi32(dst, 11);
  660|  16.3k|}
convolve_2d_avx2.c:prepare_coeffs_2tap_avx2:
  292|  1.29k|    __m256i *const coeffs /* [1] */) {
  293|  1.29k|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  294|  1.29k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  1.29k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  1.29k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  295|       |
  296|  1.29k|  const __m128i coeff_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
  297|  1.29k|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  298|       |
  299|       |  // coeffs 3 4 3 4 3 4 3 4
  300|       |  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  301|  1.29k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_8x2_avx2:
 1487|  2.75k|                                               __m256i r[2]) {
 1488|  2.75k|  __m256i s_256[2];
 1489|  2.75k|  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
 1490|  2.75k|  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
  ------------------
  |  |   29|  2.75k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  2.75k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1491|  2.75k|  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
 1492|       |  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
  ------------------
  |  |   29|  2.75k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  2.75k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1493|  2.75k|  xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
 1494|  2.75k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_16_avx2:
 1477|  24.7k|                                              __m256i r[2]) {
 1478|  24.7k|  const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
 1479|  24.7k|  const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
 1480|  24.7k|  r[0] = convolve16_2tap_avx2(&ss0, coeffs);
 1481|  24.7k|  r[1] = convolve16_2tap_avx2(&ss1, coeffs);
 1482|  24.7k|}
convolve_2d_avx2.c:convolve16_2tap_avx2:
  529|  49.5k|                                           const __m256i coeffs[1]) {
  530|  49.5k|  return _mm256_madd_epi16(ss[0], coeffs[0]);
  531|  49.5k|}
convolve_2d_avx2.c:xy_y_round_store_8x2_avx2:
 1968|  54.0k|                                             const ptrdiff_t stride) {
 1969|  54.0k|  const __m256i r = xy_y_round_16_avx2(res);
 1970|  54.0k|  pack_store_8x2_avx2(r, dst, stride);
 1971|  54.0k|}
convolve_2d_avx2.c:xy_y_round_16_avx2:
  674|   280k|static inline __m256i xy_y_round_16_avx2(const __m256i r[2]) {
  675|   280k|  const __m256i r0 = xy_y_round_avx2(r[0]);
  676|   280k|  const __m256i r1 = xy_y_round_avx2(r[1]);
  677|   280k|  return _mm256_packs_epi32(r0, r1);
  678|   280k|}
convolve_2d_avx2.c:xy_y_round_avx2:
  668|   613k|static inline __m256i xy_y_round_avx2(const __m256i src) {
  669|   613k|  const __m256i round = _mm256_set1_epi32(1024);
  670|   613k|  const __m256i dst = _mm256_add_epi32(src, round);
  671|   613k|  return _mm256_srai_epi32(dst, 11);
  672|   613k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_16x2_avx2:
 1524|  1.90k|                                                __m256i r[4]) {
 1525|  1.90k|  s[1] = _mm256_loadu_si256((__m256i *)(src + 16));
 1526|  1.90k|  xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r + 0);
 1527|  1.90k|  s[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
 1528|  1.90k|  xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
 1529|  1.90k|}
convolve_2d_avx2.c:xy_y_round_store_16x2_avx2:
 1975|  37.7k|                                              const ptrdiff_t stride) {
 1976|  37.7k|  const __m256i r0 = xy_y_round_16_avx2(res + 0);
 1977|  37.7k|  const __m256i r1 = xy_y_round_16_avx2(res + 2);
 1978|  37.7k|  xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
 1979|  37.7k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_32_all_avx2:
 1546|  7.65k|                                                  uint8_t *const dst) {
 1547|  7.65k|  __m256i r[4];
 1548|       |
 1549|  7.65k|  xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
 1550|  7.65k|  xy_y_round_store_32_avx2(r + 0, r + 2, dst);
 1551|  7.65k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_32_avx2:
 1535|  7.65k|                                              __m256i r[4]) {
 1536|  7.65k|  s1[0] = _mm256_loadu_si256((__m256i *)src);
 1537|  7.65k|  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
 1538|  7.65k|  xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
 1539|  7.65k|  xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
 1540|  7.65k|}
convolve_2d_avx2.c:xy_y_round_store_32_avx2:
  767|  75.5k|                                            uint8_t *const dst) {
  768|  75.5k|  const __m256i ra = xy_y_round_16_avx2(r0);
  769|  75.5k|  const __m256i rb = xy_y_round_16_avx2(r1);
  770|  75.5k|  xy_y_pack_store_32_avx2(ra, rb, dst);
  771|  75.5k|}
convolve_2d_avx2.c:prepare_coeffs_4tap_sse2:
  245|  3.83k|    __m128i *const coeffs /* [2] */) {
  246|  3.83k|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  247|  3.83k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  3.83k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  3.83k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  248|       |
  249|  3.83k|  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
  250|       |
  251|       |  // coeffs 2 3 2 3 2 3 2 3
  252|  3.83k|  coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
  253|       |  // coeffs 4 5 4 5 4 5 4 5
  254|       |  coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
  255|  3.83k|}
convolve_2d_avx2.c:xy_y_convolve_4tap_2x2_sse2:
 1577|  5.76k|                                                  const __m128i coeffs[2]) {
 1578|  5.76k|  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + 3 * 2));
 1579|  5.76k|  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
 1580|  5.76k|  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 4 * 2));
 1581|  5.76k|  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
 1582|  5.76k|  ss_128[1] = _mm_unpacklo_epi16(src23, src34);
 1583|  5.76k|  const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
 1584|  5.76k|  ss_128[0] = ss_128[1];
 1585|  5.76k|  return r;
 1586|  5.76k|}
convolve_2d_avx2.c:convolve16_4tap_sse2:
  502|  5.76k|                                           const __m128i coeffs[2]) {
  503|  5.76k|  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
  504|  5.76k|  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
  505|  5.76k|  return _mm_add_epi32(res_01, res_23);
  506|  5.76k|}
convolve_2d_avx2.c:prepare_coeffs_4tap_avx2:
  305|  22.8k|    __m256i *const coeffs /* [2] */) {
  306|  22.8k|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  307|  22.8k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  22.8k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  22.8k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  308|       |
  309|  22.8k|  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  310|  22.8k|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  311|       |
  312|       |  // coeffs 2 3 2 3 2 3 2 3
  313|  22.8k|  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
  314|       |  // coeffs 4 5 4 5 4 5 4 5
  315|       |  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
  316|  22.8k|}
convolve_2d_avx2.c:xy_y_convolve_4tap_4x2_avx2:
 1591|  21.9k|                                                  const __m256i coeffs[2]) {
 1592|  21.9k|  __m256i s_256[2];
 1593|  21.9k|  s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
 1594|  21.9k|  s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
  ------------------
  |  |   29|  21.9k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  21.9k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1595|  21.9k|  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
 1596|       |  s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
  ------------------
  |  |   29|  21.9k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  21.9k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1597|  21.9k|  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
 1598|  21.9k|  const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
 1599|  21.9k|  ss_256[0] = ss_256[1];
 1600|  21.9k|  return r;
 1601|  21.9k|}
convolve_2d_avx2.c:convolve16_4tap_avx2:
  534|   137k|                                           const __m256i coeffs[2]) {
  535|   137k|  const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
  536|   137k|  const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
  537|   137k|  return _mm256_add_epi32(res_1, res_2);
  538|   137k|}
convolve_2d_avx2.c:xy_y_round_store_4x2_avx2:
  751|  52.4k|                                             const ptrdiff_t stride) {
  752|  52.4k|  const __m256i r = xy_y_round_avx2(res);
  753|  52.4k|  const __m256i rr = _mm256_packs_epi32(r, r);
  754|  52.4k|  pack_store_4x2_avx2(rr, dst, stride);
  755|  52.4k|}
convolve_2d_avx2.c:pack_store_4x2_avx2:
  700|  52.4k|                                       const ptrdiff_t stride) {
  701|  52.4k|  const __m256i d = _mm256_packus_epi16(res, res);
  702|  52.4k|  const __m128i d0 = _mm256_castsi256_si128(d);
  703|  52.4k|  const __m128i d1 = _mm256_extracti128_si256(d, 1);
  704|       |
  705|  52.4k|  xx_storel_32(dst, d0);
  706|  52.4k|  xx_storel_32(dst + stride, d1);
  707|  52.4k|}
convolve_2d_avx2.c:xy_y_convolve_4tap_8x2_avx2:
 1613|  13.6k|                                               __m256i r[2]) {
 1614|  13.6k|  __m256i s_256[2];
 1615|  13.6k|  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
 1616|  13.6k|  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
 1617|  13.6k|  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
 1618|  13.6k|  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
 1619|  13.6k|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
 1620|  13.6k|  ss_256[0] = ss_256[1];
 1621|  13.6k|  ss_256[2] = ss_256[3];
 1622|  13.6k|}
convolve_2d_avx2.c:xy_y_convolve_4tap_16_avx2:
 1605|  57.6k|                                              __m256i r[2]) {
 1606|  57.6k|  r[0] = convolve16_4tap_avx2(ss, coeffs);
 1607|  57.6k|  r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
 1608|  57.6k|}
convolve_2d_avx2.c:xy_y_convolve_4tap_8x2_half_pel_avx2:
 1626|  1.38k|    __m256i r[2]) {
 1627|  1.38k|  __m256i a_256[2];
 1628|  1.38k|  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
 1629|  1.38k|  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
 1630|  1.38k|  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
 1631|  1.38k|  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
 1632|  1.38k|  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
 1633|  1.38k|  s_256[0] = s_256[2];
 1634|  1.38k|  s_256[1] = s_256[3];
 1635|  1.38k|}
convolve_2d_avx2.c:xy_y_convolve_4tap_16x2_avx2:
 1639|  6.87k|    __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
 1640|  6.87k|  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
 1641|  6.87k|  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
 1642|  6.87k|  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
 1643|  6.87k|  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
 1644|  6.87k|  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
 1645|  6.87k|  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
 1646|  6.87k|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
 1647|  6.87k|  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
 1648|  6.87k|  ss_256[0] = ss_256[1];
 1649|  6.87k|  ss_256[2] = ss_256[3];
 1650|  6.87k|  tt_256[0] = tt_256[1];
 1651|  6.87k|  tt_256[2] = tt_256[3];
 1652|  6.87k|}
convolve_2d_avx2.c:xy_y_convolve_4tap_16x2_half_pelavx2:
 1674|    750|    __m256i r[4]) {
 1675|    750|  __m256i a_256[2];
 1676|       |
 1677|    750|  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
 1678|    750|  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
 1679|       |
 1680|    750|  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
 1681|    750|  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
 1682|    750|  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);
 1683|       |
 1684|    750|  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
 1685|    750|  a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
 1686|    750|  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);
 1687|       |
 1688|    750|  s_256[0] = s_256[2];
 1689|    750|  s_256[1] = s_256[3];
 1690|    750|  s_256[2] = s_256[4];
 1691|    750|}
convolve_2d_avx2.c:loadu_unpack_16bit_3rows_avx2:
  410|    664|    __m256i ss_256[3], __m256i tt_256[3]) {
  411|    664|  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  412|    664|  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  413|    664|  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  414|       |
  415|    664|  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  416|    664|  ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  417|       |
  418|    664|  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
  419|    664|  tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
  420|    664|}
convolve_2d_avx2.c:xy_y_convolve_4tap_32x2_avx2:
 1657|  9.08k|    __m256i r[4]) {
 1658|  9.08k|  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
 1659|  9.08k|  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
 1660|  9.08k|  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
 1661|  9.08k|  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
 1662|  9.08k|  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
 1663|  9.08k|  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
 1664|  9.08k|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
 1665|  9.08k|  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
 1666|  9.08k|  ss_256[0] = ss_256[1];
 1667|  9.08k|  ss_256[2] = ss_256[3];
 1668|  9.08k|  tt_256[0] = tt_256[1];
 1669|  9.08k|  tt_256[2] = tt_256[3];
 1670|  9.08k|}
convolve_2d_avx2.c:prepare_coeffs_6tap_ssse3:
  259|  1.11k|    __m128i *const coeffs /* [3] */) {
  260|  1.11k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  261|  1.11k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  1.11k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  1.11k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  262|  1.11k|  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
  263|       |
  264|       |  // coeffs 1 2 1 2 1 2 1 2
  265|  1.11k|  coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
  266|       |  // coeffs 3 4 3 4 3 4 3 4
  267|  1.11k|  coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
  268|       |  // coeffs 5 6 5 6 5 6 5 6
  269|  1.11k|  coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
  270|  1.11k|}
convolve_2d_avx2.c:xy_y_convolve_6tap_2x2_sse2:
 1696|  4.44k|                                                  const __m128i coeffs[3]) {
 1697|  4.44k|  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 5 * 2));
 1698|  4.44k|  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
 1699|  4.44k|  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 6 * 2));
 1700|  4.44k|  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
 1701|  4.44k|  ss_128[2] = _mm_unpacklo_epi16(src45, src56);
 1702|  4.44k|  const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
 1703|  4.44k|  ss_128[0] = ss_128[1];
 1704|  4.44k|  ss_128[1] = ss_128[2];
 1705|  4.44k|  return r;
 1706|  4.44k|}
convolve_2d_avx2.c:convolve16_6tap_sse2:
  509|  4.44k|                                           const __m128i coeffs[3]) {
  510|  4.44k|  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
  511|  4.44k|  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
  512|  4.44k|  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
  513|  4.44k|  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
  514|  4.44k|  return _mm_add_epi32(res_0123, res_45);
  515|  4.44k|}
convolve_2d_avx2.c:prepare_coeffs_6tap_avx2:
  320|  16.7k|    __m256i *const coeffs /* [3]*/) {
  321|  16.7k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  322|  16.7k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  16.7k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  16.7k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  323|  16.7k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  324|  16.7k|  const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);
  325|       |
  326|       |  // coeffs 1 2 1 2 1 2 1 2
  327|  16.7k|  coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
  328|       |  // coeffs 3 4 3 4 3 4 3 4
  329|  16.7k|  coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
  330|       |  // coeffs 5 6 5 6 5 6 5 6
  331|  16.7k|  coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
  332|  16.7k|}
convolve_2d_avx2.c:xy_y_convolve_6tap_4x2_avx2:
 1711|  27.5k|                                                  const __m256i coeffs[3]) {
 1712|  27.5k|  __m256i s_256[2];
 1713|  27.5k|  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
 1714|  27.5k|  s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
  ------------------
  |  |   29|  27.5k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  27.5k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1715|  27.5k|  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
 1716|       |  s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
  ------------------
  |  |   29|  27.5k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  27.5k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1717|  27.5k|  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
 1718|  27.5k|  const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
 1719|  27.5k|  ss_256[0] = ss_256[1];
 1720|  27.5k|  ss_256[1] = ss_256[2];
 1721|  27.5k|  return r;
 1722|  27.5k|}
convolve_2d_avx2.c:convolve16_6tap_avx2:
  541|   384k|                                           const __m256i coeffs[3]) {
  542|   384k|  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
  543|   384k|  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
  544|   384k|  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
  545|   384k|  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
  546|   384k|  return _mm256_add_epi32(res_0123, res_45);
  547|   384k|}
convolve_2d_avx2.c:xy_y_convolve_6tap_8x2_avx2:
 1734|  29.4k|                                               __m256i r[2]) {
 1735|  29.4k|  __m256i s_256[2];
 1736|  29.4k|  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
 1737|  29.4k|  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
 1738|  29.4k|  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
 1739|  29.4k|  ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
 1740|  29.4k|  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
 1741|  29.4k|  ss_256[0] = ss_256[1];
 1742|  29.4k|  ss_256[1] = ss_256[2];
 1743|  29.4k|  ss_256[3] = ss_256[4];
 1744|  29.4k|  ss_256[4] = ss_256[5];
 1745|  29.4k|}
convolve_2d_avx2.c:xy_y_convolve_6tap_16_avx2:
 1726|   178k|                                              __m256i r[2]) {
 1727|   178k|  r[0] = convolve16_6tap_avx2(ss, coeffs);
 1728|   178k|  r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
 1729|   178k|}
convolve_2d_avx2.c:xy_y_convolve_6tap_8x2_half_pel_avx2:
 1749|  4.00k|    __m256i r[2]) {
 1750|  4.00k|  __m256i a_256[2], ss_256[4];
 1751|  4.00k|  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
 1752|  4.00k|  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
 1753|  4.00k|  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
 1754|  4.00k|  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
 1755|  4.00k|  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
 1756|  4.00k|  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
 1757|  4.00k|  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
 1758|  4.00k|  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
 1759|  4.00k|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
 1760|  4.00k|  s_256[0] = s_256[2];
 1761|  4.00k|  s_256[1] = s_256[3];
 1762|  4.00k|  s_256[2] = s_256[4];
 1763|  4.00k|  s_256[3] = s_256[5];
 1764|  4.00k|}
convolve_2d_avx2.c:xy_y_convolve_6tap_16x2_avx2:
 1769|  74.5k|    __m256i r[4]) {
 1770|  74.5k|  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
 1771|  74.5k|  ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
 1772|  74.5k|  ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
 1773|  74.5k|  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
 1774|  74.5k|  tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
 1775|  74.5k|  tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
 1776|       |
 1777|  74.5k|  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
 1778|  74.5k|  xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
 1779|       |
 1780|  74.5k|  ss_256[0] = ss_256[1];
 1781|  74.5k|  ss_256[1] = ss_256[2];
 1782|  74.5k|  ss_256[3] = ss_256[4];
 1783|  74.5k|  ss_256[4] = ss_256[5];
 1784|       |
 1785|  74.5k|  tt_256[0] = tt_256[1];
 1786|  74.5k|  tt_256[1] = tt_256[2];
 1787|  74.5k|  tt_256[3] = tt_256[4];
 1788|  74.5k|  tt_256[4] = tt_256[5];
 1789|  74.5k|}
convolve_2d_avx2.c:xy_y_convolve_6tap_16x2_half_pel_avx2:
 1793|  3.48k|    __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
 1794|  3.48k|  __m256i a_256[2];
 1795|       |
 1796|  3.48k|  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
 1797|  3.48k|  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
 1798|  3.48k|  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
 1799|  3.48k|  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
 1800|  3.48k|  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
 1801|  3.48k|  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
 1802|  3.48k|  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
 1803|  3.48k|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
 1804|       |
 1805|  3.48k|  a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
 1806|  3.48k|  s_256[0] = s_256[2];
 1807|  3.48k|  s_256[2] = s_256[4];
 1808|  3.48k|  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
 1809|  3.48k|  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
 1810|  3.48k|  s_256[1] = s_256[3];
 1811|  3.48k|  s_256[3] = s_256[5];
 1812|  3.48k|  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
 1813|  3.48k|  ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
 1814|  3.48k|  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
 1815|  3.48k|  ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
 1816|  3.48k|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
 1817|  3.48k|}
convolve_2d_avx2.c:loadu_unpack_16bit_5rows_avx2:
  390|  3.57k|    __m256i ss_256[5], __m256i tt_256[5]) {
  391|  3.57k|  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  392|  3.57k|  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  393|  3.57k|  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  394|  3.57k|  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  395|  3.57k|  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
  396|       |
  397|  3.57k|  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  398|  3.57k|  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
  399|  3.57k|  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  400|  3.57k|  ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
  401|       |
  402|  3.57k|  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
  403|  3.57k|  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
  404|  3.57k|  tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
  405|  3.57k|  tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
  406|  3.57k|}
convolve_2d_avx2.c:prepare_coeffs_8tap_sse2:
  274|     52|    __m128i *const coeffs /* [4] */) {
  275|     52|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  276|     52|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|     52|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|     52|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  277|       |
  278|     52|  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
  279|       |
  280|       |  // coeffs 0 1 0 1 0 1 0 1
  281|     52|  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
  282|       |  // coeffs 2 3 2 3 2 3 2 3
  283|     52|  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
  284|       |  // coeffs 4 5 4 5 4 5 4 5
  285|     52|  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
  286|       |  // coeffs 6 7 6 7 6 7 6 7
  287|       |  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
  288|     52|}
convolve_2d_avx2.c:xy_y_convolve_8tap_2x2_sse2:
 1822|    208|                                                  const __m128i coeffs[4]) {
 1823|    208|  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * 2));
 1824|    208|  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
 1825|    208|  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * 2));
 1826|    208|  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
 1827|    208|  ss_128[3] = _mm_unpacklo_epi16(src67, src78);
 1828|    208|  const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
 1829|    208|  ss_128[0] = ss_128[1];
 1830|    208|  ss_128[1] = ss_128[2];
 1831|    208|  ss_128[2] = ss_128[3];
 1832|    208|  return r;
 1833|    208|}
convolve_2d_avx2.c:convolve16_8tap_sse2:
  518|    208|                                           const __m128i coeffs[4]) {
  519|    208|  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
  520|    208|  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
  521|    208|  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
  522|    208|  const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
  523|    208|  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
  524|    208|  const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
  525|    208|  return _mm_add_epi32(res_0123, res_4567);
  526|    208|}
convolve_2d_avx2.c:prepare_coeffs_8tap_avx2:
  336|  1.42k|    __m256i *const coeffs /* [4] */) {
  337|  1.42k|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  338|  1.42k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  1.42k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  1.42k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  339|       |
  340|  1.42k|  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  341|  1.42k|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  342|       |
  343|       |  // coeffs 0 1 0 1 0 1 0 1
  344|  1.42k|  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  345|       |  // coeffs 2 3 2 3 2 3 2 3
  346|  1.42k|  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
  347|       |  // coeffs 4 5 4 5 4 5 4 5
  348|  1.42k|  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
  349|       |  // coeffs 6 7 6 7 6 7 6 7
  350|       |  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
  351|  1.42k|}
convolve_2d_avx2.c:xy_y_convolve_8tap_4x2_avx2:
 1838|  2.91k|                                                  const __m256i coeffs[4]) {
 1839|  2.91k|  __m256i s_256[2];
 1840|  2.91k|  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
 1841|  2.91k|  s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
  ------------------
  |  |   29|  2.91k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  2.91k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1842|  2.91k|  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
 1843|       |  s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
  ------------------
  |  |   29|  2.91k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  2.91k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1844|  2.91k|  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
 1845|  2.91k|  const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
 1846|  2.91k|  ss_256[0] = ss_256[1];
 1847|  2.91k|  ss_256[1] = ss_256[2];
 1848|  2.91k|  ss_256[2] = ss_256[3];
 1849|  2.91k|  return r;
 1850|  2.91k|}
convolve_2d_avx2.c:convolve16_8tap_avx2:
  550|  42.4k|                                           const __m256i coeffs[4]) {
  551|  42.4k|  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
  552|  42.4k|  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
  553|  42.4k|  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
  554|  42.4k|  const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
  555|  42.4k|  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
  556|  42.4k|  const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
  557|  42.4k|  return _mm256_add_epi32(res_0123, res_4567);
  558|  42.4k|}
convolve_2d_avx2.c:convolve_8tap_unpack_avx2:
  423|  1.85k|                                             __m256i ss[7]) {
  424|  1.85k|  ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
  425|  1.85k|  ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
  426|  1.85k|  ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
  427|  1.85k|  ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
  428|  1.85k|  ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
  429|  1.85k|  ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
  430|  1.85k|}
convolve_2d_avx2.c:xy_y_convolve_8tap_8x2_avx2:
 1862|  2.05k|                                               __m256i r[2]) {
 1863|  2.05k|  __m256i s_256[2];
 1864|  2.05k|  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
 1865|  2.05k|  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
 1866|  2.05k|  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
 1867|  2.05k|  ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
 1868|  2.05k|  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
 1869|  2.05k|  ss_256[0] = ss_256[1];
 1870|  2.05k|  ss_256[1] = ss_256[2];
 1871|  2.05k|  ss_256[2] = ss_256[3];
 1872|  2.05k|  ss_256[4] = ss_256[5];
 1873|  2.05k|  ss_256[5] = ss_256[6];
 1874|  2.05k|  ss_256[6] = ss_256[7];
 1875|  2.05k|}
convolve_2d_avx2.c:xy_y_convolve_8tap_16_avx2:
 1854|  19.7k|                                              __m256i r[2]) {
 1855|  19.7k|  r[0] = convolve16_8tap_avx2(ss, coeffs);
 1856|  19.7k|  r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
 1857|  19.7k|}
convolve_2d_avx2.c:xy_y_convolve_8tap_8x2_half_pel_avx2:
 1879|    792|    __m256i r[2]) {
 1880|    792|  __m256i a_256[4], ss_256[4];
 1881|       |
 1882|    792|  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
 1883|    792|  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
 1884|    792|  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
 1885|    792|  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
 1886|    792|  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
 1887|    792|  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
 1888|    792|  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
 1889|    792|  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
 1890|    792|  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
 1891|    792|  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
 1892|    792|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
 1893|    792|  s_256[0] = s_256[2];
 1894|    792|  s_256[1] = s_256[3];
 1895|    792|  s_256[2] = s_256[4];
 1896|    792|  s_256[3] = s_256[5];
 1897|    792|  s_256[4] = s_256[6];
 1898|    792|  s_256[5] = s_256[7];
 1899|    792|}
convolve_2d_avx2.c:load_16bit_7rows_avx2:
  365|    760|                                         __m256i dst[7]) {
  366|    760|  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  367|    760|  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  368|    760|  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  369|    760|  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  370|    760|  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
  371|    760|  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
  372|    760|  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
  373|    760|}
convolve_2d_avx2.c:xy_y_convolve_8tap_16x2_avx2:
 1903|  8.84k|    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
 1904|  8.84k|  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
 1905|  8.84k|  ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
 1906|  8.84k|  ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
 1907|  8.84k|  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
 1908|  8.84k|  tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
 1909|  8.84k|  tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
 1910|       |
 1911|  8.84k|  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
 1912|  8.84k|  xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
 1913|       |
 1914|  8.84k|  ss_256[0] = ss_256[1];
 1915|  8.84k|  ss_256[1] = ss_256[2];
 1916|  8.84k|  ss_256[2] = ss_256[3];
 1917|  8.84k|  ss_256[4] = ss_256[5];
 1918|  8.84k|  ss_256[5] = ss_256[6];
 1919|  8.84k|  ss_256[6] = ss_256[7];
 1920|       |
 1921|  8.84k|  tt_256[0] = tt_256[1];
 1922|  8.84k|  tt_256[1] = tt_256[2];
 1923|  8.84k|  tt_256[2] = tt_256[3];
 1924|  8.84k|  tt_256[4] = tt_256[5];
 1925|  8.84k|  tt_256[5] = tt_256[6];
 1926|  8.84k|  tt_256[6] = tt_256[7];
 1927|  8.84k|}
convolve_2d_avx2.c:xy_y_convolve_8tap_16x2_half_pel_avx2:
 1931|    188|    __m256i s_256[8], __m256i r[4]) {
 1932|    188|  __m256i a_256[4], ss_256[4];
 1933|    188|  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
 1934|       |
 1935|    188|  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
 1936|    188|  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
 1937|    188|  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
 1938|    188|  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
 1939|    188|  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
 1940|    188|  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
 1941|    188|  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
 1942|    188|  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
 1943|       |
 1944|    188|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
 1945|       |
 1946|    188|  a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
 1947|    188|  a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
 1948|    188|  a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
 1949|    188|  s_256[0] = s_256[2];
 1950|    188|  s_256[2] = s_256[4];
 1951|    188|  s_256[4] = s_256[6];
 1952|    188|  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
 1953|       |
 1954|    188|  a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
 1955|    188|  s_256[1] = s_256[3];
 1956|    188|  s_256[3] = s_256[5];
 1957|    188|  s_256[5] = s_256[7];
 1958|    188|  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
 1959|    188|  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
 1960|    188|  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
 1961|    188|  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
 1962|       |
 1963|    188|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
 1964|    188|}
convolve_avx2.c:av1_convolve_y_sr_specialized_avx2:
 2008|  14.4k|    const int32_t subpel_y_q4) {
 2009|  14.4k|  int32_t x, y;
 2010|  14.4k|  __m128i coeffs_128[4];
 2011|  14.4k|  __m256i coeffs_256[4];
 2012|       |
 2013|  14.4k|  int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
 2014|       |
 2015|  14.4k|  if (vert_tap == 2) {
  ------------------
  |  Branch (2015:7): [True: 849, False: 13.5k]
  ------------------
 2016|       |    // vert_filt as 2 tap
 2017|    849|    const uint8_t *src_ptr = src;
 2018|       |
 2019|    849|    y = h;
 2020|       |
 2021|    849|    if (subpel_y_q4 != 8) {
  ------------------
  |  Branch (2021:9): [True: 629, False: 220]
  ------------------
 2022|    629|      if (w <= 8) {
  ------------------
  |  Branch (2022:11): [True: 453, False: 176]
  ------------------
 2023|    453|        prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
 2024|    453|                                       coeffs_128);
 2025|       |
 2026|    453|        if (w == 2) {
  ------------------
  |  Branch (2026:13): [True: 40, False: 413]
  ------------------
 2027|     40|          __m128i s_16[2];
 2028|       |
 2029|     40|          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
 2030|       |
 2031|    140|          do {
 2032|    140|            const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
 2033|    140|                                                          coeffs_128, s_16);
 2034|    140|            const __m128i r = sr_y_round_sse2(res);
 2035|    140|            pack_store_2x2_sse2(r, dst, dst_stride);
 2036|    140|            src_ptr += 2 * src_stride;
 2037|    140|            dst += 2 * dst_stride;
 2038|    140|            y -= 2;
 2039|    140|          } while (y);
  ------------------
  |  Branch (2039:20): [True: 100, False: 40]
  ------------------
 2040|    413|        } else if (w == 4) {
  ------------------
  |  Branch (2040:20): [True: 177, False: 236]
  ------------------
 2041|    177|          __m128i s_32[2];
 2042|       |
 2043|    177|          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
 2044|       |
 2045|    732|          do {
 2046|    732|            const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
 2047|    732|                                                          coeffs_128, s_32);
 2048|    732|            const __m128i r = sr_y_round_sse2(res);
 2049|    732|            pack_store_4x2_sse2(r, dst, dst_stride);
 2050|    732|            src_ptr += 2 * src_stride;
 2051|    732|            dst += 2 * dst_stride;
 2052|    732|            y -= 2;
 2053|    732|          } while (y);
  ------------------
  |  Branch (2053:20): [True: 555, False: 177]
  ------------------
 2054|    236|        } else {
 2055|    236|          __m128i s_64[2], s_128[2];
 2056|       |
 2057|    236|          assert(w == 8);
 2058|       |
 2059|    236|          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
 2060|       |
 2061|  1.18k|          do {
 2062|       |            // Note: Faster than binding to AVX2 registers.
 2063|  1.18k|            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
 2064|  1.18k|            s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
 2065|  1.18k|            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
 2066|  1.18k|            s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
 2067|  1.18k|            const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
 2068|  1.18k|            const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
 2069|  1.18k|            const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
 2070|  1.18k|            const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
 2071|  1.18k|            const __m128i r0 = sr_y_round_sse2(res0);
 2072|  1.18k|            const __m128i r1 = sr_y_round_sse2(res1);
 2073|  1.18k|            const __m128i d = _mm_packus_epi16(r0, r1);
 2074|  1.18k|            _mm_storel_epi64((__m128i *)dst, d);
 2075|  1.18k|            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
 2076|  1.18k|            src_ptr += 2 * src_stride;
 2077|  1.18k|            dst += 2 * dst_stride;
 2078|  1.18k|            y -= 2;
 2079|  1.18k|          } while (y);
  ------------------
  |  Branch (2079:20): [True: 952, False: 236]
  ------------------
 2080|    236|        }
 2081|    453|      } else {
 2082|    176|        prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
 2083|       |
 2084|    176|        if (w == 16) {
  ------------------
  |  Branch (2084:13): [True: 121, False: 55]
  ------------------
 2085|    121|          __m128i s_128[2];
 2086|       |
 2087|    121|          s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
 2088|       |
 2089|    776|          do {
 2090|    776|            __m256i r[2];
 2091|       |
 2092|    776|            y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
 2093|    776|                                      r);
 2094|    776|            sr_y_round_store_16x2_avx2(r, dst, dst_stride);
 2095|    776|            src_ptr += 2 * src_stride;
 2096|    776|            dst += 2 * dst_stride;
 2097|    776|            y -= 2;
 2098|    776|          } while (y);
  ------------------
  |  Branch (2098:20): [True: 655, False: 121]
  ------------------
 2099|    121|        } else if (w == 32) {
  ------------------
  |  Branch (2099:20): [True: 36, False: 19]
  ------------------
 2100|     36|          __m256i s_256[2];
 2101|       |
 2102|     36|          s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
 2103|       |
 2104|    536|          do {
 2105|    536|            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
 2106|    536|                              &s_256[1], dst);
 2107|    536|            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
 2108|    536|                              &s_256[0], dst + dst_stride);
 2109|    536|            src_ptr += 2 * src_stride;
 2110|    536|            dst += 2 * dst_stride;
 2111|    536|            y -= 2;
 2112|    536|          } while (y);
  ------------------
  |  Branch (2112:20): [True: 500, False: 36]
  ------------------
 2113|     36|        } else if (w == 64) {
  ------------------
  |  Branch (2113:20): [True: 19, False: 0]
  ------------------
 2114|     19|          __m256i s_256[2][2];
 2115|       |
 2116|     19|          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
 2117|     19|          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
 2118|       |
 2119|    320|          do {
 2120|    320|            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
 2121|    320|                              &s_256[1][0], dst);
 2122|    320|            sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
 2123|    320|                              s_256[0][1], &s_256[1][1], dst + 32);
 2124|    320|            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
 2125|    320|                              &s_256[0][0], dst + dst_stride);
 2126|    320|            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
 2127|    320|                              s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
 2128|       |
 2129|    320|            src_ptr += 2 * src_stride;
 2130|    320|            dst += 2 * dst_stride;
 2131|    320|            y -= 2;
 2132|    320|          } while (y);
  ------------------
  |  Branch (2132:20): [True: 301, False: 19]
  ------------------
 2133|     19|        } else {
 2134|      0|          __m256i s_256[2][4];
 2135|       |
 2136|      0|          assert(w == 128);
 2137|       |
 2138|      0|          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
 2139|      0|          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
 2140|      0|          s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
 2141|      0|          s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
 2142|       |
 2143|      0|          do {
 2144|      0|            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
 2145|      0|                              &s_256[1][0], dst);
 2146|      0|            sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
 2147|      0|                              s_256[0][1], &s_256[1][1], dst + 1 * 32);
 2148|      0|            sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
 2149|      0|                              s_256[0][2], &s_256[1][2], dst + 2 * 32);
 2150|      0|            sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
 2151|      0|                              s_256[0][3], &s_256[1][3], dst + 3 * 32);
 2152|       |
 2153|      0|            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
 2154|      0|                              &s_256[0][0], dst + dst_stride);
 2155|      0|            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
 2156|      0|                              s_256[1][1], &s_256[0][1],
 2157|      0|                              dst + dst_stride + 1 * 32);
 2158|      0|            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
 2159|      0|                              s_256[1][2], &s_256[0][2],
 2160|      0|                              dst + dst_stride + 2 * 32);
 2161|      0|            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
 2162|      0|                              s_256[1][3], &s_256[0][3],
 2163|      0|                              dst + dst_stride + 3 * 32);
 2164|       |
 2165|      0|            src_ptr += 2 * src_stride;
 2166|      0|            dst += 2 * dst_stride;
 2167|      0|            y -= 2;
 2168|      0|          } while (y);
  ------------------
  |  Branch (2168:20): [True: 0, False: 0]
  ------------------
 2169|      0|        }
 2170|    176|      }
 2171|    629|    } else {
 2172|       |      // average to get half pel
 2173|    220|      if (w <= 8) {
  ------------------
  |  Branch (2173:11): [True: 104, False: 116]
  ------------------
 2174|    104|        if (w == 2) {
  ------------------
  |  Branch (2174:13): [True: 8, False: 96]
  ------------------
 2175|      8|          __m128i s_16[2];
 2176|       |
 2177|      8|          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
 2178|       |
 2179|     16|          do {
 2180|     16|            s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
 2181|     16|            const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
 2182|     16|            *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
 2183|     16|            s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
 2184|     16|            const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
 2185|     16|            *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
 2186|     16|            src_ptr += 2 * src_stride;
 2187|     16|            dst += 2 * dst_stride;
 2188|     16|            y -= 2;
 2189|     16|          } while (y);
  ------------------
  |  Branch (2189:20): [True: 8, False: 8]
  ------------------
 2190|     96|        } else if (w == 4) {
  ------------------
  |  Branch (2190:20): [True: 62, False: 34]
  ------------------
 2191|     62|          __m128i s_32[2];
 2192|       |
 2193|     62|          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
 2194|       |
 2195|    292|          do {
 2196|    292|            s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + src_stride));
 2197|    292|            const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
 2198|    292|            xx_storel_32(dst, d0);
 2199|    292|            s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
 2200|    292|            const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
 2201|    292|            xx_storel_32(dst + dst_stride, d1);
 2202|    292|            src_ptr += 2 * src_stride;
 2203|    292|            dst += 2 * dst_stride;
 2204|    292|            y -= 2;
 2205|    292|          } while (y);
  ------------------
  |  Branch (2205:20): [True: 230, False: 62]
  ------------------
 2206|     62|        } else {
 2207|     34|          __m128i s_64[2];
 2208|       |
 2209|     34|          assert(w == 8);
 2210|       |
 2211|     34|          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
 2212|       |
 2213|    312|          do {
 2214|       |            // Note: Faster than binding to AVX2 registers.
 2215|    312|            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
 2216|    312|            const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
 2217|    312|            _mm_storel_epi64((__m128i *)dst, d0);
 2218|    312|            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
 2219|    312|            const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
 2220|    312|            _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
 2221|    312|            src_ptr += 2 * src_stride;
 2222|    312|            dst += 2 * dst_stride;
 2223|    312|            y -= 2;
 2224|    312|          } while (y);
  ------------------
  |  Branch (2224:20): [True: 278, False: 34]
  ------------------
 2225|     34|        }
 2226|    116|      } else if (w == 16) {
  ------------------
  |  Branch (2226:18): [True: 56, False: 60]
  ------------------
 2227|     56|        __m128i s_128[2];
 2228|       |
 2229|     56|        s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
 2230|       |
 2231|    704|        do {
 2232|    704|          s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
 2233|    704|          const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
 2234|    704|          _mm_storeu_si128((__m128i *)dst, d0);
 2235|    704|          s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
 2236|    704|          const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
 2237|    704|          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
 2238|    704|          src_ptr += 2 * src_stride;
 2239|    704|          dst += 2 * dst_stride;
 2240|    704|          y -= 2;
 2241|    704|        } while (y);
  ------------------
  |  Branch (2241:18): [True: 648, False: 56]
  ------------------
 2242|     60|      } else if (w == 32) {
  ------------------
  |  Branch (2242:18): [True: 48, False: 12]
  ------------------
 2243|     48|        __m256i s_256[2];
 2244|       |
 2245|     48|        s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
 2246|       |
 2247|    360|        do {
 2248|    360|          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
 2249|    360|          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
 2250|    360|                                dst + dst_stride);
 2251|    360|          src_ptr += 2 * src_stride;
 2252|    360|          dst += 2 * dst_stride;
 2253|    360|          y -= 2;
 2254|    360|        } while (y);
  ------------------
  |  Branch (2254:18): [True: 312, False: 48]
  ------------------
 2255|     48|      } else if (w == 64) {
  ------------------
  |  Branch (2255:18): [True: 12, False: 0]
  ------------------
 2256|     12|        __m256i s_256[2][2];
 2257|       |
 2258|     12|        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
 2259|     12|        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
 2260|       |
 2261|    336|        do {
 2262|    336|          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
 2263|    336|                                dst);
 2264|    336|          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
 2265|    336|                                &s_256[1][1], dst + 32);
 2266|       |
 2267|    336|          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
 2268|    336|                                &s_256[0][0], dst + dst_stride);
 2269|    336|          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
 2270|    336|                                &s_256[0][1], dst + dst_stride + 32);
 2271|       |
 2272|    336|          src_ptr += 2 * src_stride;
 2273|    336|          dst += 2 * dst_stride;
 2274|    336|          y -= 2;
 2275|    336|        } while (y);
  ------------------
  |  Branch (2275:18): [True: 324, False: 12]
  ------------------
 2276|     12|      } else {
 2277|      0|        __m256i s_256[2][4];
 2278|       |
 2279|      0|        assert(w == 128);
 2280|       |
 2281|      0|        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
 2282|      0|        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
 2283|      0|        s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
 2284|      0|        s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
 2285|       |
 2286|      0|        do {
 2287|      0|          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
 2288|      0|                                dst);
 2289|      0|          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
 2290|      0|                                &s_256[1][1], dst + 1 * 32);
 2291|      0|          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
 2292|      0|                                &s_256[1][2], dst + 2 * 32);
 2293|      0|          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
 2294|      0|                                &s_256[1][3], dst + 3 * 32);
 2295|       |
 2296|      0|          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
 2297|      0|                                &s_256[0][0], dst + dst_stride);
 2298|      0|          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
 2299|      0|                                &s_256[0][1], dst + dst_stride + 1 * 32);
 2300|      0|          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
 2301|      0|                                &s_256[0][2], dst + dst_stride + 2 * 32);
 2302|      0|          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
 2303|      0|                                &s_256[0][3], dst + dst_stride + 3 * 32);
 2304|       |
 2305|      0|          src_ptr += 2 * src_stride;
 2306|      0|          dst += 2 * dst_stride;
 2307|      0|          y -= 2;
 2308|      0|        } while (y);
  ------------------
  |  Branch (2308:18): [True: 0, False: 0]
  ------------------
 2309|      0|      }
 2310|    220|    }
 2311|  13.5k|  } else if (vert_tap == 4) {
  ------------------
  |  Branch (2311:14): [True: 7.14k, False: 6.43k]
  ------------------
 2312|       |    // vert_filt as 4 tap
 2313|  7.14k|    const uint8_t *src_ptr = src - src_stride;
 2314|       |
 2315|  7.14k|    y = h;
 2316|       |
 2317|  7.14k|    if (w <= 4) {
  ------------------
  |  Branch (2317:9): [True: 3.67k, False: 3.46k]
  ------------------
 2318|  3.67k|      prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
 2319|       |
 2320|  3.67k|      if (w == 2) {
  ------------------
  |  Branch (2320:11): [True: 668, False: 3.00k]
  ------------------
 2321|    668|        __m128i s_16[4], ss_128[2];
 2322|       |
 2323|    668|        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
 2324|    668|        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
 2325|    668|        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
 2326|       |
 2327|    668|        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
 2328|    668|        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
 2329|       |
 2330|    668|        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
 2331|       |
 2332|  1.00k|        do {
 2333|  1.00k|          src_ptr += 2 * src_stride;
 2334|  1.00k|          const __m128i res = y_convolve_4tap_2x2_ssse3(
 2335|  1.00k|              src_ptr, src_stride, coeffs_128, s_16, ss_128);
 2336|  1.00k|          const __m128i r = sr_y_round_sse2(res);
 2337|  1.00k|          pack_store_2x2_sse2(r, dst, dst_stride);
 2338|       |
 2339|  1.00k|          ss_128[0] = ss_128[1];
 2340|  1.00k|          dst += 2 * dst_stride;
 2341|  1.00k|          y -= 2;
 2342|  1.00k|        } while (y);
  ------------------
  |  Branch (2342:18): [True: 340, False: 668]
  ------------------
 2343|  3.00k|      } else {
 2344|  3.00k|        __m128i s_32[4], ss_128[2];
 2345|       |
 2346|  3.00k|        assert(w == 4);
 2347|       |
 2348|  3.00k|        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
 2349|  3.00k|        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
 2350|  3.00k|        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
 2351|       |
 2352|  3.00k|        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
 2353|  3.00k|        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
 2354|       |
 2355|  3.00k|        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
 2356|       |
 2357|  6.15k|        do {
 2358|  6.15k|          src_ptr += 2 * src_stride;
 2359|  6.15k|          const __m128i res = y_convolve_4tap_4x2_ssse3(
 2360|  6.15k|              src_ptr, src_stride, coeffs_128, s_32, ss_128);
 2361|  6.15k|          const __m128i r = sr_y_round_sse2(res);
 2362|  6.15k|          pack_store_4x2_sse2(r, dst, dst_stride);
 2363|       |
 2364|  6.15k|          ss_128[0] = ss_128[1];
 2365|  6.15k|          dst += 2 * dst_stride;
 2366|  6.15k|          y -= 2;
 2367|  6.15k|        } while (y);
  ------------------
  |  Branch (2367:18): [True: 3.14k, False: 3.00k]
  ------------------
 2368|  3.00k|      }
 2369|  3.67k|    } else {
 2370|  3.46k|      prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
 2371|       |
 2372|  3.46k|      if (w == 8) {
  ------------------
  |  Branch (2372:11): [True: 2.35k, False: 1.11k]
  ------------------
 2373|  2.35k|        __m128i s_64[4];
 2374|  2.35k|        __m256i ss_256[2];
 2375|       |
 2376|  2.35k|        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
 2377|  2.35k|        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
 2378|  2.35k|        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
 2379|       |
 2380|       |        // Load lines a and b. Line a to lower 128, line b to upper 128
 2381|  2.35k|        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
  ------------------
  |  |   29|  2.35k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  2.35k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2382|  2.35k|        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
  ------------------
  |  |   29|  2.35k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  2.35k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2383|       |
 2384|  2.35k|        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
 2385|       |
 2386|  5.04k|        do {
 2387|  5.04k|          src_ptr += 2 * src_stride;
 2388|  5.04k|          const __m256i res = y_convolve_4tap_8x2_avx2(
 2389|  5.04k|              src_ptr, src_stride, coeffs_256, s_64, ss_256);
 2390|  5.04k|          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
 2391|       |
 2392|  5.04k|          ss_256[0] = ss_256[1];
 2393|  5.04k|          dst += 2 * dst_stride;
 2394|  5.04k|          y -= 2;
 2395|  5.04k|        } while (y);
  ------------------
  |  Branch (2395:18): [True: 2.68k, False: 2.35k]
  ------------------
 2396|  2.35k|      } else if (w == 16) {
  ------------------
  |  Branch (2396:18): [True: 944, False: 171]
  ------------------
 2397|    944|        __m128i s_128[4];
 2398|    944|        __m256i ss_256[4], r[2];
 2399|       |
 2400|    944|        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
 2401|    944|        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
 2402|    944|        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
 2403|       |
 2404|       |        // Load lines a and b. Line a to lower 128, line b to upper 128
 2405|    944|        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
  ------------------
  |  |   29|    944|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    944|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2406|    944|        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
  ------------------
  |  |   29|    944|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    944|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2407|       |
 2408|    944|        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
 2409|    944|        ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
 2410|       |
 2411|  2.99k|        do {
 2412|  2.99k|          src_ptr += 2 * src_stride;
 2413|  2.99k|          y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
 2414|  2.99k|                                    ss_256, r);
 2415|  2.99k|          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
 2416|       |
 2417|  2.99k|          ss_256[0] = ss_256[1];
 2418|  2.99k|          ss_256[2] = ss_256[3];
 2419|  2.99k|          dst += 2 * dst_stride;
 2420|  2.99k|          y -= 2;
 2421|  2.99k|        } while (y);
  ------------------
  |  Branch (2421:18): [True: 2.04k, False: 944]
  ------------------
 2422|    944|      } else if (w == 32) {
  ------------------
  |  Branch (2422:18): [True: 125, False: 46]
  ------------------
 2423|       |        // The AV1 standard has no 32x4 block size.
 2424|       |        // This path only serves an optimization feature that
 2425|       |        // subsamples 32x8 to 32x4, which triggers the 4-tap filter.
 2426|       |
 2427|    125|        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
 2428|       |
 2429|    125|        s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
 2430|    125|        s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
 2431|    125|        s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
 2432|       |
 2433|    125|        ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
 2434|    125|        ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
 2435|       |
 2436|    125|        tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
 2437|    125|        tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
 2438|       |
 2439|  1.34k|        do {
 2440|  1.34k|          src_ptr += 2 * src_stride;
 2441|  1.34k|          y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
 2442|  1.34k|                                    ss_256, tt_256, r);
 2443|  1.34k|          sr_y_round_store_32x2_avx2(r, dst, dst_stride);
 2444|       |
 2445|  1.34k|          ss_256[0] = ss_256[1];
 2446|  1.34k|          ss_256[2] = ss_256[3];
 2447|       |
 2448|  1.34k|          tt_256[0] = tt_256[1];
 2449|  1.34k|          tt_256[2] = tt_256[3];
 2450|  1.34k|          dst += 2 * dst_stride;
 2451|  1.34k|          y -= 2;
 2452|  1.34k|        } while (y);
  ------------------
  |  Branch (2452:18): [True: 1.21k, False: 125]
  ------------------
 2453|    125|      } else {
 2454|     46|        assert(!(w % 32));
 2455|       |
 2456|     46|        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
 2457|     46|        x = 0;
 2458|    104|        do {
 2459|    104|          const uint8_t *s = src_ptr + x;
 2460|    104|          uint8_t *d = dst + x;
 2461|    104|          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
 2462|    104|          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
 2463|    104|          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
 2464|       |
 2465|    104|          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
 2466|    104|          ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
 2467|       |
 2468|    104|          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
 2469|    104|          tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
 2470|       |
 2471|    104|          y = h;
 2472|  3.13k|          do {
 2473|  3.13k|            s += 2 * src_stride;
 2474|  3.13k|            y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
 2475|  3.13k|                                      tt_256, r);
 2476|  3.13k|            sr_y_round_store_32x2_avx2(r, d, dst_stride);
 2477|       |
 2478|  3.13k|            ss_256[0] = ss_256[1];
 2479|  3.13k|            ss_256[2] = ss_256[3];
 2480|       |
 2481|  3.13k|            tt_256[0] = tt_256[1];
 2482|  3.13k|            tt_256[2] = tt_256[3];
 2483|  3.13k|            d += 2 * dst_stride;
 2484|  3.13k|            y -= 2;
 2485|  3.13k|          } while (y);
  ------------------
  |  Branch (2485:20): [True: 3.03k, False: 104]
  ------------------
 2486|    104|          x += 32;
 2487|    104|        } while (x < w);
  ------------------
  |  Branch (2487:18): [True: 58, False: 46]
  ------------------
 2488|     46|      }
 2489|  3.46k|    }
 2490|  7.14k|  } else if (vert_tap == 6) {
  ------------------
  |  Branch (2490:14): [True: 5.86k, False: 566]
  ------------------
 2491|       |    // vert_filt as 6 tap
 2492|  5.86k|    const uint8_t *src_ptr = src - 2 * src_stride;
 2493|       |
 2494|  5.86k|    if (w <= 4) {
  ------------------
  |  Branch (2494:9): [True: 2.11k, False: 3.75k]
  ------------------
 2495|  2.11k|      prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
 2496|       |
 2497|  2.11k|      y = h;
 2498|       |
 2499|  2.11k|      if (w == 2) {
  ------------------
  |  Branch (2499:11): [True: 312, False: 1.80k]
  ------------------
 2500|    312|        __m128i s_16[6], ss_128[3];
 2501|       |
 2502|    312|        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
 2503|    312|        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
 2504|    312|        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
 2505|    312|        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
 2506|    312|        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
 2507|       |
 2508|    312|        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
 2509|    312|        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
 2510|    312|        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
 2511|    312|        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
 2512|       |
 2513|    312|        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
 2514|    312|        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
 2515|       |
 2516|  1.24k|        do {
 2517|  1.24k|          src_ptr += 2 * src_stride;
 2518|  1.24k|          const __m128i res = y_convolve_6tap_2x2_ssse3(
 2519|  1.24k|              src_ptr, src_stride, coeffs_128, s_16, ss_128);
 2520|  1.24k|          const __m128i r = sr_y_round_sse2(res);
 2521|  1.24k|          pack_store_2x2_sse2(r, dst, dst_stride);
 2522|       |
 2523|  1.24k|          ss_128[0] = ss_128[1];
 2524|  1.24k|          ss_128[1] = ss_128[2];
 2525|  1.24k|          dst += 2 * dst_stride;
 2526|  1.24k|          y -= 2;
 2527|  1.24k|        } while (y);
  ------------------
  |  Branch (2527:18): [True: 936, False: 312]
  ------------------
 2528|  1.80k|      } else {
 2529|  1.80k|        __m128i s_32[6], ss_128[3];
 2530|       |
 2531|  1.80k|        assert(w == 4);
 2532|       |
 2533|  1.80k|        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
 2534|  1.80k|        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
 2535|  1.80k|        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
 2536|  1.80k|        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
 2537|  1.80k|        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
 2538|       |
 2539|  1.80k|        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
 2540|  1.80k|        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
 2541|  1.80k|        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
 2542|  1.80k|        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
 2543|       |
 2544|  1.80k|        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
 2545|  1.80k|        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
 2546|       |
 2547|  9.36k|        do {
 2548|  9.36k|          src_ptr += 2 * src_stride;
 2549|  9.36k|          const __m128i res = y_convolve_6tap_4x2_ssse3(
 2550|  9.36k|              src_ptr, src_stride, coeffs_128, s_32, ss_128);
 2551|  9.36k|          const __m128i r = sr_y_round_sse2(res);
 2552|  9.36k|          pack_store_4x2_sse2(r, dst, dst_stride);
 2553|       |
 2554|  9.36k|          ss_128[0] = ss_128[1];
 2555|  9.36k|          ss_128[1] = ss_128[2];
 2556|  9.36k|          dst += 2 * dst_stride;
 2557|  9.36k|          y -= 2;
 2558|  9.36k|        } while (y);
  ------------------
  |  Branch (2558:18): [True: 7.56k, False: 1.80k]
  ------------------
 2559|  1.80k|      }
 2560|  3.75k|    } else {
 2561|  3.75k|      prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
 2562|       |
 2563|  3.75k|      if (w == 8) {
  ------------------
  |  Branch (2563:11): [True: 2.08k, False: 1.66k]
  ------------------
 2564|  2.08k|        __m128i s_64[6];
 2565|  2.08k|        __m256i ss_256[3];
 2566|       |
 2567|  2.08k|        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
 2568|  2.08k|        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
 2569|  2.08k|        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
 2570|  2.08k|        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
 2571|  2.08k|        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
 2572|       |
 2573|       |        // Load lines a and b. Line a to lower 128, line b to upper 128
 2574|  2.08k|        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
  ------------------
  |  |   29|  2.08k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  2.08k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2575|  2.08k|        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
  ------------------
  |  |   29|  2.08k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  2.08k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2576|  2.08k|        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
  ------------------
  |  |   29|  2.08k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  2.08k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2577|  2.08k|        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
  ------------------
  |  |   29|  2.08k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  2.08k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2578|       |
 2579|  2.08k|        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
 2580|  2.08k|        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
 2581|       |
 2582|  2.08k|        y = h;
 2583|  12.0k|        do {
 2584|  12.0k|          src_ptr += 2 * src_stride;
 2585|  12.0k|          const __m256i res = y_convolve_6tap_8x2_avx2(
 2586|  12.0k|              src_ptr, src_stride, coeffs_256, s_64, ss_256);
 2587|  12.0k|          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
 2588|       |
 2589|  12.0k|          ss_256[0] = ss_256[1];
 2590|  12.0k|          ss_256[1] = ss_256[2];
 2591|  12.0k|          dst += 2 * dst_stride;
 2592|  12.0k|          y -= 2;
 2593|  12.0k|        } while (y);
  ------------------
  |  Branch (2593:18): [True: 9.93k, False: 2.08k]
  ------------------
 2594|  2.08k|      } else if (w == 16) {
  ------------------
  |  Branch (2594:18): [True: 1.12k, False: 544]
  ------------------
 2595|  1.12k|        __m128i s_128[6];
 2596|  1.12k|        __m256i ss_256[6], r[2];
 2597|       |
 2598|  1.12k|        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
 2599|  1.12k|        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
 2600|  1.12k|        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
 2601|  1.12k|        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
 2602|  1.12k|        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
 2603|       |
 2604|       |        // Load lines a and b. Line a to lower 128, line b to upper 128
 2605|  1.12k|        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
  ------------------
  |  |   29|  1.12k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  1.12k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2606|  1.12k|        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
  ------------------
  |  |   29|  1.12k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  1.12k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2607|  1.12k|        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
  ------------------
  |  |   29|  1.12k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  1.12k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2608|  1.12k|        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
  ------------------
  |  |   29|  1.12k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  1.12k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2609|       |
 2610|  1.12k|        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
 2611|  1.12k|        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
 2612|       |
 2613|  1.12k|        ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
 2614|  1.12k|        ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
 2615|       |
 2616|  1.12k|        y = h;
 2617|  8.38k|        do {
 2618|  8.38k|          src_ptr += 2 * src_stride;
 2619|  8.38k|          y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
 2620|  8.38k|                                    ss_256, r);
 2621|  8.38k|          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
 2622|       |
 2623|  8.38k|          ss_256[0] = ss_256[1];
 2624|  8.38k|          ss_256[1] = ss_256[2];
 2625|       |
 2626|  8.38k|          ss_256[3] = ss_256[4];
 2627|  8.38k|          ss_256[4] = ss_256[5];
 2628|  8.38k|          dst += 2 * dst_stride;
 2629|  8.38k|          y -= 2;
 2630|  8.38k|        } while (y);
  ------------------
  |  Branch (2630:18): [True: 7.26k, False: 1.12k]
  ------------------
 2631|  1.12k|      } else {
 2632|    544|        __m256i s_256[6], ss_256[6], tt_256[6], r[4];
 2633|       |
 2634|    544|        assert(!(w % 32));
 2635|       |
 2636|    544|        x = 0;
 2637|    679|        do {
 2638|    679|          const uint8_t *s = src_ptr + x;
 2639|    679|          uint8_t *d = dst + x;
 2640|       |
 2641|    679|          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
 2642|    679|          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
 2643|    679|          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
 2644|    679|          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
 2645|    679|          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
 2646|       |
 2647|    679|          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
 2648|    679|          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
 2649|    679|          ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
 2650|    679|          ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
 2651|       |
 2652|    679|          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
 2653|    679|          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
 2654|    679|          tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
 2655|    679|          tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
 2656|       |
 2657|    679|          y = h;
 2658|  11.8k|          do {
 2659|  11.8k|            s += 2 * src_stride;
 2660|  11.8k|            y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
 2661|  11.8k|                                      tt_256, r);
 2662|  11.8k|            sr_y_round_store_32x2_avx2(r, d, dst_stride);
 2663|       |
 2664|  11.8k|            ss_256[0] = ss_256[1];
 2665|  11.8k|            ss_256[1] = ss_256[2];
 2666|  11.8k|            ss_256[3] = ss_256[4];
 2667|  11.8k|            ss_256[4] = ss_256[5];
 2668|       |
 2669|  11.8k|            tt_256[0] = tt_256[1];
 2670|  11.8k|            tt_256[1] = tt_256[2];
 2671|  11.8k|            tt_256[3] = tt_256[4];
 2672|  11.8k|            tt_256[4] = tt_256[5];
 2673|  11.8k|            d += 2 * dst_stride;
 2674|  11.8k|            y -= 2;
 2675|  11.8k|          } while (y);
  ------------------
  |  Branch (2675:20): [True: 11.1k, False: 679]
  ------------------
 2676|       |
 2677|    679|          x += 32;
 2678|    679|        } while (x < w);
  ------------------
  |  Branch (2678:18): [True: 135, False: 544]
  ------------------
 2679|    544|      }
 2680|  3.75k|    }
 2681|  5.86k|  } else if (vert_tap == 8) {
  ------------------
  |  Branch (2681:14): [True: 566, False: 0]
  ------------------
 2682|       |    // vert_filt as 8 tap
 2683|    566|    const uint8_t *src_ptr = src - 3 * src_stride;
 2684|       |
 2685|    566|    if (w <= 4) {
  ------------------
  |  Branch (2685:9): [True: 179, False: 387]
  ------------------
 2686|    179|      prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
 2687|       |
 2688|    179|      y = h;
 2689|       |
 2690|    179|      if (w == 2) {
  ------------------
  |  Branch (2690:11): [True: 36, False: 143]
  ------------------
 2691|     36|        __m128i s_16[8], ss_128[4];
 2692|       |
 2693|     36|        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
 2694|     36|        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
 2695|     36|        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
 2696|     36|        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
 2697|     36|        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
 2698|     36|        s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
 2699|     36|        s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
 2700|       |
 2701|     36|        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
 2702|     36|        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
 2703|     36|        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
 2704|     36|        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
 2705|     36|        const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
 2706|     36|        const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
 2707|       |
 2708|     36|        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
 2709|     36|        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
 2710|     36|        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
 2711|       |
 2712|    144|        do {
 2713|    144|          const __m128i res = y_convolve_8tap_2x2_ssse3(
 2714|    144|              src_ptr, src_stride, coeffs_128, s_16, ss_128);
 2715|    144|          const __m128i r = sr_y_round_sse2(res);
 2716|    144|          pack_store_2x2_sse2(r, dst, dst_stride);
 2717|    144|          ss_128[0] = ss_128[1];
 2718|    144|          ss_128[1] = ss_128[2];
 2719|    144|          ss_128[2] = ss_128[3];
 2720|    144|          src_ptr += 2 * src_stride;
 2721|    144|          dst += 2 * dst_stride;
 2722|    144|          y -= 2;
 2723|    144|        } while (y);
  ------------------
  |  Branch (2723:18): [True: 108, False: 36]
  ------------------
 2724|    143|      } else {
 2725|    143|        __m128i s_32[8], ss_128[4];
 2726|       |
 2727|    143|        assert(w == 4);
 2728|       |
 2729|    143|        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
 2730|    143|        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
 2731|    143|        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
 2732|    143|        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
 2733|    143|        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
 2734|    143|        s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
 2735|    143|        s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
 2736|       |
 2737|    143|        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
 2738|    143|        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
 2739|    143|        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
 2740|    143|        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
 2741|    143|        const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
 2742|    143|        const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
 2743|       |
 2744|    143|        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
 2745|    143|        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
 2746|    143|        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
 2747|       |
 2748|    740|        do {
 2749|    740|          const __m128i res = y_convolve_8tap_4x2_ssse3(
 2750|    740|              src_ptr, src_stride, coeffs_128, s_32, ss_128);
 2751|    740|          const __m128i r = sr_y_round_sse2(res);
 2752|    740|          pack_store_4x2_sse2(r, dst, dst_stride);
 2753|    740|          ss_128[0] = ss_128[1];
 2754|    740|          ss_128[1] = ss_128[2];
 2755|    740|          ss_128[2] = ss_128[3];
 2756|    740|          src_ptr += 2 * src_stride;
 2757|    740|          dst += 2 * dst_stride;
 2758|    740|          y -= 2;
 2759|    740|        } while (y);
  ------------------
  |  Branch (2759:18): [True: 597, False: 143]
  ------------------
 2760|    143|      }
 2761|    387|    } else {
 2762|    387|      prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
 2763|       |
 2764|    387|      if (w == 8) {
  ------------------
  |  Branch (2764:11): [True: 163, False: 224]
  ------------------
 2765|    163|        __m128i s_64[8];
 2766|    163|        __m256i ss_256[4];
 2767|       |
 2768|    163|        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
 2769|    163|        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
 2770|    163|        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
 2771|    163|        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
 2772|    163|        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
 2773|    163|        s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
 2774|    163|        s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
 2775|       |
 2776|       |        // Load lines a and b. Line a to lower 128, line b to upper 128
 2777|    163|        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
  ------------------
  |  |   29|    163|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    163|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2778|    163|        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
  ------------------
  |  |   29|    163|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    163|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2779|    163|        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
  ------------------
  |  |   29|    163|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    163|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2780|    163|        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
  ------------------
  |  |   29|    163|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    163|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2781|    163|        const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
  ------------------
  |  |   29|    163|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    163|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2782|    163|        const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
  ------------------
  |  |   29|    163|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    163|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2783|       |
 2784|    163|        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
 2785|    163|        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
 2786|    163|        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
 2787|       |
 2788|    163|        y = h;
 2789|    824|        do {
 2790|    824|          const __m256i res = y_convolve_8tap_8x2_avx2(
 2791|    824|              src_ptr, src_stride, coeffs_256, s_64, ss_256);
 2792|    824|          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
 2793|    824|          ss_256[0] = ss_256[1];
 2794|    824|          ss_256[1] = ss_256[2];
 2795|    824|          ss_256[2] = ss_256[3];
 2796|    824|          src_ptr += 2 * src_stride;
 2797|    824|          dst += 2 * dst_stride;
 2798|    824|          y -= 2;
 2799|    824|        } while (y);
  ------------------
  |  Branch (2799:18): [True: 661, False: 163]
  ------------------
 2800|    224|      } else if (w == 16) {
  ------------------
  |  Branch (2800:18): [True: 91, False: 133]
  ------------------
 2801|     91|        __m128i s_128[8];
 2802|     91|        __m256i ss_256[8], r[2];
 2803|       |
 2804|     91|        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
 2805|     91|        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
 2806|     91|        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
 2807|     91|        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
 2808|     91|        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
 2809|     91|        s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
 2810|     91|        s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
 2811|       |
 2812|       |        // Load lines a and b. Line a to lower 128, line b to upper 128
 2813|     91|        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
  ------------------
  |  |   29|     91|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|     91|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2814|     91|        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
  ------------------
  |  |   29|     91|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|     91|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2815|     91|        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
  ------------------
  |  |   29|     91|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|     91|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2816|     91|        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
  ------------------
  |  |   29|     91|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|     91|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2817|     91|        const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
  ------------------
  |  |   29|     91|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|     91|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2818|     91|        const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
  ------------------
  |  |   29|     91|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|     91|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2819|       |
 2820|     91|        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
 2821|     91|        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
 2822|     91|        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
 2823|       |
 2824|     91|        ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
 2825|     91|        ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
 2826|     91|        ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
 2827|       |
 2828|     91|        y = h;
 2829|    644|        do {
 2830|    644|          y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
 2831|    644|                                    ss_256, r);
 2832|    644|          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
 2833|       |
 2834|    644|          ss_256[0] = ss_256[1];
 2835|    644|          ss_256[1] = ss_256[2];
 2836|    644|          ss_256[2] = ss_256[3];
 2837|       |
 2838|    644|          ss_256[4] = ss_256[5];
 2839|    644|          ss_256[5] = ss_256[6];
 2840|    644|          ss_256[6] = ss_256[7];
 2841|    644|          src_ptr += 2 * src_stride;
 2842|    644|          dst += 2 * dst_stride;
 2843|    644|          y -= 2;
 2844|    644|        } while (y);
  ------------------
  |  Branch (2844:18): [True: 553, False: 91]
  ------------------
 2845|    133|      } else {
 2846|    133|        __m256i s_256[8], ss_256[8], tt_256[8], r[4];
 2847|       |
 2848|    133|        assert(!(w % 32));
 2849|       |
 2850|    133|        x = 0;
 2851|    176|        do {
 2852|    176|          const uint8_t *s = src_ptr + x;
 2853|    176|          uint8_t *d = dst + x;
 2854|       |
 2855|    176|          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
 2856|    176|          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
 2857|    176|          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
 2858|    176|          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
 2859|    176|          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
 2860|    176|          s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
 2861|    176|          s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
 2862|       |
 2863|    176|          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
 2864|    176|          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
 2865|    176|          ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
 2866|    176|          ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
 2867|    176|          ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
 2868|    176|          ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
 2869|       |
 2870|    176|          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
 2871|    176|          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
 2872|    176|          tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
 2873|    176|          tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
 2874|    176|          tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
 2875|    176|          tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
 2876|       |
 2877|    176|          y = h;
 2878|  1.80k|          do {
 2879|  1.80k|            y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
 2880|  1.80k|                                      tt_256, r);
 2881|  1.80k|            sr_y_round_store_32x2_avx2(r, d, dst_stride);
 2882|       |
 2883|  1.80k|            ss_256[0] = ss_256[1];
 2884|  1.80k|            ss_256[1] = ss_256[2];
 2885|  1.80k|            ss_256[2] = ss_256[3];
 2886|  1.80k|            ss_256[4] = ss_256[5];
 2887|  1.80k|            ss_256[5] = ss_256[6];
 2888|  1.80k|            ss_256[6] = ss_256[7];
 2889|       |
 2890|  1.80k|            tt_256[0] = tt_256[1];
 2891|  1.80k|            tt_256[1] = tt_256[2];
 2892|  1.80k|            tt_256[2] = tt_256[3];
 2893|  1.80k|            tt_256[4] = tt_256[5];
 2894|  1.80k|            tt_256[5] = tt_256[6];
 2895|  1.80k|            tt_256[6] = tt_256[7];
 2896|  1.80k|            s += 2 * src_stride;
 2897|  1.80k|            d += 2 * dst_stride;
 2898|  1.80k|            y -= 2;
 2899|  1.80k|          } while (y);
  ------------------
  |  Branch (2899:20): [True: 1.62k, False: 176]
  ------------------
 2900|       |
 2901|    176|          x += 32;
 2902|    176|        } while (x < w);
  ------------------
  |  Branch (2902:18): [True: 43, False: 133]
  ------------------
 2903|    133|      }
 2904|    387|    }
 2905|    566|  }
 2906|  14.4k|}
convolve_avx2.c:prepare_half_coeffs_2tap_ssse3:
   61|  1.46k|    __m128i *const coeffs /* [1] */) {
   62|  1.46k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
   63|  1.46k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  1.46k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  1.46k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
   64|  1.46k|  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
   65|       |
   66|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
   67|       |  // This extra right shift will be taken care of at the end while rounding
   68|       |  // the result.
   69|       |  // Since all filter co-efficients are even, this change will not affect the
   70|       |  // end result
   71|  1.46k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
   72|  1.46k|                            _mm_set1_epi16((short)0xffff)));
   73|       |
   74|  1.46k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
   75|       |
   76|       |  // coeffs 3 4 3 4 3 4 3 4
   77|  1.46k|  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
   78|  1.46k|}
convolve_avx2.c:y_convolve_2tap_2x2_ssse3:
 1087|    140|                                                __m128i s_16[2]) {
 1088|    140|  __m128i s_128[2];
 1089|       |
 1090|    140|  s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
 1091|    140|  s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
 1092|    140|  s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
 1093|    140|  s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
 1094|    140|  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
 1095|    140|  return convolve_2tap_ssse3(&ss, coeffs);
 1096|    140|}
convolve_avx2.c:sr_y_round_sse2:
  792|  21.9k|static inline __m128i sr_y_round_sse2(const __m128i src) {
  793|  21.9k|  const __m128i round = _mm_set1_epi16(32);
  794|  21.9k|  const __m128i dst = _mm_add_epi16(src, round);
  795|  21.9k|  return _mm_srai_epi16(dst, FILTER_BITS - 1);
  ------------------
  |  |   21|  21.9k|#define FILTER_BITS 7
  ------------------
  796|  21.9k|}
convolve_avx2.c:pack_store_2x2_sse2:
  687|  4.94k|                                       const ptrdiff_t stride) {
  688|  4.94k|  const __m128i d = _mm_packus_epi16(res, res);
  689|  4.94k|  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
  690|       |  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
  691|  4.94k|}
convolve_avx2.c:y_convolve_2tap_4x2_ssse3:
 1101|    732|                                                __m128i s_32[2]) {
 1102|    732|  __m128i s_128[2];
 1103|       |
 1104|    732|  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride));
 1105|    732|  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
 1106|    732|  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
 1107|    732|  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
 1108|    732|  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
 1109|    732|  return convolve_2tap_ssse3(&ss, coeffs);
 1110|    732|}
convolve_avx2.c:pack_store_4x2_sse2:
  694|  32.4k|                                       const ptrdiff_t stride) {
  695|  32.4k|  const __m128i d = _mm_packus_epi16(res, res);
  696|  32.4k|  store_u8_4x2_sse2(d, dst, stride);
  697|  32.4k|}
convolve_avx2.c:convolve_2tap_ssse3:
  433|  10.5k|                                          const __m128i coeffs[1]) {
  434|  10.5k|  return _mm_maddubs_epi16(ss[0], coeffs[0]);
  435|  10.5k|}
convolve_avx2.c:prepare_half_coeffs_2tap_avx2:
  157|    818|    __m256i *const coeffs /* [1] */) {
  158|    818|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  159|    818|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|    818|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|    818|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  160|    818|  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
  161|    818|  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
  162|       |
  163|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  164|       |  // This extra right shift will be taken care of at the end while rounding
  165|       |  // the result.
  166|       |  // Since all filter co-efficients are even, this change will not affect the
  167|       |  // end result
  168|    818|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  169|    818|                            _mm_set1_epi16((short)0xffff)));
  170|       |
  171|    818|  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
  172|       |
  173|       |  // coeffs 3 4 3 4 3 4 3 4
  174|    818|  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
  175|    818|}
convolve_avx2.c:y_convolve_2tap_16x2_avx2:
 1129|    776|                                             __m128i s_128[2], __m256i r[2]) {
 1130|    776|  __m256i s_256[2];
 1131|       |
 1132|    776|  s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
 1133|    776|  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
  ------------------
  |  |   29|    776|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    776|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1134|    776|  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
 1135|       |  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
  ------------------
  |  |   29|    776|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    776|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1136|    776|  const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
 1137|    776|  const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
 1138|    776|  r[0] = convolve_2tap_avx2(&ss0, coeffs);
 1139|    776|  r[1] = convolve_2tap_avx2(&ss1, coeffs);
 1140|    776|}
convolve_avx2.c:convolve_2tap_avx2:
  465|  46.3k|                                         const __m256i coeffs[1]) {
  466|  46.3k|  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
  467|  46.3k|}
convolve_avx2.c:sr_y_round_store_16x2_avx2:
  833|  12.7k|                                              const ptrdiff_t dst_stride) {
  834|  12.7k|  __m256i r[2];
  835|       |
  836|  12.7k|  r[0] = sr_y_round_avx2(res[0]);
  837|  12.7k|  r[1] = sr_y_round_avx2(res[1]);
  838|  12.7k|  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
  839|  12.7k|}
convolve_avx2.c:sr_y_round_avx2:
  596|   120k|static inline __m256i sr_y_round_avx2(const __m256i src) {
  597|   120k|  const __m256i round = _mm256_set1_epi16(32);
  598|   120k|  const __m256i dst = _mm256_add_epi16(src, round);
  599|   120k|  return _mm256_srai_epi16(dst, FILTER_BITS - 1);
  ------------------
  |  |   21|   120k|#define FILTER_BITS 7
  ------------------
  600|   120k|}
convolve_avx2.c:pack_store_16x2_avx2:
  720|  25.1k|                                        const ptrdiff_t stride) {
  721|  25.1k|  const __m256i d = _mm256_packus_epi16(res0, res1);
  722|  25.1k|  storeu_u8_16x2_avx2(d, dst, stride);
  723|  25.1k|}
convolve_avx2.c:sr_y_2tap_32_avx2:
 1999|  2.35k|                                     __m256i *const s1, uint8_t *const dst) {
 2000|  2.35k|  __m256i r[2];
 2001|  2.35k|  y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
 2002|  2.35k|  sr_y_round_store_32_avx2(r, dst);
 2003|  2.35k|}
convolve_avx2.c:y_convolve_2tap_32_avx2:
 1145|  2.35k|                                           __m256i r[2]) {
 1146|  2.35k|  *s1 = _mm256_loadu_si256((__m256i *)src);
 1147|  2.35k|  const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
 1148|  2.35k|  const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
 1149|  2.35k|  r[0] = convolve_2tap_avx2(&ss0, coeffs);
 1150|  2.35k|  r[1] = convolve_2tap_avx2(&ss1, coeffs);
 1151|  2.35k|}
convolve_avx2.c:sr_y_round_store_32_avx2:
 1982|  38.6k|                                            uint8_t *const dst) {
 1983|  38.6k|  __m256i r[2];
 1984|       |
 1985|  38.6k|  r[0] = sr_y_round_avx2(res[0]);
 1986|  38.6k|  r[1] = sr_y_round_avx2(res[1]);
 1987|  38.6k|  convolve_store_32_avx2(r[0], r[1], dst);
 1988|  38.6k|}
convolve_avx2.c:convolve_store_32_avx2:
  775|  91.2k|                                          uint8_t *const dst) {
  776|  91.2k|  const __m256i d = _mm256_packus_epi16(res0, res1);
  777|  91.2k|  _mm256_storeu_si256((__m256i *)dst, d);
  778|  91.2k|}
convolve_avx2.c:sr_y_2tap_32_avg_avx2:
  843|  2.06k|                                         uint8_t *const dst) {
  844|  2.06k|  *s1 = _mm256_loadu_si256((__m256i *)src);
  845|  2.06k|  const __m256i d = _mm256_avg_epu8(s0, *s1);
  846|  2.06k|  _mm256_storeu_si256((__m256i *)dst, d);
  847|  2.06k|}
convolve_avx2.c:prepare_half_coeffs_4tap_ssse3:
   82|  9.68k|    __m128i *const coeffs /* [2] */) {
   83|  9.68k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
   84|  9.68k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  9.68k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  9.68k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
   85|  9.68k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
   86|       |
   87|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
   88|       |  // This extra right shift will be taken care of at the end while rounding
   89|       |  // the result.
   90|       |  // Since all filter co-efficients are even, this change will not affect the
   91|       |  // end result
   92|  9.68k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
   93|  9.68k|                            _mm_set1_epi16((short)0xffff)));
   94|       |
   95|  9.68k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
   96|       |
   97|       |  // coeffs 2 3 2 3 2 3 2 3
   98|  9.68k|  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
   99|       |  // coeffs 4 5 4 5 4 5 4 5
  100|  9.68k|  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
  101|  9.68k|}
convolve_avx2.c:y_convolve_4tap_2x2_ssse3:
 1157|  1.00k|                                                __m128i ss_128[2]) {
 1158|  1.00k|  s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
 1159|  1.00k|  const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
 1160|  1.00k|  s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
 1161|  1.00k|  const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
 1162|  1.00k|  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
 1163|  1.00k|  return convolve_4tap_ssse3(ss_128, coeffs);
 1164|  1.00k|}
convolve_avx2.c:convolve_4tap_ssse3:
  438|  23.0k|                                          const __m128i coeffs[2]) {
  439|  23.0k|  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  440|  23.0k|  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  441|  23.0k|  return _mm_add_epi16(res_23, res_45);
  442|  23.0k|}
convolve_avx2.c:y_convolve_4tap_4x2_ssse3:
 1170|  6.15k|                                                __m128i ss_128[2]) {
 1171|  6.15k|  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
 1172|  6.15k|  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
 1173|  6.15k|  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
 1174|  6.15k|  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
 1175|  6.15k|  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
 1176|  6.15k|  return convolve_4tap_ssse3(ss_128, coeffs);
 1177|  6.15k|}
convolve_avx2.c:prepare_half_coeffs_4tap_avx2:
  179|  3.46k|    __m256i *const coeffs /* [2] */) {
  180|  3.46k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  181|  3.46k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  3.46k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  3.46k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  182|  3.46k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  183|       |
  184|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  185|       |  // This extra right shift will be taken care of at the end while rounding
  186|       |  // the result.
  187|       |  // Since all filter co-efficients are even, this change will not affect the
  188|       |  // end result
  189|       |  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  190|  3.46k|                            _mm_set1_epi16((short)0xffff)));
  191|  3.46k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  192|  3.46k|  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
  193|  3.46k|}
convolve_avx2.c:populate_coeffs_4tap_avx2:
   24|  3.46k|                                             __m256i coeffs[2]) {
   25|  3.46k|  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
   26|       |
   27|       |  // coeffs 2 3 2 3 2 3 2 3
   28|  3.46k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
   29|       |  // coeffs 4 5 4 5 4 5 4 5
   30|  3.46k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
   31|  3.46k|}
convolve_avx2.c:y_convolve_4tap_8x2_avx2:
 1183|  5.04k|                                               __m256i ss_256[2]) {
 1184|  5.04k|  s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
 1185|  5.04k|  const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
  ------------------
  |  |   29|  5.04k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  5.04k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1186|  5.04k|  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
 1187|       |  const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
  ------------------
  |  |   29|  5.04k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  5.04k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1188|  5.04k|  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
 1189|  5.04k|  return convolve_4tap_avx2(ss_256, coeffs);
 1190|  5.04k|}
convolve_avx2.c:convolve_4tap_avx2:
  470|  28.9k|                                         const __m256i coeffs[2]) {
  471|  28.9k|  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  472|  28.9k|  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  473|  28.9k|  return _mm256_add_epi16(res_23, res_45);
  474|  28.9k|}
convolve_avx2.c:sr_y_round_store_8x2_avx2:
  826|  17.8k|                                             const ptrdiff_t dst_stride) {
  827|  17.8k|  const __m256i r = sr_y_round_avx2(res);
  828|  17.8k|  pack_store_8x2_avx2(r, dst, dst_stride);
  829|  17.8k|}
convolve_avx2.c:pack_store_8x2_avx2:
  710|  33.1k|                                       const ptrdiff_t stride) {
  711|  33.1k|  const __m256i d = _mm256_packus_epi16(res, res);
  712|  33.1k|  const __m128i d0 = _mm256_castsi256_si128(d);
  713|       |  const __m128i d1 = _mm256_extracti128_si256(d, 1);
  714|  33.1k|  _mm_storel_epi64((__m128i *)dst, d0);
  715|  33.1k|  _mm_storel_epi64((__m128i *)(dst + stride), d1);
  716|  33.1k|}
convolve_avx2.c:y_convolve_4tap_16x2_avx2:
 1196|  2.99k|                                             __m256i ss_256[4], __m256i r[2]) {
 1197|  2.99k|  s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
 1198|  2.99k|  const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
  ------------------
  |  |   29|  2.99k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  2.99k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1199|  2.99k|  s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
 1200|       |  const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
  ------------------
  |  |   29|  2.99k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  2.99k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1201|  2.99k|  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
 1202|  2.99k|  ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
 1203|  2.99k|  r[0] = convolve_4tap_avx2(ss_256, coeffs);
 1204|  2.99k|  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
 1205|  2.99k|}
convolve_avx2.c:y_convolve_4tap_32x2_avx2:
 1222|  4.48k|    __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
 1223|  4.48k|  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
 1224|  4.48k|  ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
 1225|  4.48k|  ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
 1226|  4.48k|  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
 1227|  4.48k|  tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
 1228|  4.48k|  tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
 1229|  4.48k|  r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
 1230|  4.48k|  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
 1231|  4.48k|  r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
 1232|  4.48k|  r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
 1233|  4.48k|}
convolve_avx2.c:sr_y_round_store_32x2_avx2:
 1992|  18.1k|                                              const int32_t dst_stride) {
 1993|  18.1k|  sr_y_round_store_32_avx2(res, dst);
 1994|  18.1k|  sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
 1995|  18.1k|}
convolve_avx2.c:prepare_half_coeffs_6tap_ssse3:
  105|  2.11k|    __m128i *const coeffs /* [3] */) {
  106|  2.11k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  107|  2.11k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  2.11k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  2.11k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  108|  2.11k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  109|       |
  110|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  111|       |  // This extra right shift will be taken care of at the end while rounding
  112|       |  // the result.
  113|       |  // Since all filter co-efficients are even, this change will not affect the
  114|       |  // end result
  115|  2.11k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  116|  2.11k|                            _mm_set1_epi16((short)0xffff)));
  117|       |
  118|  2.11k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  119|       |
  120|       |  // coeffs 1 2 1 2 1 2 1 2
  121|  2.11k|  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
  122|       |  // coeffs 3 4 3 4 3 4 3 4
  123|  2.11k|  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
  124|       |  // coeffs 5 6 5 6 5 6 5 6
  125|  2.11k|  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
  126|  2.11k|}
convolve_avx2.c:y_convolve_6tap_2x2_ssse3:
 1211|  1.24k|                                                __m128i ss_128[3]) {
 1212|  1.24k|  s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
 1213|  1.24k|  const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
 1214|  1.24k|  s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
 1215|  1.24k|  const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
 1216|  1.24k|  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
 1217|  1.24k|  return convolve_6tap_ssse3(ss_128, coeffs);
 1218|  1.24k|}
convolve_avx2.c:convolve_6tap_ssse3:
  445|  10.6k|                                          const __m128i coeffs[3]) {
  446|  10.6k|  const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  447|  10.6k|  const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  448|  10.6k|  const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
  449|  10.6k|  const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
  450|  10.6k|  return _mm_add_epi16(res_1256, res_34);
  451|  10.6k|}
convolve_avx2.c:y_convolve_6tap_4x2_ssse3:
 1239|  9.36k|                                                __m128i ss_128[3]) {
 1240|  9.36k|  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
 1241|  9.36k|  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
 1242|  9.36k|  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
 1243|  9.36k|  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
 1244|  9.36k|  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
 1245|  9.36k|  return convolve_6tap_ssse3(ss_128, coeffs);
 1246|  9.36k|}
convolve_avx2.c:prepare_half_coeffs_6tap_avx2:
  197|  9.27k|    __m256i *const coeffs /* [3] */) {
  198|  9.27k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  199|  9.27k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  9.27k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  9.27k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  200|  9.27k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  201|       |
  202|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  203|       |  // This extra right shift will be taken care of at the end while rounding
  204|       |  // the result.
  205|       |  // Since all filter co-efficients are even, this change will not affect the
  206|       |  // end result
  207|       |  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  208|  9.27k|                            _mm_set1_epi16((short)0xffff)));
  209|  9.27k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  210|  9.27k|  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
  211|  9.27k|}
convolve_avx2.c:populate_coeffs_6tap_avx2:
   34|  9.27k|                                             __m256i coeffs[3]) {
   35|  9.27k|  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
   36|       |
   37|       |  // coeffs 1 2 1 2 1 2 1 2
   38|  9.27k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
   39|       |  // coeffs 3 4 3 4 3 4 3 4
   40|  9.27k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
   41|       |  // coeffs 5 6 5 6 5 6 5 6
   42|  9.27k|  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
   43|  9.27k|}
convolve_avx2.c:y_convolve_6tap_8x2_avx2:
 1252|  12.0k|                                               __m256i ss_256[3]) {
 1253|  12.0k|  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
 1254|  12.0k|  const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
  ------------------
  |  |   29|  12.0k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  12.0k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1255|  12.0k|  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
 1256|       |  const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
  ------------------
  |  |   29|  12.0k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  12.0k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1257|  12.0k|  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
 1258|  12.0k|  return convolve_6tap_avx2(ss_256, coeffs);
 1259|  12.0k|}
convolve_avx2.c:convolve_6tap_avx2:
  477|   166k|                                         const __m256i coeffs[3]) {
  478|   166k|  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  479|   166k|  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  480|   166k|  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
  481|   166k|  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
  482|   166k|  return _mm256_add_epi16(res_0145, res_23);
  483|   166k|}
convolve_avx2.c:y_convolve_6tap_16x2_avx2:
 1265|  8.38k|                                             __m256i ss_256[6], __m256i r[2]) {
 1266|  8.38k|  s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
 1267|  8.38k|  const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
  ------------------
  |  |   29|  8.38k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  8.38k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1268|  8.38k|  s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
 1269|       |  const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
  ------------------
  |  |   29|  8.38k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  8.38k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1270|  8.38k|  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
 1271|  8.38k|  ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
 1272|  8.38k|  r[0] = convolve_6tap_avx2(ss_256, coeffs);
 1273|  8.38k|  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
 1274|  8.38k|}
convolve_avx2.c:y_convolve_6tap_32x2_avx2:
 1278|  11.8k|    __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
 1279|  11.8k|  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
 1280|  11.8k|  ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
 1281|  11.8k|  ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
 1282|  11.8k|  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
 1283|  11.8k|  tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
 1284|  11.8k|  tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
 1285|  11.8k|  r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
 1286|  11.8k|  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
 1287|  11.8k|  r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
 1288|  11.8k|  r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
 1289|  11.8k|}
convolve_avx2.c:prepare_half_coeffs_8tap_ssse3:
  130|    179|    __m128i *const coeffs /* [4] */) {
  131|    179|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  132|    179|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|    179|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|    179|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  133|    179|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  134|       |
  135|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  136|       |  // This extra right shift will be taken care of at the end while rounding
  137|       |  // the result.
  138|       |  // Since all filter co-efficients are even, this change will not affect the
  139|       |  // end result
  140|    179|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  141|    179|                            _mm_set1_epi16((short)0xffff)));
  142|       |
  143|    179|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  144|       |
  145|       |  // coeffs 0 1 0 1 0 1 0 1
  146|    179|  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
  147|       |  // coeffs 2 3 2 3 2 3 2 3
  148|    179|  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
  149|       |  // coeffs 4 5 4 5 4 5 4 5
  150|    179|  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
  151|       |  // coeffs 6 7 6 7 6 7 6 7
  152|    179|  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
  153|    179|}
convolve_avx2.c:y_convolve_8tap_2x2_ssse3:
 1295|    144|                                                __m128i ss_128[4]) {
 1296|    144|  s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
 1297|    144|  const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
 1298|    144|  s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
 1299|    144|  const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
 1300|    144|  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
 1301|    144|  return convolve_8tap_ssse3(ss_128, coeffs);
 1302|    144|}
convolve_avx2.c:convolve_8tap_ssse3:
  454|    884|                                          const __m128i coeffs[4]) {
  455|    884|  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  456|    884|  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  457|    884|  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
  458|    884|  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
  459|    884|  const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
  460|    884|  const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
  461|    884|  return _mm_add_epi16(res_0145, res_2367);
  462|    884|}
convolve_avx2.c:y_convolve_8tap_4x2_ssse3:
 1308|    740|                                                __m128i ss_128[4]) {
 1309|    740|  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
 1310|    740|  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
 1311|    740|  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
 1312|    740|  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
 1313|    740|  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
 1314|    740|  return convolve_8tap_ssse3(ss_128, coeffs);
 1315|    740|}
convolve_avx2.c:prepare_half_coeffs_8tap_avx2:
  215|  1.13k|    __m256i *const coeffs /* [4] */) {
  216|  1.13k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  217|  1.13k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  1.13k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  1.13k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  218|  1.13k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  219|       |
  220|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  221|       |  // This extra right shift will be taken care of at the end while rounding
  222|       |  // the result.
  223|       |  // Since all filter co-efficients are even, this change will not affect the
  224|       |  // end result
  225|       |  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  226|  1.13k|                            _mm_set1_epi16((short)0xffff)));
  227|  1.13k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  228|  1.13k|  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
  229|  1.13k|}
convolve_avx2.c:populate_coeffs_8tap_avx2:
   46|  1.13k|                                             __m256i coeffs[4]) {
   47|  1.13k|  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
   48|       |
   49|       |  // coeffs 0 1 0 1 0 1 0 1
   50|  1.13k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
   51|       |  // coeffs 2 3 2 3 2 3 2 3
   52|  1.13k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
   53|       |  // coeffs 4 5 4 5 4 5 4 5
   54|  1.13k|  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
   55|       |  // coeffs 6 7 6 7 6 7 6 7
   56|  1.13k|  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
   57|  1.13k|}
convolve_avx2.c:y_convolve_8tap_8x2_avx2:
 1321|    824|                                               __m256i ss_256[4]) {
 1322|    824|  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
 1323|    824|  const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
  ------------------
  |  |   29|    824|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    824|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1324|    824|  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
 1325|       |  const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
  ------------------
  |  |   29|    824|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    824|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1326|    824|  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
 1327|    824|  return convolve_8tap_avx2(ss_256, coeffs);
 1328|    824|}
convolve_avx2.c:convolve_8tap_avx2:
  486|  24.3k|                                         const __m256i coeffs[4]) {
  487|  24.3k|  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  488|  24.3k|  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  489|  24.3k|  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
  490|  24.3k|  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
  491|  24.3k|  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
  492|  24.3k|  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
  493|  24.3k|  return _mm256_add_epi16(res_0145, res_2367);
  494|  24.3k|}
convolve_avx2.c:y_convolve_8tap_16x2_avx2:
 1334|    644|                                             __m256i ss_256[8], __m256i r[2]) {
 1335|    644|  s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
 1336|    644|  const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
  ------------------
  |  |   29|    644|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    644|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1337|    644|  s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
 1338|       |  const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
  ------------------
  |  |   29|    644|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|    644|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1339|    644|  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
 1340|    644|  ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
 1341|    644|  r[0] = convolve_8tap_avx2(ss_256, coeffs);
 1342|    644|  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
 1343|    644|}
convolve_avx2.c:y_convolve_8tap_32x2_avx2:
 1347|  1.80k|    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
 1348|  1.80k|  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
 1349|  1.80k|  ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
 1350|  1.80k|  ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
 1351|  1.80k|  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
 1352|  1.80k|  tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
 1353|  1.80k|  tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
 1354|  1.80k|  r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
 1355|  1.80k|  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
 1356|  1.80k|  r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
 1357|  1.80k|  r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
 1358|  1.80k|}
convolve_avx2.c:av1_convolve_x_sr_specialized_avx2:
 2940|  14.2k|    const int32_t subpel_x_q4, ConvolveParams *conv_params) {
 2941|  14.2k|  int32_t y = h;
 2942|  14.2k|  __m128i coeffs_128[4];
 2943|  14.2k|  __m256i coeffs_256[4];
 2944|       |
 2945|  14.2k|  assert(conv_params->round_0 == 3);
 2946|  14.2k|  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
 2947|  14.2k|         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
 2948|  14.2k|  (void)conv_params;
 2949|       |
 2950|  14.2k|  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
 2951|       |
 2952|  14.2k|  if (horz_tap == 2) {
  ------------------
  |  Branch (2952:7): [True: 1.97k, False: 12.2k]
  ------------------
 2953|       |    // horz_filt as 2 tap
 2954|  1.97k|    const uint8_t *src_ptr = src;
 2955|       |
 2956|  1.97k|    if (subpel_x_q4 != 8) {
  ------------------
  |  Branch (2956:9): [True: 1.65k, False: 321]
  ------------------
 2957|  1.65k|      if (w <= 8) {
  ------------------
  |  Branch (2957:11): [True: 1.01k, False: 642]
  ------------------
 2958|  1.01k|        prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
 2959|  1.01k|                                       coeffs_128);
 2960|       |
 2961|  1.01k|        if (w == 2) {
  ------------------
  |  Branch (2961:13): [True: 124, False: 889]
  ------------------
 2962|    288|          do {
 2963|    288|            const __m128i res =
 2964|    288|                x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
 2965|    288|            const __m128i r = sr_x_round_sse2(res);
 2966|    288|            pack_store_2x2_sse2(r, dst, dst_stride);
 2967|    288|            src_ptr += 2 * src_stride;
 2968|    288|            dst += 2 * dst_stride;
 2969|    288|            y -= 2;
 2970|    288|          } while (y);
  ------------------
  |  Branch (2970:20): [True: 164, False: 124]
  ------------------
 2971|    889|        } else if (w == 4) {
  ------------------
  |  Branch (2971:20): [True: 410, False: 479]
  ------------------
 2972|  1.70k|          do {
 2973|  1.70k|            const __m128i res =
 2974|  1.70k|                x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
 2975|  1.70k|            const __m128i r = sr_x_round_sse2(res);
 2976|  1.70k|            pack_store_4x2_sse2(r, dst, dst_stride);
 2977|  1.70k|            src_ptr += 2 * src_stride;
 2978|  1.70k|            dst += 2 * dst_stride;
 2979|  1.70k|            y -= 2;
 2980|  1.70k|          } while (y);
  ------------------
  |  Branch (2980:20): [True: 1.29k, False: 410]
  ------------------
 2981|    479|        } else {
 2982|    479|          assert(w == 8);
 2983|       |
 2984|  2.64k|          do {
 2985|  2.64k|            __m128i res[2];
 2986|       |
 2987|  2.64k|            x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
 2988|  2.64k|            res[0] = sr_x_round_sse2(res[0]);
 2989|  2.64k|            res[1] = sr_x_round_sse2(res[1]);
 2990|  2.64k|            const __m128i d = _mm_packus_epi16(res[0], res[1]);
 2991|  2.64k|            _mm_storel_epi64((__m128i *)dst, d);
 2992|  2.64k|            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
 2993|       |
 2994|  2.64k|            src_ptr += 2 * src_stride;
 2995|  2.64k|            dst += 2 * dst_stride;
 2996|  2.64k|            y -= 2;
 2997|  2.64k|          } while (y);
  ------------------
  |  Branch (2997:20): [True: 2.16k, False: 479]
  ------------------
 2998|    479|        }
 2999|  1.01k|      } else {
 3000|    642|        prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
 3001|       |
 3002|    642|        if (w == 16) {
  ------------------
  |  Branch (3002:13): [True: 294, False: 348]
  ------------------
 3003|  2.44k|          do {
 3004|  2.44k|            __m256i r[2];
 3005|       |
 3006|  2.44k|            x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
 3007|  2.44k|            sr_x_round_store_16x2_avx2(r, dst, dst_stride);
 3008|  2.44k|            src_ptr += 2 * src_stride;
 3009|  2.44k|            dst += 2 * dst_stride;
 3010|  2.44k|            y -= 2;
 3011|  2.44k|          } while (y);
  ------------------
  |  Branch (3011:20): [True: 2.15k, False: 294]
  ------------------
 3012|    348|        } else if (w == 32) {
  ------------------
  |  Branch (3012:20): [True: 260, False: 88]
  ------------------
 3013|  8.20k|          do {
 3014|  8.20k|            sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
 3015|  8.20k|            src_ptr += src_stride;
 3016|  8.20k|            dst += dst_stride;
 3017|  8.20k|          } while (--y);
  ------------------
  |  Branch (3017:20): [True: 7.94k, False: 260]
  ------------------
 3018|    260|        } else if (w == 64) {
  ------------------
  |  Branch (3018:20): [True: 88, False: 0]
  ------------------
 3019|  4.70k|          do {
 3020|  4.70k|            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
 3021|  4.70k|            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
 3022|  4.70k|            src_ptr += src_stride;
 3023|  4.70k|            dst += dst_stride;
 3024|  4.70k|          } while (--y);
  ------------------
  |  Branch (3024:20): [True: 4.61k, False: 88]
  ------------------
 3025|     88|        } else {
 3026|      0|          assert(w == 128);
 3027|       |
 3028|      0|          do {
 3029|      0|            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
 3030|      0|            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
 3031|      0|            sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
 3032|      0|            sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
 3033|      0|            src_ptr += src_stride;
 3034|      0|            dst += dst_stride;
 3035|      0|          } while (--y);
  ------------------
  |  Branch (3035:20): [True: 0, False: 0]
  ------------------
 3036|      0|        }
 3037|    642|      }
 3038|  1.65k|    } else {
 3039|       |      // average to get half pel
 3040|    321|      if (w == 2) {
  ------------------
  |  Branch (3040:11): [True: 4, False: 317]
  ------------------
 3041|      4|        do {
 3042|      4|          __m128i s_128;
 3043|       |
 3044|      4|          s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
 3045|      4|          const __m128i s1 = _mm_srli_si128(s_128, 1);
 3046|      4|          const __m128i d = _mm_avg_epu8(s_128, s1);
 3047|      4|          *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
 3048|      4|          *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(d, 2);
 3049|       |
 3050|      4|          src_ptr += 2 * src_stride;
 3051|      4|          dst += 2 * dst_stride;
 3052|      4|          y -= 2;
 3053|      4|        } while (y);
  ------------------
  |  Branch (3053:18): [True: 0, False: 4]
  ------------------
 3054|    317|      } else if (w == 4) {
  ------------------
  |  Branch (3054:18): [True: 70, False: 247]
  ------------------
 3055|    208|        do {
 3056|    208|          __m128i s_128;
 3057|       |
 3058|    208|          s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
 3059|    208|          const __m128i s1 = _mm_srli_si128(s_128, 1);
 3060|    208|          const __m128i d = _mm_avg_epu8(s_128, s1);
 3061|    208|          xx_storel_32(dst, d);
 3062|    208|          *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);
 3063|       |
 3064|    208|          src_ptr += 2 * src_stride;
 3065|    208|          dst += 2 * dst_stride;
 3066|    208|          y -= 2;
 3067|    208|        } while (y);
  ------------------
  |  Branch (3067:18): [True: 138, False: 70]
  ------------------
 3068|    247|      } else if (w == 8) {
  ------------------
  |  Branch (3068:18): [True: 100, False: 147]
  ------------------
 3069|    430|        do {
 3070|    430|          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
 3071|    430|          const __m128i s10 =
 3072|    430|              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
 3073|    430|          const __m128i s01 = _mm_srli_si128(s00, 1);
 3074|    430|          const __m128i s11 = _mm_srli_si128(s10, 1);
 3075|    430|          const __m128i d0 = _mm_avg_epu8(s00, s01);
 3076|    430|          const __m128i d1 = _mm_avg_epu8(s10, s11);
 3077|    430|          _mm_storel_epi64((__m128i *)dst, d0);
 3078|    430|          _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
 3079|       |
 3080|    430|          src_ptr += 2 * src_stride;
 3081|    430|          dst += 2 * dst_stride;
 3082|    430|          y -= 2;
 3083|    430|        } while (y);
  ------------------
  |  Branch (3083:18): [True: 330, False: 100]
  ------------------
 3084|    147|      } else if (w == 16) {
  ------------------
  |  Branch (3084:18): [True: 69, False: 78]
  ------------------
 3085|    572|        do {
 3086|    572|          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
 3087|    572|          const __m128i s01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
 3088|    572|          const __m128i s10 =
 3089|    572|              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
 3090|    572|          const __m128i s11 =
 3091|    572|              _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
 3092|    572|          const __m128i d0 = _mm_avg_epu8(s00, s01);
 3093|    572|          const __m128i d1 = _mm_avg_epu8(s10, s11);
 3094|    572|          _mm_storeu_si128((__m128i *)dst, d0);
 3095|    572|          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
 3096|       |
 3097|    572|          src_ptr += 2 * src_stride;
 3098|    572|          dst += 2 * dst_stride;
 3099|    572|          y -= 2;
 3100|    572|        } while (y);
  ------------------
  |  Branch (3100:18): [True: 503, False: 69]
  ------------------
 3101|     78|      } else if (w == 32) {
  ------------------
  |  Branch (3101:18): [True: 33, False: 45]
  ------------------
 3102|    752|        do {
 3103|    752|          sr_x_2tap_32_avg_avx2(src_ptr, dst);
 3104|    752|          src_ptr += src_stride;
 3105|    752|          dst += dst_stride;
 3106|    752|        } while (--y);
  ------------------
  |  Branch (3106:18): [True: 719, False: 33]
  ------------------
 3107|     45|      } else if (w == 64) {
  ------------------
  |  Branch (3107:18): [True: 45, False: 0]
  ------------------
 3108|  2.40k|        do {
 3109|  2.40k|          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
 3110|  2.40k|          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
 3111|  2.40k|          src_ptr += src_stride;
 3112|  2.40k|          dst += dst_stride;
 3113|  2.40k|        } while (--y);
  ------------------
  |  Branch (3113:18): [True: 2.35k, False: 45]
  ------------------
 3114|     45|      } else {
 3115|      0|        assert(w == 128);
 3116|       |
 3117|      0|        do {
 3118|      0|          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
 3119|      0|          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
 3120|      0|          sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
 3121|      0|          sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
 3122|      0|          src_ptr += src_stride;
 3123|      0|          dst += dst_stride;
 3124|      0|        } while (--y);
  ------------------
  |  Branch (3124:18): [True: 0, False: 0]
  ------------------
 3125|      0|      }
 3126|    321|    }
 3127|  12.2k|  } else if (horz_tap == 4) {
  ------------------
  |  Branch (3127:14): [True: 6.00k, False: 6.27k]
  ------------------
 3128|       |    // horz_filt as 4 tap
 3129|  6.00k|    const uint8_t *src_ptr = src - 1;
 3130|       |
 3131|  6.00k|    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
 3132|       |
 3133|  6.00k|    if (w == 2) {
  ------------------
  |  Branch (3133:9): [True: 1.06k, False: 4.94k]
  ------------------
 3134|  2.12k|      do {
 3135|  2.12k|        const __m128i res =
 3136|  2.12k|            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
 3137|  2.12k|        const __m128i r = sr_x_round_sse2(res);
 3138|  2.12k|        pack_store_2x2_sse2(r, dst, dst_stride);
 3139|  2.12k|        src_ptr += 2 * src_stride;
 3140|  2.12k|        dst += 2 * dst_stride;
 3141|  2.12k|        y -= 2;
 3142|  2.12k|      } while (y);
  ------------------
  |  Branch (3142:16): [True: 1.05k, False: 1.06k]
  ------------------
 3143|  4.94k|    } else if (w == 4) {
  ------------------
  |  Branch (3143:16): [True: 4.32k, False: 616]
  ------------------
 3144|  13.7k|      do {
 3145|  13.7k|        const __m128i res =
 3146|  13.7k|            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
 3147|  13.7k|        const __m128i r = sr_x_round_sse2(res);
 3148|  13.7k|        pack_store_4x2_sse2(r, dst, dst_stride);
 3149|  13.7k|        src_ptr += 2 * src_stride;
 3150|  13.7k|        dst += 2 * dst_stride;
 3151|  13.7k|        y -= 2;
 3152|  13.7k|      } while (y);
  ------------------
  |  Branch (3152:16): [True: 9.45k, False: 4.32k]
  ------------------
 3153|  4.32k|    } else if (w == 8) {
  ------------------
  |  Branch (3153:16): [True: 342, False: 274]
  ------------------
 3154|       |      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
 3155|       |      // rewrite this for better performance later.
 3156|    342|      __m256i filt_256[2];
 3157|    342|      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
 3158|       |
 3159|    342|      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
 3160|    342|      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
 3161|  1.61k|      for (int i = 0; i < h; i += 2) {
  ------------------
  |  Branch (3161:23): [True: 1.27k, False: 342]
  ------------------
 3162|  1.27k|        const __m256i data = _mm256_permute2x128_si256(
 3163|  1.27k|            _mm256_castsi128_si256(
 3164|  1.27k|                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
 3165|  1.27k|            _mm256_castsi128_si256(_mm_loadu_si128(
 3166|  1.27k|                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
 3167|  1.27k|            0x20);
 3168|       |
 3169|  1.27k|        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
 3170|  1.27k|        res_16b = sr_x_round_avx2(res_16b);
 3171|       |
 3172|  1.27k|        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
 3173|       |
 3174|  1.27k|        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
 3175|  1.27k|        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
 3176|       |
 3177|  1.27k|        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
 3178|  1.27k|        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
 3179|  1.27k|      }
 3180|    342|    } else {
 3181|    274|      assert(!(w % 16));
 3182|       |      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
 3183|       |      // rewrite this for better performance later.
 3184|    274|      __m256i filt_256[2];
 3185|    274|      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
 3186|    274|      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
 3187|    274|      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
 3188|       |
 3189|  4.83k|      for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (3189:23): [True: 4.55k, False: 274]
  ------------------
 3190|  13.6k|        for (int j = 0; j < w; j += 16) {
  ------------------
  |  Branch (3190:25): [True: 9.12k, False: 4.55k]
  ------------------
 3191|       |          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
 3192|       |          // 18 19 20 21 22 23
 3193|  9.12k|          const __m256i data = _mm256_inserti128_si256(
 3194|  9.12k|              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
 3195|  9.12k|              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
 3196|  9.12k|              1);
 3197|       |
 3198|  9.12k|          __m256i res_16b =
 3199|  9.12k|              convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
 3200|  9.12k|          res_16b = sr_x_round_avx2(res_16b);
 3201|       |
 3202|       |          /* rounding code */
 3203|       |          // 8 bit conversion and saturation to uint8
 3204|  9.12k|          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
 3205|       |
 3206|       |          // Store values into the destination buffer
 3207|       |          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
 3208|  9.12k|          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
 3209|  9.12k|          __m128i res = _mm256_castsi256_si128(res_8b);
 3210|  9.12k|          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
 3211|  9.12k|        }
 3212|  4.55k|      }
 3213|    274|    }
 3214|  6.27k|  } else {
 3215|  6.27k|    __m256i filt_256[4];
 3216|       |
 3217|  6.27k|    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
 3218|  6.27k|    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
 3219|  6.27k|    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
 3220|       |
 3221|  6.27k|    if (horz_tap == 6) {
  ------------------
  |  Branch (3221:9): [True: 5.52k, False: 750]
  ------------------
 3222|       |      // horz_filt as 6 tap
 3223|  5.52k|      const uint8_t *src_ptr = src - 2;
 3224|       |
 3225|  5.52k|      prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
 3226|       |
 3227|  5.52k|      if (w == 8) {
  ------------------
  |  Branch (3227:11): [True: 3.36k, False: 2.15k]
  ------------------
 3228|  13.3k|        do {
 3229|  13.3k|          const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
 3230|  13.3k|                                                       coeffs_256, filt_256);
 3231|  13.3k|          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
 3232|  13.3k|          src_ptr += 2 * src_stride;
 3233|  13.3k|          dst += 2 * dst_stride;
 3234|  13.3k|          y -= 2;
 3235|  13.3k|        } while (y);
  ------------------
  |  Branch (3235:18): [True: 9.95k, False: 3.36k]
  ------------------
 3236|  3.36k|      } else if (w == 16) {
  ------------------
  |  Branch (3236:18): [True: 1.54k, False: 614]
  ------------------
 3237|  9.10k|        do {
 3238|  9.10k|          __m256i r[2];
 3239|       |
 3240|  9.10k|          x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
 3241|  9.10k|                                    r);
 3242|  9.10k|          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
 3243|  9.10k|          src_ptr += 2 * src_stride;
 3244|  9.10k|          dst += 2 * dst_stride;
 3245|  9.10k|          y -= 2;
 3246|  9.10k|        } while (y);
  ------------------
  |  Branch (3246:18): [True: 7.56k, False: 1.54k]
  ------------------
 3247|  1.54k|      } else if (w == 32) {
  ------------------
  |  Branch (3247:18): [True: 468, False: 146]
  ------------------
 3248|  11.3k|        do {
 3249|  11.3k|          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
 3250|  11.3k|          src_ptr += src_stride;
 3251|  11.3k|          dst += dst_stride;
 3252|  11.3k|        } while (--y);
  ------------------
  |  Branch (3252:18): [True: 10.8k, False: 468]
  ------------------
 3253|    468|      } else if (w == 64) {
  ------------------
  |  Branch (3253:18): [True: 140, False: 6]
  ------------------
 3254|  7.64k|        do {
 3255|  7.64k|          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
 3256|  7.64k|          sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
 3257|  7.64k|          src_ptr += src_stride;
 3258|  7.64k|          dst += dst_stride;
 3259|  7.64k|        } while (--y);
  ------------------
  |  Branch (3259:18): [True: 7.50k, False: 140]
  ------------------
 3260|    140|      } else {
 3261|      6|        assert(w == 128);
 3262|       |
 3263|    640|        do {
 3264|    640|          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
 3265|    640|          sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
 3266|    640|                            dst + 1 * 32);
 3267|    640|          sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
 3268|    640|                            dst + 2 * 32);
 3269|    640|          sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
 3270|    640|                            dst + 3 * 32);
 3271|    640|          src_ptr += src_stride;
 3272|    640|          dst += dst_stride;
 3273|    640|        } while (--y);
  ------------------
  |  Branch (3273:18): [True: 634, False: 6]
  ------------------
 3274|      6|      }
 3275|  5.52k|    } else if (horz_tap == 8) {
  ------------------
  |  Branch (3275:16): [True: 750, False: 0]
  ------------------
 3276|       |      // horz_filt as 8 tap
 3277|    750|      const uint8_t *src_ptr = src - 3;
 3278|       |
 3279|    750|      filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
 3280|       |
 3281|    750|      prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
 3282|       |
 3283|    750|      if (w == 8) {
  ------------------
  |  Branch (3283:11): [True: 493, False: 257]
  ------------------
 3284|  1.90k|        do {
 3285|  1.90k|          const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
 3286|  1.90k|                                                       coeffs_256, filt_256);
 3287|  1.90k|          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
 3288|  1.90k|          src_ptr += 2 * src_stride;
 3289|  1.90k|          dst += 2 * dst_stride;
 3290|  1.90k|          y -= 2;
 3291|  1.90k|        } while (y);
  ------------------
  |  Branch (3291:18): [True: 1.40k, False: 493]
  ------------------
 3292|    493|      } else if (w == 16) {
  ------------------
  |  Branch (3292:18): [True: 154, False: 103]
  ------------------
 3293|    766|        do {
 3294|    766|          __m256i r[2];
 3295|       |
 3296|    766|          x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
 3297|    766|                                    r);
 3298|    766|          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
 3299|    766|          src_ptr += 2 * src_stride;
 3300|    766|          dst += 2 * dst_stride;
 3301|    766|          y -= 2;
 3302|    766|        } while (y);
  ------------------
  |  Branch (3302:18): [True: 612, False: 154]
  ------------------
 3303|    154|      } else if (w == 32) {
  ------------------
  |  Branch (3303:18): [True: 66, False: 37]
  ------------------
 3304|  2.01k|        do {
 3305|  2.01k|          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
 3306|  2.01k|          src_ptr += src_stride;
 3307|  2.01k|          dst += dst_stride;
 3308|  2.01k|        } while (--y);
  ------------------
  |  Branch (3308:18): [True: 1.95k, False: 66]
  ------------------
 3309|     66|      } else if (w == 64) {
  ------------------
  |  Branch (3309:18): [True: 37, False: 0]
  ------------------
 3310|  1.90k|        do {
 3311|  1.90k|          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
 3312|  1.90k|          sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
 3313|  1.90k|          src_ptr += src_stride;
 3314|  1.90k|          dst += dst_stride;
 3315|  1.90k|        } while (--y);
  ------------------
  |  Branch (3315:18): [True: 1.86k, False: 37]
  ------------------
 3316|     37|      } else {
 3317|      0|        assert(w == 128);
 3318|       |
 3319|      0|        do {
 3320|      0|          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
 3321|      0|          sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
 3322|      0|                            dst + 1 * 32);
 3323|      0|          sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
 3324|      0|                            dst + 2 * 32);
 3325|      0|          sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
 3326|      0|                            dst + 3 * 32);
 3327|      0|          src_ptr += src_stride;
 3328|      0|          dst += dst_stride;
 3329|      0|        } while (--y);
  ------------------
  |  Branch (3329:18): [True: 0, False: 0]
  ------------------
 3330|      0|      }
 3331|    750|    }
 3332|  6.27k|  }
 3333|  14.2k|}
convolve_avx2.c:x_convolve_2tap_2x2_sse4_1:
  859|    288|                                                 const __m128i coeffs[1]) {
  860|    288|  const __m128i sfl =
  861|    288|      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
  862|    288|  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
  863|    288|  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
  864|    288|  return convolve_2tap_ssse3(&ss, coeffs);
  865|    288|}
convolve_avx2.c:sr_x_round_sse2:
  780|  23.1k|static inline __m128i sr_x_round_sse2(const __m128i src) {
  781|  23.1k|  const __m128i round = _mm_set1_epi16(34);
  782|  23.1k|  const __m128i dst = _mm_add_epi16(src, round);
  783|  23.1k|  return _mm_srai_epi16(dst, 6);
  784|  23.1k|}
convolve_avx2.c:x_convolve_2tap_4x2_ssse3:
  869|  1.70k|                                                const __m128i coeffs[1]) {
  870|  1.70k|  const __m128i sfl =
  871|  1.70k|      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
  872|  1.70k|  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
  873|  1.70k|  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
  874|  1.70k|  return convolve_2tap_ssse3(&ss, coeffs);
  875|  1.70k|}
convolve_avx2.c:x_convolve_2tap_8x2_ssse3:
  880|  2.64k|                                             __m128i r[2]) {
  881|  2.64k|  __m128i ss[2];
  882|  2.64k|  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
  883|  2.64k|  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
  884|  2.64k|  const __m128i s01 = _mm_srli_si128(s00, 1);
  885|  2.64k|  const __m128i s11 = _mm_srli_si128(s10, 1);
  886|  2.64k|  ss[0] = _mm_unpacklo_epi8(s00, s01);
  887|  2.64k|  ss[1] = _mm_unpacklo_epi8(s10, s11);
  888|       |
  889|  2.64k|  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
  890|  2.64k|  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
  891|  2.64k|}
convolve_avx2.c:x_convolve_2tap_16x2_avx2:
  912|  2.44k|                                             __m256i r[2]) {
  913|  2.44k|  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
  914|  2.44k|  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
  915|  2.44k|  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
  916|  2.44k|  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
  917|  2.44k|  r[0] = convolve_2tap_avx2(&s0, coeffs);
  918|  2.44k|  r[1] = convolve_2tap_avx2(&s1, coeffs);
  919|  2.44k|}
convolve_avx2.c:sr_x_round_store_16x2_avx2:
  807|  12.3k|                                              const ptrdiff_t dst_stride) {
  808|  12.3k|  __m256i r[2];
  809|       |
  810|  12.3k|  r[0] = sr_x_round_avx2(res[0]);
  811|  12.3k|  r[1] = sr_x_round_avx2(res[1]);
  812|  12.3k|  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
  813|  12.3k|}
convolve_avx2.c:sr_x_2tap_32_avx2:
 2910|  17.6k|                                     uint8_t *const dst) {
 2911|  17.6k|  __m256i r[2];
 2912|       |
 2913|  17.6k|  x_convolve_2tap_32_avx2(src, coeffs, r);
 2914|  17.6k|  sr_x_round_store_32_avx2(r, dst);
 2915|  17.6k|}
convolve_avx2.c:x_convolve_2tap_32_avx2:
  923|  17.6k|                                           __m256i r[2]) {
  924|  17.6k|  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
  925|  17.6k|  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
  926|  17.6k|  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
  927|  17.6k|  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
  928|       |
  929|  17.6k|  r[0] = convolve_2tap_avx2(&ss0, coeffs);
  930|  17.6k|  r[1] = convolve_2tap_avx2(&ss1, coeffs);
  931|  17.6k|}
convolve_avx2.c:sr_x_round_store_32_avx2:
  816|  52.6k|                                            uint8_t *const dst) {
  817|  52.6k|  __m256i r[2];
  818|       |
  819|  52.6k|  r[0] = sr_x_round_avx2(res[0]);
  820|  52.6k|  r[1] = sr_x_round_avx2(res[1]);
  821|  52.6k|  convolve_store_32_avx2(r[0], r[1], dst);
  822|  52.6k|}
convolve_avx2.c:sr_x_2tap_32_avg_avx2:
  850|  5.55k|                                         uint8_t *const dst) {
  851|  5.55k|  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
  852|  5.55k|  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
  853|  5.55k|  const __m256i d = _mm256_avg_epu8(s0, s1);
  854|  5.55k|  _mm256_storeu_si256((__m256i *)dst, d);
  855|  5.55k|}
convolve_avx2.c:x_convolve_4tap_2x2_ssse3:
  935|  2.12k|                                                const __m128i coeffs[2]) {
  936|  2.12k|  const __m128i sfl0 =
  937|  2.12k|      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
  938|  2.12k|  const __m128i sfl1 =
  939|  2.12k|      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
  940|  2.12k|  const __m128i s = load_u8_8x2_sse2(src, stride);
  941|  2.12k|  __m128i ss[2];
  942|       |
  943|  2.12k|  ss[0] = _mm_shuffle_epi8(s, sfl0);
  944|  2.12k|  ss[1] = _mm_shuffle_epi8(s, sfl1);
  945|  2.12k|  return convolve_4tap_ssse3(ss, coeffs);
  946|  2.12k|}
convolve_avx2.c:x_convolve_4tap_4x2_ssse3:
  950|  13.7k|                                                const __m128i coeffs[2]) {
  951|  13.7k|  const __m128i s = load_u8_8x2_sse2(src, stride);
  952|  13.7k|  const __m128i sfl0 =
  953|  13.7k|      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
  954|  13.7k|  const __m128i sfl1 =
  955|  13.7k|      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
  956|  13.7k|  __m128i ss[2];
  957|       |
  958|  13.7k|  ss[0] = _mm_shuffle_epi8(s, sfl0);
  959|  13.7k|  ss[1] = _mm_shuffle_epi8(s, sfl1);
  960|  13.7k|  return convolve_4tap_ssse3(ss, coeffs);
  961|  13.7k|}
convolve_avx2.c:sr_x_round_avx2:
  786|   155k|static inline __m256i sr_x_round_avx2(const __m256i src) {
  787|   155k|  const __m256i round = _mm256_set1_epi16(34);
  788|   155k|  const __m256i dst = _mm256_add_epi16(src, round);
  789|   155k|  return _mm256_srai_epi16(dst, 6);
  790|   155k|}
convolve_avx2.c:x_convolve_6tap_8x2_avx2:
 1031|  31.5k|                                               const __m256i filt[3]) {
 1032|  31.5k|  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
 1033|  31.5k|  return x_convolve_6tap_avx2(s_256, coeffs, filt);
 1034|  31.5k|}
convolve_avx2.c:x_convolve_6tap_avx2:
  573|  89.8k|                                           const __m256i filt[3]) {
  574|  89.8k|  __m256i ss[3];
  575|       |
  576|  89.8k|  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  577|  89.8k|  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  578|  89.8k|  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
  579|       |
  580|  89.8k|  return convolve_6tap_avx2(ss, coeffs);
  581|  89.8k|}
convolve_avx2.c:sr_x_round_store_8x2_avx2:
  800|  15.2k|                                             const ptrdiff_t dst_stride) {
  801|  15.2k|  const __m256i r = sr_x_round_avx2(res);
  802|  15.2k|  pack_store_8x2_avx2(r, dst, dst_stride);
  803|  15.2k|}
convolve_avx2.c:x_convolve_6tap_16x2_avx2:
 1040|  9.10k|                                             __m256i r[2]) {
 1041|  9.10k|  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
 1042|  9.10k|  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
 1043|  9.10k|}
convolve_avx2.c:sr_x_6tap_32_avx2:
 2920|  29.1k|                                     uint8_t *const dst) {
 2921|  29.1k|  __m256i r[2];
 2922|       |
 2923|  29.1k|  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
 2924|  29.1k|  sr_x_round_store_32_avx2(r, dst);
 2925|  29.1k|}
convolve_avx2.c:x_convolve_6tap_32_avx2:
 1048|  29.1k|                                           __m256i r[2]) {
 1049|  29.1k|  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
 1050|  29.1k|  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
 1051|       |
 1052|  29.1k|  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
 1053|  29.1k|  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
 1054|  29.1k|}
convolve_avx2.c:x_convolve_8tap_8x2_avx2:
 1059|  3.43k|                                               const __m256i filt[4]) {
 1060|  3.43k|  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
 1061|  3.43k|  return x_convolve_8tap_avx2(s_256, coeffs, filt);
 1062|  3.43k|}
convolve_avx2.c:x_convolve_8tap_avx2:
  585|  15.0k|                                           const __m256i filt[4]) {
  586|  15.0k|  __m256i ss[4];
  587|       |
  588|  15.0k|  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  589|  15.0k|  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  590|  15.0k|  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
  591|  15.0k|  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
  592|       |
  593|  15.0k|  return convolve_8tap_avx2(ss, coeffs);
  594|  15.0k|}
convolve_avx2.c:x_convolve_8tap_16x2_avx2:
 1068|    766|                                                       __m256i r[2]) {
 1069|    766|  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
 1070|    766|  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
 1071|    766|}
convolve_avx2.c:sr_x_8tap_32_avx2:
 2930|  5.82k|                                               uint8_t *const dst) {
 2931|  5.82k|  __m256i r[2];
 2932|       |
 2933|  5.82k|  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
 2934|  5.82k|  sr_x_round_store_32_avx2(r, dst);
 2935|  5.82k|}
convolve_avx2.c:x_convolve_8tap_32_avx2:
 1076|  5.82k|                                                     __m256i r[2]) {
 1077|  5.82k|  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
 1078|  5.82k|  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
 1079|       |
 1080|  5.82k|  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
 1081|  5.82k|  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
 1082|  5.82k|}

convolve_2d_avx2.c:load_u8_8x2_sse2:
   19|   119k|                                       const ptrdiff_t stride) {
   20|   119k|  return load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride));
   21|   119k|}
convolve_2d_avx2.c:store_u8_4x2_sse2:
   25|  3.14k|                                               const ptrdiff_t stride) {
   26|  3.14k|  xx_storel_32(dst, src);
   27|  3.14k|  *(uint32_t *)(dst + stride) =
   28|  3.14k|      ((uint32_t)_mm_extract_epi16(src, 3) << 16) | _mm_extract_epi16(src, 2);
   29|  3.14k|}
convolve_avx2.c:store_u8_4x2_sse2:
   25|  32.4k|                                               const ptrdiff_t stride) {
   26|  32.4k|  xx_storel_32(dst, src);
   27|  32.4k|  *(uint32_t *)(dst + stride) =
   28|  32.4k|      ((uint32_t)_mm_extract_epi16(src, 3) << 16) | _mm_extract_epi16(src, 2);
   29|  32.4k|}
convolve_avx2.c:load_u8_8x2_sse2:
   19|  17.8k|                                       const ptrdiff_t stride) {
   20|  17.8k|  return load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride));
   21|  17.8k|}

obu.c:clz:
  186|  3.36k|static inline int clz(const unsigned int mask) {
  187|  3.36k|    return __builtin_clz(mask);
  188|  3.36k|}
thread_task.c:ctz:
  182|   376k|static inline int ctz(const unsigned int mask) {
  183|   376k|    return __builtin_ctz(mask);
  184|   376k|}
decode.c:ctz:
  182|  5.10k|static inline int ctz(const unsigned int mask) {
  183|  5.10k|    return __builtin_ctz(mask);
  184|  5.10k|}
decode.c:clz:
  186|  6.30M|static inline int clz(const unsigned int mask) {
  187|  6.30M|    return __builtin_clz(mask);
  188|  6.30M|}
getbits.c:clz:
  186|  11.6k|static inline int clz(const unsigned int mask) {
  187|  11.6k|    return __builtin_clz(mask);
  188|  11.6k|}
lf_mask.c:clz:
  186|  2.84M|static inline int clz(const unsigned int mask) {
  187|  2.84M|    return __builtin_clz(mask);
  188|  2.84M|}
warpmv.c:clz:
  186|  8.18k|static inline int clz(const unsigned int mask) {
  187|  8.18k|    return __builtin_clz(mask);
  188|  8.18k|}
warpmv.c:clzll:
  190|  5.26k|static inline int clzll(const unsigned long long mask) {
  191|  5.26k|    return __builtin_clzll(mask);
  192|  5.26k|}
looprestoration_tmpl.c:clz:
  186|  3.29M|static inline int clz(const unsigned int mask) {
  187|  3.29M|    return __builtin_clz(mask);
  188|  3.29M|}
recon_tmpl.c:clz:
  186|  10.5M|static inline int clz(const unsigned int mask) {
  187|  10.5M|    return __builtin_clz(mask);
  188|  10.5M|}
cdef_apply_tmpl.c:clz:
  186|   172k|static inline int clz(const unsigned int mask) {
  187|   172k|    return __builtin_clz(mask);
  188|   172k|}
ipred_prepare_tmpl.c:clz:
  186|  3.94M|static inline int clz(const unsigned int mask) {
  187|  3.94M|    return __builtin_clz(mask);
  188|  3.94M|}

fg_apply_tmpl.c:PXSTRIDE:
   79|  57.7k|static inline ptrdiff_t PXSTRIDE(const ptrdiff_t x) {
   80|  57.7k|    assert(!(x & 1));
  ------------------
  |  |  140|  57.7k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 57.7k]
  |  |  |  Branch (140:68): [Folded, False: 57.7k]
  |  |  ------------------
  ------------------
   81|  57.7k|    return x >> 1;
   82|  57.7k|}
itx_tmpl.c:PXSTRIDE:
   79|  2.22M|static inline ptrdiff_t PXSTRIDE(const ptrdiff_t x) {
   80|  2.22M|    assert(!(x & 1));
  ------------------
  |  |  140|  2.22M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 2.22M]
  |  |  |  Branch (140:68): [Folded, False: 2.22M]
  |  |  ------------------
  ------------------
   81|  2.22M|    return x >> 1;
   82|  2.22M|}
looprestoration_tmpl.c:PXSTRIDE:
   79|  7.55M|static inline ptrdiff_t PXSTRIDE(const ptrdiff_t x) {
   80|  7.55M|    assert(!(x & 1));
  ------------------
  |  |  140|  7.55M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 7.55M]
  |  |  |  Branch (140:68): [Folded, False: 7.55M]
  |  |  ------------------
  ------------------
   81|  7.55M|    return x >> 1;
   82|  7.55M|}
recon_tmpl.c:PXSTRIDE:
   79|  7.11M|static inline ptrdiff_t PXSTRIDE(const ptrdiff_t x) {
   80|  7.11M|    assert(!(x & 1));
  ------------------
  |  |  140|  7.11M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 7.11M]
  |  |  |  Branch (140:68): [Folded, False: 7.11M]
  |  |  ------------------
  ------------------
   81|  7.11M|    return x >> 1;
   82|  7.11M|}
cdef_apply_tmpl.c:PXSTRIDE:
   79|  6.99M|static inline ptrdiff_t PXSTRIDE(const ptrdiff_t x) {
   80|  6.99M|    assert(!(x & 1));
  ------------------
  |  |  140|  6.99M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 6.99M]
  |  |  |  Branch (140:68): [Folded, False: 6.99M]
  |  |  ------------------
  ------------------
   81|  6.99M|    return x >> 1;
   82|  6.99M|}
ipred_prepare_tmpl.c:PXSTRIDE:
   79|  22.8M|static inline ptrdiff_t PXSTRIDE(const ptrdiff_t x) {
   80|  22.8M|    assert(!(x & 1));
  ------------------
  |  |  140|  22.8M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 22.8M]
  |  |  |  Branch (140:68): [Folded, False: 22.8M]
  |  |  ------------------
  ------------------
   81|  22.8M|    return x >> 1;
   82|  22.8M|}
ipred_prepare_tmpl.c:pixel_set:
   66|  1.34M|static inline void pixel_set(pixel *const dst, const int val, const int num) {
   67|  21.0M|    for (int n = 0; n < num; n++)
  ------------------
  |  Branch (67:21): [True: 19.7M, False: 1.34M]
  ------------------
   68|  19.7M|        dst[n] = val;
   69|  1.34M|}
lf_apply_tmpl.c:PXSTRIDE:
   79|  3.86M|static inline ptrdiff_t PXSTRIDE(const ptrdiff_t x) {
   80|  3.86M|    assert(!(x & 1));
  ------------------
  |  |  140|  3.86M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 3.86M]
  |  |  |  Branch (140:68): [Folded, False: 3.86M]
  |  |  ------------------
  ------------------
   81|  3.86M|    return x >> 1;
   82|  3.86M|}
lr_apply_tmpl.c:PXSTRIDE:
   79|   357k|static inline ptrdiff_t PXSTRIDE(const ptrdiff_t x) {
   80|   357k|    assert(!(x & 1));
  ------------------
  |  |  140|   357k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 357k]
  |  |  |  Branch (140:68): [Folded, False: 357k]
  |  |  ------------------
  ------------------
   81|   357k|    return x >> 1;
   82|   357k|}

lib.c:umin:
   47|  17.2k|static inline unsigned umin(const unsigned a, const unsigned b) {
   48|  17.2k|    return a < b ? a : b;
  ------------------
  |  Branch (48:12): [True: 16.3k, False: 912]
  ------------------
   49|  17.2k|}
obu.c:ulog2:
   67|  3.36k|static inline int ulog2(const unsigned v) {
   68|  3.36k|    return 31 ^ clz(v);
   69|  3.36k|}
obu.c:imin:
   39|  68.7k|static inline int imin(const int a, const int b) {
   40|  68.7k|    return a < b ? a : b;
  ------------------
  |  Branch (40:12): [True: 58.0k, False: 10.7k]
  ------------------
   41|  68.7k|}
obu.c:imax:
   35|  55.9k|static inline int imax(const int a, const int b) {
   36|  55.9k|    return a > b ? a : b;
  ------------------
  |  Branch (36:12): [True: 4.70k, False: 51.2k]
  ------------------
   37|  55.9k|}
obu.c:iclip_u8:
   55|  40.6k|static inline int iclip_u8(const int v) {
   56|  40.6k|    return iclip(v, 0, 255);
   57|  40.6k|}
obu.c:iclip:
   51|  40.6k|static inline int iclip(const int v, const int min, const int max) {
   52|  40.6k|    return v < min ? min : v > max ? max : v;
  ------------------
  |  Branch (52:12): [True: 3.35k, False: 37.2k]
  |  Branch (52:28): [True: 5.69k, False: 31.5k]
  ------------------
   53|  40.6k|}
refmvs.c:imin:
   39|  9.03M|static inline int imin(const int a, const int b) {
   40|  9.03M|    return a < b ? a : b;
  ------------------
  |  Branch (40:12): [True: 4.78M, False: 4.25M]
  ------------------
   41|  9.03M|}
refmvs.c:apply_sign:
   59|  26.5k|static inline int apply_sign(const int v, const int s) {
   60|  26.5k|    return s < 0 ? -v : v;
  ------------------
  |  Branch (60:12): [True: 10.9k, False: 15.5k]
  ------------------
   61|  26.5k|}
refmvs.c:imax:
   35|  3.29M|static inline int imax(const int a, const int b) {
   36|  3.29M|    return a > b ? a : b;
  ------------------
  |  Branch (36:12): [True: 1.00M, False: 2.29M]
  ------------------
   37|  3.29M|}
refmvs.c:iclip:
   51|  6.19M|static inline int iclip(const int v, const int min, const int max) {
   52|  6.19M|    return v < min ? min : v > max ? max : v;
  ------------------
  |  Branch (52:12): [True: 19.2k, False: 6.17M]
  |  Branch (52:28): [True: 5.08k, False: 6.16M]
  ------------------
   53|  6.19M|}
thread_task.c:umin:
   47|   266k|static inline unsigned umin(const unsigned a, const unsigned b) {
   48|   266k|    return a < b ? a : b;
  ------------------
  |  Branch (48:12): [True: 0, False: 266k]
  ------------------
   49|   266k|}
wedge.c:imax:
   35|    256|static inline int imax(const int a, const int b) {
   36|    256|    return a > b ? a : b;
  ------------------
  |  Branch (36:12): [True: 128, False: 128]
  ------------------
   37|    256|}
wedge.c:imin:
   39|  2.48k|static inline int imin(const int a, const int b) {
   40|  2.48k|    return a < b ? a : b;
  ------------------
  |  Branch (40:12): [True: 1.41k, False: 1.06k]
  ------------------
   41|  2.48k|}
fg_apply_tmpl.c:imin:
   39|  27.6k|static inline int imin(const int a, const int b) {
   40|  27.6k|    return a < b ? a : b;
  ------------------
  |  Branch (40:12): [True: 431, False: 27.2k]
  ------------------
   41|  27.6k|}
cdf.c:imin:
   39|  86.6k|static inline int imin(const int a, const int b) {
   40|  86.6k|    return a < b ? a : b;
  ------------------
  |  Branch (40:12): [True: 21.6k, False: 64.9k]
  ------------------
   41|  86.6k|}
decode.c:iclip:
   51|  1.41M|static inline int iclip(const int v, const int min, const int max) {
   52|  1.41M|    return v < min ? min : v > max ? max : v;
  ------------------
  |  Branch (52:12): [True: 67.4k, False: 1.34M]
  |  Branch (52:28): [True: 41.5k, False: 1.30M]
  ------------------
   53|  1.41M|}
decode.c:apply_sign:
   59|  6.98k|static inline int apply_sign(const int v, const int s) {
   60|  6.98k|    return s < 0 ? -v : v;
  ------------------
  |  Branch (60:12): [True: 3.32k, False: 3.66k]
  ------------------
   61|  6.98k|}
decode.c:ulog2:
   67|  6.31M|static inline int ulog2(const unsigned v) {
   68|  6.31M|    return 31 ^ clz(v);
   69|  6.31M|}
decode.c:imax:
   35|  2.33M|static inline int imax(const int a, const int b) {
   36|  2.33M|    return a > b ? a : b;
  ------------------
  |  Branch (36:12): [True: 908k, False: 1.43M]
  ------------------
   37|  2.33M|}
decode.c:imin:
   39|  7.89M|static inline int imin(const int a, const int b) {
   40|  7.89M|    return a < b ? a : b;
  ------------------
  |  Branch (40:12): [True: 5.15M, False: 2.73M]
  ------------------
   41|  7.89M|}
decode.c:iclip_u8:
   55|  1.04M|static inline int iclip_u8(const int v) {
   56|  1.04M|    return iclip(v, 0, 255);
   57|  1.04M|}
getbits.c:ulog2:
   67|  11.6k|static inline int ulog2(const unsigned v) {
   68|  11.6k|    return 31 ^ clz(v);
   69|  11.6k|}
getbits.c:inv_recenter:
   75|  20.0k|static inline unsigned inv_recenter(const unsigned r, const unsigned v) {
   76|  20.0k|    if (v > (r << 1))
  ------------------
  |  Branch (76:9): [True: 51, False: 19.9k]
  ------------------
   77|     51|        return v;
   78|  19.9k|    else if ((v & 1) == 0)
  ------------------
  |  Branch (78:14): [True: 12.5k, False: 7.35k]
  ------------------
   79|  12.5k|        return (v >> 1) + r;
   80|  7.35k|    else
   81|  7.35k|        return r - ((v + 1) >> 1);
   82|  20.0k|}
lf_mask.c:imin:
   39|  15.0M|static inline int imin(const int a, const int b) {
   40|  15.0M|    return a < b ? a : b;
  ------------------
  |  Branch (40:12): [True: 2.86M, False: 12.2M]
  ------------------
   41|  15.0M|}
lf_mask.c:ulog2:
   67|  2.84M|static inline int ulog2(const unsigned v) {
   68|  2.84M|    return 31 ^ clz(v);
   69|  2.84M|}
lf_mask.c:imax:
   35|  1.11M|static inline int imax(const int a, const int b) {
   36|  1.11M|    return a > b ? a : b;
  ------------------
  |  Branch (36:12): [True: 1.05M, False: 61.5k]
  ------------------
   37|  1.11M|}
lf_mask.c:iclip:
   51|  8.05M|static inline int iclip(const int v, const int min, const int max) {
   52|  8.05M|    return v < min ? min : v > max ? max : v;
  ------------------
  |  Branch (52:12): [True: 1.91M, False: 6.14M]
  |  Branch (52:28): [True: 631k, False: 5.51M]
  ------------------
   53|  8.05M|}
msac.c:inv_recenter:
   75|   201k|static inline unsigned inv_recenter(const unsigned r, const unsigned v) {
   76|   201k|    if (v > (r << 1))
  ------------------
  |  Branch (76:9): [True: 41.5k, False: 159k]
  ------------------
   77|  41.5k|        return v;
   78|   159k|    else if ((v & 1) == 0)
  ------------------
  |  Branch (78:14): [True: 80.3k, False: 79.2k]
  ------------------
   79|  80.3k|        return (v >> 1) + r;
   80|  79.2k|    else
   81|  79.2k|        return r - ((v + 1) >> 1);
   82|   201k|}
warpmv.c:apply_sign:
   59|  40.8k|static inline int apply_sign(const int v, const int s) {
   60|  40.8k|    return s < 0 ? -v : v;
  ------------------
  |  Branch (60:12): [True: 11.6k, False: 29.2k]
  ------------------
   61|  40.8k|}
warpmv.c:ulog2:
   67|  8.18k|static inline int ulog2(const unsigned v) {
   68|  8.18k|    return 31 ^ clz(v);
   69|  8.18k|}
warpmv.c:apply_sign64:
   63|  42.6k|static inline int apply_sign64(const int v, const int64_t s) {
   64|  42.6k|    return s < 0 ? -v : v;
  ------------------
  |  Branch (64:12): [True: 7.43k, False: 35.2k]
  ------------------
   65|  42.6k|}
warpmv.c:iclip:
   51|  64.2k|static inline int iclip(const int v, const int min, const int max) {
   52|  64.2k|    return v < min ? min : v > max ? max : v;
  ------------------
  |  Branch (52:12): [True: 894, False: 63.3k]
  |  Branch (52:28): [True: 744, False: 62.6k]
  ------------------
   53|  64.2k|}
warpmv.c:u64log2:
   71|  5.26k|static inline int u64log2(const uint64_t v) {
   72|  5.26k|    return 63 ^ clzll(v);
   73|  5.26k|}
itx_tmpl.c:iclip:
   51|   114M|static inline int iclip(const int v, const int min, const int max) {
   52|   114M|    return v < min ? min : v > max ? max : v;
  ------------------
  |  Branch (52:12): [True: 6.84M, False: 107M]
  |  Branch (52:28): [True: 8.38M, False: 99.1M]
  ------------------
   53|   114M|}
itx_tmpl.c:imin:
   39|  80.9k|static inline int imin(const int a, const int b) {
   40|  80.9k|    return a < b ? a : b;
  ------------------
  |  Branch (40:12): [True: 18.7k, False: 62.2k]
  ------------------
   41|  80.9k|}
looprestoration_tmpl.c:iclip:
   51|  21.6M|static inline int iclip(const int v, const int min, const int max) {
   52|  21.6M|    return v < min ? min : v > max ? max : v;
  ------------------
  |  Branch (52:12): [True: 17.5k, False: 21.6M]
  |  Branch (52:28): [True: 47.4k, False: 21.5M]
  ------------------
   53|  21.6M|}
looprestoration_tmpl.c:imax:
   35|  31.3M|static inline int imax(const int a, const int b) {
   36|  31.3M|    return a > b ? a : b;
  ------------------
  |  Branch (36:12): [True: 13.6M, False: 17.6M]
  ------------------
   37|  31.3M|}
looprestoration_tmpl.c:umin:
   47|  31.3M|static inline unsigned umin(const unsigned a, const unsigned b) {
   48|  31.3M|    return a < b ? a : b;
  ------------------
  |  Branch (48:12): [True: 30.9M, False: 372k]
  ------------------
   49|  31.3M|}
recon_tmpl.c:ulog2:
   67|  10.5M|static inline int ulog2(const unsigned v) {
   68|  10.5M|    return 31 ^ clz(v);
   69|  10.5M|}
recon_tmpl.c:imin:
   39|  30.5M|static inline int imin(const int a, const int b) {
   40|  30.5M|    return a < b ? a : b;
  ------------------
  |  Branch (40:12): [True: 25.5M, False: 4.99M]
  ------------------
   41|  30.5M|}
recon_tmpl.c:imax:
   35|  2.30M|static inline int imax(const int a, const int b) {
   36|  2.30M|    return a > b ? a : b;
  ------------------
  |  Branch (36:12): [True: 1.76M, False: 546k]
  ------------------
   37|  2.30M|}
recon_tmpl.c:umin:
   47|  80.8M|static inline unsigned umin(const unsigned a, const unsigned b) {
   48|  80.8M|    return a < b ? a : b;
  ------------------
  |  Branch (48:12): [True: 52.1M, False: 28.6M]
  ------------------
   49|  80.8M|}
recon_tmpl.c:apply_sign64:
   63|  30.8k|static inline int apply_sign64(const int v, const int64_t s) {
   64|  30.8k|    return s < 0 ? -v : v;
  ------------------
  |  Branch (64:12): [True: 7.69k, False: 23.1k]
  ------------------
   65|  30.8k|}
recon_tmpl.c:iclip:
   51|  62.7k|static inline int iclip(const int v, const int min, const int max) {
   52|  62.7k|    return v < min ? min : v > max ? max : v;
  ------------------
  |  Branch (52:12): [True: 5.07k, False: 57.7k]
  |  Branch (52:28): [True: 220, False: 57.4k]
  ------------------
   53|  62.7k|}
itx_1d.c:iclip:
   51|   319M|static inline int iclip(const int v, const int min, const int max) {
   52|   319M|    return v < min ? min : v > max ? max : v;
  ------------------
  |  Branch (52:12): [True: 2.96M, False: 316M]
  |  Branch (52:28): [True: 2.92M, False: 313M]
  ------------------
   53|   319M|}
scan.c:imax:
   35|  3.34k|static inline int imax(const int a, const int b) {
   36|  3.34k|    return a > b ? a : b;
  ------------------
  |  Branch (36:12): [True: 2.82k, False: 523]
  ------------------
   37|  3.34k|}
cdef_apply_tmpl.c:imin:
   39|  1.63M|static inline int imin(const int a, const int b) {
   40|  1.63M|    return a < b ? a : b;
  ------------------
  |  Branch (40:12): [True: 676k, False: 955k]
  ------------------
   41|  1.63M|}
cdef_apply_tmpl.c:ulog2:
   67|   172k|static inline int ulog2(const unsigned v) {
   68|   172k|    return 31 ^ clz(v);
   69|   172k|}
ipred_prepare_tmpl.c:imin:
   39|  12.6M|static inline int imin(const int a, const int b) {
   40|  12.6M|    return a < b ? a : b;
  ------------------
  |  Branch (40:12): [True: 8.88M, False: 3.75M]
  ------------------
   41|  12.6M|}
lf_apply_tmpl.c:imin:
   39|   643k|static inline int imin(const int a, const int b) {
   40|   643k|    return a < b ? a : b;
  ------------------
  |  Branch (40:12): [True: 165k, False: 478k]
  ------------------
   41|   643k|}
lr_apply_tmpl.c:imin:
   39|   224k|static inline int imin(const int a, const int b) {
   40|   224k|    return a < b ? a : b;
  ------------------
  |  Branch (40:12): [True: 101k, False: 123k]
  ------------------
   41|   224k|}

dav1d_cdef_brow_8bpc:
  102|  49.6k|{
  103|  49.6k|    Dav1dFrameContext *const f = (Dav1dFrameContext *)tc->f;
  104|  49.6k|    const int bitdepth_min_8 = BITDEPTH == 8 ? 0 : f->cur.p.bpc - 8;
  ------------------
  |  Branch (104:32): [True: 49.6k, Folded]
  ------------------
  105|  49.6k|    const Dav1dDSPContext *const dsp = f->dsp;
  106|  49.6k|    enum CdefEdgeFlags edges = CDEF_HAVE_BOTTOM | (by_start > 0 ? CDEF_HAVE_TOP : 0);
  ------------------
  |  Branch (106:52): [True: 43.3k, False: 6.28k]
  ------------------
  107|  49.6k|    pixel *ptrs[3] = { p[0], p[1], p[2] };
  108|  49.6k|    const int sbsz = 16;
  109|  49.6k|    const int sb64w = f->sb128w << 1;
  110|  49.6k|    const int damping = f->frame_hdr->cdef.damping + bitdepth_min_8;
  111|  49.6k|    const enum Dav1dPixelLayout layout = f->cur.p.layout;
  112|  49.6k|    const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
  113|  49.6k|    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
  114|  49.6k|    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
  115|  49.6k|    static const uint8_t uv_dirs[2][8] = { { 0, 1, 2, 3, 4, 5, 6, 7 },
  116|  49.6k|                                           { 7, 0, 2, 4, 5, 6, 6, 6 } };
  117|  49.6k|    const uint8_t *uv_dir = uv_dirs[layout == DAV1D_PIXEL_LAYOUT_I422];
  118|  49.6k|    const int have_tt = f->c->n_tc > 1;
  119|  49.6k|    const int sb128 = f->seq_hdr->sb128;
  120|  49.6k|    const int resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
  121|  49.6k|    const ptrdiff_t y_stride = PXSTRIDE(f->cur.stride[0]);
  ------------------
  |  |   53|  49.6k|#define PXSTRIDE(x) (x)
  ------------------
  122|  49.6k|    const ptrdiff_t uv_stride = PXSTRIDE(f->cur.stride[1]);
  ------------------
  |  |   53|  49.6k|#define PXSTRIDE(x) (x)
  ------------------
  123|       |
  124|   303k|    for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) {
  ------------------
  |  Branch (124:38): [True: 253k, False: 49.7k]
  ------------------
  125|   253k|        const int tf = tc->top_pre_cdef_toggle;
  126|   253k|        const int by_idx = (by & 30) >> 1;
  127|   253k|        if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
  ------------------
  |  Branch (127:13): [True: 5.45k, False: 247k]
  ------------------
  128|       |
  129|   253k|        if ((!have_tt || sbrow_start || by + 2 < by_end) &&
  ------------------
  |  Branch (129:14): [True: 14.3k, False: 238k]
  |  Branch (129:26): [True: 20.4k, False: 218k]
  |  Branch (129:41): [True: 192k, False: 26.2k]
  ------------------
  130|   226k|            edges & CDEF_HAVE_BOTTOM)
  ------------------
  |  Branch (130:13): [True: 226k, False: 401]
  ------------------
  131|   226k|        {
  132|       |            // backup pre-filter data for next iteration
  133|   226k|            pixel *const cdef_top_bak[3] = {
  134|   226k|                f->lf.cdef_line[!tf][0] + have_tt * sby * 4 * y_stride,
  135|   226k|                f->lf.cdef_line[!tf][1] + have_tt * sby * 8 * uv_stride,
  136|   226k|                f->lf.cdef_line[!tf][2] + have_tt * sby * 8 * uv_stride
  137|   226k|            };
  138|   226k|            backup2lines(cdef_top_bak, ptrs, f->cur.stride, layout);
  139|   226k|        }
  140|       |
  141|   253k|        ALIGN_STK_16(pixel, lr_bak, 2 /* idx */, [3 /* plane */][8 /* y */][2 /* x */]);
  ------------------
  |  |  100|   253k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|   253k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  142|   253k|        pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
  143|   253k|        edges &= ~CDEF_HAVE_LEFT;
  144|   253k|        edges |= CDEF_HAVE_RIGHT;
  145|   253k|        enum Backup2x8Flags prev_flag = 0;
  146|   865k|        for (int sbx = 0; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) {
  ------------------
  |  Branch (146:27): [True: 612k, False: 253k]
  ------------------
  147|   612k|            const int sb128x = sbx >> 1;
  148|   612k|            const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
  149|   612k|            const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
  150|   612k|            if (cdef_idx == -1 ||
  ------------------
  |  Branch (150:17): [True: 267k, False: 345k]
  ------------------
  151|   345k|                (!f->frame_hdr->cdef.y_strength[cdef_idx] &&
  ------------------
  |  Branch (151:18): [True: 241k, False: 103k]
  ------------------
  152|   241k|                 !f->frame_hdr->cdef.uv_strength[cdef_idx]))
  ------------------
  |  Branch (152:18): [True: 230k, False: 11.1k]
  ------------------
  153|   497k|            {
  154|   497k|                prev_flag = 0;
  155|   497k|                goto next_sb;
  156|   497k|            }
  157|       |
  158|       |            // Create a complete 32-bit mask for the sb row ahead of time.
  159|   114k|            const uint16_t (*noskip_row)[2] = &lflvl[sb128x].noskip_mask[by_idx];
  160|   114k|            const unsigned noskip_mask = (unsigned) noskip_row[0][1] << 16 |
  161|   114k|                                                    noskip_row[0][0];
  162|       |
  163|   114k|            const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx];
  164|   114k|            const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
  165|   114k|            const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1);
  166|       |
  167|   114k|            const int y_pri_lvl = (y_lvl >> 2) << bitdepth_min_8;
  168|   114k|            int y_sec_lvl = y_lvl & 3;
  169|   114k|            y_sec_lvl += y_sec_lvl == 3;
  170|   114k|            y_sec_lvl <<= bitdepth_min_8;
  171|       |
  172|   114k|            const int uv_pri_lvl = (uv_lvl >> 2) << bitdepth_min_8;
  173|   114k|            int uv_sec_lvl = uv_lvl & 3;
  174|   114k|            uv_sec_lvl += uv_sec_lvl == 3;
  175|   114k|            uv_sec_lvl <<= bitdepth_min_8;
  176|       |
  177|   114k|            pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
  178|   427k|            for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
  ------------------
  |  Branch (178:39): [True: 312k, False: 114k]
  ------------------
  179|   312k|                 bx += 2, edges |= CDEF_HAVE_LEFT)
  180|   312k|            {
  181|   312k|                if (bx + 2 >= f->bw) edges &= ~CDEF_HAVE_RIGHT;
  ------------------
  |  Branch (181:21): [True: 45.7k, False: 266k]
  ------------------
  182|       |
  183|       |                // check if this 8x8 block had any coded coefficients; if not,
  184|       |                // go to the next block
  185|   312k|                const uint32_t bx_mask = 3U << (bx & 30);
  186|   312k|                if (!(noskip_mask & bx_mask)) {
  ------------------
  |  Branch (186:21): [True: 21.4k, False: 291k]
  ------------------
  187|  21.4k|                    prev_flag = 0;
  188|  21.4k|                    goto next_b;
  189|  21.4k|                }
  190|   291k|                const enum Backup2x8Flags do_left = (prev_flag ^ flag) & flag;
  191|   291k|                prev_flag = flag;
  192|   291k|                if (do_left && edges & CDEF_HAVE_LEFT) {
  ------------------
  |  Branch (192:21): [True: 42.3k, False: 248k]
  |  Branch (192:32): [True: 2.60k, False: 39.7k]
  ------------------
  193|       |                    // we didn't backup the prefilter data because it wasn't
  194|       |                    // there, so do it here instead
  195|  2.60k|                    backup2x8(lr_bak[bit], bptrs, f->cur.stride, 0, layout, do_left);
  196|  2.60k|                }
  197|   291k|                if (edges & CDEF_HAVE_RIGHT) {
  ------------------
  |  Branch (197:21): [True: 253k, False: 37.3k]
  ------------------
  198|       |                    // backup pre-filter data for next iteration
  199|   253k|                    backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout, flag);
  200|   253k|                }
  201|       |
  202|   291k|                int dir;
  203|   291k|                unsigned variance;
  204|   291k|                if (y_pri_lvl || uv_pri_lvl)
  ------------------
  |  Branch (204:21): [True: 220k, False: 71.0k]
  |  Branch (204:34): [True: 45.6k, False: 25.4k]
  ------------------
  205|   264k|                    dir = dsp->cdef.dir(bptrs[0], f->cur.stride[0],
  206|   264k|                                        &variance HIGHBD_CALL_SUFFIX);
  207|       |
  208|   291k|                const pixel *top, *bot;
  209|   291k|                ptrdiff_t offset;
  210|       |
  211|   291k|                if (!have_tt) goto st_y;
  ------------------
  |  Branch (211:21): [True: 17.9k, False: 273k]
  ------------------
  212|   273k|                if (sbrow_start && by == by_start) {
  ------------------
  |  Branch (212:21): [True: 18.8k, False: 254k]
  |  Branch (212:36): [True: 18.8k, False: 0]
  ------------------
  213|  18.8k|                    if (resize) {
  ------------------
  |  Branch (213:25): [True: 2.40k, False: 16.4k]
  ------------------
  214|  2.40k|                        offset = (sby - 1) * 4 * y_stride + bx * 4;
  215|  2.40k|                        top = &f->lf.cdef_lpf_line[0][offset];
  216|  16.4k|                    } else {
  217|  16.4k|                        offset = (sby * (4 << sb128) - 4) * y_stride + bx * 4;
  218|  16.4k|                        top = &f->lf.lr_lpf_line[0][offset];
  219|  16.4k|                    }
  220|  18.8k|                    bot = bptrs[0] + 8 * y_stride;
  221|   254k|                } else if (!sbrow_start && by + 2 >= by_end) {
  ------------------
  |  Branch (221:28): [True: 254k, False: 18.4E]
  |  Branch (221:44): [True: 31.5k, False: 223k]
  ------------------
  222|  31.5k|                    top = &f->lf.cdef_line[tf][0][sby * 4 * y_stride + bx * 4];
  223|  31.5k|                    if (resize) {
  ------------------
  |  Branch (223:25): [True: 3.10k, False: 28.4k]
  ------------------
  224|  3.10k|                        offset = (sby * 4 + 2) * y_stride + bx * 4;
  225|  3.10k|                        bot = &f->lf.cdef_lpf_line[0][offset];
  226|  28.4k|                    } else {
  227|  28.4k|                        const int line = sby * (4 << sb128) + 4 * sb128 + 2;
  228|  28.4k|                        offset = line * y_stride + bx * 4;
  229|  28.4k|                        bot = &f->lf.lr_lpf_line[0][offset];
  230|  28.4k|                    }
  231|   222k|                } else {
  232|   241k|            st_y:;
  233|   241k|                    offset = sby * 4 * y_stride;
  234|   241k|                    top = &f->lf.cdef_line[tf][0][have_tt * offset + bx * 4];
  235|   241k|                    bot = bptrs[0] + 8 * y_stride;
  236|   241k|                }
  237|   291k|                if (y_pri_lvl) {
  ------------------
  |  Branch (237:21): [True: 220k, False: 71.0k]
  ------------------
  238|   220k|                    const int adj_y_pri_lvl = adjust_strength(y_pri_lvl, variance);
  239|   220k|                    if (adj_y_pri_lvl || y_sec_lvl)
  ------------------
  |  Branch (239:25): [True: 55.9k, False: 164k]
  |  Branch (239:42): [True: 104k, False: 60.1k]
  ------------------
  240|   160k|                        dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
  241|   160k|                                        top, bot, adj_y_pri_lvl, y_sec_lvl,
  242|   160k|                                        dir, damping, edges HIGHBD_CALL_SUFFIX);
  243|   220k|                } else if (y_sec_lvl)
  ------------------
  |  Branch (243:28): [True: 16.5k, False: 54.5k]
  ------------------
  244|  16.5k|                    dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
  245|  16.5k|                                    top, bot, 0, y_sec_lvl, 0, damping,
  246|  16.5k|                                    edges HIGHBD_CALL_SUFFIX);
  247|       |
  248|   291k|                if (!uv_lvl) goto skip_uv;
  ------------------
  |  Branch (248:21): [True: 69.0k, False: 222k]
  ------------------
  249|   222k|                assert(layout != DAV1D_PIXEL_LAYOUT_I400);
  ------------------
  |  |  140|   222k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 222k]
  |  |  |  Branch (140:68): [Folded, False: 222k]
  |  |  ------------------
  ------------------
  250|       |
  251|   222k|                const int uvdir = uv_pri_lvl ? uv_dir[dir] : 0;
  ------------------
  |  Branch (251:35): [True: 193k, False: 28.9k]
  ------------------
  252|   666k|                for (int pl = 1; pl <= 2; pl++) {
  ------------------
  |  Branch (252:34): [True: 443k, False: 223k]
  ------------------
  253|   443k|                    if (!have_tt) goto st_uv;
  ------------------
  |  Branch (253:25): [True: 25.1k, False: 418k]
  ------------------
  254|   418k|                    if (sbrow_start && by == by_start) {
  ------------------
  |  Branch (254:25): [True: 27.7k, False: 390k]
  |  Branch (254:40): [True: 27.7k, False: 8]
  ------------------
  255|  27.7k|                        if (resize) {
  ------------------
  |  Branch (255:29): [True: 4.26k, False: 23.4k]
  ------------------
  256|  4.26k|                            offset = (sby - 1) * 4 * uv_stride + (bx * 4 >> ss_hor);
  257|  4.26k|                            top = &f->lf.cdef_lpf_line[pl][offset];
  258|  23.4k|                        } else {
  259|  23.4k|                            const int line = sby * (4 << sb128) - 4;
  260|  23.4k|                            offset = line * uv_stride + (bx * 4 >> ss_hor);
  261|  23.4k|                            top = &f->lf.lr_lpf_line[pl][offset];
  262|  23.4k|                        }
  263|  27.7k|                        bot = bptrs[pl] + (8 >> ss_ver) * uv_stride;
  264|   391k|                    } else if (!sbrow_start && by + 2 >= by_end) {
  ------------------
  |  Branch (264:32): [True: 391k, False: 18.4E]
  |  Branch (264:48): [True: 47.7k, False: 343k]
  ------------------
  265|  47.7k|                        const ptrdiff_t top_offset = sby * 8 * uv_stride +
  266|  47.7k|                                                     (bx * 4 >> ss_hor);
  267|  47.7k|                        top = &f->lf.cdef_line[tf][pl][top_offset];
  268|  47.7k|                        if (resize) {
  ------------------
  |  Branch (268:29): [True: 5.46k, False: 42.2k]
  ------------------
  269|  5.46k|                            offset = (sby * 4 + 2) * uv_stride + (bx * 4 >> ss_hor);
  270|  5.46k|                            bot = &f->lf.cdef_lpf_line[pl][offset];
  271|  42.2k|                        } else {
  272|  42.2k|                            const int line = sby * (4 << sb128) + 4 * sb128 + 2;
  273|  42.2k|                            offset = line * uv_stride + (bx * 4 >> ss_hor);
  274|  42.2k|                            bot = &f->lf.lr_lpf_line[pl][offset];
  275|  42.2k|                        }
  276|   342k|                    } else {
  277|   368k|                st_uv:;
  278|   368k|                        const ptrdiff_t offset = sby * 8 * uv_stride;
  279|   368k|                        top = &f->lf.cdef_line[tf][pl][have_tt * offset + (bx * 4 >> ss_hor)];
  280|   368k|                        bot = bptrs[pl] + (8 >> ss_ver) * uv_stride;
  281|   368k|                    }
  282|   444k|                    dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1],
  283|   444k|                                         lr_bak[bit][pl], top, bot,
  284|   444k|                                         uv_pri_lvl, uv_sec_lvl, uvdir,
  285|   444k|                                         damping - 1, edges HIGHBD_CALL_SUFFIX);
  286|   444k|                }
  287|       |
  288|   291k|            skip_uv:
  289|   291k|                bit ^= 1;
  290|       |
  291|   312k|            next_b:
  292|   312k|                bptrs[0] += 8;
  293|   312k|                bptrs[1] += 8 >> ss_hor;
  294|   312k|                bptrs[2] += 8 >> ss_hor;
  295|   312k|            }
  296|       |
  297|   612k|        next_sb:
  298|   612k|            iptrs[0] += sbsz * 4;
  299|   612k|            iptrs[1] += sbsz * 4 >> ss_hor;
  300|   612k|            iptrs[2] += sbsz * 4 >> ss_hor;
  301|   612k|        }
  302|       |
  303|   253k|        ptrs[0] += 8 * PXSTRIDE(f->cur.stride[0]);
  ------------------
  |  |   53|   253k|#define PXSTRIDE(x) (x)
  ------------------
  304|   253k|        ptrs[1] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
  ------------------
  |  |   53|   253k|#define PXSTRIDE(x) (x)
  ------------------
  305|   253k|        ptrs[2] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
  ------------------
  |  |   53|   253k|#define PXSTRIDE(x) (x)
  ------------------
  306|   253k|        tc->top_pre_cdef_toggle ^= 1;
  307|   253k|    }
  308|  49.6k|}
cdef_apply_tmpl.c:backup2lines:
   44|   830k|{
   45|   830k|    const ptrdiff_t y_stride = PXSTRIDE(stride[0]);
  ------------------
  |  |   53|   830k|#define PXSTRIDE(x) (x)
  ------------------
   46|   830k|    if (y_stride < 0)
  ------------------
  |  Branch (46:9): [True: 0, False: 830k]
  ------------------
   47|      0|        pixel_copy(dst[0] + y_stride, src[0] + 7 * y_stride, -2 * y_stride);
  ------------------
  |  |   47|      0|#define pixel_copy memcpy
  ------------------
   48|   830k|    else
   49|   830k|        pixel_copy(dst[0], src[0] + 6 * y_stride, 2 * y_stride);
  ------------------
  |  |   47|   830k|#define pixel_copy memcpy
  ------------------
   50|       |
   51|   830k|    if (layout != DAV1D_PIXEL_LAYOUT_I400) {
  ------------------
  |  Branch (51:9): [True: 362k, False: 468k]
  ------------------
   52|   362k|        const ptrdiff_t uv_stride = PXSTRIDE(stride[1]);
  ------------------
  |  |   53|   362k|#define PXSTRIDE(x) (x)
  ------------------
   53|   362k|        if (uv_stride < 0) {
  ------------------
  |  Branch (53:13): [True: 0, False: 362k]
  ------------------
   54|      0|            const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 3 : 7;
  ------------------
  |  Branch (54:32): [True: 0, False: 0]
  ------------------
   55|      0|            pixel_copy(dst[1] + uv_stride, src[1] + uv_off * uv_stride, -2 * uv_stride);
  ------------------
  |  |   47|      0|#define pixel_copy memcpy
  ------------------
   56|      0|            pixel_copy(dst[2] + uv_stride, src[2] + uv_off * uv_stride, -2 * uv_stride);
  ------------------
  |  |   47|      0|#define pixel_copy memcpy
  ------------------
   57|   362k|        } else {
   58|   362k|            const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 2 : 6;
  ------------------
  |  Branch (58:32): [True: 150k, False: 211k]
  ------------------
   59|   362k|            pixel_copy(dst[1], src[1] + uv_off * uv_stride, 2 * uv_stride);
  ------------------
  |  |   47|   362k|#define pixel_copy memcpy
  ------------------
   60|   362k|            pixel_copy(dst[2], src[2] + uv_off * uv_stride, 2 * uv_stride);
  ------------------
  |  |   47|   362k|#define pixel_copy memcpy
  ------------------
   61|   362k|        }
   62|   362k|    }
   63|   830k|}
cdef_apply_tmpl.c:backup2x8:
   70|   558k|{
   71|   558k|    ptrdiff_t y_off = 0;
   72|   558k|    if (flag & BACKUP_2X8_Y) {
  ------------------
  |  Branch (72:9): [True: 485k, False: 73.4k]
  ------------------
   73|  4.35M|        for (int y = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0]))
  ------------------
  |  |   53|  3.87M|#define PXSTRIDE(x) (x)
  ------------------
  |  Branch (73:25): [True: 3.87M, False: 485k]
  ------------------
   74|  3.87M|            pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2);
  ------------------
  |  |   47|  3.87M|#define pixel_copy memcpy
  ------------------
   75|   485k|    }
   76|       |
   77|   558k|    if (layout == DAV1D_PIXEL_LAYOUT_I400 || !(flag & BACKUP_2X8_UV))
  ------------------
  |  Branch (77:9): [True: 86.6k, False: 471k]
  |  Branch (77:46): [True: 33.8k, False: 437k]
  ------------------
   78|   119k|        return;
   79|       |
   80|   438k|    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
   81|   438k|    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
   82|       |
   83|   438k|    x_off >>= ss_hor;
   84|   438k|    y_off = 0;
   85|  3.40M|    for (int y = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) {
  ------------------
  |  |   53|  2.96M|#define PXSTRIDE(x) (x)
  ------------------
  |  Branch (85:21): [True: 2.96M, False: 438k]
  ------------------
   86|  2.96M|        pixel_copy(dst[1][y], &src[1][y_off + x_off - 2], 2);
  ------------------
  |  |   47|  2.96M|#define pixel_copy memcpy
  ------------------
   87|  2.96M|        pixel_copy(dst[2][y], &src[2][y_off + x_off - 2], 2);
  ------------------
  |  |   47|  2.96M|#define pixel_copy memcpy
  ------------------
   88|  2.96M|    }
   89|   438k|}
cdef_apply_tmpl.c:adjust_strength:
   91|   595k|static int adjust_strength(const int strength, const unsigned var) {
   92|   595k|    if (!var) return 0;
  ------------------
  |  Branch (92:9): [True: 323k, False: 272k]
  ------------------
   93|   272k|    const int i = var >> 6 ? imin(ulog2(var >> 6), 12) : 0;
  ------------------
  |  Branch (93:19): [True: 172k, False: 99.9k]
  ------------------
   94|   272k|    return (strength * (4 + i) + 8) >> 4;
   95|   595k|}
dav1d_cdef_brow_16bpc:
  102|   129k|{
  103|   129k|    Dav1dFrameContext *const f = (Dav1dFrameContext *)tc->f;
  104|   129k|    const int bitdepth_min_8 = BITDEPTH == 8 ? 0 : f->cur.p.bpc - 8;
  ------------------
  |  Branch (104:32): [Folded, False: 129k]
  ------------------
  105|   129k|    const Dav1dDSPContext *const dsp = f->dsp;
  106|   129k|    enum CdefEdgeFlags edges = CDEF_HAVE_BOTTOM | (by_start > 0 ? CDEF_HAVE_TOP : 0);
  ------------------
  |  Branch (106:52): [True: 125k, False: 4.29k]
  ------------------
  107|   129k|    pixel *ptrs[3] = { p[0], p[1], p[2] };
  108|   129k|    const int sbsz = 16;
  109|   129k|    const int sb64w = f->sb128w << 1;
  110|   129k|    const int damping = f->frame_hdr->cdef.damping + bitdepth_min_8;
  111|   129k|    const enum Dav1dPixelLayout layout = f->cur.p.layout;
  112|   129k|    const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
  113|   129k|    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
  114|   129k|    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
  115|   129k|    static const uint8_t uv_dirs[2][8] = { { 0, 1, 2, 3, 4, 5, 6, 7 },
  116|   129k|                                           { 7, 0, 2, 4, 5, 6, 6, 6 } };
  117|   129k|    const uint8_t *uv_dir = uv_dirs[layout == DAV1D_PIXEL_LAYOUT_I422];
  118|   129k|    const int have_tt = f->c->n_tc > 1;
  119|   129k|    const int sb128 = f->seq_hdr->sb128;
  120|   129k|    const int resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
  121|   129k|    const ptrdiff_t y_stride = PXSTRIDE(f->cur.stride[0]);
  122|   129k|    const ptrdiff_t uv_stride = PXSTRIDE(f->cur.stride[1]);
  123|       |
  124|   797k|    for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) {
  ------------------
  |  Branch (124:38): [True: 667k, False: 129k]
  ------------------
  125|   667k|        const int tf = tc->top_pre_cdef_toggle;
  126|   667k|        const int by_idx = (by & 30) >> 1;
  127|   667k|        if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
  ------------------
  |  Branch (127:13): [True: 2.95k, False: 664k]
  ------------------
  128|       |
  129|   667k|        if ((!have_tt || sbrow_start || by + 2 < by_end) &&
  ------------------
  |  Branch (129:14): [True: 34.9k, False: 632k]
  |  Branch (129:26): [True: 59.5k, False: 573k]
  |  Branch (129:41): [True: 509k, False: 63.4k]
  ------------------
  130|   604k|            edges & CDEF_HAVE_BOTTOM)
  ------------------
  |  Branch (130:13): [True: 604k, False: 263]
  ------------------
  131|   604k|        {
  132|       |            // backup pre-filter data for next iteration
  133|   604k|            pixel *const cdef_top_bak[3] = {
  134|   604k|                f->lf.cdef_line[!tf][0] + have_tt * sby * 4 * y_stride,
  135|   604k|                f->lf.cdef_line[!tf][1] + have_tt * sby * 8 * uv_stride,
  136|   604k|                f->lf.cdef_line[!tf][2] + have_tt * sby * 8 * uv_stride
  137|   604k|            };
  138|   604k|            backup2lines(cdef_top_bak, ptrs, f->cur.stride, layout);
  139|   604k|        }
  140|       |
  141|   667k|        ALIGN_STK_16(pixel, lr_bak, 2 /* idx */, [3 /* plane */][8 /* y */][2 /* x */]);
  ------------------
  |  |  100|   667k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|   667k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  142|   667k|        pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
  143|   667k|        edges &= ~CDEF_HAVE_LEFT;
  144|   667k|        edges |= CDEF_HAVE_RIGHT;
  145|   667k|        enum Backup2x8Flags prev_flag = 0;
  146|  2.04M|        for (int sbx = 0; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) {
  ------------------
  |  Branch (146:27): [True: 1.37M, False: 667k]
  ------------------
  147|  1.37M|            const int sb128x = sbx >> 1;
  148|  1.37M|            const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
  149|  1.37M|            const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
  150|  1.37M|            if (cdef_idx == -1 ||
  ------------------
  |  Branch (150:17): [True: 710k, False: 665k]
  ------------------
  151|   665k|                (!f->frame_hdr->cdef.y_strength[cdef_idx] &&
  ------------------
  |  Branch (151:18): [True: 143k, False: 521k]
  ------------------
  152|   143k|                 !f->frame_hdr->cdef.uv_strength[cdef_idx]))
  ------------------
  |  Branch (152:18): [True: 139k, False: 4.34k]
  ------------------
  153|   850k|            {
  154|   850k|                prev_flag = 0;
  155|   850k|                goto next_sb;
  156|   850k|            }
  157|       |
  158|       |            // Create a complete 32-bit mask for the sb row ahead of time.
  159|   526k|            const uint16_t (*noskip_row)[2] = &lflvl[sb128x].noskip_mask[by_idx];
  160|   526k|            const unsigned noskip_mask = (unsigned) noskip_row[0][1] << 16 |
  161|   526k|                                                    noskip_row[0][0];
  162|       |
  163|   526k|            const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx];
  164|   526k|            const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
  165|   526k|            const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1);
  166|       |
  167|   526k|            const int y_pri_lvl = (y_lvl >> 2) << bitdepth_min_8;
  168|   526k|            int y_sec_lvl = y_lvl & 3;
  169|   526k|            y_sec_lvl += y_sec_lvl == 3;
  170|   526k|            y_sec_lvl <<= bitdepth_min_8;
  171|       |
  172|   526k|            const int uv_pri_lvl = (uv_lvl >> 2) << bitdepth_min_8;
  173|   526k|            int uv_sec_lvl = uv_lvl & 3;
  174|   526k|            uv_sec_lvl += uv_sec_lvl == 3;
  175|   526k|            uv_sec_lvl <<= bitdepth_min_8;
  176|       |
  177|   526k|            pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
  178|  1.03M|            for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
  ------------------
  |  Branch (178:39): [True: 506k, False: 526k]
  ------------------
  179|   526k|                 bx += 2, edges |= CDEF_HAVE_LEFT)
  180|   506k|            {
  181|   506k|                if (bx + 2 >= f->bw) edges &= ~CDEF_HAVE_RIGHT;
  ------------------
  |  Branch (181:21): [True: 176k, False: 329k]
  ------------------
  182|       |
  183|       |                // check if this 8x8 block had any coded coefficients; if not,
  184|       |                // go to the next block
  185|   506k|                const uint32_t bx_mask = 3U << (bx & 30);
  186|   506k|                if (!(noskip_mask & bx_mask)) {
  ------------------
  |  Branch (186:21): [True: 87.7k, False: 418k]
  ------------------
  187|  87.7k|                    prev_flag = 0;
  188|  87.7k|                    goto next_b;
  189|  87.7k|                }
  190|   418k|                const enum Backup2x8Flags do_left = (prev_flag ^ flag) & flag;
  191|   418k|                prev_flag = flag;
  192|   418k|                if (do_left && edges & CDEF_HAVE_LEFT) {
  ------------------
  |  Branch (192:21): [True: 130k, False: 288k]
  |  Branch (192:32): [True: 5.23k, False: 125k]
  ------------------
  193|       |                    // we didn't backup the prefilter data because it wasn't
  194|       |                    // there, so do it here instead
  195|  5.23k|                    backup2x8(lr_bak[bit], bptrs, f->cur.stride, 0, layout, do_left);
  196|  5.23k|                }
  197|   418k|                if (edges & CDEF_HAVE_RIGHT) {
  ------------------
  |  Branch (197:21): [True: 296k, False: 122k]
  ------------------
  198|       |                    // backup pre-filter data for next iteration
  199|   296k|                    backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout, flag);
  200|   296k|                }
  201|       |
  202|   418k|                int dir;
  203|   418k|                unsigned variance;
  204|   418k|                if (y_pri_lvl || uv_pri_lvl)
  ------------------
  |  Branch (204:21): [True: 375k, False: 43.8k]
  |  Branch (204:34): [True: 38.6k, False: 5.19k]
  ------------------
  205|   413k|                    dir = dsp->cdef.dir(bptrs[0], f->cur.stride[0],
  206|   413k|                                        &variance HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|   413k|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
  207|       |
  208|   418k|                const pixel *top, *bot;
  209|   418k|                ptrdiff_t offset;
  210|       |
  211|   418k|                if (!have_tt) goto st_y;
  ------------------
  |  Branch (211:21): [True: 35.3k, False: 383k]
  ------------------
  212|   383k|                if (sbrow_start && by == by_start) {
  ------------------
  |  Branch (212:21): [True: 27.8k, False: 355k]
  |  Branch (212:36): [True: 27.8k, False: 6]
  ------------------
  213|  27.8k|                    if (resize) {
  ------------------
  |  Branch (213:25): [True: 5.88k, False: 21.9k]
  ------------------
  214|  5.88k|                        offset = (sby - 1) * 4 * y_stride + bx * 4;
  215|  5.88k|                        top = &f->lf.cdef_lpf_line[0][offset];
  216|  21.9k|                    } else {
  217|  21.9k|                        offset = (sby * (4 << sb128) - 4) * y_stride + bx * 4;
  218|  21.9k|                        top = &f->lf.lr_lpf_line[0][offset];
  219|  21.9k|                    }
  220|  27.8k|                    bot = bptrs[0] + 8 * y_stride;
  221|   355k|                } else if (!sbrow_start && by + 2 >= by_end) {
  ------------------
  |  Branch (221:28): [True: 355k, False: 18.4E]
  |  Branch (221:44): [True: 45.3k, False: 310k]
  ------------------
  222|  45.3k|                    top = &f->lf.cdef_line[tf][0][sby * 4 * y_stride + bx * 4];
  223|  45.3k|                    if (resize) {
  ------------------
  |  Branch (223:25): [True: 7.56k, False: 37.8k]
  ------------------
  224|  7.56k|                        offset = (sby * 4 + 2) * y_stride + bx * 4;
  225|  7.56k|                        bot = &f->lf.cdef_lpf_line[0][offset];
  226|  37.8k|                    } else {
  227|  37.8k|                        const int line = sby * (4 << sb128) + 4 * sb128 + 2;
  228|  37.8k|                        offset = line * y_stride + bx * 4;
  229|  37.8k|                        bot = &f->lf.lr_lpf_line[0][offset];
  230|  37.8k|                    }
  231|   310k|                } else {
  232|   345k|            st_y:;
  233|   345k|                    offset = sby * 4 * y_stride;
  234|   345k|                    top = &f->lf.cdef_line[tf][0][have_tt * offset + bx * 4];
  235|   345k|                    bot = bptrs[0] + 8 * y_stride;
  236|   345k|                }
  237|   419k|                if (y_pri_lvl) {
  ------------------
  |  Branch (237:21): [True: 375k, False: 43.9k]
  ------------------
  238|   375k|                    const int adj_y_pri_lvl = adjust_strength(y_pri_lvl, variance);
  239|   375k|                    if (adj_y_pri_lvl || y_sec_lvl)
  ------------------
  |  Branch (239:25): [True: 211k, False: 163k]
  |  Branch (239:42): [True: 117k, False: 45.9k]
  ------------------
  240|   329k|                        dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
  241|   329k|                                        top, bot, adj_y_pri_lvl, y_sec_lvl,
  242|   329k|                                        dir, damping, edges HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|   329k|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
  243|   375k|                } else if (y_sec_lvl)
  ------------------
  |  Branch (243:28): [True: 19.6k, False: 24.2k]
  ------------------
  244|  19.6k|                    dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
  245|  19.6k|                                    top, bot, 0, y_sec_lvl, 0, damping,
  246|  19.6k|                                    edges HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|  19.6k|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
  247|       |
  248|   419k|                if (!uv_lvl) goto skip_uv;
  ------------------
  |  Branch (248:21): [True: 125k, False: 293k]
  ------------------
  249|   293k|                assert(layout != DAV1D_PIXEL_LAYOUT_I400);
  ------------------
  |  |  140|   293k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 293k]
  |  |  |  Branch (140:68): [Folded, False: 293k]
  |  |  ------------------
  ------------------
  250|       |
  251|   293k|                const int uvdir = uv_pri_lvl ? uv_dir[dir] : 0;
  ------------------
  |  Branch (251:35): [True: 284k, False: 9.13k]
  ------------------
  252|   880k|                for (int pl = 1; pl <= 2; pl++) {
  ------------------
  |  Branch (252:34): [True: 586k, False: 294k]
  ------------------
  253|   586k|                    if (!have_tt) goto st_uv;
  ------------------
  |  Branch (253:25): [True: 47.2k, False: 539k]
  ------------------
  254|   539k|                    if (sbrow_start && by == by_start) {
  ------------------
  |  Branch (254:25): [True: 37.6k, False: 501k]
  |  Branch (254:40): [True: 37.6k, False: 0]
  ------------------
  255|  37.6k|                        if (resize) {
  ------------------
  |  Branch (255:29): [True: 10.8k, False: 26.7k]
  ------------------
  256|  10.8k|                            offset = (sby - 1) * 4 * uv_stride + (bx * 4 >> ss_hor);
  257|  10.8k|                            top = &f->lf.cdef_lpf_line[pl][offset];
  258|  26.7k|                        } else {
  259|  26.7k|                            const int line = sby * (4 << sb128) - 4;
  260|  26.7k|                            offset = line * uv_stride + (bx * 4 >> ss_hor);
  261|  26.7k|                            top = &f->lf.lr_lpf_line[pl][offset];
  262|  26.7k|                        }
  263|  37.6k|                        bot = bptrs[pl] + (8 >> ss_ver) * uv_stride;
  264|   501k|                    } else if (!sbrow_start && by + 2 >= by_end) {
  ------------------
  |  Branch (264:32): [True: 501k, False: 18.4E]
  |  Branch (264:48): [True: 67.2k, False: 434k]
  ------------------
  265|  67.2k|                        const ptrdiff_t top_offset = sby * 8 * uv_stride +
  266|  67.2k|                                                     (bx * 4 >> ss_hor);
  267|  67.2k|                        top = &f->lf.cdef_line[tf][pl][top_offset];
  268|  67.2k|                        if (resize) {
  ------------------
  |  Branch (268:29): [True: 13.3k, False: 53.8k]
  ------------------
  269|  13.3k|                            offset = (sby * 4 + 2) * uv_stride + (bx * 4 >> ss_hor);
  270|  13.3k|                            bot = &f->lf.cdef_lpf_line[pl][offset];
  271|  53.8k|                        } else {
  272|  53.8k|                            const int line = sby * (4 << sb128) + 4 * sb128 + 2;
  273|  53.8k|                            offset = line * uv_stride + (bx * 4 >> ss_hor);
  274|  53.8k|                            bot = &f->lf.lr_lpf_line[pl][offset];
  275|  53.8k|                        }
  276|   434k|                    } else {
  277|   481k|                st_uv:;
  278|   481k|                        const ptrdiff_t offset = sby * 8 * uv_stride;
  279|   481k|                        top = &f->lf.cdef_line[tf][pl][have_tt * offset + (bx * 4 >> ss_hor)];
  280|   481k|                        bot = bptrs[pl] + (8 >> ss_ver) * uv_stride;
  281|   481k|                    }
  282|   586k|                    dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1],
  283|   586k|                                         lr_bak[bit][pl], top, bot,
  284|   586k|                                         uv_pri_lvl, uv_sec_lvl, uvdir,
  285|   586k|                                         damping - 1, edges HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|   586k|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
  286|   586k|                }
  287|       |
  288|   418k|            skip_uv:
  289|   418k|                bit ^= 1;
  290|       |
  291|   506k|            next_b:
  292|   506k|                bptrs[0] += 8;
  293|   506k|                bptrs[1] += 8 >> ss_hor;
  294|   506k|                bptrs[2] += 8 >> ss_hor;
  295|   506k|            }
  296|       |
  297|  1.37M|        next_sb:
  298|  1.37M|            iptrs[0] += sbsz * 4;
  299|  1.37M|            iptrs[1] += sbsz * 4 >> ss_hor;
  300|  1.37M|            iptrs[2] += sbsz * 4 >> ss_hor;
  301|  1.37M|        }
  302|       |
  303|   667k|        ptrs[0] += 8 * PXSTRIDE(f->cur.stride[0]);
  304|   667k|        ptrs[1] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
  305|   667k|        ptrs[2] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
  306|   667k|        tc->top_pre_cdef_toggle ^= 1;
  307|   667k|    }
  308|   129k|}

dav1d_cdef_dsp_init_8bpc:
  320|  7.82k|COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
  321|  7.82k|    c->dir = cdef_find_dir_c;
  322|  7.82k|    c->fb[0] = cdef_filter_block_8x8_c;
  323|  7.82k|    c->fb[1] = cdef_filter_block_4x8_c;
  324|  7.82k|    c->fb[2] = cdef_filter_block_4x4_c;
  325|       |
  326|  7.82k|#if HAVE_ASM
  327|       |#if ARCH_AARCH64 || ARCH_ARM
  328|       |    cdef_dsp_init_arm(c);
  329|       |#elif ARCH_PPC64LE
  330|       |    cdef_dsp_init_ppc(c);
  331|       |#elif ARCH_RISCV
  332|       |    cdef_dsp_init_riscv(c);
  333|       |#elif ARCH_X86
  334|       |    cdef_dsp_init_x86(c);
  335|       |#elif ARCH_LOONGARCH64
  336|       |    cdef_dsp_init_loongarch(c);
  337|       |#endif
  338|  7.82k|#endif
  339|  7.82k|}
dav1d_cdef_dsp_init_16bpc:
  320|  7.63k|COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
  321|  7.63k|    c->dir = cdef_find_dir_c;
  322|  7.63k|    c->fb[0] = cdef_filter_block_8x8_c;
  323|  7.63k|    c->fb[1] = cdef_filter_block_4x8_c;
  324|  7.63k|    c->fb[2] = cdef_filter_block_4x4_c;
  325|       |
  326|  7.63k|#if HAVE_ASM
  327|       |#if ARCH_AARCH64 || ARCH_ARM
  328|       |    cdef_dsp_init_arm(c);
  329|       |#elif ARCH_PPC64LE
  330|       |    cdef_dsp_init_ppc(c);
  331|       |#elif ARCH_RISCV
  332|       |    cdef_dsp_init_riscv(c);
  333|       |#elif ARCH_X86
  334|       |    cdef_dsp_init_x86(c);
  335|       |#elif ARCH_LOONGARCH64
  336|       |    cdef_dsp_init_loongarch(c);
  337|       |#endif
  338|  7.63k|#endif
  339|  7.63k|}

dav1d_cdf_thread_update:
 3918|  7.22k|{
 3919|  7.22k|#define update_cdf_1d(n1d, name) \
 3920|  7.22k|    do { \
 3921|  7.22k|        dst->name[n1d] = 0; \
 3922|  7.22k|    } while (0)
 3923|  7.22k|#define update_cdf_2d(n1d, n2d, name) \
 3924|  7.22k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
 3925|  7.22k|#define update_cdf_3d(n1d, n2d, n3d, name) \
 3926|  7.22k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
 3927|  7.22k|#define update_cdf_4d(n1d, n2d, n3d, n4d, name) \
 3928|  7.22k|    for (int l = 0; l < (n1d); l++) update_cdf_3d(n2d, n3d, n4d, name[l])
 3929|       |
 3930|  7.22k|    memcpy(dst, src, offsetof(CdfContext, m.intrabc));
 3931|       |
 3932|  7.22k|    update_cdf_3d(2, 2, 4, coef.eob_bin_16);
  ------------------
  |  | 3926|  21.6k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|  43.3k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|  28.8k|    do { \
  |  |  |  |  |  | 3921|  28.8k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|  28.8k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 28.8k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 28.8k, False: 14.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 14.4k, False: 7.22k]
  |  |  ------------------
  ------------------
 3933|  7.22k|    update_cdf_3d(2, 2, 5, coef.eob_bin_32);
  ------------------
  |  | 3926|  21.6k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|  43.3k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|  28.8k|    do { \
  |  |  |  |  |  | 3921|  28.8k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|  28.8k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 28.8k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 28.8k, False: 14.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 14.4k, False: 7.22k]
  |  |  ------------------
  ------------------
 3934|  7.22k|    update_cdf_3d(2, 2, 6, coef.eob_bin_64);
  ------------------
  |  | 3926|  21.6k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|  43.3k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|  28.8k|    do { \
  |  |  |  |  |  | 3921|  28.8k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|  28.8k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 28.8k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 28.8k, False: 14.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 14.4k, False: 7.22k]
  |  |  ------------------
  ------------------
 3935|  7.22k|    update_cdf_3d(2, 2, 7, coef.eob_bin_128);
  ------------------
  |  | 3926|  21.6k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|  43.3k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|  28.8k|    do { \
  |  |  |  |  |  | 3921|  28.8k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|  28.8k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 28.8k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 28.8k, False: 14.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 14.4k, False: 7.22k]
  |  |  ------------------
  ------------------
 3936|  7.22k|    update_cdf_3d(2, 2, 8, coef.eob_bin_256);
  ------------------
  |  | 3926|  21.6k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|  43.3k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|  28.8k|    do { \
  |  |  |  |  |  | 3921|  28.8k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|  28.8k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 28.8k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 28.8k, False: 14.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 14.4k, False: 7.22k]
  |  |  ------------------
  ------------------
 3937|  7.22k|    update_cdf_2d(2, 9, coef.eob_bin_512);
  ------------------
  |  | 3924|  21.6k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  14.4k|    do { \
  |  |  |  | 3921|  14.4k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  14.4k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 14.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 14.4k, False: 7.22k]
  |  |  ------------------
  ------------------
 3938|  7.22k|    update_cdf_2d(2, 10, coef.eob_bin_1024);
  ------------------
  |  | 3924|  21.6k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  14.4k|    do { \
  |  |  |  | 3921|  14.4k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  14.4k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 14.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 14.4k, False: 7.22k]
  |  |  ------------------
  ------------------
 3939|  7.22k|    update_cdf_4d(N_TX_SIZES, 2, 4, 2, coef.eob_base_tok);
  ------------------
  |  | 3928|  43.3k|    for (int l = 0; l < (n1d); l++) update_cdf_3d(n2d, n3d, n4d, name[l])
  |  |  ------------------
  |  |  |  | 3926|   108k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3924|   361k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3920|   288k|    do { \
  |  |  |  |  |  |  |  | 3921|   288k|        dst->name[n1d] = 0; \
  |  |  |  |  |  |  |  | 3922|   288k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 288k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3924:21): [True: 288k, False: 72.2k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3926:21): [True: 72.2k, False: 36.1k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3928:21): [True: 36.1k, False: 7.22k]
  |  |  ------------------
  ------------------
 3940|  7.22k|    update_cdf_4d(N_TX_SIZES, 2, 41 /*42*/, 3, coef.base_tok);
  ------------------
  |  | 3928|  43.3k|    for (int l = 0; l < (n1d); l++) update_cdf_3d(n2d, n3d, n4d, name[l])
  |  |  ------------------
  |  |  |  | 3926|   108k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3924|  3.03M|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3920|  2.96M|    do { \
  |  |  |  |  |  |  |  | 3921|  2.96M|        dst->name[n1d] = 0; \
  |  |  |  |  |  |  |  | 3922|  2.96M|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 2.96M]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3924:21): [True: 2.96M, False: 72.2k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3926:21): [True: 72.2k, False: 36.1k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3928:21): [True: 36.1k, False: 7.22k]
  |  |  ------------------
  ------------------
 3941|  7.22k|    update_cdf_4d(4, 2, 21, 3, coef.br_tok);
  ------------------
  |  | 3928|  36.1k|    for (int l = 0; l < (n1d); l++) update_cdf_3d(n2d, n3d, n4d, name[l])
  |  |  ------------------
  |  |  |  | 3926|  86.6k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3924|  1.27M|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3920|  1.21M|    do { \
  |  |  |  |  |  |  |  | 3921|  1.21M|        dst->name[n1d] = 0; \
  |  |  |  |  |  |  |  | 3922|  1.21M|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 1.21M]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3924:21): [True: 1.21M, False: 57.7k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3926:21): [True: 57.7k, False: 28.8k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3928:21): [True: 28.8k, False: 7.22k]
  |  |  ------------------
  ------------------
 3942|  7.22k|    update_cdf_4d(N_TX_SIZES, 2, 9, 1, coef.eob_hi_bit);
  ------------------
  |  | 3928|  43.3k|    for (int l = 0; l < (n1d); l++) update_cdf_3d(n2d, n3d, n4d, name[l])
  |  |  ------------------
  |  |  |  | 3926|   108k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3924|   722k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3920|   649k|    do { \
  |  |  |  |  |  |  |  | 3921|   649k|        dst->name[n1d] = 0; \
  |  |  |  |  |  |  |  | 3922|   649k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 649k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3924:21): [True: 649k, False: 72.2k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3926:21): [True: 72.2k, False: 36.1k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3928:21): [True: 36.1k, False: 7.22k]
  |  |  ------------------
  ------------------
 3943|  7.22k|    update_cdf_3d(N_TX_SIZES, 13, 1, coef.skip);
  ------------------
  |  | 3926|  43.3k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|   505k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|   469k|    do { \
  |  |  |  |  |  | 3921|   469k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|   469k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 469k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 469k, False: 36.1k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 36.1k, False: 7.22k]
  |  |  ------------------
  ------------------
 3944|  7.22k|    update_cdf_3d(2, 3, 1, coef.dc_sign);
  ------------------
  |  | 3926|  21.6k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|  57.7k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|  43.3k|    do { \
  |  |  |  |  |  | 3921|  43.3k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|  43.3k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 43.3k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 43.3k, False: 14.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 14.4k, False: 7.22k]
  |  |  ------------------
  ------------------
 3945|       |
 3946|  7.22k|    update_cdf_3d(2, N_INTRA_PRED_MODES, N_UV_INTRA_PRED_MODES - 1 - !k, m.uv_mode);
  ------------------
  |  | 3926|  21.6k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|   202k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|   187k|    do { \
  |  |  |  |  |  | 3921|   187k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|   187k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 187k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 187k, False: 14.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 14.4k, False: 7.22k]
  |  |  ------------------
  ------------------
 3947|  7.22k|    update_cdf_2d(4, N_PARTITIONS - 3, m.partition[BL_128X128]);
  ------------------
  |  | 3924|  36.1k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  28.8k|    do { \
  |  |  |  | 3921|  28.8k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  28.8k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 28.8k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 28.8k, False: 7.22k]
  |  |  ------------------
  ------------------
 3948|  28.8k|    for (int k = BL_64X64; k < BL_8X8; k++)
  ------------------
  |  Branch (3948:28): [True: 21.6k, False: 7.22k]
  ------------------
 3949|  21.6k|        update_cdf_2d(4, N_PARTITIONS - 1, m.partition[k]);
  ------------------
  |  | 3924|   108k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  86.6k|    do { \
  |  |  |  | 3921|  86.6k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  86.6k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 86.6k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 86.6k, False: 21.6k]
  |  |  ------------------
  ------------------
 3950|  7.22k|    update_cdf_2d(4, N_SUB8X8_PARTITIONS - 1, m.partition[BL_8X8]);
  ------------------
  |  | 3924|  36.1k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  28.8k|    do { \
  |  |  |  | 3921|  28.8k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  28.8k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 28.8k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 28.8k, False: 7.22k]
  |  |  ------------------
  ------------------
 3951|  7.22k|    update_cdf_2d(6, 15, m.cfl_alpha);
  ------------------
  |  | 3924|  50.5k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  43.3k|    do { \
  |  |  |  | 3921|  43.3k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  43.3k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 43.3k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 43.3k, False: 7.22k]
  |  |  ------------------
  ------------------
 3952|  7.22k|    update_cdf_2d(2, 15, m.txtp_inter1);
  ------------------
  |  | 3924|  21.6k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  14.4k|    do { \
  |  |  |  | 3921|  14.4k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  14.4k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 14.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 14.4k, False: 7.22k]
  |  |  ------------------
  ------------------
 3953|  7.22k|    update_cdf_1d(11, m.txtp_inter2);
  ------------------
  |  | 3920|  7.22k|    do { \
  |  | 3921|  7.22k|        dst->name[n1d] = 0; \
  |  | 3922|  7.22k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (3922:14): [Folded, False: 7.22k]
  |  |  ------------------
  ------------------
 3954|  7.22k|    update_cdf_3d(2, N_INTRA_PRED_MODES, 6, m.txtp_intra1);
  ------------------
  |  | 3926|  21.6k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|   202k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|   187k|    do { \
  |  |  |  |  |  | 3921|   187k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|   187k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 187k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 187k, False: 14.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 14.4k, False: 7.22k]
  |  |  ------------------
  ------------------
 3955|  7.22k|    update_cdf_3d(3, N_INTRA_PRED_MODES, 4, m.txtp_intra2);
  ------------------
  |  | 3926|  28.8k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|   303k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|   281k|    do { \
  |  |  |  |  |  | 3921|   281k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|   281k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 281k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 281k, False: 21.6k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 21.6k, False: 7.22k]
  |  |  ------------------
  ------------------
 3956|  7.22k|    update_cdf_1d(7, m.cfl_sign);
  ------------------
  |  | 3920|  7.22k|    do { \
  |  | 3921|  7.22k|        dst->name[n1d] = 0; \
  |  | 3922|  7.22k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (3922:14): [Folded, False: 7.22k]
  |  |  ------------------
  ------------------
 3957|  7.22k|    update_cdf_2d(8, 6, m.angle_delta);
  ------------------
  |  | 3924|  64.9k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  57.7k|    do { \
  |  |  |  | 3921|  57.7k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  57.7k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 57.7k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 57.7k, False: 7.22k]
  |  |  ------------------
  ------------------
 3958|  7.22k|    update_cdf_1d(4, m.filter_intra);
  ------------------
  |  | 3920|  7.22k|    do { \
  |  | 3921|  7.22k|        dst->name[n1d] = 0; \
  |  | 3922|  7.22k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (3922:14): [Folded, False: 7.22k]
  |  |  ------------------
  ------------------
 3959|  7.22k|    update_cdf_2d(3, DAV1D_MAX_SEGMENTS - 1, m.seg_id);
  ------------------
  |  | 3924|  28.8k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  21.6k|    do { \
  |  |  |  | 3921|  21.6k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  21.6k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 21.6k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 21.6k, False: 7.22k]
  |  |  ------------------
  ------------------
 3960|  7.22k|    update_cdf_3d(2, 7, 6, m.pal_sz);
  ------------------
  |  | 3926|  21.6k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|   115k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|   101k|    do { \
  |  |  |  |  |  | 3921|   101k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|   101k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 101k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 101k, False: 14.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 14.4k, False: 7.22k]
  |  |  ------------------
  ------------------
 3961|  7.22k|    update_cdf_4d(2, 7, 5, k + 1, m.color_map);
  ------------------
  |  | 3928|  21.6k|    for (int l = 0; l < (n1d); l++) update_cdf_3d(n2d, n3d, n4d, name[l])
  |  |  ------------------
  |  |  |  | 3926|   115k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3924|   606k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  | 3920|   505k|    do { \
  |  |  |  |  |  |  |  | 3921|   505k|        dst->name[n1d] = 0; \
  |  |  |  |  |  |  |  | 3922|   505k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 505k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3924:21): [True: 505k, False: 101k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3926:21): [True: 101k, False: 14.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3928:21): [True: 14.4k, False: 7.22k]
  |  |  ------------------
  ------------------
 3962|  7.22k|    update_cdf_3d(N_TX_SIZES - 1, 3, imin(k + 1, 2), m.txsz);
  ------------------
  |  | 3926|  36.1k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|   115k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|  86.6k|    do { \
  |  |  |  |  |  | 3921|  86.6k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|  86.6k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 86.6k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 86.6k, False: 28.8k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 28.8k, False: 7.22k]
  |  |  ------------------
  ------------------
 3963|  7.22k|    update_cdf_1d(3, m.delta_q);
  ------------------
  |  | 3920|  7.22k|    do { \
  |  | 3921|  7.22k|        dst->name[n1d] = 0; \
  |  | 3922|  7.22k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (3922:14): [Folded, False: 7.22k]
  |  |  ------------------
  ------------------
 3964|  7.22k|    update_cdf_2d(5, 3, m.delta_lf);
  ------------------
  |  | 3924|  43.3k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  36.1k|    do { \
  |  |  |  | 3921|  36.1k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  36.1k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 36.1k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 36.1k, False: 7.22k]
  |  |  ------------------
  ------------------
 3965|  7.22k|    update_cdf_1d(2, m.restore_switchable);
  ------------------
  |  | 3920|  7.22k|    do { \
  |  | 3921|  7.22k|        dst->name[n1d] = 0; \
  |  | 3922|  7.22k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (3922:14): [Folded, False: 7.22k]
  |  |  ------------------
  ------------------
 3966|  7.22k|    update_cdf_1d(1, m.restore_wiener);
  ------------------
  |  | 3920|  7.22k|    do { \
  |  | 3921|  7.22k|        dst->name[n1d] = 0; \
  |  | 3922|  7.22k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (3922:14): [Folded, False: 7.22k]
  |  |  ------------------
  ------------------
 3967|  7.22k|    update_cdf_1d(1, m.restore_sgrproj);
  ------------------
  |  | 3920|  7.22k|    do { \
  |  | 3921|  7.22k|        dst->name[n1d] = 0; \
  |  | 3922|  7.22k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (3922:14): [Folded, False: 7.22k]
  |  |  ------------------
  ------------------
 3968|  7.22k|    update_cdf_2d(4, 1, m.txtp_inter3);
  ------------------
  |  | 3924|  36.1k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  28.8k|    do { \
  |  |  |  | 3921|  28.8k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  28.8k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 28.8k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 28.8k, False: 7.22k]
  |  |  ------------------
  ------------------
 3969|  7.22k|    update_cdf_2d(N_BS_SIZES, 1, m.use_filter_intra);
  ------------------
  |  | 3924|   166k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|   158k|    do { \
  |  |  |  | 3921|   158k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|   158k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 158k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 158k, False: 7.22k]
  |  |  ------------------
  ------------------
 3970|  7.22k|    update_cdf_3d(7, 3, 1, m.txpart);
  ------------------
  |  | 3926|  57.7k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|   202k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|   151k|    do { \
  |  |  |  |  |  | 3921|   151k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|   151k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 151k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 151k, False: 50.5k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 50.5k, False: 7.22k]
  |  |  ------------------
  ------------------
 3971|  7.22k|    update_cdf_2d(3, 1, m.skip);
  ------------------
  |  | 3924|  28.8k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  21.6k|    do { \
  |  |  |  | 3921|  21.6k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  21.6k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 21.6k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 21.6k, False: 7.22k]
  |  |  ------------------
  ------------------
 3972|  7.22k|    update_cdf_3d(7, 3, 1, m.pal_y);
  ------------------
  |  | 3926|  57.7k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|   202k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|   151k|    do { \
  |  |  |  |  |  | 3921|   151k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|   151k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 151k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 151k, False: 50.5k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 50.5k, False: 7.22k]
  |  |  ------------------
  ------------------
 3973|  7.22k|    update_cdf_2d(2, 1, m.pal_uv);
  ------------------
  |  | 3924|  21.6k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  14.4k|    do { \
  |  |  |  | 3921|  14.4k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  14.4k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 14.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 14.4k, False: 7.22k]
  |  |  ------------------
  ------------------
 3974|       |
 3975|  7.22k|    if (IS_KEY_OR_INTRA(hdr))
  ------------------
  |  |   43|  7.22k|    (!IS_INTER_OR_SWITCH(frame_header))
  |  |  ------------------
  |  |  |  |   36|  7.22k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (43:5): [True: 5.06k, False: 2.15k]
  |  |  ------------------
  ------------------
 3976|  5.06k|        return;
 3977|       |
 3978|  2.15k|    memcpy(dst->m.y_mode, src->m.y_mode,
 3979|  2.15k|           offsetof(CdfContext, kfym) - offsetof(CdfContext, m.y_mode));
 3980|       |
 3981|  2.15k|    update_cdf_2d(4, N_INTRA_PRED_MODES - 1, m.y_mode);
  ------------------
  |  | 3924|  10.7k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  10.7k|    do { \
  |  |  |  | 3921|  8.63k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  8.63k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 8.63k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 8.63k, False: 2.15k]
  |  |  ------------------
  ------------------
 3982|  2.15k|    update_cdf_2d(9, 15, m.wedge_idx);
  ------------------
  |  | 3924|  21.5k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  21.5k|    do { \
  |  |  |  | 3921|  19.4k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  19.4k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 19.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 19.4k, False: 2.15k]
  |  |  ------------------
  ------------------
 3983|  2.15k|    update_cdf_2d(8, N_COMP_INTER_PRED_MODES - 1, m.comp_inter_mode);
  ------------------
  |  | 3924|  19.4k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  19.4k|    do { \
  |  |  |  | 3921|  17.2k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  17.2k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 17.2k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 17.2k, False: 2.15k]
  |  |  ------------------
  ------------------
 3984|  2.15k|    update_cdf_3d(2, 8, DAV1D_N_SWITCHABLE_FILTERS - 1, m.filter);
  ------------------
  |  | 3926|  6.47k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|  38.8k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|  36.7k|    do { \
  |  |  |  |  |  | 3921|  34.5k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|  34.5k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 34.5k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 34.5k, False: 4.31k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 4.31k, False: 2.15k]
  |  |  ------------------
  ------------------
 3985|  2.15k|    update_cdf_2d(4, 3, m.interintra_mode);
  ------------------
  |  | 3924|  10.7k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  10.7k|    do { \
  |  |  |  | 3921|  8.63k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  8.63k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 8.63k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 8.63k, False: 2.15k]
  |  |  ------------------
  ------------------
 3986|  2.15k|    update_cdf_2d(N_BS_SIZES, 2, m.motion_mode);
  ------------------
  |  | 3924|  49.6k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  49.6k|    do { \
  |  |  |  | 3921|  47.4k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  47.4k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 47.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 47.4k, False: 2.15k]
  |  |  ------------------
  ------------------
 3987|  2.15k|    update_cdf_2d(3, 1, m.skip_mode);
  ------------------
  |  | 3924|  8.63k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  8.63k|    do { \
  |  |  |  | 3921|  6.47k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  6.47k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 6.47k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 6.47k, False: 2.15k]
  |  |  ------------------
  ------------------
 3988|  2.15k|    update_cdf_2d(6, 1, m.newmv_mode);
  ------------------
  |  | 3924|  15.1k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  15.1k|    do { \
  |  |  |  | 3921|  12.9k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  12.9k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 12.9k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 12.9k, False: 2.15k]
  |  |  ------------------
  ------------------
 3989|  2.15k|    update_cdf_2d(2, 1, m.globalmv_mode);
  ------------------
  |  | 3924|  6.47k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  6.47k|    do { \
  |  |  |  | 3921|  4.31k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  4.31k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 4.31k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 4.31k, False: 2.15k]
  |  |  ------------------
  ------------------
 3990|  2.15k|    update_cdf_2d(6, 1, m.refmv_mode);
  ------------------
  |  | 3924|  15.1k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  15.1k|    do { \
  |  |  |  | 3921|  12.9k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  12.9k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 12.9k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 12.9k, False: 2.15k]
  |  |  ------------------
  ------------------
 3991|  2.15k|    update_cdf_2d(3, 1, m.drl_bit);
  ------------------
  |  | 3924|  8.63k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  8.63k|    do { \
  |  |  |  | 3921|  6.47k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  6.47k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 6.47k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 6.47k, False: 2.15k]
  |  |  ------------------
  ------------------
 3992|  2.15k|    update_cdf_2d(4, 1, m.intra);
  ------------------
  |  | 3924|  10.7k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  10.7k|    do { \
  |  |  |  | 3921|  8.63k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  8.63k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 8.63k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 8.63k, False: 2.15k]
  |  |  ------------------
  ------------------
 3993|  2.15k|    update_cdf_2d(5, 1, m.comp);
  ------------------
  |  | 3924|  12.9k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  12.9k|    do { \
  |  |  |  | 3921|  10.7k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  10.7k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 10.7k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 10.7k, False: 2.15k]
  |  |  ------------------
  ------------------
 3994|  2.15k|    update_cdf_2d(5, 1, m.comp_dir);
  ------------------
  |  | 3924|  12.9k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  12.9k|    do { \
  |  |  |  | 3921|  10.7k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  10.7k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 10.7k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 10.7k, False: 2.15k]
  |  |  ------------------
  ------------------
 3995|  2.15k|    update_cdf_2d(6, 1, m.jnt_comp);
  ------------------
  |  | 3924|  15.1k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  15.1k|    do { \
  |  |  |  | 3921|  12.9k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  12.9k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 12.9k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 12.9k, False: 2.15k]
  |  |  ------------------
  ------------------
 3996|  2.15k|    update_cdf_2d(6, 1, m.mask_comp);
  ------------------
  |  | 3924|  15.1k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  15.1k|    do { \
  |  |  |  | 3921|  12.9k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  12.9k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 12.9k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 12.9k, False: 2.15k]
  |  |  ------------------
  ------------------
 3997|  2.15k|    update_cdf_2d(9, 1, m.wedge_comp);
  ------------------
  |  | 3924|  21.5k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  21.5k|    do { \
  |  |  |  | 3921|  19.4k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  19.4k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 19.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 19.4k, False: 2.15k]
  |  |  ------------------
  ------------------
 3998|  2.15k|    update_cdf_3d(6, 3, 1, m.ref);
  ------------------
  |  | 3926|  15.1k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|  51.8k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|  41.0k|    do { \
  |  |  |  |  |  | 3921|  38.8k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|  38.8k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 38.8k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 38.8k, False: 12.9k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 12.9k, False: 2.15k]
  |  |  ------------------
  ------------------
 3999|  2.15k|    update_cdf_3d(3, 3, 1, m.comp_fwd_ref);
  ------------------
  |  | 3926|  8.63k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|  25.9k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|  21.5k|    do { \
  |  |  |  |  |  | 3921|  19.4k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|  19.4k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 19.4k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 19.4k, False: 6.47k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 6.47k, False: 2.15k]
  |  |  ------------------
  ------------------
 4000|  2.15k|    update_cdf_3d(2, 3, 1, m.comp_bwd_ref);
  ------------------
  |  | 3926|  6.47k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|  17.2k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|  15.1k|    do { \
  |  |  |  |  |  | 3921|  12.9k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|  12.9k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 12.9k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 12.9k, False: 4.31k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 4.31k, False: 2.15k]
  |  |  ------------------
  ------------------
 4001|  2.15k|    update_cdf_3d(3, 3, 1, m.comp_uni_ref);
  ------------------
  |  | 3926|  8.63k|    for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
  |  |  ------------------
  |  |  |  | 3924|  25.9k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  |  |  ------------------
  |  |  |  |  |  | 3920|  21.5k|    do { \
  |  |  |  |  |  | 3921|  19.4k|        dst->name[n1d] = 0; \
  |  |  |  |  |  | 3922|  19.4k|    } while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (3922:14): [Folded, False: 19.4k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3924:21): [True: 19.4k, False: 6.47k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3926:21): [True: 6.47k, False: 2.15k]
  |  |  ------------------
  ------------------
 4002|  2.15k|    update_cdf_2d(3, 1, m.seg_pred);
  ------------------
  |  | 3924|  8.63k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  8.63k|    do { \
  |  |  |  | 3921|  6.47k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  6.47k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 6.47k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 6.47k, False: 2.15k]
  |  |  ------------------
  ------------------
 4003|  2.15k|    update_cdf_2d(4, 1, m.interintra);
  ------------------
  |  | 3924|  10.7k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  10.7k|    do { \
  |  |  |  | 3921|  8.63k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  8.63k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 8.63k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 8.63k, False: 2.15k]
  |  |  ------------------
  ------------------
 4004|  2.15k|    update_cdf_2d(7, 1, m.interintra_wedge);
  ------------------
  |  | 3924|  17.2k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  17.2k|    do { \
  |  |  |  | 3921|  15.1k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  15.1k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 15.1k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 15.1k, False: 2.15k]
  |  |  ------------------
  ------------------
 4005|  2.15k|    update_cdf_2d(N_BS_SIZES, 1, m.obmc);
  ------------------
  |  | 3924|  49.6k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  49.6k|    do { \
  |  |  |  | 3921|  47.4k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  47.4k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 47.4k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 47.4k, False: 2.15k]
  |  |  ------------------
  ------------------
 4006|       |
 4007|  6.47k|    for (int k = 0; k < 2; k++) {
  ------------------
  |  Branch (4007:21): [True: 4.31k, False: 2.15k]
  ------------------
 4008|  4.31k|        update_cdf_1d(10, mv.comp[k].classes);
  ------------------
  |  | 3920|  4.31k|    do { \
  |  | 3921|  4.31k|        dst->name[n1d] = 0; \
  |  | 3922|  4.31k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (3922:14): [Folded, False: 4.31k]
  |  |  ------------------
  ------------------
 4009|  4.31k|        update_cdf_1d(1, mv.comp[k].sign);
  ------------------
  |  | 3920|  4.31k|    do { \
  |  | 3921|  4.31k|        dst->name[n1d] = 0; \
  |  | 3922|  4.31k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (3922:14): [Folded, False: 4.31k]
  |  |  ------------------
  ------------------
 4010|  4.31k|        update_cdf_1d(1, mv.comp[k].class0);
  ------------------
  |  | 3920|  4.31k|    do { \
  |  | 3921|  4.31k|        dst->name[n1d] = 0; \
  |  | 3922|  4.31k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (3922:14): [Folded, False: 4.31k]
  |  |  ------------------
  ------------------
 4011|  4.31k|        update_cdf_2d(2, 3, mv.comp[k].class0_fp);
  ------------------
  |  | 3924|  12.9k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  8.63k|    do { \
  |  |  |  | 3921|  8.63k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  8.63k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 8.63k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 8.63k, False: 4.31k]
  |  |  ------------------
  ------------------
 4012|  4.31k|        update_cdf_1d(1, mv.comp[k].class0_hp);
  ------------------
  |  | 3920|  4.31k|    do { \
  |  | 3921|  4.31k|        dst->name[n1d] = 0; \
  |  | 3922|  4.31k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (3922:14): [Folded, False: 4.31k]
  |  |  ------------------
  ------------------
 4013|  4.31k|        update_cdf_2d(10, 1, mv.comp[k].classN);
  ------------------
  |  | 3924|  47.4k|    for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
  |  |  ------------------
  |  |  |  | 3920|  43.1k|    do { \
  |  |  |  | 3921|  43.1k|        dst->name[n1d] = 0; \
  |  |  |  | 3922|  43.1k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (3922:14): [Folded, False: 43.1k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (3924:21): [True: 43.1k, False: 4.31k]
  |  |  ------------------
  ------------------
 4014|  4.31k|        update_cdf_1d(3, mv.comp[k].classN_fp);
  ------------------
  |  | 3920|  4.31k|    do { \
  |  | 3921|  4.31k|        dst->name[n1d] = 0; \
  |  | 3922|  4.31k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (3922:14): [Folded, False: 4.31k]
  |  |  ------------------
  ------------------
 4015|  4.31k|        update_cdf_1d(1, mv.comp[k].classN_hp);
  ------------------
  |  | 3920|  4.31k|    do { \
  |  | 3921|  4.31k|        dst->name[n1d] = 0; \
  |  | 3922|  4.31k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (3922:14): [Folded, False: 4.31k]
  |  |  ------------------
  ------------------
 4016|  4.31k|    }
 4017|  2.15k|    update_cdf_1d(N_MV_JOINTS - 1, mv.joint);
  ------------------
  |  | 3920|  2.15k|    do { \
  |  | 3921|  2.15k|        dst->name[n1d] = 0; \
  |  | 3922|  2.15k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (3922:14): [Folded, False: 2.15k]
  |  |  ------------------
  ------------------
 4018|  2.15k|}
dav1d_cdf_thread_init_static:
 4023|  18.8k|void dav1d_cdf_thread_init_static(CdfThreadContext *const cdf, const unsigned qidx) {
 4024|       |    cdf->ref = NULL;
 4025|  18.8k|    cdf->data.qcat = (qidx > 20) + (qidx > 60) + (qidx > 120);
 4026|  18.8k|}
dav1d_cdf_thread_copy:
 4028|  40.8k|void dav1d_cdf_thread_copy(CdfContext *const dst, const CdfThreadContext *const src) {
 4029|  40.8k|    if (src->ref) {
  ------------------
  |  Branch (4029:9): [True: 4.16k, False: 36.7k]
  ------------------
 4030|  4.16k|        memcpy(dst, src->data.cdf, sizeof(*dst));
 4031|  36.7k|    } else {
 4032|  36.7k|        dst->coef = default_coef_cdf[src->data.qcat];
 4033|  36.7k|        memcpy(&dst->m, &default_cdf.m,
 4034|  36.7k|               offsetof(CdfDefaultContext, mv.joint));
 4035|  36.7k|        memcpy(&dst->mv.comp[1], &default_cdf.mv.comp,
 4036|       |               sizeof(default_cdf) - offsetof(CdfDefaultContext, mv.comp));
 4037|  36.7k|    }
 4038|  40.8k|}
dav1d_cdf_thread_alloc:
 4042|  8.68k|{
 4043|  8.68k|    cdf->ref = dav1d_ref_create_using_pool(c->cdf_pool,
 4044|  8.68k|                                           sizeof(CdfContext) + sizeof(atomic_uint));
 4045|  8.68k|    if (!cdf->ref) return DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  |  Branch (4045:9): [True: 0, False: 8.68k]
  ------------------
 4046|  8.68k|    cdf->data.cdf = cdf->ref->data;
 4047|  8.68k|    if (have_frame_mt) {
  ------------------
  |  Branch (4047:9): [True: 0, False: 8.68k]
  ------------------
 4048|      0|        cdf->progress = (atomic_uint *) &cdf->data.cdf[1];
 4049|       |        atomic_init(cdf->progress, 0);
 4050|      0|    }
 4051|  8.68k|    return 0;
 4052|  8.68k|}
dav1d_cdf_thread_ref:
 4056|   146k|{
 4057|   146k|    *dst = *src;
 4058|   146k|    if (src->ref)
  ------------------
  |  Branch (4058:9): [True: 53.9k, False: 92.4k]
  ------------------
 4059|  53.9k|        dav1d_ref_inc(src->ref);
 4060|   146k|}
dav1d_cdf_thread_unref:
 4062|   558k|void dav1d_cdf_thread_unref(CdfThreadContext *const cdf) {
 4063|       |    memset(&cdf->data, 0, sizeof(*cdf) - offsetof(CdfThreadContext, data));
 4064|   558k|    dav1d_ref_dec(&cdf->ref);
 4065|   558k|}

dav1d_init_cpu:
   63|      1|COLD void dav1d_init_cpu(void) {
   64|      1|#if HAVE_ASM && !__has_feature(memory_sanitizer)
   65|       |// memory sanitizer is inherently incompatible with asm
   66|       |#if ARCH_AARCH64 || ARCH_ARM
   67|       |    dav1d_cpu_flags = dav1d_get_cpu_flags_arm();
   68|       |#elif ARCH_LOONGARCH
   69|       |    dav1d_cpu_flags = dav1d_get_cpu_flags_loongarch();
   70|       |#elif ARCH_PPC64LE
   71|       |    dav1d_cpu_flags = dav1d_get_cpu_flags_ppc();
   72|       |#elif ARCH_RISCV
   73|       |    dav1d_cpu_flags = dav1d_get_cpu_flags_riscv();
   74|       |#elif ARCH_X86
   75|       |    dav1d_cpu_flags = dav1d_get_cpu_flags_x86();
   76|      1|#endif
   77|      1|#endif
   78|      1|}

pal.c:dav1d_get_cpu_flags:
  124|  17.2k|static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
  125|  17.2k|    unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
  126|       |
  127|  17.2k|#if TRIM_DSP_FUNCTIONS
  128|       |/* Since this function is inlined, unconditionally setting a flag here will
  129|       | * enable dead code elimination in the calling function. */
  130|  17.2k|    flags |= dav1d_get_default_cpu_flags();
  131|  17.2k|#endif
  132|       |
  133|  17.2k|    return flags;
  134|  17.2k|}
pal.c:dav1d_get_default_cpu_flags:
   58|  17.2k|static ALWAYS_INLINE unsigned dav1d_get_default_cpu_flags(void) {
   59|  17.2k|    unsigned flags = 0;
   60|       |
   61|       |#if ARCH_AARCH64 || ARCH_ARM
   62|       |#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
   63|       |    flags |= DAV1D_ARM_CPU_FLAG_NEON;
   64|       |#endif
   65|       |#ifdef __ARM_FEATURE_DOTPROD
   66|       |    flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
   67|       |#endif
   68|       |#ifdef __ARM_FEATURE_MATMUL_INT8
   69|       |    flags |= DAV1D_ARM_CPU_FLAG_I8MM;
   70|       |#endif
   71|       |#if ARCH_AARCH64
   72|       |#ifdef __ARM_FEATURE_SVE
   73|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE;
   74|       |#endif
   75|       |#ifdef __ARM_FEATURE_SVE2
   76|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE2;
   77|       |#endif
   78|       |#endif /* ARCH_AARCH64 */
   79|       |#elif ARCH_PPC64LE
   80|       |#if defined(__VSX__)
   81|       |    flags |= DAV1D_PPC_CPU_FLAG_VSX;
   82|       |#endif
   83|       |#if defined(__POWER9_VECTOR__)
   84|       |    flags |= DAV1D_PPC_CPU_FLAG_PWR9;
   85|       |#endif
   86|       |#elif ARCH_RISCV
   87|       |#if defined(__riscv_v)
   88|       |    flags |= DAV1D_RISCV_CPU_FLAG_V;
   89|       |#endif
   90|       |#elif ARCH_X86
   91|       |#if defined(__AVX512F__) && defined(__AVX512CD__) && \
   92|       |    defined(__AVX512BW__) && defined(__AVX512DQ__) && \
   93|       |    defined(__AVX512VL__) && defined(__AVX512VNNI__) && \
   94|       |    defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \
   95|       |    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
   96|       |    defined(__AVX512BITALG__) && defined(__GFNI__) && \
   97|       |    defined(__VAES__) && defined(__VPCLMULQDQ__)
   98|       |    flags |= DAV1D_X86_CPU_FLAG_AVX512ICL |
   99|       |             DAV1D_X86_CPU_FLAG_AVX2 |
  100|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  101|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  102|       |             DAV1D_X86_CPU_FLAG_SSE2;
  103|       |#elif defined(__AVX2__)
  104|       |    flags |= DAV1D_X86_CPU_FLAG_AVX2 |
  105|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  106|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  107|       |             DAV1D_X86_CPU_FLAG_SSE2;
  108|       |#elif defined(__SSE4_1__) || defined(__AVX__)
  109|       |    flags |= DAV1D_X86_CPU_FLAG_SSE41 |
  110|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  111|       |             DAV1D_X86_CPU_FLAG_SSE2;
  112|       |#elif defined(__SSSE3__)
  113|       |    flags |= DAV1D_X86_CPU_FLAG_SSSE3 |
  114|       |             DAV1D_X86_CPU_FLAG_SSE2;
  115|       |#elif ARCH_X86_64 || defined(__SSE2__) || \
  116|       |      (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
  117|       |    flags |= DAV1D_X86_CPU_FLAG_SSE2;
  118|  17.2k|#endif
  119|  17.2k|#endif
  120|       |
  121|  17.2k|    return flags;
  122|  17.2k|}
refmvs.c:dav1d_get_cpu_flags:
  124|  17.2k|static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
  125|  17.2k|    unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
  126|       |
  127|  17.2k|#if TRIM_DSP_FUNCTIONS
  128|       |/* Since this function is inlined, unconditionally setting a flag here will
  129|       | * enable dead code elimination in the calling function. */
  130|  17.2k|    flags |= dav1d_get_default_cpu_flags();
  131|  17.2k|#endif
  132|       |
  133|  17.2k|    return flags;
  134|  17.2k|}
refmvs.c:dav1d_get_default_cpu_flags:
   58|  17.2k|static ALWAYS_INLINE unsigned dav1d_get_default_cpu_flags(void) {
   59|  17.2k|    unsigned flags = 0;
   60|       |
   61|       |#if ARCH_AARCH64 || ARCH_ARM
   62|       |#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
   63|       |    flags |= DAV1D_ARM_CPU_FLAG_NEON;
   64|       |#endif
   65|       |#ifdef __ARM_FEATURE_DOTPROD
   66|       |    flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
   67|       |#endif
   68|       |#ifdef __ARM_FEATURE_MATMUL_INT8
   69|       |    flags |= DAV1D_ARM_CPU_FLAG_I8MM;
   70|       |#endif
   71|       |#if ARCH_AARCH64
   72|       |#ifdef __ARM_FEATURE_SVE
   73|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE;
   74|       |#endif
   75|       |#ifdef __ARM_FEATURE_SVE2
   76|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE2;
   77|       |#endif
   78|       |#endif /* ARCH_AARCH64 */
   79|       |#elif ARCH_PPC64LE
   80|       |#if defined(__VSX__)
   81|       |    flags |= DAV1D_PPC_CPU_FLAG_VSX;
   82|       |#endif
   83|       |#if defined(__POWER9_VECTOR__)
   84|       |    flags |= DAV1D_PPC_CPU_FLAG_PWR9;
   85|       |#endif
   86|       |#elif ARCH_RISCV
   87|       |#if defined(__riscv_v)
   88|       |    flags |= DAV1D_RISCV_CPU_FLAG_V;
   89|       |#endif
   90|       |#elif ARCH_X86
   91|       |#if defined(__AVX512F__) && defined(__AVX512CD__) && \
   92|       |    defined(__AVX512BW__) && defined(__AVX512DQ__) && \
   93|       |    defined(__AVX512VL__) && defined(__AVX512VNNI__) && \
   94|       |    defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \
   95|       |    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
   96|       |    defined(__AVX512BITALG__) && defined(__GFNI__) && \
   97|       |    defined(__VAES__) && defined(__VPCLMULQDQ__)
   98|       |    flags |= DAV1D_X86_CPU_FLAG_AVX512ICL |
   99|       |             DAV1D_X86_CPU_FLAG_AVX2 |
  100|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  101|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  102|       |             DAV1D_X86_CPU_FLAG_SSE2;
  103|       |#elif defined(__AVX2__)
  104|       |    flags |= DAV1D_X86_CPU_FLAG_AVX2 |
  105|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  106|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  107|       |             DAV1D_X86_CPU_FLAG_SSE2;
  108|       |#elif defined(__SSE4_1__) || defined(__AVX__)
  109|       |    flags |= DAV1D_X86_CPU_FLAG_SSE41 |
  110|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  111|       |             DAV1D_X86_CPU_FLAG_SSE2;
  112|       |#elif defined(__SSSE3__)
  113|       |    flags |= DAV1D_X86_CPU_FLAG_SSSE3 |
  114|       |             DAV1D_X86_CPU_FLAG_SSE2;
  115|       |#elif ARCH_X86_64 || defined(__SSE2__) || \
  116|       |      (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
  117|       |    flags |= DAV1D_X86_CPU_FLAG_SSE2;
  118|  17.2k|#endif
  119|  17.2k|#endif
  120|       |
  121|  17.2k|    return flags;
  122|  17.2k|}
cpu.c:dav1d_get_default_cpu_flags:
   58|      1|static ALWAYS_INLINE unsigned dav1d_get_default_cpu_flags(void) {
   59|      1|    unsigned flags = 0;
   60|       |
   61|       |#if ARCH_AARCH64 || ARCH_ARM
   62|       |#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
   63|       |    flags |= DAV1D_ARM_CPU_FLAG_NEON;
   64|       |#endif
   65|       |#ifdef __ARM_FEATURE_DOTPROD
   66|       |    flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
   67|       |#endif
   68|       |#ifdef __ARM_FEATURE_MATMUL_INT8
   69|       |    flags |= DAV1D_ARM_CPU_FLAG_I8MM;
   70|       |#endif
   71|       |#if ARCH_AARCH64
   72|       |#ifdef __ARM_FEATURE_SVE
   73|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE;
   74|       |#endif
   75|       |#ifdef __ARM_FEATURE_SVE2
   76|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE2;
   77|       |#endif
   78|       |#endif /* ARCH_AARCH64 */
   79|       |#elif ARCH_PPC64LE
   80|       |#if defined(__VSX__)
   81|       |    flags |= DAV1D_PPC_CPU_FLAG_VSX;
   82|       |#endif
   83|       |#if defined(__POWER9_VECTOR__)
   84|       |    flags |= DAV1D_PPC_CPU_FLAG_PWR9;
   85|       |#endif
   86|       |#elif ARCH_RISCV
   87|       |#if defined(__riscv_v)
   88|       |    flags |= DAV1D_RISCV_CPU_FLAG_V;
   89|       |#endif
   90|       |#elif ARCH_X86
   91|       |#if defined(__AVX512F__) && defined(__AVX512CD__) && \
   92|       |    defined(__AVX512BW__) && defined(__AVX512DQ__) && \
   93|       |    defined(__AVX512VL__) && defined(__AVX512VNNI__) && \
   94|       |    defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \
   95|       |    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
   96|       |    defined(__AVX512BITALG__) && defined(__GFNI__) && \
   97|       |    defined(__VAES__) && defined(__VPCLMULQDQ__)
   98|       |    flags |= DAV1D_X86_CPU_FLAG_AVX512ICL |
   99|       |             DAV1D_X86_CPU_FLAG_AVX2 |
  100|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  101|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  102|       |             DAV1D_X86_CPU_FLAG_SSE2;
  103|       |#elif defined(__AVX2__)
  104|       |    flags |= DAV1D_X86_CPU_FLAG_AVX2 |
  105|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  106|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  107|       |             DAV1D_X86_CPU_FLAG_SSE2;
  108|       |#elif defined(__SSE4_1__) || defined(__AVX__)
  109|       |    flags |= DAV1D_X86_CPU_FLAG_SSE41 |
  110|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  111|       |             DAV1D_X86_CPU_FLAG_SSE2;
  112|       |#elif defined(__SSSE3__)
  113|       |    flags |= DAV1D_X86_CPU_FLAG_SSSE3 |
  114|       |             DAV1D_X86_CPU_FLAG_SSE2;
  115|       |#elif ARCH_X86_64 || defined(__SSE2__) || \
  116|       |      (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
  117|       |    flags |= DAV1D_X86_CPU_FLAG_SSE2;
  118|      1|#endif
  119|      1|#endif
  120|       |
  121|      1|    return flags;
  122|      1|}
msac.c:dav1d_get_cpu_flags:
  124|  32.2k|static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
  125|  32.2k|    unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
  126|       |
  127|  32.2k|#if TRIM_DSP_FUNCTIONS
  128|       |/* Since this function is inlined, unconditionally setting a flag here will
  129|       | * enable dead code elimination in the calling function. */
  130|  32.2k|    flags |= dav1d_get_default_cpu_flags();
  131|  32.2k|#endif
  132|       |
  133|  32.2k|    return flags;
  134|  32.2k|}
msac.c:dav1d_get_default_cpu_flags:
   58|  32.2k|static ALWAYS_INLINE unsigned dav1d_get_default_cpu_flags(void) {
   59|  32.2k|    unsigned flags = 0;
   60|       |
   61|       |#if ARCH_AARCH64 || ARCH_ARM
   62|       |#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
   63|       |    flags |= DAV1D_ARM_CPU_FLAG_NEON;
   64|       |#endif
   65|       |#ifdef __ARM_FEATURE_DOTPROD
   66|       |    flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
   67|       |#endif
   68|       |#ifdef __ARM_FEATURE_MATMUL_INT8
   69|       |    flags |= DAV1D_ARM_CPU_FLAG_I8MM;
   70|       |#endif
   71|       |#if ARCH_AARCH64
   72|       |#ifdef __ARM_FEATURE_SVE
   73|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE;
   74|       |#endif
   75|       |#ifdef __ARM_FEATURE_SVE2
   76|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE2;
   77|       |#endif
   78|       |#endif /* ARCH_AARCH64 */
   79|       |#elif ARCH_PPC64LE
   80|       |#if defined(__VSX__)
   81|       |    flags |= DAV1D_PPC_CPU_FLAG_VSX;
   82|       |#endif
   83|       |#if defined(__POWER9_VECTOR__)
   84|       |    flags |= DAV1D_PPC_CPU_FLAG_PWR9;
   85|       |#endif
   86|       |#elif ARCH_RISCV
   87|       |#if defined(__riscv_v)
   88|       |    flags |= DAV1D_RISCV_CPU_FLAG_V;
   89|       |#endif
   90|       |#elif ARCH_X86
   91|       |#if defined(__AVX512F__) && defined(__AVX512CD__) && \
   92|       |    defined(__AVX512BW__) && defined(__AVX512DQ__) && \
   93|       |    defined(__AVX512VL__) && defined(__AVX512VNNI__) && \
   94|       |    defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \
   95|       |    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
   96|       |    defined(__AVX512BITALG__) && defined(__GFNI__) && \
   97|       |    defined(__VAES__) && defined(__VPCLMULQDQ__)
   98|       |    flags |= DAV1D_X86_CPU_FLAG_AVX512ICL |
   99|       |             DAV1D_X86_CPU_FLAG_AVX2 |
  100|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  101|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  102|       |             DAV1D_X86_CPU_FLAG_SSE2;
  103|       |#elif defined(__AVX2__)
  104|       |    flags |= DAV1D_X86_CPU_FLAG_AVX2 |
  105|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  106|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  107|       |             DAV1D_X86_CPU_FLAG_SSE2;
  108|       |#elif defined(__SSE4_1__) || defined(__AVX__)
  109|       |    flags |= DAV1D_X86_CPU_FLAG_SSE41 |
  110|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  111|       |             DAV1D_X86_CPU_FLAG_SSE2;
  112|       |#elif defined(__SSSE3__)
  113|       |    flags |= DAV1D_X86_CPU_FLAG_SSSE3 |
  114|       |             DAV1D_X86_CPU_FLAG_SSE2;
  115|       |#elif ARCH_X86_64 || defined(__SSE2__) || \
  116|       |      (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
  117|       |    flags |= DAV1D_X86_CPU_FLAG_SSE2;
  118|  32.2k|#endif
  119|  32.2k|#endif
  120|       |
  121|  32.2k|    return flags;
  122|  32.2k|}
cdef_tmpl.c:dav1d_get_cpu_flags:
  124|  15.4k|static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
  125|  15.4k|    unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
  126|       |
  127|  15.4k|#if TRIM_DSP_FUNCTIONS
  128|       |/* Since this function is inlined, unconditionally setting a flag here will
  129|       | * enable dead code elimination in the calling function. */
  130|  15.4k|    flags |= dav1d_get_default_cpu_flags();
  131|  15.4k|#endif
  132|       |
  133|  15.4k|    return flags;
  134|  15.4k|}
cdef_tmpl.c:dav1d_get_default_cpu_flags:
   58|  15.4k|static ALWAYS_INLINE unsigned dav1d_get_default_cpu_flags(void) {
   59|  15.4k|    unsigned flags = 0;
   60|       |
   61|       |#if ARCH_AARCH64 || ARCH_ARM
   62|       |#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
   63|       |    flags |= DAV1D_ARM_CPU_FLAG_NEON;
   64|       |#endif
   65|       |#ifdef __ARM_FEATURE_DOTPROD
   66|       |    flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
   67|       |#endif
   68|       |#ifdef __ARM_FEATURE_MATMUL_INT8
   69|       |    flags |= DAV1D_ARM_CPU_FLAG_I8MM;
   70|       |#endif
   71|       |#if ARCH_AARCH64
   72|       |#ifdef __ARM_FEATURE_SVE
   73|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE;
   74|       |#endif
   75|       |#ifdef __ARM_FEATURE_SVE2
   76|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE2;
   77|       |#endif
   78|       |#endif /* ARCH_AARCH64 */
   79|       |#elif ARCH_PPC64LE
   80|       |#if defined(__VSX__)
   81|       |    flags |= DAV1D_PPC_CPU_FLAG_VSX;
   82|       |#endif
   83|       |#if defined(__POWER9_VECTOR__)
   84|       |    flags |= DAV1D_PPC_CPU_FLAG_PWR9;
   85|       |#endif
   86|       |#elif ARCH_RISCV
   87|       |#if defined(__riscv_v)
   88|       |    flags |= DAV1D_RISCV_CPU_FLAG_V;
   89|       |#endif
   90|       |#elif ARCH_X86
   91|       |#if defined(__AVX512F__) && defined(__AVX512CD__) && \
   92|       |    defined(__AVX512BW__) && defined(__AVX512DQ__) && \
   93|       |    defined(__AVX512VL__) && defined(__AVX512VNNI__) && \
   94|       |    defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \
   95|       |    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
   96|       |    defined(__AVX512BITALG__) && defined(__GFNI__) && \
   97|       |    defined(__VAES__) && defined(__VPCLMULQDQ__)
   98|       |    flags |= DAV1D_X86_CPU_FLAG_AVX512ICL |
   99|       |             DAV1D_X86_CPU_FLAG_AVX2 |
  100|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  101|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  102|       |             DAV1D_X86_CPU_FLAG_SSE2;
  103|       |#elif defined(__AVX2__)
  104|       |    flags |= DAV1D_X86_CPU_FLAG_AVX2 |
  105|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  106|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  107|       |             DAV1D_X86_CPU_FLAG_SSE2;
  108|       |#elif defined(__SSE4_1__) || defined(__AVX__)
  109|       |    flags |= DAV1D_X86_CPU_FLAG_SSE41 |
  110|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  111|       |             DAV1D_X86_CPU_FLAG_SSE2;
  112|       |#elif defined(__SSSE3__)
  113|       |    flags |= DAV1D_X86_CPU_FLAG_SSSE3 |
  114|       |             DAV1D_X86_CPU_FLAG_SSE2;
  115|       |#elif ARCH_X86_64 || defined(__SSE2__) || \
  116|       |      (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
  117|       |    flags |= DAV1D_X86_CPU_FLAG_SSE2;
  118|  15.4k|#endif
  119|  15.4k|#endif
  120|       |
  121|  15.4k|    return flags;
  122|  15.4k|}
filmgrain_tmpl.c:dav1d_get_cpu_flags:
  124|  15.4k|static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
  125|  15.4k|    unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
  126|       |
  127|  15.4k|#if TRIM_DSP_FUNCTIONS
  128|       |/* Since this function is inlined, unconditionally setting a flag here will
  129|       | * enable dead code elimination in the calling function. */
  130|  15.4k|    flags |= dav1d_get_default_cpu_flags();
  131|  15.4k|#endif
  132|       |
  133|  15.4k|    return flags;
  134|  15.4k|}
filmgrain_tmpl.c:dav1d_get_default_cpu_flags:
   58|  15.4k|static ALWAYS_INLINE unsigned dav1d_get_default_cpu_flags(void) {
   59|  15.4k|    unsigned flags = 0;
   60|       |
   61|       |#if ARCH_AARCH64 || ARCH_ARM
   62|       |#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
   63|       |    flags |= DAV1D_ARM_CPU_FLAG_NEON;
   64|       |#endif
   65|       |#ifdef __ARM_FEATURE_DOTPROD
   66|       |    flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
   67|       |#endif
   68|       |#ifdef __ARM_FEATURE_MATMUL_INT8
   69|       |    flags |= DAV1D_ARM_CPU_FLAG_I8MM;
   70|       |#endif
   71|       |#if ARCH_AARCH64
   72|       |#ifdef __ARM_FEATURE_SVE
   73|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE;
   74|       |#endif
   75|       |#ifdef __ARM_FEATURE_SVE2
   76|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE2;
   77|       |#endif
   78|       |#endif /* ARCH_AARCH64 */
   79|       |#elif ARCH_PPC64LE
   80|       |#if defined(__VSX__)
   81|       |    flags |= DAV1D_PPC_CPU_FLAG_VSX;
   82|       |#endif
   83|       |#if defined(__POWER9_VECTOR__)
   84|       |    flags |= DAV1D_PPC_CPU_FLAG_PWR9;
   85|       |#endif
   86|       |#elif ARCH_RISCV
   87|       |#if defined(__riscv_v)
   88|       |    flags |= DAV1D_RISCV_CPU_FLAG_V;
   89|       |#endif
   90|       |#elif ARCH_X86
   91|       |#if defined(__AVX512F__) && defined(__AVX512CD__) && \
   92|       |    defined(__AVX512BW__) && defined(__AVX512DQ__) && \
   93|       |    defined(__AVX512VL__) && defined(__AVX512VNNI__) && \
   94|       |    defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \
   95|       |    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
   96|       |    defined(__AVX512BITALG__) && defined(__GFNI__) && \
   97|       |    defined(__VAES__) && defined(__VPCLMULQDQ__)
   98|       |    flags |= DAV1D_X86_CPU_FLAG_AVX512ICL |
   99|       |             DAV1D_X86_CPU_FLAG_AVX2 |
  100|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  101|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  102|       |             DAV1D_X86_CPU_FLAG_SSE2;
  103|       |#elif defined(__AVX2__)
  104|       |    flags |= DAV1D_X86_CPU_FLAG_AVX2 |
  105|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  106|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  107|       |             DAV1D_X86_CPU_FLAG_SSE2;
  108|       |#elif defined(__SSE4_1__) || defined(__AVX__)
  109|       |    flags |= DAV1D_X86_CPU_FLAG_SSE41 |
  110|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  111|       |             DAV1D_X86_CPU_FLAG_SSE2;
  112|       |#elif defined(__SSSE3__)
  113|       |    flags |= DAV1D_X86_CPU_FLAG_SSSE3 |
  114|       |             DAV1D_X86_CPU_FLAG_SSE2;
  115|       |#elif ARCH_X86_64 || defined(__SSE2__) || \
  116|       |      (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
  117|       |    flags |= DAV1D_X86_CPU_FLAG_SSE2;
  118|  15.4k|#endif
  119|  15.4k|#endif
  120|       |
  121|  15.4k|    return flags;
  122|  15.4k|}
ipred_tmpl.c:dav1d_get_cpu_flags:
  124|  15.4k|static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
  125|  15.4k|    unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
  126|       |
  127|  15.4k|#if TRIM_DSP_FUNCTIONS
  128|       |/* Since this function is inlined, unconditionally setting a flag here will
  129|       | * enable dead code elimination in the calling function. */
  130|  15.4k|    flags |= dav1d_get_default_cpu_flags();
  131|  15.4k|#endif
  132|       |
  133|  15.4k|    return flags;
  134|  15.4k|}
ipred_tmpl.c:dav1d_get_default_cpu_flags:
   58|  15.4k|static ALWAYS_INLINE unsigned dav1d_get_default_cpu_flags(void) {
   59|  15.4k|    unsigned flags = 0;
   60|       |
   61|       |#if ARCH_AARCH64 || ARCH_ARM
   62|       |#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
   63|       |    flags |= DAV1D_ARM_CPU_FLAG_NEON;
   64|       |#endif
   65|       |#ifdef __ARM_FEATURE_DOTPROD
   66|       |    flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
   67|       |#endif
   68|       |#ifdef __ARM_FEATURE_MATMUL_INT8
   69|       |    flags |= DAV1D_ARM_CPU_FLAG_I8MM;
   70|       |#endif
   71|       |#if ARCH_AARCH64
   72|       |#ifdef __ARM_FEATURE_SVE
   73|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE;
   74|       |#endif
   75|       |#ifdef __ARM_FEATURE_SVE2
   76|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE2;
   77|       |#endif
   78|       |#endif /* ARCH_AARCH64 */
   79|       |#elif ARCH_PPC64LE
   80|       |#if defined(__VSX__)
   81|       |    flags |= DAV1D_PPC_CPU_FLAG_VSX;
   82|       |#endif
   83|       |#if defined(__POWER9_VECTOR__)
   84|       |    flags |= DAV1D_PPC_CPU_FLAG_PWR9;
   85|       |#endif
   86|       |#elif ARCH_RISCV
   87|       |#if defined(__riscv_v)
   88|       |    flags |= DAV1D_RISCV_CPU_FLAG_V;
   89|       |#endif
   90|       |#elif ARCH_X86
   91|       |#if defined(__AVX512F__) && defined(__AVX512CD__) && \
   92|       |    defined(__AVX512BW__) && defined(__AVX512DQ__) && \
   93|       |    defined(__AVX512VL__) && defined(__AVX512VNNI__) && \
   94|       |    defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \
   95|       |    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
   96|       |    defined(__AVX512BITALG__) && defined(__GFNI__) && \
   97|       |    defined(__VAES__) && defined(__VPCLMULQDQ__)
   98|       |    flags |= DAV1D_X86_CPU_FLAG_AVX512ICL |
   99|       |             DAV1D_X86_CPU_FLAG_AVX2 |
  100|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  101|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  102|       |             DAV1D_X86_CPU_FLAG_SSE2;
  103|       |#elif defined(__AVX2__)
  104|       |    flags |= DAV1D_X86_CPU_FLAG_AVX2 |
  105|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  106|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  107|       |             DAV1D_X86_CPU_FLAG_SSE2;
  108|       |#elif defined(__SSE4_1__) || defined(__AVX__)
  109|       |    flags |= DAV1D_X86_CPU_FLAG_SSE41 |
  110|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  111|       |             DAV1D_X86_CPU_FLAG_SSE2;
  112|       |#elif defined(__SSSE3__)
  113|       |    flags |= DAV1D_X86_CPU_FLAG_SSSE3 |
  114|       |             DAV1D_X86_CPU_FLAG_SSE2;
  115|       |#elif ARCH_X86_64 || defined(__SSE2__) || \
  116|       |      (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
  117|       |    flags |= DAV1D_X86_CPU_FLAG_SSE2;
  118|  15.4k|#endif
  119|  15.4k|#endif
  120|       |
  121|  15.4k|    return flags;
  122|  15.4k|}
itx_tmpl.c:dav1d_get_cpu_flags:
  124|  15.4k|static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
  125|  15.4k|    unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
  126|       |
  127|  15.4k|#if TRIM_DSP_FUNCTIONS
  128|       |/* Since this function is inlined, unconditionally setting a flag here will
  129|       | * enable dead code elimination in the calling function. */
  130|  15.4k|    flags |= dav1d_get_default_cpu_flags();
  131|  15.4k|#endif
  132|       |
  133|  15.4k|    return flags;
  134|  15.4k|}
itx_tmpl.c:dav1d_get_default_cpu_flags:
   58|  15.4k|static ALWAYS_INLINE unsigned dav1d_get_default_cpu_flags(void) {
   59|  15.4k|    unsigned flags = 0;
   60|       |
   61|       |#if ARCH_AARCH64 || ARCH_ARM
   62|       |#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
   63|       |    flags |= DAV1D_ARM_CPU_FLAG_NEON;
   64|       |#endif
   65|       |#ifdef __ARM_FEATURE_DOTPROD
   66|       |    flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
   67|       |#endif
   68|       |#ifdef __ARM_FEATURE_MATMUL_INT8
   69|       |    flags |= DAV1D_ARM_CPU_FLAG_I8MM;
   70|       |#endif
   71|       |#if ARCH_AARCH64
   72|       |#ifdef __ARM_FEATURE_SVE
   73|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE;
   74|       |#endif
   75|       |#ifdef __ARM_FEATURE_SVE2
   76|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE2;
   77|       |#endif
   78|       |#endif /* ARCH_AARCH64 */
   79|       |#elif ARCH_PPC64LE
   80|       |#if defined(__VSX__)
   81|       |    flags |= DAV1D_PPC_CPU_FLAG_VSX;
   82|       |#endif
   83|       |#if defined(__POWER9_VECTOR__)
   84|       |    flags |= DAV1D_PPC_CPU_FLAG_PWR9;
   85|       |#endif
   86|       |#elif ARCH_RISCV
   87|       |#if defined(__riscv_v)
   88|       |    flags |= DAV1D_RISCV_CPU_FLAG_V;
   89|       |#endif
   90|       |#elif ARCH_X86
   91|       |#if defined(__AVX512F__) && defined(__AVX512CD__) && \
   92|       |    defined(__AVX512BW__) && defined(__AVX512DQ__) && \
   93|       |    defined(__AVX512VL__) && defined(__AVX512VNNI__) && \
   94|       |    defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \
   95|       |    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
   96|       |    defined(__AVX512BITALG__) && defined(__GFNI__) && \
   97|       |    defined(__VAES__) && defined(__VPCLMULQDQ__)
   98|       |    flags |= DAV1D_X86_CPU_FLAG_AVX512ICL |
   99|       |             DAV1D_X86_CPU_FLAG_AVX2 |
  100|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  101|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  102|       |             DAV1D_X86_CPU_FLAG_SSE2;
  103|       |#elif defined(__AVX2__)
  104|       |    flags |= DAV1D_X86_CPU_FLAG_AVX2 |
  105|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  106|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  107|       |             DAV1D_X86_CPU_FLAG_SSE2;
  108|       |#elif defined(__SSE4_1__) || defined(__AVX__)
  109|       |    flags |= DAV1D_X86_CPU_FLAG_SSE41 |
  110|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  111|       |             DAV1D_X86_CPU_FLAG_SSE2;
  112|       |#elif defined(__SSSE3__)
  113|       |    flags |= DAV1D_X86_CPU_FLAG_SSSE3 |
  114|       |             DAV1D_X86_CPU_FLAG_SSE2;
  115|       |#elif ARCH_X86_64 || defined(__SSE2__) || \
  116|       |      (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
  117|       |    flags |= DAV1D_X86_CPU_FLAG_SSE2;
  118|  15.4k|#endif
  119|  15.4k|#endif
  120|       |
  121|  15.4k|    return flags;
  122|  15.4k|}
loopfilter_tmpl.c:dav1d_get_cpu_flags:
  124|  15.4k|static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
  125|  15.4k|    unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
  126|       |
  127|  15.4k|#if TRIM_DSP_FUNCTIONS
  128|       |/* Since this function is inlined, unconditionally setting a flag here will
  129|       | * enable dead code elimination in the calling function. */
  130|  15.4k|    flags |= dav1d_get_default_cpu_flags();
  131|  15.4k|#endif
  132|       |
  133|  15.4k|    return flags;
  134|  15.4k|}
loopfilter_tmpl.c:dav1d_get_default_cpu_flags:
   58|  15.4k|static ALWAYS_INLINE unsigned dav1d_get_default_cpu_flags(void) {
   59|  15.4k|    unsigned flags = 0;
   60|       |
   61|       |#if ARCH_AARCH64 || ARCH_ARM
   62|       |#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
   63|       |    flags |= DAV1D_ARM_CPU_FLAG_NEON;
   64|       |#endif
   65|       |#ifdef __ARM_FEATURE_DOTPROD
   66|       |    flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
   67|       |#endif
   68|       |#ifdef __ARM_FEATURE_MATMUL_INT8
   69|       |    flags |= DAV1D_ARM_CPU_FLAG_I8MM;
   70|       |#endif
   71|       |#if ARCH_AARCH64
   72|       |#ifdef __ARM_FEATURE_SVE
   73|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE;
   74|       |#endif
   75|       |#ifdef __ARM_FEATURE_SVE2
   76|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE2;
   77|       |#endif
   78|       |#endif /* ARCH_AARCH64 */
   79|       |#elif ARCH_PPC64LE
   80|       |#if defined(__VSX__)
   81|       |    flags |= DAV1D_PPC_CPU_FLAG_VSX;
   82|       |#endif
   83|       |#if defined(__POWER9_VECTOR__)
   84|       |    flags |= DAV1D_PPC_CPU_FLAG_PWR9;
   85|       |#endif
   86|       |#elif ARCH_RISCV
   87|       |#if defined(__riscv_v)
   88|       |    flags |= DAV1D_RISCV_CPU_FLAG_V;
   89|       |#endif
   90|       |#elif ARCH_X86
   91|       |#if defined(__AVX512F__) && defined(__AVX512CD__) && \
   92|       |    defined(__AVX512BW__) && defined(__AVX512DQ__) && \
   93|       |    defined(__AVX512VL__) && defined(__AVX512VNNI__) && \
   94|       |    defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \
   95|       |    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
   96|       |    defined(__AVX512BITALG__) && defined(__GFNI__) && \
   97|       |    defined(__VAES__) && defined(__VPCLMULQDQ__)
   98|       |    flags |= DAV1D_X86_CPU_FLAG_AVX512ICL |
   99|       |             DAV1D_X86_CPU_FLAG_AVX2 |
  100|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  101|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  102|       |             DAV1D_X86_CPU_FLAG_SSE2;
  103|       |#elif defined(__AVX2__)
  104|       |    flags |= DAV1D_X86_CPU_FLAG_AVX2 |
  105|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  106|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  107|       |             DAV1D_X86_CPU_FLAG_SSE2;
  108|       |#elif defined(__SSE4_1__) || defined(__AVX__)
  109|       |    flags |= DAV1D_X86_CPU_FLAG_SSE41 |
  110|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  111|       |             DAV1D_X86_CPU_FLAG_SSE2;
  112|       |#elif defined(__SSSE3__)
  113|       |    flags |= DAV1D_X86_CPU_FLAG_SSSE3 |
  114|       |             DAV1D_X86_CPU_FLAG_SSE2;
  115|       |#elif ARCH_X86_64 || defined(__SSE2__) || \
  116|       |      (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
  117|       |    flags |= DAV1D_X86_CPU_FLAG_SSE2;
  118|  15.4k|#endif
  119|  15.4k|#endif
  120|       |
  121|  15.4k|    return flags;
  122|  15.4k|}
looprestoration_tmpl.c:dav1d_get_cpu_flags:
  124|  15.4k|static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
  125|  15.4k|    unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
  126|       |
  127|  15.4k|#if TRIM_DSP_FUNCTIONS
  128|       |/* Since this function is inlined, unconditionally setting a flag here will
  129|       | * enable dead code elimination in the calling function. */
  130|  15.4k|    flags |= dav1d_get_default_cpu_flags();
  131|  15.4k|#endif
  132|       |
  133|  15.4k|    return flags;
  134|  15.4k|}
looprestoration_tmpl.c:dav1d_get_default_cpu_flags:
   58|  15.4k|static ALWAYS_INLINE unsigned dav1d_get_default_cpu_flags(void) {
   59|  15.4k|    unsigned flags = 0;
   60|       |
   61|       |#if ARCH_AARCH64 || ARCH_ARM
   62|       |#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
   63|       |    flags |= DAV1D_ARM_CPU_FLAG_NEON;
   64|       |#endif
   65|       |#ifdef __ARM_FEATURE_DOTPROD
   66|       |    flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
   67|       |#endif
   68|       |#ifdef __ARM_FEATURE_MATMUL_INT8
   69|       |    flags |= DAV1D_ARM_CPU_FLAG_I8MM;
   70|       |#endif
   71|       |#if ARCH_AARCH64
   72|       |#ifdef __ARM_FEATURE_SVE
   73|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE;
   74|       |#endif
   75|       |#ifdef __ARM_FEATURE_SVE2
   76|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE2;
   77|       |#endif
   78|       |#endif /* ARCH_AARCH64 */
   79|       |#elif ARCH_PPC64LE
   80|       |#if defined(__VSX__)
   81|       |    flags |= DAV1D_PPC_CPU_FLAG_VSX;
   82|       |#endif
   83|       |#if defined(__POWER9_VECTOR__)
   84|       |    flags |= DAV1D_PPC_CPU_FLAG_PWR9;
   85|       |#endif
   86|       |#elif ARCH_RISCV
   87|       |#if defined(__riscv_v)
   88|       |    flags |= DAV1D_RISCV_CPU_FLAG_V;
   89|       |#endif
   90|       |#elif ARCH_X86
   91|       |#if defined(__AVX512F__) && defined(__AVX512CD__) && \
   92|       |    defined(__AVX512BW__) && defined(__AVX512DQ__) && \
   93|       |    defined(__AVX512VL__) && defined(__AVX512VNNI__) && \
   94|       |    defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \
   95|       |    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
   96|       |    defined(__AVX512BITALG__) && defined(__GFNI__) && \
   97|       |    defined(__VAES__) && defined(__VPCLMULQDQ__)
   98|       |    flags |= DAV1D_X86_CPU_FLAG_AVX512ICL |
   99|       |             DAV1D_X86_CPU_FLAG_AVX2 |
  100|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  101|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  102|       |             DAV1D_X86_CPU_FLAG_SSE2;
  103|       |#elif defined(__AVX2__)
  104|       |    flags |= DAV1D_X86_CPU_FLAG_AVX2 |
  105|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  106|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  107|       |             DAV1D_X86_CPU_FLAG_SSE2;
  108|       |#elif defined(__SSE4_1__) || defined(__AVX__)
  109|       |    flags |= DAV1D_X86_CPU_FLAG_SSE41 |
  110|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  111|       |             DAV1D_X86_CPU_FLAG_SSE2;
  112|       |#elif defined(__SSSE3__)
  113|       |    flags |= DAV1D_X86_CPU_FLAG_SSSE3 |
  114|       |             DAV1D_X86_CPU_FLAG_SSE2;
  115|       |#elif ARCH_X86_64 || defined(__SSE2__) || \
  116|       |      (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
  117|       |    flags |= DAV1D_X86_CPU_FLAG_SSE2;
  118|  15.4k|#endif
  119|  15.4k|#endif
  120|       |
  121|  15.4k|    return flags;
  122|  15.4k|}
mc_tmpl.c:dav1d_get_cpu_flags:
  124|  15.4k|static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
  125|  15.4k|    unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
  126|       |
  127|  15.4k|#if TRIM_DSP_FUNCTIONS
  128|       |/* Since this function is inlined, unconditionally setting a flag here will
  129|       | * enable dead code elimination in the calling function. */
  130|  15.4k|    flags |= dav1d_get_default_cpu_flags();
  131|  15.4k|#endif
  132|       |
  133|  15.4k|    return flags;
  134|  15.4k|}
mc_tmpl.c:dav1d_get_default_cpu_flags:
   58|  15.4k|static ALWAYS_INLINE unsigned dav1d_get_default_cpu_flags(void) {
   59|  15.4k|    unsigned flags = 0;
   60|       |
   61|       |#if ARCH_AARCH64 || ARCH_ARM
   62|       |#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
   63|       |    flags |= DAV1D_ARM_CPU_FLAG_NEON;
   64|       |#endif
   65|       |#ifdef __ARM_FEATURE_DOTPROD
   66|       |    flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
   67|       |#endif
   68|       |#ifdef __ARM_FEATURE_MATMUL_INT8
   69|       |    flags |= DAV1D_ARM_CPU_FLAG_I8MM;
   70|       |#endif
   71|       |#if ARCH_AARCH64
   72|       |#ifdef __ARM_FEATURE_SVE
   73|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE;
   74|       |#endif
   75|       |#ifdef __ARM_FEATURE_SVE2
   76|       |    flags |= DAV1D_ARM_CPU_FLAG_SVE2;
   77|       |#endif
   78|       |#endif /* ARCH_AARCH64 */
   79|       |#elif ARCH_PPC64LE
   80|       |#if defined(__VSX__)
   81|       |    flags |= DAV1D_PPC_CPU_FLAG_VSX;
   82|       |#endif
   83|       |#if defined(__POWER9_VECTOR__)
   84|       |    flags |= DAV1D_PPC_CPU_FLAG_PWR9;
   85|       |#endif
   86|       |#elif ARCH_RISCV
   87|       |#if defined(__riscv_v)
   88|       |    flags |= DAV1D_RISCV_CPU_FLAG_V;
   89|       |#endif
   90|       |#elif ARCH_X86
   91|       |#if defined(__AVX512F__) && defined(__AVX512CD__) && \
   92|       |    defined(__AVX512BW__) && defined(__AVX512DQ__) && \
   93|       |    defined(__AVX512VL__) && defined(__AVX512VNNI__) && \
   94|       |    defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \
   95|       |    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
   96|       |    defined(__AVX512BITALG__) && defined(__GFNI__) && \
   97|       |    defined(__VAES__) && defined(__VPCLMULQDQ__)
   98|       |    flags |= DAV1D_X86_CPU_FLAG_AVX512ICL |
   99|       |             DAV1D_X86_CPU_FLAG_AVX2 |
  100|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  101|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  102|       |             DAV1D_X86_CPU_FLAG_SSE2;
  103|       |#elif defined(__AVX2__)
  104|       |    flags |= DAV1D_X86_CPU_FLAG_AVX2 |
  105|       |             DAV1D_X86_CPU_FLAG_SSE41 |
  106|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  107|       |             DAV1D_X86_CPU_FLAG_SSE2;
  108|       |#elif defined(__SSE4_1__) || defined(__AVX__)
  109|       |    flags |= DAV1D_X86_CPU_FLAG_SSE41 |
  110|       |             DAV1D_X86_CPU_FLAG_SSSE3 |
  111|       |             DAV1D_X86_CPU_FLAG_SSE2;
  112|       |#elif defined(__SSSE3__)
  113|       |    flags |= DAV1D_X86_CPU_FLAG_SSSE3 |
  114|       |             DAV1D_X86_CPU_FLAG_SSE2;
  115|       |#elif ARCH_X86_64 || defined(__SSE2__) || \
  116|       |      (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
  117|       |    flags |= DAV1D_X86_CPU_FLAG_SSE2;
  118|  15.4k|#endif
  119|  15.4k|#endif
  120|       |
  121|  15.4k|    return flags;
  122|  15.4k|}

ctx.c:memset_w1:
   34|  22.1M|static void memset_w1(void *const ptr, const int value) {
   35|  22.1M|    set_ctx1((uint8_t *) ptr, 0, value);
  ------------------
  |  |   56|  22.1M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  ------------------
   36|  22.1M|}
ctx.c:memset_w2:
   38|  4.94M|static void memset_w2(void *const ptr, const int value) {
   39|  4.94M|    set_ctx2((uint8_t *) ptr, 0, value);
  ------------------
  |  |   58|  4.94M|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  ------------------
   40|  4.94M|}
ctx.c:memset_w4:
   42|  2.41M|static void memset_w4(void *const ptr, const int value) {
   43|  2.41M|    set_ctx4((uint8_t *) ptr, 0, value);
  ------------------
  |  |   60|  2.41M|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  ------------------
   44|  2.41M|}
ctx.c:memset_w8:
   46|  1.91M|static void memset_w8(void *const ptr, const int value) {
   47|  1.91M|    set_ctx8((uint8_t *) ptr, 0, value);
  ------------------
  |  |   62|  1.91M|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  ------------------
   48|  1.91M|}
ctx.c:memset_w16:
   50|   410k|static void memset_w16(void *const ptr, const int value) {
   51|   410k|    set_ctx16((uint8_t *) ptr, 0, value);
  ------------------
  |  |   63|   410k|#define set_ctx16(var, off, val) do { \
  |  |   64|   410k|        memset(&(var)[off], val, 16); \
  |  |   65|   410k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (65:14): [Folded, False: 410k]
  |  |  ------------------
  ------------------
   52|   410k|}
ctx.c:memset_w32:
   54|  41.4k|static void memset_w32(void *const ptr, const int value) {
   55|  41.4k|    set_ctx32((uint8_t *) ptr, 0, value);
  ------------------
  |  |   66|  41.4k|#define set_ctx32(var, off, val) do { \
  |  |   67|  41.4k|        memset(&(var)[off], val, 32); \
  |  |   68|  41.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (68:14): [Folded, False: 41.4k]
  |  |  ------------------
  ------------------
   56|  41.4k|}

lf_mask.c:dav1d_memset_likely_pow2:
   44|  2.89M|static inline void dav1d_memset_likely_pow2(void *const ptr, const int value, const int n) {
   45|  2.89M|    assert(n >= 1 && n <= 32);
  ------------------
  |  |  140|  5.78M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 2.89M, False: 578]
  |  |  |  Branch (140:30): [True: 2.89M, False: 18.4E]
  |  |  |  Branch (140:68): [Folded, False: 2.89M]
  |  |  ------------------
  ------------------
   46|  2.89M|    if ((n&(n-1)) == 0) {
  ------------------
  |  Branch (46:9): [True: 2.84M, False: 45.1k]
  ------------------
   47|  2.84M|        dav1d_memset_pow2[ulog2(n)](ptr, value);
   48|  2.84M|    } else {
   49|  45.1k|        memset(ptr, value, n);
   50|  45.1k|    }
   51|  2.89M|}
recon_tmpl.c:dav1d_memset_likely_pow2:
   44|  8.95M|static inline void dav1d_memset_likely_pow2(void *const ptr, const int value, const int n) {
   45|  8.95M|    assert(n >= 1 && n <= 32);
  ------------------
  |  |  140|  17.9M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 8.95M, False: 908]
  |  |  |  Branch (140:30): [True: 8.95M, False: 18.4E]
  |  |  |  Branch (140:68): [Folded, False: 8.95M]
  |  |  ------------------
  ------------------
   46|  8.95M|    if ((n&(n-1)) == 0) {
  ------------------
  |  Branch (46:9): [True: 8.81M, False: 141k]
  ------------------
   47|  8.81M|        dav1d_memset_pow2[ulog2(n)](ptr, value);
   48|  8.81M|    } else {
   49|   141k|        memset(ptr, value, n);
   50|   141k|    }
   51|  8.95M|}

dav1d_data_wrap_internal:
   62|  20.8k|{
   63|  20.8k|    validate_input_or_ret(buf != NULL, DAV1D_ERR(EINVAL));
  ------------------
  |  |   52|  20.8k|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:9): [True: 0, False: 20.8k]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
   64|  20.8k|    validate_input_or_ret(ptr != NULL, DAV1D_ERR(EINVAL));
  ------------------
  |  |   52|  20.8k|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:9): [True: 0, False: 20.8k]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
   65|  20.8k|    validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL));
  ------------------
  |  |   52|  20.8k|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:9): [True: 0, False: 20.8k]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
   66|       |
   67|  20.8k|    if (sz > SIZE_MAX / 2) return DAV1D_ERR(EINVAL);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  |  Branch (67:9): [True: 0, False: 20.8k]
  ------------------
   68|  20.8k|    Dav1dRef *const ref = dav1d_malloc(ALLOC_DAV1DDATA, sizeof(Dav1dRef));
  ------------------
  |  |  132|  20.8k|#define dav1d_malloc(type, sz) malloc(sz)
  ------------------
   69|  20.8k|    if (!ref) return DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  |  Branch (69:9): [True: 0, False: 20.8k]
  ------------------
   70|       |
   71|  20.8k|    buf->ref = dav1d_ref_init(ref, ptr, free_callback, cookie, 1);
   72|  20.8k|    buf->data = ptr;
   73|  20.8k|    buf->sz = sz;
   74|  20.8k|    dav1d_data_props_set_defaults(&buf->m);
   75|  20.8k|    buf->m.size = sz;
   76|       |
   77|  20.8k|    return 0;
   78|  20.8k|}
dav1d_data_ref:
   98|  42.4k|void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) {
   99|  42.4k|    assert(dst != NULL);
  ------------------
  |  |  140|  42.4k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 42.4k]
  |  |  |  Branch (140:68): [Folded, False: 42.4k]
  |  |  ------------------
  ------------------
  100|  42.4k|    assert(dst->data == NULL);
  ------------------
  |  |  140|  42.4k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 42.4k]
  |  |  |  Branch (140:68): [Folded, False: 42.4k]
  |  |  ------------------
  ------------------
  101|  42.4k|    assert(src != NULL);
  ------------------
  |  |  140|  42.4k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 42.4k]
  |  |  |  Branch (140:68): [Folded, False: 42.4k]
  |  |  ------------------
  ------------------
  102|       |
  103|  42.4k|    if (src->ref) {
  ------------------
  |  Branch (103:9): [True: 42.4k, False: 0]
  ------------------
  104|  42.4k|        assert(src->data != NULL);
  ------------------
  |  |  140|  42.4k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 42.4k]
  |  |  |  Branch (140:68): [Folded, False: 42.4k]
  |  |  ------------------
  ------------------
  105|  42.4k|        dav1d_ref_inc(src->ref);
  106|  42.4k|    }
  107|  42.4k|    if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
  ------------------
  |  Branch (107:9): [True: 0, False: 42.4k]
  ------------------
  108|  42.4k|    *dst = *src;
  109|  42.4k|}
dav1d_data_props_copy:
  113|  35.5k|{
  114|  35.5k|    assert(dst != NULL);
  ------------------
  |  |  140|  35.5k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 35.5k]
  |  |  |  Branch (140:68): [Folded, False: 35.5k]
  |  |  ------------------
  ------------------
  115|  35.5k|    assert(src != NULL);
  ------------------
  |  |  140|  35.5k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 35.5k]
  |  |  |  Branch (140:68): [Folded, False: 35.5k]
  |  |  ------------------
  ------------------
  116|       |
  117|  35.5k|    dav1d_ref_dec(&dst->user_data.ref);
  118|  35.5k|    *dst = *src;
  119|  35.5k|    if (dst->user_data.ref) dav1d_ref_inc(dst->user_data.ref);
  ------------------
  |  Branch (119:9): [True: 0, False: 35.5k]
  ------------------
  120|  35.5k|}
dav1d_data_props_set_defaults:
  122|   485k|void dav1d_data_props_set_defaults(Dav1dDataProps *const props) {
  123|   485k|    assert(props != NULL);
  ------------------
  |  |  140|   485k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 485k]
  |  |  |  Branch (140:68): [Folded, False: 485k]
  |  |  ------------------
  ------------------
  124|       |
  125|   485k|    memset(props, 0, sizeof(*props));
  126|       |    props->timestamp = INT64_MIN;
  127|   485k|    props->offset = -1;
  128|   485k|}
dav1d_data_props_unref_internal:
  130|  17.2k|void dav1d_data_props_unref_internal(Dav1dDataProps *const props) {
  131|  17.2k|    validate_input(props != NULL);
  ------------------
  |  |   59|  17.2k|#define validate_input(x) validate_input_or_ret(x, )
  |  |  ------------------
  |  |  |  |   52|  17.2k|    if (!(x)) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (52:9): [True: 0, False: 17.2k]
  |  |  |  |  ------------------
  |  |  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  |  |  ------------------
  |  |  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   54|      0|                    #x, __func__); \
  |  |  |  |   55|      0|        debug_abort(); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   56|      0|        return r; \
  |  |  |  |   57|      0|    }
  |  |  ------------------
  ------------------
  132|       |
  133|  17.2k|    struct Dav1dRef *user_data_ref = props->user_data.ref;
  134|  17.2k|    dav1d_data_props_set_defaults(props);
  135|  17.2k|    dav1d_ref_dec(&user_data_ref);
  136|  17.2k|}
dav1d_data_unref_internal:
  138|  80.5k|void dav1d_data_unref_internal(Dav1dData *const buf) {
  139|  80.5k|    validate_input(buf != NULL);
  ------------------
  |  |   59|  80.5k|#define validate_input(x) validate_input_or_ret(x, )
  |  |  ------------------
  |  |  |  |   52|  80.5k|    if (!(x)) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (52:9): [True: 0, False: 80.5k]
  |  |  |  |  ------------------
  |  |  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  |  |  ------------------
  |  |  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   54|      0|                    #x, __func__); \
  |  |  |  |   55|      0|        debug_abort(); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   56|      0|        return r; \
  |  |  |  |   57|      0|    }
  |  |  ------------------
  ------------------
  140|       |
  141|  80.5k|    struct Dav1dRef *user_data_ref = buf->m.user_data.ref;
  142|  80.5k|    if (buf->ref) {
  ------------------
  |  Branch (142:9): [True: 63.3k, False: 17.2k]
  ------------------
  143|  63.3k|        validate_input(buf->data != NULL);
  ------------------
  |  |   59|  63.3k|#define validate_input(x) validate_input_or_ret(x, )
  |  |  ------------------
  |  |  |  |   52|  63.3k|    if (!(x)) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (52:9): [True: 0, False: 63.3k]
  |  |  |  |  ------------------
  |  |  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  |  |  ------------------
  |  |  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   54|      0|                    #x, __func__); \
  |  |  |  |   55|      0|        debug_abort(); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   56|      0|        return r; \
  |  |  |  |   57|      0|    }
  |  |  ------------------
  ------------------
  144|  63.3k|        dav1d_ref_dec(&buf->ref);
  145|  63.3k|    }
  146|  80.5k|    memset(buf, 0, sizeof(*buf));
  147|  80.5k|    dav1d_data_props_set_defaults(&buf->m);
  148|  80.5k|    dav1d_ref_dec(&user_data_ref);
  149|  80.5k|}

dav1d_decode_tile_sbrow:
 2594|   150k|int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) {
 2595|   150k|    const Dav1dFrameContext *const f = t->f;
 2596|   150k|    const enum BlockLevel root_bl = f->seq_hdr->sb128 ? BL_128X128 : BL_64X64;
  ------------------
  |  Branch (2596:37): [True: 54.9k, False: 95.9k]
  ------------------
 2597|   150k|    Dav1dTileState *const ts = t->ts;
 2598|   150k|    const Dav1dContext *const c = f->c;
 2599|   150k|    const int sb_step = f->sb_step;
 2600|   150k|    const int tile_row = ts->tiling.row, tile_col = ts->tiling.col;
 2601|   150k|    const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
 2602|   150k|    const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;
 2603|       |
 2604|   150k|    if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
  ------------------
  |  |   36|   301k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 8.29k, False: 142k]
  |  |  ------------------
  ------------------
  |  Branch (2604:45): [True: 39.9k, False: 102k]
  ------------------
 2605|  48.2k|        dav1d_refmvs_tile_sbrow_init(&t->rt, &f->rf, ts->tiling.col_start,
 2606|  48.2k|                                     ts->tiling.col_end, ts->tiling.row_start,
 2607|  48.2k|                                     ts->tiling.row_end, t->by >> f->sb_shift,
 2608|  48.2k|                                     ts->tiling.row, t->frame_thread.pass);
 2609|  48.2k|    }
 2610|       |
 2611|   150k|    if (IS_INTER_OR_SWITCH(f->frame_hdr) && c->n_fc > 1) {
  ------------------
  |  |   36|   301k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 8.29k, False: 142k]
  |  |  ------------------
  ------------------
  |  Branch (2611:45): [True: 0, False: 8.29k]
  ------------------
 2612|      0|        const int sby = (t->by - ts->tiling.row_start) >> f->sb_shift;
 2613|      0|        int (*const lowest_px)[2] = ts->lowest_pixel[sby];
 2614|      0|        for (int n = 0; n < 7; n++)
  ------------------
  |  Branch (2614:25): [True: 0, False: 0]
  ------------------
 2615|      0|            for (int m = 0; m < 2; m++)
  ------------------
  |  Branch (2615:29): [True: 0, False: 0]
  ------------------
 2616|      0|                lowest_px[n][m] = INT_MIN;
 2617|      0|    }
 2618|       |
 2619|   150k|    reset_context(&t->l, IS_KEY_OR_INTRA(f->frame_hdr), t->frame_thread.pass);
  ------------------
  |  |   43|   150k|    (!IS_INTER_OR_SWITCH(frame_header))
  |  |  ------------------
  |  |  |  |   36|   150k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  ------------------
 2620|   150k|    if (t->frame_thread.pass == 2) {
  ------------------
  |  Branch (2620:9): [True: 0, False: 150k]
  ------------------
 2621|      0|        const int off_2pass = c->n_tc > 1 ? f->sb128w * f->frame_hdr->tiling.rows : 0;
  ------------------
  |  Branch (2621:31): [True: 0, False: 0]
  ------------------
 2622|      0|        for (t->bx = ts->tiling.col_start,
 2623|      0|             t->a = f->a + off_2pass + col_sb128_start + tile_row * f->sb128w;
 2624|      0|             t->bx < ts->tiling.col_end; t->bx += sb_step)
  ------------------
  |  Branch (2624:14): [True: 0, False: 0]
  ------------------
 2625|      0|        {
 2626|      0|            if (atomic_load_explicit(c->flush, memory_order_acquire))
  ------------------
  |  Branch (2626:17): [True: 0, False: 0]
  ------------------
 2627|      0|                return 1;
 2628|      0|            if (decode_sb(t, root_bl, dav1d_intra_edge_tree[root_bl]))
  ------------------
  |  Branch (2628:17): [True: 0, False: 0]
  ------------------
 2629|      0|                return 1;
 2630|      0|            if (t->bx & 16 || f->seq_hdr->sb128)
  ------------------
  |  Branch (2630:17): [True: 0, False: 0]
  |  Branch (2630:31): [True: 0, False: 0]
  ------------------
 2631|      0|                t->a++;
 2632|      0|        }
 2633|      0|        f->bd_fn.backup_ipred_edge(t);
 2634|      0|        return 0;
 2635|      0|    }
 2636|       |
 2637|   150k|    if (f->c->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) {
  ------------------
  |  Branch (2637:9): [True: 143k, False: 7.10k]
  |  Branch (2637:27): [True: 5.92k, False: 137k]
  ------------------
 2638|  5.92k|        f->c->refmvs_dsp.load_tmvs(&f->rf, ts->tiling.row,
 2639|  5.92k|                                   ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
 2640|  5.92k|                                   t->by >> 1, (t->by + sb_step) >> 1);
 2641|  5.92k|    }
 2642|   150k|    memset(t->pal_sz_uv[1], 0, sizeof(*t->pal_sz_uv));
 2643|   150k|    const int sb128y = t->by >> 5;
 2644|   150k|    for (t->bx = ts->tiling.col_start, t->a = f->a + col_sb128_start + tile_row * f->sb128w,
 2645|   150k|         t->lf_mask = f->lf.mask + sb128y * f->sb128w + col_sb128_start;
 2646|   476k|         t->bx < ts->tiling.col_end; t->bx += sb_step)
  ------------------
  |  Branch (2646:10): [True: 325k, False: 150k]
  ------------------
 2647|   325k|    {
 2648|   325k|        if (atomic_load_explicit(c->flush, memory_order_acquire))
  ------------------
  |  Branch (2648:13): [True: 0, False: 325k]
  ------------------
 2649|      0|            return 1;
 2650|   325k|        if (root_bl == BL_128X128) {
  ------------------
  |  Branch (2650:13): [True: 120k, False: 205k]
  ------------------
 2651|   120k|            t->cur_sb_cdef_idx_ptr = t->lf_mask->cdef_idx;
 2652|   120k|            t->cur_sb_cdef_idx_ptr[0] = -1;
 2653|   120k|            t->cur_sb_cdef_idx_ptr[1] = -1;
 2654|   120k|            t->cur_sb_cdef_idx_ptr[2] = -1;
 2655|   120k|            t->cur_sb_cdef_idx_ptr[3] = -1;
 2656|   205k|        } else {
 2657|   205k|            t->cur_sb_cdef_idx_ptr =
 2658|   205k|                &t->lf_mask->cdef_idx[((t->bx & 16) >> 4) +
 2659|   205k|                                      ((t->by & 16) >> 3)];
 2660|   205k|            t->cur_sb_cdef_idx_ptr[0] = -1;
 2661|   205k|        }
 2662|       |        // Restoration filter
 2663|  1.30M|        for (int p = 0; p < 3; p++) {
  ------------------
  |  Branch (2663:25): [True: 976k, False: 325k]
  ------------------
 2664|   976k|            if (!((f->lf.restore_planes >> p) & 1U))
  ------------------
  |  Branch (2664:17): [True: 816k, False: 159k]
  ------------------
 2665|   816k|                continue;
 2666|       |
 2667|   159k|            const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
  ------------------
  |  Branch (2667:32): [True: 70.6k, False: 89.0k]
  |  Branch (2667:37): [True: 17.4k, False: 53.1k]
  ------------------
 2668|   159k|            const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
  ------------------
  |  Branch (2668:32): [True: 70.6k, False: 89.0k]
  |  Branch (2668:37): [True: 17.9k, False: 52.7k]
  ------------------
 2669|   159k|            const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p];
 2670|   159k|            const int y = t->by * 4 >> ss_ver;
 2671|   159k|            const int h = (f->cur.p.h + ss_ver) >> ss_ver;
 2672|       |
 2673|   159k|            const int unit_size = 1 << unit_size_log2;
 2674|   159k|            const unsigned mask = unit_size - 1;
 2675|   159k|            if (y & mask) continue;
  ------------------
  |  Branch (2675:17): [True: 40.4k, False: 119k]
  ------------------
 2676|   119k|            const int half_unit = unit_size >> 1;
 2677|       |            // Round half up at frame boundaries, if there's more than one
 2678|       |            // restoration unit
 2679|   119k|            if (y && y + half_unit > h) continue;
  ------------------
  |  Branch (2679:17): [True: 66.2k, False: 52.9k]
  |  Branch (2679:22): [True: 1.14k, False: 65.1k]
  ------------------
 2680|       |
 2681|   118k|            const enum Dav1dRestorationType frame_type = f->frame_hdr->restoration.type[p];
 2682|       |
 2683|   118k|            if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
  ------------------
  |  Branch (2683:17): [True: 23.6k, False: 94.4k]
  ------------------
 2684|  23.6k|                const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
 2685|  23.6k|                const int n_units = imax(1, (w + half_unit) >> unit_size_log2);
 2686|       |
 2687|  23.6k|                const int d = f->frame_hdr->super_res.width_scale_denominator;
 2688|  23.6k|                const int rnd = unit_size * 8 - 1, shift = unit_size_log2 + 3;
 2689|  23.6k|                const int x0 = ((4 *  t->bx            * d >> ss_hor) + rnd) >> shift;
 2690|  23.6k|                const int x1 = ((4 * (t->bx + sb_step) * d >> ss_hor) + rnd) >> shift;
 2691|       |
 2692|  43.9k|                for (int x = x0; x < imin(x1, n_units); x++) {
  ------------------
  |  Branch (2692:34): [True: 20.3k, False: 23.6k]
  ------------------
 2693|  20.3k|                    const int px_x = x << (unit_size_log2 + ss_hor);
 2694|  20.3k|                    const int sb_idx = (t->by >> 5) * f->sr_sb128w + (px_x >> 7);
 2695|  20.3k|                    const int unit_idx = ((t->by & 16) >> 3) + ((px_x & 64) >> 6);
 2696|  20.3k|                    Av1RestorationUnit *const lr = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];
 2697|       |
 2698|  20.3k|                    read_restoration_info(t, lr, p, frame_type);
 2699|  20.3k|                }
 2700|  94.4k|            } else {
 2701|  94.4k|                const int x = 4 * t->bx >> ss_hor;
 2702|  94.4k|                if (x & mask) continue;
  ------------------
  |  Branch (2702:21): [True: 4.21k, False: 90.2k]
  ------------------
 2703|  90.2k|                const int w = (f->cur.p.w + ss_hor) >> ss_hor;
 2704|       |                // Round half up at frame boundaries, if there's more than one
 2705|       |                // restoration unit
 2706|  90.2k|                if (x && x + half_unit > w) continue;
  ------------------
  |  Branch (2706:21): [True: 26.0k, False: 64.1k]
  |  Branch (2706:26): [True: 907, False: 25.1k]
  ------------------
 2707|  89.3k|                const int sb_idx = (t->by >> 5) * f->sr_sb128w + (t->bx >> 5);
 2708|  89.3k|                const int unit_idx = ((t->by & 16) >> 3) + ((t->bx & 16) >> 4);
 2709|  89.3k|                Av1RestorationUnit *const lr = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];
 2710|       |
 2711|  89.3k|                read_restoration_info(t, lr, p, frame_type);
 2712|  89.3k|            }
 2713|   118k|        }
 2714|   325k|        if (decode_sb(t, root_bl, dav1d_intra_edge_tree[root_bl]))
  ------------------
  |  Branch (2714:13): [True: 557, False: 325k]
  ------------------
 2715|    557|            return 1;
 2716|   325k|        if (t->bx & 16 || f->seq_hdr->sb128) {
  ------------------
  |  Branch (2716:13): [True: 60.1k, False: 264k]
  |  Branch (2716:27): [True: 120k, False: 144k]
  ------------------
 2717|   180k|            t->a++;
 2718|   180k|            t->lf_mask++;
 2719|   180k|        }
 2720|   325k|    }
 2721|       |
 2722|   150k|    if (f->seq_hdr->ref_frame_mvs && f->c->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) {
  ------------------
  |  |   36|  14.8k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 7.24k, False: 7.62k]
  |  |  ------------------
  ------------------
  |  Branch (2722:9): [True: 15.8k, False: 134k]
  |  Branch (2722:38): [True: 14.8k, False: 1.01k]
  ------------------
 2723|  7.24k|        dav1d_refmvs_save_tmvs(&f->c->refmvs_dsp, &t->rt,
 2724|  7.24k|                               ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
 2725|  7.24k|                               t->by >> 1, (t->by + sb_step) >> 1);
 2726|  7.24k|    }
 2727|       |
 2728|       |    // backup pre-loopfilter pixels for intra prediction of the next sbrow
 2729|   150k|    if (t->frame_thread.pass != 1)
  ------------------
  |  Branch (2729:9): [True: 150k, False: 27]
  ------------------
 2730|   150k|        f->bd_fn.backup_ipred_edge(t);
 2731|       |
 2732|       |    // backup t->a/l.tx_lpf_y/uv at tile boundaries to use them to "fix"
 2733|       |    // up the initial value in neighbour tiles when running the loopfilter
 2734|   150k|    int align_h = (f->bh + 31) & ~31;
 2735|   150k|    memcpy(&f->lf.tx_lpf_right_edge[0][align_h * tile_col + t->by],
 2736|   150k|           &t->l.tx_lpf_y[t->by & 16], sb_step);
 2737|   150k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 2738|   150k|    align_h >>= ss_ver;
 2739|   150k|    memcpy(&f->lf.tx_lpf_right_edge[1][align_h * tile_col + (t->by >> ss_ver)],
 2740|   150k|           &t->l.tx_lpf_uv[(t->by & 16) >> ss_ver], sb_step >> ss_ver);
 2741|       |
 2742|       |    // error out on symbol decoder overread
 2743|   150k|    if (ts->msac.cnt <= -15) return 1;
  ------------------
  |  Branch (2743:9): [True: 13.7k, False: 136k]
  ------------------
 2744|       |
 2745|   136k|    return c->strict_std_compliance &&
  ------------------
  |  Branch (2745:12): [True: 0, False: 136k]
  ------------------
 2746|      0|           (t->by >> f->sb_shift) + 1 >= f->frame_hdr->tiling.row_start_sb[tile_row + 1] &&
  ------------------
  |  Branch (2746:12): [True: 0, False: 0]
  ------------------
 2747|      0|           check_trailing_bits_after_symbol_coder(&ts->msac);
  ------------------
  |  Branch (2747:12): [True: 0, False: 0]
  ------------------
 2748|   150k|}
dav1d_decode_frame_init:
 2750|  21.4k|int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
 2751|  21.4k|    const Dav1dContext *const c = f->c;
 2752|  21.4k|    int retval = DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|  21.4k|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
 2753|       |
 2754|  21.4k|    if (f->sbh > f->lf.start_of_tile_row_sz) {
  ------------------
  |  Branch (2754:9): [True: 15.4k, False: 6.06k]
  ------------------
 2755|  15.4k|        dav1d_free(f->lf.start_of_tile_row);
  ------------------
  |  |  135|  15.4k|#define dav1d_free(ptr) free(ptr)
  ------------------
 2756|  15.4k|        f->lf.start_of_tile_row = dav1d_malloc(ALLOC_TILE, f->sbh * sizeof(uint8_t));
  ------------------
  |  |  132|  15.4k|#define dav1d_malloc(type, sz) malloc(sz)
  ------------------
 2757|  15.4k|        if (!f->lf.start_of_tile_row) {
  ------------------
  |  Branch (2757:13): [True: 0, False: 15.4k]
  ------------------
 2758|      0|            f->lf.start_of_tile_row_sz = 0;
 2759|      0|            goto error;
 2760|      0|        }
 2761|  15.4k|        f->lf.start_of_tile_row_sz = f->sbh;
 2762|  15.4k|    }
 2763|  21.4k|    int sby = 0;
 2764|  49.4k|    for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
  ------------------
  |  Branch (2764:28): [True: 27.9k, False: 21.4k]
  ------------------
 2765|  27.9k|        f->lf.start_of_tile_row[sby++] = tile_row;
 2766|   413k|        while (sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1])
  ------------------
  |  Branch (2766:16): [True: 385k, False: 27.9k]
  ------------------
 2767|   385k|            f->lf.start_of_tile_row[sby++] = 0;
 2768|  27.9k|    }
 2769|       |
 2770|  21.4k|    const int n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
 2771|  21.4k|    if (n_ts != f->n_ts) {
  ------------------
  |  Branch (2771:9): [True: 16.1k, False: 5.34k]
  ------------------
 2772|  16.1k|        if (c->n_fc > 1) {
  ------------------
  |  Branch (2772:13): [True: 0, False: 16.1k]
  ------------------
 2773|      0|            dav1d_free(f->frame_thread.tile_start_off);
  ------------------
  |  |  135|      0|#define dav1d_free(ptr) free(ptr)
  ------------------
 2774|      0|            f->frame_thread.tile_start_off =
 2775|      0|                dav1d_malloc(ALLOC_TILE, sizeof(*f->frame_thread.tile_start_off) * n_ts);
  ------------------
  |  |  132|      0|#define dav1d_malloc(type, sz) malloc(sz)
  ------------------
 2776|      0|            if (!f->frame_thread.tile_start_off) {
  ------------------
  |  Branch (2776:17): [True: 0, False: 0]
  ------------------
 2777|      0|                f->n_ts = 0;
 2778|      0|                goto error;
 2779|      0|            }
 2780|      0|        }
 2781|  16.1k|        dav1d_free_aligned(f->ts);
  ------------------
  |  |  136|  16.1k|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
 2782|  16.1k|        f->ts = dav1d_alloc_aligned(ALLOC_TILE, sizeof(*f->ts) * n_ts, 32);
  ------------------
  |  |  134|  16.1k|#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
  ------------------
 2783|  16.1k|        if (!f->ts) goto error;
  ------------------
  |  Branch (2783:13): [True: 0, False: 16.1k]
  ------------------
 2784|  16.1k|        f->n_ts = n_ts;
 2785|  16.1k|    }
 2786|       |
 2787|  21.4k|    const int a_sz = f->sb128w * f->frame_hdr->tiling.rows * (1 + (c->n_fc > 1 && c->n_tc > 1));
  ------------------
  |  Branch (2787:68): [True: 0, False: 21.4k]
  |  Branch (2787:83): [True: 0, False: 0]
  ------------------
 2788|  21.4k|    if (a_sz != f->a_sz) {
  ------------------
  |  Branch (2788:9): [True: 16.1k, False: 5.34k]
  ------------------
 2789|  16.1k|        dav1d_free(f->a);
  ------------------
  |  |  135|  16.1k|#define dav1d_free(ptr) free(ptr)
  ------------------
 2790|  16.1k|        f->a = dav1d_malloc(ALLOC_TILE, sizeof(*f->a) * a_sz);
  ------------------
  |  |  132|  16.1k|#define dav1d_malloc(type, sz) malloc(sz)
  ------------------
 2791|  16.1k|        if (!f->a) {
  ------------------
  |  Branch (2791:13): [True: 0, False: 16.1k]
  ------------------
 2792|      0|            f->a_sz = 0;
 2793|      0|            goto error;
 2794|      0|        }
 2795|  16.1k|        f->a_sz = a_sz;
 2796|  16.1k|    }
 2797|       |
 2798|  21.4k|    const int num_sb128 = f->sb128w * f->sb128h;
 2799|  21.4k|    const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
 2800|  21.4k|    const int hbd = !!f->seq_hdr->hbd;
 2801|  21.4k|    if (c->n_fc > 1) {
  ------------------
  |  Branch (2801:9): [True: 0, False: 21.4k]
  ------------------
 2802|      0|        const unsigned sb_step4 = f->sb_step * 4;
 2803|      0|        int tile_idx = 0;
 2804|      0|        for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
  ------------------
  |  Branch (2804:32): [True: 0, False: 0]
  ------------------
 2805|      0|            const unsigned row_off = f->frame_hdr->tiling.row_start_sb[tile_row] *
 2806|      0|                                     sb_step4 * f->sb128w * 128;
 2807|      0|            const unsigned b_diff = (f->frame_hdr->tiling.row_start_sb[tile_row + 1] -
 2808|      0|                                     f->frame_hdr->tiling.row_start_sb[tile_row]) * sb_step4;
 2809|      0|            for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
  ------------------
  |  Branch (2809:36): [True: 0, False: 0]
  ------------------
 2810|      0|                f->frame_thread.tile_start_off[tile_idx++] = row_off + b_diff *
 2811|      0|                    f->frame_hdr->tiling.col_start_sb[tile_col] * sb_step4;
 2812|      0|            }
 2813|      0|        }
 2814|       |
 2815|      0|        const int lowest_pixel_mem_sz = f->frame_hdr->tiling.cols * f->sbh;
 2816|      0|        if (lowest_pixel_mem_sz != f->tile_thread.lowest_pixel_mem_sz) {
  ------------------
  |  Branch (2816:13): [True: 0, False: 0]
  ------------------
 2817|      0|            dav1d_free(f->tile_thread.lowest_pixel_mem);
  ------------------
  |  |  135|      0|#define dav1d_free(ptr) free(ptr)
  ------------------
 2818|      0|            f->tile_thread.lowest_pixel_mem =
 2819|      0|                dav1d_malloc(ALLOC_TILE, lowest_pixel_mem_sz *
  ------------------
  |  |  132|      0|#define dav1d_malloc(type, sz) malloc(sz)
  ------------------
 2820|      0|                             sizeof(*f->tile_thread.lowest_pixel_mem));
 2821|      0|            if (!f->tile_thread.lowest_pixel_mem) {
  ------------------
  |  Branch (2821:17): [True: 0, False: 0]
  ------------------
 2822|      0|                f->tile_thread.lowest_pixel_mem_sz = 0;
 2823|      0|                goto error;
 2824|      0|            }
 2825|      0|            f->tile_thread.lowest_pixel_mem_sz = lowest_pixel_mem_sz;
 2826|      0|        }
 2827|      0|        int (*lowest_pixel_ptr)[7][2] = f->tile_thread.lowest_pixel_mem;
 2828|      0|        for (int tile_row = 0, tile_row_base = 0; tile_row < f->frame_hdr->tiling.rows;
  ------------------
  |  Branch (2828:51): [True: 0, False: 0]
  ------------------
 2829|      0|             tile_row++, tile_row_base += f->frame_hdr->tiling.cols)
 2830|      0|        {
 2831|      0|            const int tile_row_sb_h = f->frame_hdr->tiling.row_start_sb[tile_row + 1] -
 2832|      0|                                      f->frame_hdr->tiling.row_start_sb[tile_row];
 2833|      0|            for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
  ------------------
  |  Branch (2833:36): [True: 0, False: 0]
  ------------------
 2834|      0|                f->ts[tile_row_base + tile_col].lowest_pixel = lowest_pixel_ptr;
 2835|      0|                lowest_pixel_ptr += tile_row_sb_h;
 2836|      0|            }
 2837|      0|        }
 2838|       |
 2839|      0|        const int cbi_sz = num_sb128 * size_mul[0];
 2840|      0|        if (cbi_sz != f->frame_thread.cbi_sz) {
  ------------------
  |  Branch (2840:13): [True: 0, False: 0]
  ------------------
 2841|      0|            dav1d_free_aligned(f->frame_thread.cbi);
  ------------------
  |  |  136|      0|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
 2842|      0|            f->frame_thread.cbi =
 2843|      0|                dav1d_alloc_aligned(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) *
  ------------------
  |  |  134|      0|#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
  ------------------
 2844|      0|                                    cbi_sz * 32 * 32 / 4, 64);
 2845|      0|            if (!f->frame_thread.cbi) {
  ------------------
  |  Branch (2845:17): [True: 0, False: 0]
  ------------------
 2846|      0|                f->frame_thread.cbi_sz = 0;
 2847|      0|                goto error;
 2848|      0|            }
 2849|      0|            f->frame_thread.cbi_sz = cbi_sz;
 2850|      0|        }
 2851|       |
 2852|      0|        const int cf_sz = (num_sb128 * size_mul[0]) << hbd;
 2853|      0|        if (cf_sz != f->frame_thread.cf_sz) {
  ------------------
  |  Branch (2853:13): [True: 0, False: 0]
  ------------------
 2854|      0|            dav1d_free_aligned(f->frame_thread.cf);
  ------------------
  |  |  136|      0|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
 2855|      0|            f->frame_thread.cf =
 2856|      0|                dav1d_alloc_aligned(ALLOC_COEF, (size_t)cf_sz * 128 * 128 / 2, 64);
  ------------------
  |  |  134|      0|#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
  ------------------
 2857|      0|            if (!f->frame_thread.cf) {
  ------------------
  |  Branch (2857:17): [True: 0, False: 0]
  ------------------
 2858|      0|                f->frame_thread.cf_sz = 0;
 2859|      0|                goto error;
 2860|      0|            }
 2861|      0|            memset(f->frame_thread.cf, 0, (size_t)cf_sz * 128 * 128 / 2);
 2862|      0|            f->frame_thread.cf_sz = cf_sz;
 2863|      0|        }
 2864|       |
 2865|      0|        if (f->frame_hdr->allow_screen_content_tools) {
  ------------------
  |  Branch (2865:13): [True: 0, False: 0]
  ------------------
 2866|      0|            const int pal_sz = num_sb128 << hbd;
 2867|      0|            if (pal_sz != f->frame_thread.pal_sz) {
  ------------------
  |  Branch (2867:17): [True: 0, False: 0]
  ------------------
 2868|      0|                dav1d_free_aligned(f->frame_thread.pal);
  ------------------
  |  |  136|      0|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
 2869|      0|                f->frame_thread.pal =
 2870|      0|                    dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal) *
  ------------------
  |  |  134|      0|#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
  ------------------
 2871|      0|                                        pal_sz * 16 * 16, 64);
 2872|      0|                if (!f->frame_thread.pal) {
  ------------------
  |  Branch (2872:21): [True: 0, False: 0]
  ------------------
 2873|      0|                    f->frame_thread.pal_sz = 0;
 2874|      0|                    goto error;
 2875|      0|                }
 2876|      0|                f->frame_thread.pal_sz = pal_sz;
 2877|      0|            }
 2878|       |
 2879|      0|            const int pal_idx_sz = num_sb128 * size_mul[1];
 2880|      0|            if (pal_idx_sz != f->frame_thread.pal_idx_sz) {
  ------------------
  |  Branch (2880:17): [True: 0, False: 0]
  ------------------
 2881|      0|                dav1d_free_aligned(f->frame_thread.pal_idx);
  ------------------
  |  |  136|      0|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
 2882|      0|                f->frame_thread.pal_idx =
 2883|      0|                    dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal_idx) *
  ------------------
  |  |  134|      0|#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
  ------------------
 2884|      0|                                        pal_idx_sz * 128 * 128 / 8, 64);
 2885|      0|                if (!f->frame_thread.pal_idx) {
  ------------------
  |  Branch (2885:21): [True: 0, False: 0]
  ------------------
 2886|      0|                    f->frame_thread.pal_idx_sz = 0;
 2887|      0|                    goto error;
 2888|      0|                }
 2889|      0|                f->frame_thread.pal_idx_sz = pal_idx_sz;
 2890|      0|            }
 2891|      0|        } else if (f->frame_thread.pal) {
  ------------------
  |  Branch (2891:20): [True: 0, False: 0]
  ------------------
 2892|      0|            dav1d_freep_aligned(&f->frame_thread.pal);
 2893|      0|            dav1d_freep_aligned(&f->frame_thread.pal_idx);
 2894|      0|            f->frame_thread.pal_sz = f->frame_thread.pal_idx_sz = 0;
 2895|      0|        }
 2896|      0|    }
 2897|       |
 2898|       |    // update allocation of block contexts for above
 2899|  21.4k|    ptrdiff_t y_stride = f->cur.stride[0], uv_stride = f->cur.stride[1];
 2900|  21.4k|    const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
 2901|  21.4k|    const int need_cdef_lpf_copy = c->n_tc > 1 && has_resize;
  ------------------
  |  Branch (2901:36): [True: 20.1k, False: 1.31k]
  |  Branch (2901:51): [True: 1.11k, False: 19.0k]
  ------------------
 2902|  21.4k|    if (y_stride * f->sbh * 4 != f->lf.cdef_buf_plane_sz[0] ||
  ------------------
  |  Branch (2902:9): [True: 15.5k, False: 5.97k]
  ------------------
 2903|  5.97k|        uv_stride * f->sbh * 8 != f->lf.cdef_buf_plane_sz[1] ||
  ------------------
  |  Branch (2903:9): [True: 145, False: 5.82k]
  ------------------
 2904|  5.82k|        need_cdef_lpf_copy != f->lf.need_cdef_lpf_copy ||
  ------------------
  |  Branch (2904:9): [True: 104, False: 5.72k]
  ------------------
 2905|  5.72k|        f->sbh != f->lf.cdef_buf_sbh)
  ------------------
  |  Branch (2905:9): [True: 3, False: 5.71k]
  ------------------
 2906|  15.7k|    {
 2907|  15.7k|        dav1d_free_aligned(f->lf.cdef_line_buf);
  ------------------
  |  |  136|  15.7k|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
 2908|  15.7k|        size_t alloc_sz = 64;
 2909|  15.7k|        alloc_sz += (size_t)llabs(y_stride) * 4 * f->sbh << need_cdef_lpf_copy;
 2910|  15.7k|        alloc_sz += (size_t)llabs(uv_stride) * 8 * f->sbh << need_cdef_lpf_copy;
 2911|  15.7k|        uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(ALLOC_CDEF, alloc_sz, 32);
  ------------------
  |  |  134|  15.7k|#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
  ------------------
 2912|  15.7k|        if (!ptr) {
  ------------------
  |  Branch (2912:13): [True: 0, False: 15.7k]
  ------------------
 2913|      0|            f->lf.cdef_buf_plane_sz[0] = f->lf.cdef_buf_plane_sz[1] = 0;
 2914|      0|            goto error;
 2915|      0|        }
 2916|       |
 2917|  15.7k|        ptr += 32;
 2918|  15.7k|        if (y_stride < 0) {
  ------------------
  |  Branch (2918:13): [True: 0, False: 15.7k]
  ------------------
 2919|      0|            f->lf.cdef_line[0][0] = ptr - y_stride * (f->sbh * 4 - 1);
 2920|      0|            f->lf.cdef_line[1][0] = ptr - y_stride * (f->sbh * 4 - 3);
 2921|  15.7k|        } else {
 2922|  15.7k|            f->lf.cdef_line[0][0] = ptr + y_stride * 0;
 2923|  15.7k|            f->lf.cdef_line[1][0] = ptr + y_stride * 2;
 2924|  15.7k|        }
 2925|  15.7k|        ptr += llabs(y_stride) * f->sbh * 4;
 2926|  15.7k|        if (uv_stride < 0) {
  ------------------
  |  Branch (2926:13): [True: 0, False: 15.7k]
  ------------------
 2927|      0|            f->lf.cdef_line[0][1] = ptr - uv_stride * (f->sbh * 8 - 1);
 2928|      0|            f->lf.cdef_line[0][2] = ptr - uv_stride * (f->sbh * 8 - 3);
 2929|      0|            f->lf.cdef_line[1][1] = ptr - uv_stride * (f->sbh * 8 - 5);
 2930|      0|            f->lf.cdef_line[1][2] = ptr - uv_stride * (f->sbh * 8 - 7);
 2931|  15.7k|        } else {
 2932|  15.7k|            f->lf.cdef_line[0][1] = ptr + uv_stride * 0;
 2933|  15.7k|            f->lf.cdef_line[0][2] = ptr + uv_stride * 2;
 2934|  15.7k|            f->lf.cdef_line[1][1] = ptr + uv_stride * 4;
 2935|  15.7k|            f->lf.cdef_line[1][2] = ptr + uv_stride * 6;
 2936|  15.7k|        }
 2937|       |
 2938|  15.7k|        if (need_cdef_lpf_copy) {
  ------------------
  |  Branch (2938:13): [True: 1.06k, False: 14.7k]
  ------------------
 2939|  1.06k|            ptr += llabs(uv_stride) * f->sbh * 8;
 2940|  1.06k|            if (y_stride < 0)
  ------------------
  |  Branch (2940:17): [True: 0, False: 1.06k]
  ------------------
 2941|      0|                f->lf.cdef_lpf_line[0] = ptr - y_stride * (f->sbh * 4 - 1);
 2942|  1.06k|            else
 2943|  1.06k|                f->lf.cdef_lpf_line[0] = ptr;
 2944|  1.06k|            ptr += llabs(y_stride) * f->sbh * 4;
 2945|  1.06k|            if (uv_stride < 0) {
  ------------------
  |  Branch (2945:17): [True: 0, False: 1.06k]
  ------------------
 2946|      0|                f->lf.cdef_lpf_line[1] = ptr - uv_stride * (f->sbh * 4 - 1);
 2947|      0|                f->lf.cdef_lpf_line[2] = ptr - uv_stride * (f->sbh * 8 - 1);
 2948|  1.06k|            } else {
 2949|  1.06k|                f->lf.cdef_lpf_line[1] = ptr;
 2950|  1.06k|                f->lf.cdef_lpf_line[2] = ptr + uv_stride * f->sbh * 4;
 2951|  1.06k|            }
 2952|  1.06k|        }
 2953|       |
 2954|  15.7k|        f->lf.cdef_buf_plane_sz[0] = (int) y_stride * f->sbh * 4;
 2955|  15.7k|        f->lf.cdef_buf_plane_sz[1] = (int) uv_stride * f->sbh * 8;
 2956|  15.7k|        f->lf.need_cdef_lpf_copy = need_cdef_lpf_copy;
 2957|  15.7k|        f->lf.cdef_buf_sbh = f->sbh;
 2958|  15.7k|    }
 2959|       |
 2960|  21.4k|    const int sb128 = f->seq_hdr->sb128;
 2961|  21.4k|    const int num_lines = c->n_tc > 1 ? f->sbh * 4 << sb128 : 12;
  ------------------
  |  Branch (2961:27): [True: 20.1k, False: 1.31k]
  ------------------
 2962|  21.4k|    y_stride = f->sr_cur.p.stride[0], uv_stride = f->sr_cur.p.stride[1];
 2963|  21.4k|    if (y_stride * num_lines != f->lf.lr_buf_plane_sz[0] ||
  ------------------
  |  Branch (2963:9): [True: 15.5k, False: 5.93k]
  ------------------
 2964|  5.93k|        uv_stride * num_lines * 2 != f->lf.lr_buf_plane_sz[1])
  ------------------
  |  Branch (2964:9): [True: 114, False: 5.82k]
  ------------------
 2965|  15.6k|    {
 2966|  15.6k|        dav1d_free_aligned(f->lf.lr_line_buf);
  ------------------
  |  |  136|  15.6k|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
 2967|       |        // lr simd may overread the input, so slightly over-allocate the lpf buffer
 2968|  15.6k|        size_t alloc_sz = 128;
 2969|  15.6k|        alloc_sz += (size_t)llabs(y_stride) * num_lines;
 2970|  15.6k|        alloc_sz += (size_t)llabs(uv_stride) * num_lines * 2;
 2971|  15.6k|        uint8_t *ptr = f->lf.lr_line_buf = dav1d_alloc_aligned(ALLOC_LR, alloc_sz, 64);
  ------------------
  |  |  134|  15.6k|#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
  ------------------
 2972|  15.6k|        if (!ptr) {
  ------------------
  |  Branch (2972:13): [True: 0, False: 15.6k]
  ------------------
 2973|      0|            f->lf.lr_buf_plane_sz[0] = f->lf.lr_buf_plane_sz[1] = 0;
 2974|      0|            goto error;
 2975|      0|        }
 2976|       |
 2977|  15.6k|        ptr += 64;
 2978|  15.6k|        if (y_stride < 0)
  ------------------
  |  Branch (2978:13): [True: 0, False: 15.6k]
  ------------------
 2979|      0|            f->lf.lr_lpf_line[0] = ptr - y_stride * (num_lines - 1);
 2980|  15.6k|        else
 2981|  15.6k|            f->lf.lr_lpf_line[0] = ptr;
 2982|  15.6k|        ptr += llabs(y_stride) * num_lines;
 2983|  15.6k|        if (uv_stride < 0) {
  ------------------
  |  Branch (2983:13): [True: 0, False: 15.6k]
  ------------------
 2984|      0|            f->lf.lr_lpf_line[1] = ptr - uv_stride * (num_lines * 1 - 1);
 2985|      0|            f->lf.lr_lpf_line[2] = ptr - uv_stride * (num_lines * 2 - 1);
 2986|  15.6k|        } else {
 2987|  15.6k|            f->lf.lr_lpf_line[1] = ptr;
 2988|  15.6k|            f->lf.lr_lpf_line[2] = ptr + uv_stride * num_lines;
 2989|  15.6k|        }
 2990|       |
 2991|  15.6k|        f->lf.lr_buf_plane_sz[0] = (int) y_stride * num_lines;
 2992|  15.6k|        f->lf.lr_buf_plane_sz[1] = (int) uv_stride * num_lines * 2;
 2993|  15.6k|    }
 2994|       |
 2995|       |    // update allocation for loopfilter masks
 2996|  21.4k|    if (num_sb128 != f->lf.mask_sz) {
  ------------------
  |  Branch (2996:9): [True: 15.4k, False: 6.01k]
  ------------------
 2997|  15.4k|        dav1d_free(f->lf.mask);
  ------------------
  |  |  135|  15.4k|#define dav1d_free(ptr) free(ptr)
  ------------------
 2998|  15.4k|        dav1d_free(f->lf.level);
  ------------------
  |  |  135|  15.4k|#define dav1d_free(ptr) free(ptr)
  ------------------
 2999|  15.4k|        f->lf.mask = dav1d_malloc(ALLOC_LF, sizeof(*f->lf.mask) * num_sb128);
  ------------------
  |  |  132|  15.4k|#define dav1d_malloc(type, sz) malloc(sz)
  ------------------
 3000|       |        // over-allocate by 3 bytes since some of the SIMD implementations
 3001|       |        // index this from the level type and can thus over-read by up to 3
 3002|  15.4k|        f->lf.level = dav1d_malloc(ALLOC_LF, sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3);
  ------------------
  |  |  132|  15.4k|#define dav1d_malloc(type, sz) malloc(sz)
  ------------------
 3003|  15.4k|        if (!f->lf.mask || !f->lf.level) {
  ------------------
  |  Branch (3003:13): [True: 0, False: 15.4k]
  |  Branch (3003:28): [True: 0, False: 15.4k]
  ------------------
 3004|      0|            f->lf.mask_sz = 0;
 3005|      0|            goto error;
 3006|      0|        }
 3007|  15.4k|        if (c->n_fc > 1) {
  ------------------
  |  Branch (3007:13): [True: 0, False: 15.4k]
  ------------------
 3008|      0|            dav1d_free(f->frame_thread.b);
  ------------------
  |  |  135|      0|#define dav1d_free(ptr) free(ptr)
  ------------------
 3009|      0|            f->frame_thread.b = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.b) *
  ------------------
  |  |  132|      0|#define dav1d_malloc(type, sz) malloc(sz)
  ------------------
 3010|      0|                                             num_sb128 * 32 * 32);
 3011|      0|            if (!f->frame_thread.b) {
  ------------------
  |  Branch (3011:17): [True: 0, False: 0]
  ------------------
 3012|      0|                f->lf.mask_sz = 0;
 3013|      0|                goto error;
 3014|      0|            }
 3015|      0|        }
 3016|  15.4k|        f->lf.mask_sz = num_sb128;
 3017|  15.4k|    }
 3018|       |
 3019|  21.4k|    f->sr_sb128w = (f->sr_cur.p.p.w + 127) >> 7;
 3020|  21.4k|    const int lr_mask_sz = f->sr_sb128w * f->sb128h;
 3021|  21.4k|    if (lr_mask_sz != f->lf.lr_mask_sz) {
  ------------------
  |  Branch (3021:9): [True: 15.4k, False: 6.01k]
  ------------------
 3022|  15.4k|        dav1d_free(f->lf.lr_mask);
  ------------------
  |  |  135|  15.4k|#define dav1d_free(ptr) free(ptr)
  ------------------
 3023|  15.4k|        f->lf.lr_mask = dav1d_malloc(ALLOC_LR, sizeof(*f->lf.lr_mask) * lr_mask_sz);
  ------------------
  |  |  132|  15.4k|#define dav1d_malloc(type, sz) malloc(sz)
  ------------------
 3024|  15.4k|        if (!f->lf.lr_mask) {
  ------------------
  |  Branch (3024:13): [True: 0, False: 15.4k]
  ------------------
 3025|      0|            f->lf.lr_mask_sz = 0;
 3026|      0|            goto error;
 3027|      0|        }
 3028|  15.4k|        f->lf.lr_mask_sz = lr_mask_sz;
 3029|  15.4k|    }
 3030|  21.4k|    f->lf.restore_planes =
 3031|  21.4k|        ((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
 3032|  21.4k|        ((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
 3033|  21.4k|        ((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
 3034|  21.4k|    if (f->frame_hdr->loopfilter.sharpness != f->lf.last_sharpness) {
  ------------------
  |  Branch (3034:9): [True: 17.4k, False: 4.05k]
  ------------------
 3035|  17.4k|        dav1d_calc_eih(&f->lf.lim_lut, f->frame_hdr->loopfilter.sharpness);
 3036|  17.4k|        f->lf.last_sharpness = f->frame_hdr->loopfilter.sharpness;
 3037|  17.4k|    }
 3038|  21.4k|    dav1d_calc_lf_values(f->lf.lvl, f->frame_hdr, (int8_t[4]) { 0, 0, 0, 0 });
 3039|  21.4k|    memset(f->lf.mask, 0, sizeof(*f->lf.mask) * num_sb128);
 3040|       |
 3041|  21.4k|    const int ipred_edge_sz = f->sbh * f->sb128w << hbd;
 3042|  21.4k|    if (ipred_edge_sz != f->ipred_edge_sz) {
  ------------------
  |  Branch (3042:9): [True: 15.5k, False: 5.97k]
  ------------------
 3043|  15.5k|        dav1d_free_aligned(f->ipred_edge[0]);
  ------------------
  |  |  136|  15.5k|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
 3044|  15.5k|        uint8_t *ptr = f->ipred_edge[0] =
 3045|  15.5k|            dav1d_alloc_aligned(ALLOC_IPRED, ipred_edge_sz * 128 * 3, 64);
  ------------------
  |  |  134|  15.5k|#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
  ------------------
 3046|  15.5k|        if (!ptr) {
  ------------------
  |  Branch (3046:13): [True: 0, False: 15.5k]
  ------------------
 3047|      0|            f->ipred_edge_sz = 0;
 3048|      0|            goto error;
 3049|      0|        }
 3050|  15.5k|        f->ipred_edge[1] = ptr + ipred_edge_sz * 128 * 1;
 3051|  15.5k|        f->ipred_edge[2] = ptr + ipred_edge_sz * 128 * 2;
 3052|  15.5k|        f->ipred_edge_sz = ipred_edge_sz;
 3053|  15.5k|    }
 3054|       |
 3055|  21.4k|    const int re_sz = f->sb128h * f->frame_hdr->tiling.cols;
 3056|  21.4k|    if (re_sz != f->lf.re_sz) {
  ------------------
  |  Branch (3056:9): [True: 15.4k, False: 6.01k]
  ------------------
 3057|  15.4k|        dav1d_free(f->lf.tx_lpf_right_edge[0]);
  ------------------
  |  |  135|  15.4k|#define dav1d_free(ptr) free(ptr)
  ------------------
 3058|  15.4k|        f->lf.tx_lpf_right_edge[0] = dav1d_malloc(ALLOC_LF, re_sz * 32 * 2);
  ------------------
  |  |  132|  15.4k|#define dav1d_malloc(type, sz) malloc(sz)
  ------------------
 3059|  15.4k|        if (!f->lf.tx_lpf_right_edge[0]) {
  ------------------
  |  Branch (3059:13): [True: 0, False: 15.4k]
  ------------------
 3060|      0|            f->lf.re_sz = 0;
 3061|      0|            goto error;
 3062|      0|        }
 3063|  15.4k|        f->lf.tx_lpf_right_edge[1] = f->lf.tx_lpf_right_edge[0] + re_sz * 32;
 3064|  15.4k|        f->lf.re_sz = re_sz;
 3065|  15.4k|    }
 3066|       |
 3067|       |    // init ref mvs
 3068|  21.4k|    if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
  ------------------
  |  |   36|  42.9k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 5.04k, False: 16.4k]
  |  |  ------------------
  ------------------
  |  Branch (3068:45): [True: 3.33k, False: 13.1k]
  ------------------
 3069|  8.37k|        const int ret =
 3070|  8.37k|            dav1d_refmvs_init_frame(&f->rf, f->seq_hdr, f->frame_hdr,
 3071|  8.37k|                                    f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs,
 3072|  8.37k|                                    f->c->n_tc, f->c->n_fc);
 3073|  8.37k|        if (ret < 0) goto error;
  ------------------
  |  Branch (3073:13): [True: 0, False: 8.37k]
  ------------------
 3074|  8.37k|    }
 3075|       |
 3076|       |    // setup dequant tables
 3077|  21.4k|    init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq);
 3078|  21.4k|    if (f->frame_hdr->quant.qm)
  ------------------
  |  Branch (3078:9): [True: 5.78k, False: 15.7k]
  ------------------
 3079|   115k|        for (int i = 0; i < N_RECT_TX_SIZES; i++) {
  ------------------
  |  Branch (3079:25): [True: 109k, False: 5.78k]
  ------------------
 3080|   109k|            f->qm[i][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][i];
 3081|   109k|            f->qm[i][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][i];
 3082|   109k|            f->qm[i][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][i];
 3083|   109k|        }
 3084|  15.7k|    else
 3085|  15.7k|        memset(f->qm, 0, sizeof(f->qm));
 3086|       |
 3087|       |    // setup jnt_comp weights
 3088|  21.4k|    if (f->frame_hdr->switchable_comp_refs) {
  ------------------
  |  Branch (3088:9): [True: 1.81k, False: 19.6k]
  ------------------
 3089|  14.5k|        for (int i = 0; i < 7; i++) {
  ------------------
  |  Branch (3089:25): [True: 12.7k, False: 1.81k]
  ------------------
 3090|  12.7k|            const unsigned ref0poc = f->refp[i].p.frame_hdr->frame_offset;
 3091|       |
 3092|  50.8k|            for (int j = i + 1; j < 7; j++) {
  ------------------
  |  Branch (3092:33): [True: 38.1k, False: 12.7k]
  ------------------
 3093|  38.1k|                const unsigned ref1poc = f->refp[j].p.frame_hdr->frame_offset;
 3094|       |
 3095|  38.1k|                const unsigned d1 =
 3096|  38.1k|                    imin(abs(get_poc_diff(f->seq_hdr->order_hint_n_bits, ref0poc,
 3097|  38.1k|                                          f->cur.frame_hdr->frame_offset)), 31);
 3098|  38.1k|                const unsigned d0 =
 3099|  38.1k|                    imin(abs(get_poc_diff(f->seq_hdr->order_hint_n_bits, ref1poc,
 3100|  38.1k|                                          f->cur.frame_hdr->frame_offset)), 31);
 3101|  38.1k|                const int order = d0 <= d1;
 3102|       |
 3103|  38.1k|                static const uint8_t quant_dist_weight[3][2] = {
 3104|  38.1k|                    { 2, 3 }, { 2, 5 }, { 2, 7 }
 3105|  38.1k|                };
 3106|  38.1k|                static const uint8_t quant_dist_lookup_table[4][2] = {
 3107|  38.1k|                    { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 }
 3108|  38.1k|                };
 3109|       |
 3110|  38.1k|                int k;
 3111|  59.9k|                for (k = 0; k < 3; k++) {
  ------------------
  |  Branch (3111:29): [True: 53.3k, False: 6.57k]
  ------------------
 3112|  53.3k|                    const int c0 = quant_dist_weight[k][order];
 3113|  53.3k|                    const int c1 = quant_dist_weight[k][!order];
 3114|  53.3k|                    const int d0_c0 = d0 * c0;
 3115|  53.3k|                    const int d1_c1 = d1 * c1;
 3116|  53.3k|                    if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break;
  ------------------
  |  Branch (3116:26): [True: 4.13k, False: 49.2k]
  |  Branch (3116:37): [True: 1.15k, False: 2.98k]
  |  Branch (3116:56): [True: 49.2k, False: 2.98k]
  |  Branch (3116:68): [True: 30.4k, False: 18.7k]
  ------------------
 3117|  53.3k|                }
 3118|       |
 3119|  38.1k|                f->jnt_weights[i][j] = quant_dist_lookup_table[k][order];
 3120|  38.1k|            }
 3121|  12.7k|        }
 3122|  1.81k|    }
 3123|       |
 3124|       |    /* Init loopfilter pointers. Increasing NULL pointers is technically UB,
 3125|       |     * so just point the chroma pointers in 4:0:0 to the luma plane here to
 3126|       |     * avoid having additional in-loop branches in various places. We never
 3127|       |     * dereference those pointers so it doesn't really matter what they
 3128|       |     * point at, as long as the pointers are valid. */
 3129|  21.4k|    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
 3130|  21.4k|    f->lf.p[0] = f->cur.data[0];
 3131|  21.4k|    f->lf.p[1] = f->cur.data[has_chroma ? 1 : 0];
  ------------------
  |  Branch (3131:30): [True: 17.0k, False: 4.40k]
  ------------------
 3132|  21.4k|    f->lf.p[2] = f->cur.data[has_chroma ? 2 : 0];
  ------------------
  |  Branch (3132:30): [True: 17.0k, False: 4.40k]
  ------------------
 3133|  21.4k|    f->lf.sr_p[0] = f->sr_cur.p.data[0];
 3134|  21.4k|    f->lf.sr_p[1] = f->sr_cur.p.data[has_chroma ? 1 : 0];
  ------------------
  |  Branch (3134:38): [True: 17.0k, False: 4.40k]
  ------------------
 3135|  21.4k|    f->lf.sr_p[2] = f->sr_cur.p.data[has_chroma ? 2 : 0];
  ------------------
  |  Branch (3135:38): [True: 17.0k, False: 4.40k]
  ------------------
 3136|       |
 3137|  21.4k|    retval = 0;
 3138|  21.4k|error:
 3139|  21.4k|    return retval;
 3140|  21.4k|}
dav1d_decode_frame_init_cdf:
 3142|  21.4k|int dav1d_decode_frame_init_cdf(Dav1dFrameContext *const f) {
 3143|  21.4k|    const Dav1dContext *const c = f->c;
 3144|  21.4k|    int retval = DAV1D_ERR(EINVAL);
  ------------------
  |  |   56|  21.4k|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
 3145|       |
 3146|  21.4k|    if (f->frame_hdr->refresh_context)
  ------------------
  |  Branch (3146:9): [True: 8.68k, False: 12.8k]
  ------------------
 3147|  8.68k|        dav1d_cdf_thread_copy(f->out_cdf.data.cdf, &f->in_cdf);
 3148|       |
 3149|       |    // parse individual tiles per tile group
 3150|  21.4k|    int tile_row = 0, tile_col = 0;
 3151|  21.4k|    f->task_thread.update_set = 0;
 3152|  42.8k|    for (int i = 0; i < f->n_tile_data; i++) {
  ------------------
  |  Branch (3152:21): [True: 21.5k, False: 21.3k]
  ------------------
 3153|  21.5k|        const uint8_t *data = f->tile[i].data.data;
 3154|  21.5k|        size_t size = f->tile[i].data.sz;
 3155|       |
 3156|  53.7k|        for (int j = f->tile[i].start; j <= f->tile[i].end; j++) {
  ------------------
  |  Branch (3156:40): [True: 32.3k, False: 21.3k]
  ------------------
 3157|  32.3k|            size_t tile_sz;
 3158|  32.3k|            if (j == f->tile[i].end) {
  ------------------
  |  Branch (3158:17): [True: 21.3k, False: 11.0k]
  ------------------
 3159|  21.3k|                tile_sz = size;
 3160|  21.3k|            } else {
 3161|  11.0k|                if (f->frame_hdr->tiling.n_bytes > size) goto error;
  ------------------
  |  Branch (3161:21): [True: 9, False: 11.0k]
  ------------------
 3162|  11.0k|                tile_sz = 0;
 3163|  23.2k|                for (unsigned k = 0; k < f->frame_hdr->tiling.n_bytes; k++)
  ------------------
  |  Branch (3163:38): [True: 12.2k, False: 11.0k]
  ------------------
 3164|  12.2k|                    tile_sz |= (unsigned)*data++ << (k * 8);
 3165|  11.0k|                tile_sz++;
 3166|  11.0k|                size -= f->frame_hdr->tiling.n_bytes;
 3167|  11.0k|                if (tile_sz > size) goto error;
  ------------------
  |  Branch (3167:21): [True: 144, False: 10.8k]
  ------------------
 3168|  11.0k|            }
 3169|       |
 3170|  32.2k|            setup_tile(&f->ts[j], f, data, tile_sz, tile_row, tile_col++,
 3171|  32.2k|                       c->n_fc > 1 ? f->frame_thread.tile_start_off[j] : 0);
  ------------------
  |  Branch (3171:24): [True: 0, False: 32.2k]
  ------------------
 3172|       |
 3173|  32.2k|            if (tile_col == f->frame_hdr->tiling.cols) {
  ------------------
  |  Branch (3173:17): [True: 27.3k, False: 4.90k]
  ------------------
 3174|  27.3k|                tile_col = 0;
 3175|  27.3k|                tile_row++;
 3176|  27.3k|            }
 3177|  32.2k|            if (j == f->frame_hdr->tiling.update && f->frame_hdr->refresh_context)
  ------------------
  |  Branch (3177:17): [True: 21.3k, False: 10.8k]
  |  Branch (3177:53): [True: 8.66k, False: 12.6k]
  ------------------
 3178|  8.66k|                f->task_thread.update_set = 1;
 3179|  32.2k|            data += tile_sz;
 3180|  32.2k|            size -= tile_sz;
 3181|  32.2k|        }
 3182|  21.5k|    }
 3183|       |
 3184|  21.3k|    if (c->n_tc > 1) {
  ------------------
  |  Branch (3184:9): [True: 20.0k, False: 1.31k]
  ------------------
 3185|  20.0k|        const int uses_2pass = c->n_fc > 1;
 3186|   161k|        for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows * (1 + uses_2pass); n++)
  ------------------
  |  Branch (3186:25): [True: 141k, False: 20.0k]
  ------------------
 3187|   141k|            reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr),
  ------------------
  |  |   43|   141k|    (!IS_INTER_OR_SWITCH(frame_header))
  |  |  ------------------
  |  |  |  |   36|   141k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  ------------------
 3188|   141k|                          uses_2pass ? 1 + (n >= f->sb128w * f->frame_hdr->tiling.rows) : 0);
  ------------------
  |  Branch (3188:27): [True: 0, False: 141k]
  ------------------
 3189|  20.0k|    }
 3190|       |
 3191|  21.3k|    retval = 0;
 3192|  21.4k|error:
 3193|  21.4k|    return retval;
 3194|  21.3k|}
dav1d_decode_frame_main:
 3196|  1.31k|int dav1d_decode_frame_main(Dav1dFrameContext *const f) {
 3197|  1.31k|    const Dav1dContext *const c = f->c;
 3198|  1.31k|    int retval = DAV1D_ERR(EINVAL);
  ------------------
  |  |   56|  1.31k|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
 3199|       |
 3200|  1.31k|    assert(f->c->n_tc == 1);
  ------------------
  |  |  140|  1.31k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1.31k]
  |  |  |  Branch (140:68): [Folded, False: 1.31k]
  |  |  ------------------
  ------------------
 3201|       |
 3202|  1.31k|    Dav1dTaskContext *const t = &c->tc[f - c->fc];
 3203|  1.31k|    t->f = f;
 3204|  1.31k|    t->frame_thread.pass = 0;
 3205|       |
 3206|  4.72k|    for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++)
  ------------------
  |  Branch (3206:21): [True: 3.41k, False: 1.31k]
  ------------------
 3207|  3.41k|        reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), 0);
  ------------------
  |  |   43|  3.41k|    (!IS_INTER_OR_SWITCH(frame_header))
  |  |  ------------------
  |  |  |  |   36|  3.41k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  ------------------
 3208|       |
 3209|       |    // no threading - we explicitly interleave tile/sbrow decoding
 3210|       |    // and post-filtering, so that the full process runs in-line
 3211|  2.38k|    for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
  ------------------
  |  Branch (3211:28): [True: 1.48k, False: 903]
  ------------------
 3212|  1.48k|        const int sbh_end =
 3213|  1.48k|            imin(f->frame_hdr->tiling.row_start_sb[tile_row + 1], f->sbh);
 3214|  1.48k|        for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
 3215|  8.04k|             sby < sbh_end; sby++)
  ------------------
  |  Branch (3215:14): [True: 6.97k, False: 1.07k]
  ------------------
 3216|  6.97k|        {
 3217|  6.97k|            t->by = sby << (4 + f->seq_hdr->sb128);
 3218|  6.97k|            const int by_end = (t->by + f->sb_step) >> 1;
 3219|  6.97k|            if (f->frame_hdr->use_ref_frame_mvs) {
  ------------------
  |  Branch (3219:17): [True: 491, False: 6.48k]
  ------------------
 3220|    491|                f->c->refmvs_dsp.load_tmvs(&f->rf, tile_row,
 3221|    491|                                           0, f->bw >> 1, t->by >> 1, by_end);
 3222|    491|            }
 3223|  13.6k|            for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
  ------------------
  |  Branch (3223:36): [True: 7.10k, False: 6.56k]
  ------------------
 3224|  7.10k|                t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
 3225|  7.10k|                if (dav1d_decode_tile_sbrow(t)) goto error;
  ------------------
  |  Branch (3225:21): [True: 410, False: 6.69k]
  ------------------
 3226|  7.10k|            }
 3227|  6.56k|            if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
  ------------------
  |  |   36|  6.56k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 501, False: 6.06k]
  |  |  ------------------
  ------------------
 3228|    501|                dav1d_refmvs_save_tmvs(&f->c->refmvs_dsp, &t->rt,
 3229|    501|                                       0, f->bw >> 1, t->by >> 1, by_end);
 3230|    501|            }
 3231|       |
 3232|       |            // loopfilter + cdef + restoration
 3233|  6.56k|            f->bd_fn.filter_sbrow(f, sby);
 3234|  6.56k|        }
 3235|  1.48k|    }
 3236|       |
 3237|    903|    retval = 0;
 3238|  1.31k|error:
 3239|  1.31k|    return retval;
 3240|    903|}
dav1d_decode_frame_exit:
 3242|  41.5k|void dav1d_decode_frame_exit(Dav1dFrameContext *const f, int retval) {
 3243|  41.5k|    const Dav1dContext *const c = f->c;
 3244|       |
 3245|  41.5k|    if (f->sr_cur.p.data[0])
  ------------------
  |  Branch (3245:9): [True: 21.4k, False: 20.0k]
  ------------------
 3246|  41.5k|        atomic_init(&f->task_thread.error, 0);
 3247|       |
 3248|  41.5k|    if (c->n_fc > 1 && retval && f->frame_thread.cf) {
  ------------------
  |  Branch (3248:9): [True: 0, False: 41.5k]
  |  Branch (3248:24): [True: 0, False: 0]
  |  Branch (3248:34): [True: 0, False: 0]
  ------------------
 3249|      0|        memset(f->frame_thread.cf, 0,
 3250|      0|               (size_t)f->frame_thread.cf_sz * 128 * 128 / 2);
 3251|      0|    }
 3252|   332k|    for (int i = 0; i < 7; i++) {
  ------------------
  |  Branch (3252:21): [True: 290k, False: 41.5k]
  ------------------
 3253|   290k|        if (f->refp[i].p.frame_hdr) {
  ------------------
  |  Branch (3253:13): [True: 35.2k, False: 255k]
  ------------------
 3254|  35.2k|            if (!retval && c->n_fc > 1 && c->strict_std_compliance &&
  ------------------
  |  Branch (3254:17): [True: 19.6k, False: 15.6k]
  |  Branch (3254:28): [True: 0, False: 19.6k]
  |  Branch (3254:43): [True: 0, False: 0]
  ------------------
 3255|  35.2k|                atomic_load(&f->refp[i].progress[1]) == FRAME_ERROR)
  ------------------
  |  |   35|      0|#define FRAME_ERROR (UINT_MAX - 1)
  ------------------
  |  Branch (3255:17): [True: 0, False: 0]
  ------------------
 3256|      0|            {
 3257|      0|                retval = DAV1D_ERR(EINVAL);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
 3258|      0|                atomic_store(&f->task_thread.error, 1);
 3259|      0|                atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
 3260|      0|            }
 3261|  35.2k|            dav1d_thread_picture_unref(&f->refp[i]);
 3262|  35.2k|        }
 3263|   290k|        dav1d_ref_dec(&f->ref_mvs_ref[i]);
 3264|   290k|    }
 3265|       |
 3266|  41.5k|    dav1d_picture_unref_internal(&f->cur);
 3267|  41.5k|    dav1d_thread_picture_unref(&f->sr_cur);
 3268|  41.5k|    dav1d_cdf_thread_unref(&f->in_cdf);
 3269|  41.5k|    if (f->frame_hdr && f->frame_hdr->refresh_context) {
  ------------------
  |  Branch (3269:9): [True: 41.5k, False: 0]
  |  Branch (3269:25): [True: 16.7k, False: 24.7k]
  ------------------
 3270|  16.7k|        if (f->out_cdf.progress)
  ------------------
  |  Branch (3270:13): [True: 0, False: 16.7k]
  ------------------
 3271|  16.7k|            atomic_store(f->out_cdf.progress, retval == 0 ? 1 : TILE_ERROR);
  ------------------
  |  Branch (3271:13): [True: 0, False: 0]
  ------------------
 3272|  16.7k|        dav1d_cdf_thread_unref(&f->out_cdf);
 3273|  16.7k|    }
 3274|  41.5k|    dav1d_ref_dec(&f->cur_segmap_ref);
 3275|  41.5k|    dav1d_ref_dec(&f->prev_segmap_ref);
 3276|  41.5k|    dav1d_ref_dec(&f->mvs_ref);
 3277|  41.5k|    dav1d_ref_dec(&f->seq_hdr_ref);
 3278|  41.5k|    dav1d_ref_dec(&f->frame_hdr_ref);
 3279|       |
 3280|  63.0k|    for (int i = 0; i < f->n_tile_data; i++)
  ------------------
  |  Branch (3280:21): [True: 21.5k, False: 41.5k]
  ------------------
 3281|  21.5k|        dav1d_data_unref_internal(&f->tile[i].data);
 3282|  41.5k|    f->task_thread.retval = retval;
 3283|  41.5k|}
dav1d_decode_frame:
 3285|  21.4k|int dav1d_decode_frame(Dav1dFrameContext *const f) {
 3286|  21.4k|    assert(f->c->n_fc == 1);
  ------------------
  |  |  140|  21.4k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 21.4k]
  |  |  |  Branch (140:68): [Folded, False: 21.4k]
  |  |  ------------------
  ------------------
 3287|       |    // if n_tc > 1 (but n_fc == 1), we could run init/exit in the task
 3288|       |    // threads also. Not sure it makes a measurable difference.
 3289|  21.4k|    int res = dav1d_decode_frame_init(f);
 3290|  21.4k|    if (!res) res = dav1d_decode_frame_init_cdf(f);
  ------------------
  |  Branch (3290:9): [True: 21.4k, False: 0]
  ------------------
 3291|       |    // wait until all threads have completed
 3292|  21.4k|    if (!res) {
  ------------------
  |  Branch (3292:9): [True: 21.3k, False: 153]
  ------------------
 3293|  21.3k|        if (f->c->n_tc > 1) {
  ------------------
  |  Branch (3293:13): [True: 20.0k, False: 1.31k]
  ------------------
 3294|  20.0k|            res = dav1d_task_create_tile_sbrow(f, 0, 1);
 3295|  20.0k|            pthread_mutex_lock(&f->task_thread.ttd->lock);
 3296|  20.0k|            pthread_cond_signal(&f->task_thread.ttd->cond);
 3297|  20.0k|            if (!res) {
  ------------------
  |  Branch (3297:17): [True: 20.0k, False: 0]
  ------------------
 3298|  40.0k|                while (!f->task_thread.done[0] ||
  ------------------
  |  Branch (3298:24): [True: 20.0k, False: 20.0k]
  ------------------
 3299|  40.0k|                       atomic_load(&f->task_thread.task_counter) > 0)
  ------------------
  |  Branch (3299:24): [True: 3, False: 20.0k]
  ------------------
 3300|  20.0k|                {
 3301|  20.0k|                    pthread_cond_wait(&f->task_thread.cond,
 3302|  20.0k|                                      &f->task_thread.ttd->lock);
 3303|  20.0k|                }
 3304|  20.0k|            }
 3305|  20.0k|            pthread_mutex_unlock(&f->task_thread.ttd->lock);
 3306|  20.0k|            res = f->task_thread.retval;
 3307|  20.0k|        } else {
 3308|  1.31k|            res = dav1d_decode_frame_main(f);
 3309|  1.31k|            if (!res && f->frame_hdr->refresh_context && f->task_thread.update_set) {
  ------------------
  |  Branch (3309:17): [True: 903, False: 410]
  |  Branch (3309:25): [True: 548, False: 355]
  |  Branch (3309:58): [True: 548, False: 0]
  ------------------
 3310|    548|                dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf,
 3311|    548|                                        &f->ts[f->frame_hdr->tiling.update].cdf);
 3312|    548|            }
 3313|  1.31k|        }
 3314|  21.3k|    }
 3315|  21.4k|    dav1d_decode_frame_exit(f, res);
 3316|  21.4k|    res = f->task_thread.retval;
 3317|  21.4k|    f->n_tile_data = 0;
 3318|  21.4k|    return res;
 3319|  21.4k|}
dav1d_submit_frame:
 3327|  21.5k|int dav1d_submit_frame(Dav1dContext *const c) {
 3328|  21.5k|    Dav1dFrameContext *f;
 3329|  21.5k|    int res = -1;
 3330|       |
 3331|       |    // wait for c->out_delayed[next] and move into c->out if visible
 3332|  21.5k|    Dav1dThreadPicture *out_delayed;
 3333|  21.5k|    if (c->n_fc > 1) {
  ------------------
  |  Branch (3333:9): [True: 0, False: 21.5k]
  ------------------
 3334|      0|        pthread_mutex_lock(&c->task_thread.lock);
 3335|      0|        const unsigned next = c->frame_thread.next++;
 3336|      0|        if (c->frame_thread.next == c->n_fc)
  ------------------
  |  Branch (3336:13): [True: 0, False: 0]
  ------------------
 3337|      0|            c->frame_thread.next = 0;
 3338|       |
 3339|      0|        f = &c->fc[next];
 3340|      0|        while (f->n_tile_data > 0)
  ------------------
  |  Branch (3340:16): [True: 0, False: 0]
  ------------------
 3341|      0|            pthread_cond_wait(&f->task_thread.cond,
 3342|      0|                              &c->task_thread.lock);
 3343|      0|        out_delayed = &c->frame_thread.out_delayed[next];
 3344|      0|        if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
  ------------------
  |  Branch (3344:13): [True: 0, False: 0]
  |  Branch (3344:39): [True: 0, False: 0]
  ------------------
 3345|      0|            unsigned first = atomic_load(&c->task_thread.first);
 3346|      0|            if (first + 1U < c->n_fc)
  ------------------
  |  Branch (3346:17): [True: 0, False: 0]
  ------------------
 3347|      0|                atomic_fetch_add(&c->task_thread.first, 1U);
 3348|      0|            else
 3349|      0|                atomic_store(&c->task_thread.first, 0);
 3350|      0|            atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
 3351|      0|                                           &first, UINT_MAX);
 3352|      0|            if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
  ------------------
  |  Branch (3352:17): [True: 0, False: 0]
  |  Branch (3352:39): [True: 0, False: 0]
  ------------------
 3353|      0|                c->task_thread.cur--;
 3354|      0|        }
 3355|      0|        const int error = f->task_thread.retval;
 3356|      0|        if (error) {
  ------------------
  |  Branch (3356:13): [True: 0, False: 0]
  ------------------
 3357|      0|            f->task_thread.retval = 0;
 3358|      0|            c->cached_error = error;
 3359|      0|            dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m);
 3360|      0|            dav1d_thread_picture_unref(out_delayed);
 3361|      0|        } else if (out_delayed->p.data[0]) {
  ------------------
  |  Branch (3361:20): [True: 0, False: 0]
  ------------------
 3362|      0|            const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
 3363|      0|                                                           memory_order_relaxed);
 3364|      0|            if ((out_delayed->visible || c->output_invisible_frames) &&
  ------------------
  |  Branch (3364:18): [True: 0, False: 0]
  |  Branch (3364:42): [True: 0, False: 0]
  ------------------
 3365|      0|                progress != FRAME_ERROR)
  ------------------
  |  |   35|      0|#define FRAME_ERROR (UINT_MAX - 1)
  ------------------
  |  Branch (3365:17): [True: 0, False: 0]
  ------------------
 3366|      0|            {
 3367|      0|                dav1d_thread_picture_ref(&c->out, out_delayed);
 3368|      0|                c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
 3369|      0|            }
 3370|      0|            dav1d_thread_picture_unref(out_delayed);
 3371|      0|        }
 3372|  21.5k|    } else {
 3373|  21.5k|        f = c->fc;
 3374|  21.5k|    }
 3375|       |
 3376|  21.5k|    f->seq_hdr = c->seq_hdr;
 3377|  21.5k|    f->seq_hdr_ref = c->seq_hdr_ref;
 3378|  21.5k|    dav1d_ref_inc(f->seq_hdr_ref);
 3379|  21.5k|    f->frame_hdr = c->frame_hdr;
 3380|  21.5k|    f->frame_hdr_ref = c->frame_hdr_ref;
 3381|  21.5k|    c->frame_hdr = NULL;
 3382|  21.5k|    c->frame_hdr_ref = NULL;
 3383|  21.5k|    f->dsp = &c->dsp[f->seq_hdr->hbd];
 3384|       |
 3385|  21.5k|    const int bpc = 8 + 2 * f->seq_hdr->hbd;
 3386|       |
 3387|  21.5k|    if (!f->dsp->ipred.intra_pred[DC_PRED]) {
  ------------------
  |  Branch (3387:9): [True: 15.4k, False: 6.05k]
  ------------------
 3388|  15.4k|        Dav1dDSPContext *const dsp = &c->dsp[f->seq_hdr->hbd];
 3389|       |
 3390|  15.4k|        switch (bpc) {
 3391|      0|#define assign_bitdepth_case(bd) \
 3392|      0|            dav1d_cdef_dsp_init_##bd##bpc(&dsp->cdef); \
 3393|      0|            dav1d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \
 3394|      0|            dav1d_itx_dsp_init_##bd##bpc(&dsp->itx, bpc); \
 3395|      0|            dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \
 3396|      0|            dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr, bpc); \
 3397|      0|            dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \
 3398|      0|            dav1d_film_grain_dsp_init_##bd##bpc(&dsp->fg); \
 3399|      0|            break
 3400|      0|#if CONFIG_8BPC
 3401|  7.82k|        case 8:
  ------------------
  |  Branch (3401:9): [True: 7.82k, False: 7.63k]
  ------------------
 3402|  7.82k|            assign_bitdepth_case(8);
  ------------------
  |  | 3392|  7.82k|            dav1d_cdef_dsp_init_##bd##bpc(&dsp->cdef); \
  |  | 3393|  7.82k|            dav1d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \
  |  | 3394|  7.82k|            dav1d_itx_dsp_init_##bd##bpc(&dsp->itx, bpc); \
  |  | 3395|  7.82k|            dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \
  |  | 3396|  7.82k|            dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr, bpc); \
  |  | 3397|  7.82k|            dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \
  |  | 3398|  7.82k|            dav1d_film_grain_dsp_init_##bd##bpc(&dsp->fg); \
  |  | 3399|  7.82k|            break
  ------------------
 3403|      0|#endif
 3404|      0|#if CONFIG_16BPC
 3405|  4.68k|        case 10:
  ------------------
  |  Branch (3405:9): [True: 4.68k, False: 10.7k]
  ------------------
 3406|  7.63k|        case 12:
  ------------------
  |  Branch (3406:9): [True: 2.94k, False: 12.5k]
  ------------------
 3407|  7.63k|            assign_bitdepth_case(16);
  ------------------
  |  | 3392|  7.63k|            dav1d_cdef_dsp_init_##bd##bpc(&dsp->cdef); \
  |  | 3393|  7.63k|            dav1d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \
  |  | 3394|  7.63k|            dav1d_itx_dsp_init_##bd##bpc(&dsp->itx, bpc); \
  |  | 3395|  7.63k|            dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \
  |  | 3396|  7.63k|            dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr, bpc); \
  |  | 3397|  7.63k|            dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \
  |  | 3398|  7.63k|            dav1d_film_grain_dsp_init_##bd##bpc(&dsp->fg); \
  |  | 3399|  7.63k|            break
  ------------------
 3408|      0|#endif
 3409|      0|#undef assign_bitdepth_case
 3410|      0|        default:
  ------------------
  |  Branch (3410:9): [True: 0, False: 15.4k]
  ------------------
 3411|      0|            dav1d_log(c, "Compiled without support for %d-bit decoding\n",
  ------------------
  |  |   39|      0|#define dav1d_log dav1d_log
  ------------------
 3412|      0|                    8 + 2 * f->seq_hdr->hbd);
 3413|      0|            res = DAV1D_ERR(ENOPROTOOPT);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
 3414|      0|            goto error;
 3415|  15.4k|        }
 3416|  15.4k|    }
 3417|       |
 3418|  21.5k|#define assign_bitdepth_case(bd) \
 3419|  21.5k|        f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \
 3420|  21.5k|        f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \
 3421|  21.5k|        f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \
 3422|  21.5k|        f->bd_fn.filter_sbrow_deblock_cols = dav1d_filter_sbrow_deblock_cols_##bd##bpc; \
 3423|  21.5k|        f->bd_fn.filter_sbrow_deblock_rows = dav1d_filter_sbrow_deblock_rows_##bd##bpc; \
 3424|  21.5k|        f->bd_fn.filter_sbrow_cdef = dav1d_filter_sbrow_cdef_##bd##bpc; \
 3425|  21.5k|        f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \
 3426|  21.5k|        f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \
 3427|  21.5k|        f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \
 3428|  21.5k|        f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc; \
 3429|  21.5k|        f->bd_fn.copy_pal_block_y = dav1d_copy_pal_block_y_##bd##bpc; \
 3430|  21.5k|        f->bd_fn.copy_pal_block_uv = dav1d_copy_pal_block_uv_##bd##bpc; \
 3431|  21.5k|        f->bd_fn.read_pal_plane = dav1d_read_pal_plane_##bd##bpc; \
 3432|  21.5k|        f->bd_fn.read_pal_uv = dav1d_read_pal_uv_##bd##bpc
 3433|  21.5k|    if (!f->seq_hdr->hbd) {
  ------------------
  |  Branch (3433:9): [True: 11.7k, False: 9.72k]
  ------------------
 3434|  11.7k|#if CONFIG_8BPC
 3435|  11.7k|        assign_bitdepth_case(8);
  ------------------
  |  | 3419|  11.7k|        f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \
  |  | 3420|  11.7k|        f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \
  |  | 3421|  11.7k|        f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \
  |  | 3422|  11.7k|        f->bd_fn.filter_sbrow_deblock_cols = dav1d_filter_sbrow_deblock_cols_##bd##bpc; \
  |  | 3423|  11.7k|        f->bd_fn.filter_sbrow_deblock_rows = dav1d_filter_sbrow_deblock_rows_##bd##bpc; \
  |  | 3424|  11.7k|        f->bd_fn.filter_sbrow_cdef = dav1d_filter_sbrow_cdef_##bd##bpc; \
  |  | 3425|  11.7k|        f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \
  |  | 3426|  11.7k|        f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \
  |  | 3427|  11.7k|        f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \
  |  | 3428|  11.7k|        f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc; \
  |  | 3429|  11.7k|        f->bd_fn.copy_pal_block_y = dav1d_copy_pal_block_y_##bd##bpc; \
  |  | 3430|  11.7k|        f->bd_fn.copy_pal_block_uv = dav1d_copy_pal_block_uv_##bd##bpc; \
  |  | 3431|  11.7k|        f->bd_fn.read_pal_plane = dav1d_read_pal_plane_##bd##bpc; \
  |  | 3432|  11.7k|        f->bd_fn.read_pal_uv = dav1d_read_pal_uv_##bd##bpc
  ------------------
 3436|  11.7k|#endif
 3437|  11.7k|    } else {
 3438|  9.72k|#if CONFIG_16BPC
 3439|  9.72k|        assign_bitdepth_case(16);
  ------------------
  |  | 3419|  9.72k|        f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \
  |  | 3420|  9.72k|        f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \
  |  | 3421|  9.72k|        f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \
  |  | 3422|  9.72k|        f->bd_fn.filter_sbrow_deblock_cols = dav1d_filter_sbrow_deblock_cols_##bd##bpc; \
  |  | 3423|  9.72k|        f->bd_fn.filter_sbrow_deblock_rows = dav1d_filter_sbrow_deblock_rows_##bd##bpc; \
  |  | 3424|  9.72k|        f->bd_fn.filter_sbrow_cdef = dav1d_filter_sbrow_cdef_##bd##bpc; \
  |  | 3425|  9.72k|        f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \
  |  | 3426|  9.72k|        f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \
  |  | 3427|  9.72k|        f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \
  |  | 3428|  9.72k|        f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc; \
  |  | 3429|  9.72k|        f->bd_fn.copy_pal_block_y = dav1d_copy_pal_block_y_##bd##bpc; \
  |  | 3430|  9.72k|        f->bd_fn.copy_pal_block_uv = dav1d_copy_pal_block_uv_##bd##bpc; \
  |  | 3431|  9.72k|        f->bd_fn.read_pal_plane = dav1d_read_pal_plane_##bd##bpc; \
  |  | 3432|  9.72k|        f->bd_fn.read_pal_uv = dav1d_read_pal_uv_##bd##bpc
  ------------------
 3440|  9.72k|#endif
 3441|  9.72k|    }
 3442|  21.5k|#undef assign_bitdepth_case
 3443|       |
 3444|  21.5k|    int ref_coded_width[7];
 3445|  21.5k|    if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
  ------------------
  |  |   36|  21.5k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 5.06k, False: 16.4k]
  |  |  ------------------
  ------------------
 3446|  5.06k|        if (f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE) {
  ------------------
  |  |   45|  5.06k|#define DAV1D_PRIMARY_REF_NONE 7
  ------------------
  |  Branch (3446:13): [True: 2.61k, False: 2.44k]
  ------------------
 3447|  2.61k|            const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame];
 3448|  2.61k|            if (!c->refs[pri_ref].p.p.data[0]) {
  ------------------
  |  Branch (3448:17): [True: 3, False: 2.61k]
  ------------------
 3449|      3|                res = DAV1D_ERR(EINVAL);
  ------------------
  |  |   56|      3|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
 3450|      3|                goto error;
 3451|      3|            }
 3452|  2.61k|        }
 3453|  40.3k|        for (int i = 0; i < 7; i++) {
  ------------------
  |  Branch (3453:25): [True: 35.3k, False: 5.04k]
  ------------------
 3454|  35.3k|            const int refidx = f->frame_hdr->refidx[i];
 3455|  35.3k|            if (!c->refs[refidx].p.p.data[0] ||
  ------------------
  |  Branch (3455:17): [True: 5, False: 35.3k]
  ------------------
 3456|  35.3k|                f->frame_hdr->width[0] * 2 < c->refs[refidx].p.p.p.w ||
  ------------------
  |  Branch (3456:17): [True: 5, False: 35.3k]
  ------------------
 3457|  35.3k|                f->frame_hdr->height * 2 < c->refs[refidx].p.p.p.h ||
  ------------------
  |  Branch (3457:17): [True: 3, False: 35.3k]
  ------------------
 3458|  35.3k|                f->frame_hdr->width[0] > c->refs[refidx].p.p.p.w * 16 ||
  ------------------
  |  Branch (3458:17): [True: 4, False: 35.3k]
  ------------------
 3459|  35.3k|                f->frame_hdr->height > c->refs[refidx].p.p.p.h * 16 ||
  ------------------
  |  Branch (3459:17): [True: 2, False: 35.3k]
  ------------------
 3460|  35.3k|                f->seq_hdr->layout != c->refs[refidx].p.p.p.layout ||
  ------------------
  |  Branch (3460:17): [True: 0, False: 35.3k]
  ------------------
 3461|  35.3k|                bpc != c->refs[refidx].p.p.p.bpc)
  ------------------
  |  Branch (3461:17): [True: 0, False: 35.3k]
  ------------------
 3462|     19|            {
 3463|     25|                for (int j = 0; j < i; j++)
  ------------------
  |  Branch (3463:33): [True: 6, False: 19]
  ------------------
 3464|      6|                    dav1d_thread_picture_unref(&f->refp[j]);
 3465|     19|                res = DAV1D_ERR(EINVAL);
  ------------------
  |  |   56|     19|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
 3466|     19|                goto error;
 3467|     19|            }
 3468|  35.3k|            dav1d_thread_picture_ref(&f->refp[i], &c->refs[refidx].p);
 3469|  35.3k|            ref_coded_width[i] = c->refs[refidx].p.p.frame_hdr->width[0];
 3470|  35.3k|            if (f->frame_hdr->width[0] != c->refs[refidx].p.p.p.w ||
  ------------------
  |  Branch (3470:17): [True: 7.05k, False: 28.2k]
  ------------------
 3471|  28.2k|                f->frame_hdr->height != c->refs[refidx].p.p.p.h)
  ------------------
  |  Branch (3471:17): [True: 264, False: 27.9k]
  ------------------
 3472|  7.31k|            {
 3473|  7.31k|#define scale_fac(ref_sz, this_sz) \
 3474|  7.31k|    ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz))
 3475|  7.31k|                f->svc[i][0].scale = scale_fac(c->refs[refidx].p.p.p.w,
  ------------------
  |  | 3474|  7.31k|    ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz))
  ------------------
 3476|  7.31k|                                               f->frame_hdr->width[0]);
 3477|  7.31k|                f->svc[i][1].scale = scale_fac(c->refs[refidx].p.p.p.h,
  ------------------
  |  | 3474|  7.31k|    ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz))
  ------------------
 3478|  7.31k|                                               f->frame_hdr->height);
 3479|  7.31k|                f->svc[i][0].step = (f->svc[i][0].scale + 8) >> 4;
 3480|  7.31k|                f->svc[i][1].step = (f->svc[i][1].scale + 8) >> 4;
 3481|  27.9k|            } else {
 3482|  27.9k|                f->svc[i][0].scale = f->svc[i][1].scale = 0;
 3483|  27.9k|            }
 3484|  35.3k|            f->gmv_warp_allowed[i] = f->frame_hdr->gmv[i].type > DAV1D_WM_TYPE_TRANSLATION &&
  ------------------
  |  Branch (3484:38): [True: 3.74k, False: 31.5k]
  ------------------
 3485|  3.74k|                                     !f->frame_hdr->force_integer_mv &&
  ------------------
  |  Branch (3485:38): [True: 2.91k, False: 825]
  ------------------
 3486|  2.91k|                                     !dav1d_get_shear_params(&f->frame_hdr->gmv[i]) &&
  ------------------
  |  Branch (3486:38): [True: 2.87k, False: 43]
  ------------------
 3487|  2.87k|                                     !f->svc[i][0].scale;
  ------------------
  |  Branch (3487:38): [True: 1.91k, False: 957]
  ------------------
 3488|  35.3k|        }
 3489|  5.06k|    }
 3490|       |
 3491|       |    // setup entropy
 3492|  21.4k|    if (f->frame_hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
  ------------------
  |  |   45|  21.4k|#define DAV1D_PRIMARY_REF_NONE 7
  ------------------
  |  Branch (3492:9): [True: 18.8k, False: 2.60k]
  ------------------
 3493|  18.8k|        dav1d_cdf_thread_init_static(&f->in_cdf, f->frame_hdr->quant.yac);
 3494|  18.8k|    } else {
 3495|  2.60k|        const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame];
 3496|  2.60k|        dav1d_cdf_thread_ref(&f->in_cdf, &c->cdf[pri_ref]);
 3497|  2.60k|    }
 3498|  21.4k|    if (f->frame_hdr->refresh_context) {
  ------------------
  |  Branch (3498:9): [True: 8.68k, False: 12.8k]
  ------------------
 3499|  8.68k|        res = dav1d_cdf_thread_alloc(c, &f->out_cdf, c->n_fc > 1);
 3500|  8.68k|        if (res < 0) goto error;
  ------------------
  |  Branch (3500:13): [True: 0, False: 8.68k]
  ------------------
 3501|  8.68k|    }
 3502|       |
 3503|       |    // FIXME qsort so tiles are in order (for frame threading)
 3504|  21.4k|    if (f->n_tile_data_alloc < c->n_tile_data) {
  ------------------
  |  Branch (3504:9): [True: 15.4k, False: 6.08k]
  ------------------
 3505|  15.4k|        dav1d_free(f->tile);
  ------------------
  |  |  135|  15.4k|#define dav1d_free(ptr) free(ptr)
  ------------------
 3506|  15.4k|        assert(c->n_tile_data < INT_MAX / (int)sizeof(*f->tile));
  ------------------
  |  |  140|  15.4k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 15.4k]
  |  |  |  Branch (140:68): [Folded, False: 15.4k]
  |  |  ------------------
  ------------------
 3507|  15.4k|        f->tile = dav1d_malloc(ALLOC_TILE, c->n_tile_data * sizeof(*f->tile));
  ------------------
  |  |  132|  15.4k|#define dav1d_malloc(type, sz) malloc(sz)
  ------------------
 3508|  15.4k|        if (!f->tile) {
  ------------------
  |  Branch (3508:13): [True: 0, False: 15.4k]
  ------------------
 3509|      0|            f->n_tile_data_alloc = f->n_tile_data = 0;
 3510|      0|            res = DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
 3511|      0|            goto error;
 3512|      0|        }
 3513|  15.4k|        f->n_tile_data_alloc = c->n_tile_data;
 3514|  15.4k|    }
 3515|  21.4k|    memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile));
 3516|  21.4k|    memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile));
 3517|  21.4k|    f->n_tile_data = c->n_tile_data;
 3518|  21.4k|    c->n_tile_data = 0;
 3519|       |
 3520|       |    // allocate frame
 3521|  21.4k|    res = dav1d_thread_picture_alloc(c, f, bpc);
 3522|  21.4k|    if (res < 0) goto error;
  ------------------
  |  Branch (3522:9): [True: 0, False: 21.4k]
  ------------------
 3523|       |
 3524|  21.4k|    if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
  ------------------
  |  Branch (3524:9): [True: 1.19k, False: 20.2k]
  ------------------
 3525|  1.19k|        res = dav1d_picture_alloc_copy(c, &f->cur, f->frame_hdr->width[0], &f->sr_cur.p);
 3526|  1.19k|        if (res < 0) goto error;
  ------------------
  |  Branch (3526:13): [True: 0, False: 1.19k]
  ------------------
 3527|  20.2k|    } else {
 3528|  20.2k|        dav1d_picture_ref(&f->cur, &f->sr_cur.p);
 3529|  20.2k|    }
 3530|       |
 3531|  21.4k|    if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
  ------------------
  |  Branch (3531:9): [True: 1.19k, False: 20.2k]
  ------------------
 3532|  1.19k|        f->resize_step[0] = scale_fac(f->cur.p.w, f->sr_cur.p.p.w);
  ------------------
  |  | 3474|  1.19k|    ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz))
  ------------------
 3533|  1.19k|        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
 3534|  1.19k|        const int in_cw = (f->cur.p.w + ss_hor) >> ss_hor;
 3535|  1.19k|        const int out_cw = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
 3536|  1.19k|        f->resize_step[1] = scale_fac(in_cw, out_cw);
  ------------------
  |  | 3474|  1.19k|    ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz))
  ------------------
 3537|  1.19k|#undef scale_fac
 3538|  1.19k|        f->resize_start[0] = get_upscale_x0(f->cur.p.w, f->sr_cur.p.p.w, f->resize_step[0]);
 3539|  1.19k|        f->resize_start[1] = get_upscale_x0(in_cw, out_cw, f->resize_step[1]);
 3540|  1.19k|    }
 3541|       |
 3542|       |    // move f->cur into output queue
 3543|  21.4k|    if (c->n_fc == 1) {
  ------------------
  |  Branch (3543:9): [True: 21.4k, False: 0]
  ------------------
 3544|  21.4k|        if (f->frame_hdr->show_frame || c->output_invisible_frames) {
  ------------------
  |  Branch (3544:13): [True: 18.1k, False: 3.38k]
  |  Branch (3544:41): [True: 0, False: 3.38k]
  ------------------
 3545|  18.1k|            dav1d_thread_picture_ref(&c->out, &f->sr_cur);
 3546|  18.1k|            c->event_flags |= dav1d_picture_get_event_flags(&f->sr_cur);
 3547|  18.1k|        }
 3548|  21.4k|    } else {
 3549|      0|        dav1d_thread_picture_ref(out_delayed, &f->sr_cur);
 3550|      0|    }
 3551|       |
 3552|  21.4k|    f->w4 = (f->frame_hdr->width[0] + 3) >> 2;
 3553|  21.4k|    f->h4 = (f->frame_hdr->height + 3) >> 2;
 3554|  21.4k|    f->bw = ((f->frame_hdr->width[0] + 7) >> 3) << 1;
 3555|  21.4k|    f->bh = ((f->frame_hdr->height + 7) >> 3) << 1;
 3556|  21.4k|    f->sb128w = (f->bw + 31) >> 5;
 3557|  21.4k|    f->sb128h = (f->bh + 31) >> 5;
 3558|  21.4k|    f->sb_shift = 4 + f->seq_hdr->sb128;
 3559|  21.4k|    f->sb_step = 16 << f->seq_hdr->sb128;
 3560|  21.4k|    f->sbh = (f->bh + f->sb_step - 1) >> f->sb_shift;
 3561|  21.4k|    f->b4_stride = (f->bw + 31) & ~31;
 3562|  21.4k|    f->bitdepth_max = (1 << f->cur.p.bpc) - 1;
 3563|  21.4k|    atomic_init(&f->task_thread.error, 0);
 3564|  21.4k|    const int uses_2pass = c->n_fc > 1;
 3565|  21.4k|    const int cols = f->frame_hdr->tiling.cols;
 3566|  21.4k|    const int rows = f->frame_hdr->tiling.rows;
 3567|  21.4k|    atomic_store(&f->task_thread.task_counter,
 3568|  21.4k|                 (cols * rows + f->sbh) << uses_2pass);
 3569|       |
 3570|       |    // ref_mvs
 3571|  21.4k|    if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
  ------------------
  |  |   36|  42.9k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 5.04k, False: 16.4k]
  |  |  ------------------
  ------------------
  |  Branch (3571:45): [True: 3.33k, False: 13.1k]
  ------------------
 3572|  8.37k|        f->mvs_ref = dav1d_ref_create_using_pool(c->refmvs_pool,
 3573|  8.37k|            sizeof(*f->mvs) * f->sb128h * 16 * (f->b4_stride >> 1));
 3574|  8.37k|        if (!f->mvs_ref) {
  ------------------
  |  Branch (3574:13): [True: 0, False: 8.37k]
  ------------------
 3575|      0|            res = DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
 3576|      0|            goto error;
 3577|      0|        }
 3578|  8.37k|        f->mvs = f->mvs_ref->data;
 3579|  8.37k|        if (!f->frame_hdr->allow_intrabc) {
  ------------------
  |  Branch (3579:13): [True: 5.04k, False: 3.33k]
  ------------------
 3580|  40.3k|            for (int i = 0; i < 7; i++)
  ------------------
  |  Branch (3580:29): [True: 35.2k, False: 5.04k]
  ------------------
 3581|  35.2k|                f->refpoc[i] = f->refp[i].p.frame_hdr->frame_offset;
 3582|  5.04k|        } else {
 3583|  3.33k|            memset(f->refpoc, 0, sizeof(f->refpoc));
 3584|  3.33k|        }
 3585|  8.37k|        if (f->frame_hdr->use_ref_frame_mvs) {
  ------------------
  |  Branch (3585:13): [True: 3.28k, False: 5.09k]
  ------------------
 3586|  26.2k|            for (int i = 0; i < 7; i++) {
  ------------------
  |  Branch (3586:29): [True: 22.9k, False: 3.28k]
  ------------------
 3587|  22.9k|                const int refidx = f->frame_hdr->refidx[i];
 3588|  22.9k|                const int ref_w = ((ref_coded_width[i] + 7) >> 3) << 1;
 3589|  22.9k|                const int ref_h = ((f->refp[i].p.p.h + 7) >> 3) << 1;
 3590|  22.9k|                if (c->refs[refidx].refmvs != NULL &&
  ------------------
  |  Branch (3590:21): [True: 2.32k, False: 20.6k]
  ------------------
 3591|  2.32k|                    ref_w == f->bw && ref_h == f->bh)
  ------------------
  |  Branch (3591:21): [True: 2.30k, False: 22]
  |  Branch (3591:39): [True: 2.30k, False: 0]
  ------------------
 3592|  2.30k|                {
 3593|  2.30k|                    f->ref_mvs_ref[i] = c->refs[refidx].refmvs;
 3594|  2.30k|                    dav1d_ref_inc(f->ref_mvs_ref[i]);
 3595|  2.30k|                    f->ref_mvs[i] = c->refs[refidx].refmvs->data;
 3596|  20.6k|                } else {
 3597|  20.6k|                    f->ref_mvs[i] = NULL;
 3598|  20.6k|                    f->ref_mvs_ref[i] = NULL;
 3599|  20.6k|                }
 3600|  22.9k|                memcpy(f->refrefpoc[i], c->refs[refidx].refpoc,
 3601|  22.9k|                       sizeof(*f->refrefpoc));
 3602|  22.9k|            }
 3603|  5.09k|        } else {
 3604|  5.09k|            memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref));
 3605|  5.09k|        }
 3606|  13.1k|    } else {
 3607|  13.1k|        f->mvs_ref = NULL;
 3608|  13.1k|        memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref));
 3609|  13.1k|    }
 3610|       |
 3611|       |    // segmap
 3612|  21.4k|    if (f->frame_hdr->segmentation.enabled) {
  ------------------
  |  Branch (3612:9): [True: 4.81k, False: 16.6k]
  ------------------
 3613|       |        // By default, the previous segmentation map is not initialised.
 3614|  4.81k|        f->prev_segmap_ref = NULL;
 3615|  4.81k|        f->prev_segmap = NULL;
 3616|       |
 3617|       |        // We might need a previous frame's segmentation map. This
 3618|       |        // happens if there is either no update or a temporal update.
 3619|  4.81k|        if (f->frame_hdr->segmentation.temporal || !f->frame_hdr->segmentation.update_map) {
  ------------------
  |  Branch (3619:13): [True: 174, False: 4.63k]
  |  Branch (3619:52): [True: 247, False: 4.39k]
  ------------------
 3620|    421|            const int pri_ref = f->frame_hdr->primary_ref_frame;
 3621|    421|            assert(pri_ref != DAV1D_PRIMARY_REF_NONE);
  ------------------
  |  |  140|    421|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 421]
  |  |  |  Branch (140:68): [Folded, False: 421]
  |  |  ------------------
  ------------------
 3622|    421|            const int ref_w = ((ref_coded_width[pri_ref] + 7) >> 3) << 1;
 3623|    421|            const int ref_h = ((f->refp[pri_ref].p.p.h + 7) >> 3) << 1;
 3624|    421|            if (ref_w == f->bw && ref_h == f->bh) {
  ------------------
  |  Branch (3624:17): [True: 414, False: 7]
  |  Branch (3624:35): [True: 408, False: 6]
  ------------------
 3625|    408|                f->prev_segmap_ref = c->refs[f->frame_hdr->refidx[pri_ref]].segmap;
 3626|    408|                if (f->prev_segmap_ref) {
  ------------------
  |  Branch (3626:21): [True: 53, False: 355]
  ------------------
 3627|     53|                    dav1d_ref_inc(f->prev_segmap_ref);
 3628|     53|                    f->prev_segmap = f->prev_segmap_ref->data;
 3629|     53|                }
 3630|    408|            }
 3631|    421|        }
 3632|       |
 3633|  4.81k|        if (f->frame_hdr->segmentation.update_map) {
  ------------------
  |  Branch (3633:13): [True: 4.56k, False: 247]
  ------------------
 3634|       |            // We're updating an existing map, but need somewhere to
 3635|       |            // put the new values. Allocate them here (the data
 3636|       |            // actually gets set elsewhere)
 3637|  4.56k|            f->cur_segmap_ref = dav1d_ref_create_using_pool(c->segmap_pool,
 3638|  4.56k|                sizeof(*f->cur_segmap) * f->b4_stride * 32 * f->sb128h);
 3639|  4.56k|            if (!f->cur_segmap_ref) {
  ------------------
  |  Branch (3639:17): [True: 0, False: 4.56k]
  ------------------
 3640|      0|                dav1d_ref_dec(&f->prev_segmap_ref);
 3641|      0|                res = DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
 3642|      0|                goto error;
 3643|      0|            }
 3644|  4.56k|            f->cur_segmap = f->cur_segmap_ref->data;
 3645|  4.56k|        } else if (f->prev_segmap_ref) {
  ------------------
  |  Branch (3645:20): [True: 23, False: 224]
  ------------------
 3646|       |            // We're not updating an existing map, and we have a valid
 3647|       |            // reference. Use that.
 3648|     23|            f->cur_segmap_ref = f->prev_segmap_ref;
 3649|     23|            dav1d_ref_inc(f->cur_segmap_ref);
 3650|     23|            f->cur_segmap = f->prev_segmap_ref->data;
 3651|    224|        } else {
 3652|       |            // We need to make a new map. Allocate one here and zero it out.
 3653|    224|            const size_t segmap_size = sizeof(*f->cur_segmap) * f->b4_stride * 32 * f->sb128h;
 3654|    224|            f->cur_segmap_ref = dav1d_ref_create_using_pool(c->segmap_pool, segmap_size);
 3655|    224|            if (!f->cur_segmap_ref) {
  ------------------
  |  Branch (3655:17): [True: 0, False: 224]
  ------------------
 3656|      0|                res = DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
 3657|      0|                goto error;
 3658|      0|            }
 3659|    224|            f->cur_segmap = f->cur_segmap_ref->data;
 3660|    224|            memset(f->cur_segmap, 0, segmap_size);
 3661|    224|        }
 3662|  16.6k|    } else {
 3663|  16.6k|        f->cur_segmap = NULL;
 3664|  16.6k|        f->cur_segmap_ref = NULL;
 3665|  16.6k|        f->prev_segmap_ref = NULL;
 3666|  16.6k|    }
 3667|       |
 3668|       |    // update references etc.
 3669|  21.4k|    const unsigned refresh_frame_flags = f->frame_hdr->refresh_frame_flags;
 3670|   193k|    for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (3670:21): [True: 171k, False: 21.4k]
  ------------------
 3671|   171k|        if (refresh_frame_flags & (1 << i)) {
  ------------------
  |  Branch (3671:13): [True: 143k, False: 28.8k]
  ------------------
 3672|   143k|            if (c->refs[i].p.p.frame_hdr)
  ------------------
  |  Branch (3672:17): [True: 18.6k, False: 124k]
  ------------------
 3673|  18.6k|                dav1d_thread_picture_unref(&c->refs[i].p);
 3674|   143k|            dav1d_thread_picture_ref(&c->refs[i].p, &f->sr_cur);
 3675|       |
 3676|   143k|            dav1d_cdf_thread_unref(&c->cdf[i]);
 3677|   143k|            if (f->frame_hdr->refresh_context) {
  ------------------
  |  Branch (3677:17): [True: 48.9k, False: 94.2k]
  ------------------
 3678|  48.9k|                dav1d_cdf_thread_ref(&c->cdf[i], &f->out_cdf);
 3679|  94.2k|            } else {
 3680|  94.2k|                dav1d_cdf_thread_ref(&c->cdf[i], &f->in_cdf);
 3681|  94.2k|            }
 3682|       |
 3683|   143k|            dav1d_ref_dec(&c->refs[i].segmap);
 3684|   143k|            c->refs[i].segmap = f->cur_segmap_ref;
 3685|   143k|            if (f->cur_segmap_ref)
  ------------------
  |  Branch (3685:17): [True: 35.9k, False: 107k]
  ------------------
 3686|  35.9k|                dav1d_ref_inc(f->cur_segmap_ref);
 3687|   143k|            dav1d_ref_dec(&c->refs[i].refmvs);
 3688|   143k|            if (!f->frame_hdr->allow_intrabc) {
  ------------------
  |  Branch (3688:17): [True: 116k, False: 26.2k]
  ------------------
 3689|   116k|                c->refs[i].refmvs = f->mvs_ref;
 3690|   116k|                if (f->mvs_ref)
  ------------------
  |  Branch (3690:21): [True: 13.2k, False: 103k]
  ------------------
 3691|  13.2k|                    dav1d_ref_inc(f->mvs_ref);
 3692|   116k|            }
 3693|   143k|            memcpy(c->refs[i].refpoc, f->refpoc, sizeof(f->refpoc));
 3694|   143k|        }
 3695|   171k|    }
 3696|       |
 3697|  21.4k|    if (c->n_fc == 1) {
  ------------------
  |  Branch (3697:9): [True: 21.4k, False: 0]
  ------------------
 3698|  21.4k|        if ((res = dav1d_decode_frame(f)) < 0) {
  ------------------
  |  Branch (3698:13): [True: 9.88k, False: 11.6k]
  ------------------
 3699|  9.88k|            dav1d_thread_picture_unref(&c->out);
 3700|  88.9k|            for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (3700:29): [True: 79.0k, False: 9.88k]
  ------------------
 3701|  79.0k|                if (refresh_frame_flags & (1 << i)) {
  ------------------
  |  Branch (3701:21): [True: 65.6k, False: 13.3k]
  ------------------
 3702|  65.6k|                    if (c->refs[i].p.p.frame_hdr)
  ------------------
  |  Branch (3702:25): [True: 65.6k, False: 0]
  ------------------
 3703|  65.6k|                        dav1d_thread_picture_unref(&c->refs[i].p);
 3704|  65.6k|                    dav1d_cdf_thread_unref(&c->cdf[i]);
 3705|  65.6k|                    dav1d_ref_dec(&c->refs[i].segmap);
 3706|  65.6k|                    dav1d_ref_dec(&c->refs[i].refmvs);
 3707|  65.6k|                }
 3708|  79.0k|            }
 3709|  9.88k|            goto error;
 3710|  9.88k|        }
 3711|  21.4k|    } else {
 3712|      0|        dav1d_task_frame_init(f);
 3713|      0|        pthread_mutex_unlock(&c->task_thread.lock);
 3714|      0|    }
 3715|       |
 3716|  11.6k|    return 0;
 3717|  9.90k|error:
 3718|  9.90k|    atomic_init(&f->task_thread.error, 1);
 3719|  9.90k|    dav1d_cdf_thread_unref(&f->in_cdf);
 3720|  9.90k|    if (f->frame_hdr->refresh_context)
  ------------------
  |  Branch (3720:9): [True: 1.49k, False: 8.41k]
  ------------------
 3721|  1.49k|        dav1d_cdf_thread_unref(&f->out_cdf);
 3722|  79.2k|    for (int i = 0; i < 7; i++) {
  ------------------
  |  Branch (3722:21): [True: 69.3k, False: 9.90k]
  ------------------
 3723|  69.3k|        if (f->refp[i].p.frame_hdr)
  ------------------
  |  Branch (3723:13): [True: 0, False: 69.3k]
  ------------------
 3724|      0|            dav1d_thread_picture_unref(&f->refp[i]);
 3725|  69.3k|        dav1d_ref_dec(&f->ref_mvs_ref[i]);
 3726|  69.3k|    }
 3727|  9.90k|    if (c->n_fc == 1)
  ------------------
  |  Branch (3727:9): [True: 9.90k, False: 0]
  ------------------
 3728|  9.90k|        dav1d_thread_picture_unref(&c->out);
 3729|      0|    else
 3730|      0|        dav1d_thread_picture_unref(out_delayed);
 3731|  9.90k|    dav1d_picture_unref_internal(&f->cur);
 3732|  9.90k|    dav1d_thread_picture_unref(&f->sr_cur);
 3733|  9.90k|    dav1d_ref_dec(&f->mvs_ref);
 3734|  9.90k|    dav1d_ref_dec(&f->seq_hdr_ref);
 3735|  9.90k|    dav1d_ref_dec(&f->frame_hdr_ref);
 3736|  9.90k|    dav1d_data_props_copy(&c->cached_error_props, &c->in.m);
 3737|       |
 3738|  9.90k|    for (int i = 0; i < f->n_tile_data; i++)
  ------------------
  |  Branch (3738:21): [True: 0, False: 9.90k]
  ------------------
 3739|      0|        dav1d_data_unref_internal(&f->tile[i].data);
 3740|  9.90k|    f->n_tile_data = 0;
 3741|       |
 3742|  9.90k|    if (c->n_fc > 1)
  ------------------
  |  Branch (3742:9): [True: 0, False: 9.90k]
  ------------------
 3743|      0|        pthread_mutex_unlock(&c->task_thread.lock);
 3744|       |
 3745|  9.90k|    return res;
 3746|  21.4k|}
decode.c:reset_context:
 2390|   296k|static void reset_context(BlockContext *const ctx, const int keyframe, const int pass) { // Re-initialise the per-field arrays of *ctx; `keyframe` picks the intra/mode defaults, `pass == 2` stops after the first group.
 2391|   296k|    memset(ctx->intra, keyframe, sizeof(ctx->intra)); // every byte of intra[] takes the value of the keyframe flag
 2392|   296k|    memset(ctx->uvmode, DC_PRED, sizeof(ctx->uvmode));
 2393|   296k|    if (keyframe)
  ------------------
  |  Branch (2393:9): [True: 278k, False: 17.5k]
  ------------------
 2394|   278k|        memset(ctx->mode, DC_PRED, sizeof(ctx->mode)); // non-keyframes get their mode[] default further down (NEARESTMV)
 2395|       |
 2396|   296k|    if (pass == 2) return; // with pass == 2 only intra/uvmode (and mode on keyframes) are reset; everything below is left untouched
  ------------------
  |  Branch (2396:9): [True: 0, False: 296k]
  ------------------
 2397|       |
 2398|   296k|    memset(ctx->partition, 0, sizeof(ctx->partition));
 2399|   296k|    memset(ctx->skip, 0, sizeof(ctx->skip));
 2400|   296k|    memset(ctx->skip_mode, 0, sizeof(ctx->skip_mode));
 2401|   296k|    memset(ctx->tx_lpf_y, 2, sizeof(ctx->tx_lpf_y));
 2402|   296k|    memset(ctx->tx_lpf_uv, 1, sizeof(ctx->tx_lpf_uv));
 2403|   296k|    memset(ctx->tx_intra, -1, sizeof(ctx->tx_intra)); // memset stores (unsigned char)-1, i.e. 0xFF, in every byte
 2404|   296k|    memset(ctx->tx, TX_64X64, sizeof(ctx->tx));
 2405|   296k|    if (!keyframe) {
  ------------------
  |  Branch (2405:9): [True: 17.5k, False: 278k]
  ------------------
 2406|  17.5k|        memset(ctx->ref, -1, sizeof(ctx->ref)); // 0xFF in every byte, as for tx_intra above
 2407|  17.5k|        memset(ctx->comp_type, 0, sizeof(ctx->comp_type));
 2408|  17.5k|        memset(ctx->mode, NEARESTMV, sizeof(ctx->mode)); // non-keyframe counterpart of the DC_PRED default set near the top
 2409|  17.5k|    }
 2410|   296k|    memset(ctx->lcoef, 0x40, sizeof(ctx->lcoef)); // NOTE(review): meaning of the 0x40 fill value is not visible in this function - confirm against the coefficient-context code
 2411|   296k|    memset(ctx->ccoef, 0x40, sizeof(ctx->ccoef));
 2412|   296k|    memset(ctx->filter, DAV1D_N_SWITCHABLE_FILTERS, sizeof(ctx->filter));
 2413|   296k|    memset(ctx->seg_pred, 0, sizeof(ctx->seg_pred));
 2414|   296k|    memset(ctx->pal_sz, 0, sizeof(ctx->pal_sz));
 2415|   296k|}
decode.c:decode_sb:
 2119|  2.36M|{
 2120|  2.36M|    const Dav1dFrameContext *const f = t->f;
 2121|  2.36M|    Dav1dTileState *const ts = t->ts;
 2122|  2.36M|    const int hsz = 16 >> bl;
 2123|  2.36M|    const int have_h_split = f->bw > t->bx + hsz;
 2124|  2.36M|    const int have_v_split = f->bh > t->by + hsz;
 2125|       |
 2126|  2.36M|    if (!have_h_split && !have_v_split) {
  ------------------
  |  Branch (2126:9): [True: 515k, False: 1.84M]
  |  Branch (2126:26): [True: 22.8k, False: 492k]
  ------------------
 2127|  22.8k|        assert(bl < BL_8X8);
  ------------------
  |  |  140|  22.8k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 22.8k]
  |  |  |  Branch (140:68): [Folded, False: 22.8k]
  |  |  ------------------
  ------------------
 2128|  22.8k|        return decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 0));
  ------------------
  |  |   51|  22.8k|    ((const EdgeNode*)((uintptr_t)(n) + ((const EdgeBranch*)(n))->split_offset[i]))
  ------------------
 2129|  22.8k|    }
 2130|       |
 2131|  2.33M|    uint16_t *pc;
 2132|  2.33M|    enum BlockPartition bp;
 2133|  2.33M|    int ctx, bx8, by8;
 2134|  2.33M|    if (t->frame_thread.pass != 2) {
  ------------------
  |  Branch (2134:9): [True: 2.33M, False: 589]
  ------------------
 2135|  2.33M|        if (0 && bl == BL_64X64)
  ------------------
  |  Branch (2135:13): [Folded, False: 2.33M]
  |  Branch (2135:18): [True: 0, False: 0]
  ------------------
 2136|      0|            printf("poc=%d,y=%d,x=%d,bl=%d,r=%d\n",
 2137|      0|                   f->frame_hdr->frame_offset, t->by, t->bx, bl, ts->msac.rng);
 2138|  2.33M|        bx8 = (t->bx & 31) >> 1;
 2139|  2.33M|        by8 = (t->by & 31) >> 1;
 2140|  2.33M|        ctx = get_partition_ctx(t->a, &t->l, bl, by8, bx8);
 2141|  2.33M|        pc = ts->cdf.m.partition[bl][ctx];
 2142|  2.33M|    }
 2143|       |
 2144|  2.33M|    if (have_h_split && have_v_split) {
  ------------------
  |  Branch (2144:9): [True: 1.84M, False: 493k]
  |  Branch (2144:25): [True: 1.01M, False: 829k]
  ------------------
 2145|  1.01M|        if (t->frame_thread.pass == 2) {
  ------------------
  |  Branch (2145:13): [True: 0, False: 1.01M]
  ------------------
 2146|      0|            const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
 2147|      0|            bp = b->bl == bl ? b->bp : PARTITION_SPLIT;
  ------------------
  |  Branch (2147:18): [True: 0, False: 0]
  ------------------
 2148|  1.01M|        } else {
 2149|  1.01M|            bp = dav1d_msac_decode_symbol_adapt16(&ts->msac, pc,
  ------------------
  |  |   57|  1.01M|#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
  ------------------
 2150|  1.01M|                                                  dav1d_partition_type_count[bl]);
 2151|  1.01M|            if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
  ------------------
  |  Branch (2151:17): [True: 2.12k, False: 1.01M]
  ------------------
 2152|  2.12k|                (bp == PARTITION_V || bp == PARTITION_V4 ||
  ------------------
  |  Branch (2152:18): [True: 86, False: 2.04k]
  |  Branch (2152:39): [True: 24, False: 2.01k]
  ------------------
 2153|  2.01k|                 bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT))
  ------------------
  |  Branch (2153:18): [True: 11, False: 2.00k]
  |  Branch (2153:50): [True: 19, False: 1.98k]
  ------------------
 2154|    140|            {
 2155|    140|                return 1;
 2156|    140|            }
 2157|  1.01M|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  1.01M|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 1.01M]
  |  |  ------------------
  |  |   35|  1.01M|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  1.01M|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 2158|      0|                printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
 2159|      0|                       f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx, bp,
 2160|      0|                       ts->msac.rng);
 2161|  1.01M|        }
 2162|  1.01M|        const uint8_t *const b = dav1d_block_sizes[bl][bp];
 2163|       |
 2164|  1.01M|        switch (bp) {
 2165|   293k|        case PARTITION_NONE:
  ------------------
  |  Branch (2165:9): [True: 293k, False: 723k]
  ------------------
 2166|   293k|            if (decode_b(t, bl, b[0], PARTITION_NONE, node->o))
  ------------------
  |  Branch (2166:17): [True: 37, False: 293k]
  ------------------
 2167|     37|                return -1;
 2168|   293k|            break;
 2169|   293k|        case PARTITION_H:
  ------------------
  |  Branch (2169:9): [True: 88.8k, False: 927k]
  ------------------
 2170|  88.8k|            if (decode_b(t, bl, b[0], PARTITION_H, node->h[0]))
  ------------------
  |  Branch (2170:17): [True: 21, False: 88.8k]
  ------------------
 2171|     21|                return -1;
 2172|  88.8k|            t->by += hsz;
 2173|  88.8k|            if (decode_b(t, bl, b[0], PARTITION_H, node->h[1]))
  ------------------
  |  Branch (2173:17): [True: 18, False: 88.8k]
  ------------------
 2174|     18|                return -1;
 2175|  88.8k|            t->by -= hsz;
 2176|  88.8k|            break;
 2177|  64.2k|        case PARTITION_V:
  ------------------
  |  Branch (2177:9): [True: 64.2k, False: 952k]
  ------------------
 2178|  64.2k|            if (decode_b(t, bl, b[0], PARTITION_V, node->v[0]))
  ------------------
  |  Branch (2178:17): [True: 16, False: 64.2k]
  ------------------
 2179|     16|                return -1;
 2180|  64.2k|            t->bx += hsz;
 2181|  64.2k|            if (decode_b(t, bl, b[0], PARTITION_V, node->v[1]))
  ------------------
  |  Branch (2181:17): [True: 21, False: 64.1k]
  ------------------
 2182|     21|                return -1;
 2183|  64.1k|            t->bx -= hsz;
 2184|  64.1k|            break;
 2185|   497k|        case PARTITION_SPLIT:
  ------------------
  |  Branch (2185:9): [True: 497k, False: 518k]
  ------------------
 2186|   497k|            if (bl == BL_8X8) {
  ------------------
  |  Branch (2186:17): [True: 422k, False: 75.5k]
  ------------------
 2187|   422k|                const EdgeTip *const tip = (const EdgeTip *) node;
 2188|   422k|                assert(hsz == 1);
  ------------------
  |  |  140|   422k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 422k]
  |  |  |  Branch (140:68): [Folded, False: 422k]
  |  |  ------------------
  ------------------
 2189|   422k|                if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, EDGE_ALL_TR_AND_BL))
  ------------------
  |  Branch (2189:21): [True: 13, False: 422k]
  ------------------
 2190|     13|                    return -1;
 2191|   422k|                const enum Filter2d tl_filter = t->tl_4x4_filter;
 2192|   422k|                t->bx++;
 2193|   422k|                if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[0]))
  ------------------
  |  Branch (2193:21): [True: 11, False: 422k]
  ------------------
 2194|     11|                    return -1;
 2195|   422k|                t->bx--;
 2196|   422k|                t->by++;
 2197|   422k|                if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[1]))
  ------------------
  |  Branch (2197:21): [True: 24, False: 422k]
  ------------------
 2198|     24|                    return -1;
 2199|   422k|                t->bx++;
 2200|   422k|                t->tl_4x4_filter = tl_filter;
 2201|   422k|                if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[2]))
  ------------------
  |  Branch (2201:21): [True: 14, False: 422k]
  ------------------
 2202|     14|                    return -1;
 2203|   422k|                t->bx--;
 2204|   422k|                t->by--;
 2205|   422k|#if ARCH_X86_64
 2206|   422k|                if (t->frame_thread.pass) {
  ------------------
  |  Branch (2206:21): [True: 0, False: 422k]
  ------------------
 2207|       |                    /* In 8-bit mode with 2-pass decoding the coefficient buffer
 2208|       |                     * can end up misaligned due to skips here. Work around
 2209|       |                     * the issue by explicitly realigning the buffer. */
 2210|      0|                    const int p = t->frame_thread.pass & 1;
 2211|      0|                    ts->frame_thread[p].cf =
 2212|      0|                        (void*)(((uintptr_t)ts->frame_thread[p].cf + 63) & ~63);
 2213|      0|                }
 2214|   422k|#endif
 2215|   422k|            } else {
 2216|  75.5k|                if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 0)))
  ------------------
  |  |   51|  75.5k|    ((const EdgeNode*)((uintptr_t)(n) + ((const EdgeBranch*)(n))->split_offset[i]))
  ------------------
  |  Branch (2216:21): [True: 88, False: 75.4k]
  ------------------
 2217|     88|                    return 1;
 2218|  75.4k|                t->bx += hsz;
 2219|  75.4k|                if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 1)))
  ------------------
  |  |   51|  75.4k|    ((const EdgeNode*)((uintptr_t)(n) + ((const EdgeBranch*)(n))->split_offset[i]))
  ------------------
  |  Branch (2219:21): [True: 131, False: 75.3k]
  ------------------
 2220|    131|                    return 1;
 2221|  75.3k|                t->bx -= hsz;
 2222|  75.3k|                t->by += hsz;
 2223|  75.3k|                if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 2)))
  ------------------
  |  |   51|  75.3k|    ((const EdgeNode*)((uintptr_t)(n) + ((const EdgeBranch*)(n))->split_offset[i]))
  ------------------
  |  Branch (2223:21): [True: 117, False: 75.1k]
  ------------------
 2224|    117|                    return 1;
 2225|  75.1k|                t->bx += hsz;
 2226|  75.1k|                if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 3)))
  ------------------
  |  |   51|  75.1k|    ((const EdgeNode*)((uintptr_t)(n) + ((const EdgeBranch*)(n))->split_offset[i]))
  ------------------
  |  Branch (2226:21): [True: 79, False: 75.1k]
  ------------------
 2227|     79|                    return 1;
 2228|  75.1k|                t->bx -= hsz;
 2229|  75.1k|                t->by -= hsz;
 2230|  75.1k|            }
 2231|   497k|            break;
 2232|   497k|        case PARTITION_T_TOP_SPLIT: {
  ------------------
  |  Branch (2232:9): [True: 6.62k, False: 1.00M]
  ------------------
 2233|  6.62k|            if (decode_b(t, bl, b[0], PARTITION_T_TOP_SPLIT, EDGE_ALL_TR_AND_BL))
  ------------------
  |  Branch (2233:17): [True: 3, False: 6.62k]
  ------------------
 2234|      3|                return -1;
 2235|  6.62k|            t->bx += hsz;
 2236|  6.62k|            if (decode_b(t, bl, b[0], PARTITION_T_TOP_SPLIT, node->v[1]))
  ------------------
  |  Branch (2236:17): [True: 3, False: 6.62k]
  ------------------
 2237|      3|                return -1;
 2238|  6.62k|            t->bx -= hsz;
 2239|  6.62k|            t->by += hsz;
 2240|  6.62k|            if (decode_b(t, bl, b[1], PARTITION_T_TOP_SPLIT, node->h[1]))
  ------------------
  |  Branch (2240:17): [True: 3, False: 6.61k]
  ------------------
 2241|      3|                return -1;
 2242|  6.61k|            t->by -= hsz;
 2243|  6.61k|            break;
 2244|  6.62k|        }
 2245|  7.14k|        case PARTITION_T_BOTTOM_SPLIT: {
  ------------------
  |  Branch (2245:9): [True: 7.14k, False: 1.00M]
  ------------------
 2246|  7.14k|            if (decode_b(t, bl, b[0], PARTITION_T_BOTTOM_SPLIT, node->h[0]))
  ------------------
  |  Branch (2246:17): [True: 3, False: 7.14k]
  ------------------
 2247|      3|                return -1;
 2248|  7.14k|            t->by += hsz;
 2249|  7.14k|            if (decode_b(t, bl, b[1], PARTITION_T_BOTTOM_SPLIT, node->v[0]))
  ------------------
  |  Branch (2249:17): [True: 7, False: 7.13k]
  ------------------
 2250|      7|                return -1;
 2251|  7.13k|            t->bx += hsz;
 2252|  7.13k|            if (decode_b(t, bl, b[1], PARTITION_T_BOTTOM_SPLIT, 0))
  ------------------
  |  Branch (2252:17): [True: 10, False: 7.12k]
  ------------------
 2253|     10|                return -1;
 2254|  7.12k|            t->bx -= hsz;
 2255|  7.12k|            t->by -= hsz;
 2256|  7.12k|            break;
 2257|  7.13k|        }
 2258|  4.64k|        case PARTITION_T_LEFT_SPLIT: {
  ------------------
  |  Branch (2258:9): [True: 4.64k, False: 1.01M]
  ------------------
 2259|  4.64k|            if (decode_b(t, bl, b[0], PARTITION_T_LEFT_SPLIT, EDGE_ALL_TR_AND_BL))
  ------------------
  |  Branch (2259:17): [True: 3, False: 4.64k]
  ------------------
 2260|      3|                return -1;
 2261|  4.64k|            t->by += hsz;
 2262|  4.64k|            if (decode_b(t, bl, b[0], PARTITION_T_LEFT_SPLIT, node->h[1]))
  ------------------
  |  Branch (2262:17): [True: 6, False: 4.64k]
  ------------------
 2263|      6|                return -1;
 2264|  4.64k|            t->by -= hsz;
 2265|  4.64k|            t->bx += hsz;
 2266|  4.64k|            if (decode_b(t, bl, b[1], PARTITION_T_LEFT_SPLIT, node->v[1]))
  ------------------
  |  Branch (2266:17): [True: 7, False: 4.63k]
  ------------------
 2267|      7|                return -1;
 2268|  4.63k|            t->bx -= hsz;
 2269|  4.63k|            break;
 2270|  4.64k|        }
 2271|  5.57k|        case PARTITION_T_RIGHT_SPLIT: {
  ------------------
  |  Branch (2271:9): [True: 5.57k, False: 1.01M]
  ------------------
 2272|  5.57k|            if (decode_b(t, bl, b[0], PARTITION_T_RIGHT_SPLIT, node->v[0]))
  ------------------
  |  Branch (2272:17): [True: 3, False: 5.57k]
  ------------------
 2273|      3|                return -1;
 2274|  5.57k|            t->bx += hsz;
 2275|  5.57k|            if (decode_b(t, bl, b[1], PARTITION_T_RIGHT_SPLIT, node->h[0]))
  ------------------
  |  Branch (2275:17): [True: 7, False: 5.56k]
  ------------------
 2276|      7|                return -1;
 2277|  5.56k|            t->by += hsz;
 2278|  5.56k|            if (decode_b(t, bl, b[1], PARTITION_T_RIGHT_SPLIT, 0))
  ------------------
  |  Branch (2278:17): [True: 11, False: 5.55k]
  ------------------
 2279|     11|                return -1;
 2280|  5.55k|            t->by -= hsz;
 2281|  5.55k|            t->bx -= hsz;
 2282|  5.55k|            break;
 2283|  5.56k|        }
 2284|  16.1k|        case PARTITION_H4: {
  ------------------
  |  Branch (2284:9): [True: 16.1k, False: 1.00M]
  ------------------
 2285|  16.1k|            const EdgeBranch *const branch = (const EdgeBranch *) node;
 2286|  16.1k|            if (decode_b(t, bl, b[0], PARTITION_H4, node->h[0]))
  ------------------
  |  Branch (2286:17): [True: 7, False: 16.1k]
  ------------------
 2287|      7|                return -1;
 2288|  16.1k|            t->by += hsz >> 1;
 2289|  16.1k|            if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4))
  ------------------
  |  Branch (2289:17): [True: 7, False: 16.1k]
  ------------------
 2290|      7|                return -1;
 2291|  16.1k|            t->by += hsz >> 1;
 2292|  16.1k|            if (decode_b(t, bl, b[0], PARTITION_H4, EDGE_ALL_LEFT_HAS_BOTTOM))
  ------------------
  |  Branch (2292:17): [True: 7, False: 16.1k]
  ------------------
 2293|      7|                return -1;
 2294|  16.1k|            t->by += hsz >> 1;
 2295|  16.1k|            if (t->by < f->bh)
  ------------------
  |  Branch (2295:17): [True: 13.9k, False: 2.16k]
  ------------------
 2296|  13.9k|                if (decode_b(t, bl, b[0], PARTITION_H4, node->h[1]))
  ------------------
  |  Branch (2296:21): [True: 7, False: 13.9k]
  ------------------
 2297|      7|                    return -1;
 2298|  16.0k|            t->by -= hsz * 3 >> 1;
 2299|  16.0k|            break;
 2300|  16.1k|        }
 2301|  32.9k|        case PARTITION_V4: {
  ------------------
  |  Branch (2301:9): [True: 32.9k, False: 983k]
  ------------------
 2302|  32.9k|            const EdgeBranch *const branch = (const EdgeBranch *) node;
 2303|  32.9k|            if (decode_b(t, bl, b[0], PARTITION_V4, node->v[0]))
  ------------------
  |  Branch (2303:17): [True: 15, False: 32.9k]
  ------------------
 2304|     15|                return -1;
 2305|  32.9k|            t->bx += hsz >> 1;
 2306|  32.9k|            if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4))
  ------------------
  |  Branch (2306:17): [True: 7, False: 32.9k]
  ------------------
 2307|      7|                return -1;
 2308|  32.9k|            t->bx += hsz >> 1;
 2309|  32.9k|            if (decode_b(t, bl, b[0], PARTITION_V4, EDGE_ALL_TOP_HAS_RIGHT))
  ------------------
  |  Branch (2309:17): [True: 9, False: 32.9k]
  ------------------
 2310|      9|                return -1;
 2311|  32.9k|            t->bx += hsz >> 1;
 2312|  32.9k|            if (t->bx < f->bw)
  ------------------
  |  Branch (2312:17): [True: 31.3k, False: 1.57k]
  ------------------
 2313|  31.3k|                if (decode_b(t, bl, b[0], PARTITION_V4, node->v[1]))
  ------------------
  |  Branch (2313:21): [True: 9, False: 31.3k]
  ------------------
 2314|      9|                    return -1;
 2315|  32.9k|            t->bx -= hsz * 3 >> 1;
 2316|  32.9k|            break;
 2317|  32.9k|        }
 2318|      0|        default: assert(0);
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, Folded]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
  |  Branch (2318:9): [True: 0, False: 1.01M]
  ------------------
 2319|  1.01M|        }
 2320|  1.32M|    } else if (have_h_split) {
  ------------------
  |  Branch (2320:16): [True: 833k, False: 489k]
  ------------------
 2321|   833k|        unsigned is_split;
 2322|   833k|        if (t->frame_thread.pass == 2) {
  ------------------
  |  Branch (2322:13): [True: 0, False: 833k]
  ------------------
 2323|      0|            const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
 2324|      0|            is_split = b->bl != bl;
 2325|   833k|        } else {
 2326|   833k|            is_split = dav1d_msac_decode_bool(&ts->msac,
  ------------------
  |  |   54|   833k|#define dav1d_msac_decode_bool           dav1d_msac_decode_bool_sse2
  ------------------
 2327|   833k|                           gather_top_partition_prob(pc, bl));
 2328|   833k|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|   833k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 833k]
  |  |  ------------------
  |  |   35|   833k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   833k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 2329|      0|                printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
 2330|      0|                       f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx,
 2331|      0|                       is_split ? PARTITION_SPLIT : PARTITION_H, ts->msac.rng);
  ------------------
  |  Branch (2331:24): [True: 0, False: 0]
  ------------------
 2332|   833k|        }
 2333|       |
 2334|   833k|        assert(bl < BL_8X8);
  ------------------
  |  |  140|   833k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 833k]
  |  |  |  Branch (140:68): [Folded, False: 833k]
  |  |  ------------------
  ------------------
 2335|   833k|        if (is_split) {
  ------------------
  |  Branch (2335:13): [True: 609k, False: 223k]
  ------------------
 2336|   609k|            bp = PARTITION_SPLIT;
 2337|   609k|            if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 0))) return 1;
  ------------------
  |  |   51|   609k|    ((const EdgeNode*)((uintptr_t)(n) + ((const EdgeBranch*)(n))->split_offset[i]))
  ------------------
  |  Branch (2337:17): [True: 109, False: 609k]
  ------------------
 2338|   609k|            t->bx += hsz;
 2339|   609k|            if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 1))) return 1;
  ------------------
  |  |   51|   609k|    ((const EdgeNode*)((uintptr_t)(n) + ((const EdgeBranch*)(n))->split_offset[i]))
  ------------------
  |  Branch (2339:17): [True: 139, False: 609k]
  ------------------
 2340|   609k|            t->bx -= hsz;
 2341|   609k|        } else {
 2342|   223k|            bp = PARTITION_H;
 2343|   223k|            if (decode_b(t, bl, dav1d_block_sizes[bl][PARTITION_H][0],
  ------------------
  |  Branch (2343:17): [True: 36, False: 223k]
  ------------------
 2344|   223k|                         PARTITION_H, node->h[0]))
 2345|     36|                return -1;
 2346|   223k|        }
 2347|   833k|    } else {
 2348|   489k|        assert(have_v_split);
  ------------------
  |  |  140|   489k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 489k]
  |  |  |  Branch (140:68): [Folded, False: 489k]
  |  |  ------------------
  ------------------
 2349|   489k|        unsigned is_split;
 2350|   489k|        if (t->frame_thread.pass == 2) {
  ------------------
  |  Branch (2350:13): [True: 0, False: 489k]
  ------------------
 2351|      0|            const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
 2352|      0|            is_split = b->bl != bl;
 2353|   489k|        } else {
 2354|   489k|            is_split = dav1d_msac_decode_bool(&ts->msac,
  ------------------
  |  |   54|   489k|#define dav1d_msac_decode_bool           dav1d_msac_decode_bool_sse2
  ------------------
 2355|   489k|                           gather_left_partition_prob(pc, bl));
 2356|   489k|            if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 && !is_split)
  ------------------
  |  Branch (2356:17): [True: 65, False: 489k]
  |  Branch (2356:63): [True: 13, False: 52]
  ------------------
 2357|     13|                return 1;
 2358|   489k|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|   489k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 489k]
  |  |  ------------------
  |  |   35|   489k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   489k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 2359|      0|                printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
 2360|      0|                       f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx,
 2361|      0|                       is_split ? PARTITION_SPLIT : PARTITION_V, ts->msac.rng);
  ------------------
  |  Branch (2361:24): [True: 0, False: 0]
  ------------------
 2362|   489k|        }
 2363|       |
 2364|   489k|        assert(bl < BL_8X8);
  ------------------
  |  |  140|   489k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 489k]
  |  |  |  Branch (140:68): [Folded, False: 489k]
  |  |  ------------------
  ------------------
 2365|   489k|        if (is_split) {
  ------------------
  |  Branch (2365:13): [True: 250k, False: 239k]
  ------------------
 2366|   250k|            bp = PARTITION_SPLIT;
 2367|   250k|            if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 0))) return 1;
  ------------------
  |  |   51|   250k|    ((const EdgeNode*)((uintptr_t)(n) + ((const EdgeBranch*)(n))->split_offset[i]))
  ------------------
  |  Branch (2367:17): [True: 157, False: 250k]
  ------------------
 2368|   250k|            t->by += hsz;
 2369|   250k|            if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 2))) return 1;
  ------------------
  |  |   51|   250k|    ((const EdgeNode*)((uintptr_t)(n) + ((const EdgeBranch*)(n))->split_offset[i]))
  ------------------
  |  Branch (2369:17): [True: 203, False: 250k]
  ------------------
 2370|   250k|            t->by -= hsz;
 2371|   250k|        } else {
 2372|   239k|            bp = PARTITION_V;
 2373|   239k|            if (decode_b(t, bl, dav1d_block_sizes[bl][PARTITION_V][0],
  ------------------
  |  Branch (2373:17): [True: 59, False: 239k]
  ------------------
 2374|   239k|                         PARTITION_V, node->v[0]))
 2375|     59|                return -1;
 2376|   239k|        }
 2377|   489k|    }
 2378|       |
 2379|  2.33M|    if (t->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) {
  ------------------
  |  Branch (2379:9): [True: 2.33M, False: 18.4E]
  |  Branch (2379:39): [True: 983k, False: 1.35M]
  |  Branch (2379:64): [True: 420k, False: 933k]
  ------------------
 2380|  1.40M|#define set_ctx(rep_macro) \
 2381|  1.40M|        rep_macro(t->a->partition, bx8, dav1d_al_part_ctx[0][bl][bp]); \
 2382|  1.40M|        rep_macro(t->l.partition, by8, dav1d_al_part_ctx[1][bl][bp])
 2383|  1.40M|        case_set_upto16(ulog2(hsz));
  ------------------
  |  |   80|  1.40M|    switch (var) { \
  |  |   81|   717k|    case 0: set_ctx(set_ctx1); break; \
  |  |  ------------------
  |  |  |  | 2381|   717k|        rep_macro(t->a->partition, bx8, dav1d_al_part_ctx[0][bl][bp]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   81|   717k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|   717k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 2382|   717k|        rep_macro(t->l.partition, by8, dav1d_al_part_ctx[1][bl][bp])
  |  |  |  |  ------------------
  |  |  |  |  |  |   81|   717k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|   717k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (81:5): [True: 717k, False: 688k]
  |  |  ------------------
  |  |   82|   300k|    case 1: set_ctx(set_ctx2); break; \
  |  |  ------------------
  |  |  |  | 2381|   300k|        rep_macro(t->a->partition, bx8, dav1d_al_part_ctx[0][bl][bp]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   82|   300k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|   300k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 2382|   300k|        rep_macro(t->l.partition, by8, dav1d_al_part_ctx[1][bl][bp])
  |  |  |  |  ------------------
  |  |  |  |  |  |   82|   300k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|   300k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (82:5): [True: 300k, False: 1.10M]
  |  |  ------------------
  |  |   83|   223k|    case 2: set_ctx(set_ctx4); break; \
  |  |  ------------------
  |  |  |  | 2381|   223k|        rep_macro(t->a->partition, bx8, dav1d_al_part_ctx[0][bl][bp]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   83|   223k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|   223k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 2382|   223k|        rep_macro(t->l.partition, by8, dav1d_al_part_ctx[1][bl][bp])
  |  |  |  |  ------------------
  |  |  |  |  |  |   83|   223k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|   223k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (83:5): [True: 223k, False: 1.18M]
  |  |  ------------------
  |  |   84|   132k|    case 3: set_ctx(set_ctx8); break; \
  |  |  ------------------
  |  |  |  | 2381|   132k|        rep_macro(t->a->partition, bx8, dav1d_al_part_ctx[0][bl][bp]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|   132k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|   132k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 2382|   132k|        rep_macro(t->l.partition, by8, dav1d_al_part_ctx[1][bl][bp])
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|   132k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|   132k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (84:5): [True: 132k, False: 1.27M]
  |  |  ------------------
  |  |   85|  32.5k|    case 4: set_ctx(set_ctx16); break; \
  |  |  ------------------
  |  |  |  | 2381|  32.5k|        rep_macro(t->a->partition, bx8, dav1d_al_part_ctx[0][bl][bp]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  32.5k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  32.5k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  32.5k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  32.5k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 32.5k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 2382|  32.5k|        rep_macro(t->l.partition, by8, dav1d_al_part_ctx[1][bl][bp])
  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  32.5k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  32.5k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  32.5k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  32.5k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 32.5k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (85:5): [True: 32.5k, False: 1.37M]
  |  |  ------------------
  |  |   86|      0|    default: assert(0); \
  |  |  ------------------
  |  |  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, Folded]
  |  |  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (86:5): [True: 0, False: 1.40M]
  |  |  ------------------
  |  |   87|  1.40M|    }
  ------------------
 2384|  1.40M|#undef set_ctx
 2385|  1.40M|    }
 2386|       |
 2387|  2.33M|    return 0;
 2388|  2.33M|}
decode.c:decode_b:
  687|  3.00M|                    const enum EdgeFlags intra_edge_flags) {
  688|  3.00M|    Dav1dTileState *const ts = t->ts;
  689|  3.00M|    const Dav1dFrameContext *const f = t->f;
  690|  3.00M|    Av1Block b_mem, *const b = t->frame_thread.pass ?
  ------------------
  |  Branch (690:32): [True: 0, False: 3.00M]
  ------------------
  691|  3.00M|        &f->frame_thread.b[t->by * f->b4_stride + t->bx] : &b_mem;
  692|  3.00M|    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
  693|  3.00M|    const int bx4 = t->bx & 31, by4 = t->by & 31;
  694|  3.00M|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
  695|  3.00M|    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
  696|  3.00M|    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
  697|  3.00M|    const int bw4 = b_dim[0], bh4 = b_dim[1];
  698|  3.00M|    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
  699|  3.00M|    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
  700|  3.00M|    const int have_left = t->bx > ts->tiling.col_start;
  701|  3.00M|    const int have_top = t->by > ts->tiling.row_start;
  702|  3.00M|    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
  ------------------
  |  Branch (702:28): [True: 2.52M, False: 482k]
  ------------------
  703|  2.52M|                           (bw4 > ss_hor || t->bx & 1) &&
  ------------------
  |  Branch (703:29): [True: 2.39M, False: 127k]
  |  Branch (703:45): [True: 63.7k, False: 63.9k]
  ------------------
  704|  2.45M|                           (bh4 > ss_ver || t->by & 1);
  ------------------
  |  Branch (704:29): [True: 2.38M, False: 78.8k]
  |  Branch (704:45): [True: 39.4k, False: 39.4k]
  ------------------
  705|       |
  706|  3.00M|    if (t->frame_thread.pass == 2) {
  ------------------
  |  Branch (706:9): [True: 0, False: 3.00M]
  ------------------
  707|      0|        if (b->intra) {
  ------------------
  |  Branch (707:13): [True: 0, False: 0]
  ------------------
  708|      0|            f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
  709|       |
  710|      0|            const enum IntraPredMode y_mode_nofilt =
  711|      0|                b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
  ------------------
  |  Branch (711:17): [True: 0, False: 0]
  ------------------
  712|      0|#define set_ctx(rep_macro) \
  713|      0|            rep_macro(edge->mode, off, y_mode_nofilt); \
  714|      0|            rep_macro(edge->intra, off, 1)
  715|      0|            BlockContext *edge = t->a;
  716|      0|            for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) {
  ------------------
  |  Branch (716:40): [True: 0, False: 0]
  ------------------
  717|      0|                case_set(b_dim[2 + i]);
  ------------------
  |  |   70|      0|    switch (var) { \
  |  |   71|      0|    case 0: set_ctx(set_ctx1); break; \
  |  |  ------------------
  |  |  |  |  713|      0|            rep_macro(edge->mode, off, y_mode_nofilt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|      0|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|      0|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  714|      0|            rep_macro(edge->intra, off, 1)
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|      0|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|      0|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (71:5): [True: 0, False: 0]
  |  |  ------------------
  |  |   72|      0|    case 1: set_ctx(set_ctx2); break; \
  |  |  ------------------
  |  |  |  |  713|      0|            rep_macro(edge->mode, off, y_mode_nofilt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|      0|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|      0|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  714|      0|            rep_macro(edge->intra, off, 1)
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|      0|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|      0|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (72:5): [True: 0, False: 0]
  |  |  ------------------
  |  |   73|      0|    case 2: set_ctx(set_ctx4); break; \
  |  |  ------------------
  |  |  |  |  713|      0|            rep_macro(edge->mode, off, y_mode_nofilt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|      0|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|      0|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  714|      0|            rep_macro(edge->intra, off, 1)
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|      0|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|      0|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (73:5): [True: 0, False: 0]
  |  |  ------------------
  |  |   74|      0|    case 3: set_ctx(set_ctx8); break; \
  |  |  ------------------
  |  |  |  |  713|      0|            rep_macro(edge->mode, off, y_mode_nofilt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|      0|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|      0|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  714|      0|            rep_macro(edge->intra, off, 1)
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|      0|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|      0|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (74:5): [True: 0, False: 0]
  |  |  ------------------
  |  |   75|      0|    case 4: set_ctx(set_ctx16); break; \
  |  |  ------------------
  |  |  |  |  713|      0|            rep_macro(edge->mode, off, y_mode_nofilt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|      0|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|      0|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|      0|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|      0|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 0]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  714|      0|            rep_macro(edge->intra, off, 1)
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|      0|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|      0|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|      0|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|      0|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 0]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (75:5): [True: 0, False: 0]
  |  |  ------------------
  |  |   76|      0|    case 5: set_ctx(set_ctx32); break; \
  |  |  ------------------
  |  |  |  |  713|      0|            rep_macro(edge->mode, off, y_mode_nofilt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|      0|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|      0|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|      0|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|      0|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 0]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  714|      0|            rep_macro(edge->intra, off, 1)
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|      0|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|      0|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|      0|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|      0|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 0]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (76:5): [True: 0, False: 0]
  |  |  ------------------
  |  |   77|      0|    default: assert(0); \
  |  |  ------------------
  |  |  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, Folded]
  |  |  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (77:5): [True: 0, False: 0]
  |  |  ------------------
  |  |   78|      0|    }
  ------------------
  718|      0|            }
  719|      0|#undef set_ctx
  720|      0|            if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
  ------------------
  |  |   36|      0|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  721|      0|                refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
  722|      0|                for (int x = 0; x < bw4; x++) {
  ------------------
  |  Branch (722:33): [True: 0, False: 0]
  ------------------
  723|      0|                    r[x].ref.ref[0] = 0;
  724|      0|                    r[x].bs = bs;
  725|      0|                }
  726|      0|                refmvs_block *const *rr = &t->rt.r[(t->by & 31) + 5];
  727|      0|                for (int y = 0; y < bh4 - 1; y++) {
  ------------------
  |  Branch (727:33): [True: 0, False: 0]
  ------------------
  728|      0|                    rr[y][t->bx + bw4 - 1].ref.ref[0] = 0;
  729|      0|                    rr[y][t->bx + bw4 - 1].bs = bs;
  730|      0|                }
  731|      0|            }
  732|       |
  733|      0|            if (has_chroma) {
  ------------------
  |  Branch (733:17): [True: 0, False: 0]
  ------------------
  734|      0|                uint8_t uv_mode = b->uv_mode;
  735|      0|                dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], uv_mode);
  736|      0|                dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], uv_mode);
  737|      0|            }
  738|      0|        } else {
  739|      0|            if (IS_INTER_OR_SWITCH(f->frame_hdr) /* not intrabc */ &&
  ------------------
  |  |   36|      0|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  740|      0|                b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP)
  ------------------
  |  Branch (740:17): [True: 0, False: 0]
  |  Branch (740:52): [True: 0, False: 0]
  ------------------
  741|      0|            {
  742|      0|                if (b->matrix[0] == INT16_MIN) {
  ------------------
  |  Branch (742:21): [True: 0, False: 0]
  ------------------
  743|      0|                    t->warpmv.type = DAV1D_WM_TYPE_IDENTITY;
  744|      0|                } else {
  745|      0|                    t->warpmv.type = DAV1D_WM_TYPE_AFFINE;
  746|      0|                    t->warpmv.matrix[2] = b->matrix[0] + 0x10000;
  747|      0|                    t->warpmv.matrix[3] = b->matrix[1];
  748|      0|                    t->warpmv.matrix[4] = b->matrix[2];
  749|      0|                    t->warpmv.matrix[5] = b->matrix[3] + 0x10000;
  750|      0|                    dav1d_set_affine_mv2d(bw4, bh4, b->mv2d, &t->warpmv,
  751|      0|                                          t->bx, t->by);
  752|      0|                    dav1d_get_shear_params(&t->warpmv);
  753|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  754|      0|                    if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|      0|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 0]
  |  |  ------------------
  |  |   35|      0|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|      0|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  755|      0|                        printf("[ %c%x %c%x %c%x\n  %c%x %c%x %c%x ]\n"
  756|      0|                               "alpha=%c%x, beta=%c%x, gamma=%c%x, delta=%c%x, mv=y:%d,x:%d\n",
  757|      0|                               signabs(t->warpmv.matrix[0]),
  ------------------
  |  |  753|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (753:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  758|      0|                               signabs(t->warpmv.matrix[1]),
  ------------------
  |  |  753|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (753:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  759|      0|                               signabs(t->warpmv.matrix[2]),
  ------------------
  |  |  753|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (753:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  760|      0|                               signabs(t->warpmv.matrix[3]),
  ------------------
  |  |  753|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (753:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  761|      0|                               signabs(t->warpmv.matrix[4]),
  ------------------
  |  |  753|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (753:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  762|      0|                               signabs(t->warpmv.matrix[5]),
  ------------------
  |  |  753|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (753:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  763|      0|                               signabs(t->warpmv.u.p.alpha),
  ------------------
  |  |  753|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (753:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  764|      0|                               signabs(t->warpmv.u.p.beta),
  ------------------
  |  |  753|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (753:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  765|      0|                               signabs(t->warpmv.u.p.gamma),
  ------------------
  |  |  753|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (753:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  766|      0|                               signabs(t->warpmv.u.p.delta),
  ------------------
  |  |  753|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (753:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  767|      0|                               b->mv2d.y, b->mv2d.x);
  768|      0|#undef signabs
  769|      0|                }
  770|      0|            }
  771|      0|            if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
  ------------------
  |  Branch (771:17): [True: 0, False: 0]
  ------------------
  772|       |
  773|      0|            const uint8_t *const filter = dav1d_filter_dir[b->filter2d];
  774|      0|            BlockContext *edge = t->a;
  775|      0|            for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) {
  ------------------
  |  Branch (775:40): [True: 0, False: 0]
  ------------------
  776|      0|#define set_ctx(rep_macro) \
  777|      0|                rep_macro(edge->filter[0], off, filter[0]); \
  778|      0|                rep_macro(edge->filter[1], off, filter[1]); \
  779|      0|                rep_macro(edge->intra, off, 0)
  780|      0|                case_set(b_dim[2 + i]);
  ------------------
  |  |   70|      0|    switch (var) { \
  |  |   71|      0|    case 0: set_ctx(set_ctx1); break; \
  |  |  ------------------
  |  |  |  |  777|      0|                rep_macro(edge->filter[0], off, filter[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|      0|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|      0|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  778|      0|                rep_macro(edge->filter[1], off, filter[1]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|      0|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|      0|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  779|      0|                rep_macro(edge->intra, off, 0)
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|      0|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|      0|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (71:5): [True: 0, False: 0]
  |  |  ------------------
  |  |   72|      0|    case 1: set_ctx(set_ctx2); break; \
  |  |  ------------------
  |  |  |  |  777|      0|                rep_macro(edge->filter[0], off, filter[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|      0|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|      0|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  778|      0|                rep_macro(edge->filter[1], off, filter[1]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|      0|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|      0|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  779|      0|                rep_macro(edge->intra, off, 0)
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|      0|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|      0|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (72:5): [True: 0, False: 0]
  |  |  ------------------
  |  |   73|      0|    case 2: set_ctx(set_ctx4); break; \
  |  |  ------------------
  |  |  |  |  777|      0|                rep_macro(edge->filter[0], off, filter[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|      0|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|      0|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  778|      0|                rep_macro(edge->filter[1], off, filter[1]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|      0|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|      0|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  779|      0|                rep_macro(edge->intra, off, 0)
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|      0|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|      0|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (73:5): [True: 0, False: 0]
  |  |  ------------------
  |  |   74|      0|    case 3: set_ctx(set_ctx8); break; \
  |  |  ------------------
  |  |  |  |  777|      0|                rep_macro(edge->filter[0], off, filter[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|      0|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|      0|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  778|      0|                rep_macro(edge->filter[1], off, filter[1]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|      0|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|      0|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  779|      0|                rep_macro(edge->intra, off, 0)
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|      0|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|      0|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (74:5): [True: 0, False: 0]
  |  |  ------------------
  |  |   75|      0|    case 4: set_ctx(set_ctx16); break; \
  |  |  ------------------
  |  |  |  |  777|      0|                rep_macro(edge->filter[0], off, filter[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|      0|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|      0|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|      0|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|      0|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 0]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  778|      0|                rep_macro(edge->filter[1], off, filter[1]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|      0|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|      0|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|      0|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|      0|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 0]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  779|      0|                rep_macro(edge->intra, off, 0)
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|      0|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|      0|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|      0|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|      0|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 0]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (75:5): [True: 0, False: 0]
  |  |  ------------------
  |  |   76|      0|    case 5: set_ctx(set_ctx32); break; \
  |  |  ------------------
  |  |  |  |  777|      0|                rep_macro(edge->filter[0], off, filter[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|      0|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|      0|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|      0|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|      0|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 0]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  778|      0|                rep_macro(edge->filter[1], off, filter[1]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|      0|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|      0|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|      0|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|      0|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 0]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  779|      0|                rep_macro(edge->intra, off, 0)
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|      0|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|      0|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|      0|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|      0|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 0]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (76:5): [True: 0, False: 0]
  |  |  ------------------
  |  |   77|      0|    default: assert(0); \
  |  |  ------------------
  |  |  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, Folded]
  |  |  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (77:5): [True: 0, False: 0]
  |  |  ------------------
  |  |   78|      0|    }
  ------------------
  781|      0|#undef set_ctx
  782|      0|            }
  783|       |
  784|      0|            if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
  ------------------
  |  |   36|      0|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  785|      0|                refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
  786|      0|                for (int x = 0; x < bw4; x++) {
  ------------------
  |  Branch (786:33): [True: 0, False: 0]
  ------------------
  787|      0|                    r[x].ref.ref[0] = b->ref[0] + 1;
  788|      0|                    r[x].mv.mv[0] = b->mv[0];
  789|      0|                    r[x].bs = bs;
  790|      0|                }
  791|      0|                refmvs_block *const *rr = &t->rt.r[(t->by & 31) + 5];
  792|      0|                for (int y = 0; y < bh4 - 1; y++) {
  ------------------
  |  Branch (792:33): [True: 0, False: 0]
  ------------------
  793|      0|                    rr[y][t->bx + bw4 - 1].ref.ref[0] = b->ref[0] + 1;
  794|      0|                    rr[y][t->bx + bw4 - 1].mv.mv[0] = b->mv[0];
  795|      0|                    rr[y][t->bx + bw4 - 1].bs = bs;
  796|      0|                }
  797|      0|            }
  798|       |
  799|      0|            if (has_chroma) {
  ------------------
  |  Branch (799:17): [True: 0, False: 0]
  ------------------
  800|      0|                dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], DC_PRED);
  801|      0|                dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], DC_PRED);
  802|      0|            }
  803|      0|        }
  804|      0|        return 0;
  805|      0|    }
  806|       |
  807|  3.00M|    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
  808|       |
  809|  3.00M|    b->bl = bl;
  810|  3.00M|    b->bp = bp;
  811|  3.00M|    b->bs = bs;
  812|       |
  813|  3.00M|    const Dav1dSegmentationData *seg = NULL;
  814|       |
  815|       |    // segment_id (if seg_feature for skip/ref/gmv is enabled)
  816|  3.00M|    int seg_pred = 0;
  817|  3.00M|    if (f->frame_hdr->segmentation.enabled) {
  ------------------
  |  Branch (817:9): [True: 1.14M, False: 1.86M]
  ------------------
  818|  1.14M|        if (!f->frame_hdr->segmentation.update_map) {
  ------------------
  |  Branch (818:13): [True: 11.5k, False: 1.13M]
  ------------------
  819|  11.5k|            if (f->prev_segmap) {
  ------------------
  |  Branch (819:17): [True: 29, False: 11.5k]
  ------------------
  820|     29|                unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4,
  821|     29|                                                       f->prev_segmap,
  822|     29|                                                       f->b4_stride);
  823|     29|                if (seg_id >= 8) return -1;
  ------------------
  |  Branch (823:21): [True: 0, False: 29]
  ------------------
  824|     29|                b->seg_id = seg_id;
  825|  11.5k|            } else {
  826|  11.5k|                b->seg_id = 0;
  827|  11.5k|            }
  828|  11.5k|            seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
  829|  1.13M|        } else if (f->frame_hdr->segmentation.seg_data.preskip) {
  ------------------
  |  Branch (829:20): [True: 1.07M, False: 57.7k]
  ------------------
  830|  1.07M|            if (f->frame_hdr->segmentation.temporal &&
  ------------------
  |  Branch (830:17): [True: 1.06k, False: 1.07M]
  ------------------
  831|  1.06k|                (seg_pred = dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  1.06k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  |  Branch (831:17): [True: 581, False: 479]
  ------------------
  832|  1.06k|                                ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
  833|  1.06k|                                t->l.seg_pred[by4]])))
  834|    581|            {
  835|       |                // temporal predicted seg_id
  836|    581|                if (f->prev_segmap) {
  ------------------
  |  Branch (836:21): [True: 20, False: 561]
  ------------------
  837|     20|                    unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx,
  838|     20|                                                           w4, h4,
  839|     20|                                                           f->prev_segmap,
  840|     20|                                                           f->b4_stride);
  841|     20|                    if (seg_id >= 8) return -1;
  ------------------
  |  Branch (841:25): [True: 0, False: 20]
  ------------------
  842|     20|                    b->seg_id = seg_id;
  843|    561|                } else {
  844|    561|                    b->seg_id = 0;
  845|    561|                }
  846|  1.07M|            } else {
  847|  1.07M|                int seg_ctx;
  848|  1.07M|                const unsigned pred_seg_id =
  849|  1.07M|                    get_cur_frame_segid(t->by, t->bx, have_top, have_left,
  850|  1.07M|                                        &seg_ctx, f->cur_segmap, f->b4_stride);
  851|  1.07M|                const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
  ------------------
  |  |   48|  1.07M|#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
  ------------------
  852|  1.07M|                                          ts->cdf.m.seg_id[seg_ctx],
  853|  1.07M|                                          DAV1D_MAX_SEGMENTS - 1);
  ------------------
  |  |   43|  1.07M|#define DAV1D_MAX_SEGMENTS 8
  ------------------
  854|  1.07M|                const unsigned last_active_seg_id =
  855|  1.07M|                    f->frame_hdr->segmentation.seg_data.last_active_segid;
  856|  1.07M|                b->seg_id = neg_deinterleave(diff, pred_seg_id,
  857|  1.07M|                                             last_active_seg_id + 1);
  858|  1.07M|                if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error?
  ------------------
  |  Branch (858:21): [True: 254k, False: 819k]
  ------------------
  859|  1.07M|                if (b->seg_id >= DAV1D_MAX_SEGMENTS) b->seg_id = 0; // error?
  ------------------
  |  |   43|  1.07M|#define DAV1D_MAX_SEGMENTS 8
  ------------------
  |  Branch (859:21): [True: 0, False: 1.07M]
  ------------------
  860|  1.07M|            }
  861|       |
  862|  1.07M|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  1.07M|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 1.07M]
  |  |  ------------------
  |  |   35|  1.07M|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  1.07M|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  863|      0|                printf("Post-segid[preskip;%d]: r=%d\n",
  864|      0|                       b->seg_id, ts->msac.rng);
  865|       |
  866|  1.07M|            seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
  867|  1.07M|        }
  868|  1.86M|    } else {
  869|  1.86M|        b->seg_id = 0;
  870|  1.86M|    }
  871|       |
  872|       |    // skip_mode
  873|  3.00M|    if ((!seg || (!seg->globalmv && seg->ref == -1 && !seg->skip)) &&
  ------------------
  |  Branch (873:10): [True: 1.91M, False: 1.09M]
  |  Branch (873:19): [True: 748k, False: 347k]
  |  Branch (873:37): [True: 470k, False: 277k]
  |  Branch (873:55): [True: 389k, False: 80.9k]
  ------------------
  874|  2.28M|        f->frame_hdr->skip_mode_enabled && imin(bw4, bh4) > 1)
  ------------------
  |  Branch (874:9): [True: 3.50k, False: 2.27M]
  |  Branch (874:44): [True: 2.42k, False: 1.07k]
  ------------------
  875|  2.42k|    {
  876|  2.42k|        const int smctx = t->a->skip_mode[bx4] + t->l.skip_mode[by4];
  877|  2.42k|        b->skip_mode = dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  2.42k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  878|  2.42k|                           ts->cdf.m.skip_mode[smctx]);
  879|  2.42k|        if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  2.42k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 2.42k]
  |  |  ------------------
  |  |   35|  2.42k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  2.42k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  880|      0|            printf("Post-skipmode[%d]: r=%d\n", b->skip_mode, ts->msac.rng);
  881|  3.00M|    } else {
  882|  3.00M|        b->skip_mode = 0;
  883|  3.00M|    }
  884|       |
  885|       |    // skip
  886|  3.00M|    if (b->skip_mode || (seg && seg->skip)) {
  ------------------
  |  Branch (886:9): [True: 25.2k, False: 2.98M]
  |  Branch (886:26): [True: 1.08M, False: 1.89M]
  |  Branch (886:33): [True: 374k, False: 711k]
  ------------------
  887|   375k|        b->skip = 1;
  888|  2.63M|    } else {
  889|  2.63M|        const int sctx = t->a->skip[bx4] + t->l.skip[by4];
  890|  2.63M|        b->skip = dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.skip[sctx]);
  ------------------
  |  |   52|  2.63M|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  891|  2.63M|        if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  2.63M|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 2.63M]
  |  |  ------------------
  |  |   35|  2.63M|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  2.63M|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  892|      0|            printf("Post-skip[%d]: r=%d\n", b->skip, ts->msac.rng);
  893|  2.63M|    }
  894|       |
  895|       |    // segment_id
  896|  3.00M|    if (f->frame_hdr->segmentation.enabled &&
  ------------------
  |  Branch (896:9): [True: 1.14M, False: 1.85M]
  ------------------
  897|  1.14M|        f->frame_hdr->segmentation.update_map &&
  ------------------
  |  Branch (897:9): [True: 1.13M, False: 11.9k]
  ------------------
  898|  1.13M|        !f->frame_hdr->segmentation.seg_data.preskip)
  ------------------
  |  Branch (898:9): [True: 57.1k, False: 1.07M]
  ------------------
  899|  57.1k|    {
  900|  57.1k|        if (!b->skip && f->frame_hdr->segmentation.temporal &&
  ------------------
  |  Branch (900:13): [True: 30.6k, False: 26.5k]
  |  Branch (900:25): [True: 780, False: 29.8k]
  ------------------
  901|    780|            (seg_pred = dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|    780|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  |  Branch (901:13): [True: 377, False: 403]
  ------------------
  902|    780|                            ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
  903|    780|                            t->l.seg_pred[by4]])))
  904|    377|        {
  905|       |            // temporal predicted seg_id
  906|    377|            if (f->prev_segmap) {
  ------------------
  |  Branch (906:17): [True: 13, False: 364]
  ------------------
  907|     13|                unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4,
  908|     13|                                                       f->prev_segmap,
  909|     13|                                                       f->b4_stride);
  910|     13|                if (seg_id >= 8) return -1;
  ------------------
  |  Branch (910:21): [True: 0, False: 13]
  ------------------
  911|     13|                b->seg_id = seg_id;
  912|    364|            } else {
  913|    364|                b->seg_id = 0;
  914|    364|            }
  915|  56.8k|        } else {
  916|  56.8k|            int seg_ctx;
  917|  56.8k|            const unsigned pred_seg_id =
  918|  56.8k|                get_cur_frame_segid(t->by, t->bx, have_top, have_left,
  919|  56.8k|                                    &seg_ctx, f->cur_segmap, f->b4_stride);
  920|  56.8k|            if (b->skip) {
  ------------------
  |  Branch (920:17): [True: 26.7k, False: 30.1k]
  ------------------
  921|  26.7k|                b->seg_id = pred_seg_id;
  922|  30.1k|            } else {
  923|  30.1k|                const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
  ------------------
  |  |   48|  30.1k|#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
  ------------------
  924|  30.1k|                                          ts->cdf.m.seg_id[seg_ctx],
  925|  30.1k|                                          DAV1D_MAX_SEGMENTS - 1);
  ------------------
  |  |   43|  30.1k|#define DAV1D_MAX_SEGMENTS 8
  ------------------
  926|  30.1k|                const unsigned last_active_seg_id =
  927|  30.1k|                    f->frame_hdr->segmentation.seg_data.last_active_segid;
  928|  30.1k|                b->seg_id = neg_deinterleave(diff, pred_seg_id,
  929|  30.1k|                                             last_active_seg_id + 1);
  930|  30.1k|                if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error?
  ------------------
  |  Branch (930:21): [True: 3.53k, False: 26.5k]
  ------------------
  931|  30.1k|            }
  932|  56.8k|            if (b->seg_id >= DAV1D_MAX_SEGMENTS) b->seg_id = 0; // error?
  ------------------
  |  |   43|  56.8k|#define DAV1D_MAX_SEGMENTS 8
  ------------------
  |  Branch (932:17): [True: 0, False: 56.8k]
  ------------------
  933|  56.8k|        }
  934|       |
  935|  57.1k|        seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
  936|       |
  937|  57.1k|        if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  57.1k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 57.1k]
  |  |  ------------------
  |  |   35|  57.1k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  57.1k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  938|      0|            printf("Post-segid[postskip;%d]: r=%d\n",
  939|      0|                   b->seg_id, ts->msac.rng);
  940|  57.1k|    }
  941|       |
  942|       |    // cdef index
  943|  3.00M|    if (!b->skip) {
  ------------------
  |  Branch (943:9): [True: 851k, False: 2.15M]
  ------------------
  944|   851k|        const int idx = f->seq_hdr->sb128 ? ((t->bx & 16) >> 4) +
  ------------------
  |  Branch (944:25): [True: 337k, False: 513k]
  ------------------
  945|   513k|                                           ((t->by & 16) >> 3) : 0;
  946|   851k|        if (t->cur_sb_cdef_idx_ptr[idx] == -1) {
  ------------------
  |  Branch (946:13): [True: 249k, False: 602k]
  ------------------
  947|   249k|            const int v = dav1d_msac_decode_bools(&ts->msac,
  948|   249k|                              f->frame_hdr->cdef.n_bits);
  949|   249k|            t->cur_sb_cdef_idx_ptr[idx] = v;
  950|   249k|            if (bw4 > 16) t->cur_sb_cdef_idx_ptr[idx + 1] = v;
  ------------------
  |  Branch (950:17): [True: 14.5k, False: 234k]
  ------------------
  951|   249k|            if (bh4 > 16) t->cur_sb_cdef_idx_ptr[idx + 2] = v;
  ------------------
  |  Branch (951:17): [True: 11.2k, False: 237k]
  ------------------
  952|   249k|            if (bw4 == 32 && bh4 == 32) t->cur_sb_cdef_idx_ptr[idx + 3] = v;
  ------------------
  |  Branch (952:17): [True: 14.5k, False: 234k]
  |  Branch (952:30): [True: 2.02k, False: 12.4k]
  ------------------
  953|       |
  954|   249k|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|   249k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 249k]
  |  |  ------------------
  |  |   35|   249k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   249k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  955|      0|                printf("Post-cdef_idx[%d]: r=%d\n",
  956|      0|                        *t->cur_sb_cdef_idx_ptr, ts->msac.rng);
  957|   249k|        }
  958|   851k|    }
  959|       |
  960|       |    // delta-q/lf
  961|  3.00M|    if (!((t->bx | t->by) & (31 >> !f->seq_hdr->sb128))) {
  ------------------
  |  Branch (961:9): [True: 325k, False: 2.68M]
  ------------------
  962|   325k|        const int prev_qidx = ts->last_qidx;
  963|   325k|        const int have_delta_q = f->frame_hdr->delta.q.present &&
  ------------------
  |  Branch (963:34): [True: 152k, False: 173k]
  ------------------
  964|   152k|            (bs != (f->seq_hdr->sb128 ? BS_128x128 : BS_64x64) || !b->skip);
  ------------------
  |  Branch (964:14): [True: 146k, False: 5.77k]
  |  Branch (964:21): [True: 44.8k, False: 107k]
  |  Branch (964:67): [True: 4.09k, False: 1.68k]
  ------------------
  965|       |
  966|   325k|        uint32_t prev_delta_lf = ts->last_delta_lf.u32;
  967|       |
  968|   325k|        if (have_delta_q) {
  ------------------
  |  Branch (968:13): [True: 150k, False: 174k]
  ------------------
  969|   150k|            int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac,
  ------------------
  |  |   47|   150k|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  ------------------
  970|   150k|                                                          ts->cdf.m.delta_q, 3);
  971|   150k|            if (delta_q == 3) {
  ------------------
  |  Branch (971:17): [True: 8.62k, False: 142k]
  ------------------
  972|  8.62k|                const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
  973|  8.62k|                delta_q = dav1d_msac_decode_bools(&ts->msac, n_bits) +
  974|  8.62k|                          1 + (1 << n_bits);
  975|  8.62k|            }
  976|   150k|            if (delta_q) {
  ------------------
  |  Branch (976:17): [True: 29.5k, False: 121k]
  ------------------
  977|  29.5k|                if (dav1d_msac_decode_bool_equi(&ts->msac)) delta_q = -delta_q;
  ------------------
  |  |   53|  29.5k|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
  |  Branch (977:21): [True: 18.2k, False: 11.3k]
  ------------------
  978|  29.5k|                delta_q *= 1 << f->frame_hdr->delta.q.res_log2;
  979|  29.5k|            }
  980|   150k|            ts->last_qidx = iclip(ts->last_qidx + delta_q, 1, 255);
  981|   150k|            if (have_delta_q && DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|   150k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 150k]
  |  |  ------------------
  |  |   35|   150k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   150k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  |  Branch (981:17): [True: 150k, False: 2]
  ------------------
  982|      0|                printf("Post-delta_q[%d->%d]: r=%d\n",
  983|      0|                       delta_q, ts->last_qidx, ts->msac.rng);
  984|       |
  985|   150k|            if (f->frame_hdr->delta.lf.present) {
  ------------------
  |  Branch (985:17): [True: 73.8k, False: 77.0k]
  ------------------
  986|  73.8k|                const int n_lfs = f->frame_hdr->delta.lf.multi ?
  ------------------
  |  Branch (986:35): [True: 58.9k, False: 14.8k]
  ------------------
  987|  58.9k|                    f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 4 : 2 : 1;
  ------------------
  |  Branch (987:21): [True: 38.0k, False: 20.9k]
  ------------------
  988|       |
  989|   282k|                for (int i = 0; i < n_lfs; i++) {
  ------------------
  |  Branch (989:33): [True: 208k, False: 73.8k]
  ------------------
  990|   208k|                    int delta_lf = dav1d_msac_decode_symbol_adapt4(&ts->msac,
  ------------------
  |  |   47|   208k|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  ------------------
  991|   208k|                        ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 3);
  992|   208k|                    if (delta_lf == 3) {
  ------------------
  |  Branch (992:25): [True: 15.3k, False: 192k]
  ------------------
  993|  15.3k|                        const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
  994|  15.3k|                        delta_lf = dav1d_msac_decode_bools(&ts->msac, n_bits) +
  995|  15.3k|                                   1 + (1 << n_bits);
  996|  15.3k|                    }
  997|   208k|                    if (delta_lf) {
  ------------------
  |  Branch (997:25): [True: 43.4k, False: 164k]
  ------------------
  998|  43.4k|                        if (dav1d_msac_decode_bool_equi(&ts->msac))
  ------------------
  |  |   53|  43.4k|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
  |  Branch (998:29): [True: 30.0k, False: 13.4k]
  ------------------
  999|  30.0k|                            delta_lf = -delta_lf;
 1000|  43.4k|                        delta_lf *= 1 << f->frame_hdr->delta.lf.res_log2;
 1001|  43.4k|                    }
 1002|   208k|                    ts->last_delta_lf.i8[i] =
 1003|   208k|                        iclip(ts->last_delta_lf.i8[i] + delta_lf, -63, 63);
 1004|   208k|                    if (have_delta_q && DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|   208k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 208k]
  |  |  ------------------
  |  |   35|   208k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   208k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  |  Branch (1004:25): [True: 208k, False: 18.4E]
  ------------------
 1005|      0|                        printf("Post-delta_lf[%d:%d]: r=%d\n", i, delta_lf,
 1006|      0|                               ts->msac.rng);
 1007|   208k|                }
 1008|  73.8k|            }
 1009|   150k|        }
 1010|   325k|        if (ts->last_qidx == f->frame_hdr->quant.yac) {
  ------------------
  |  Branch (1010:13): [True: 217k, False: 108k]
  ------------------
 1011|       |            // assign frame-wide q values to this sb
 1012|   217k|            ts->dq = f->dq;
 1013|   217k|        } else if (ts->last_qidx != prev_qidx) {
  ------------------
  |  Branch (1013:20): [True: 19.4k, False: 88.5k]
  ------------------
 1014|       |            // find sb-specific quant parameters
 1015|  19.4k|            init_quant_tables(f->seq_hdr, f->frame_hdr, ts->last_qidx, ts->dqmem);
 1016|  19.4k|            ts->dq = ts->dqmem;
 1017|  19.4k|        }
 1018|   325k|        if (!ts->last_delta_lf.u32) {
  ------------------
  |  Branch (1018:13): [True: 260k, False: 65.0k]
  ------------------
 1019|       |            // assign frame-wide lf values to this sb
 1020|   260k|            ts->lflvl = f->lf.lvl;
 1021|   260k|        } else if (ts->last_delta_lf.u32 != prev_delta_lf) {
  ------------------
  |  Branch (1021:20): [True: 23.3k, False: 41.7k]
  ------------------
 1022|       |            // find sb-specific lf lvl parameters
 1023|  23.3k|            ts->lflvl = ts->lflvlmem;
 1024|  23.3k|            dav1d_calc_lf_values(ts->lflvlmem, f->frame_hdr, ts->last_delta_lf.i8);
 1025|  23.3k|        }
 1026|   325k|    }
 1027|       |
 1028|  3.00M|    if (b->skip_mode) {
  ------------------
  |  Branch (1028:9): [True: 356, False: 3.00M]
  ------------------
 1029|    356|        b->intra = 0;
 1030|  3.00M|    } else if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
  ------------------
  |  |   36|  3.00M|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 109k, False: 2.89M]
  |  |  ------------------
  ------------------
 1031|   109k|        if (seg && (seg->ref >= 0 || seg->globalmv)) {
  ------------------
  |  Branch (1031:13): [True: 15.6k, False: 93.9k]
  |  Branch (1031:21): [True: 1.07k, False: 14.5k]
  |  Branch (1031:38): [True: 9.57k, False: 4.99k]
  ------------------
 1032|  10.6k|            b->intra = !seg->ref;
 1033|  98.9k|        } else {
 1034|  98.9k|            const int ictx = get_intra_ctx(t->a, &t->l, by4, bx4,
 1035|  98.9k|                                           have_top, have_left);
 1036|  98.9k|            b->intra = !dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  98.9k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1037|  98.9k|                            ts->cdf.m.intra[ictx]);
 1038|  98.9k|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  98.9k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 98.9k]
  |  |  ------------------
  |  |   35|  98.9k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  98.9k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1039|      0|                printf("Post-intra[%d]: r=%d\n", b->intra, ts->msac.rng);
 1040|  98.9k|        }
 1041|  2.89M|    } else if (f->frame_hdr->allow_intrabc) {
  ------------------
  |  Branch (1041:16): [True: 1.33M, False: 1.56M]
  ------------------
 1042|  1.33M|        b->intra = !dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intrabc);
  ------------------
  |  |   52|  1.33M|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1043|  1.33M|        if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  1.33M|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 1.33M]
  |  |  ------------------
  |  |   35|  1.33M|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  1.33M|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1044|      0|            printf("Post-intrabcflag[%d]: r=%d\n", b->intra, ts->msac.rng);
 1045|  1.56M|    } else {
 1046|  1.56M|        b->intra = 1;
 1047|  1.56M|    }
 1048|       |
 1049|       |    // intra/inter-specific stuff
 1050|  3.00M|    if (b->intra) {
  ------------------
  |  Branch (1050:9): [True: 1.95M, False: 1.05M]
  ------------------
 1051|  1.95M|        uint16_t *const ymode_cdf = IS_INTER_OR_SWITCH(f->frame_hdr) ?
  ------------------
  |  |   36|  1.95M|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 24.1k, False: 1.92M]
  |  |  ------------------
  ------------------
 1052|  24.1k|            ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] :
 1053|  1.95M|            ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
 1054|  1.92M|                        [dav1d_intra_mode_context[t->l.mode[by4]]];
 1055|  1.95M|        b->y_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, ymode_cdf,
  ------------------
  |  |   57|  1.95M|#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
  ------------------
 1056|  1.95M|                                                     N_INTRA_PRED_MODES - 1);
 1057|  1.95M|        if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  1.95M|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 1.95M]
  |  |  ------------------
  |  |   35|  1.95M|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  1.95M|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1058|      0|            printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng);
 1059|       |
 1060|       |        // angle delta
 1061|  1.95M|        if (b_dim[2] + b_dim[3] >= 2 && b->y_mode >= VERT_PRED &&
  ------------------
  |  Branch (1061:13): [True: 966k, False: 986k]
  |  Branch (1061:41): [True: 544k, False: 422k]
  ------------------
 1062|   544k|            b->y_mode <= VERT_LEFT_PRED)
  ------------------
  |  Branch (1062:13): [True: 278k, False: 265k]
  ------------------
 1063|   278k|        {
 1064|   278k|            uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED];
 1065|   278k|            const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
  ------------------
  |  |   48|   278k|#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
  ------------------
 1066|   278k|            b->y_angle = angle - 3;
 1067|  1.67M|        } else {
 1068|  1.67M|            b->y_angle = 0;
 1069|  1.67M|        }
 1070|       |
 1071|  1.95M|        if (has_chroma) {
  ------------------
  |  Branch (1071:13): [True: 1.49M, False: 456k]
  ------------------
 1072|  1.49M|            const int cfl_allowed = f->frame_hdr->segmentation.lossless[b->seg_id] ?
  ------------------
  |  Branch (1072:37): [True: 391k, False: 1.10M]
  ------------------
 1073|  1.10M|                cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs));
  ------------------
  |  Branch (1073:17): [True: 339k, False: 51.9k]
  |  Branch (1073:30): [True: 332k, False: 6.64k]
  ------------------
 1074|  1.49M|            uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode];
 1075|  1.49M|            b->uv_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, uvmode_cdf,
  ------------------
  |  |   57|  1.49M|#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
  ------------------
 1076|  1.49M|                             N_UV_INTRA_PRED_MODES - 1 - !cfl_allowed);
 1077|  1.49M|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  1.49M|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 1.49M]
  |  |  ------------------
  |  |   35|  1.49M|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  1.49M|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1078|      0|                printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
 1079|       |
 1080|  1.49M|            b->uv_angle = 0;
 1081|  1.49M|            if (b->uv_mode == CFL_PRED) {
  ------------------
  |  Branch (1081:17): [True: 797k, False: 698k]
  ------------------
 1082|   797k|#define SIGN(a) (!!(a) + ((a) > 0))
 1083|   797k|                const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac,
  ------------------
  |  |   48|   797k|#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
  ------------------
 1084|   797k|                                     ts->cdf.m.cfl_sign, 7) + 1;
 1085|   797k|                const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3;
 1086|   797k|                assert(sign_u == sign / 3);
  ------------------
  |  |  140|   797k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 797k]
  |  |  |  Branch (140:68): [Folded, False: 797k]
  |  |  ------------------
  ------------------
 1087|   797k|                if (sign_u) {
  ------------------
  |  Branch (1087:21): [True: 784k, False: 13.5k]
  ------------------
 1088|   784k|                    const int ctx = (sign_u == 2) * 3 + sign_v;
 1089|   784k|                    b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
  ------------------
  |  |   57|   784k|#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
  ------------------
 1090|   784k|                                          ts->cdf.m.cfl_alpha[ctx], 15) + 1;
 1091|   784k|                    if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0];
  ------------------
  |  Branch (1091:25): [True: 106k, False: 677k]
  ------------------
 1092|   784k|                } else {
 1093|  13.5k|                    b->cfl_alpha[0] = 0;
 1094|  13.5k|                }
 1095|   797k|                if (sign_v) {
  ------------------
  |  Branch (1095:21): [True: 745k, False: 52.8k]
  ------------------
 1096|   745k|                    const int ctx = (sign_v == 2) * 3 + sign_u;
 1097|   745k|                    b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
  ------------------
  |  |   57|   745k|#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
  ------------------
 1098|   745k|                                          ts->cdf.m.cfl_alpha[ctx], 15) + 1;
 1099|   745k|                    if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1];
  ------------------
  |  Branch (1099:25): [True: 44.3k, False: 700k]
  ------------------
 1100|   745k|                } else {
 1101|  52.8k|                    b->cfl_alpha[1] = 0;
 1102|  52.8k|                }
 1103|   797k|#undef SIGN
 1104|   797k|                if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|   797k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 797k]
  |  |  ------------------
  |  |   35|   797k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   797k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1105|      0|                    printf("Post-uvalphas[%d/%d]: r=%d\n",
 1106|      0|                           b->cfl_alpha[0], b->cfl_alpha[1], ts->msac.rng);
 1107|   797k|            } else if (b_dim[2] + b_dim[3] >= 2 && b->uv_mode >= VERT_PRED &&
  ------------------
  |  Branch (1107:24): [True: 560k, False: 138k]
  |  Branch (1107:52): [True: 313k, False: 247k]
  ------------------
 1108|   313k|                       b->uv_mode <= VERT_LEFT_PRED)
  ------------------
  |  Branch (1108:24): [True: 152k, False: 160k]
  ------------------
 1109|   152k|            {
 1110|   152k|                uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED];
 1111|   152k|                const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
  ------------------
  |  |   48|   152k|#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
  ------------------
 1112|   152k|                b->uv_angle = angle - 3;
 1113|   152k|            }
 1114|  1.49M|        }
 1115|       |
 1116|  1.95M|        b->pal_sz[0] = b->pal_sz[1] = 0;
 1117|  1.95M|        if (f->frame_hdr->allow_screen_content_tools &&
  ------------------
  |  Branch (1117:13): [True: 581k, False: 1.37M]
  ------------------
 1118|   581k|            imax(bw4, bh4) <= 16 && bw4 + bh4 >= 4)
  ------------------
  |  Branch (1118:13): [True: 564k, False: 17.0k]
  |  Branch (1118:37): [True: 422k, False: 141k]
  ------------------
 1119|   422k|        {
 1120|   422k|            const int sz_ctx = b_dim[2] + b_dim[3] - 2;
 1121|   422k|            if (b->y_mode == DC_PRED) {
  ------------------
  |  Branch (1121:17): [True: 185k, False: 237k]
  ------------------
 1122|   185k|                const int pal_ctx = (t->a->pal_sz[bx4] > 0) + (t->l.pal_sz[by4] > 0);
 1123|   185k|                const int use_y_pal = dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|   185k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1124|   185k|                                          ts->cdf.m.pal_y[sz_ctx][pal_ctx]);
 1125|   185k|                if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|   185k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 185k]
  |  |  ------------------
  |  |   35|   185k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   185k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1126|      0|                    printf("Post-y_pal[%d]: r=%d\n", use_y_pal, ts->msac.rng);
 1127|   185k|                if (use_y_pal)
  ------------------
  |  Branch (1127:21): [True: 24.0k, False: 161k]
  ------------------
 1128|  24.0k|                    f->bd_fn.read_pal_plane(t, b, 0, sz_ctx, bx4, by4);
 1129|   185k|            }
 1130|       |
 1131|   422k|            if (has_chroma && b->uv_mode == DC_PRED) {
  ------------------
  |  Branch (1131:17): [True: 322k, False: 99.4k]
  |  Branch (1131:31): [True: 104k, False: 217k]
  ------------------
 1132|   104k|                const int pal_ctx = b->pal_sz[0] > 0;
 1133|   104k|                const int use_uv_pal = dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|   104k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1134|   104k|                                           ts->cdf.m.pal_uv[pal_ctx]);
 1135|   104k|                if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|   104k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 104k]
  |  |  ------------------
  |  |   35|   104k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   104k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1136|      0|                    printf("Post-uv_pal[%d]: r=%d\n", use_uv_pal, ts->msac.rng);
 1137|   104k|                if (use_uv_pal) // see aomedia bug 2183 for why we use luma coordinates
  ------------------
  |  Branch (1137:21): [True: 7.27k, False: 97.6k]
  ------------------
 1138|  7.27k|                    f->bd_fn.read_pal_uv(t, b, sz_ctx, bx4, by4);
 1139|   104k|            }
 1140|   422k|        }
 1141|       |
 1142|  1.95M|        if (b->y_mode == DC_PRED && !b->pal_sz[0] &&
  ------------------
  |  Branch (1142:13): [True: 547k, False: 1.40M]
  |  Branch (1142:37): [True: 523k, False: 24.0k]
  ------------------
 1143|   523k|            imax(b_dim[2], b_dim[3]) <= 3 && f->seq_hdr->filter_intra)
  ------------------
  |  Branch (1143:13): [True: 441k, False: 82.2k]
  |  Branch (1143:46): [True: 284k, False: 157k]
  ------------------
 1144|   284k|        {
 1145|   284k|            const int is_filter = dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|   284k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1146|   284k|                                      ts->cdf.m.use_filter_intra[bs]);
 1147|   284k|            if (is_filter) {
  ------------------
  |  Branch (1147:17): [True: 168k, False: 115k]
  ------------------
 1148|   168k|                b->y_mode = FILTER_PRED;
 1149|   168k|                b->y_angle = dav1d_msac_decode_symbol_adapt8(&ts->msac,
  ------------------
  |  |   48|   168k|#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
  ------------------
 1150|   168k|                                 ts->cdf.m.filter_intra, 4);
 1151|   168k|            }
 1152|   284k|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|   284k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 284k]
  |  |  ------------------
  |  |   35|   284k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   284k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1153|      0|                printf("Post-filterintramode[%d/%d]: r=%d\n",
 1154|      0|                       b->y_mode, b->y_angle, ts->msac.rng);
 1155|   284k|        }
 1156|       |
 1157|  1.95M|        if (b->pal_sz[0]) {
  ------------------
  |  Branch (1157:13): [True: 24.0k, False: 1.92M]
  ------------------
 1158|  24.0k|            uint8_t *pal_idx;
 1159|  24.0k|            if (t->frame_thread.pass) {
  ------------------
  |  Branch (1159:17): [True: 0, False: 24.0k]
  ------------------
 1160|      0|                const int p = t->frame_thread.pass & 1;
 1161|      0|                assert(ts->frame_thread[p].pal_idx);
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1162|      0|                pal_idx = ts->frame_thread[p].pal_idx;
 1163|      0|                ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
 1164|      0|            } else
 1165|  24.0k|                pal_idx = t->scratch.pal_idx_y;
 1166|  24.0k|            read_pal_indices(t, pal_idx, b->pal_sz[0], 0, w4, h4, bw4, bh4);
 1167|  24.0k|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  24.0k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 24.0k]
  |  |  ------------------
  |  |   35|  24.0k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  24.0k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1168|      0|                printf("Post-y-pal-indices: r=%d\n", ts->msac.rng);
 1169|  24.0k|        }
 1170|       |
 1171|  1.95M|        if (has_chroma && b->pal_sz[1]) {
  ------------------
  |  Branch (1171:13): [True: 1.50M, False: 451k]
  |  Branch (1171:27): [True: 7.27k, False: 1.49M]
  ------------------
 1172|  7.27k|            uint8_t *pal_idx;
 1173|  7.27k|            if (t->frame_thread.pass) {
  ------------------
  |  Branch (1173:17): [True: 0, False: 7.27k]
  ------------------
 1174|      0|                const int p = t->frame_thread.pass & 1;
 1175|      0|                assert(ts->frame_thread[p].pal_idx);
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1176|      0|                pal_idx = ts->frame_thread[p].pal_idx;
 1177|      0|                ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
 1178|      0|            } else
 1179|  7.27k|                pal_idx = t->scratch.pal_idx_uv;
 1180|  7.27k|            read_pal_indices(t, pal_idx, b->pal_sz[1], 1, cw4, ch4, cbw4, cbh4);
 1181|  7.27k|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  7.27k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 7.27k]
  |  |  ------------------
  |  |   35|  7.27k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  7.27k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1182|      0|                printf("Post-uv-pal-indices: r=%d\n", ts->msac.rng);
 1183|  7.27k|        }
 1184|       |
 1185|  1.95M|        const TxfmInfo *t_dim;
 1186|  1.95M|        if (f->frame_hdr->segmentation.lossless[b->seg_id]) {
  ------------------
  |  Branch (1186:13): [True: 407k, False: 1.54M]
  ------------------
 1187|   407k|            b->tx = b->uvtx = (int) TX_4X4;
 1188|   407k|            t_dim = &dav1d_txfm_dimensions[TX_4X4];
 1189|  1.54M|        } else {
 1190|  1.54M|            b->tx = dav1d_max_txfm_size_for_bs[bs][0];
 1191|  1.54M|            b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
 1192|  1.54M|            t_dim = &dav1d_txfm_dimensions[b->tx];
 1193|  1.54M|            if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE && t_dim->max > TX_4X4) {
  ------------------
  |  Branch (1193:17): [True: 568k, False: 976k]
  |  Branch (1193:67): [True: 467k, False: 101k]
  ------------------
 1194|   467k|                const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4);
 1195|   467k|                uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx];
 1196|   467k|                int depth = dav1d_msac_decode_symbol_adapt4(&ts->msac, tx_cdf,
  ------------------
  |  |   47|   467k|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  ------------------
 1197|   467k|                                imin(t_dim->max, 2));
 1198|       |
 1199|   808k|                while (depth--) {
  ------------------
  |  Branch (1199:24): [True: 341k, False: 467k]
  ------------------
 1200|   341k|                    b->tx = t_dim->sub;
 1201|   341k|                    t_dim = &dav1d_txfm_dimensions[b->tx];
 1202|   341k|                }
 1203|   467k|            }
 1204|  1.54M|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  1.54M|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 1.54M]
  |  |  ------------------
  |  |   35|  1.54M|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  1.54M|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1205|      0|                printf("Post-tx[%d]: r=%d\n", b->tx, ts->msac.rng);
 1206|  1.54M|        }
 1207|       |
 1208|       |        // reconstruction
 1209|  1.95M|        if (t->frame_thread.pass == 1) {
  ------------------
  |  Branch (1209:13): [True: 0, False: 1.95M]
  ------------------
 1210|      0|            f->bd_fn.read_coef_blocks(t, bs, b);
 1211|  1.95M|        } else {
 1212|  1.95M|            f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
 1213|  1.95M|        }
 1214|       |
 1215|  1.95M|        if (f->frame_hdr->loopfilter.level_y[0] ||
  ------------------
  |  Branch (1215:13): [True: 946k, False: 1.00M]
  ------------------
 1216|  1.00M|            f->frame_hdr->loopfilter.level_y[1])
  ------------------
  |  Branch (1216:13): [True: 80.1k, False: 926k]
  ------------------
 1217|  1.03M|        {
 1218|  1.03M|            dav1d_create_lf_mask_intra(t->lf_mask, f->lf.level, f->b4_stride,
 1219|  1.03M|                                       (const uint8_t (*)[8][2])
 1220|  1.03M|                                       &ts->lflvl[b->seg_id][0][0][0],
 1221|  1.03M|                                       t->bx, t->by, f->w4, f->h4, bs,
 1222|  1.03M|                                       b->tx, b->uvtx, f->cur.p.layout,
 1223|  1.03M|                                       &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
 1224|  1.03M|                                       has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
  ------------------
  |  Branch (1224:40): [True: 702k, False: 328k]
  ------------------
 1225|  1.03M|                                       has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
  ------------------
  |  Branch (1225:40): [True: 702k, False: 328k]
  ------------------
 1226|  1.03M|        }
 1227|       |        // update contexts
 1228|  1.95M|        const enum IntraPredMode y_mode_nofilt =
 1229|  1.95M|            b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
  ------------------
  |  Branch (1229:13): [True: 168k, False: 1.78M]
  ------------------
 1230|  1.95M|        BlockContext *edge = t->a;
 1231|  5.85M|        for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) {
  ------------------
  |  Branch (1231:36): [True: 3.90M, False: 1.94M]
  ------------------
 1232|  3.90M|            int t_lsz = ((uint8_t *) &t_dim->lw)[i]; // lw then lh
 1233|  3.90M|#define set_ctx(rep_macro) \
 1234|  3.90M|            rep_macro(edge->tx_intra, off, t_lsz); \
 1235|  3.90M|            rep_macro(edge->tx, off, t_lsz); \
 1236|  3.90M|            rep_macro(edge->mode, off, y_mode_nofilt); \
 1237|  3.90M|            rep_macro(edge->pal_sz, off, b->pal_sz[0]); \
 1238|  3.90M|            rep_macro(edge->seg_pred, off, seg_pred); \
 1239|  3.90M|            rep_macro(edge->skip_mode, off, 0); \
 1240|  3.90M|            rep_macro(edge->intra, off, 1); \
 1241|  3.90M|            rep_macro(edge->skip, off, b->skip); \
 1242|       |            /* see aomedia bug 2183 for why we use luma coordinates here */ \
 1243|  3.90M|            rep_macro(t->pal_sz_uv[i], off, (has_chroma ? b->pal_sz[1] : 0)); \
 1244|  3.90M|            if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \
 1245|  3.90M|                rep_macro(edge->comp_type, off, COMP_INTER_NONE); \
 1246|  3.90M|                rep_macro(edge->ref[0], off, ((uint8_t) -1)); \
 1247|  3.90M|                rep_macro(edge->ref[1], off, ((uint8_t) -1)); \
 1248|  3.90M|                rep_macro(edge->filter[0], off, DAV1D_N_SWITCHABLE_FILTERS); \
 1249|  3.90M|                rep_macro(edge->filter[1], off, DAV1D_N_SWITCHABLE_FILTERS); \
 1250|  3.90M|            }
 1251|  3.90M|            case_set(b_dim[2 + i]);
  ------------------
  |  |   70|  3.90M|    switch (var) { \
  |  |   71|  1.83M|    case 0: set_ctx(set_ctx1); break; \
  |  |  ------------------
  |  |  |  | 1234|  1.83M|            rep_macro(edge->tx_intra, off, t_lsz); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.83M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  1.83M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1235|  1.83M|            rep_macro(edge->tx, off, t_lsz); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.83M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  1.83M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1236|  1.83M|            rep_macro(edge->mode, off, y_mode_nofilt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.83M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  1.83M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1237|  1.83M|            rep_macro(edge->pal_sz, off, b->pal_sz[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.83M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  1.83M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1238|  1.83M|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.83M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  1.83M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1239|  1.83M|            rep_macro(edge->skip_mode, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.83M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  1.83M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1240|  1.83M|            rep_macro(edge->intra, off, 1); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.83M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  1.83M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1241|  1.83M|            rep_macro(edge->skip, off, b->skip); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.83M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  1.83M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1242|  1.83M|            /* see aomedia bug 2183 for why we use luma coordinates here */ \
  |  |  |  | 1243|  1.83M|            rep_macro(t->pal_sz_uv[i], off, (has_chroma ? b->pal_sz[1] : 0)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.83M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  3.66M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (56:43): [True: 1.52M, False: 305k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1244|  1.83M|            if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  1.83M|    ((frame_header)->frame_type & 1)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (36:5): [True: 3.11k, False: 1.82M]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1245|  3.11k|                rep_macro(edge->comp_type, off, COMP_INTER_NONE); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  3.11k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  3.11k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1246|  3.11k|                rep_macro(edge->ref[0], off, ((uint8_t) -1)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  3.11k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  3.11k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1247|  3.11k|                rep_macro(edge->ref[1], off, ((uint8_t) -1)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  3.11k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  3.11k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1248|  3.11k|                rep_macro(edge->filter[0], off, DAV1D_N_SWITCHABLE_FILTERS); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  3.11k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  3.11k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1249|  3.11k|                rep_macro(edge->filter[1], off, DAV1D_N_SWITCHABLE_FILTERS); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  3.11k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  3.11k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1250|  3.11k|            }
  |  |  ------------------
  |  |  |  Branch (71:5): [True: 1.83M, False: 2.07M]
  |  |  ------------------
  |  |   72|   893k|    case 1: set_ctx(set_ctx2); break; \
  |  |  ------------------
  |  |  |  | 1234|   893k|            rep_macro(edge->tx_intra, off, t_lsz); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|   893k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|   893k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1235|   893k|            rep_macro(edge->tx, off, t_lsz); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|   893k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|   893k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1236|   893k|            rep_macro(edge->mode, off, y_mode_nofilt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|   893k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|   893k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1237|   893k|            rep_macro(edge->pal_sz, off, b->pal_sz[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|   893k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|   893k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1238|   893k|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|   893k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|   893k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1239|   893k|            rep_macro(edge->skip_mode, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|   893k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|   893k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1240|   893k|            rep_macro(edge->intra, off, 1); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|   893k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|   893k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1241|   893k|            rep_macro(edge->skip, off, b->skip); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|   893k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|   893k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1242|   893k|            /* see aomedia bug 2183 for why we use luma coordinates here */ \
  |  |  |  | 1243|   893k|            rep_macro(t->pal_sz_uv[i], off, (has_chroma ? b->pal_sz[1] : 0)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|   893k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  1.78M|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (58:45): [True: 621k, False: 272k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1244|   893k|            if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|   893k|    ((frame_header)->frame_type & 1)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (36:5): [True: 5.45k, False: 888k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1245|  5.45k|                rep_macro(edge->comp_type, off, COMP_INTER_NONE); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  5.45k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  5.45k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1246|  5.45k|                rep_macro(edge->ref[0], off, ((uint8_t) -1)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  5.45k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  5.45k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1247|  5.45k|                rep_macro(edge->ref[1], off, ((uint8_t) -1)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  5.45k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  5.45k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1248|  5.45k|                rep_macro(edge->filter[0], off, DAV1D_N_SWITCHABLE_FILTERS); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  5.45k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  5.45k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1249|  5.45k|                rep_macro(edge->filter[1], off, DAV1D_N_SWITCHABLE_FILTERS); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  5.45k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  5.45k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1250|  5.45k|            }
  |  |  ------------------
  |  |  |  Branch (72:5): [True: 893k, False: 3.00M]
  |  |  ------------------
  |  |   73|   570k|    case 2: set_ctx(set_ctx4); break; \
  |  |  ------------------
  |  |  |  | 1234|   570k|            rep_macro(edge->tx_intra, off, t_lsz); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|   570k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|   570k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1235|   570k|            rep_macro(edge->tx, off, t_lsz); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|   570k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|   570k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1236|   570k|            rep_macro(edge->mode, off, y_mode_nofilt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|   570k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|   570k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1237|   570k|            rep_macro(edge->pal_sz, off, b->pal_sz[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|   570k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|   570k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1238|   570k|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|   570k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|   570k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1239|   570k|            rep_macro(edge->skip_mode, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|   570k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|   570k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1240|   570k|            rep_macro(edge->intra, off, 1); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|   570k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|   570k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1241|   570k|            rep_macro(edge->skip, off, b->skip); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|   570k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|   570k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1242|   570k|            /* see aomedia bug 2183 for why we use luma coordinates here */ \
  |  |  |  | 1243|   570k|            rep_macro(t->pal_sz_uv[i], off, (has_chroma ? b->pal_sz[1] : 0)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|   570k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  1.14M|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (60:45): [True: 405k, False: 165k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1244|   570k|            if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|   570k|    ((frame_header)->frame_type & 1)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (36:5): [True: 15.1k, False: 555k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1245|  15.1k|                rep_macro(edge->comp_type, off, COMP_INTER_NONE); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  15.1k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  15.1k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1246|  15.1k|                rep_macro(edge->ref[0], off, ((uint8_t) -1)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  15.1k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  15.1k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1247|  15.1k|                rep_macro(edge->ref[1], off, ((uint8_t) -1)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  15.1k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  15.1k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1248|  15.1k|                rep_macro(edge->filter[0], off, DAV1D_N_SWITCHABLE_FILTERS); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  15.1k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  15.1k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1249|  15.1k|                rep_macro(edge->filter[1], off, DAV1D_N_SWITCHABLE_FILTERS); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  15.1k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  15.1k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1250|  15.1k|            }
  |  |  ------------------
  |  |  |  Branch (73:5): [True: 570k, False: 3.33M]
  |  |  ------------------
  |  |   74|   395k|    case 3: set_ctx(set_ctx8); break; \
  |  |  ------------------
  |  |  |  | 1234|   395k|            rep_macro(edge->tx_intra, off, t_lsz); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|   395k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|   395k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1235|   395k|            rep_macro(edge->tx, off, t_lsz); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|   395k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|   395k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1236|   395k|            rep_macro(edge->mode, off, y_mode_nofilt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|   395k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|   395k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1237|   395k|            rep_macro(edge->pal_sz, off, b->pal_sz[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|   395k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|   395k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1238|   395k|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|   395k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|   395k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1239|   395k|            rep_macro(edge->skip_mode, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|   395k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|   395k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1240|   395k|            rep_macro(edge->intra, off, 1); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|   395k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|   395k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1241|   395k|            rep_macro(edge->skip, off, b->skip); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|   395k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|   395k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1242|   395k|            /* see aomedia bug 2183 for why we use luma coordinates here */ \
  |  |  |  | 1243|   395k|            rep_macro(t->pal_sz_uv[i], off, (has_chroma ? b->pal_sz[1] : 0)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|   395k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|   791k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (62:45): [True: 284k, False: 110k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1244|   395k|            if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|   395k|    ((frame_header)->frame_type & 1)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (36:5): [True: 14.1k, False: 381k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1245|  14.1k|                rep_macro(edge->comp_type, off, COMP_INTER_NONE); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  14.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  14.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1246|  14.1k|                rep_macro(edge->ref[0], off, ((uint8_t) -1)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  14.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  14.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1247|  14.1k|                rep_macro(edge->ref[1], off, ((uint8_t) -1)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  14.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  14.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1248|  14.1k|                rep_macro(edge->filter[0], off, DAV1D_N_SWITCHABLE_FILTERS); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  14.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  14.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1249|  14.1k|                rep_macro(edge->filter[1], off, DAV1D_N_SWITCHABLE_FILTERS); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  14.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  14.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1250|  14.1k|            }
  |  |  ------------------
  |  |  |  Branch (74:5): [True: 395k, False: 3.50M]
  |  |  ------------------
  |  |   75|   184k|    case 4: set_ctx(set_ctx16); break; \
  |  |  ------------------
  |  |  |  | 1234|   184k|            rep_macro(edge->tx_intra, off, t_lsz); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|   184k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|   184k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|   184k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|   184k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 184k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1235|   184k|            rep_macro(edge->tx, off, t_lsz); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|   184k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|   184k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|   184k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|   184k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 184k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1236|   184k|            rep_macro(edge->mode, off, y_mode_nofilt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|   184k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|   184k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|   184k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|   184k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 184k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1237|   184k|            rep_macro(edge->pal_sz, off, b->pal_sz[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|   184k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|   184k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|   184k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|   184k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 184k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1238|   184k|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|   184k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|   184k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|   184k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|   184k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 184k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1239|   184k|            rep_macro(edge->skip_mode, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|   184k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|   184k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|   184k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|   184k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 184k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1240|   184k|            rep_macro(edge->intra, off, 1); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|   184k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|   184k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|   184k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|   184k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 184k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1241|   184k|            rep_macro(edge->skip, off, b->skip); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|   184k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|   184k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|   184k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|   184k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 184k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1242|   184k|            /* see aomedia bug 2183 for why we use luma coordinates here */ \
  |  |  |  | 1243|   184k|            rep_macro(t->pal_sz_uv[i], off, (has_chroma ? b->pal_sz[1] : 0)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|   184k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|   184k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|   368k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (64:29): [True: 130k, False: 53.2k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   65|   184k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 184k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1244|   184k|            if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|   184k|    ((frame_header)->frame_type & 1)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (36:5): [True: 10.2k, False: 173k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1245|  10.2k|                rep_macro(edge->comp_type, off, COMP_INTER_NONE); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  10.2k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  10.2k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  10.2k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  10.2k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 10.2k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1246|  10.2k|                rep_macro(edge->ref[0], off, ((uint8_t) -1)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  10.2k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  10.2k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  10.2k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  10.2k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 10.2k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1247|  10.2k|                rep_macro(edge->ref[1], off, ((uint8_t) -1)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  10.2k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  10.2k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  10.2k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  10.2k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 10.2k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1248|  10.2k|                rep_macro(edge->filter[0], off, DAV1D_N_SWITCHABLE_FILTERS); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  10.2k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  10.2k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  10.2k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  10.2k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 10.2k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1249|  10.2k|                rep_macro(edge->filter[1], off, DAV1D_N_SWITCHABLE_FILTERS); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  10.2k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  10.2k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  10.2k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  10.2k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 10.2k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1250|  10.2k|            }
  |  |  ------------------
  |  |  |  Branch (75:5): [True: 184k, False: 3.71M]
  |  |  ------------------
  |  |   76|  33.2k|    case 5: set_ctx(set_ctx32); break; \
  |  |  ------------------
  |  |  |  | 1234|  33.2k|            rep_macro(edge->tx_intra, off, t_lsz); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  33.2k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  33.2k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  33.2k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|  33.2k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 33.2k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1235|  33.2k|            rep_macro(edge->tx, off, t_lsz); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  33.2k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  33.2k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  33.2k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|  33.2k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 33.2k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1236|  33.2k|            rep_macro(edge->mode, off, y_mode_nofilt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  33.2k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  33.2k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  33.2k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|  33.2k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 33.2k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1237|  33.2k|            rep_macro(edge->pal_sz, off, b->pal_sz[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  33.2k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  33.2k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  33.2k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|  33.2k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 33.2k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1238|  33.2k|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  33.2k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  33.2k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  33.2k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|  33.2k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 33.2k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1239|  33.2k|            rep_macro(edge->skip_mode, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  33.2k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  33.2k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  33.2k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|  33.2k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 33.2k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1240|  33.2k|            rep_macro(edge->intra, off, 1); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  33.2k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  33.2k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  33.2k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|  33.2k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 33.2k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1241|  33.2k|            rep_macro(edge->skip, off, b->skip); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  33.2k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  33.2k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  33.2k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|  33.2k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 33.2k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1242|  33.2k|            /* see aomedia bug 2183 for why we use luma coordinates here */ \
  |  |  |  | 1243|  33.2k|            rep_macro(t->pal_sz_uv[i], off, (has_chroma ? b->pal_sz[1] : 0)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  33.2k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  33.2k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  66.4k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (67:29): [True: 21.6k, False: 11.5k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   68|  33.2k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 33.2k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1244|  33.2k|            if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  33.2k|    ((frame_header)->frame_type & 1)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (36:5): [True: 184, False: 33.0k]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1245|    184|                rep_macro(edge->comp_type, off, COMP_INTER_NONE); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    184|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    184|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    184|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    184|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 184]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1246|    184|                rep_macro(edge->ref[0], off, ((uint8_t) -1)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    184|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    184|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    184|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    184|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 184]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1247|    184|                rep_macro(edge->ref[1], off, ((uint8_t) -1)); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    184|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    184|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    184|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    184|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 184]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1248|    184|                rep_macro(edge->filter[0], off, DAV1D_N_SWITCHABLE_FILTERS); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    184|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    184|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    184|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    184|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 184]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1249|    184|                rep_macro(edge->filter[1], off, DAV1D_N_SWITCHABLE_FILTERS); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    184|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    184|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    184|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    184|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 184]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1250|    184|            }
  |  |  ------------------
  |  |  |  Branch (76:5): [True: 33.2k, False: 3.86M]
  |  |  ------------------
  |  |   77|      0|    default: assert(0); \
  |  |  ------------------
  |  |  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, Folded]
  |  |  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (77:5): [True: 0, False: 3.90M]
  |  |  ------------------
  |  |   78|  3.90M|    }
  ------------------
 1252|  3.89M|#undef set_ctx
 1253|  3.89M|        }
 1254|  1.94M|        if (b->pal_sz[0])
  ------------------
  |  Branch (1254:13): [True: 24.0k, False: 1.92M]
  ------------------
 1255|  24.0k|            f->bd_fn.copy_pal_block_y(t, bx4, by4, bw4, bh4);
 1256|  1.94M|        if (has_chroma) {
  ------------------
  |  Branch (1256:13): [True: 1.49M, False: 450k]
  ------------------
 1257|  1.49M|            uint8_t uv_mode = b->uv_mode;
 1258|  1.49M|            dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], uv_mode);
 1259|  1.49M|            dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], uv_mode);
 1260|  1.49M|            if (b->pal_sz[1])
  ------------------
  |  Branch (1260:17): [True: 7.27k, False: 1.49M]
  ------------------
 1261|  7.27k|                f->bd_fn.copy_pal_block_uv(t, bx4, by4, bw4, bh4);
 1262|  1.49M|        }
 1263|  1.94M|        if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc)
  ------------------
  |  |   36|  3.89M|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 16.9k, False: 1.93M]
  |  |  ------------------
  ------------------
  |  Branch (1263:49): [True: 375k, False: 1.55M]
  ------------------
 1264|   399k|            splat_intraref(f->c, t, bs, bw4, bh4);
 1265|  1.94M|    } else if (IS_KEY_OR_INTRA(f->frame_hdr)) {
  ------------------
  |  |   43|  1.05M|    (!IS_INTER_OR_SWITCH(frame_header))
  |  |  ------------------
  |  |  |  |   36|  1.05M|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (43:5): [True: 961k, False: 91.6k]
  |  |  ------------------
  ------------------
 1266|       |        // intra block copy
 1267|   961k|        refmvs_candidate mvstack[8];
 1268|   961k|        int n_mvs, ctx;
 1269|   961k|        dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
 1270|   961k|                          (union refmvs_refpair) { .ref = { 0, -1 }},
 1271|   961k|                          bs, intra_edge_flags, t->by, t->bx);
 1272|       |
 1273|   961k|        if (mvstack[0].mv.mv[0].n)
  ------------------
  |  Branch (1273:13): [True: 939k, False: 22.0k]
  ------------------
 1274|   939k|            b->mv[0] = mvstack[0].mv.mv[0];
 1275|  22.0k|        else if (mvstack[1].mv.mv[0].n)
  ------------------
  |  Branch (1275:18): [True: 0, False: 22.0k]
  ------------------
 1276|      0|            b->mv[0] = mvstack[1].mv.mv[0];
 1277|  22.0k|        else {
 1278|  22.0k|            if (t->by - (16 << f->seq_hdr->sb128) < ts->tiling.row_start) {
  ------------------
  |  Branch (1278:17): [True: 12.9k, False: 9.06k]
  ------------------
 1279|  12.9k|                b->mv[0].y = 0;
 1280|  12.9k|                b->mv[0].x = -(512 << f->seq_hdr->sb128) - 2048;
 1281|  12.9k|            } else {
 1282|  9.06k|                b->mv[0].y = -(512 << f->seq_hdr->sb128);
 1283|  9.06k|                b->mv[0].x = 0;
 1284|  9.06k|            }
 1285|  22.0k|        }
 1286|       |
 1287|   961k|        const union mv ref = b->mv[0];
 1288|   961k|        read_mv_residual(ts, &b->mv[0], -1);
 1289|       |
 1290|       |        // clip intrabc motion vector to decoded parts of current tile
 1291|   961k|        int border_left = ts->tiling.col_start * 4;
 1292|   961k|        int border_top  = ts->tiling.row_start * 4;
 1293|   961k|        if (has_chroma) {
  ------------------
  |  Branch (1293:13): [True: 895k, False: 66.3k]
  ------------------
 1294|   895k|            if (bw4 < 2 &&  ss_hor)
  ------------------
  |  Branch (1294:17): [True: 827k, False: 67.9k]
  |  Branch (1294:29): [True: 14.7k, False: 812k]
  ------------------
 1295|  14.7k|                border_left += 4;
 1296|   895k|            if (bh4 < 2 &&  ss_ver)
  ------------------
  |  Branch (1296:17): [True: 828k, False: 66.3k]
  |  Branch (1296:29): [True: 14.1k, False: 814k]
  ------------------
 1297|  14.1k|                border_top  += 4;
 1298|   895k|        }
 1299|   961k|        int src_left   = t->bx * 4 + (b->mv[0].x >> 3);
 1300|   961k|        int src_top    = t->by * 4 + (b->mv[0].y >> 3);
 1301|   961k|        int src_right  = src_left + bw4 * 4;
 1302|   961k|        int src_bottom = src_top  + bh4 * 4;
 1303|   961k|        const int border_right = ((ts->tiling.col_end + (bw4 - 1)) & ~(bw4 - 1)) * 4;
 1304|       |
 1305|       |        // check against left or right tile boundary and adjust if necessary
 1306|   961k|        if (src_left < border_left) {
  ------------------
  |  Branch (1306:13): [True: 545k, False: 415k]
  ------------------
 1307|   545k|            src_right += border_left - src_left;
 1308|   545k|            src_left  += border_left - src_left;
 1309|   545k|        } else if (src_right > border_right) {
  ------------------
  |  Branch (1309:20): [True: 222k, False: 193k]
  ------------------
 1310|   222k|            src_left  -= src_right - border_right;
 1311|   222k|            src_right -= src_right - border_right;
 1312|   222k|        }
 1313|       |        // check against top tile boundary and adjust if necessary
 1314|   961k|        if (src_top < border_top) {
  ------------------
  |  Branch (1314:13): [True: 909k, False: 51.9k]
  ------------------
 1315|   909k|            src_bottom += border_top - src_top;
 1316|   909k|            src_top    += border_top - src_top;
 1317|   909k|        }
 1318|       |
 1319|   961k|        const int sbx = (t->bx >> (4 + f->seq_hdr->sb128)) << (6 + f->seq_hdr->sb128);
 1320|   961k|        const int sby = (t->by >> (4 + f->seq_hdr->sb128)) << (6 + f->seq_hdr->sb128);
 1321|   961k|        const int sb_size = 1 << (6 + f->seq_hdr->sb128);
 1322|       |        // check for overlap with current superblock
 1323|   961k|        if (src_bottom > sby && src_right > sbx) {
  ------------------
  |  Branch (1323:13): [True: 933k, False: 28.0k]
  |  Branch (1323:33): [True: 220k, False: 713k]
  ------------------
 1324|   220k|            if (src_top - border_top >= src_bottom - sby) {
  ------------------
  |  Branch (1324:17): [True: 3.02k, False: 217k]
  ------------------
 1325|       |                // if possible move src up into the previous suberblock row
 1326|  3.02k|                src_top    -= src_bottom - sby;
 1327|  3.02k|                src_bottom -= src_bottom - sby;
 1328|   217k|            } else if (src_left - border_left >= src_right - sbx) {
  ------------------
  |  Branch (1328:24): [True: 215k, False: 1.51k]
  ------------------
 1329|       |                // if possible move src left into the previous suberblock
 1330|   215k|                src_left  -= src_right - sbx;
 1331|   215k|                src_right -= src_right - sbx;
 1332|   215k|            }
 1333|   220k|        }
 1334|       |        // move src up if it is below current superblock row
 1335|   961k|        if (src_bottom > sby + sb_size) {
  ------------------
  |  Branch (1335:13): [True: 610, False: 961k]
  ------------------
 1336|    610|            src_top    -= src_bottom - (sby + sb_size);
 1337|    610|            src_bottom -= src_bottom - (sby + sb_size);
 1338|    610|        }
 1339|       |        // error out if mv still overlaps with the current superblock
 1340|   961k|        if (src_bottom > sby && src_right > sbx)
  ------------------
  |  Branch (1340:13): [True: 927k, False: 34.0k]
  |  Branch (1340:33): [True: 404, False: 927k]
  ------------------
 1341|    404|            return -1;
 1342|       |
 1343|   961k|        b->mv[0].x = (src_left - t->bx * 4) * 8;
 1344|   961k|        b->mv[0].y = (src_top  - t->by * 4) * 8;
 1345|       |
 1346|   961k|        if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|   961k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 961k]
  |  |  ------------------
  |  |   35|   961k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   961k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1347|      0|            printf("Post-dmv[%d/%d,ref=%d/%d|%d/%d]: r=%d\n",
 1348|      0|                   b->mv[0].y, b->mv[0].x, ref.y, ref.x,
 1349|      0|                   mvstack[0].mv.mv[0].y, mvstack[0].mv.mv[0].x, ts->msac.rng);
 1350|   961k|        read_vartx_tree(t, b, bs, bx4, by4);
 1351|       |
 1352|       |        // reconstruction
 1353|   961k|        if (t->frame_thread.pass == 1) {
  ------------------
  |  Branch (1353:13): [True: 0, False: 961k]
  ------------------
 1354|      0|            f->bd_fn.read_coef_blocks(t, bs, b);
 1355|      0|            b->filter2d = FILTER_2D_BILINEAR;
 1356|   961k|        } else {
 1357|   961k|            if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
  ------------------
  |  Branch (1357:17): [True: 0, False: 961k]
  ------------------
 1358|   961k|        }
 1359|       |
 1360|   961k|        splat_intrabc_mv(f->c, t, bs, b, bw4, bh4);
 1361|   961k|        BlockContext *edge = t->a;
 1362|  2.87M|        for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) {
  ------------------
  |  Branch (1362:36): [True: 1.92M, False: 951k]
  ------------------
 1363|  1.92M|#define set_ctx(rep_macro) \
 1364|  1.92M|            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
 1365|  1.92M|            rep_macro(edge->mode, off, DC_PRED); \
 1366|  1.92M|            rep_macro(edge->pal_sz, off, 0); \
 1367|       |            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
 1368|  1.92M|            rep_macro(t->pal_sz_uv[i], off, 0); \
 1369|  1.92M|            rep_macro(edge->seg_pred, off, seg_pred); \
 1370|  1.92M|            rep_macro(edge->skip_mode, off, 0); \
 1371|  1.92M|            rep_macro(edge->intra, off, 0); \
 1372|  1.92M|            rep_macro(edge->skip, off, b->skip)
 1373|  1.92M|            case_set(b_dim[2 + i]);
  ------------------
  |  |   70|  1.92M|    switch (var) { \
  |  |   71|  1.76M|    case 0: set_ctx(set_ctx1); break; \
  |  |  ------------------
  |  |  |  | 1364|  1.76M|            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.76M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  1.76M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1365|  1.76M|            rep_macro(edge->mode, off, DC_PRED); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.76M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  1.76M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1366|  1.76M|            rep_macro(edge->pal_sz, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.76M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  1.76M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1367|  1.76M|            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
  |  |  |  | 1368|  1.76M|            rep_macro(t->pal_sz_uv[i], off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.76M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  1.76M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1369|  1.76M|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.76M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  1.76M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1370|  1.76M|            rep_macro(edge->skip_mode, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.76M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  1.76M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1371|  1.76M|            rep_macro(edge->intra, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.76M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  1.76M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1372|  1.76M|            rep_macro(edge->skip, off, b->skip)
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  1.76M|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  1.76M|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (71:5): [True: 1.76M, False: 153k]
  |  |  ------------------
  |  |   72|  69.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  ------------------
  |  |  |  | 1364|  69.7k|            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  69.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  69.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1365|  69.7k|            rep_macro(edge->mode, off, DC_PRED); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  69.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  69.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1366|  69.7k|            rep_macro(edge->pal_sz, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  69.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  69.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1367|  69.7k|            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
  |  |  |  | 1368|  69.7k|            rep_macro(t->pal_sz_uv[i], off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  69.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  69.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1369|  69.7k|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  69.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  69.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1370|  69.7k|            rep_macro(edge->skip_mode, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  69.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  69.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1371|  69.7k|            rep_macro(edge->intra, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  69.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  69.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1372|  69.7k|            rep_macro(edge->skip, off, b->skip)
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  69.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  69.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (72:5): [True: 69.7k, False: 1.85M]
  |  |  ------------------
  |  |   73|  34.6k|    case 2: set_ctx(set_ctx4); break; \
  |  |  ------------------
  |  |  |  | 1364|  34.6k|            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  34.6k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  34.6k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1365|  34.6k|            rep_macro(edge->mode, off, DC_PRED); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  34.6k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  34.6k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1366|  34.6k|            rep_macro(edge->pal_sz, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  34.6k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  34.6k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1367|  34.6k|            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
  |  |  |  | 1368|  34.6k|            rep_macro(t->pal_sz_uv[i], off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  34.6k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  34.6k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1369|  34.6k|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  34.6k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  34.6k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1370|  34.6k|            rep_macro(edge->skip_mode, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  34.6k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  34.6k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1371|  34.6k|            rep_macro(edge->intra, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  34.6k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  34.6k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1372|  34.6k|            rep_macro(edge->skip, off, b->skip)
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  34.6k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  34.6k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (73:5): [True: 34.6k, False: 1.88M]
  |  |  ------------------
  |  |   74|  42.7k|    case 3: set_ctx(set_ctx8); break; \
  |  |  ------------------
  |  |  |  | 1364|  42.7k|            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  42.7k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  42.7k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1365|  42.7k|            rep_macro(edge->mode, off, DC_PRED); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  42.7k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  42.7k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1366|  42.7k|            rep_macro(edge->pal_sz, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  42.7k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  42.7k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1367|  42.7k|            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
  |  |  |  | 1368|  42.7k|            rep_macro(t->pal_sz_uv[i], off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  42.7k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  42.7k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1369|  42.7k|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  42.7k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  42.7k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1370|  42.7k|            rep_macro(edge->skip_mode, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  42.7k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  42.7k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1371|  42.7k|            rep_macro(edge->intra, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  42.7k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  42.7k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1372|  42.7k|            rep_macro(edge->skip, off, b->skip)
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  42.7k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  42.7k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (74:5): [True: 42.7k, False: 1.87M]
  |  |  ------------------
  |  |   75|  9.25k|    case 4: set_ctx(set_ctx16); break; \
  |  |  ------------------
  |  |  |  | 1364|  9.25k|            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  9.25k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  9.25k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  9.25k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  9.25k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 9.25k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1365|  9.25k|            rep_macro(edge->mode, off, DC_PRED); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  9.25k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  9.25k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  9.25k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  9.25k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 9.25k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1366|  9.25k|            rep_macro(edge->pal_sz, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  9.25k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  9.25k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  9.25k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  9.25k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 9.25k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1367|  9.25k|            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
  |  |  |  | 1368|  9.25k|            rep_macro(t->pal_sz_uv[i], off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  9.25k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  9.25k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  9.25k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  9.25k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 9.25k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1369|  9.25k|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  9.25k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  9.25k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  9.25k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  9.25k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 9.25k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1370|  9.25k|            rep_macro(edge->skip_mode, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  9.25k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  9.25k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  9.25k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  9.25k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 9.25k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1371|  9.25k|            rep_macro(edge->intra, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  9.25k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  9.25k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  9.25k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  9.25k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 9.25k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1372|  9.25k|            rep_macro(edge->skip, off, b->skip)
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  9.25k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  9.25k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  9.25k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  9.25k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 9.25k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (75:5): [True: 9.25k, False: 1.91M]
  |  |  ------------------
  |  |   76|  1.36k|    case 5: set_ctx(set_ctx32); break; \
  |  |  ------------------
  |  |  |  | 1364|  1.36k|            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  1.36k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  1.36k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  1.36k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|  1.36k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 1.36k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1365|  1.36k|            rep_macro(edge->mode, off, DC_PRED); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  1.36k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  1.36k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  1.36k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|  1.36k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 1.36k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1366|  1.36k|            rep_macro(edge->pal_sz, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  1.36k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  1.36k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  1.36k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|  1.36k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 1.36k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1367|  1.36k|            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
  |  |  |  | 1368|  1.36k|            rep_macro(t->pal_sz_uv[i], off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  1.36k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  1.36k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  1.36k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|  1.36k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 1.36k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1369|  1.36k|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  1.36k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  1.36k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  1.36k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|  1.36k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 1.36k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1370|  1.36k|            rep_macro(edge->skip_mode, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  1.36k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  1.36k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  1.36k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|  1.36k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 1.36k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1371|  1.36k|            rep_macro(edge->intra, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  1.36k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  1.36k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  1.36k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|  1.36k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 1.36k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1372|  1.36k|            rep_macro(edge->skip, off, b->skip)
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|  1.36k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|  1.36k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|  1.36k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|  1.36k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 1.36k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (76:5): [True: 1.36k, False: 1.91M]
  |  |  ------------------
  |  |   77|      0|    default: assert(0); \
  |  |  ------------------
  |  |  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, Folded]
  |  |  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (77:5): [True: 0, False: 1.92M]
  |  |  ------------------
  |  |   78|  1.92M|    }
  ------------------
 1374|  1.91M|#undef set_ctx
 1375|  1.91M|        }
 1376|   951k|        if (has_chroma) {
  ------------------
  |  Branch (1376:13): [True: 892k, False: 59.4k]
  ------------------
 1377|   892k|            dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], DC_PRED);
 1378|   892k|            dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], DC_PRED);
 1379|   892k|        }
 1380|   951k|    } else {
 1381|       |        // inter-specific mode/mv coding
 1382|  91.6k|        int is_comp, has_subpel_filter;
 1383|       |
 1384|  91.6k|        if (b->skip_mode) {
  ------------------
  |  Branch (1384:13): [True: 356, False: 91.3k]
  ------------------
 1385|    356|            is_comp = 1;
 1386|  91.3k|        } else if ((!seg || (seg->ref == -1 && !seg->globalmv && !seg->skip)) &&
  ------------------
  |  Branch (1386:21): [True: 76.4k, False: 14.8k]
  |  Branch (1386:30): [True: 13.8k, False: 970]
  |  Branch (1386:48): [True: 4.31k, False: 9.57k]
  |  Branch (1386:66): [True: 3.50k, False: 807]
  ------------------
 1387|  74.0k|                   f->frame_hdr->switchable_comp_refs && imin(bw4, bh4) > 1)
  ------------------
  |  Branch (1387:20): [True: 40.2k, False: 33.8k]
  |  Branch (1387:58): [True: 26.6k, False: 13.6k]
  ------------------
 1388|  26.6k|        {
 1389|  26.6k|            const int ctx = get_comp_ctx(t->a, &t->l, by4, bx4,
 1390|  26.6k|                                         have_top, have_left);
 1391|  26.6k|            is_comp = dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  26.6k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1392|  26.6k|                          ts->cdf.m.comp[ctx]);
 1393|  26.6k|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  26.6k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 26.6k]
  |  |  ------------------
  |  |   35|  26.6k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  26.6k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1394|      0|                printf("Post-compflag[%d]: r=%d\n", is_comp, ts->msac.rng);
 1395|  64.6k|        } else {
 1396|  64.6k|            is_comp = 0;
 1397|  64.6k|        }
 1398|       |
 1399|  91.6k|        if (b->skip_mode) {
  ------------------
  |  Branch (1399:13): [True: 356, False: 91.3k]
  ------------------
 1400|    356|            b->ref[0] = f->frame_hdr->skip_mode_refs[0];
 1401|    356|            b->ref[1] = f->frame_hdr->skip_mode_refs[1];
 1402|    356|            b->comp_type = COMP_INTER_AVG;
 1403|    356|            b->inter_mode = NEARESTMV_NEARESTMV;
 1404|    356|            b->drl_idx = NEAREST_DRL;
 1405|    356|            has_subpel_filter = 0;
 1406|       |
 1407|    356|            refmvs_candidate mvstack[8];
 1408|    356|            int n_mvs, ctx;
 1409|    356|            dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
 1410|    356|                              (union refmvs_refpair) { .ref = {
 1411|    356|                                    b->ref[0] + 1, b->ref[1] + 1 }},
 1412|    356|                              bs, intra_edge_flags, t->by, t->bx);
 1413|       |
 1414|    356|            b->mv[0] = mvstack[0].mv.mv[0];
 1415|    356|            b->mv[1] = mvstack[0].mv.mv[1];
 1416|    356|            fix_mv_precision(f->frame_hdr, &b->mv[0]);
 1417|    356|            fix_mv_precision(f->frame_hdr, &b->mv[1]);
 1418|    356|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|    356|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 356]
  |  |  ------------------
  |  |   35|    356|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|    356|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1419|      0|                printf("Post-skipmodeblock[mv=1:y=%d,x=%d,2:y=%d,x=%d,refs=%d+%d\n",
 1420|      0|                       b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x,
 1421|      0|                       b->ref[0], b->ref[1]);
 1422|  91.3k|        } else if (is_comp) {
  ------------------
  |  Branch (1422:20): [True: 14.7k, False: 76.5k]
  ------------------
 1423|  14.7k|            const int dir_ctx = get_comp_dir_ctx(t->a, &t->l, by4, bx4,
 1424|  14.7k|                                                 have_top, have_left);
 1425|  14.7k|            if (dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  14.7k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  |  Branch (1425:17): [True: 12.0k, False: 2.75k]
  ------------------
 1426|  14.7k|                    ts->cdf.m.comp_dir[dir_ctx]))
 1427|  12.0k|            {
 1428|       |                // bidir - first reference (fw)
 1429|  12.0k|                const int ctx1 = av1_get_fwd_ref_ctx(t->a, &t->l, by4, bx4,
 1430|  12.0k|                                                     have_top, have_left);
 1431|  12.0k|                if (dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  12.0k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  |  Branch (1431:21): [True: 4.96k, False: 7.05k]
  ------------------
 1432|  12.0k|                        ts->cdf.m.comp_fwd_ref[0][ctx1]))
 1433|  4.96k|                {
 1434|  4.96k|                    const int ctx2 = av1_get_fwd_ref_2_ctx(t->a, &t->l, by4, bx4,
 1435|  4.96k|                                                           have_top, have_left);
 1436|  4.96k|                    b->ref[0] = 2 + dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  4.96k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1437|  4.96k|                                        ts->cdf.m.comp_fwd_ref[2][ctx2]);
 1438|  7.05k|                } else {
 1439|  7.05k|                    const int ctx2 = av1_get_fwd_ref_1_ctx(t->a, &t->l, by4, bx4,
 1440|  7.05k|                                                           have_top, have_left);
 1441|  7.05k|                    b->ref[0] = dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  7.05k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1442|  7.05k|                                    ts->cdf.m.comp_fwd_ref[1][ctx2]);
 1443|  7.05k|                }
 1444|       |
 1445|       |                // second reference (bw)
 1446|  12.0k|                const int ctx3 = av1_get_bwd_ref_ctx(t->a, &t->l, by4, bx4,
 1447|  12.0k|                                                     have_top, have_left);
 1448|  12.0k|                if (dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  12.0k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  |  Branch (1448:21): [True: 7.16k, False: 4.85k]
  ------------------
 1449|  12.0k|                        ts->cdf.m.comp_bwd_ref[0][ctx3]))
 1450|  7.16k|                {
 1451|  7.16k|                    b->ref[1] = 6;
 1452|  7.16k|                } else {
 1453|  4.85k|                    const int ctx4 = av1_get_bwd_ref_1_ctx(t->a, &t->l, by4, bx4,
 1454|  4.85k|                                                           have_top, have_left);
 1455|  4.85k|                    b->ref[1] = 4 + dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  4.85k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1456|  4.85k|                                        ts->cdf.m.comp_bwd_ref[1][ctx4]);
 1457|  4.85k|                }
 1458|  12.0k|            } else {
 1459|       |                // unidir
 1460|  2.75k|                const int uctx_p = av1_get_uni_p_ctx(t->a, &t->l, by4, bx4,
  ------------------
  |  |  280|  2.75k|#define av1_get_uni_p_ctx av1_get_ref_ctx
  ------------------
 1461|  2.75k|                                                     have_top, have_left);
 1462|  2.75k|                if (dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  2.75k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  |  Branch (1462:21): [True: 750, False: 2.00k]
  ------------------
 1463|  2.75k|                        ts->cdf.m.comp_uni_ref[0][uctx_p]))
 1464|    750|                {
 1465|    750|                    b->ref[0] = 4;
 1466|    750|                    b->ref[1] = 6;
 1467|  2.00k|                } else {
 1468|  2.00k|                    const int uctx_p1 = av1_get_uni_p1_ctx(t->a, &t->l, by4, bx4,
 1469|  2.00k|                                                           have_top, have_left);
 1470|  2.00k|                    b->ref[0] = 0;
 1471|  2.00k|                    b->ref[1] = 1 + dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  2.00k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1472|  2.00k|                                        ts->cdf.m.comp_uni_ref[1][uctx_p1]);
 1473|  2.00k|                    if (b->ref[1] == 2) {
  ------------------
  |  Branch (1473:25): [True: 1.28k, False: 722]
  ------------------
 1474|  1.28k|                        const int uctx_p2 = av1_get_uni_p2_ctx(t->a, &t->l, by4, bx4,
  ------------------
  |  |  281|  1.28k|#define av1_get_uni_p2_ctx av1_get_fwd_ref_2_ctx
  ------------------
 1475|  1.28k|                                                               have_top, have_left);
 1476|  1.28k|                        b->ref[1] += dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  1.28k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1477|  1.28k|                                         ts->cdf.m.comp_uni_ref[2][uctx_p2]);
 1478|  1.28k|                    }
 1479|  2.00k|                }
 1480|  2.75k|            }
 1481|  14.7k|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  14.7k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 14.7k]
  |  |  ------------------
  |  |   35|  14.7k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  14.7k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1482|      0|                printf("Post-refs[%d/%d]: r=%d\n",
 1483|      0|                       b->ref[0], b->ref[1], ts->msac.rng);
 1484|       |
 1485|  14.7k|            refmvs_candidate mvstack[8];
 1486|  14.7k|            int n_mvs, ctx;
 1487|  14.7k|            dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
 1488|  14.7k|                              (union refmvs_refpair) { .ref = {
 1489|  14.7k|                                    b->ref[0] + 1, b->ref[1] + 1 }},
 1490|  14.7k|                              bs, intra_edge_flags, t->by, t->bx);
 1491|       |
 1492|  14.7k|            b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac,
  ------------------
  |  |   48|  14.7k|#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
  ------------------
 1493|  14.7k|                                ts->cdf.m.comp_inter_mode[ctx],
 1494|  14.7k|                                N_COMP_INTER_PRED_MODES - 1);
 1495|  14.7k|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  14.7k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 14.7k]
  |  |  ------------------
  |  |   35|  14.7k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  14.7k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1496|      0|                printf("Post-compintermode[%d,ctx=%d,n_mvs=%d]: r=%d\n",
 1497|      0|                       b->inter_mode, ctx, n_mvs, ts->msac.rng);
 1498|       |
 1499|  14.7k|            const uint8_t *const im = dav1d_comp_inter_pred_modes[b->inter_mode];
 1500|  14.7k|            b->drl_idx = NEAREST_DRL;
 1501|  14.7k|            if (b->inter_mode == NEWMV_NEWMV) {
  ------------------
  |  Branch (1501:17): [True: 3.48k, False: 11.2k]
  ------------------
 1502|  3.48k|                if (n_mvs > 1) { // NEARER, NEAR or NEARISH
  ------------------
  |  Branch (1502:21): [True: 3.48k, False: 0]
  ------------------
 1503|  3.48k|                    const int drl_ctx_v1 = get_drl_context(mvstack, 0);
 1504|  3.48k|                    b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  3.48k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1505|  3.48k|                                      ts->cdf.m.drl_bit[drl_ctx_v1]);
 1506|  3.48k|                    if (b->drl_idx == NEARER_DRL && n_mvs > 2) {
  ------------------
  |  Branch (1506:25): [True: 2.31k, False: 1.17k]
  |  Branch (1506:53): [True: 841, False: 1.47k]
  ------------------
 1507|    841|                        const int drl_ctx_v2 = get_drl_context(mvstack, 1);
 1508|    841|                        b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|    841|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1509|    841|                                          ts->cdf.m.drl_bit[drl_ctx_v2]);
 1510|    841|                    }
 1511|  3.48k|                    if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  3.48k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 3.48k]
  |  |  ------------------
  |  |   35|  3.48k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  3.48k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1512|      0|                        printf("Post-drlidx[%d,n_mvs=%d]: r=%d\n",
 1513|      0|                               b->drl_idx, n_mvs, ts->msac.rng);
 1514|  3.48k|                }
 1515|  11.2k|            } else if (im[0] == NEARMV || im[1] == NEARMV) {
  ------------------
  |  Branch (1515:24): [True: 2.93k, False: 8.35k]
  |  Branch (1515:43): [True: 378, False: 7.97k]
  ------------------
 1516|  3.31k|                b->drl_idx = NEARER_DRL;
 1517|  3.31k|                if (n_mvs > 2) { // NEAR or NEARISH
  ------------------
  |  Branch (1517:21): [True: 272, False: 3.04k]
  ------------------
 1518|    272|                    const int drl_ctx_v2 = get_drl_context(mvstack, 1);
 1519|    272|                    b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|    272|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1520|    272|                                      ts->cdf.m.drl_bit[drl_ctx_v2]);
 1521|    272|                    if (b->drl_idx == NEAR_DRL && n_mvs > 3) {
  ------------------
  |  Branch (1521:25): [True: 94, False: 178]
  |  Branch (1521:51): [True: 22, False: 72]
  ------------------
 1522|     22|                        const int drl_ctx_v3 = get_drl_context(mvstack, 2);
 1523|     22|                        b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|     22|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1524|     22|                                          ts->cdf.m.drl_bit[drl_ctx_v3]);
 1525|     22|                    }
 1526|    272|                    if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|    272|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 272]
  |  |  ------------------
  |  |   35|    272|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|    272|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1527|      0|                        printf("Post-drlidx[%d,n_mvs=%d]: r=%d\n",
 1528|      0|                               b->drl_idx, n_mvs, ts->msac.rng);
 1529|    272|                }
 1530|  3.31k|            }
 1531|  14.7k|            assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
  ------------------
  |  |  140|  29.5k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 14.7k, False: 3]
  |  |  |  Branch (140:30): [True: 14.7k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 14.7k]
  |  |  ------------------
  ------------------
 1532|       |
 1533|  14.7k|#define assign_comp_mv(idx) \
 1534|  14.7k|            switch (im[idx]) { \
 1535|  14.7k|            case NEARMV: \
 1536|  14.7k|            case NEARESTMV: \
 1537|  14.7k|                b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \
 1538|  14.7k|                fix_mv_precision(f->frame_hdr, &b->mv[idx]); \
 1539|  14.7k|                break; \
 1540|  14.7k|            case GLOBALMV: \
 1541|  14.7k|                has_subpel_filter |= \
 1542|  14.7k|                    f->frame_hdr->gmv[b->ref[idx]].type == DAV1D_WM_TYPE_TRANSLATION; \
 1543|  14.7k|                b->mv[idx] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[idx]], \
 1544|  14.7k|                                        t->bx, t->by, bw4, bh4, f->frame_hdr); \
 1545|  14.7k|                break; \
 1546|  14.7k|            case NEWMV: \
 1547|  14.7k|                b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \
 1548|  14.7k|                const int mv_prec = f->frame_hdr->hp - f->frame_hdr->force_integer_mv; \
 1549|  14.7k|                read_mv_residual(ts, &b->mv[idx], mv_prec); \
 1550|  14.7k|                break; \
 1551|  14.7k|            }
 1552|  14.7k|            has_subpel_filter = imin(bw4, bh4) == 1 ||
  ------------------
  |  Branch (1552:33): [True: 3, False: 14.7k]
  ------------------
 1553|  14.7k|                                b->inter_mode != GLOBALMV_GLOBALMV;
  ------------------
  |  Branch (1553:33): [True: 13.1k, False: 1.60k]
  ------------------
 1554|  14.7k|            assign_comp_mv(0);
  ------------------
  |  | 1534|  14.7k|            switch (im[idx]) { \
  |  |  ------------------
  |  |  |  Branch (1534:21): [True: 14.7k, False: 0]
  |  |  ------------------
  |  | 1535|  2.93k|            case NEARMV: \
  |  |  ------------------
  |  |  |  Branch (1535:13): [True: 2.93k, False: 11.8k]
  |  |  ------------------
  |  | 1536|  8.48k|            case NEARESTMV: \
  |  |  ------------------
  |  |  |  Branch (1536:13): [True: 5.54k, False: 9.23k]
  |  |  ------------------
  |  | 1537|  8.48k|                b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \
  |  | 1538|  8.48k|                fix_mv_precision(f->frame_hdr, &b->mv[idx]); \
  |  | 1539|  8.48k|                break; \
  |  | 1540|  2.93k|            case GLOBALMV: \
  |  |  ------------------
  |  |  |  Branch (1540:13): [True: 1.60k, False: 13.1k]
  |  |  ------------------
  |  | 1541|  1.60k|                has_subpel_filter |= \
  |  | 1542|  1.60k|                    f->frame_hdr->gmv[b->ref[idx]].type == DAV1D_WM_TYPE_TRANSLATION; \
  |  | 1543|  1.60k|                b->mv[idx] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[idx]], \
  |  | 1544|  1.60k|                                        t->bx, t->by, bw4, bh4, f->frame_hdr); \
  |  | 1545|  1.60k|                break; \
  |  | 1546|  4.69k|            case NEWMV: \
  |  |  ------------------
  |  |  |  Branch (1546:13): [True: 4.69k, False: 10.0k]
  |  |  ------------------
  |  | 1547|  4.69k|                b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \
  |  | 1548|  4.69k|                const int mv_prec = f->frame_hdr->hp - f->frame_hdr->force_integer_mv; \
  |  | 1549|  4.69k|                read_mv_residual(ts, &b->mv[idx], mv_prec); \
  |  | 1550|  4.69k|                break; \
  |  | 1551|  14.7k|            }
  ------------------
 1555|  14.7k|            assign_comp_mv(1);
  ------------------
  |  | 1534|  14.7k|            switch (im[idx]) { \
  |  |  ------------------
  |  |  |  Branch (1534:21): [True: 14.7k, False: 18.4E]
  |  |  ------------------
  |  | 1535|  2.85k|            case NEARMV: \
  |  |  ------------------
  |  |  |  Branch (1535:13): [True: 2.85k, False: 11.9k]
  |  |  ------------------
  |  | 1536|  8.32k|            case NEARESTMV: \
  |  |  ------------------
  |  |  |  Branch (1536:13): [True: 5.47k, False: 9.30k]
  |  |  ------------------
  |  | 1537|  8.32k|                b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \
  |  | 1538|  8.32k|                fix_mv_precision(f->frame_hdr, &b->mv[idx]); \
  |  | 1539|  8.32k|                break; \
  |  | 1540|  2.85k|            case GLOBALMV: \
  |  |  ------------------
  |  |  |  Branch (1540:13): [True: 1.60k, False: 13.1k]
  |  |  ------------------
  |  | 1541|  1.60k|                has_subpel_filter |= \
  |  | 1542|  1.60k|                    f->frame_hdr->gmv[b->ref[idx]].type == DAV1D_WM_TYPE_TRANSLATION; \
  |  | 1543|  1.60k|                b->mv[idx] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[idx]], \
  |  | 1544|  1.60k|                                        t->bx, t->by, bw4, bh4, f->frame_hdr); \
  |  | 1545|  1.60k|                break; \
  |  | 1546|  4.84k|            case NEWMV: \
  |  |  ------------------
  |  |  |  Branch (1546:13): [True: 4.84k, False: 9.93k]
  |  |  ------------------
  |  | 1547|  4.84k|                b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \
  |  | 1548|  4.84k|                const int mv_prec = f->frame_hdr->hp - f->frame_hdr->force_integer_mv; \
  |  | 1549|  4.84k|                read_mv_residual(ts, &b->mv[idx], mv_prec); \
  |  | 1550|  4.84k|                break; \
  |  | 1551|  14.7k|            }
  ------------------
 1556|  14.7k|#undef assign_comp_mv
 1557|  14.7k|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  14.7k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 14.7k]
  |  |  ------------------
  |  |   35|  14.7k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  14.7k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1558|      0|                printf("Post-residual_mv[1:y=%d,x=%d,2:y=%d,x=%d]: r=%d\n",
 1559|      0|                       b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x,
 1560|      0|                       ts->msac.rng);
 1561|       |
 1562|       |            // jnt_comp vs. seg vs. wedge
 1563|  14.7k|            int is_segwedge = 0;
 1564|  14.7k|            if (f->seq_hdr->masked_compound) {
  ------------------
  |  Branch (1564:17): [True: 7.78k, False: 6.99k]
  ------------------
 1565|  7.78k|                const int mask_ctx = get_mask_comp_ctx(t->a, &t->l, by4, bx4);
 1566|       |
 1567|  7.78k|                is_segwedge = dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  7.78k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1568|  7.78k|                                  ts->cdf.m.mask_comp[mask_ctx]);
 1569|  7.78k|                if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  7.78k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 7.78k]
  |  |  ------------------
  |  |   35|  7.78k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  7.78k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1570|      0|                    printf("Post-segwedge_vs_jntavg[%d,ctx=%d]: r=%d\n",
 1571|      0|                           is_segwedge, mask_ctx, ts->msac.rng);
 1572|  7.78k|            }
 1573|       |
 1574|  14.7k|            if (!is_segwedge) {
  ------------------
  |  Branch (1574:17): [True: 12.3k, False: 2.42k]
  ------------------
 1575|  12.3k|                if (f->seq_hdr->jnt_comp) {
  ------------------
  |  Branch (1575:21): [True: 7.65k, False: 4.70k]
  ------------------
 1576|  7.65k|                    const int jnt_ctx =
 1577|  7.65k|                        get_jnt_comp_ctx(f->seq_hdr->order_hint_n_bits,
 1578|  7.65k|                                         f->cur.frame_hdr->frame_offset,
 1579|  7.65k|                                         f->refp[b->ref[0]].p.frame_hdr->frame_offset,
 1580|  7.65k|                                         f->refp[b->ref[1]].p.frame_hdr->frame_offset,
 1581|  7.65k|                                         t->a, &t->l, by4, bx4);
 1582|  7.65k|                    b->comp_type = COMP_INTER_WEIGHTED_AVG +
 1583|  7.65k|                                   dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  7.65k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1584|  7.65k|                                       ts->cdf.m.jnt_comp[jnt_ctx]);
 1585|  7.65k|                    if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  7.65k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 7.65k]
  |  |  ------------------
  |  |   35|  7.65k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  7.65k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1586|      0|                        printf("Post-jnt_comp[%d,ctx=%d[ac:%d,ar:%d,lc:%d,lr:%d]]: r=%d\n",
 1587|      0|                               b->comp_type == COMP_INTER_AVG,
 1588|      0|                               jnt_ctx, t->a->comp_type[bx4], t->a->ref[0][bx4],
 1589|      0|                               t->l.comp_type[by4], t->l.ref[0][by4],
 1590|      0|                               ts->msac.rng);
 1591|  7.65k|                } else {
 1592|  4.70k|                    b->comp_type = COMP_INTER_AVG;
 1593|  4.70k|                }
 1594|  12.3k|            } else {
 1595|  2.42k|                if (wedge_allowed_mask & (1 << bs)) {
  ------------------
  |  Branch (1595:21): [True: 2.23k, False: 190]
  ------------------
 1596|  2.23k|                    const int ctx = dav1d_wedge_ctx_lut[bs];
 1597|  2.23k|                    b->comp_type = COMP_INTER_WEDGE -
 1598|  2.23k|                                   dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  2.23k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1599|  2.23k|                                       ts->cdf.m.wedge_comp[ctx]);
 1600|  2.23k|                    if (b->comp_type == COMP_INTER_WEDGE)
  ------------------
  |  Branch (1600:25): [True: 631, False: 1.60k]
  ------------------
 1601|    631|                        b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
  ------------------
  |  |   57|    631|#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
  ------------------
 1602|  2.23k|                                           ts->cdf.m.wedge_idx[ctx], 15);
 1603|  2.23k|                } else {
 1604|    190|                    b->comp_type = COMP_INTER_SEG;
 1605|    190|                }
 1606|  2.42k|                b->mask_sign = dav1d_msac_decode_bool_equi(&ts->msac);
  ------------------
  |  |   53|  2.42k|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
 1607|  2.42k|                if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  2.42k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 2.42k]
  |  |  ------------------
  |  |   35|  2.42k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  2.42k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1608|      0|                    printf("Post-seg/wedge[%d,wedge_idx=%d,sign=%d]: r=%d\n",
 1609|      0|                           b->comp_type == COMP_INTER_WEDGE,
 1610|      0|                           b->wedge_idx, b->mask_sign, ts->msac.rng);
 1611|  2.42k|            }
 1612|  76.5k|        } else {
 1613|  76.5k|            b->comp_type = COMP_INTER_NONE;
 1614|       |
 1615|       |            // ref
 1616|  76.5k|            if (seg && seg->ref > 0) {
  ------------------
  |  Branch (1616:17): [True: 14.3k, False: 62.1k]
  |  Branch (1616:24): [True: 970, False: 13.3k]
  ------------------
 1617|    970|                b->ref[0] = seg->ref - 1;
 1618|  75.5k|            } else if (seg && (seg->globalmv || seg->skip)) {
  ------------------
  |  Branch (1618:24): [True: 13.3k, False: 62.1k]
  |  Branch (1618:32): [True: 9.57k, False: 3.81k]
  |  Branch (1618:49): [True: 807, False: 3.01k]
  ------------------
 1619|  10.3k|                b->ref[0] = 0;
 1620|  65.1k|            } else {
 1621|  65.1k|                const int ctx1 = av1_get_ref_ctx(t->a, &t->l, by4, bx4,
 1622|  65.1k|                                                 have_top, have_left);
 1623|  65.1k|                if (dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  65.1k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  |  Branch (1623:21): [True: 28.7k, False: 36.4k]
  ------------------
 1624|  65.1k|                                                 ts->cdf.m.ref[0][ctx1]))
 1625|  28.7k|                {
 1626|  28.7k|                    const int ctx2 = av1_get_ref_2_ctx(t->a, &t->l, by4, bx4,
  ------------------
  |  |  275|  28.7k|#define av1_get_ref_2_ctx av1_get_bwd_ref_ctx
  ------------------
 1627|  28.7k|                                                       have_top, have_left);
 1628|  28.7k|                    if (dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  28.7k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  |  Branch (1628:25): [True: 19.0k, False: 9.71k]
  ------------------
 1629|  28.7k|                                                     ts->cdf.m.ref[1][ctx2]))
 1630|  19.0k|                    {
 1631|  19.0k|                        b->ref[0] = 6;
 1632|  19.0k|                    } else {
 1633|  9.71k|                        const int ctx3 = av1_get_ref_6_ctx(t->a, &t->l, by4, bx4,
  ------------------
  |  |  279|  9.71k|#define av1_get_ref_6_ctx av1_get_bwd_ref_1_ctx
  ------------------
 1634|  9.71k|                                                           have_top, have_left);
 1635|  9.71k|                        b->ref[0] = 4 + dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  9.71k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1636|  9.71k|                                            ts->cdf.m.ref[5][ctx3]);
 1637|  9.71k|                    }
 1638|  36.4k|                } else {
 1639|  36.4k|                    const int ctx2 = av1_get_ref_3_ctx(t->a, &t->l, by4, bx4,
  ------------------
  |  |  276|  36.4k|#define av1_get_ref_3_ctx av1_get_fwd_ref_ctx
  ------------------
 1640|  36.4k|                                                       have_top, have_left);
 1641|  36.4k|                    if (dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  36.4k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  |  Branch (1641:25): [True: 8.68k, False: 27.7k]
  ------------------
 1642|  36.4k|                                                     ts->cdf.m.ref[2][ctx2]))
 1643|  8.68k|                    {
 1644|  8.68k|                        const int ctx3 = av1_get_ref_5_ctx(t->a, &t->l, by4, bx4,
  ------------------
  |  |  278|  8.68k|#define av1_get_ref_5_ctx av1_get_fwd_ref_2_ctx
  ------------------
 1645|  8.68k|                                                           have_top, have_left);
 1646|  8.68k|                        b->ref[0] = 2 + dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  8.68k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1647|  8.68k|                                            ts->cdf.m.ref[4][ctx3]);
 1648|  27.7k|                    } else {
 1649|  27.7k|                        const int ctx3 = av1_get_ref_4_ctx(t->a, &t->l, by4, bx4,
  ------------------
  |  |  277|  27.7k|#define av1_get_ref_4_ctx av1_get_fwd_ref_1_ctx
  ------------------
 1650|  27.7k|                                                           have_top, have_left);
 1651|  27.7k|                        b->ref[0] = dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  27.7k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1652|  27.7k|                                        ts->cdf.m.ref[3][ctx3]);
 1653|  27.7k|                    }
 1654|  36.4k|                }
 1655|  65.1k|                if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  65.1k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 65.1k]
  |  |  ------------------
  |  |   35|  65.1k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  65.1k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1656|      0|                    printf("Post-ref[%d]: r=%d\n", b->ref[0], ts->msac.rng);
 1657|  65.1k|            }
 1658|  76.5k|            b->ref[1] = -1;
 1659|       |
 1660|  76.5k|            refmvs_candidate mvstack[8];
 1661|  76.5k|            int n_mvs, ctx;
 1662|  76.5k|            dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
 1663|  76.5k|                              (union refmvs_refpair) { .ref = { b->ref[0] + 1, -1 }},
 1664|  76.5k|                              bs, intra_edge_flags, t->by, t->bx);
 1665|       |
 1666|       |            // mode parsing and mv derivation from ref_mvs
 1667|  76.5k|            if ((seg && (seg->skip || seg->globalmv)) ||
  ------------------
  |  Branch (1667:18): [True: 14.3k, False: 62.1k]
  |  Branch (1667:26): [True: 10.4k, False: 3.90k]
  |  Branch (1667:39): [True: 825, False: 3.08k]
  ------------------
 1668|  59.3k|                dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  59.3k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  |  Branch (1668:17): [True: 38.2k, False: 21.1k]
  ------------------
 1669|  59.3k|                                             ts->cdf.m.newmv_mode[ctx & 7]))
 1670|  49.5k|            {
 1671|  49.5k|                if ((seg && (seg->skip || seg->globalmv)) ||
  ------------------
  |  Branch (1671:22): [True: 13.1k, False: 36.4k]
  |  Branch (1671:30): [True: 10.4k, False: 2.67k]
  |  Branch (1671:43): [True: 825, False: 1.84k]
  ------------------
 1672|  38.2k|                    !dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  38.2k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  |  Branch (1672:21): [True: 1.30k, False: 36.9k]
  ------------------
 1673|  38.2k|                         ts->cdf.m.globalmv_mode[(ctx >> 3) & 1]))
 1674|  12.5k|                {
 1675|  12.5k|                    b->inter_mode = GLOBALMV;
 1676|  12.5k|                    b->mv[0] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[0]],
 1677|  12.5k|                                          t->bx, t->by, bw4, bh4, f->frame_hdr);
 1678|  12.5k|                    has_subpel_filter = imin(bw4, bh4) == 1 ||
  ------------------
  |  Branch (1678:41): [True: 5.46k, False: 7.13k]
  ------------------
 1679|  7.13k|                        f->frame_hdr->gmv[b->ref[0]].type == DAV1D_WM_TYPE_TRANSLATION;
  ------------------
  |  Branch (1679:25): [True: 3.57k, False: 3.55k]
  ------------------
 1680|  36.9k|                } else {
 1681|  36.9k|                    has_subpel_filter = 1;
 1682|  36.9k|                    if (dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  36.9k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  |  Branch (1682:25): [True: 14.2k, False: 22.7k]
  ------------------
 1683|  36.9k|                            ts->cdf.m.refmv_mode[(ctx >> 4) & 15]))
 1684|  14.2k|                    { // NEAREST, NEARER, NEAR or NEARISH
 1685|  14.2k|                        b->inter_mode = NEARMV;
 1686|  14.2k|                        b->drl_idx = NEARER_DRL;
 1687|  14.2k|                        if (n_mvs > 2) { // NEARER, NEAR or NEARISH
  ------------------
  |  Branch (1687:29): [True: 3.56k, False: 10.6k]
  ------------------
 1688|  3.56k|                            const int drl_ctx_v2 = get_drl_context(mvstack, 1);
 1689|  3.56k|                            b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  3.56k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1690|  3.56k|                                              ts->cdf.m.drl_bit[drl_ctx_v2]);
 1691|  3.56k|                            if (b->drl_idx == NEAR_DRL && n_mvs > 3) { // NEAR or NEARISH
  ------------------
  |  Branch (1691:33): [True: 2.25k, False: 1.30k]
  |  Branch (1691:59): [True: 870, False: 1.38k]
  ------------------
 1692|    870|                                const int drl_ctx_v3 =
 1693|    870|                                    get_drl_context(mvstack, 2);
 1694|    870|                                b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|    870|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1695|    870|                                                  ts->cdf.m.drl_bit[drl_ctx_v3]);
 1696|    870|                            }
 1697|  3.56k|                        }
 1698|  22.7k|                    } else {
 1699|  22.7k|                        b->inter_mode = NEARESTMV;
 1700|  22.7k|                        b->drl_idx = NEAREST_DRL;
 1701|  22.7k|                    }
 1702|  36.9k|                    assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
  ------------------
  |  |  140|  73.9k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 36.9k, False: 18.4E]
  |  |  |  Branch (140:30): [True: 36.9k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 36.9k]
  |  |  ------------------
  ------------------
 1703|  36.9k|                    b->mv[0] = mvstack[b->drl_idx].mv.mv[0];
 1704|  36.9k|                    if (b->drl_idx < NEAR_DRL)
  ------------------
  |  Branch (1704:25): [True: 34.7k, False: 2.25k]
  ------------------
 1705|  34.7k|                        fix_mv_precision(f->frame_hdr, &b->mv[0]);
 1706|  36.9k|                }
 1707|       |
 1708|  49.5k|                if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  49.5k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 49.5k]
  |  |  ------------------
  |  |   35|  49.5k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  49.5k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1709|      0|                    printf("Post-intermode[%d,drl=%d,mv=y:%d,x:%d,n_mvs=%d]: r=%d\n",
 1710|      0|                           b->inter_mode, b->drl_idx, b->mv[0].y, b->mv[0].x, n_mvs,
 1711|      0|                           ts->msac.rng);
 1712|  49.5k|            } else {
 1713|  26.9k|                has_subpel_filter = 1;
 1714|  26.9k|                b->inter_mode = NEWMV;
 1715|  26.9k|                b->drl_idx = NEAREST_DRL;
 1716|  26.9k|                if (n_mvs > 1) { // NEARER, NEAR or NEARISH
  ------------------
  |  Branch (1716:21): [True: 12.3k, False: 14.6k]
  ------------------
 1717|  12.3k|                    const int drl_ctx_v1 = get_drl_context(mvstack, 0);
 1718|  12.3k|                    b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  12.3k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1719|  12.3k|                                      ts->cdf.m.drl_bit[drl_ctx_v1]);
 1720|  12.3k|                    if (b->drl_idx == NEARER_DRL && n_mvs > 2) { // NEAR or NEARISH
  ------------------
  |  Branch (1720:25): [True: 5.11k, False: 7.20k]
  |  Branch (1720:53): [True: 1.86k, False: 3.25k]
  ------------------
 1721|  1.86k|                        const int drl_ctx_v2 = get_drl_context(mvstack, 1);
 1722|  1.86k|                        b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  1.86k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1723|  1.86k|                                          ts->cdf.m.drl_bit[drl_ctx_v2]);
 1724|  1.86k|                    }
 1725|  12.3k|                }
 1726|  26.9k|                assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
  ------------------
  |  |  140|  48.0k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 21.1k, False: 5.87k]
  |  |  |  Branch (140:30): [True: 21.1k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 26.9k]
  |  |  ------------------
  ------------------
 1727|  26.9k|                if (n_mvs > 1) {
  ------------------
  |  Branch (1727:21): [True: 12.3k, False: 14.6k]
  ------------------
 1728|  12.3k|                    b->mv[0] = mvstack[b->drl_idx].mv.mv[0];
 1729|  14.6k|                } else {
 1730|  14.6k|                    assert(!b->drl_idx);
  ------------------
  |  |  140|  14.6k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 14.6k]
  |  |  |  Branch (140:68): [Folded, False: 14.6k]
  |  |  ------------------
  ------------------
 1731|  14.6k|                    b->mv[0] = mvstack[0].mv.mv[0];
 1732|  14.6k|                    fix_mv_precision(f->frame_hdr, &b->mv[0]);
 1733|  14.6k|                }
 1734|  26.9k|                if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  26.9k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 26.9k]
  |  |  ------------------
  |  |   35|  26.9k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  26.9k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1735|      0|                    printf("Post-intermode[%d,drl=%d]: r=%d\n",
 1736|      0|                           b->inter_mode, b->drl_idx, ts->msac.rng);
 1737|  26.9k|                const int mv_prec = f->frame_hdr->hp - f->frame_hdr->force_integer_mv;
 1738|  26.9k|                read_mv_residual(ts, &b->mv[0], mv_prec);
 1739|  26.9k|                if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  26.9k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 26.9k]
  |  |  ------------------
  |  |   35|  26.9k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  26.9k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1740|      0|                    printf("Post-residualmv[mv=y:%d,x:%d]: r=%d\n",
 1741|      0|                           b->mv[0].y, b->mv[0].x, ts->msac.rng);
 1742|  26.9k|            }
 1743|       |
 1744|       |            // interintra flags
 1745|  76.5k|            const int ii_sz_grp = dav1d_ymode_size_context[bs];
 1746|  76.5k|            if (f->seq_hdr->inter_intra &&
  ------------------
  |  Branch (1746:17): [True: 21.5k, False: 54.9k]
  ------------------
 1747|  21.5k|                interintra_allowed_mask & (1 << bs) &&
  ------------------
  |  Branch (1747:17): [True: 9.95k, False: 11.6k]
  ------------------
 1748|  9.95k|                dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  9.95k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  |  Branch (1748:17): [True: 1.82k, False: 8.12k]
  ------------------
 1749|  9.95k|                                             ts->cdf.m.interintra[ii_sz_grp]))
 1750|  1.82k|            {
 1751|  1.82k|                b->interintra_mode = dav1d_msac_decode_symbol_adapt4(&ts->msac,
  ------------------
  |  |   47|  1.82k|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  ------------------
 1752|  1.82k|                                         ts->cdf.m.interintra_mode[ii_sz_grp],
 1753|  1.82k|                                         N_INTER_INTRA_PRED_MODES - 1);
 1754|  1.82k|                const int wedge_ctx = dav1d_wedge_ctx_lut[bs];
 1755|  1.82k|                b->interintra_type = INTER_INTRA_BLEND +
 1756|  1.82k|                                     dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  1.82k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1757|  1.82k|                                         ts->cdf.m.interintra_wedge[wedge_ctx]);
 1758|  1.82k|                if (b->interintra_type == INTER_INTRA_WEDGE)
  ------------------
  |  Branch (1758:21): [True: 518, False: 1.31k]
  ------------------
 1759|    518|                    b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
  ------------------
  |  |   57|    518|#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
  ------------------
 1760|  1.82k|                                       ts->cdf.m.wedge_idx[wedge_ctx], 15);
 1761|  74.7k|            } else {
 1762|  74.7k|                b->interintra_type = INTER_INTRA_NONE;
 1763|  74.7k|            }
 1764|  76.5k|            if (DEBUG_BLOCK_INFO && f->seq_hdr->inter_intra &&
  ------------------
  |  |   34|  76.5k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 76.5k]
  |  |  ------------------
  |  |   35|  76.5k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  76.5k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  |  Branch (1764:37): [True: 0, False: 0]
  ------------------
 1765|      0|                interintra_allowed_mask & (1 << bs))
  ------------------
  |  Branch (1765:17): [True: 0, False: 0]
  ------------------
 1766|      0|            {
 1767|      0|                printf("Post-interintra[t=%d,m=%d,w=%d]: r=%d\n",
 1768|      0|                       b->interintra_type, b->interintra_mode,
 1769|      0|                       b->wedge_idx, ts->msac.rng);
 1770|      0|            }
 1771|       |
 1772|       |            // motion variation
 1773|  76.5k|            if (f->frame_hdr->switchable_motion_mode &&
  ------------------
  |  Branch (1773:17): [True: 61.6k, False: 14.8k]
  ------------------
 1774|  61.6k|                b->interintra_type == INTER_INTRA_NONE && imin(bw4, bh4) >= 2 &&
  ------------------
  |  Branch (1774:17): [True: 60.0k, False: 1.63k]
  |  Branch (1774:59): [True: 36.0k, False: 23.9k]
  ------------------
 1775|       |                // is not warped global motion
 1776|  36.0k|                !(!f->frame_hdr->force_integer_mv && b->inter_mode == GLOBALMV &&
  ------------------
  |  Branch (1776:19): [True: 33.8k, False: 2.18k]
  |  Branch (1776:54): [True: 5.52k, False: 28.3k]
  ------------------
 1777|  5.52k|                  f->frame_hdr->gmv[b->ref[0]].type > DAV1D_WM_TYPE_TRANSLATION) &&
  ------------------
  |  Branch (1777:19): [True: 1.00k, False: 4.52k]
  ------------------
 1778|       |                // has overlappable neighbours
 1779|  35.0k|                ((have_left && findoddzero(&t->l.intra[by4 + 1], h4 >> 1)) ||
  ------------------
  |  Branch (1779:19): [True: 29.7k, False: 5.33k]
  |  Branch (1779:32): [True: 28.7k, False: 1.02k]
  ------------------
 1780|  6.35k|                 (have_top && findoddzero(&t->a->intra[bx4 + 1], w4 >> 1))))
  ------------------
  |  Branch (1780:19): [True: 3.39k, False: 2.95k]
  |  Branch (1780:31): [True: 3.11k, False: 284]
  ------------------
 1781|  31.8k|            {
 1782|       |                // reaching here means the block allows obmc - check warp by
 1783|       |                // finding matching-ref blocks in top/left edges
 1784|  31.8k|                uint64_t mask[2] = { 0, 0 };
 1785|  31.8k|                find_matching_ref(t, intra_edge_flags, bw4, bh4, w4, h4,
 1786|  31.8k|                                  have_left, have_top, b->ref[0], mask);
 1787|  31.8k|                const int allow_warp = !f->svc[b->ref[0]][0].scale &&
  ------------------
  |  Branch (1787:40): [True: 31.3k, False: 518]
  ------------------
 1788|  31.3k|                    !f->frame_hdr->force_integer_mv &&
  ------------------
  |  Branch (1788:21): [True: 29.7k, False: 1.58k]
  ------------------
 1789|  29.7k|                    f->frame_hdr->warp_motion && (mask[0] | mask[1]);
  ------------------
  |  Branch (1789:21): [True: 15.7k, False: 14.0k]
  |  Branch (1789:50): [True: 13.9k, False: 1.74k]
  ------------------
 1790|       |
 1791|  31.8k|                b->motion_mode = allow_warp ?
  ------------------
  |  Branch (1791:34): [True: 13.9k, False: 17.8k]
  ------------------
 1792|  13.9k|                    dav1d_msac_decode_symbol_adapt4(&ts->msac,
  ------------------
  |  |   47|  13.9k|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  ------------------
 1793|  13.9k|                        ts->cdf.m.motion_mode[bs], 2) :
 1794|  31.8k|                    dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
  ------------------
  |  |   52|  17.8k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 1795|  31.8k|                if (b->motion_mode == MM_WARP) {
  ------------------
  |  Branch (1795:21): [True: 5.48k, False: 26.3k]
  ------------------
 1796|  5.48k|                    has_subpel_filter = 0;
 1797|  5.48k|                    derive_warpmv(t, bw4, bh4, mask, b->mv[0], &t->warpmv);
 1798|  5.48k|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
 1799|  5.48k|                    if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  5.48k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 5.48k]
  |  |  ------------------
  |  |   35|  5.48k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  5.48k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1800|      0|                        printf("[ %c%x %c%x %c%x\n  %c%x %c%x %c%x ]\n"
 1801|      0|                               "alpha=%c%x, beta=%c%x, gamma=%c%x, delta=%c%x, "
 1802|      0|                               "mv=y:%d,x:%d\n",
 1803|      0|                               signabs(t->warpmv.matrix[0]),
  ------------------
  |  | 1798|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (1798:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1804|      0|                               signabs(t->warpmv.matrix[1]),
  ------------------
  |  | 1798|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (1798:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1805|      0|                               signabs(t->warpmv.matrix[2]),
  ------------------
  |  | 1798|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (1798:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1806|      0|                               signabs(t->warpmv.matrix[3]),
  ------------------
  |  | 1798|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (1798:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1807|      0|                               signabs(t->warpmv.matrix[4]),
  ------------------
  |  | 1798|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (1798:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1808|      0|                               signabs(t->warpmv.matrix[5]),
  ------------------
  |  | 1798|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (1798:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1809|      0|                               signabs(t->warpmv.u.p.alpha),
  ------------------
  |  | 1798|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (1798:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1810|      0|                               signabs(t->warpmv.u.p.beta),
  ------------------
  |  | 1798|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (1798:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1811|      0|                               signabs(t->warpmv.u.p.gamma),
  ------------------
  |  | 1798|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (1798:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1812|      0|                               signabs(t->warpmv.u.p.delta),
  ------------------
  |  | 1798|      0|#define signabs(v) v < 0 ? '-' : ' ', abs(v)
  |  |  ------------------
  |  |  |  Branch (1798:20): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1813|      0|                               b->mv[0].y, b->mv[0].x);
 1814|  5.48k|#undef signabs
 1815|  5.48k|                    if (t->frame_thread.pass) {
  ------------------
  |  Branch (1815:25): [True: 0, False: 5.48k]
  ------------------
 1816|      0|                        if (t->warpmv.type == DAV1D_WM_TYPE_AFFINE) {
  ------------------
  |  Branch (1816:29): [True: 0, False: 0]
  ------------------
 1817|      0|                            b->matrix[0] = t->warpmv.matrix[2] - 0x10000;
 1818|      0|                            b->matrix[1] = t->warpmv.matrix[3];
 1819|      0|                            b->matrix[2] = t->warpmv.matrix[4];
 1820|      0|                            b->matrix[3] = t->warpmv.matrix[5] - 0x10000;
 1821|      0|                        } else {
 1822|      0|                            b->matrix[0] = INT16_MIN;
 1823|      0|                        }
 1824|      0|                    }
 1825|  5.48k|                }
 1826|       |
 1827|  31.8k|                if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  31.8k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 31.8k]
  |  |  ------------------
  |  |   35|  31.8k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  31.8k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1828|      0|                    printf("Post-motionmode[%d]: r=%d [mask: 0x%" PRIx64 "/0x%"
 1829|      0|                           PRIx64 "]\n", b->motion_mode, ts->msac.rng, mask[0],
 1830|      0|                            mask[1]);
 1831|  44.7k|            } else {
 1832|  44.7k|                b->motion_mode = MM_TRANSLATION;
 1833|  44.7k|            }
 1834|  76.5k|        }
 1835|       |
 1836|       |        // subpel filter
 1837|  91.6k|        enum Dav1dFilterMode filter[2];
 1838|  91.6k|        if (f->frame_hdr->subpel_filter_mode == DAV1D_FILTER_SWITCHABLE) {
  ------------------
  |  Branch (1838:13): [True: 34.8k, False: 56.8k]
  ------------------
 1839|  34.8k|            if (has_subpel_filter) {
  ------------------
  |  Branch (1839:17): [True: 33.0k, False: 1.79k]
  ------------------
 1840|  33.0k|                const int comp = b->comp_type != COMP_INTER_NONE;
 1841|  33.0k|                const int ctx1 = get_filter_ctx(t->a, &t->l, comp, 0, b->ref[0],
 1842|  33.0k|                                                by4, bx4);
 1843|  33.0k|                filter[0] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
  ------------------
  |  |   47|  33.0k|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  ------------------
 1844|  33.0k|                               ts->cdf.m.filter[0][ctx1],
 1845|  33.0k|                               DAV1D_N_SWITCHABLE_FILTERS - 1);
 1846|  33.0k|                if (f->seq_hdr->dual_filter) {
  ------------------
  |  Branch (1846:21): [True: 23.4k, False: 9.60k]
  ------------------
 1847|  23.4k|                    const int ctx2 = get_filter_ctx(t->a, &t->l, comp, 1,
 1848|  23.4k|                                                    b->ref[0], by4, bx4);
 1849|  23.4k|                    if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  23.4k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 23.4k]
  |  |  ------------------
  |  |   35|  23.4k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  23.4k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1850|      0|                        printf("Post-subpel_filter1[%d,ctx=%d]: r=%d\n",
 1851|      0|                               filter[0], ctx1, ts->msac.rng);
 1852|  23.4k|                    filter[1] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
  ------------------
  |  |   47|  23.4k|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  ------------------
 1853|  23.4k|                                    ts->cdf.m.filter[1][ctx2],
 1854|  23.4k|                                    DAV1D_N_SWITCHABLE_FILTERS - 1);
 1855|  23.4k|                    if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  23.4k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 23.4k]
  |  |  ------------------
  |  |   35|  23.4k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  23.4k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1856|      0|                        printf("Post-subpel_filter2[%d,ctx=%d]: r=%d\n",
 1857|      0|                               filter[1], ctx2, ts->msac.rng);
 1858|  23.4k|                } else {
 1859|  9.60k|                    filter[1] = filter[0];
 1860|  9.60k|                    if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  9.60k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 9.60k]
  |  |  ------------------
  |  |   35|  9.60k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  9.60k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1861|      0|                        printf("Post-subpel_filter[%d,ctx=%d]: r=%d\n",
 1862|      0|                               filter[0], ctx1, ts->msac.rng);
 1863|  9.60k|                }
 1864|  33.0k|            } else {
 1865|  1.79k|                filter[0] = filter[1] = DAV1D_FILTER_8TAP_REGULAR;
 1866|  1.79k|            }
 1867|  56.8k|        } else {
 1868|  56.8k|            filter[0] = filter[1] = f->frame_hdr->subpel_filter_mode;
 1869|  56.8k|        }
 1870|  91.6k|        b->filter2d = dav1d_filter_2d[filter[1]][filter[0]];
 1871|       |
 1872|  91.6k|        read_vartx_tree(t, b, bs, bx4, by4);
 1873|       |
 1874|       |        // reconstruction
 1875|  91.6k|        if (t->frame_thread.pass == 1) {
  ------------------
  |  Branch (1875:13): [True: 0, False: 91.6k]
  ------------------
 1876|      0|            f->bd_fn.read_coef_blocks(t, bs, b);
 1877|  91.6k|        } else {
 1878|  91.6k|            if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
  ------------------
  |  Branch (1878:17): [True: 0, False: 91.6k]
  ------------------
 1879|  91.6k|        }
 1880|       |
 1881|  91.6k|        if (f->frame_hdr->loopfilter.level_y[0] ||
  ------------------
  |  Branch (1881:13): [True: 24.9k, False: 66.6k]
  ------------------
 1882|  66.6k|            f->frame_hdr->loopfilter.level_y[1])
  ------------------
  |  Branch (1882:13): [True: 8.71k, False: 57.9k]
  ------------------
 1883|  27.8k|        {
 1884|  27.8k|            const int is_globalmv =
 1885|  27.8k|                b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
  ------------------
  |  Branch (1885:35): [True: 3.96k, False: 23.8k]
  ------------------
 1886|  27.8k|            const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
 1887|  27.8k|                &ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
 1888|  27.8k|            const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
 1889|  27.8k|            enum RectTxfmSize ytx = b->max_ytx, uvtx = b->uvtx;
 1890|  27.8k|            if (f->frame_hdr->segmentation.lossless[b->seg_id]) {
  ------------------
  |  Branch (1890:17): [True: 168, False: 27.6k]
  ------------------
 1891|    168|                ytx  = (enum RectTxfmSize) TX_4X4;
 1892|    168|                uvtx = (enum RectTxfmSize) TX_4X4;
 1893|    168|            }
 1894|  27.8k|            dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride, lf_lvls,
 1895|  27.8k|                                       t->bx, t->by, f->w4, f->h4, b->skip, bs,
 1896|  27.8k|                                       ytx, tx_split, uvtx, f->cur.p.layout,
 1897|  27.8k|                                       &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
 1898|  27.8k|                                       has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
  ------------------
  |  Branch (1898:40): [True: 22.9k, False: 4.87k]
  ------------------
 1899|  27.8k|                                       has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
  ------------------
  |  Branch (1899:40): [True: 22.9k, False: 4.87k]
  ------------------
 1900|  27.8k|        }
 1901|       |
 1902|       |        // context updates
 1903|  91.6k|        if (is_comp)
  ------------------
  |  Branch (1903:13): [True: 15.1k, False: 76.5k]
  ------------------
 1904|  15.1k|            splat_tworef_mv(f->c, t, bs, b, bw4, bh4);
 1905|  76.5k|        else
 1906|  76.5k|            splat_oneref_mv(f->c, t, bs, b, bw4, bh4);
 1907|  91.6k|        BlockContext *edge = t->a;
 1908|   263k|        for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) {
  ------------------
  |  Branch (1908:36): [True: 171k, False: 91.7k]
  ------------------
 1909|   171k|#define set_ctx(rep_macro) \
 1910|   171k|            rep_macro(edge->seg_pred, off, seg_pred); \
 1911|   171k|            rep_macro(edge->skip_mode, off, b->skip_mode); \
 1912|   171k|            rep_macro(edge->intra, off, 0); \
 1913|   171k|            rep_macro(edge->skip, off, b->skip); \
 1914|   171k|            rep_macro(edge->pal_sz, off, 0); \
 1915|       |            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
 1916|   171k|            rep_macro(t->pal_sz_uv[i], off, 0); \
 1917|   171k|            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
 1918|   171k|            rep_macro(edge->comp_type, off, b->comp_type); \
 1919|   171k|            rep_macro(edge->filter[0], off, filter[0]); \
 1920|   171k|            rep_macro(edge->filter[1], off, filter[1]); \
 1921|   171k|            rep_macro(edge->mode, off, b->inter_mode); \
 1922|   171k|            rep_macro(edge->ref[0], off, b->ref[0]); \
 1923|   171k|            rep_macro(edge->ref[1], off, ((uint8_t) b->ref[1]))
 1924|   171k|            case_set(b_dim[2 + i]);
  ------------------
  |  |   70|   171k|    switch (var) { \
  |  |   71|  37.6k|    case 0: set_ctx(set_ctx1); break; \
  |  |  ------------------
  |  |  |  | 1910|  37.6k|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  37.6k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  37.6k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1911|  37.6k|            rep_macro(edge->skip_mode, off, b->skip_mode); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  37.6k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  37.6k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1912|  37.6k|            rep_macro(edge->intra, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  37.6k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  37.6k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1913|  37.6k|            rep_macro(edge->skip, off, b->skip); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  37.6k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  37.6k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1914|  37.6k|            rep_macro(edge->pal_sz, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  37.6k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  37.6k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1915|  37.6k|            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
  |  |  |  | 1916|  37.6k|            rep_macro(t->pal_sz_uv[i], off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  37.6k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  37.6k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1917|  37.6k|            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  37.6k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  37.6k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1918|  37.6k|            rep_macro(edge->comp_type, off, b->comp_type); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  37.6k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  37.6k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1919|  37.6k|            rep_macro(edge->filter[0], off, filter[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  37.6k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  37.6k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1920|  37.6k|            rep_macro(edge->filter[1], off, filter[1]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  37.6k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  37.6k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1921|  37.6k|            rep_macro(edge->mode, off, b->inter_mode); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  37.6k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  37.6k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1922|  37.6k|            rep_macro(edge->ref[0], off, b->ref[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  37.6k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  37.6k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1923|  37.6k|            rep_macro(edge->ref[1], off, ((uint8_t) b->ref[1]))
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|  37.6k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  37.6k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (71:5): [True: 37.6k, False: 133k]
  |  |  ------------------
  |  |   72|  64.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  ------------------
  |  |  |  | 1910|  64.7k|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  64.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  64.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1911|  64.7k|            rep_macro(edge->skip_mode, off, b->skip_mode); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  64.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  64.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1912|  64.7k|            rep_macro(edge->intra, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  64.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  64.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1913|  64.7k|            rep_macro(edge->skip, off, b->skip); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  64.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  64.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1914|  64.7k|            rep_macro(edge->pal_sz, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  64.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  64.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1915|  64.7k|            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
  |  |  |  | 1916|  64.7k|            rep_macro(t->pal_sz_uv[i], off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  64.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  64.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1917|  64.7k|            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  64.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  64.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1918|  64.7k|            rep_macro(edge->comp_type, off, b->comp_type); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  64.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  64.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1919|  64.7k|            rep_macro(edge->filter[0], off, filter[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  64.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  64.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1920|  64.7k|            rep_macro(edge->filter[1], off, filter[1]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  64.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  64.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1921|  64.7k|            rep_macro(edge->mode, off, b->inter_mode); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  64.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  64.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1922|  64.7k|            rep_macro(edge->ref[0], off, b->ref[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  64.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  64.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1923|  64.7k|            rep_macro(edge->ref[1], off, ((uint8_t) b->ref[1]))
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|  64.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  64.7k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (72:5): [True: 64.7k, False: 106k]
  |  |  ------------------
  |  |   73|  41.9k|    case 2: set_ctx(set_ctx4); break; \
  |  |  ------------------
  |  |  |  | 1910|  41.9k|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  41.9k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  41.9k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1911|  41.9k|            rep_macro(edge->skip_mode, off, b->skip_mode); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  41.9k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  41.9k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1912|  41.9k|            rep_macro(edge->intra, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  41.9k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  41.9k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1913|  41.9k|            rep_macro(edge->skip, off, b->skip); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  41.9k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  41.9k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1914|  41.9k|            rep_macro(edge->pal_sz, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  41.9k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  41.9k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1915|  41.9k|            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
  |  |  |  | 1916|  41.9k|            rep_macro(t->pal_sz_uv[i], off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  41.9k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  41.9k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1917|  41.9k|            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  41.9k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  41.9k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1918|  41.9k|            rep_macro(edge->comp_type, off, b->comp_type); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  41.9k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  41.9k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1919|  41.9k|            rep_macro(edge->filter[0], off, filter[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  41.9k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  41.9k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1920|  41.9k|            rep_macro(edge->filter[1], off, filter[1]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  41.9k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  41.9k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1921|  41.9k|            rep_macro(edge->mode, off, b->inter_mode); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  41.9k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  41.9k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1922|  41.9k|            rep_macro(edge->ref[0], off, b->ref[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  41.9k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  41.9k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1923|  41.9k|            rep_macro(edge->ref[1], off, ((uint8_t) b->ref[1]))
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|  41.9k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  41.9k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (73:5): [True: 41.9k, False: 129k]
  |  |  ------------------
  |  |   74|  21.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  ------------------
  |  |  |  | 1910|  21.1k|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  21.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  21.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1911|  21.1k|            rep_macro(edge->skip_mode, off, b->skip_mode); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  21.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  21.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1912|  21.1k|            rep_macro(edge->intra, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  21.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  21.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1913|  21.1k|            rep_macro(edge->skip, off, b->skip); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  21.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  21.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1914|  21.1k|            rep_macro(edge->pal_sz, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  21.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  21.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1915|  21.1k|            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
  |  |  |  | 1916|  21.1k|            rep_macro(t->pal_sz_uv[i], off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  21.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  21.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1917|  21.1k|            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  21.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  21.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1918|  21.1k|            rep_macro(edge->comp_type, off, b->comp_type); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  21.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  21.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1919|  21.1k|            rep_macro(edge->filter[0], off, filter[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  21.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  21.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1920|  21.1k|            rep_macro(edge->filter[1], off, filter[1]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  21.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  21.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1921|  21.1k|            rep_macro(edge->mode, off, b->inter_mode); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  21.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  21.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1922|  21.1k|            rep_macro(edge->ref[0], off, b->ref[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  21.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  21.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1923|  21.1k|            rep_macro(edge->ref[1], off, ((uint8_t) b->ref[1]))
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|  21.1k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  21.1k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (74:5): [True: 21.1k, False: 150k]
  |  |  ------------------
  |  |   75|  5.65k|    case 4: set_ctx(set_ctx16); break; \
  |  |  ------------------
  |  |  |  | 1910|  5.65k|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  5.65k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  5.65k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  5.65k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  5.65k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 5.65k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1911|  5.65k|            rep_macro(edge->skip_mode, off, b->skip_mode); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  5.65k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  5.65k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  5.65k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  5.65k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 5.65k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1912|  5.65k|            rep_macro(edge->intra, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  5.65k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  5.65k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  5.65k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  5.65k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 5.65k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1913|  5.65k|            rep_macro(edge->skip, off, b->skip); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  5.65k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  5.65k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  5.65k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  5.65k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 5.65k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1914|  5.65k|            rep_macro(edge->pal_sz, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  5.65k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  5.65k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  5.65k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  5.65k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 5.65k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1915|  5.65k|            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
  |  |  |  | 1916|  5.65k|            rep_macro(t->pal_sz_uv[i], off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  5.65k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  5.65k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  5.65k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  5.65k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 5.65k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1917|  5.65k|            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  5.65k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  5.65k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  5.65k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  5.65k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 5.65k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1918|  5.65k|            rep_macro(edge->comp_type, off, b->comp_type); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  5.65k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  5.65k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  5.65k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  5.65k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 5.65k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1919|  5.65k|            rep_macro(edge->filter[0], off, filter[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  5.65k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  5.65k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  5.65k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  5.65k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 5.65k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1920|  5.65k|            rep_macro(edge->filter[1], off, filter[1]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  5.65k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  5.65k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  5.65k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  5.65k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 5.65k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1921|  5.65k|            rep_macro(edge->mode, off, b->inter_mode); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  5.65k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  5.65k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  5.65k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  5.65k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 5.65k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1922|  5.65k|            rep_macro(edge->ref[0], off, b->ref[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  5.65k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  5.65k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  5.65k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  5.65k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 5.65k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1923|  5.65k|            rep_macro(edge->ref[1], off, ((uint8_t) b->ref[1]))
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|  5.65k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  5.65k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  5.65k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  5.65k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 5.65k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (75:5): [True: 5.65k, False: 165k]
  |  |  ------------------
  |  |   76|    396|    case 5: set_ctx(set_ctx32); break; \
  |  |  ------------------
  |  |  |  | 1910|    396|            rep_macro(edge->seg_pred, off, seg_pred); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    396|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    396|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    396|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    396|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 396]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1911|    396|            rep_macro(edge->skip_mode, off, b->skip_mode); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    396|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    396|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    396|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    396|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 396]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1912|    396|            rep_macro(edge->intra, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    396|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    396|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    396|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    396|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 396]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1913|    396|            rep_macro(edge->skip, off, b->skip); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    396|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    396|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    396|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    396|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 396]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1914|    396|            rep_macro(edge->pal_sz, off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    396|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    396|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    396|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    396|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 396]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1915|    396|            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
  |  |  |  | 1916|    396|            rep_macro(t->pal_sz_uv[i], off, 0); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    396|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    396|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    396|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    396|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 396]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1917|    396|            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    396|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    396|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    396|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    396|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 396]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1918|    396|            rep_macro(edge->comp_type, off, b->comp_type); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    396|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    396|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    396|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    396|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 396]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1919|    396|            rep_macro(edge->filter[0], off, filter[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    396|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    396|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    396|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    396|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 396]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1920|    396|            rep_macro(edge->filter[1], off, filter[1]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    396|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    396|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    396|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    396|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 396]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1921|    396|            rep_macro(edge->mode, off, b->inter_mode); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    396|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    396|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    396|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    396|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 396]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1922|    396|            rep_macro(edge->ref[0], off, b->ref[0]); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    396|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    396|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    396|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    396|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 396]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1923|    396|            rep_macro(edge->ref[1], off, ((uint8_t) b->ref[1]))
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|    396|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|    396|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|    396|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|    396|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 396]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (76:5): [True: 396, False: 170k]
  |  |  ------------------
  |  |   77|      0|    default: assert(0); \
  |  |  ------------------
  |  |  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, Folded]
  |  |  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (77:5): [True: 0, False: 171k]
  |  |  ------------------
  |  |   78|   171k|    }
  ------------------
 1925|   171k|#undef set_ctx
 1926|   171k|        }
 1927|  91.7k|        if (has_chroma) {
  ------------------
  |  Branch (1927:13): [True: 69.8k, False: 21.8k]
  ------------------
 1928|  69.8k|            dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], DC_PRED);
 1929|  69.8k|            dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], DC_PRED);
 1930|  69.8k|        }
 1931|  91.7k|    }
 1932|       |
 1933|       |    // update contexts
 1934|  2.99M|    if (f->frame_hdr->segmentation.enabled &&
  ------------------
  |  Branch (1934:9): [True: 1.15M, False: 1.83M]
  ------------------
 1935|  1.15M|        f->frame_hdr->segmentation.update_map)
  ------------------
  |  Branch (1935:9): [True: 1.14M, False: 12.9k]
  ------------------
 1936|  1.14M|    {
 1937|  1.14M|        uint8_t *seg_ptr = &f->cur_segmap[t->by * f->b4_stride + t->bx];
 1938|  1.14M|#define set_ctx(rep_macro) \
 1939|  1.14M|        for (int y = 0; y < bh4; y++) { \
 1940|  1.14M|            rep_macro(seg_ptr, 0, b->seg_id); \
 1941|  1.14M|            seg_ptr += f->b4_stride; \
 1942|  1.14M|        }
 1943|  1.14M|        case_set(b_dim[2]);
  ------------------
  |  |   70|  1.14M|    switch (var) { \
  |  |   71|   511k|    case 0: set_ctx(set_ctx1); break; \
  |  |  ------------------
  |  |  |  | 1939|  1.09M|        for (int y = 0; y < bh4; y++) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (1939:25): [True: 586k, False: 511k]
  |  |  |  |  ------------------
  |  |  |  | 1940|   586k|            rep_macro(seg_ptr, 0, b->seg_id); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   71|   586k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|   586k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1941|   586k|            seg_ptr += f->b4_stride; \
  |  |  |  | 1942|   586k|        }
  |  |  ------------------
  |  |  |  Branch (71:5): [True: 511k, False: 632k]
  |  |  ------------------
  |  |   72|   302k|    case 1: set_ctx(set_ctx2); break; \
  |  |  ------------------
  |  |  |  | 1939|  1.25M|        for (int y = 0; y < bh4; y++) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (1939:25): [True: 947k, False: 302k]
  |  |  |  |  ------------------
  |  |  |  | 1940|   947k|            rep_macro(seg_ptr, 0, b->seg_id); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   72|   947k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|   947k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1941|   947k|            seg_ptr += f->b4_stride; \
  |  |  |  | 1942|   947k|        }
  |  |  ------------------
  |  |  |  Branch (72:5): [True: 302k, False: 841k]
  |  |  ------------------
  |  |   73|   176k|    case 2: set_ctx(set_ctx4); break; \
  |  |  ------------------
  |  |  |  | 1939|  1.03M|        for (int y = 0; y < bh4; y++) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (1939:25): [True: 854k, False: 176k]
  |  |  |  |  ------------------
  |  |  |  | 1940|   854k|            rep_macro(seg_ptr, 0, b->seg_id); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   73|   854k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|   854k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1941|   854k|            seg_ptr += f->b4_stride; \
  |  |  |  | 1942|   854k|        }
  |  |  ------------------
  |  |  |  Branch (73:5): [True: 176k, False: 966k]
  |  |  ------------------
  |  |   74|   107k|    case 3: set_ctx(set_ctx8); break; \
  |  |  ------------------
  |  |  |  | 1939|  1.10M|        for (int y = 0; y < bh4; y++) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (1939:25): [True: 999k, False: 107k]
  |  |  |  |  ------------------
  |  |  |  | 1940|   999k|            rep_macro(seg_ptr, 0, b->seg_id); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   74|   999k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|   999k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1941|   999k|            seg_ptr += f->b4_stride; \
  |  |  |  | 1942|   999k|        }
  |  |  ------------------
  |  |  |  Branch (74:5): [True: 107k, False: 1.03M]
  |  |  ------------------
  |  |   75|  40.2k|    case 4: set_ctx(set_ctx16); break; \
  |  |  ------------------
  |  |  |  | 1939|   649k|        for (int y = 0; y < bh4; y++) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (1939:25): [True: 609k, False: 40.2k]
  |  |  |  |  ------------------
  |  |  |  | 1940|   609k|            rep_macro(seg_ptr, 0, b->seg_id); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   75|   609k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|   609k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|   609k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|   609k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 609k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1941|   609k|            seg_ptr += f->b4_stride; \
  |  |  |  | 1942|   609k|        }
  |  |  ------------------
  |  |  |  Branch (75:5): [True: 40.2k, False: 1.10M]
  |  |  ------------------
  |  |   76|  7.32k|    case 5: set_ctx(set_ctx32); break; \
  |  |  ------------------
  |  |  |  | 1939|   137k|        for (int y = 0; y < bh4; y++) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (1939:25): [True: 130k, False: 7.32k]
  |  |  |  |  ------------------
  |  |  |  | 1940|   130k|            rep_macro(seg_ptr, 0, b->seg_id); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   76|   130k|    case 5: set_ctx(set_ctx32); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   66|   130k|#define set_ctx32(var, off, val) do { \
  |  |  |  |  |  |  |  |   67|   130k|        memset(&(var)[off], val, 32); \
  |  |  |  |  |  |  |  |   68|   130k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (68:14): [Folded, False: 130k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  | 1941|   130k|            seg_ptr += f->b4_stride; \
  |  |  |  | 1942|   130k|        }
  |  |  ------------------
  |  |  |  Branch (76:5): [True: 7.32k, False: 1.13M]
  |  |  ------------------
  |  |   77|      0|    default: assert(0); \
  |  |  ------------------
  |  |  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, Folded]
  |  |  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (77:5): [True: 0, False: 1.14M]
  |  |  ------------------
  |  |   78|  1.14M|    }
  ------------------
 1944|  1.14M|#undef set_ctx
 1945|  1.14M|    }
 1946|  2.99M|    if (!b->skip) {
  ------------------
  |  Branch (1946:9): [True: 851k, False: 2.13M]
  ------------------
 1947|   851k|        uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4 >> 1];
 1948|   851k|        const unsigned mask = (~0U >> (32 - bw4)) << (bx4 & 15);
 1949|   851k|        const int bx_idx = (bx4 & 16) >> 4;
 1950|  2.78M|        for (int y = 0; y < bh4; y += 2, noskip_mask++) {
  ------------------
  |  Branch (1950:25): [True: 1.93M, False: 851k]
  ------------------
 1951|  1.93M|            (*noskip_mask)[bx_idx] |= mask;
 1952|  1.93M|            if (bw4 == 32) // this should be mask >> 16, but it's 0xffffffff anyway
  ------------------
  |  Branch (1952:17): [True: 132k, False: 1.79M]
  ------------------
 1953|   132k|                (*noskip_mask)[1] |= mask;
 1954|  1.93M|        }
 1955|   851k|    }
 1956|       |
 1957|  2.99M|    if (t->frame_thread.pass == 1 && !b->intra && IS_INTER_OR_SWITCH(f->frame_hdr)) {
  ------------------
  |  |   36|      0|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  |  Branch (1957:9): [True: 0, False: 2.99M]
  |  Branch (1957:38): [True: 0, False: 0]
  ------------------
 1958|      0|        const int sby = (t->by - ts->tiling.row_start) >> f->sb_shift;
 1959|      0|        int (*const lowest_px)[2] = ts->lowest_pixel[sby];
 1960|       |
 1961|       |        // keep track of motion vectors for each reference
 1962|      0|        if (b->comp_type == COMP_INTER_NONE) {
  ------------------
  |  Branch (1962:13): [True: 0, False: 0]
  ------------------
 1963|       |            // y
 1964|      0|            if (imin(bw4, bh4) > 1 &&
  ------------------
  |  Branch (1964:17): [True: 0, False: 0]
  ------------------
 1965|      0|                ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
  ------------------
  |  Branch (1965:19): [True: 0, False: 0]
  |  Branch (1965:48): [True: 0, False: 0]
  ------------------
 1966|      0|                 (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
  ------------------
  |  Branch (1966:19): [True: 0, False: 0]
  |  Branch (1966:48): [True: 0, False: 0]
  ------------------
 1967|      0|            {
 1968|      0|                affine_lowest_px_luma(t, &lowest_px[b->ref[0]][0], b_dim,
 1969|      0|                                      b->motion_mode == MM_WARP ? &t->warpmv :
  ------------------
  |  Branch (1969:39): [True: 0, False: 0]
  ------------------
 1970|      0|                                      &f->frame_hdr->gmv[b->ref[0]]);
 1971|      0|            } else {
 1972|      0|                mc_lowest_px(&lowest_px[b->ref[0]][0], t->by, bh4, b->mv[0].y,
 1973|      0|                             0, &f->svc[b->ref[0]][1]);
 1974|      0|                if (b->motion_mode == MM_OBMC) {
  ------------------
  |  Branch (1974:21): [True: 0, False: 0]
  ------------------
 1975|      0|                    obmc_lowest_px(t, lowest_px, 0, b_dim, bx4, by4, w4, h4);
 1976|      0|                }
 1977|      0|            }
 1978|       |
 1979|       |            // uv
 1980|      0|            if (has_chroma) {
  ------------------
  |  Branch (1980:17): [True: 0, False: 0]
  ------------------
 1981|       |                // sub8x8 derivation
 1982|      0|                int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
  ------------------
  |  Branch (1982:33): [True: 0, False: 0]
  |  Branch (1982:50): [True: 0, False: 0]
  ------------------
 1983|      0|                refmvs_block *const *r;
 1984|      0|                if (is_sub8x8) {
  ------------------
  |  Branch (1984:21): [True: 0, False: 0]
  ------------------
 1985|      0|                    assert(ss_hor == 1);
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1986|      0|                    r = &t->rt.r[(t->by & 31) + 5];
 1987|      0|                    if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
  ------------------
  |  Branch (1987:25): [True: 0, False: 0]
  ------------------
 1988|      0|                    if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
  ------------------
  |  Branch (1988:25): [True: 0, False: 0]
  ------------------
 1989|      0|                    if (bw4 == 1 && bh4 == ss_ver)
  ------------------
  |  Branch (1989:25): [True: 0, False: 0]
  |  Branch (1989:37): [True: 0, False: 0]
  ------------------
 1990|      0|                        is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
 1991|      0|                }
 1992|       |
 1993|       |                // chroma prediction
 1994|      0|                if (is_sub8x8) {
  ------------------
  |  Branch (1994:21): [True: 0, False: 0]
  ------------------
 1995|      0|                    assert(ss_hor == 1);
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1996|      0|                    if (bw4 == 1 && bh4 == ss_ver) {
  ------------------
  |  Branch (1996:25): [True: 0, False: 0]
  |  Branch (1996:37): [True: 0, False: 0]
  ------------------
 1997|      0|                        const refmvs_block *const rr = &r[-1][t->bx - 1];
 1998|      0|                        mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1],
 1999|      0|                                     t->by - 1, bh4, rr->mv.mv[0].y, ss_ver,
 2000|      0|                                     &f->svc[rr->ref.ref[0] - 1][1]);
 2001|      0|                    }
 2002|      0|                    if (bw4 == 1) {
  ------------------
  |  Branch (2002:25): [True: 0, False: 0]
  ------------------
 2003|      0|                        const refmvs_block *const rr = &r[0][t->bx - 1];
 2004|      0|                        mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1],
 2005|      0|                                     t->by, bh4, rr->mv.mv[0].y, ss_ver,
 2006|      0|                                     &f->svc[rr->ref.ref[0] - 1][1]);
 2007|      0|                    }
 2008|      0|                    if (bh4 == ss_ver) {
  ------------------
  |  Branch (2008:25): [True: 0, False: 0]
  ------------------
 2009|      0|                        const refmvs_block *const rr = &r[-1][t->bx];
 2010|      0|                        mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1],
 2011|      0|                                     t->by - 1, bh4, rr->mv.mv[0].y, ss_ver,
 2012|      0|                                     &f->svc[rr->ref.ref[0] - 1][1]);
 2013|      0|                    }
 2014|      0|                    mc_lowest_px(&lowest_px[b->ref[0]][1], t->by, bh4,
 2015|      0|                                 b->mv[0].y, ss_ver, &f->svc[b->ref[0]][1]);
 2016|      0|                } else {
 2017|      0|                    if (imin(cbw4, cbh4) > 1 &&
  ------------------
  |  Branch (2017:25): [True: 0, False: 0]
  ------------------
 2018|      0|                        ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
  ------------------
  |  Branch (2018:27): [True: 0, False: 0]
  |  Branch (2018:56): [True: 0, False: 0]
  ------------------
 2019|      0|                         (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
  ------------------
  |  Branch (2019:27): [True: 0, False: 0]
  |  Branch (2019:56): [True: 0, False: 0]
  ------------------
 2020|      0|                    {
 2021|      0|                        affine_lowest_px_chroma(t, &lowest_px[b->ref[0]][1], b_dim,
 2022|      0|                                                b->motion_mode == MM_WARP ? &t->warpmv :
  ------------------
  |  Branch (2022:49): [True: 0, False: 0]
  ------------------
 2023|      0|                                                &f->frame_hdr->gmv[b->ref[0]]);
 2024|      0|                    } else {
 2025|      0|                        mc_lowest_px(&lowest_px[b->ref[0]][1],
 2026|      0|                                     t->by & ~ss_ver, bh4 << (bh4 == ss_ver),
 2027|      0|                                     b->mv[0].y, ss_ver, &f->svc[b->ref[0]][1]);
 2028|      0|                        if (b->motion_mode == MM_OBMC) {
  ------------------
  |  Branch (2028:29): [True: 0, False: 0]
  ------------------
 2029|      0|                            obmc_lowest_px(t, lowest_px, 1, b_dim, bx4, by4, w4, h4);
 2030|      0|                        }
 2031|      0|                    }
 2032|      0|                }
 2033|      0|            }
 2034|      0|        } else {
 2035|       |            // y
 2036|      0|            for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (2036:29): [True: 0, False: 0]
  ------------------
 2037|      0|                if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
  ------------------
  |  Branch (2037:21): [True: 0, False: 0]
  |  Branch (2037:59): [True: 0, False: 0]
  ------------------
 2038|      0|                    affine_lowest_px_luma(t, &lowest_px[b->ref[i]][0], b_dim,
 2039|      0|                                          &f->frame_hdr->gmv[b->ref[i]]);
 2040|      0|                } else {
 2041|      0|                    mc_lowest_px(&lowest_px[b->ref[i]][0], t->by, bh4,
 2042|      0|                                 b->mv[i].y, 0, &f->svc[b->ref[i]][1]);
 2043|      0|                }
 2044|      0|            }
 2045|       |
 2046|       |            // uv
 2047|      0|            if (has_chroma) for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (2047:17): [True: 0, False: 0]
  |  Branch (2047:45): [True: 0, False: 0]
  ------------------
 2048|      0|                if (b->inter_mode == GLOBALMV_GLOBALMV &&
  ------------------
  |  Branch (2048:21): [True: 0, False: 0]
  ------------------
 2049|      0|                    imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
  ------------------
  |  Branch (2049:21): [True: 0, False: 0]
  |  Branch (2049:45): [True: 0, False: 0]
  ------------------
 2050|      0|                {
 2051|      0|                    affine_lowest_px_chroma(t, &lowest_px[b->ref[i]][1], b_dim,
 2052|      0|                                            &f->frame_hdr->gmv[b->ref[i]]);
 2053|      0|                } else {
 2054|      0|                    mc_lowest_px(&lowest_px[b->ref[i]][1], t->by, bh4,
 2055|      0|                                 b->mv[i].y, ss_ver, &f->svc[b->ref[i]][1]);
 2056|      0|                }
 2057|      0|            }
 2058|      0|        }
 2059|      0|    }
 2060|       |
 2061|  2.99M|    return 0;
 2062|  2.99M|}
decode.c:get_prev_frame_segid:
  499|     62|{
  500|     62|    assert(f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
  ------------------
  |  |  140|     62|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 62]
  |  |  |  Branch (140:68): [Folded, False: 62]
  |  |  ------------------
  ------------------
  501|       |
  502|     62|    unsigned seg_id = 8;
  503|     62|    ref_seg_map += by * stride + bx;
  504|     79|    do {
  505|    311|        for (int x = 0; x < w4; x++)
  ------------------
  |  Branch (505:25): [True: 232, False: 79]
  ------------------
  506|    232|            seg_id = imin(seg_id, ref_seg_map[x]);
  507|     79|        ref_seg_map += stride;
  508|     79|    } while (--h4 > 0 && seg_id);
  ------------------
  |  Branch (508:14): [True: 47, False: 32]
  |  Branch (508:26): [True: 17, False: 30]
  ------------------
  509|     62|    assert(seg_id < 8);
  ------------------
  |  |  140|     62|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 62]
  |  |  |  Branch (140:68): [Folded, False: 62]
  |  |  ------------------
  ------------------
  510|       |
  511|     62|    return seg_id;
  512|     62|}
decode.c:neg_deinterleave:
  169|  1.10M|static int neg_deinterleave(int diff, int ref, int max) {
  170|  1.10M|    if (!ref) return diff;
  ------------------
  |  Branch (170:9): [True: 444k, False: 659k]
  ------------------
  171|   659k|    if (ref >= (max - 1)) return max - diff - 1;
  ------------------
  |  Branch (171:9): [True: 97.2k, False: 562k]
  ------------------
  172|   562k|    if (2 * ref < max) {
  ------------------
  |  Branch (172:9): [True: 341k, False: 220k]
  ------------------
  173|   341k|        if (diff <= 2 * ref) {
  ------------------
  |  Branch (173:13): [True: 271k, False: 70.4k]
  ------------------
  174|   271k|            if (diff & 1)
  ------------------
  |  Branch (174:17): [True: 41.3k, False: 229k]
  ------------------
  175|  41.3k|                return ref + ((diff + 1) >> 1);
  176|   229k|            else
  177|   229k|                return ref - (diff >> 1);
  178|   271k|        }
  179|  70.4k|        return diff;
  180|   341k|    } else {
  181|   220k|        if (diff <= 2 * (max - ref - 1)) {
  ------------------
  |  Branch (181:13): [True: 192k, False: 27.8k]
  ------------------
  182|   192k|            if (diff & 1)
  ------------------
  |  Branch (182:17): [True: 28.8k, False: 163k]
  ------------------
  183|  28.8k|                return ref + ((diff + 1) >> 1);
  184|   163k|            else
  185|   163k|                return ref - (diff >> 1);
  186|   192k|        }
  187|  27.8k|        return max - (diff + 1);
  188|   220k|    }
  189|   562k|}
decode.c:read_pal_indices:
  419|  31.2k|{
  420|  31.2k|    Dav1dTileState *const ts = t->ts;
  421|  31.2k|    const ptrdiff_t stride = bw4 * 4;
  422|  31.2k|    assert(pal_idx);
  ------------------
  |  |  140|  31.2k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 31.2k]
  |  |  |  Branch (140:68): [Folded, False: 31.2k]
  |  |  ------------------
  ------------------
  423|  31.2k|    uint8_t *const pal_tmp = t->scratch.pal_idx_uv;
  424|  31.2k|    pal_tmp[0] = dav1d_msac_decode_uniform(&ts->msac, pal_sz);
  425|  31.2k|    uint16_t (*const color_map_cdf)[8] =
  426|  31.2k|        ts->cdf.m.color_map[pl][pal_sz - 2];
  427|  31.2k|    uint8_t (*const order)[8] = t->scratch.pal_order;
  428|  31.2k|    uint8_t *const ctx = t->scratch.pal_ctx;
  429|   818k|    for (int i = 1; i < 4 * (w4 + h4) - 1; i++) {
  ------------------
  |  Branch (429:21): [True: 786k, False: 31.2k]
  ------------------
  430|       |        // top/left-to-bottom/right diagonals ("wave-front")
  431|   786k|        const int first = imin(i, w4 * 4 - 1);
  432|   786k|        const int last = imax(0, i - h4 * 4 + 1);
  433|   786k|        order_palette(pal_tmp, stride, i, first, last, order, ctx);
  434|  6.54M|        for (int j = first, m = 0; j >= last; j--, m++) {
  ------------------
  |  Branch (434:36): [True: 5.75M, False: 786k]
  ------------------
  435|  5.75M|            const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
  ------------------
  |  |   48|  5.75M|#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
  ------------------
  436|  5.75M|                                      color_map_cdf[ctx[m]], pal_sz - 1);
  437|  5.75M|            pal_tmp[(i - j) * stride + j] = order[m][color_idx];
  438|  5.75M|        }
  439|   786k|    }
  440|       |
  441|  31.2k|    t->c->pal_dsp.pal_idx_finish(pal_idx, pal_tmp, bw4 * 4, bh4 * 4,
  442|  31.2k|                                 w4 * 4, h4 * 4);
  443|  31.2k|}
decode.c:order_palette:
  356|   786k|{
  357|   786k|    int have_top = i > first;
  358|       |
  359|   786k|    assert(pal_idx);
  ------------------
  |  |  140|   786k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 786k]
  |  |  |  Branch (140:68): [Folded, False: 786k]
  |  |  ------------------
  ------------------
  360|   786k|    pal_idx += first + (i - first) * stride;
  361|  6.53M|    for (int j = first, n = 0; j >= last; have_top = 1, j--, n++, pal_idx += stride - 1) {
  ------------------
  |  Branch (361:32): [True: 5.74M, False: 786k]
  ------------------
  362|  5.74M|        const int have_left = j > 0;
  363|       |
  364|  5.74M|        assert(have_left || have_top);
  ------------------
  |  |  140|  6.08M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 5.39M, False: 346k]
  |  |  |  Branch (140:30): [True: 346k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 5.74M]
  |  |  ------------------
  ------------------
  365|       |
  366|  5.74M|#define add(v_in) do { \
  367|  5.74M|        const int v = v_in; \
  368|  5.74M|        assert((unsigned)v < 8U); \
  369|  5.74M|        order[n][o_idx++] = v; \
  370|  5.74M|        mask |= 1 << v; \
  371|  5.74M|    } while (0)
  372|       |
  373|  5.74M|        unsigned mask = 0;
  374|  5.74M|        int o_idx = 0;
  375|  5.74M|        if (!have_left) {
  ------------------
  |  Branch (375:13): [True: 346k, False: 5.39M]
  ------------------
  376|   346k|            ctx[n] = 0;
  377|   346k|            add(pal_idx[-stride]);
  ------------------
  |  |  366|   346k|#define add(v_in) do { \
  |  |  367|   346k|        const int v = v_in; \
  |  |  368|   346k|        assert((unsigned)v < 8U); \
  |  |  ------------------
  |  |  |  |  140|   346k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, False: 346k]
  |  |  |  |  |  Branch (140:68): [Folded, False: 346k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  369|   346k|        order[n][o_idx++] = v; \
  |  |  370|   346k|        mask |= 1 << v; \
  |  |  371|   346k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (371:14): [Folded, False: 346k]
  |  |  ------------------
  ------------------
  378|  5.39M|        } else if (!have_top) {
  ------------------
  |  Branch (378:20): [True: 440k, False: 4.95M]
  ------------------
  379|   440k|            ctx[n] = 0;
  380|   440k|            add(pal_idx[-1]);
  ------------------
  |  |  366|   440k|#define add(v_in) do { \
  |  |  367|   440k|        const int v = v_in; \
  |  |  368|   440k|        assert((unsigned)v < 8U); \
  |  |  ------------------
  |  |  |  |  140|   440k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, False: 440k]
  |  |  |  |  |  Branch (140:68): [Folded, False: 440k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  369|   440k|        order[n][o_idx++] = v; \
  |  |  370|   440k|        mask |= 1 << v; \
  |  |  371|   440k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (371:14): [Folded, False: 440k]
  |  |  ------------------
  ------------------
  381|  4.95M|        } else {
  382|  4.95M|            const int l = pal_idx[-1], t = pal_idx[-stride], tl = pal_idx[-(stride + 1)];
  383|  4.95M|            const int same_t_l = t == l;
  384|  4.95M|            const int same_t_tl = t == tl;
  385|  4.95M|            const int same_l_tl = l == tl;
  386|  4.95M|            const int same_all = same_t_l & same_t_tl & same_l_tl;
  387|       |
  388|  4.95M|            if (same_all) {
  ------------------
  |  Branch (388:17): [True: 2.88M, False: 2.06M]
  ------------------
  389|  2.88M|                ctx[n] = 4;
  390|  2.88M|                add(t);
  ------------------
  |  |  366|  2.88M|#define add(v_in) do { \
  |  |  367|  2.88M|        const int v = v_in; \
  |  |  368|  2.88M|        assert((unsigned)v < 8U); \
  |  |  ------------------
  |  |  |  |  140|  2.88M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, False: 2.88M]
  |  |  |  |  |  Branch (140:68): [Folded, False: 2.88M]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  369|  2.88M|        order[n][o_idx++] = v; \
  |  |  370|  2.88M|        mask |= 1 << v; \
  |  |  371|  2.88M|    } while (0)
  |  |  ------------------
  |  |  |  Branch (371:14): [Folded, False: 2.88M]
  |  |  ------------------
  ------------------
  391|  2.88M|            } else if (same_t_l) {
  ------------------
  |  Branch (391:24): [True: 146k, False: 1.92M]
  ------------------
  392|   146k|                ctx[n] = 3;
  393|   146k|                add(t);
  ------------------
  |  |  366|   146k|#define add(v_in) do { \
  |  |  367|   146k|        const int v = v_in; \
  |  |  368|   146k|        assert((unsigned)v < 8U); \
  |  |  ------------------
  |  |  |  |  140|   146k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, False: 146k]
  |  |  |  |  |  Branch (140:68): [Folded, False: 146k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  369|   146k|        order[n][o_idx++] = v; \
  |  |  370|   146k|        mask |= 1 << v; \
  |  |  371|   146k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (371:14): [Folded, False: 146k]
  |  |  ------------------
  ------------------
  394|   146k|                add(tl);
  ------------------
  |  |  366|   146k|#define add(v_in) do { \
  |  |  367|   146k|        const int v = v_in; \
  |  |  368|   146k|        assert((unsigned)v < 8U); \
  |  |  ------------------
  |  |  |  |  140|   146k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, False: 146k]
  |  |  |  |  |  Branch (140:68): [Folded, False: 146k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  369|   146k|        order[n][o_idx++] = v; \
  |  |  370|   146k|        mask |= 1 << v; \
  |  |  371|   146k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (371:14): [Folded, False: 146k]
  |  |  ------------------
  ------------------
  395|  1.92M|            } else if (same_t_tl | same_l_tl) {
  ------------------
  |  Branch (395:24): [True: 1.54M, False: 372k]
  ------------------
  396|  1.54M|                ctx[n] = 2;
  397|  1.54M|                add(tl);
  ------------------
  |  |  366|  1.54M|#define add(v_in) do { \
  |  |  367|  1.54M|        const int v = v_in; \
  |  |  368|  1.54M|        assert((unsigned)v < 8U); \
  |  |  ------------------
  |  |  |  |  140|  1.54M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, False: 1.54M]
  |  |  |  |  |  Branch (140:68): [Folded, False: 1.54M]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  369|  1.54M|        order[n][o_idx++] = v; \
  |  |  370|  1.54M|        mask |= 1 << v; \
  |  |  371|  1.54M|    } while (0)
  |  |  ------------------
  |  |  |  Branch (371:14): [Folded, False: 1.54M]
  |  |  ------------------
  ------------------
  398|  1.54M|                add(same_t_tl ? l : t);
  ------------------
  |  |  366|  1.54M|#define add(v_in) do { \
  |  |  367|  3.09M|        const int v = v_in; \
  |  |  ------------------
  |  |  |  Branch (367:23): [True: 765k, False: 783k]
  |  |  ------------------
  |  |  368|  1.54M|        assert((unsigned)v < 8U); \
  |  |  ------------------
  |  |  |  |  140|  1.54M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, False: 1.54M]
  |  |  |  |  |  Branch (140:68): [Folded, False: 1.54M]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  369|  1.54M|        order[n][o_idx++] = v; \
  |  |  370|  1.54M|        mask |= 1 << v; \
  |  |  371|  1.54M|    } while (0)
  |  |  ------------------
  |  |  |  Branch (371:14): [Folded, False: 1.54M]
  |  |  ------------------
  ------------------
  399|  1.54M|            } else {
  400|   372k|                ctx[n] = 1;
  401|   372k|                add(imin(t, l));
  ------------------
  |  |  366|   372k|#define add(v_in) do { \
  |  |  367|   372k|        const int v = v_in; \
  |  |  368|   372k|        assert((unsigned)v < 8U); \
  |  |  ------------------
  |  |  |  |  140|   372k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, False: 372k]
  |  |  |  |  |  Branch (140:68): [Folded, False: 372k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  369|   372k|        order[n][o_idx++] = v; \
  |  |  370|   372k|        mask |= 1 << v; \
  |  |  371|   372k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (371:14): [Folded, False: 372k]
  |  |  ------------------
  ------------------
  402|   372k|                add(imax(t, l));
  ------------------
  |  |  366|   372k|#define add(v_in) do { \
  |  |  367|   372k|        const int v = v_in; \
  |  |  368|   372k|        assert((unsigned)v < 8U); \
  |  |  ------------------
  |  |  |  |  140|   372k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, False: 372k]
  |  |  |  |  |  Branch (140:68): [Folded, False: 372k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  369|   372k|        order[n][o_idx++] = v; \
  |  |  370|   372k|        mask |= 1 << v; \
  |  |  371|   372k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (371:14): [Folded, False: 372k]
  |  |  ------------------
  ------------------
  403|   372k|                add(tl);
  ------------------
  |  |  366|   372k|#define add(v_in) do { \
  |  |  367|   372k|        const int v = v_in; \
  |  |  368|   372k|        assert((unsigned)v < 8U); \
  |  |  ------------------
  |  |  |  |  140|   372k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, False: 372k]
  |  |  |  |  |  Branch (140:68): [Folded, False: 372k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  369|   372k|        order[n][o_idx++] = v; \
  |  |  370|   372k|        mask |= 1 << v; \
  |  |  371|   372k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (371:14): [Folded, False: 372k]
  |  |  ------------------
  ------------------
  404|   372k|            }
  405|  4.95M|        }
  406|  51.6M|        for (unsigned m = 1, bit = 0; m < 0x100; m <<= 1, bit++)
  ------------------
  |  Branch (406:39): [True: 45.8M, False: 5.74M]
  ------------------
  407|  45.8M|            if (!(mask & m))
  ------------------
  |  Branch (407:17): [True: 37.6M, False: 8.18M]
  ------------------
  408|  37.6M|                order[n][o_idx++] = bit;
  409|  5.74M|        assert(o_idx == 8);
  ------------------
  |  |  140|  5.74M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 5.74M]
  |  |  |  Branch (140:68): [Folded, False: 5.74M]
  |  |  ------------------
  ------------------
  410|  5.74M|#undef add
  411|  5.74M|    }
  412|   786k|}
decode.c:splat_intraref:
  566|   399k|{
  567|   399k|    const refmvs_block ALIGN(tmpl, 16) = (refmvs_block) {
  568|   399k|        .ref.ref = { 0, -1 },
  569|   399k|        .mv.mv[0].n = INVALID_MV,
  ------------------
  |  |   40|   399k|#define INVALID_MV 0x80008000
  ------------------
  570|   399k|        .bs = bs,
  571|   399k|        .mf = 0,
  572|   399k|    };
  573|   399k|    c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
  574|   399k|}
decode.c:read_mv_residual:
  109|   991k|{
  110|   991k|    MsacContext *const msac = &ts->msac;
  111|   991k|    const enum MVJoint mv_joint =
  112|   991k|        dav1d_msac_decode_symbol_adapt4(msac, ts->cdf.mv.joint, N_MV_JOINTS - 1);
  ------------------
  |  |   47|   991k|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  ------------------
  113|   991k|    if (mv_joint & MV_JOINT_V)
  ------------------
  |  Branch (113:9): [True: 959k, False: 31.5k]
  ------------------
  114|   959k|        ref_mv->y += read_mv_component_diff(msac, &ts->cdf.mv.comp[0], mv_prec);
  115|   991k|    if (mv_joint & MV_JOINT_H)
  ------------------
  |  Branch (115:9): [True: 960k, False: 30.3k]
  ------------------
  116|   960k|        ref_mv->x += read_mv_component_diff(msac, &ts->cdf.mv.comp[1], mv_prec);
  117|   991k|}
decode.c:read_mv_component_diff:
   79|  1.89M|{
   80|  1.89M|    const int sign = dav1d_msac_decode_bool_adapt(msac, mv_comp->sign);
  ------------------
  |  |   52|  1.89M|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
   81|  1.89M|    const int cl = dav1d_msac_decode_symbol_adapt16(msac, mv_comp->classes, 10);
  ------------------
  |  |   57|  1.89M|#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
  ------------------
   82|  1.89M|    int up, fp = 3, hp = 1;
   83|       |
   84|  1.89M|    if (!cl) {
  ------------------
  |  Branch (84:9): [True: 93.9k, False: 1.80M]
  ------------------
   85|  93.9k|        up = dav1d_msac_decode_bool_adapt(msac, mv_comp->class0);
  ------------------
  |  |   52|  93.9k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
   86|  93.9k|        if (mv_prec >= 0) {  // !force_integer_mv
  ------------------
  |  Branch (86:13): [True: 27.3k, False: 66.5k]
  ------------------
   87|  27.3k|            fp = dav1d_msac_decode_symbol_adapt4(msac, mv_comp->class0_fp[up], 3);
  ------------------
  |  |   47|  27.3k|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  ------------------
   88|  27.3k|            if (mv_prec > 0) // allow_high_precision_mv
  ------------------
  |  Branch (88:17): [True: 13.7k, False: 13.6k]
  ------------------
   89|  13.7k|                hp = dav1d_msac_decode_bool_adapt(msac, mv_comp->class0_hp);
  ------------------
  |  |   52|  13.7k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
   90|  27.3k|        }
   91|  1.80M|    } else {
   92|  1.80M|        up = 1 << cl;
   93|  18.5M|        for (int n = 0; n < cl; n++)
  ------------------
  |  Branch (93:25): [True: 16.7M, False: 1.80M]
  ------------------
   94|  16.7M|            up |= dav1d_msac_decode_bool_adapt(msac, mv_comp->classN[n]) << n;
  ------------------
  |  |   52|  16.7M|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
   95|  1.80M|        if (mv_prec >= 0) {  // !force_integer_mv
  ------------------
  |  Branch (95:13): [True: 10.5k, False: 1.79M]
  ------------------
   96|  10.5k|            fp = dav1d_msac_decode_symbol_adapt4(msac, mv_comp->classN_fp, 3);
  ------------------
  |  |   47|  10.5k|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  ------------------
   97|  10.5k|            if (mv_prec > 0) // allow_high_precision_mv
  ------------------
  |  Branch (97:17): [True: 5.19k, False: 5.30k]
  ------------------
   98|  5.19k|                hp = dav1d_msac_decode_bool_adapt(msac, mv_comp->classN_hp);
  ------------------
  |  |   52|  5.19k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
   99|  10.5k|        }
  100|  1.80M|    }
  101|       |
  102|  1.89M|    const int diff = ((up << 3) | (fp << 1) | hp) + 1;
  103|       |
  104|  1.89M|    return sign ? -diff : diff;
  ------------------
  |  Branch (104:12): [True: 1.83M, False: 63.0k]
  ------------------
  105|  1.89M|}
decode.c:read_vartx_tree:
  448|  1.04M|{
  449|  1.04M|    const Dav1dFrameContext *const f = t->f;
  450|  1.04M|    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
  451|  1.04M|    const int bw4 = b_dim[0], bh4 = b_dim[1];
  452|       |
  453|       |    // var-tx tree coding
  454|  1.04M|    uint16_t tx_split[2] = { 0 };
  455|  1.04M|    b->max_ytx = dav1d_max_txfm_size_for_bs[bs][0];
  456|  1.04M|    if (!b->skip && (f->frame_hdr->segmentation.lossless[b->seg_id] ||
  ------------------
  |  Branch (456:9): [True: 91.8k, False: 955k]
  |  Branch (456:22): [True: 12.3k, False: 79.5k]
  ------------------
  457|  79.5k|                     b->max_ytx == TX_4X4))
  ------------------
  |  Branch (457:22): [True: 6.08k, False: 73.4k]
  ------------------
  458|  18.4k|    {
  459|  18.4k|        b->max_ytx = b->uvtx = TX_4X4;
  460|  18.4k|        if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
  ------------------
  |  Branch (460:13): [True: 2.73k, False: 15.6k]
  ------------------
  461|  2.73k|            dav1d_memset_pow2[b_dim[2]](&t->a->tx[bx4], TX_4X4);
  462|  2.73k|            dav1d_memset_pow2[b_dim[3]](&t->l.tx[by4], TX_4X4);
  463|  2.73k|        }
  464|  1.02M|    } else if (f->frame_hdr->txfm_mode != DAV1D_TX_SWITCHABLE || b->skip) {
  ------------------
  |  Branch (464:16): [True: 826k, False: 202k]
  |  Branch (464:66): [True: 169k, False: 32.8k]
  ------------------
  465|   997k|        if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
  ------------------
  |  Branch (465:13): [True: 169k, False: 827k]
  ------------------
  466|   169k|            dav1d_memset_pow2[b_dim[2]](&t->a->tx[bx4], b_dim[2 + 0]);
  467|   169k|            dav1d_memset_pow2[b_dim[3]](&t->l.tx[by4], b_dim[2 + 1]);
  468|   169k|        }
  469|   997k|        b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
  470|   997k|    } else {
  471|  31.6k|        assert(bw4 <= 16 || bh4 <= 16 || b->max_ytx == TX_64X64);
  ------------------
  |  |  140|  63.5k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 31.4k, False: 204]
  |  |  |  Branch (140:30): [True: 178, False: 26]
  |  |  |  Branch (140:30): [True: 26, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 31.6k]
  |  |  ------------------
  ------------------
  472|  31.6k|        int y, x, y_off, x_off;
  473|  31.6k|        const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
  474|  64.6k|        for (y = 0, y_off = 0; y < bh4; y += ytx->h, y_off++) {
  ------------------
  |  Branch (474:32): [True: 33.0k, False: 31.6k]
  ------------------
  475|  66.3k|            for (x = 0, x_off = 0; x < bw4; x += ytx->w, x_off++) {
  ------------------
  |  Branch (475:36): [True: 33.2k, False: 33.0k]
  ------------------
  476|  33.2k|                read_tx_tree(t, b->max_ytx, 0, tx_split, x_off, y_off);
  477|       |                // contexts are updated inside read_tx_tree()
  478|  33.2k|                t->bx += ytx->w;
  479|  33.2k|            }
  480|  33.0k|            t->bx -= x;
  481|  33.0k|            t->by += ytx->h;
  482|  33.0k|        }
  483|  31.6k|        t->by -= y;
  484|  31.6k|        if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  31.6k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 31.6k]
  |  |  ------------------
  |  |   35|  31.6k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  31.6k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  485|      0|            printf("Post-vartxtree[%x/%x]: r=%d\n",
  486|      0|                   tx_split[0], tx_split[1], t->ts->msac.rng);
  487|  31.6k|        b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
  488|  31.6k|    }
  489|  1.04M|    assert(!(tx_split[0] & ~0x33));
  ------------------
  |  |  140|  1.04M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1.04M]
  |  |  |  Branch (140:68): [Folded, False: 1.04M]
  |  |  ------------------
  ------------------
  490|  1.04M|    b->tx_split0 = (uint8_t)tx_split[0];
  491|  1.04M|    b->tx_split1 = tx_split[1];
  492|  1.04M|}
decode.c:read_tx_tree:
  123|  63.7k|{
  124|  63.7k|    const Dav1dFrameContext *const f = t->f;
  125|  63.7k|    const int bx4 = t->bx & 31, by4 = t->by & 31;
  126|  63.7k|    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[from];
  127|  63.7k|    const int txw = t_dim->lw, txh = t_dim->lh;
  128|  63.7k|    int is_split;
  129|       |
  130|  63.7k|    if (depth < 2 && from > (int) TX_4X4) {
  ------------------
  |  Branch (130:9): [True: 54.6k, False: 9.12k]
  |  Branch (130:22): [True: 54.6k, False: 0]
  ------------------
  131|  54.6k|        const int cat = 2 * (TX_64X64 - t_dim->max) - depth;
  132|  54.6k|        const int a = t->a->tx[bx4] < txw;
  133|  54.6k|        const int l = t->l.tx[by4] < txh;
  134|       |
  135|  54.6k|        is_split = dav1d_msac_decode_bool_adapt(&t->ts->msac,
  ------------------
  |  |   52|  54.6k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  136|  54.6k|                       t->ts->cdf.m.txpart[cat][a + l]);
  137|  54.6k|        if (is_split)
  ------------------
  |  Branch (137:13): [True: 16.1k, False: 38.5k]
  ------------------
  138|  16.1k|            masks[depth] |= 1 << (y_off * 4 + x_off);
  139|  54.6k|    } else {
  140|  9.12k|        is_split = 0;
  141|  9.12k|    }
  142|       |
  143|  63.7k|    if (is_split && t_dim->max > TX_8X8) {
  ------------------
  |  Branch (143:9): [True: 16.1k, False: 47.6k]
  |  Branch (143:21): [True: 11.6k, False: 4.42k]
  ------------------
  144|  11.6k|        const enum RectTxfmSize sub = t_dim->sub;
  145|  11.6k|        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
  146|  11.6k|        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
  147|       |
  148|  11.6k|        read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 0, y_off * 2 + 0);
  149|  11.6k|        t->bx += txsw;
  150|  11.6k|        if (txw >= txh && t->bx < f->bw)
  ------------------
  |  Branch (150:13): [True: 9.01k, False: 2.67k]
  |  Branch (150:27): [True: 8.36k, False: 648]
  ------------------
  151|  8.36k|            read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 1, y_off * 2 + 0);
  152|  11.6k|        t->bx -= txsw;
  153|  11.6k|        t->by += txsh;
  154|  11.6k|        if (txh >= txw && t->by < f->bh) {
  ------------------
  |  Branch (154:13): [True: 7.46k, False: 4.22k]
  |  Branch (154:27): [True: 6.83k, False: 627]
  ------------------
  155|  6.83k|            read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 0, y_off * 2 + 1);
  156|  6.83k|            t->bx += txsw;
  157|  6.83k|            if (txw >= txh && t->bx < f->bw)
  ------------------
  |  Branch (157:17): [True: 4.19k, False: 2.64k]
  |  Branch (157:31): [True: 3.58k, False: 604]
  ------------------
  158|  3.58k|                read_tx_tree(t, sub, depth + 1, masks,
  159|  3.58k|                             x_off * 2 + 1, y_off * 2 + 1);
  160|  6.83k|            t->bx -= txsw;
  161|  6.83k|        }
  162|  11.6k|        t->by -= txsh;
  163|  52.0k|    } else {
  164|  52.0k|        dav1d_memset_pow2[t_dim->lw](&t->a->tx[bx4], is_split ? TX_4X4 : txw);
  ------------------
  |  Branch (164:54): [True: 4.42k, False: 47.6k]
  ------------------
  165|  52.0k|        dav1d_memset_pow2[t_dim->lh](&t->l.tx[by4], is_split ? TX_4X4 : txh);
  ------------------
  |  Branch (165:53): [True: 4.42k, False: 47.6k]
  ------------------
  166|  52.0k|    }
  167|  63.7k|}
decode.c:splat_intrabc_mv:
  535|   962k|{
  536|   962k|    const refmvs_block ALIGN(tmpl, 16) = (refmvs_block) {
  537|   962k|        .ref.ref = { 0, -1 },
  538|   962k|        .mv.mv[0] = b->mv[0],
  539|   962k|        .bs = bs,
  540|   962k|        .mf = 0,
  541|   962k|    };
  542|   962k|    c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
  543|   962k|}
decode.c:findoddzero:
  339|  33.1k|static inline int findoddzero(const uint8_t *buf, int len) {
  340|  36.5k|    for (int n = 0; n < len; n++)
  ------------------
  |  Branch (340:21): [True: 35.2k, False: 1.30k]
  ------------------
  341|  35.2k|        if (!buf[n * 2]) return 1;
  ------------------
  |  Branch (341:13): [True: 31.8k, False: 3.42k]
  ------------------
  342|  1.30k|    return 0;
  343|  33.1k|}
decode.c:find_matching_ref:
  197|  31.8k|{
  198|  31.8k|    /*const*/ refmvs_block *const *r = &t->rt.r[(t->by & 31) + 5];
  199|  31.8k|    int count = 0;
  200|  31.8k|    int have_topleft = have_top && have_left;
  ------------------
  |  Branch (200:24): [True: 23.4k, False: 8.39k]
  |  Branch (200:36): [True: 20.8k, False: 2.59k]
  ------------------
  201|  31.8k|    int have_topright = imax(bw4, bh4) < 32 &&
  ------------------
  |  Branch (201:25): [True: 31.7k, False: 89]
  ------------------
  202|  31.7k|                        have_top && t->bx + bw4 < t->ts->tiling.col_end &&
  ------------------
  |  Branch (202:25): [True: 23.4k, False: 8.30k]
  |  Branch (202:37): [True: 20.7k, False: 2.69k]
  ------------------
  203|  20.7k|                        (intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT);
  ------------------
  |  Branch (203:25): [True: 11.8k, False: 8.85k]
  ------------------
  204|       |
  205|  31.8k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  206|  31.8k|#define matches(rp) ((rp)->ref.ref[0] == ref + 1 && (rp)->ref.ref[1] == -1)
  207|       |
  208|  31.8k|    if (have_top) {
  ------------------
  |  Branch (208:9): [True: 23.4k, False: 8.38k]
  ------------------
  209|  23.4k|        const refmvs_block *r2 = &r[-1][t->bx];
  210|  23.4k|        if (matches(r2)) {
  ------------------
  |  |  206|  23.4k|#define matches(rp) ((rp)->ref.ref[0] == ref + 1 && (rp)->ref.ref[1] == -1)
  |  |  ------------------
  |  |  |  Branch (206:22): [True: 17.1k, False: 6.25k]
  |  |  |  Branch (206:53): [True: 16.0k, False: 1.16k]
  |  |  ------------------
  ------------------
  211|  16.0k|            masks[0] |= 1;
  212|  16.0k|            count = 1;
  213|  16.0k|        }
  214|  23.4k|        int aw4 = bs(r2)[0];
  ------------------
  |  |  205|  23.4k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  ------------------
  215|  23.4k|        if (aw4 >= bw4) {
  ------------------
  |  Branch (215:13): [True: 20.2k, False: 3.17k]
  ------------------
  216|  20.2k|            const int off = t->bx & (aw4 - 1);
  217|  20.2k|            if (off) have_topleft = 0;
  ------------------
  |  Branch (217:17): [True: 3.14k, False: 17.1k]
  ------------------
  218|  20.2k|            if (aw4 - off > bw4) have_topright = 0;
  ------------------
  |  Branch (218:17): [True: 3.35k, False: 16.9k]
  ------------------
  219|  20.2k|        } else {
  220|  3.17k|            unsigned mask = 1 << aw4;
  221|  7.16k|            for (int x = aw4; x < w4; x += aw4) {
  ------------------
  |  Branch (221:31): [True: 3.99k, False: 3.17k]
  ------------------
  222|  3.99k|                r2 += aw4;
  223|  3.99k|                if (matches(r2)) {
  ------------------
  |  |  206|  3.99k|#define matches(rp) ((rp)->ref.ref[0] == ref + 1 && (rp)->ref.ref[1] == -1)
  |  |  ------------------
  |  |  |  Branch (206:22): [True: 2.64k, False: 1.34k]
  |  |  |  Branch (206:53): [True: 2.46k, False: 185]
  |  |  ------------------
  ------------------
  224|  2.46k|                    masks[0] |= mask;
  225|  2.46k|                    if (++count >= 8) return;
  ------------------
  |  Branch (225:25): [True: 8, False: 2.45k]
  ------------------
  226|  2.46k|                }
  227|  3.98k|                aw4 = bs(r2)[0];
  ------------------
  |  |  205|  3.98k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  ------------------
  228|  3.98k|                mask <<= aw4;
  229|  3.98k|            }
  230|  3.17k|        }
  231|  23.4k|    }
  232|  31.8k|    if (have_left) {
  ------------------
  |  Branch (232:9): [True: 29.2k, False: 2.58k]
  ------------------
  233|  29.2k|        /*const*/ refmvs_block *const *r2 = r;
  234|  29.2k|        if (matches(&r2[0][t->bx - 1])) {
  ------------------
  |  |  206|  29.2k|#define matches(rp) ((rp)->ref.ref[0] == ref + 1 && (rp)->ref.ref[1] == -1)
  |  |  ------------------
  |  |  |  Branch (206:22): [True: 22.4k, False: 6.81k]
  |  |  |  Branch (206:53): [True: 21.0k, False: 1.41k]
  |  |  ------------------
  ------------------
  235|  21.0k|            masks[1] |= 1;
  236|  21.0k|            if (++count >= 8) return;
  ------------------
  |  Branch (236:17): [True: 4, False: 21.0k]
  ------------------
  237|  21.0k|        }
  238|  29.2k|        int lh4 = bs(&r2[0][t->bx - 1])[1];
  ------------------
  |  |  205|  29.2k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  ------------------
  239|  29.2k|        if (lh4 >= bh4) {
  ------------------
  |  Branch (239:13): [True: 24.9k, False: 4.29k]
  ------------------
  240|  24.9k|            if (t->by & (lh4 - 1)) have_topleft = 0;
  ------------------
  |  Branch (240:17): [True: 3.81k, False: 21.1k]
  ------------------
  241|  24.9k|        } else {
  242|  4.29k|            unsigned mask = 1 << lh4;
  243|  10.5k|            for (int y = lh4; y < h4; y += lh4) {
  ------------------
  |  Branch (243:31): [True: 6.24k, False: 4.28k]
  ------------------
  244|  6.24k|                r2 += lh4;
  245|  6.24k|                if (matches(&r2[0][t->bx - 1])) {
  ------------------
  |  |  206|  6.24k|#define matches(rp) ((rp)->ref.ref[0] == ref + 1 && (rp)->ref.ref[1] == -1)
  |  |  ------------------
  |  |  |  Branch (206:22): [True: 3.44k, False: 2.79k]
  |  |  |  Branch (206:53): [True: 3.22k, False: 220]
  |  |  ------------------
  ------------------
  246|  3.22k|                    masks[1] |= mask;
  247|  3.22k|                    if (++count >= 8) return;
  ------------------
  |  Branch (247:25): [True: 14, False: 3.21k]
  ------------------
  248|  3.22k|                }
  249|  6.22k|                lh4 = bs(&r2[0][t->bx - 1])[1];
  ------------------
  |  |  205|  6.22k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  ------------------
  250|  6.22k|                mask <<= lh4;
  251|  6.22k|            }
  252|  4.29k|        }
  253|  29.2k|    }
  254|  31.8k|    if (have_topleft && matches(&r[-1][t->bx - 1])) {
  ------------------
  |  |  206|  13.8k|#define matches(rp) ((rp)->ref.ref[0] == ref + 1 && (rp)->ref.ref[1] == -1)
  |  |  ------------------
  |  |  |  Branch (206:22): [True: 9.29k, False: 4.58k]
  |  |  |  Branch (206:53): [True: 8.47k, False: 821]
  |  |  ------------------
  ------------------
  |  Branch (254:9): [True: 13.8k, False: 17.9k]
  ------------------
  255|  8.47k|        masks[1] |= 1ULL << 32;
  256|  8.47k|        if (++count >= 8) return;
  ------------------
  |  Branch (256:13): [True: 22, False: 8.44k]
  ------------------
  257|  8.47k|    }
  258|  31.7k|    if (have_topright && matches(&r[-1][t->bx + bw4])) {
  ------------------
  |  |  206|  8.70k|#define matches(rp) ((rp)->ref.ref[0] == ref + 1 && (rp)->ref.ref[1] == -1)
  |  |  ------------------
  |  |  |  Branch (206:22): [True: 5.72k, False: 2.97k]
  |  |  |  Branch (206:53): [True: 5.30k, False: 424]
  |  |  ------------------
  ------------------
  |  Branch (258:9): [True: 8.70k, False: 23.0k]
  ------------------
  259|  5.30k|        masks[0] |= 1ULL << 32;
  260|  5.30k|    }
  261|  31.7k|#undef matches
  262|  31.7k|}
decode.c:derive_warpmv:
  268|  5.48k|{
  269|  5.48k|    int pts[8][2 /* in, out */][2 /* x, y */], np = 0;
  270|  5.48k|    /*const*/ refmvs_block *const *r = &t->rt.r[(t->by & 31) + 5];
  271|       |
  272|  5.48k|#define add_sample(dx, dy, sx, sy, rp) do { \
  273|  5.48k|    pts[np][0][0] = 16 * (2 * dx + sx * bs(rp)[0]) - 8; \
  274|  5.48k|    pts[np][0][1] = 16 * (2 * dy + sy * bs(rp)[1]) - 8; \
  275|  5.48k|    pts[np][1][0] = pts[np][0][0] + (rp)->mv.mv[0].x; \
  276|  5.48k|    pts[np][1][1] = pts[np][0][1] + (rp)->mv.mv[0].y; \
  277|  5.48k|    np++; \
  278|  5.48k|} while (0)
  279|       |
  280|       |    // use masks[] to find the projectable motion vectors in the edges
  281|  5.48k|    if ((unsigned) masks[0] == 1 && !(masks[1] >> 32)) {
  ------------------
  |  Branch (281:9): [True: 2.60k, False: 2.88k]
  |  Branch (281:37): [True: 1.07k, False: 1.53k]
  ------------------
  282|  1.07k|        const int off = t->bx & (bs(&r[-1][t->bx])[0] - 1);
  ------------------
  |  |  205|  1.07k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  ------------------
  283|  1.07k|        add_sample(-off, 0, 1, -1, &r[-1][t->bx]);
  ------------------
  |  |  272|  1.07k|#define add_sample(dx, dy, sx, sy, rp) do { \
  |  |  273|  1.07k|    pts[np][0][0] = 16 * (2 * dx + sx * bs(rp)[0]) - 8; \
  |  |  ------------------
  |  |  |  |  205|  1.07k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  |  |  ------------------
  |  |  274|  1.07k|    pts[np][0][1] = 16 * (2 * dy + sy * bs(rp)[1]) - 8; \
  |  |  ------------------
  |  |  |  |  205|  1.07k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  |  |  ------------------
  |  |  275|  1.07k|    pts[np][1][0] = pts[np][0][0] + (rp)->mv.mv[0].x; \
  |  |  276|  1.07k|    pts[np][1][1] = pts[np][0][1] + (rp)->mv.mv[0].y; \
  |  |  277|  1.07k|    np++; \
  |  |  278|  1.07k|} while (0)
  |  |  ------------------
  |  |  |  Branch (278:10): [Folded, False: 1.07k]
  |  |  ------------------
  ------------------
  284|  6.93k|    } else for (unsigned off = 0, xmask = (uint32_t) masks[0]; np < 8 && xmask;) { // top
  ------------------
  |  Branch (284:64): [True: 6.92k, False: 7]
  |  Branch (284:74): [True: 2.51k, False: 4.40k]
  ------------------
  285|  2.51k|        const int tz = ctz(xmask);
  286|  2.51k|        off += tz;
  287|  2.51k|        xmask >>= tz;
  288|  2.51k|        add_sample(off, 0, 1, -1, &r[-1][t->bx + off]);
  ------------------
  |  |  272|  2.51k|#define add_sample(dx, dy, sx, sy, rp) do { \
  |  |  273|  2.51k|    pts[np][0][0] = 16 * (2 * dx + sx * bs(rp)[0]) - 8; \
  |  |  ------------------
  |  |  |  |  205|  2.51k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  |  |  ------------------
  |  |  274|  2.51k|    pts[np][0][1] = 16 * (2 * dy + sy * bs(rp)[1]) - 8; \
  |  |  ------------------
  |  |  |  |  205|  2.51k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  |  |  ------------------
  |  |  275|  2.51k|    pts[np][1][0] = pts[np][0][0] + (rp)->mv.mv[0].x; \
  |  |  276|  2.51k|    pts[np][1][1] = pts[np][0][1] + (rp)->mv.mv[0].y; \
  |  |  277|  2.51k|    np++; \
  |  |  278|  2.51k|} while (0)
  |  |  ------------------
  |  |  |  Branch (278:10): [Folded, False: 2.51k]
  |  |  ------------------
  ------------------
  289|  2.51k|        xmask &= ~1;
  290|  2.51k|    }
  291|  5.48k|    if (np < 8 && masks[1] == 1) {
  ------------------
  |  Branch (291:9): [True: 5.47k, False: 8]
  |  Branch (291:19): [True: 2.82k, False: 2.65k]
  ------------------
  292|  2.82k|        const int off = t->by & (bs(&r[0][t->bx - 1])[1] - 1);
  ------------------
  |  |  205|  2.82k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  ------------------
  293|  2.82k|        add_sample(0, -off, -1, 1, &r[-off][t->bx - 1]);
  ------------------
  |  |  272|  2.82k|#define add_sample(dx, dy, sx, sy, rp) do { \
  |  |  273|  2.82k|    pts[np][0][0] = 16 * (2 * dx + sx * bs(rp)[0]) - 8; \
  |  |  ------------------
  |  |  |  |  205|  2.82k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  |  |  ------------------
  |  |  274|  2.82k|    pts[np][0][1] = 16 * (2 * dy + sy * bs(rp)[1]) - 8; \
  |  |  ------------------
  |  |  |  |  205|  2.82k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  |  |  ------------------
  |  |  275|  2.82k|    pts[np][1][0] = pts[np][0][0] + (rp)->mv.mv[0].x; \
  |  |  276|  2.82k|    pts[np][1][1] = pts[np][0][1] + (rp)->mv.mv[0].y; \
  |  |  277|  2.82k|    np++; \
  |  |  278|  2.82k|} while (0)
  |  |  ------------------
  |  |  |  Branch (278:10): [Folded, False: 2.82k]
  |  |  ------------------
  ------------------
  294|  5.24k|    } else for (unsigned off = 0, ymask = (uint32_t) masks[1]; np < 8 && ymask;) { // left
  ------------------
  |  Branch (294:64): [True: 5.22k, False: 18]
  |  Branch (294:74): [True: 2.58k, False: 2.64k]
  ------------------
  295|  2.58k|        const int tz = ctz(ymask);
  296|  2.58k|        off += tz;
  297|  2.58k|        ymask >>= tz;
  298|  2.58k|        add_sample(0, off, -1, 1, &r[off][t->bx - 1]);
  ------------------
  |  |  272|  2.58k|#define add_sample(dx, dy, sx, sy, rp) do { \
  |  |  273|  2.58k|    pts[np][0][0] = 16 * (2 * dx + sx * bs(rp)[0]) - 8; \
  |  |  ------------------
  |  |  |  |  205|  2.58k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  |  |  ------------------
  |  |  274|  2.58k|    pts[np][0][1] = 16 * (2 * dy + sy * bs(rp)[1]) - 8; \
  |  |  ------------------
  |  |  |  |  205|  2.58k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  |  |  ------------------
  |  |  275|  2.58k|    pts[np][1][0] = pts[np][0][0] + (rp)->mv.mv[0].x; \
  |  |  276|  2.58k|    pts[np][1][1] = pts[np][0][1] + (rp)->mv.mv[0].y; \
  |  |  277|  2.58k|    np++; \
  |  |  278|  2.58k|} while (0)
  |  |  ------------------
  |  |  |  Branch (278:10): [Folded, False: 2.58k]
  |  |  ------------------
  ------------------
  299|  2.58k|        ymask &= ~1;
  300|  2.58k|    }
  301|  5.48k|    if (np < 8 && masks[1] >> 32) // top/left
  ------------------
  |  Branch (301:9): [True: 5.46k, False: 19]
  |  Branch (301:19): [True: 1.93k, False: 3.52k]
  ------------------
  302|  1.93k|        add_sample(0, 0, -1, -1, &r[-1][t->bx - 1]);
  ------------------
  |  |  272|  1.93k|#define add_sample(dx, dy, sx, sy, rp) do { \
  |  |  273|  1.93k|    pts[np][0][0] = 16 * (2 * dx + sx * bs(rp)[0]) - 8; \
  |  |  ------------------
  |  |  |  |  205|  1.93k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  |  |  ------------------
  |  |  274|  1.93k|    pts[np][0][1] = 16 * (2 * dy + sy * bs(rp)[1]) - 8; \
  |  |  ------------------
  |  |  |  |  205|  1.93k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  |  |  ------------------
  |  |  275|  1.93k|    pts[np][1][0] = pts[np][0][0] + (rp)->mv.mv[0].x; \
  |  |  276|  1.93k|    pts[np][1][1] = pts[np][0][1] + (rp)->mv.mv[0].y; \
  |  |  277|  1.93k|    np++; \
  |  |  278|  1.93k|} while (0)
  |  |  ------------------
  |  |  |  Branch (278:10): [Folded, False: 1.93k]
  |  |  ------------------
  ------------------
  303|  5.48k|    if (np < 8 && masks[0] >> 32) // top/right
  ------------------
  |  Branch (303:9): [True: 5.46k, False: 21]
  |  Branch (303:19): [True: 1.55k, False: 3.90k]
  ------------------
  304|  1.55k|        add_sample(bw4, 0, 1, -1, &r[-1][t->bx + bw4]);
  ------------------
  |  |  272|  1.55k|#define add_sample(dx, dy, sx, sy, rp) do { \
  |  |  273|  1.55k|    pts[np][0][0] = 16 * (2 * dx + sx * bs(rp)[0]) - 8; \
  |  |  ------------------
  |  |  |  |  205|  1.55k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  |  |  ------------------
  |  |  274|  1.55k|    pts[np][0][1] = 16 * (2 * dy + sy * bs(rp)[1]) - 8; \
  |  |  ------------------
  |  |  |  |  205|  1.55k|#define bs(rp) dav1d_block_dimensions[(rp)->bs]
  |  |  ------------------
  |  |  275|  1.55k|    pts[np][1][0] = pts[np][0][0] + (rp)->mv.mv[0].x; \
  |  |  276|  1.55k|    pts[np][1][1] = pts[np][0][1] + (rp)->mv.mv[0].y; \
  |  |  277|  1.55k|    np++; \
  |  |  278|  1.55k|} while (0)
  |  |  ------------------
  |  |  |  Branch (278:10): [Folded, False: 1.55k]
  |  |  ------------------
  ------------------
  305|  5.48k|    assert(np > 0 && np <= 8);
  ------------------
  |  |  140|  10.9k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 5.48k, False: 2]
  |  |  |  Branch (140:30): [True: 5.48k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 5.48k]
  |  |  ------------------
  ------------------
  306|  5.48k|#undef bs
  307|       |
  308|       |    // select according to motion vector difference against a threshold
  309|  5.48k|    int mvd[8], ret = 0;
  310|  5.48k|    const int thresh = 4 * iclip(imax(bw4, bh4), 4, 28);
  311|  17.9k|    for (int i = 0; i < np; i++) {
  ------------------
  |  Branch (311:21): [True: 12.4k, False: 5.48k]
  ------------------
  312|  12.4k|        mvd[i] = abs(pts[i][1][0] - pts[i][0][0] - mv.x) +
  313|  12.4k|                 abs(pts[i][1][1] - pts[i][0][1] - mv.y);
  314|  12.4k|        if (mvd[i] > thresh)
  ------------------
  |  Branch (314:13): [True: 2.75k, False: 9.74k]
  ------------------
  315|  2.75k|            mvd[i] = -1;
  316|  9.74k|        else
  317|  9.74k|            ret++;
  318|  12.4k|    }
  319|  5.48k|    if (!ret) {
  ------------------
  |  Branch (319:9): [True: 813, False: 4.67k]
  ------------------
  320|    813|        ret = 1;
  321|  5.24k|    } else for (int i = 0, j = np - 1, k = 0; k < np - ret; k++, i++, j--) {
  ------------------
  |  Branch (321:47): [True: 1.09k, False: 4.14k]
  ------------------
  322|  1.76k|        while (mvd[i] != -1) i++;
  ------------------
  |  Branch (322:16): [True: 669, False: 1.09k]
  ------------------
  323|  2.00k|        while (mvd[j] == -1) j--;
  ------------------
  |  Branch (323:16): [True: 907, False: 1.09k]
  ------------------
  324|  1.09k|        assert(i != j);
  ------------------
  |  |  140|  1.09k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1.09k]
  |  |  |  Branch (140:68): [Folded, False: 1.09k]
  |  |  ------------------
  ------------------
  325|  1.09k|        if (i > j) break;
  ------------------
  |  Branch (325:13): [True: 533, False: 566]
  ------------------
  326|       |        // replace the discarded samples;
  327|    566|        mvd[i] = mvd[j];
  328|    566|        memcpy(pts[i], pts[j], sizeof(*pts));
  329|    566|    }
  330|       |
  331|  5.48k|    if (!dav1d_find_affine_int(pts, ret, bw4, bh4, mv, wmp, t->bx, t->by) &&
  ------------------
  |  Branch (331:9): [True: 5.26k, False: 224]
  ------------------
  332|  5.26k|        !dav1d_get_shear_params(wmp))
  ------------------
  |  Branch (332:9): [True: 4.97k, False: 288]
  ------------------
  333|  4.97k|    {
  334|  4.97k|        wmp->type = DAV1D_WM_TYPE_AFFINE;
  335|  4.97k|    } else
  336|    512|        wmp->type = DAV1D_WM_TYPE_IDENTITY;
  337|  5.48k|}
decode.c:splat_tworef_mv:
  550|  15.1k|{
  551|  15.1k|    assert(bw4 >= 2 && bh4 >= 2);
  ------------------
  |  |  140|  30.2k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 15.1k, False: 0]
  |  |  |  Branch (140:30): [True: 15.1k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 15.1k]
  |  |  ------------------
  ------------------
  552|  15.1k|    const enum CompInterPredMode mode = b->inter_mode;
  553|  15.1k|    const refmvs_block ALIGN(tmpl, 16) = (refmvs_block) {
  554|  15.1k|        .ref.ref = { b->ref[0] + 1, b->ref[1] + 1 },
  555|  15.1k|        .mv.mv = { b->mv[0], b->mv[1] },
  556|  15.1k|        .bs = bs,
  557|  15.1k|        .mf = (mode == GLOBALMV_GLOBALMV) | !!((1 << mode) & (0xbc)) * 2,
  558|  15.1k|    };
  559|  15.1k|    c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
  560|  15.1k|}
decode.c:splat_oneref_mv:
  519|  70.6k|{
  520|  70.6k|    const enum InterPredMode mode = b->inter_mode;
  521|  70.6k|    const refmvs_block ALIGN(tmpl, 16) = (refmvs_block) {
  522|  70.6k|        .ref.ref = { b->ref[0] + 1, b->interintra_type ? 0 : -1 },
  ------------------
  |  Branch (522:37): [True: 1.82k, False: 68.8k]
  ------------------
  523|  70.6k|        .mv.mv[0] = b->mv[0],
  524|  70.6k|        .bs = bs,
  525|  70.6k|        .mf = (mode == GLOBALMV && imin(bw4, bh4) >= 2) | ((mode == NEWMV) * 2),
  ------------------
  |  Branch (525:16): [True: 12.5k, False: 58.0k]
  |  Branch (525:36): [True: 7.13k, False: 5.46k]
  ------------------
  526|  70.6k|    };
  527|  70.6k|    c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
  528|  70.6k|}
decode.c:read_restoration_info:
 2514|   109k|{
 2515|   109k|    const Dav1dFrameContext *const f = t->f;
 2516|   109k|    Dav1dTileState *const ts = t->ts;
 2517|       |
 2518|   109k|    if (frame_type == DAV1D_RESTORATION_SWITCHABLE) {
  ------------------
  |  Branch (2518:9): [True: 27.0k, False: 82.6k]
  ------------------
 2519|  27.0k|        const int filter = dav1d_msac_decode_symbol_adapt4(&ts->msac,
  ------------------
  |  |   47|  27.0k|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  ------------------
 2520|  27.0k|                               ts->cdf.m.restore_switchable, 2);
 2521|  27.0k|        lr->type = filter + !!filter; /* NONE/WIENER/SGRPROJ */
 2522|  82.6k|    } else {
 2523|  82.6k|        const unsigned type =
 2524|  82.6k|            dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  82.6k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
 2525|  82.6k|                frame_type == DAV1D_RESTORATION_WIENER ?
  ------------------
  |  Branch (2525:17): [True: 25.7k, False: 56.9k]
  ------------------
 2526|  56.9k|                ts->cdf.m.restore_wiener : ts->cdf.m.restore_sgrproj);
 2527|  82.6k|        lr->type = type ? frame_type : DAV1D_RESTORATION_NONE;
  ------------------
  |  Branch (2527:20): [True: 48.0k, False: 34.6k]
  ------------------
 2528|  82.6k|    }
 2529|       |
 2530|   109k|    if (lr->type == DAV1D_RESTORATION_WIENER) {
  ------------------
  |  Branch (2530:9): [True: 26.8k, False: 82.9k]
  ------------------
 2531|  26.8k|        lr->filter_v[0] = p ? 0 :
  ------------------
  |  Branch (2531:27): [True: 12.0k, False: 14.7k]
  ------------------
 2532|  26.8k|            dav1d_msac_decode_subexp(&ts->msac,
 2533|  14.7k|                ts->lr_ref[p]->filter_v[0] + 5, 16, 1) - 5;
 2534|  26.8k|        lr->filter_v[1] =
 2535|  26.8k|            dav1d_msac_decode_subexp(&ts->msac,
 2536|  26.8k|                ts->lr_ref[p]->filter_v[1] + 23, 32, 2) - 23;
 2537|  26.8k|        lr->filter_v[2] =
 2538|  26.8k|            dav1d_msac_decode_subexp(&ts->msac,
 2539|  26.8k|                ts->lr_ref[p]->filter_v[2] + 17, 64, 3) - 17;
 2540|       |
 2541|  26.8k|        lr->filter_h[0] = p ? 0 :
  ------------------
  |  Branch (2541:27): [True: 12.1k, False: 14.7k]
  ------------------
 2542|  26.8k|            dav1d_msac_decode_subexp(&ts->msac,
 2543|  14.7k|                ts->lr_ref[p]->filter_h[0] + 5, 16, 1) - 5;
 2544|  26.8k|        lr->filter_h[1] =
 2545|  26.8k|            dav1d_msac_decode_subexp(&ts->msac,
 2546|  26.8k|                ts->lr_ref[p]->filter_h[1] + 23, 32, 2) - 23;
 2547|  26.8k|        lr->filter_h[2] =
 2548|  26.8k|            dav1d_msac_decode_subexp(&ts->msac,
 2549|  26.8k|                ts->lr_ref[p]->filter_h[2] + 17, 64, 3) - 17;
 2550|  26.8k|        memcpy(lr->sgr_weights, ts->lr_ref[p]->sgr_weights, sizeof(lr->sgr_weights));
 2551|  26.8k|        ts->lr_ref[p] = lr;
 2552|  26.8k|        if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  26.8k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 26.8k]
  |  |  ------------------
  |  |   35|  26.8k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  26.8k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 2553|      0|            printf("Post-lr_wiener[pl=%d,v[%d,%d,%d],h[%d,%d,%d]]: r=%d\n",
 2554|      0|                   p, lr->filter_v[0], lr->filter_v[1],
 2555|      0|                   lr->filter_v[2], lr->filter_h[0],
 2556|      0|                   lr->filter_h[1], lr->filter_h[2], ts->msac.rng);
 2557|  82.9k|    } else if (lr->type == DAV1D_RESTORATION_SGRPROJ) {
  ------------------
  |  Branch (2557:16): [True: 40.8k, False: 42.0k]
  ------------------
 2558|  40.8k|        const unsigned idx = dav1d_msac_decode_bools(&ts->msac, 4);
 2559|  40.8k|        const uint16_t *const sgr_params = dav1d_sgr_params[idx];
 2560|  40.8k|        lr->type += idx;
 2561|  40.8k|        lr->sgr_weights[0] = sgr_params[0] ? dav1d_msac_decode_subexp(&ts->msac,
  ------------------
  |  Branch (2561:30): [True: 31.6k, False: 9.20k]
  ------------------
 2562|  31.6k|            ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 : 0;
 2563|  40.8k|        lr->sgr_weights[1] = sgr_params[1] ? dav1d_msac_decode_subexp(&ts->msac,
  ------------------
  |  Branch (2563:30): [True: 33.0k, False: 7.82k]
  ------------------
 2564|  33.0k|            ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 : 95;
 2565|  40.8k|        memcpy(lr->filter_v, ts->lr_ref[p]->filter_v, sizeof(lr->filter_v));
 2566|  40.8k|        memcpy(lr->filter_h, ts->lr_ref[p]->filter_h, sizeof(lr->filter_h));
 2567|  40.8k|        ts->lr_ref[p] = lr;
 2568|  40.8k|        if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  40.8k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 40.8k]
  |  |  ------------------
  |  |   35|  40.8k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  40.8k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 2569|      0|            printf("Post-lr_sgrproj[pl=%d,idx=%d,w[%d,%d]]: r=%d\n",
 2570|      0|                   p, idx, lr->sgr_weights[0],
 2571|      0|                   lr->sgr_weights[1], ts->msac.rng);
 2572|  40.8k|    }
 2573|   109k|}
decode.c:init_quant_tables:
   57|  40.9k|{
   58|   219k|    for (int i = 0; i < (frame_hdr->segmentation.enabled ? 8 : 1); i++) {
  ------------------
  |  Branch (58:21): [True: 178k, False: 40.9k]
  |  Branch (58:26): [True: 176k, False: 42.5k]
  ------------------
   59|   178k|        const int yac = frame_hdr->segmentation.enabled ?
  ------------------
  |  Branch (59:25): [True: 157k, False: 21.2k]
  ------------------
   60|   157k|            iclip_u8(qidx + frame_hdr->segmentation.seg_data.d[i].delta_q) : qidx;
   61|   178k|        const int ydc = iclip_u8(yac + frame_hdr->quant.ydc_delta);
   62|   178k|        const int uac = iclip_u8(yac + frame_hdr->quant.uac_delta);
   63|   178k|        const int udc = iclip_u8(yac + frame_hdr->quant.udc_delta);
   64|   178k|        const int vac = iclip_u8(yac + frame_hdr->quant.vac_delta);
   65|   178k|        const int vdc = iclip_u8(yac + frame_hdr->quant.vdc_delta);
   66|       |
   67|   178k|        dq[i][0][0] = dav1d_dq_tbl[seq_hdr->hbd][ydc][0];
   68|   178k|        dq[i][0][1] = dav1d_dq_tbl[seq_hdr->hbd][yac][1];
   69|   178k|        dq[i][1][0] = dav1d_dq_tbl[seq_hdr->hbd][udc][0];
   70|   178k|        dq[i][1][1] = dav1d_dq_tbl[seq_hdr->hbd][uac][1];
   71|   178k|        dq[i][2][0] = dav1d_dq_tbl[seq_hdr->hbd][vdc][0];
   72|   178k|        dq[i][2][1] = dav1d_dq_tbl[seq_hdr->hbd][vac][1];
   73|   178k|    }
   74|  40.9k|}
decode.c:setup_tile:
 2430|  32.2k|{
 2431|  32.2k|    const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
 2432|  32.2k|    const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;
 2433|  32.2k|    const int col_sb_end = f->frame_hdr->tiling.col_start_sb[tile_col + 1];
 2434|  32.2k|    const int row_sb_start = f->frame_hdr->tiling.row_start_sb[tile_row];
 2435|  32.2k|    const int row_sb_end = f->frame_hdr->tiling.row_start_sb[tile_row + 1];
 2436|  32.2k|    const int sb_shift = f->sb_shift;
 2437|       |
 2438|  32.2k|    const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
 2439|  96.6k|    for (int p = 0; p < 2; p++) {
  ------------------
  |  Branch (2439:21): [True: 64.4k, False: 32.2k]
  ------------------
 2440|  64.4k|        ts->frame_thread[p].pal_idx = f->frame_thread.pal_idx ?
  ------------------
  |  Branch (2440:39): [True: 0, False: 64.4k]
  ------------------
 2441|      0|            &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 8] :
 2442|  64.4k|            NULL;
 2443|  64.4k|        ts->frame_thread[p].cbi = f->frame_thread.cbi ?
  ------------------
  |  Branch (2443:35): [True: 0, False: 64.4k]
  ------------------
 2444|      0|            &f->frame_thread.cbi[(size_t)tile_start_off * size_mul[0] / 64] :
 2445|  64.4k|            NULL;
 2446|  64.4k|        ts->frame_thread[p].cf = f->frame_thread.cf ?
  ------------------
  |  Branch (2446:34): [True: 0, False: 64.4k]
  ------------------
 2447|      0|            (uint8_t*)f->frame_thread.cf +
 2448|      0|                (((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
 2449|  64.4k|            NULL;
 2450|  64.4k|    }
 2451|       |
 2452|  32.2k|    dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf);
 2453|  32.2k|    ts->last_qidx = f->frame_hdr->quant.yac;
 2454|  32.2k|    ts->last_delta_lf.u32 = 0;
 2455|       |
 2456|  32.2k|    dav1d_msac_init(&ts->msac, data, sz, f->frame_hdr->disable_cdf_update);
 2457|       |
 2458|  32.2k|    ts->tiling.row = tile_row;
 2459|  32.2k|    ts->tiling.col = tile_col;
 2460|  32.2k|    ts->tiling.col_start = col_sb_start << sb_shift;
 2461|  32.2k|    ts->tiling.col_end = imin(col_sb_end << sb_shift, f->bw);
 2462|  32.2k|    ts->tiling.row_start = row_sb_start << sb_shift;
 2463|  32.2k|    ts->tiling.row_end = imin(row_sb_end << sb_shift, f->bh);
 2464|       |
 2465|       |    // Reference Restoration Unit (used for exp coding)
 2466|  32.2k|    int sb_idx, unit_idx;
 2467|  32.2k|    if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
  ------------------
  |  Branch (2467:9): [True: 2.35k, False: 29.8k]
  ------------------
 2468|       |        // vertical components only
 2469|  2.35k|        sb_idx = (ts->tiling.row_start >> 5) * f->sr_sb128w;
 2470|  2.35k|        unit_idx = (ts->tiling.row_start & 16) >> 3;
 2471|  29.8k|    } else {
 2472|  29.8k|        sb_idx = (ts->tiling.row_start >> 5) * f->sb128w + col_sb128_start;
 2473|  29.8k|        unit_idx = ((ts->tiling.row_start & 16) >> 3) +
 2474|  29.8k|                   ((ts->tiling.col_start & 16) >> 4);
 2475|  29.8k|    }
 2476|   128k|    for (int p = 0; p < 3; p++) {
  ------------------
  |  Branch (2476:21): [True: 96.6k, False: 32.2k]
  ------------------
 2477|  96.6k|        if (!((f->lf.restore_planes >> p) & 1U))
  ------------------
  |  Branch (2477:13): [True: 82.3k, False: 14.2k]
  ------------------
 2478|  82.3k|            continue;
 2479|       |
 2480|  14.2k|        if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
  ------------------
  |  Branch (2480:13): [True: 2.99k, False: 11.2k]
  ------------------
 2481|  2.99k|            const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
  ------------------
  |  Branch (2481:32): [True: 1.83k, False: 1.16k]
  |  Branch (2481:37): [True: 321, False: 1.51k]
  ------------------
 2482|  2.99k|            const int d = f->frame_hdr->super_res.width_scale_denominator;
 2483|  2.99k|            const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p];
 2484|  2.99k|            const int rnd = (8 << unit_size_log2) - 1, shift = unit_size_log2 + 3;
 2485|  2.99k|            const int x = ((4 * ts->tiling.col_start * d >> ss_hor) + rnd) >> shift;
 2486|  2.99k|            const int px_x = x << (unit_size_log2 + ss_hor);
 2487|  2.99k|            const int u_idx = unit_idx + ((px_x & 64) >> 6);
 2488|  2.99k|            const int sb128x = px_x >> 7;
 2489|  2.99k|            if (sb128x >= f->sr_sb128w) continue;
  ------------------
  |  Branch (2489:17): [True: 216, False: 2.77k]
  ------------------
 2490|  2.77k|            ts->lr_ref[p] = &f->lf.lr_mask[sb_idx + sb128x].lr[p][u_idx];
 2491|  11.2k|        } else {
 2492|  11.2k|            ts->lr_ref[p] = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];
 2493|  11.2k|        }
 2494|       |
 2495|  14.0k|        ts->lr_ref[p]->filter_v[0] = 3;
 2496|  14.0k|        ts->lr_ref[p]->filter_v[1] = -7;
 2497|  14.0k|        ts->lr_ref[p]->filter_v[2] = 15;
 2498|  14.0k|        ts->lr_ref[p]->filter_h[0] = 3;
 2499|  14.0k|        ts->lr_ref[p]->filter_h[1] = -7;
 2500|  14.0k|        ts->lr_ref[p]->filter_h[2] = 15;
 2501|  14.0k|        ts->lr_ref[p]->sgr_weights[0] = -32;
 2502|  14.0k|        ts->lr_ref[p]->sgr_weights[1] = 31;
 2503|  14.0k|    }
 2504|       |
 2505|  32.2k|    if (f->c->n_tc > 1) {
  ------------------
  |  Branch (2505:9): [True: 30.5k, False: 1.69k]
  ------------------
 2506|  91.5k|        for (int p = 0; p < 2; p++)
  ------------------
  |  Branch (2506:25): [True: 61.0k, False: 30.5k]
  ------------------
 2507|  61.0k|            atomic_init(&ts->progress[p], row_sb_start);
 2508|  30.5k|    }
 2509|  32.2k|}
decode.c:get_upscale_x0:
 3321|  2.39k|static int get_upscale_x0(const int in_w, const int out_w, const int step) {
 3322|  2.39k|    const int err = out_w * step - (in_w << 14);
 3323|  2.39k|    const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err / 2);
 3324|  2.39k|    return x0 & 0x3fff;
 3325|  2.39k|}

obu.c:get_poc_diff:
  239|  35.8k|{
  240|  35.8k|    if (!order_hint_n_bits) return 0;
  ------------------
  |  Branch (240:9): [True: 0, False: 35.8k]
  ------------------
  241|  35.8k|    const int mask = 1 << (order_hint_n_bits - 1);
  242|  35.8k|    const int diff = poc0 - poc1;
  243|  35.8k|    return (diff & (mask - 1)) - (diff & mask);
  244|  35.8k|}
refmvs.c:get_gmv_2d:
  482|   100k|{
  483|   100k|    switch (gmv->type) {
  484|  9.54k|    case DAV1D_WM_TYPE_ROT_ZOOM:
  ------------------
  |  Branch (484:5): [True: 9.54k, False: 91.3k]
  ------------------
  485|  9.54k|        assert(gmv->matrix[5] ==  gmv->matrix[2]);
  ------------------
  |  |  140|  9.54k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 9.54k]
  |  |  |  Branch (140:68): [Folded, False: 9.54k]
  |  |  ------------------
  ------------------
  486|  9.54k|        assert(gmv->matrix[4] == -gmv->matrix[3]);
  ------------------
  |  |  140|  9.54k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 9.54k]
  |  |  |  Branch (140:68): [Folded, False: 9.54k]
  |  |  ------------------
  ------------------
  487|       |        // fall-through
  488|  9.54k|    default:
  ------------------
  |  Branch (488:5): [True: 0, False: 100k]
  ------------------
  489|  13.2k|    case DAV1D_WM_TYPE_AFFINE: {
  ------------------
  |  Branch (489:5): [True: 3.72k, False: 97.2k]
  ------------------
  490|  13.2k|        const int x = bx4 * 4 + bw4 * 2 - 1;
  491|  13.2k|        const int y = by4 * 4 + bh4 * 2 - 1;
  492|  13.2k|        const int xc = (gmv->matrix[2] - (1 << 16)) * x +
  493|  13.2k|                       gmv->matrix[3] * y + gmv->matrix[0];
  494|  13.2k|        const int yc = (gmv->matrix[5] - (1 << 16)) * y +
  495|  13.2k|                       gmv->matrix[4] * x + gmv->matrix[1];
  496|  13.2k|        const int shift = 16 - (3 - !hdr->hp);
  497|  13.2k|        const int round = (1 << shift) >> 1;
  498|  13.2k|        mv res = (mv) {
  499|  13.2k|            .y = apply_sign(((abs(yc) + round) >> shift) << !hdr->hp, yc),
  500|  13.2k|            .x = apply_sign(((abs(xc) + round) >> shift) << !hdr->hp, xc),
  501|  13.2k|        };
  502|  13.2k|        if (hdr->force_integer_mv)
  ------------------
  |  Branch (502:13): [True: 943, False: 12.3k]
  ------------------
  503|    943|            fix_int_mv_precision(&res);
  504|  13.2k|        return res;
  505|  9.54k|    }
  506|  8.22k|    case DAV1D_WM_TYPE_TRANSLATION: {
  ------------------
  |  Branch (506:5): [True: 8.22k, False: 92.7k]
  ------------------
  507|  8.22k|        mv res = (mv) {
  508|  8.22k|            .y = gmv->matrix[0] >> 13,
  509|  8.22k|            .x = gmv->matrix[1] >> 13,
  510|  8.22k|        };
  511|  8.22k|        if (hdr->force_integer_mv)
  ------------------
  |  Branch (511:13): [True: 216, False: 8.00k]
  ------------------
  512|    216|            fix_int_mv_precision(&res);
  513|  8.22k|        return res;
  514|  9.54k|    }
  515|  79.4k|    case DAV1D_WM_TYPE_IDENTITY:
  ------------------
  |  Branch (515:5): [True: 79.4k, False: 21.4k]
  ------------------
  516|  79.4k|        return (mv) { .x = 0, .y = 0 };
  517|   100k|    }
  518|   100k|}
refmvs.c:fix_int_mv_precision:
  462|  1.37k|static inline void fix_int_mv_precision(mv *const mv) {
  463|  1.37k|    mv->x = (mv->x - (mv->x >> 15) + 3) & ~7U;
  464|  1.37k|    mv->y = (mv->y - (mv->y >> 15) + 3) & ~7U;
  465|  1.37k|}
refmvs.c:fix_mv_precision:
  469|  12.0k|{
  470|  12.0k|    if (hdr->force_integer_mv) {
  ------------------
  |  Branch (470:9): [True: 215, False: 11.7k]
  ------------------
  471|    215|        fix_int_mv_precision(mv);
  472|  11.7k|    } else if (!hdr->hp) {
  ------------------
  |  Branch (472:16): [True: 581, False: 11.2k]
  ------------------
  473|    581|        mv->x = (mv->x - (mv->x >> 15)) & ~1U;
  474|    581|        mv->y = (mv->y - (mv->y >> 15)) & ~1U;
  475|    581|    }
  476|  12.0k|}
refmvs.c:get_poc_diff:
  239|   131k|{
  240|   131k|    if (!order_hint_n_bits) return 0;
  ------------------
  |  Branch (240:9): [True: 46.0k, False: 85.1k]
  ------------------
  241|  85.1k|    const int mask = 1 << (order_hint_n_bits - 1);
  242|  85.1k|    const int diff = poc0 - poc1;
  243|  85.1k|    return (diff & (mask - 1)) - (diff & mask);
  244|   131k|}
decode.c:get_partition_ctx:
   87|  2.33M|{
   88|  2.33M|    return ((a->partition[xb8] >> (4 - bl)) & 1) +
   89|  2.33M|          (((l->partition[yb8] >> (4 - bl)) & 1) << 1);
   90|  2.33M|}
decode.c:get_cur_frame_segid:
  445|  1.13M|{
  446|  1.13M|    cur_seg_map += bx + by * stride;
  447|  1.13M|    if (have_left && have_top) {
  ------------------
  |  Branch (447:9): [True: 804k, False: 326k]
  |  Branch (447:22): [True: 394k, False: 410k]
  ------------------
  448|   394k|        const int l = cur_seg_map[-1];
  449|   394k|        const int a = cur_seg_map[-stride];
  450|   394k|        const int al = cur_seg_map[-(stride + 1)];
  451|       |
  452|   394k|        if (l == a && al == l) *seg_ctx = 2;
  ------------------
  |  Branch (452:13): [True: 188k, False: 205k]
  |  Branch (452:23): [True: 171k, False: 16.8k]
  ------------------
  453|   222k|        else if (l == a || al == l || a == al) *seg_ctx = 1;
  ------------------
  |  Branch (453:18): [True: 16.5k, False: 206k]
  |  Branch (453:28): [True: 112k, False: 93.9k]
  |  Branch (453:39): [True: 53.3k, False: 40.5k]
  ------------------
  454|  40.3k|        else *seg_ctx = 0;
  455|   394k|        return a == al ? a : l;
  ------------------
  |  Branch (455:16): [True: 224k, False: 169k]
  ------------------
  456|   737k|    } else {
  457|   737k|        *seg_ctx = 0;
  458|  18.4E|        return have_left ? cur_seg_map[-1] : have_top ? cur_seg_map[-stride] : 0;
  ------------------
  |  Branch (458:16): [True: 421k, False: 315k]
  |  Branch (458:46): [True: 318k, False: 18.4E]
  ------------------
  459|   737k|    }
  460|  1.13M|}
decode.c:get_intra_ctx:
   63|  98.9k|{
   64|  98.9k|    if (have_left) {
  ------------------
  |  Branch (64:9): [True: 85.8k, False: 13.0k]
  ------------------
   65|  85.8k|        if (have_top) {
  ------------------
  |  Branch (65:13): [True: 66.4k, False: 19.4k]
  ------------------
   66|  66.4k|            const int ctx = l->intra[yb4] + a->intra[xb4];
   67|  66.4k|            return ctx + (ctx == 2);
   68|  66.4k|        } else
   69|  19.4k|            return l->intra[yb4] * 2;
   70|  85.8k|    } else {
   71|  13.0k|        return have_top ? a->intra[xb4] * 2 : 0;
  ------------------
  |  Branch (71:16): [True: 7.37k, False: 5.69k]
  ------------------
   72|  13.0k|    }
   73|  98.9k|}
decode.c:get_tx_ctx:
   79|   467k|{
   80|   467k|    return (l->tx_intra[yb4] >= max_tx->lh) + (a->tx_intra[xb4] >= max_tx->lw);
   81|   467k|}
decode.c:get_comp_ctx:
  160|  26.6k|{
  161|  26.6k|    if (have_top) {
  ------------------
  |  Branch (161:9): [True: 17.7k, False: 8.88k]
  ------------------
  162|  17.7k|        if (have_left) {
  ------------------
  |  Branch (162:13): [True: 15.9k, False: 1.87k]
  ------------------
  163|  15.9k|            if (a->comp_type[xb4]) {
  ------------------
  |  Branch (163:17): [True: 7.65k, False: 8.24k]
  ------------------
  164|  7.65k|                if (l->comp_type[yb4]) {
  ------------------
  |  Branch (164:21): [True: 5.11k, False: 2.54k]
  ------------------
  165|  5.11k|                    return 4;
  166|  5.11k|                } else {
  167|       |                    // 4U means intra (-1) or bwd (>= 4)
  168|  2.54k|                    return 2 + ((unsigned)l->ref[0][yb4] >= 4U);
  169|  2.54k|                }
  170|  8.24k|            } else if (l->comp_type[yb4]) {
  ------------------
  |  Branch (170:24): [True: 2.92k, False: 5.32k]
  ------------------
  171|       |                // 4U means intra (-1) or bwd (>= 4)
  172|  2.92k|                return 2 + ((unsigned)a->ref[0][xb4] >= 4U);
  173|  5.32k|            } else {
  174|  5.32k|                return (l->ref[0][yb4] >= 4) ^ (a->ref[0][xb4] >= 4);
  175|  5.32k|            }
  176|  15.9k|        } else {
  177|  1.87k|            return a->comp_type[xb4] ? 3 : a->ref[0][xb4] >= 4;
  ------------------
  |  Branch (177:20): [True: 733, False: 1.13k]
  ------------------
  178|  1.87k|        }
  179|  17.7k|    } else if (have_left) {
  ------------------
  |  Branch (179:16): [True: 7.07k, False: 1.80k]
  ------------------
  180|  7.07k|        return l->comp_type[yb4] ? 3 : l->ref[0][yb4] >= 4;
  ------------------
  |  Branch (180:16): [True: 3.11k, False: 3.95k]
  ------------------
  181|  7.07k|    } else {
  182|  1.80k|        return 1;
  183|  1.80k|    }
  184|  26.6k|}
decode.c:fix_mv_precision:
  469|  61.0k|{
  470|  61.0k|    if (hdr->force_integer_mv) {
  ------------------
  |  Branch (470:9): [True: 5.62k, False: 55.4k]
  ------------------
  471|  5.62k|        fix_int_mv_precision(mv);
  472|  55.4k|    } else if (!hdr->hp) {
  ------------------
  |  Branch (472:16): [True: 24.9k, False: 30.4k]
  ------------------
  473|  24.9k|        mv->x = (mv->x - (mv->x >> 15)) & ~1U;
  474|  24.9k|        mv->y = (mv->y - (mv->y >> 15)) & ~1U;
  475|  24.9k|    }
  476|  61.0k|}
decode.c:fix_int_mv_precision:
  462|  5.91k|static inline void fix_int_mv_precision(mv *const mv) {
  463|  5.91k|    mv->x = (mv->x - (mv->x >> 15) + 3) & ~7U;
  464|  5.91k|    mv->y = (mv->y - (mv->y >> 15) + 3) & ~7U;
  465|  5.91k|}
decode.c:get_comp_dir_ctx:
  190|  14.7k|{
  191|  14.7k|#define has_uni_comp(edge, off) \
  192|  14.7k|    ((edge->ref[0][off] < 4) == (edge->ref[1][off] < 4))
  193|       |
  194|  14.7k|    if (have_top && have_left) {
  ------------------
  |  Branch (194:9): [True: 10.5k, False: 4.20k]
  |  Branch (194:21): [True: 9.73k, False: 834]
  ------------------
  195|  9.73k|        const int a_intra = a->intra[xb4], l_intra = l->intra[yb4];
  196|       |
  197|  9.73k|        if (a_intra && l_intra) return 2;
  ------------------
  |  Branch (197:13): [True: 224, False: 9.51k]
  |  Branch (197:24): [True: 47, False: 177]
  ------------------
  198|  9.68k|        if (a_intra || l_intra) {
  ------------------
  |  Branch (198:13): [True: 177, False: 9.51k]
  |  Branch (198:24): [True: 175, False: 9.33k]
  ------------------
  199|    352|            const BlockContext *const edge = a_intra ? l : a;
  ------------------
  |  Branch (199:46): [True: 177, False: 175]
  ------------------
  200|    352|            const int off = a_intra ? yb4 : xb4;
  ------------------
  |  Branch (200:29): [True: 177, False: 175]
  ------------------
  201|       |
  202|    352|            if (edge->comp_type[off] == COMP_INTER_NONE) return 2;
  ------------------
  |  Branch (202:17): [True: 104, False: 248]
  ------------------
  203|    248|            return 1 + 2 * has_uni_comp(edge, off);
  ------------------
  |  |  192|    248|    ((edge->ref[0][off] < 4) == (edge->ref[1][off] < 4))
  ------------------
  204|    352|        }
  205|       |
  206|  9.33k|        const int a_comp = a->comp_type[xb4] != COMP_INTER_NONE;
  207|  9.33k|        const int l_comp = l->comp_type[yb4] != COMP_INTER_NONE;
  208|  9.33k|        const int a_ref0 = a->ref[0][xb4], l_ref0 = l->ref[0][yb4];
  209|       |
  210|  9.33k|        if (!a_comp && !l_comp) {
  ------------------
  |  Branch (210:13): [True: 2.95k, False: 6.38k]
  |  Branch (210:24): [True: 1.04k, False: 1.91k]
  ------------------
  211|  1.04k|            return 1 + 2 * ((a_ref0 >= 4) == (l_ref0 >= 4));
  212|  8.29k|        } else if (!a_comp || !l_comp) {
  ------------------
  |  Branch (212:20): [True: 1.91k, False: 6.38k]
  |  Branch (212:31): [True: 1.62k, False: 4.76k]
  ------------------
  213|  3.53k|            const BlockContext *const edge = a_comp ? a : l;
  ------------------
  |  Branch (213:46): [True: 1.62k, False: 1.91k]
  ------------------
  214|  3.53k|            const int off = a_comp ? xb4 : yb4;
  ------------------
  |  Branch (214:29): [True: 1.62k, False: 1.91k]
  ------------------
  215|       |
  216|  3.53k|            if (!has_uni_comp(edge, off)) return 1;
  ------------------
  |  |  192|  3.53k|    ((edge->ref[0][off] < 4) == (edge->ref[1][off] < 4))
  ------------------
  |  Branch (216:17): [True: 2.90k, False: 626]
  ------------------
  217|    626|            return 3 + ((a_ref0 >= 4) == (l_ref0 >= 4));
  218|  4.76k|        } else {
  219|  4.76k|            const int a_uni = has_uni_comp(a, xb4), l_uni = has_uni_comp(l, yb4);
  ------------------
  |  |  192|  4.76k|    ((edge->ref[0][off] < 4) == (edge->ref[1][off] < 4))
  ------------------
                          const int a_uni = has_uni_comp(a, xb4), l_uni = has_uni_comp(l, yb4);
  ------------------
  |  |  192|  4.76k|    ((edge->ref[0][off] < 4) == (edge->ref[1][off] < 4))
  ------------------
  220|       |
  221|  4.76k|            if (!a_uni && !l_uni) return 0;
  ------------------
  |  Branch (221:17): [True: 3.83k, False: 927]
  |  Branch (221:27): [True: 3.43k, False: 400]
  ------------------
  222|  1.32k|            if (!a_uni || !l_uni) return 2;
  ------------------
  |  Branch (222:17): [True: 400, False: 927]
  |  Branch (222:27): [True: 447, False: 480]
  ------------------
  223|    480|            return 3 + ((a_ref0 == 4) == (l_ref0 == 4));
  224|  1.32k|        }
  225|  9.33k|    } else if (have_top || have_left) {
  ------------------
  |  Branch (225:16): [True: 831, False: 4.21k]
  |  Branch (225:28): [True: 3.55k, False: 654]
  ------------------
  226|  4.39k|        const BlockContext *const edge = have_left ? l : a;
  ------------------
  |  Branch (226:42): [True: 3.55k, False: 834]
  ------------------
  227|  4.39k|        const int off = have_left ? yb4 : xb4;
  ------------------
  |  Branch (227:25): [True: 3.55k, False: 834]
  ------------------
  228|       |
  229|  4.39k|        if (edge->intra[off]) return 2;
  ------------------
  |  Branch (229:13): [True: 76, False: 4.31k]
  ------------------
  230|  4.31k|        if (edge->comp_type[off] == COMP_INTER_NONE) return 2;
  ------------------
  |  Branch (230:13): [True: 1.36k, False: 2.94k]
  ------------------
  231|  2.94k|        return 4 * has_uni_comp(edge, off);
  ------------------
  |  |  192|  2.94k|    ((edge->ref[0][off] < 4) == (edge->ref[1][off] < 4))
  ------------------
  232|  4.31k|    } else {
  233|    651|        return 2;
  234|    651|    }
  235|  14.7k|}
decode.c:av1_get_fwd_ref_ctx:
  307|  42.5k|{
  308|  42.5k|    int cnt[4] = { 0 };
  309|       |
  310|  42.5k|    if (have_top && !a->intra[xb4]) {
  ------------------
  |  Branch (310:9): [True: 31.2k, False: 11.3k]
  |  Branch (310:21): [True: 30.1k, False: 1.10k]
  ------------------
  311|  30.1k|        if (a->ref[0][xb4] < 4) cnt[a->ref[0][xb4]]++;
  ------------------
  |  Branch (311:13): [True: 25.9k, False: 4.19k]
  ------------------
  312|  30.1k|        if (a->comp_type[xb4] && a->ref[1][xb4] < 4) cnt[a->ref[1][xb4]]++;
  ------------------
  |  Branch (312:13): [True: 7.73k, False: 22.3k]
  |  Branch (312:34): [True: 864, False: 6.87k]
  ------------------
  313|  30.1k|    }
  314|       |
  315|  42.5k|    if (have_left && !l->intra[yb4]) {
  ------------------
  |  Branch (315:9): [True: 37.5k, False: 5.02k]
  |  Branch (315:22): [True: 36.3k, False: 1.18k]
  ------------------
  316|  36.3k|        if (l->ref[0][yb4] < 4) cnt[l->ref[0][yb4]]++;
  ------------------
  |  Branch (316:13): [True: 31.9k, False: 4.44k]
  ------------------
  317|  36.3k|        if (l->comp_type[yb4] && l->ref[1][yb4] < 4) cnt[l->ref[1][yb4]]++;
  ------------------
  |  Branch (317:13): [True: 10.3k, False: 25.9k]
  |  Branch (317:34): [True: 1.21k, False: 9.18k]
  ------------------
  318|  36.3k|    }
  319|       |
  320|  42.5k|    cnt[0] += cnt[1];
  321|  42.5k|    cnt[2] += cnt[3];
  322|       |
  323|  42.5k|    return cnt[0] == cnt[2] ? 1 : cnt[0] < cnt[2] ? 0 : 2;
  ------------------
  |  Branch (323:12): [True: 9.49k, False: 33.0k]
  |  Branch (323:35): [True: 9.87k, False: 23.1k]
  ------------------
  324|  42.5k|}
decode.c:av1_get_fwd_ref_2_ctx:
  350|  14.9k|{
  351|  14.9k|    int cnt[2] = { 0 };
  352|       |
  353|  14.9k|    if (have_top && !a->intra[xb4]) {
  ------------------
  |  Branch (353:9): [True: 10.2k, False: 4.74k]
  |  Branch (353:21): [True: 9.75k, False: 451]
  ------------------
  354|  9.75k|        if ((a->ref[0][xb4] ^ 2U) < 2) cnt[a->ref[0][xb4] - 2]++;
  ------------------
  |  Branch (354:13): [True: 5.91k, False: 3.83k]
  ------------------
  355|  9.75k|        if (a->comp_type[xb4] && (a->ref[1][xb4] ^ 2U) < 2) cnt[a->ref[1][xb4] - 2]++;
  ------------------
  |  Branch (355:13): [True: 3.15k, False: 6.59k]
  |  Branch (355:34): [True: 535, False: 2.61k]
  ------------------
  356|  9.75k|    }
  357|       |
  358|  14.9k|    if (have_left && !l->intra[yb4]) {
  ------------------
  |  Branch (358:9): [True: 12.9k, False: 1.98k]
  |  Branch (358:22): [True: 12.4k, False: 499]
  ------------------
  359|  12.4k|        if ((l->ref[0][yb4] ^ 2U) < 2) cnt[l->ref[0][yb4] - 2]++;
  ------------------
  |  Branch (359:13): [True: 8.32k, False: 4.13k]
  ------------------
  360|  12.4k|        if (l->comp_type[yb4] && (l->ref[1][yb4] ^ 2U) < 2) cnt[l->ref[1][yb4] - 2]++;
  ------------------
  |  Branch (360:13): [True: 4.75k, False: 7.70k]
  |  Branch (360:34): [True: 652, False: 4.10k]
  ------------------
  361|  12.4k|    }
  362|       |
  363|  14.9k|    return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
  ------------------
  |  Branch (363:12): [True: 4.38k, False: 10.5k]
  |  Branch (363:35): [True: 7.42k, False: 3.12k]
  ------------------
  364|  14.9k|}
decode.c:av1_get_fwd_ref_1_ctx:
  330|  28.9k|{
  331|  28.9k|    int cnt[2] = { 0 };
  332|       |
  333|  28.9k|    if (have_top && !a->intra[xb4]) {
  ------------------
  |  Branch (333:9): [True: 21.8k, False: 7.05k]
  |  Branch (333:21): [True: 21.2k, False: 665]
  ------------------
  334|  21.2k|        if (a->ref[0][xb4] < 2) cnt[a->ref[0][xb4]]++;
  ------------------
  |  Branch (334:13): [True: 16.7k, False: 4.41k]
  ------------------
  335|  21.2k|        if (a->comp_type[xb4] && a->ref[1][xb4] < 2) cnt[a->ref[1][xb4]]++;
  ------------------
  |  Branch (335:13): [True: 5.15k, False: 16.0k]
  |  Branch (335:34): [True: 277, False: 4.88k]
  ------------------
  336|  21.2k|    }
  337|       |
  338|  28.9k|    if (have_left && !l->intra[yb4]) {
  ------------------
  |  Branch (338:9): [True: 25.6k, False: 3.30k]
  |  Branch (338:22): [True: 24.9k, False: 711]
  ------------------
  339|  24.9k|        if (l->ref[0][yb4] < 2) cnt[l->ref[0][yb4]]++;
  ------------------
  |  Branch (339:13): [True: 20.2k, False: 4.65k]
  ------------------
  340|  24.9k|        if (l->comp_type[yb4] && l->ref[1][yb4] < 2) cnt[l->ref[1][yb4]]++;
  ------------------
  |  Branch (340:13): [True: 6.33k, False: 18.5k]
  |  Branch (340:34): [True: 414, False: 5.92k]
  ------------------
  341|  24.9k|    }
  342|       |
  343|  28.9k|    return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
  ------------------
  |  Branch (343:12): [True: 6.11k, False: 22.8k]
  |  Branch (343:35): [True: 2.46k, False: 20.3k]
  ------------------
  344|  28.9k|}
decode.c:av1_get_bwd_ref_ctx:
  370|  40.7k|{
  371|  40.7k|    int cnt[3] = { 0 };
  372|       |
  373|  40.7k|    if (have_top && !a->intra[xb4]) {
  ------------------
  |  Branch (373:9): [True: 29.7k, False: 11.0k]
  |  Branch (373:21): [True: 28.6k, False: 1.07k]
  ------------------
  374|  28.6k|        if (a->ref[0][xb4] >= 4) cnt[a->ref[0][xb4] - 4]++;
  ------------------
  |  Branch (374:13): [True: 16.9k, False: 11.7k]
  ------------------
  375|  28.6k|        if (a->comp_type[xb4] && a->ref[1][xb4] >= 4) cnt[a->ref[1][xb4] - 4]++;
  ------------------
  |  Branch (375:13): [True: 7.27k, False: 21.3k]
  |  Branch (375:34): [True: 6.78k, False: 484]
  ------------------
  376|  28.6k|    }
  377|       |
  378|  40.7k|    if (have_left && !l->intra[yb4]) {
  ------------------
  |  Branch (378:9): [True: 36.5k, False: 4.24k]
  |  Branch (378:22): [True: 35.2k, False: 1.32k]
  ------------------
  379|  35.2k|        if (l->ref[0][yb4] >= 4) cnt[l->ref[0][yb4] - 4]++;
  ------------------
  |  Branch (379:13): [True: 21.0k, False: 14.1k]
  ------------------
  380|  35.2k|        if (l->comp_type[yb4] && l->ref[1][yb4] >= 4) cnt[l->ref[1][yb4] - 4]++;
  ------------------
  |  Branch (380:13): [True: 9.97k, False: 25.2k]
  |  Branch (380:34): [True: 9.41k, False: 567]
  ------------------
  381|  35.2k|    }
  382|       |
  383|  40.7k|    cnt[1] += cnt[0];
  384|       |
  385|  40.7k|    return cnt[2] == cnt[1] ? 1 : cnt[1] < cnt[2] ? 0 : 2;
  ------------------
  |  Branch (385:12): [True: 8.52k, False: 32.2k]
  |  Branch (385:35): [True: 21.1k, False: 11.0k]
  ------------------
  386|  40.7k|}
decode.c:av1_get_bwd_ref_1_ctx:
  392|  14.5k|{
  393|  14.5k|    int cnt[3] = { 0 };
  394|       |
  395|  14.5k|    if (have_top && !a->intra[xb4]) {
  ------------------
  |  Branch (395:9): [True: 10.7k, False: 3.80k]
  |  Branch (395:21): [True: 10.3k, False: 408]
  ------------------
  396|  10.3k|        if (a->ref[0][xb4] >= 4) cnt[a->ref[0][xb4] - 4]++;
  ------------------
  |  Branch (396:13): [True: 5.68k, False: 4.66k]
  ------------------
  397|  10.3k|        if (a->comp_type[xb4] && a->ref[1][xb4] >= 4) cnt[a->ref[1][xb4] - 4]++;
  ------------------
  |  Branch (397:13): [True: 2.79k, False: 7.55k]
  |  Branch (397:34): [True: 2.58k, False: 210]
  ------------------
  398|  10.3k|    }
  399|       |
  400|  14.5k|    if (have_left && !l->intra[yb4]) {
  ------------------
  |  Branch (400:9): [True: 12.7k, False: 1.81k]
  |  Branch (400:22): [True: 12.2k, False: 512]
  ------------------
  401|  12.2k|        if (l->ref[0][yb4] >= 4) cnt[l->ref[0][yb4] - 4]++;
  ------------------
  |  Branch (401:13): [True: 6.52k, False: 5.71k]
  ------------------
  402|  12.2k|        if (l->comp_type[yb4] && l->ref[1][yb4] >= 4) cnt[l->ref[1][yb4] - 4]++;
  ------------------
  |  Branch (402:13): [True: 3.79k, False: 8.45k]
  |  Branch (402:34): [True: 3.50k, False: 290]
  ------------------
  403|  12.2k|    }
  404|       |
  405|  14.5k|    return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
  ------------------
  |  Branch (405:12): [True: 3.93k, False: 10.6k]
  |  Branch (405:35): [True: 6.21k, False: 4.42k]
  ------------------
  406|  14.5k|}
decode.c:av1_get_ref_ctx:
  287|  62.0k|{
  288|  62.0k|    int cnt[2] = { 0 };
  289|       |
  290|  62.0k|    if (have_top && !a->intra[xb4]) {
  ------------------
  |  Branch (290:9): [True: 45.3k, False: 16.7k]
  |  Branch (290:21): [True: 43.5k, False: 1.80k]
  ------------------
  291|  43.5k|        cnt[a->ref[0][xb4] >= 4]++;
  292|  43.5k|        if (a->comp_type[xb4]) cnt[a->ref[1][xb4] >= 4]++;
  ------------------
  |  Branch (292:13): [True: 4.59k, False: 38.9k]
  ------------------
  293|  43.5k|    }
  294|       |
  295|  62.0k|    if (have_left && !l->intra[yb4]) {
  ------------------
  |  Branch (295:9): [True: 54.4k, False: 7.63k]
  |  Branch (295:22): [True: 52.3k, False: 2.05k]
  ------------------
  296|  52.3k|        cnt[l->ref[0][yb4] >= 4]++;
  297|  52.3k|        if (l->comp_type[yb4]) cnt[l->ref[1][yb4] >= 4]++;
  ------------------
  |  Branch (297:13): [True: 6.46k, False: 45.9k]
  ------------------
  298|  52.3k|    }
  299|       |
  300|  62.0k|    return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
  ------------------
  |  Branch (300:12): [True: 12.7k, False: 49.3k]
  |  Branch (300:35): [True: 23.1k, False: 26.1k]
  ------------------
  301|  62.0k|}
decode.c:av1_get_uni_p1_ctx:
  412|  2.00k|{
  413|  2.00k|    int cnt[3] = { 0 };
  414|       |
  415|  2.00k|    if (have_top && !a->intra[xb4]) {
  ------------------
  |  Branch (415:9): [True: 1.35k, False: 648]
  |  Branch (415:21): [True: 1.33k, False: 23]
  ------------------
  416|  1.33k|        if (a->ref[0][xb4] - 1U < 3) cnt[a->ref[0][xb4] - 1]++;
  ------------------
  |  Branch (416:13): [True: 284, False: 1.05k]
  ------------------
  417|  1.33k|        if (a->comp_type[xb4] && a->ref[1][xb4] - 1U < 3) cnt[a->ref[1][xb4] - 1]++;
  ------------------
  |  Branch (417:13): [True: 931, False: 405]
  |  Branch (417:34): [True: 585, False: 346]
  ------------------
  418|  1.33k|    }
  419|       |
  420|  2.00k|    if (have_left && !l->intra[yb4]) {
  ------------------
  |  Branch (420:9): [True: 1.63k, False: 377]
  |  Branch (420:22): [True: 1.59k, False: 31]
  ------------------
  421|  1.59k|        if (l->ref[0][yb4] - 1U < 3) cnt[l->ref[0][yb4] - 1]++;
  ------------------
  |  Branch (421:13): [True: 374, False: 1.22k]
  ------------------
  422|  1.59k|        if (l->comp_type[yb4] && l->ref[1][yb4] - 1U < 3) cnt[l->ref[1][yb4] - 1]++;
  ------------------
  |  Branch (422:13): [True: 1.11k, False: 489]
  |  Branch (422:34): [True: 709, False: 401]
  ------------------
  423|  1.59k|    }
  424|       |
  425|  2.00k|    cnt[1] += cnt[2];
  426|       |
  427|  2.00k|    return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
  ------------------
  |  Branch (427:12): [True: 685, False: 1.32k]
  |  Branch (427:35): [True: 953, False: 369]
  ------------------
  428|  2.00k|}
decode.c:get_drl_context:
  432|  23.2k|{
  433|  23.2k|    if (ref_mv_stack[ref_idx].weight >= 640)
  ------------------
  |  Branch (433:9): [True: 18.0k, False: 5.20k]
  ------------------
  434|  18.0k|        return ref_mv_stack[ref_idx + 1].weight < 640;
  435|       |
  436|  18.4E|    return ref_mv_stack[ref_idx + 1].weight < 640 ? 2 : 0;
  ------------------
  |  Branch (436:12): [True: 5.21k, False: 18.4E]
  ------------------
  437|  23.2k|}
decode.c:get_gmv_2d:
  482|  15.7k|{
  483|  15.7k|    switch (gmv->type) {
  484|  2.26k|    case DAV1D_WM_TYPE_ROT_ZOOM:
  ------------------
  |  Branch (484:5): [True: 2.26k, False: 13.5k]
  ------------------
  485|  2.26k|        assert(gmv->matrix[5] ==  gmv->matrix[2]);
  ------------------
  |  |  140|  2.26k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 2.26k]
  |  |  |  Branch (140:68): [Folded, False: 2.26k]
  |  |  ------------------
  ------------------
  486|  2.26k|        assert(gmv->matrix[4] == -gmv->matrix[3]);
  ------------------
  |  |  140|  2.26k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 2.26k]
  |  |  |  Branch (140:68): [Folded, False: 2.26k]
  |  |  ------------------
  ------------------
  487|       |        // fall-through
  488|  2.26k|    default:
  ------------------
  |  Branch (488:5): [True: 0, False: 15.7k]
  ------------------
  489|  3.49k|    case DAV1D_WM_TYPE_AFFINE: {
  ------------------
  |  Branch (489:5): [True: 1.23k, False: 14.5k]
  ------------------
  490|  3.49k|        const int x = bx4 * 4 + bw4 * 2 - 1;
  491|  3.49k|        const int y = by4 * 4 + bh4 * 2 - 1;
  492|  3.49k|        const int xc = (gmv->matrix[2] - (1 << 16)) * x +
  493|  3.49k|                       gmv->matrix[3] * y + gmv->matrix[0];
  494|  3.49k|        const int yc = (gmv->matrix[5] - (1 << 16)) * y +
  495|  3.49k|                       gmv->matrix[4] * x + gmv->matrix[1];
  496|  3.49k|        const int shift = 16 - (3 - !hdr->hp);
  497|  3.49k|        const int round = (1 << shift) >> 1;
  498|  3.49k|        mv res = (mv) {
  499|  3.49k|            .y = apply_sign(((abs(yc) + round) >> shift) << !hdr->hp, yc),
  500|  3.49k|            .x = apply_sign(((abs(xc) + round) >> shift) << !hdr->hp, xc),
  501|  3.49k|        };
  502|  3.49k|        if (hdr->force_integer_mv)
  ------------------
  |  Branch (502:13): [True: 192, False: 3.30k]
  ------------------
  503|    192|            fix_int_mv_precision(&res);
  504|  3.49k|        return res;
  505|  2.26k|    }
  506|  6.23k|    case DAV1D_WM_TYPE_TRANSLATION: {
  ------------------
  |  Branch (506:5): [True: 6.23k, False: 9.56k]
  ------------------
  507|  6.23k|        mv res = (mv) {
  508|  6.23k|            .y = gmv->matrix[0] >> 13,
  509|  6.23k|            .x = gmv->matrix[1] >> 13,
  510|  6.23k|        };
  511|  6.23k|        if (hdr->force_integer_mv)
  ------------------
  |  Branch (511:13): [True: 94, False: 6.14k]
  ------------------
  512|     94|            fix_int_mv_precision(&res);
  513|  6.23k|        return res;
  514|  2.26k|    }
  515|  6.07k|    case DAV1D_WM_TYPE_IDENTITY:
  ------------------
  |  Branch (515:5): [True: 6.07k, False: 9.72k]
  ------------------
  516|  6.07k|        return (mv) { .x = 0, .y = 0 };
  517|  15.7k|    }
  518|  15.7k|}
decode.c:get_mask_comp_ctx:
  266|  7.78k|{
  267|  7.78k|    const int a_ctx = a->comp_type[xb4] >= COMP_INTER_SEG ? 1 :
  ------------------
  |  Branch (267:23): [True: 1.01k, False: 6.77k]
  ------------------
  268|  7.78k|                      a->ref[0][xb4] == 6 ? 3 : 0;
  ------------------
  |  Branch (268:23): [True: 400, False: 6.37k]
  ------------------
  269|  7.78k|    const int l_ctx = l->comp_type[yb4] >= COMP_INTER_SEG ? 1 :
  ------------------
  |  Branch (269:23): [True: 1.69k, False: 6.08k]
  ------------------
  270|  7.78k|                      l->ref[0][yb4] == 6 ? 3 : 0;
  ------------------
  |  Branch (270:23): [True: 413, False: 5.67k]
  ------------------
  271|       |
  272|  7.78k|    return imin(a_ctx + l_ctx, 5);
  273|  7.78k|}
decode.c:get_jnt_comp_ctx:
  251|  7.65k|{
  252|  7.65k|    const int d0 = abs(get_poc_diff(order_hint_n_bits, ref0poc, poc));
  253|  7.65k|    const int d1 = abs(get_poc_diff(order_hint_n_bits, poc, ref1poc));
  254|  7.65k|    const int offset = d0 == d1;
  255|  7.65k|    const int a_ctx = a->comp_type[xb4] >= COMP_INTER_AVG ||
  ------------------
  |  Branch (255:23): [True: 2.63k, False: 5.01k]
  ------------------
  256|  5.01k|                      a->ref[0][xb4] == 6;
  ------------------
  |  Branch (256:23): [True: 472, False: 4.54k]
  ------------------
  257|  7.65k|    const int l_ctx = l->comp_type[yb4] >= COMP_INTER_AVG ||
  ------------------
  |  Branch (257:23): [True: 3.33k, False: 4.31k]
  ------------------
  258|  4.31k|                      l->ref[0][yb4] == 6;
  ------------------
  |  Branch (258:23): [True: 693, False: 3.62k]
  ------------------
  259|       |
  260|  7.65k|    return 3 * offset + a_ctx + l_ctx;
  261|  7.65k|}
decode.c:get_filter_ctx:
  139|  56.4k|{
  140|  56.4k|    const int a_filter = (a->ref[0][xb4] == ref || a->ref[1][xb4] == ref) ?
  ------------------
  |  Branch (140:27): [True: 26.8k, False: 29.5k]
  |  Branch (140:52): [True: 1.30k, False: 28.2k]
  ------------------
  141|  28.2k|                         a->filter[dir][xb4] : DAV1D_N_SWITCHABLE_FILTERS;
  142|  56.4k|    const int l_filter = (l->ref[0][yb4] == ref || l->ref[1][yb4] == ref) ?
  ------------------
  |  Branch (142:27): [True: 33.0k, False: 23.3k]
  |  Branch (142:52): [True: 2.52k, False: 20.8k]
  ------------------
  143|  35.5k|                         l->filter[dir][yb4] : DAV1D_N_SWITCHABLE_FILTERS;
  144|       |
  145|  56.4k|    if (a_filter == l_filter) {
  ------------------
  |  Branch (145:9): [True: 28.5k, False: 27.9k]
  ------------------
  146|  28.5k|        return comp * 4 + a_filter;
  147|  28.5k|    } else if (a_filter == DAV1D_N_SWITCHABLE_FILTERS) {
  ------------------
  |  Branch (147:16): [True: 16.3k, False: 11.5k]
  ------------------
  148|  16.3k|        return comp * 4 + l_filter;
  149|  16.3k|    } else if (l_filter == DAV1D_N_SWITCHABLE_FILTERS) {
  ------------------
  |  Branch (149:16): [True: 8.98k, False: 2.54k]
  ------------------
  150|  8.98k|        return comp * 4 + a_filter;
  151|  8.98k|    } else {
  152|  2.54k|        return comp * 4 + DAV1D_N_SWITCHABLE_FILTERS;
  153|  2.54k|    }
  154|  56.4k|}
decode.c:gather_top_partition_prob:
  106|   833k|{
  107|       |    // Exploit the fact that cdfs for PARTITION_V, PARTITION_SPLIT and
  108|       |    // PARTITION_T_TOP_SPLIT are neighbors.
  109|   833k|    unsigned out = in[PARTITION_V - 1] - in[PARTITION_T_TOP_SPLIT];
  110|       |    // Exploit the facts that cdfs for PARTITION_T_LEFT_SPLIT and
  111|       |    // PARTITION_T_RIGHT_SPLIT are neighbors, the probability for
  112|       |    // PARTITION_V4 is always zero, and the probability for
  113|       |    // PARTITION_T_RIGHT_SPLIT is zero in 128x128 blocks.
  114|   833k|    out += in[PARTITION_T_LEFT_SPLIT - 1];
  115|   833k|    if (bl != BL_128X128)
  ------------------
  |  Branch (115:9): [True: 767k, False: 65.5k]
  ------------------
  116|   767k|        out += in[PARTITION_V4 - 1] - in[PARTITION_T_RIGHT_SPLIT];
  117|   833k|    return out;
  118|   833k|}
decode.c:gather_left_partition_prob:
   94|   492k|{
   95|   492k|    unsigned out = in[PARTITION_H - 1] - in[PARTITION_H];
   96|       |    // Exploit the fact that cdfs for PARTITION_SPLIT, PARTITION_T_TOP_SPLIT,
   97|       |    // PARTITION_T_BOTTOM_SPLIT and PARTITION_T_LEFT_SPLIT are neighbors.
   98|   492k|    out += in[PARTITION_SPLIT - 1] - in[PARTITION_T_LEFT_SPLIT];
   99|   492k|    if (bl != BL_128X128)
  ------------------
  |  Branch (99:9): [True: 445k, False: 47.5k]
  ------------------
  100|   445k|        out += in[PARTITION_H4 - 1] - in[PARTITION_H4];
  101|   492k|    return out;
  102|   492k|}
decode.c:get_poc_diff:
  239|  91.6k|{
  240|  91.6k|    if (!order_hint_n_bits) return 0;
  ------------------
  |  Branch (240:9): [True: 7.51k, False: 84.0k]
  ------------------
  241|  84.0k|    const int mask = 1 << (order_hint_n_bits - 1);
  242|  84.0k|    const int diff = poc0 - poc1;
  243|  84.0k|    return (diff & (mask - 1)) - (diff & mask);
  244|  91.6k|}
recon_tmpl.c:get_uv_inter_txtp:
  122|  30.1k|{
  123|  30.1k|    if (uvt_dim->max == TX_32X32)
  ------------------
  |  Branch (123:9): [True: 5.81k, False: 24.3k]
  ------------------
  124|  5.81k|        return ytxtp == IDTX ? IDTX : DCT_DCT;
  ------------------
  |  Branch (124:16): [True: 114, False: 5.70k]
  ------------------
  125|  24.3k|    if (uvt_dim->min == TX_16X16 &&
  ------------------
  |  Branch (125:9): [True: 1.69k, False: 22.6k]
  ------------------
  126|  1.69k|        ((1 << ytxtp) & ((1 << H_FLIPADST) | (1 << V_FLIPADST) |
  ------------------
  |  Branch (126:9): [True: 45, False: 1.64k]
  ------------------
  127|  1.69k|                         (1 << H_ADST) | (1 << V_ADST))))
  128|     45|    {
  129|     45|        return DCT_DCT;
  130|     45|    }
  131|       |
  132|  24.3k|    return ytxtp;
  133|  24.3k|}

dav1d_prep_grain_8bpc:
  105|    117|{
  106|    117|    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
  107|       |#if BITDEPTH != 8
  108|       |    const int bitdepth_max = (1 << out->p.bpc) - 1;
  109|       |#endif
  110|       |
  111|       |    // Generate grain LUTs as needed
  112|    117|    dsp->generate_grain_y(grain_lut[0], data HIGHBD_TAIL_SUFFIX); // always needed
  113|    117|    if (data->num_uv_points[0] || data->chroma_scaling_from_luma)
  ------------------
  |  Branch (113:9): [True: 22, False: 95]
  |  Branch (113:35): [True: 58, False: 37]
  ------------------
  114|     80|        dsp->generate_grain_uv[in->p.layout - 1](grain_lut[1], grain_lut[0],
  115|     80|                                                 data, 0 HIGHBD_TAIL_SUFFIX);
  116|    117|    if (data->num_uv_points[1] || data->chroma_scaling_from_luma)
  ------------------
  |  Branch (116:9): [True: 12, False: 105]
  |  Branch (116:35): [True: 58, False: 47]
  ------------------
  117|     70|        dsp->generate_grain_uv[in->p.layout - 1](grain_lut[2], grain_lut[0],
  118|     70|                                                 data, 1 HIGHBD_TAIL_SUFFIX);
  119|       |
  120|       |    // Generate scaling LUTs as needed
  121|    117|    if (data->num_y_points || data->chroma_scaling_from_luma)
  ------------------
  |  Branch (121:9): [True: 87, False: 30]
  |  Branch (121:31): [True: 24, False: 6]
  ------------------
  122|    111|        generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);
  123|    117|    if (data->num_uv_points[0])
  ------------------
  |  Branch (123:9): [True: 22, False: 95]
  ------------------
  124|     22|        generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);
  125|    117|    if (data->num_uv_points[1])
  ------------------
  |  Branch (125:9): [True: 12, False: 105]
  ------------------
  126|     12|        generate_scaling(in->p.bpc, data->uv_points[1], data->num_uv_points[1], scaling[2]);
  127|       |
  128|       |    // Copy over the non-modified planes
  129|    117|    assert(out->stride[0] == in->stride[0]);
  ------------------
  |  |  140|    117|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 117]
  |  |  |  Branch (140:68): [Folded, False: 117]
  |  |  ------------------
  ------------------
  130|    117|    if (!data->num_y_points) {
  ------------------
  |  Branch (130:9): [True: 30, False: 87]
  ------------------
  131|     30|        const ptrdiff_t stride = out->stride[0];
  132|     30|        const ptrdiff_t sz = out->p.h * stride;
  133|     30|        if (sz < 0)
  ------------------
  |  Branch (133:13): [True: 0, False: 30]
  ------------------
  134|      0|            memcpy((uint8_t*) out->data[0] + sz - stride,
  135|      0|                   (uint8_t*) in->data[0] + sz - stride, -sz);
  136|     30|        else
  137|     30|            memcpy(out->data[0], in->data[0], sz);
  138|     30|    }
  139|       |
  140|    117|    if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400 && !data->chroma_scaling_from_luma) {
  ------------------
  |  Branch (140:9): [True: 86, False: 31]
  |  Branch (140:52): [True: 28, False: 58]
  ------------------
  141|     28|        assert(out->stride[1] == in->stride[1]);
  ------------------
  |  |  140|     28|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 28]
  |  |  |  Branch (140:68): [Folded, False: 28]
  |  |  ------------------
  ------------------
  142|     28|        const int ss_ver = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
  143|     28|        const ptrdiff_t stride = out->stride[1];
  144|     28|        const ptrdiff_t sz = ((out->p.h + ss_ver) >> ss_ver) * stride;
  145|     28|        if (sz < 0) {
  ------------------
  |  Branch (145:13): [True: 0, False: 28]
  ------------------
  146|      0|            if (!data->num_uv_points[0])
  ------------------
  |  Branch (146:17): [True: 0, False: 0]
  ------------------
  147|      0|                memcpy((uint8_t*) out->data[1] + sz - stride,
  148|      0|                       (uint8_t*) in->data[1] + sz - stride, -sz);
  149|      0|            if (!data->num_uv_points[1])
  ------------------
  |  Branch (149:17): [True: 0, False: 0]
  ------------------
  150|      0|                memcpy((uint8_t*) out->data[2] + sz - stride,
  151|      0|                       (uint8_t*) in->data[2] + sz - stride, -sz);
  152|     28|        } else {
  153|     28|            if (!data->num_uv_points[0])
  ------------------
  |  Branch (153:17): [True: 6, False: 22]
  ------------------
  154|      6|                memcpy(out->data[1], in->data[1], sz);
  155|     28|            if (!data->num_uv_points[1])
  ------------------
  |  Branch (155:17): [True: 16, False: 12]
  ------------------
  156|     16|                memcpy(out->data[2], in->data[2], sz);
  157|     28|        }
  158|     28|    }
  159|    117|}
dav1d_apply_grain_row_8bpc:
  167|  6.68k|{
  168|       |    // Synthesize grain for the affected planes
  169|  6.68k|    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
  170|  6.68k|    const int ss_y = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
  171|  6.68k|    const int ss_x = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
  172|  6.68k|    const int cpw = (out->p.w + ss_x) >> ss_x;
  173|  6.68k|    const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
  174|  6.68k|    pixel *const luma_src =
  175|  6.68k|        ((pixel *) in->data[0]) + row * FG_BLOCK_SIZE * PXSTRIDE(in->stride[0]);
  ------------------
  |  |   37|  6.68k|#define FG_BLOCK_SIZE 32
  ------------------
                      ((pixel *) in->data[0]) + row * FG_BLOCK_SIZE * PXSTRIDE(in->stride[0]);
  ------------------
  |  |   53|  6.68k|#define PXSTRIDE(x) (x)
  ------------------
  176|       |#if BITDEPTH != 8
  177|       |    const int bitdepth_max = (1 << out->p.bpc) - 1;
  178|       |#endif
  179|       |
  180|  6.68k|    if (data->num_y_points) {
  ------------------
  |  Branch (180:9): [True: 5.77k, False: 908]
  ------------------
  181|  5.77k|        const int bh = imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE);
  ------------------
  |  |   37|  5.77k|#define FG_BLOCK_SIZE 32
  ------------------
                      const int bh = imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE);
  ------------------
  |  |   37|  5.77k|#define FG_BLOCK_SIZE 32
  ------------------
  182|  5.77k|        dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * FG_BLOCK_SIZE * PXSTRIDE(out->stride[0]),
  ------------------
  |  |   37|  5.77k|#define FG_BLOCK_SIZE 32
  ------------------
                      dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * FG_BLOCK_SIZE * PXSTRIDE(out->stride[0]),
  ------------------
  |  |   53|  5.77k|#define PXSTRIDE(x) (x)
  ------------------
  183|  5.77k|                         luma_src, out->stride[0], data,
  184|  5.77k|                         out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
  185|  5.77k|    }
  186|       |
  187|  6.68k|    if (!data->num_uv_points[0] && !data->num_uv_points[1] &&
  ------------------
  |  Branch (187:9): [True: 5.30k, False: 1.37k]
  |  Branch (187:36): [True: 5.26k, False: 42]
  ------------------
  188|  5.26k|        !data->chroma_scaling_from_luma)
  ------------------
  |  Branch (188:9): [True: 2.73k, False: 2.53k]
  ------------------
  189|  2.73k|    {
  190|  2.73k|        return;
  191|  2.73k|    }
  192|       |
  193|  3.95k|    const int bh = (imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE) + ss_y) >> ss_y;
  ------------------
  |  |   37|  3.95k|#define FG_BLOCK_SIZE 32
  ------------------
                  const int bh = (imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE) + ss_y) >> ss_y;
  ------------------
  |  |   37|  3.95k|#define FG_BLOCK_SIZE 32
  ------------------
  194|       |
  195|       |    // extend padding pixels
  196|  3.95k|    if (out->p.w & ss_x) {
  ------------------
  |  Branch (196:9): [True: 30, False: 3.92k]
  ------------------
  197|     30|        pixel *ptr = luma_src;
  198|    475|        for (int y = 0; y < bh; y++) {
  ------------------
  |  Branch (198:25): [True: 445, False: 30]
  ------------------
  199|    445|            ptr[out->p.w] = ptr[out->p.w - 1];
  200|    445|            ptr += PXSTRIDE(in->stride[0]) << ss_y;
  ------------------
  |  |   53|    445|#define PXSTRIDE(x) (x)
  ------------------
  201|    445|        }
  202|     30|    }
  203|       |
  204|  3.95k|    const ptrdiff_t uv_off = row * FG_BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
  ------------------
  |  |   37|  3.95k|#define FG_BLOCK_SIZE 32
  ------------------
                  const ptrdiff_t uv_off = row * FG_BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
  ------------------
  |  |   53|  3.95k|#define PXSTRIDE(x) (x)
  ------------------
  205|  3.95k|    if (data->chroma_scaling_from_luma) {
  ------------------
  |  Branch (205:9): [True: 2.55k, False: 1.40k]
  ------------------
  206|  7.58k|        for (int pl = 0; pl < 2; pl++)
  ------------------
  |  Branch (206:26): [True: 5.03k, False: 2.55k]
  ------------------
  207|  5.03k|            dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
  208|  5.03k|                                                ((const pixel *) in->data[1 + pl]) + uv_off,
  209|  5.03k|                                                in->stride[1], data, cpw,
  210|  5.03k|                                                scaling[0], grain_lut[1 + pl],
  211|  5.03k|                                                bh, row, luma_src, in->stride[0],
  212|  5.03k|                                                pl, is_id HIGHBD_TAIL_SUFFIX);
  213|  2.55k|    } else {
  214|  4.01k|        for (int pl = 0; pl < 2; pl++)
  ------------------
  |  Branch (214:26): [True: 2.61k, False: 1.40k]
  ------------------
  215|  2.61k|            if (data->num_uv_points[pl])
  ------------------
  |  Branch (215:17): [True: 1.92k, False: 694]
  ------------------
  216|  1.92k|                dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
  217|  1.92k|                                                    ((const pixel *) in->data[1 + pl]) + uv_off,
  218|  1.92k|                                                    in->stride[1], data, cpw,
  219|  1.92k|                                                    scaling[1 + pl], grain_lut[1 + pl],
  220|  1.92k|                                                    bh, row, luma_src, in->stride[0],
  221|  1.92k|                                                    pl, is_id HIGHBD_TAIL_SUFFIX);
  222|  1.40k|    }
  223|  3.95k|}
dav1d_apply_grain_8bpc:
  228|     16|{
  229|     16|    ALIGN_STK_16(entry, grain_lut, 3,[GRAIN_HEIGHT + 1][GRAIN_WIDTH]);
  ------------------
  |  |  100|     16|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|     16|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  230|     16|#if ARCH_X86_64 && BITDEPTH == 8
  231|     16|    ALIGN_STK_64(uint8_t, scaling, 3,[SCALING_SIZE]);
  ------------------
  |  |   96|     16|    ALIGN(type var[sz1d]sznd, ALIGN_64_VAL)
  |  |  ------------------
  |  |  |  |   86|     16|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  232|       |#else
  233|       |    uint8_t scaling[3][SCALING_SIZE];
  234|       |#endif
  235|     16|    const int rows = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE;
  ------------------
  |  |   37|     16|#define FG_BLOCK_SIZE 32
  ------------------
                  const int rows = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE;
  ------------------
  |  |   37|     16|#define FG_BLOCK_SIZE 32
  ------------------
  236|       |
  237|     16|    bitfn(dav1d_prep_grain)(dsp, out, in, scaling, grain_lut);
  ------------------
  |  |   51|     16|#define bitfn(x) x##_8bpc
  ------------------
  238|    839|    for (int row = 0; row < rows; row++)
  ------------------
  |  Branch (238:23): [True: 823, False: 16]
  ------------------
  239|    823|        bitfn(dav1d_apply_grain_row)(dsp, out, in, scaling, grain_lut, row);
  ------------------
  |  |   51|    823|#define bitfn(x) x##_8bpc
  ------------------
  240|     16|}
fg_apply_tmpl.c:generate_scaling:
   44|    145|{
   45|    145|#if BITDEPTH == 8
   46|    145|    const int shift_x = 0;
   47|    145|    const int scaling_size = SCALING_SIZE;
  ------------------
  |  |   39|    145|#define SCALING_SIZE 256
  ------------------
   48|       |#else
   49|       |    assert(bitdepth > 8);
   50|       |    const int shift_x = bitdepth - 8;
   51|       |    const int scaling_size = 1 << bitdepth;
   52|       |#endif
   53|       |
   54|    145|    if (num == 0) {
  ------------------
  |  Branch (54:9): [True: 24, False: 121]
  ------------------
   55|     24|        memset(scaling, 0, scaling_size);
   56|     24|        return;
   57|     24|    }
   58|       |
   59|       |    // Fill up the preceding entries with the initial value
   60|    121|    memset(scaling, points[0][1], points[0][0] << shift_x);
   61|       |
   62|       |    // Linearly interpolate the values in the middle
   63|    211|    for (int i = 0; i < num - 1; i++) {
  ------------------
  |  Branch (63:21): [True: 90, False: 121]
  ------------------
   64|     90|        const int bx = points[i][0];
   65|     90|        const int by = points[i][1];
   66|     90|        const int ex = points[i+1][0];
   67|     90|        const int ey = points[i+1][1];
   68|     90|        const int dx = ex - bx;
   69|     90|        const int dy = ey - by;
   70|     90|        assert(dx > 0);
  ------------------
  |  |  140|     90|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 90]
  |  |  |  Branch (140:68): [Folded, False: 90]
  |  |  ------------------
  ------------------
   71|     90|        const int delta = dy * ((0x10000 + (dx >> 1)) / dx);
   72|  3.90k|        for (int x = 0, d = 0x8000; x < dx; x++) {
  ------------------
  |  Branch (72:37): [True: 3.81k, False: 90]
  ------------------
   73|  3.81k|            scaling[(bx + x) << shift_x] = by + (d >> 16);
   74|  3.81k|            d += delta;
   75|  3.81k|        }
   76|     90|    }
   77|       |
   78|       |    // Fill up the remaining entries with the final value
   79|    121|    const int n = points[num - 1][0] << shift_x;
   80|    121|    memset(&scaling[n], points[num - 1][1], scaling_size - n);
   81|       |
   82|       |#if BITDEPTH != 8
   83|       |    const int pad = 1 << shift_x, rnd = pad >> 1;
   84|       |    for (int i = 0; i < num - 1; i++) {
   85|       |        const int bx = points[i][0] << shift_x;
   86|       |        const int ex = points[i+1][0] << shift_x;
   87|       |        const int dx = ex - bx;
   88|       |        for (int x = 0; x < dx; x += pad) {
   89|       |            const int range = scaling[bx + x + pad] - scaling[bx + x];
   90|       |            for (int n = 1, r = rnd; n < pad; n++) {
   91|       |                r += range;
   92|       |                scaling[bx + x + n] = scaling[bx + x] + (r >> shift_x);
   93|       |            }
   94|       |        }
   95|       |    }
   96|       |#endif
   97|    121|}
dav1d_prep_grain_16bpc:
  105|    180|{
  106|    180|    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
  107|    180|#if BITDEPTH != 8
  108|    180|    const int bitdepth_max = (1 << out->p.bpc) - 1;
  109|    180|#endif
  110|       |
  111|       |    // Generate grain LUTs as needed
  112|    180|    dsp->generate_grain_y(grain_lut[0], data HIGHBD_TAIL_SUFFIX); // always needed
  ------------------
  |  |   74|    180|#define HIGHBD_TAIL_SUFFIX , bitdepth_max
  ------------------
  113|    180|    if (data->num_uv_points[0] || data->chroma_scaling_from_luma)
  ------------------
  |  Branch (113:9): [True: 12, False: 168]
  |  Branch (113:35): [True: 139, False: 29]
  ------------------
  114|    151|        dsp->generate_grain_uv[in->p.layout - 1](grain_lut[1], grain_lut[0],
  115|    151|                                                 data, 0 HIGHBD_TAIL_SUFFIX);
  ------------------
  |  |   74|    151|#define HIGHBD_TAIL_SUFFIX , bitdepth_max
  ------------------
  116|    180|    if (data->num_uv_points[1] || data->chroma_scaling_from_luma)
  ------------------
  |  Branch (116:9): [True: 9, False: 171]
  |  Branch (116:35): [True: 139, False: 32]
  ------------------
  117|    148|        dsp->generate_grain_uv[in->p.layout - 1](grain_lut[2], grain_lut[0],
  118|    148|                                                 data, 1 HIGHBD_TAIL_SUFFIX);
  ------------------
  |  |   74|    148|#define HIGHBD_TAIL_SUFFIX , bitdepth_max
  ------------------
  119|       |
  120|       |    // Generate scaling LUTs as needed
  121|    180|    if (data->num_y_points || data->chroma_scaling_from_luma)
  ------------------
  |  Branch (121:9): [True: 132, False: 48]
  |  Branch (121:31): [True: 46, False: 2]
  ------------------
  122|    178|        generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);
  123|    180|    if (data->num_uv_points[0])
  ------------------
  |  Branch (123:9): [True: 12, False: 168]
  ------------------
  124|     12|        generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);
  125|    180|    if (data->num_uv_points[1])
  ------------------
  |  Branch (125:9): [True: 9, False: 171]
  ------------------
  126|      9|        generate_scaling(in->p.bpc, data->uv_points[1], data->num_uv_points[1], scaling[2]);
  127|       |
  128|       |    // Copy over the non-modified planes
  129|    180|    assert(out->stride[0] == in->stride[0]);
  ------------------
  |  |  140|    180|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 180]
  |  |  |  Branch (140:68): [Folded, False: 180]
  |  |  ------------------
  ------------------
  130|    180|    if (!data->num_y_points) {
  ------------------
  |  Branch (130:9): [True: 48, False: 132]
  ------------------
  131|     48|        const ptrdiff_t stride = out->stride[0];
  132|     48|        const ptrdiff_t sz = out->p.h * stride;
  133|     48|        if (sz < 0)
  ------------------
  |  Branch (133:13): [True: 0, False: 48]
  ------------------
  134|      0|            memcpy((uint8_t*) out->data[0] + sz - stride,
  135|      0|                   (uint8_t*) in->data[0] + sz - stride, -sz);
  136|     48|        else
  137|     48|            memcpy(out->data[0], in->data[0], sz);
  138|     48|    }
  139|       |
  140|    180|    if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400 && !data->chroma_scaling_from_luma) {
  ------------------
  |  Branch (140:9): [True: 155, False: 25]
  |  Branch (140:52): [True: 16, False: 139]
  ------------------
  141|     16|        assert(out->stride[1] == in->stride[1]);
  ------------------
  |  |  140|     16|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 16]
  |  |  |  Branch (140:68): [Folded, False: 16]
  |  |  ------------------
  ------------------
  142|     16|        const int ss_ver = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
  143|     16|        const ptrdiff_t stride = out->stride[1];
  144|     16|        const ptrdiff_t sz = ((out->p.h + ss_ver) >> ss_ver) * stride;
  145|     16|        if (sz < 0) {
  ------------------
  |  Branch (145:13): [True: 0, False: 16]
  ------------------
  146|      0|            if (!data->num_uv_points[0])
  ------------------
  |  Branch (146:17): [True: 0, False: 0]
  ------------------
  147|      0|                memcpy((uint8_t*) out->data[1] + sz - stride,
  148|      0|                       (uint8_t*) in->data[1] + sz - stride, -sz);
  149|      0|            if (!data->num_uv_points[1])
  ------------------
  |  Branch (149:17): [True: 0, False: 0]
  ------------------
  150|      0|                memcpy((uint8_t*) out->data[2] + sz - stride,
  151|      0|                       (uint8_t*) in->data[2] + sz - stride, -sz);
  152|     16|        } else {
  153|     16|            if (!data->num_uv_points[0])
  ------------------
  |  Branch (153:17): [True: 4, False: 12]
  ------------------
  154|      4|                memcpy(out->data[1], in->data[1], sz);
  155|     16|            if (!data->num_uv_points[1])
  ------------------
  |  Branch (155:17): [True: 7, False: 9]
  ------------------
  156|      7|                memcpy(out->data[2], in->data[2], sz);
  157|     16|        }
  158|     16|    }
  159|    180|}
dav1d_apply_grain_row_16bpc:
  167|  18.0k|{
  168|       |    // Synthesize grain for the affected planes
  169|  18.0k|    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
  170|  18.0k|    const int ss_y = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
  171|  18.0k|    const int ss_x = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
  172|  18.0k|    const int cpw = (out->p.w + ss_x) >> ss_x;
  173|  18.0k|    const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
  174|  18.0k|    pixel *const luma_src =
  175|  18.0k|        ((pixel *) in->data[0]) + row * FG_BLOCK_SIZE * PXSTRIDE(in->stride[0]);
  ------------------
  |  |   37|  18.0k|#define FG_BLOCK_SIZE 32
  ------------------
  176|  18.0k|#if BITDEPTH != 8
  177|  18.0k|    const int bitdepth_max = (1 << out->p.bpc) - 1;
  178|  18.0k|#endif
  179|       |
  180|  18.0k|    if (data->num_y_points) {
  ------------------
  |  Branch (180:9): [True: 6.36k, False: 11.7k]
  ------------------
  181|  6.36k|        const int bh = imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE);
  ------------------
  |  |   37|  6.36k|#define FG_BLOCK_SIZE 32
  ------------------
                      const int bh = imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE);
  ------------------
  |  |   37|  6.36k|#define FG_BLOCK_SIZE 32
  ------------------
  182|  6.36k|        dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * FG_BLOCK_SIZE * PXSTRIDE(out->stride[0]),
  ------------------
  |  |   37|  6.36k|#define FG_BLOCK_SIZE 32
  ------------------
  183|  6.36k|                         luma_src, out->stride[0], data,
  184|  6.36k|                         out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
  ------------------
  |  |   74|  6.36k|#define HIGHBD_TAIL_SUFFIX , bitdepth_max
  ------------------
  185|  6.36k|    }
  186|       |
  187|  18.0k|    if (!data->num_uv_points[0] && !data->num_uv_points[1] &&
  ------------------
  |  Branch (187:9): [True: 15.2k, False: 2.80k]
  |  Branch (187:36): [True: 15.4k, False: 18.4E]
  ------------------
  188|  15.4k|        !data->chroma_scaling_from_luma)
  ------------------
  |  Branch (188:9): [True: 4.14k, False: 11.2k]
  ------------------
  189|  4.13k|    {
  190|  4.13k|        return;
  191|  4.13k|    }
  192|       |
  193|  13.9k|    const int bh = (imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE) + ss_y) >> ss_y;
  ------------------
  |  |   37|  13.9k|#define FG_BLOCK_SIZE 32
  ------------------
                  const int bh = (imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE) + ss_y) >> ss_y;
  ------------------
  |  |   37|  13.9k|#define FG_BLOCK_SIZE 32
  ------------------
  194|       |
  195|       |    // extend padding pixels
  196|  13.9k|    if (out->p.w & ss_x) {
  ------------------
  |  Branch (196:9): [True: 8.19k, False: 5.74k]
  ------------------
  197|  8.19k|        pixel *ptr = luma_src;
  198|  47.0k|        for (int y = 0; y < bh; y++) {
  ------------------
  |  Branch (198:25): [True: 38.8k, False: 8.19k]
  ------------------
  199|  38.8k|            ptr[out->p.w] = ptr[out->p.w - 1];
  200|  38.8k|            ptr += PXSTRIDE(in->stride[0]) << ss_y;
  201|  38.8k|        }
  202|  8.19k|    }
  203|       |
  204|  13.9k|    const ptrdiff_t uv_off = row * FG_BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
  ------------------
  |  |   37|  13.9k|#define FG_BLOCK_SIZE 32
  ------------------
  205|  13.9k|    if (data->chroma_scaling_from_luma) {
  ------------------
  |  Branch (205:9): [True: 10.1k, False: 3.77k]
  ------------------
  206|  29.4k|        for (int pl = 0; pl < 2; pl++)
  ------------------
  |  Branch (206:26): [True: 19.2k, False: 10.1k]
  ------------------
  207|  19.2k|            dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
  208|  19.2k|                                                ((const pixel *) in->data[1 + pl]) + uv_off,
  209|  19.2k|                                                in->stride[1], data, cpw,
  210|  19.2k|                                                scaling[0], grain_lut[1 + pl],
  211|  19.2k|                                                bh, row, luma_src, in->stride[0],
  212|  19.2k|                                                pl, is_id HIGHBD_TAIL_SUFFIX);
  ------------------
  |  |   74|  19.2k|#define HIGHBD_TAIL_SUFFIX , bitdepth_max
  ------------------
  213|  10.1k|    } else {
  214|  5.46k|        for (int pl = 0; pl < 2; pl++)
  ------------------
  |  Branch (214:26): [True: 1.68k, False: 3.77k]
  ------------------
  215|  1.68k|            if (data->num_uv_points[pl])
  ------------------
  |  Branch (215:17): [True: 1.24k, False: 441]
  ------------------
  216|  1.24k|                dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
  217|  1.24k|                                                    ((const pixel *) in->data[1 + pl]) + uv_off,
  218|  1.24k|                                                    in->stride[1], data, cpw,
  219|  1.24k|                                                    scaling[1 + pl], grain_lut[1 + pl],
  220|  1.24k|                                                    bh, row, luma_src, in->stride[0],
  221|  1.24k|                                                    pl, is_id HIGHBD_TAIL_SUFFIX);
  ------------------
  |  |   74|  1.24k|#define HIGHBD_TAIL_SUFFIX , bitdepth_max
  ------------------
  222|  3.77k|    }
  223|  13.9k|}
dav1d_apply_grain_16bpc:
  228|     39|{
  229|     39|    ALIGN_STK_16(entry, grain_lut, 3,[GRAIN_HEIGHT + 1][GRAIN_WIDTH]);
  ------------------
  |  |  100|     39|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|     39|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  230|       |#if ARCH_X86_64 && BITDEPTH == 8
  231|       |    ALIGN_STK_64(uint8_t, scaling, 3,[SCALING_SIZE]);
  232|       |#else
  233|     39|    uint8_t scaling[3][SCALING_SIZE];
  234|     39|#endif
  235|     39|    const int rows = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE;
  ------------------
  |  |   37|     39|#define FG_BLOCK_SIZE 32
  ------------------
                  const int rows = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE;
  ------------------
  |  |   37|     39|#define FG_BLOCK_SIZE 32
  ------------------
  236|       |
  237|     39|    bitfn(dav1d_prep_grain)(dsp, out, in, scaling, grain_lut);
  ------------------
  |  |   77|     39|#define bitfn(x) x##_16bpc
  ------------------
  238|  2.52k|    for (int row = 0; row < rows; row++)
  ------------------
  |  Branch (238:23): [True: 2.48k, False: 39]
  ------------------
  239|  2.48k|        bitfn(dav1d_apply_grain_row)(dsp, out, in, scaling, grain_lut, row);
  ------------------
  |  |   77|  2.48k|#define bitfn(x) x##_16bpc
  ------------------
  240|     39|}

dav1d_film_grain_dsp_init_8bpc:
  423|  7.82k|COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) {
  424|  7.82k|    c->generate_grain_y = generate_grain_y_c;
  425|  7.82k|    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c;
  426|  7.82k|    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c;
  427|  7.82k|    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c;
  428|       |
  429|  7.82k|    c->fgy_32x32xn = fgy_32x32xn_c;
  430|  7.82k|    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c;
  431|  7.82k|    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c;
  432|  7.82k|    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c;
  433|       |
  434|  7.82k|#if HAVE_ASM
  435|       |#if ARCH_AARCH64 || ARCH_ARM
  436|       |    film_grain_dsp_init_arm(c);
  437|       |#elif ARCH_X86
  438|       |    film_grain_dsp_init_x86(c);
  439|  7.82k|#endif
  440|  7.82k|#endif
  441|  7.82k|}
dav1d_film_grain_dsp_init_16bpc:
  423|  7.63k|COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) {
  424|  7.63k|    c->generate_grain_y = generate_grain_y_c;
  425|  7.63k|    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c;
  426|  7.63k|    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c;
  427|  7.63k|    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c;
  428|       |
  429|  7.63k|    c->fgy_32x32xn = fgy_32x32xn_c;
  430|  7.63k|    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c;
  431|  7.63k|    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c;
  432|  7.63k|    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c;
  433|       |
  434|  7.63k|#if HAVE_ASM
  435|       |#if ARCH_AARCH64 || ARCH_ARM
  436|       |    film_grain_dsp_init_arm(c);
  437|       |#elif ARCH_X86
  438|       |    film_grain_dsp_init_x86(c);
  439|  7.63k|#endif
  440|  7.63k|#endif
  441|  7.63k|}

dav1d_init_get_bits:
   38|  61.4k|{
   39|  61.4k|    assert(sz);
  ------------------
  |  |  140|  61.4k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 61.4k]
  |  |  |  Branch (140:68): [Folded, False: 61.4k]
  |  |  ------------------
  ------------------
   40|  61.4k|    c->ptr = c->ptr_start = data;
   41|  61.4k|    c->ptr_end = &c->ptr_start[sz];
   42|  61.4k|    c->state = 0;
   43|  61.4k|    c->bits_left = 0;
   44|  61.4k|    c->error = 0;
   45|  61.4k|}
dav1d_get_bit:
   47|  1.44M|unsigned dav1d_get_bit(GetBits *const c) {
   48|  1.44M|    if (!c->bits_left) {
  ------------------
  |  Branch (48:9): [True: 235k, False: 1.20M]
  ------------------
   49|   235k|        if (c->ptr >= c->ptr_end) {
  ------------------
  |  Branch (49:13): [True: 327, False: 234k]
  ------------------
   50|    327|            c->error = 1;
   51|   234k|        } else {
   52|   234k|            const unsigned state = *c->ptr++;
   53|   234k|            c->bits_left = 7;
   54|   234k|            c->state = (uint64_t) state << 57;
   55|   234k|            return state >> 7;
   56|   234k|        }
   57|   235k|    }
   58|       |
   59|  1.20M|    const uint64_t state = c->state;
   60|  1.20M|    c->bits_left--;
   61|  1.20M|    c->state = state << 1;
   62|  1.20M|    return (unsigned) (state >> 63);
   63|  1.44M|}
dav1d_get_bits:
   81|   698k|type name(GetBits *const c, const int n) {      \
   82|   698k|    assert(n > 0 && n <= 32);                   \
  ------------------
  |  |  140|  1.39M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 698k, False: 0]
  |  |  |  Branch (140:30): [True: 698k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 698k]
  |  |  ------------------
  ------------------
   83|   698k|    /* Unsigned cast avoids refill after eob */ \
   84|   698k|    if ((unsigned) n > (unsigned) c->bits_left) \
  ------------------
  |  Branch (84:9): [True: 382k, False: 316k]
  ------------------
   85|   698k|        refill(c, n);                           \
   86|   698k|    const uint64_t state = c->state;            \
   87|   698k|    c->bits_left -= n;                          \
   88|   698k|    c->state = state << n;                      \
   89|   698k|    return (type) ((type64) state >> (64 - n)); \
   90|   698k|}
dav1d_get_sbits:
   81|   110k|type name(GetBits *const c, const int n) {      \
   82|   110k|    assert(n > 0 && n <= 32);                   \
  ------------------
  |  |  140|   221k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 110k, False: 0]
  |  |  |  Branch (140:30): [True: 110k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 110k]
  |  |  ------------------
  ------------------
   83|   110k|    /* Unsigned cast avoids refill after eob */ \
   84|   110k|    if ((unsigned) n > (unsigned) c->bits_left) \
  ------------------
  |  Branch (84:9): [True: 88.4k, False: 22.1k]
  ------------------
   85|   110k|        refill(c, n);                           \
   86|   110k|    const uint64_t state = c->state;            \
   87|   110k|    c->bits_left -= n;                          \
   88|   110k|    c->state = state << n;                      \
   89|   110k|    return (type) ((type64) state >> (64 - n)); \
   90|   110k|}
dav1d_get_uleb128:
   95|  55.4k|unsigned dav1d_get_uleb128(GetBits *const c) {
   96|  55.4k|    uint64_t val = 0;
   97|  55.4k|    unsigned i = 0, more;
   98|       |
   99|  61.4k|    do {
  100|  61.4k|        const int v = dav1d_get_bits(c, 8);
  101|  61.4k|        more = v & 0x80;
  102|  61.4k|        val |= ((uint64_t) (v & 0x7F)) << i;
  103|  61.4k|        i += 7;
  104|  61.4k|    } while (more && i < 56);
  ------------------
  |  Branch (104:14): [True: 6.03k, False: 55.4k]
  |  Branch (104:22): [True: 6.01k, False: 23]
  ------------------
  105|       |
  106|  55.4k|    if (val > UINT32_MAX || more) {
  ------------------
  |  Branch (106:9): [True: 98, False: 55.3k]
  |  Branch (106:29): [True: 3, False: 55.3k]
  ------------------
  107|    101|        c->error = 1;
  108|    101|        return 0;
  109|    101|    }
  110|       |
  111|  55.3k|    return (unsigned) val;
  112|  55.4k|}
dav1d_get_uniform:
  114|  11.6k|unsigned dav1d_get_uniform(GetBits *const c, const unsigned max) {
  115|       |    // Output in range [0..max-1]
  116|       |    // max must be > 1, or else nothing is read from the bitstream
  117|  11.6k|    assert(max > 1);
  ------------------
  |  |  140|  11.6k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 11.6k]
  |  |  |  Branch (140:68): [Folded, False: 11.6k]
  |  |  ------------------
  ------------------
  118|  11.6k|    const int l = ulog2(max) + 1;
  119|  11.6k|    assert(l > 1);
  ------------------
  |  |  140|  11.6k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 11.6k]
  |  |  |  Branch (140:68): [Folded, False: 11.6k]
  |  |  ------------------
  ------------------
  120|  11.6k|    const unsigned m = (1U << l) - max;
  121|  11.6k|    const unsigned v = dav1d_get_bits(c, l - 1);
  122|  11.6k|    return v < m ? v : (v << 1) - m + dav1d_get_bit(c);
  ------------------
  |  Branch (122:12): [True: 8.38k, False: 3.23k]
  ------------------
  123|  11.6k|}
dav1d_get_vlc:
  125|    310|unsigned dav1d_get_vlc(GetBits *const c) {
  126|    310|    if (dav1d_get_bit(c))
  ------------------
  |  Branch (126:9): [True: 135, False: 175]
  ------------------
  127|    135|        return 0;
  128|       |
  129|    175|    int n_bits = 0;
  130|  1.12k|    do {
  131|  1.12k|        if (++n_bits == 32)
  ------------------
  |  Branch (131:13): [True: 9, False: 1.11k]
  ------------------
  132|      9|            return UINT32_MAX;
  133|  1.12k|    } while (!dav1d_get_bit(c));
  ------------------
  |  Branch (133:14): [True: 946, False: 166]
  ------------------
  134|       |
  135|    166|    return ((1U << n_bits) - 1) + dav1d_get_bits(c, n_bits);
  136|    175|}
dav1d_get_bits_subexp:
  162|  20.0k|int dav1d_get_bits_subexp(GetBits *const c, const int ref, const unsigned n) {
  163|  20.0k|    return (int) get_bits_subexp_u(c, ref + (1 << n), 2 << n) - (1 << n);
  164|  20.0k|}
getbits.c:refill:
   65|   470k|static inline void refill(GetBits *const c, const int n) {
   66|   470k|    assert(c->bits_left >= 0 && c->bits_left < 32);
  ------------------
  |  |  140|   941k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 470k, False: 0]
  |  |  |  Branch (140:30): [True: 470k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 470k]
  |  |  ------------------
  ------------------
   67|   470k|    unsigned state = 0;
   68|   498k|    do {
   69|   498k|        if (c->ptr >= c->ptr_end) {
  ------------------
  |  Branch (69:13): [True: 984, False: 497k]
  ------------------
   70|    984|            c->error = 1;
   71|    984|            if (state) break;
  ------------------
  |  Branch (71:17): [True: 102, False: 882]
  ------------------
   72|    882|            return;
   73|    984|        }
   74|   497k|        state = (state << 8) | *c->ptr++;
   75|   497k|        c->bits_left += 8;
   76|   497k|    } while (n > c->bits_left);
  ------------------
  |  Branch (76:14): [True: 28.1k, False: 469k]
  ------------------
   77|   469k|    c->state |= (uint64_t) state << (64 - c->bits_left);
   78|   469k|}
getbits.c:get_bits_subexp_u:
  140|  20.0k|{
  141|  20.0k|    unsigned v = 0;
  142|       |
  143|  36.2k|    for (int i = 0;; i++) {
  144|  36.2k|        const int b = i ? 3 + i - 1 : 3;
  ------------------
  |  Branch (144:23): [True: 16.2k, False: 20.0k]
  ------------------
  145|       |
  146|  36.2k|        if (n < v + 3 * (1 << b)) {
  ------------------
  |  Branch (146:13): [True: 338, False: 35.9k]
  ------------------
  147|    338|            v += dav1d_get_uniform(c, n - v + 1);
  148|    338|            break;
  149|    338|        }
  150|       |
  151|  35.9k|        if (!dav1d_get_bit(c)) {
  ------------------
  |  Branch (151:13): [True: 19.6k, False: 16.2k]
  ------------------
  152|  19.6k|            v += dav1d_get_bits(c, b);
  153|  19.6k|            break;
  154|  19.6k|        }
  155|       |
  156|  16.2k|        v += 1 << b;
  157|  16.2k|    }
  158|       |
  159|  20.0k|    return ref * 2 <= n ? inv_recenter(ref, v) : n - inv_recenter(n - ref, v);
  ------------------
  |  Branch (159:12): [True: 19.5k, False: 480]
  ------------------
  160|  20.0k|}

obu.c:dav1d_bytealign_get_bits:
   52|  43.6k|static inline void dav1d_bytealign_get_bits(GetBits *c) {
   53|       |    // bits_left is never more than 7, because it is only incremented
   54|       |    // by refill(), called by dav1d_get_bits and that never reads more
   55|       |    // than 7 bits more than it needs.
   56|       |    //
   57|       |    // If this wasn't true, we would need to work out how many bits to
   58|       |    // discard (bits_left % 8), subtract that from bits_left and then
   59|       |    // shift state right by that amount.
   60|  43.6k|    assert(c->bits_left <= 7);
  ------------------
  |  |  140|  43.6k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 43.6k]
  |  |  |  Branch (140:68): [Folded, False: 43.6k]
  |  |  ------------------
  ------------------
   61|       |
   62|  43.6k|    c->bits_left = 0;
   63|  43.6k|    c->state = 0;
   64|  43.6k|}

dav1d_init_intra_edge_tree:
  126|      1|COLD void dav1d_init_intra_edge_tree(void) {
  127|       |    // This function is guaranteed to be called only once
  128|      1|    struct ModeSelMem mem;
  129|       |
  130|      1|    mem.nwc[BL_128X128] = &nodes.branch_sb128[1];
  131|      1|    mem.nwc[BL_64X64] = &nodes.branch_sb128[1 + 4];
  132|      1|    mem.nwc[BL_32X32] = &nodes.branch_sb128[1 + 4 + 16];
  133|      1|    mem.nt = nodes.tip_sb128;
  134|      1|    init_mode_node(nodes.branch_sb128, BL_128X128, &mem, 1, 0);
  135|      1|    assert(mem.nwc[BL_128X128] == &nodes.branch_sb128[1 + 4]);
  ------------------
  |  |  140|      1|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1]
  |  |  |  Branch (140:68): [Folded, False: 1]
  |  |  ------------------
  ------------------
  136|      1|    assert(mem.nwc[BL_64X64] == &nodes.branch_sb128[1 + 4 + 16]);
  ------------------
  |  |  140|      1|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1]
  |  |  |  Branch (140:68): [Folded, False: 1]
  |  |  ------------------
  ------------------
  137|      1|    assert(mem.nwc[BL_32X32] == &nodes.branch_sb128[1 + 4 + 16 + 64]);
  ------------------
  |  |  140|      1|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1]
  |  |  |  Branch (140:68): [Folded, False: 1]
  |  |  ------------------
  ------------------
  138|      1|    assert(mem.nt == &nodes.tip_sb128[256]);
  ------------------
  |  |  140|      1|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1]
  |  |  |  Branch (140:68): [Folded, False: 1]
  |  |  ------------------
  ------------------
  139|       |
  140|      1|    mem.nwc[BL_128X128] = NULL;
  141|      1|    mem.nwc[BL_64X64] = &nodes.branch_sb64[1];
  142|      1|    mem.nwc[BL_32X32] = &nodes.branch_sb64[1 + 4];
  143|      1|    mem.nt = nodes.tip_sb64;
  144|      1|    init_mode_node(nodes.branch_sb64, BL_64X64, &mem, 1, 0);
  145|      1|    assert(mem.nwc[BL_64X64] == &nodes.branch_sb64[1 + 4]);
  ------------------
  |  |  140|      1|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1]
  |  |  |  Branch (140:68): [Folded, False: 1]
  |  |  ------------------
  ------------------
  146|      1|    assert(mem.nwc[BL_32X32] == &nodes.branch_sb64[1 + 4 + 16]);
  ------------------
  |  |  140|      1|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1]
  |  |  |  Branch (140:68): [Folded, False: 1]
  |  |  ------------------
  ------------------
  147|      1|    assert(mem.nt == &nodes.tip_sb64[64]);
  ------------------
  |  |  140|      1|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1]
  |  |  |  Branch (140:68): [Folded, False: 1]
  |  |  ------------------
  ------------------
  148|      1|}
intra_edge.c:init_mode_node:
  101|    106|{
  102|    106|    init_edges(&nwc->node, bl,
  103|    106|               (top_has_right ? EDGE_ALL_TOP_HAS_RIGHT : 0) |
  ------------------
  |  Branch (103:17): [True: 73, False: 33]
  ------------------
  104|    106|               (left_has_bottom ? EDGE_ALL_LEFT_HAS_BOTTOM : 0));
  ------------------
  |  Branch (104:17): [True: 33, False: 73]
  ------------------
  105|    106|    if (bl == BL_16X16) {
  ------------------
  |  Branch (105:9): [True: 80, False: 26]
  ------------------
  106|    400|        for (int n = 0; n < 4; n++) {
  ------------------
  |  Branch (106:25): [True: 320, False: 80]
  ------------------
  107|    320|            EdgeTip *const nt = mem->nt++;
  108|    320|            nwc->split_offset[n] = PTR_OFFSET(nwc, nt);
  ------------------
  |  |   94|    320|#define PTR_OFFSET(a, b) ((uint16_t)((uintptr_t)(b) - (uintptr_t)(a)))
  ------------------
  109|    320|            init_edges(&nt->node, bl + 1,
  110|    320|                       ((n == 3 || (n == 1 && !top_has_right)) ? 0 :
  ------------------
  |  Branch (110:26): [True: 80, False: 240]
  |  Branch (110:37): [True: 80, False: 160]
  |  Branch (110:47): [True: 26, False: 54]
  ------------------
  111|    320|                        EDGE_ALL_TOP_HAS_RIGHT) |
  112|    320|                       (!(n == 0 || (n == 2 && left_has_bottom)) ? 0 :
  ------------------
  |  Branch (112:27): [True: 80, False: 240]
  |  Branch (112:38): [True: 80, False: 160]
  |  Branch (112:48): [True: 26, False: 54]
  ------------------
  113|    320|                        EDGE_ALL_LEFT_HAS_BOTTOM));
  114|    320|        }
  115|     80|    } else {
  116|    130|        for (int n = 0; n < 4; n++) {
  ------------------
  |  Branch (116:25): [True: 104, False: 26]
  ------------------
  117|    104|            EdgeBranch *const nwc_child = mem->nwc[bl]++;
  118|    104|            nwc->split_offset[n] = PTR_OFFSET(nwc, nwc_child);
  ------------------
  |  |   94|    104|#define PTR_OFFSET(a, b) ((uint16_t)((uintptr_t)(b) - (uintptr_t)(a)))
  ------------------
  119|    104|            init_mode_node(nwc_child, bl + 1, mem,
  120|    104|                           !(n == 3 || (n == 1 && !top_has_right)),
  ------------------
  |  Branch (120:30): [True: 26, False: 78]
  |  Branch (120:41): [True: 26, False: 52]
  |  Branch (120:51): [True: 7, False: 19]
  ------------------
  121|    104|                           n == 0 || (n == 2 && left_has_bottom));
  ------------------
  |  Branch (121:28): [True: 26, False: 78]
  |  Branch (121:39): [True: 26, False: 52]
  |  Branch (121:49): [True: 7, False: 19]
  ------------------
  122|    104|        }
  123|     26|    }
  124|    106|}
intra_edge.c:init_edges:
   58|    426|{
   59|    426|    node->o = edge_flags;
   60|    426|    node->h[0] = edge_flags | EDGE_ALL_LEFT_HAS_BOTTOM;
   61|    426|    node->v[0] = edge_flags | EDGE_ALL_TOP_HAS_RIGHT;
   62|       |
   63|    426|    if (bl == BL_8X8) {
  ------------------
  |  Branch (63:9): [True: 320, False: 106]
  ------------------
   64|    320|        EdgeTip *const nt = (EdgeTip *) node;
   65|       |
   66|    320|        node->h[1] = edge_flags & (EDGE_ALL_LEFT_HAS_BOTTOM |
   67|    320|                                   EDGE_I420_TOP_HAS_RIGHT);
   68|    320|        node->v[1] = edge_flags & (EDGE_ALL_TOP_HAS_RIGHT |
   69|    320|                                   EDGE_I420_LEFT_HAS_BOTTOM |
   70|    320|                                   EDGE_I422_LEFT_HAS_BOTTOM);
   71|       |
   72|    320|        nt->split[0] = (edge_flags & EDGE_ALL_TOP_HAS_RIGHT) |
   73|    320|                       EDGE_I422_LEFT_HAS_BOTTOM;
   74|    320|        nt->split[1] = edge_flags | EDGE_I444_TOP_HAS_RIGHT;
   75|    320|        nt->split[2] = edge_flags & (EDGE_I420_TOP_HAS_RIGHT |
   76|    320|                                     EDGE_I420_LEFT_HAS_BOTTOM |
   77|    320|                                     EDGE_I422_LEFT_HAS_BOTTOM);
   78|    320|    } else {
   79|    106|        EdgeBranch *const nwc = (EdgeBranch *) node;
   80|       |
   81|    106|        node->h[1] = edge_flags & EDGE_ALL_LEFT_HAS_BOTTOM;
   82|    106|        node->v[1] = edge_flags & EDGE_ALL_TOP_HAS_RIGHT;
   83|       |
   84|    106|        nwc->h4 = EDGE_ALL_LEFT_HAS_BOTTOM;
   85|    106|        nwc->v4 = EDGE_ALL_TOP_HAS_RIGHT;
   86|    106|        if (bl == BL_16X16) {
  ------------------
  |  Branch (86:13): [True: 80, False: 26]
  ------------------
   87|     80|            nwc->h4 |= edge_flags & EDGE_I420_TOP_HAS_RIGHT;
   88|     80|            nwc->v4 |= edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM |
   89|     80|                                     EDGE_I422_LEFT_HAS_BOTTOM);
   90|     80|        }
   91|    106|    }
   92|    426|}

recon_tmpl.c:sm_flag:
   95|  3.92M|static inline int sm_flag(const BlockContext *const b, const int idx) {
   96|  3.92M|    if (!b->intra[idx]) return 0;
  ------------------
  |  Branch (96:9): [True: 59.3k, False: 3.86M]
  ------------------
   97|  3.86M|    const enum IntraPredMode m = b->mode[idx];
   98|  3.86M|    return (m == SMOOTH_PRED || m == SMOOTH_H_PRED ||
  ------------------
  |  Branch (98:13): [True: 157k, False: 3.70M]
  |  Branch (98:33): [True: 56.6k, False: 3.65M]
  ------------------
   99|  3.65M|            m == SMOOTH_V_PRED) ? ANGLE_SMOOTH_EDGE_FLAG : 0;
  ------------------
  |  |   93|   277k|#define ANGLE_SMOOTH_EDGE_FLAG      512
  ------------------
  |  Branch (99:13): [True: 37.0k, False: 3.61M]
  ------------------
  100|  3.92M|}
recon_tmpl.c:sm_uv_flag:
  102|  3.03M|static inline int sm_uv_flag(const BlockContext *const b, const int idx) {
  103|  3.03M|    const enum IntraPredMode m = b->uvmode[idx];
  104|  3.03M|    return (m == SMOOTH_PRED || m == SMOOTH_H_PRED ||
  ------------------
  |  Branch (104:13): [True: 94.0k, False: 2.93M]
  |  Branch (104:33): [True: 40.0k, False: 2.89M]
  ------------------
  105|  2.89M|            m == SMOOTH_V_PRED) ? ANGLE_SMOOTH_EDGE_FLAG : 0;
  ------------------
  |  |   93|   162k|#define ANGLE_SMOOTH_EDGE_FLAG      512
  ------------------
  |  Branch (105:13): [True: 25.9k, False: 2.87M]
  ------------------
  106|  3.03M|}

dav1d_prepare_intra_edges_8bpc:
   86|  4.76M|{
   87|  4.76M|    const int bitdepth = bitdepth_from_max(bitdepth_max);
  ------------------
  |  |   58|  4.76M|#define bitdepth_from_max(x) 8
  ------------------
   88|  4.76M|    assert(y < h && x < w);
  ------------------
  |  |  140|  9.53M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 4.76M, False: 531]
  |  |  |  Branch (140:30): [True: 4.76M, False: 18.4E]
  |  |  |  Branch (140:68): [Folded, False: 4.76M]
  |  |  ------------------
  ------------------
   89|       |
   90|  4.76M|    switch (mode) {
   91|   151k|    case VERT_PRED:
  ------------------
  |  Branch (91:5): [True: 151k, False: 4.61M]
  ------------------
   92|   366k|    case HOR_PRED:
  ------------------
  |  Branch (92:5): [True: 214k, False: 4.55M]
  ------------------
   93|   422k|    case DIAG_DOWN_LEFT_PRED:
  ------------------
  |  Branch (93:5): [True: 56.1k, False: 4.71M]
  ------------------
   94|   476k|    case DIAG_DOWN_RIGHT_PRED:
  ------------------
  |  Branch (94:5): [True: 54.1k, False: 4.71M]
  ------------------
   95|   517k|    case VERT_RIGHT_PRED:
  ------------------
  |  Branch (95:5): [True: 41.3k, False: 4.72M]
  ------------------
   96|   571k|    case HOR_DOWN_PRED:
  ------------------
  |  Branch (96:5): [True: 53.3k, False: 4.71M]
  ------------------
   97|   666k|    case HOR_UP_PRED:
  ------------------
  |  Branch (97:5): [True: 94.8k, False: 4.67M]
  ------------------
   98|   741k|    case VERT_LEFT_PRED: {
  ------------------
  |  Branch (98:5): [True: 75.2k, False: 4.69M]
  ------------------
   99|   741k|        *angle = av1_mode_to_angle_map[mode - VERT_PRED] + 3 * *angle;
  100|       |
  101|   741k|        if (*angle <= 90)
  ------------------
  |  Branch (101:13): [True: 243k, False: 497k]
  ------------------
  102|   243k|            mode = *angle < 90 && have_top ? Z1_PRED : VERT_PRED;
  ------------------
  |  Branch (102:20): [True: 157k, False: 86.4k]
  |  Branch (102:35): [True: 102k, False: 55.5k]
  ------------------
  103|   497k|        else if (*angle < 180)
  ------------------
  |  Branch (103:18): [True: 235k, False: 261k]
  ------------------
  104|   235k|            mode = Z2_PRED;
  105|   261k|        else
  106|   261k|            mode = *angle > 180 && have_left ? Z3_PRED : HOR_PRED;
  ------------------
  |  Branch (106:20): [True: 149k, False: 112k]
  |  Branch (106:36): [True: 131k, False: 18.1k]
  ------------------
  107|   741k|        break;
  108|   666k|    }
  109|  2.06M|    case DC_PRED:
  ------------------
  |  Branch (109:5): [True: 2.06M, False: 2.70M]
  ------------------
  110|  3.53M|    case PAETH_PRED:
  ------------------
  |  Branch (110:5): [True: 1.46M, False: 3.30M]
  ------------------
  111|  3.53M|        mode = av1_mode_conv[mode][have_left][have_top];
  112|  3.53M|        break;
  113|   520k|    default:
  ------------------
  |  Branch (113:5): [True: 520k, False: 4.24M]
  ------------------
  114|   520k|        break;
  115|  4.76M|    }
  116|       |
  117|  4.76M|    const pixel *dst_top;
  118|  4.76M|    if (have_top &&
  ------------------
  |  Branch (118:9): [True: 3.13M, False: 1.63M]
  ------------------
  119|  3.13M|        (av1_intra_prediction_edges[mode].needs_top ||
  ------------------
  |  Branch (119:10): [True: 2.96M, False: 167k]
  ------------------
  120|   167k|         av1_intra_prediction_edges[mode].needs_topleft ||
  ------------------
  |  Branch (120:10): [True: 75.7k, False: 91.6k]
  ------------------
  121|  91.6k|         (av1_intra_prediction_edges[mode].needs_left && !have_left)))
  ------------------
  |  Branch (121:11): [True: 91.6k, False: 0]
  |  Branch (121:58): [True: 29.3k, False: 62.3k]
  ------------------
  122|  3.07M|    {
  123|  3.07M|        if (prefilter_toplevel_sb_edge) {
  ------------------
  |  Branch (123:13): [True: 151k, False: 2.92M]
  ------------------
  124|   151k|            dst_top = &prefilter_toplevel_sb_edge[x * 4];
  125|  2.92M|        } else {
  126|  2.92M|            dst_top = &dst[-PXSTRIDE(stride)];
  ------------------
  |  |   53|  2.92M|#define PXSTRIDE(x) (x)
  ------------------
  127|  2.92M|        }
  128|  3.07M|    }
  129|       |
  130|  4.76M|    if (av1_intra_prediction_edges[mode].needs_left) {
  ------------------
  |  Branch (130:9): [True: 4.34M, False: 424k]
  ------------------
  131|  4.34M|        const int sz = th << 2;
  132|  4.34M|        pixel *const left = &topleft_out[-sz];
  133|       |
  134|  4.34M|        if (have_left) {
  ------------------
  |  Branch (134:13): [True: 4.20M, False: 134k]
  ------------------
  135|  4.20M|            const int px_have = imin(sz, (h - y) << 2);
  136|       |
  137|  28.8M|            for (int i = 0; i < px_have; i++)
  ------------------
  |  Branch (137:29): [True: 24.6M, False: 4.20M]
  ------------------
  138|  24.6M|                left[sz - 1 - i] = dst[PXSTRIDE(stride) * i - 1];
  ------------------
  |  |   53|  24.6M|#define PXSTRIDE(x) (x)
  ------------------
  139|  4.20M|            if (px_have < sz)
  ------------------
  |  Branch (139:17): [True: 224k, False: 3.98M]
  ------------------
  140|   224k|                pixel_set(left, left[sz - px_have], sz - px_have);
  ------------------
  |  |   48|   224k|#define pixel_set memset
  ------------------
  141|  4.20M|        } else {
  142|   134k|            pixel_set(left, have_top ? *dst_top : ((1 << bitdepth) >> 1) + 1, sz);
  ------------------
  |  |   48|   134k|#define pixel_set memset
  ------------------
  |  Branch (142:29): [True: 127k, False: 7.45k]
  ------------------
  143|   134k|        }
  144|       |
  145|  4.34M|        if (av1_intra_prediction_edges[mode].needs_bottomleft) {
  ------------------
  |  Branch (145:13): [True: 131k, False: 4.21M]
  ------------------
  146|   131k|            const int have_bottomleft = (!have_left || y + th >= h) ? 0 :
  ------------------
  |  Branch (146:42): [True: 18.4E, False: 131k]
  |  Branch (146:56): [True: 55.0k, False: 76.1k]
  ------------------
  147|   131k|                                        (edge_flags & EDGE_I444_LEFT_HAS_BOTTOM);
  148|       |
  149|   131k|            if (have_bottomleft) {
  ------------------
  |  Branch (149:17): [True: 19.6k, False: 111k]
  ------------------
  150|  19.6k|                const int px_have = imin(sz, (h - y - th) << 2);
  151|       |
  152|   161k|                for (int i = 0; i < px_have; i++)
  ------------------
  |  Branch (152:33): [True: 142k, False: 19.6k]
  ------------------
  153|   142k|                    left[-(i + 1)] = dst[(sz + i) * PXSTRIDE(stride) - 1];
  ------------------
  |  |   53|   142k|#define PXSTRIDE(x) (x)
  ------------------
  154|  19.6k|                if (px_have < sz)
  ------------------
  |  Branch (154:21): [True: 625, False: 19.0k]
  ------------------
  155|    625|                    pixel_set(left - sz, left[-px_have], sz - px_have);
  ------------------
  |  |   48|    625|#define pixel_set memset
  ------------------
  156|   111k|            } else {
  157|   111k|                pixel_set(left - sz, left[0], sz);
  ------------------
  |  |   48|   111k|#define pixel_set memset
  ------------------
  158|   111k|            }
  159|   131k|        }
  160|  4.34M|    }
  161|       |
  162|  4.76M|    if (av1_intra_prediction_edges[mode].needs_top) {
  ------------------
  |  Branch (162:9): [True: 3.32M, False: 1.44M]
  ------------------
  163|  3.32M|        const int sz = tw << 2;
  164|  3.32M|        pixel *const top = &topleft_out[1];
  165|       |
  166|  3.32M|        if (have_top) {
  ------------------
  |  Branch (166:13): [True: 2.97M, False: 353k]
  ------------------
  167|  2.97M|            const int px_have = imin(sz, (w - x) << 2);
  168|  2.97M|            pixel_copy(top, dst_top, px_have);
  ------------------
  |  |   47|  2.97M|#define pixel_copy memcpy
  ------------------
  169|  2.97M|            if (px_have < sz)
  ------------------
  |  Branch (169:17): [True: 121k, False: 2.84M]
  ------------------
  170|   121k|                pixel_set(top + px_have, top[px_have - 1], sz - px_have);
  ------------------
  |  |   48|   121k|#define pixel_set memset
  ------------------
  171|  2.97M|        } else {
  172|   353k|            pixel_set(top, have_left ? dst[-1] : ((1 << bitdepth) >> 1) - 1, sz);
  ------------------
  |  |   48|   353k|#define pixel_set memset
  ------------------
  |  Branch (172:28): [True: 347k, False: 5.44k]
  ------------------
  173|   353k|        }
  174|       |
  175|  3.32M|        if (av1_intra_prediction_edges[mode].needs_topright) {
  ------------------
  |  Branch (175:13): [True: 102k, False: 3.22M]
  ------------------
  176|   102k|            const int have_topright = (!have_top || x + tw >= w) ? 0 :
  ------------------
  |  Branch (176:40): [True: 18.4E, False: 102k]
  |  Branch (176:53): [True: 18.2k, False: 83.8k]
  ------------------
  177|   102k|                                      (edge_flags & EDGE_I444_TOP_HAS_RIGHT);
  178|       |
  179|   102k|            if (have_topright) {
  ------------------
  |  Branch (179:17): [True: 65.8k, False: 36.1k]
  ------------------
  180|  65.8k|                const int px_have = imin(sz, (w - x - tw) << 2);
  181|       |
  182|  65.8k|                pixel_copy(top + sz, &dst_top[sz], px_have);
  ------------------
  |  |   47|  65.8k|#define pixel_copy memcpy
  ------------------
  183|  65.8k|                if (px_have < sz)
  ------------------
  |  Branch (183:21): [True: 931, False: 64.9k]
  ------------------
  184|    931|                    pixel_set(top + sz + px_have, top[sz + px_have - 1],
  ------------------
  |  |   48|    931|#define pixel_set memset
  ------------------
  185|    931|                              sz - px_have);
  186|  65.8k|            } else {
  187|  36.1k|                pixel_set(top + sz, top[sz - 1], sz);
  ------------------
  |  |   48|  36.1k|#define pixel_set memset
  ------------------
  188|  36.1k|            }
  189|   102k|        }
  190|  3.32M|    }
  191|       |
  192|  4.76M|    if (av1_intra_prediction_edges[mode].needs_topleft) {
  ------------------
  |  Branch (192:9): [True: 1.67M, False: 3.08M]
  ------------------
  193|  1.67M|        if (have_left)
  ------------------
  |  Branch (193:13): [True: 1.61M, False: 66.9k]
  ------------------
  194|  1.61M|            *topleft_out = have_top ? dst_top[-1] : dst[-1];
  ------------------
  |  Branch (194:28): [True: 1.43M, False: 180k]
  ------------------
  195|  66.9k|        else
  196|  66.9k|            *topleft_out = have_top ? *dst_top : (1 << bitdepth) >> 1;
  ------------------
  |  Branch (196:28): [True: 63.3k, False: 3.57k]
  ------------------
  197|       |
  198|  1.67M|        if (mode == Z2_PRED && tw + th >= 6 && filter_edge)
  ------------------
  |  Branch (198:13): [True: 235k, False: 1.44M]
  |  Branch (198:32): [True: 71.3k, False: 164k]
  |  Branch (198:48): [True: 32.7k, False: 38.5k]
  ------------------
  199|  32.7k|            *topleft_out = ((topleft_out[-1] + topleft_out[1]) * 5 +
  200|  32.7k|                            topleft_out[0] * 6 + 8) >> 4;
  201|  1.67M|    }
  202|       |
  203|  4.76M|    return mode;
  204|  4.76M|}
dav1d_prepare_intra_edges_16bpc:
   86|  3.94M|{
   87|  3.94M|    const int bitdepth = bitdepth_from_max(bitdepth_max);
  ------------------
  |  |   75|  3.94M|#define bitdepth_from_max(bitdepth_max) (32 - clz(bitdepth_max))
  ------------------
   88|  3.94M|    assert(y < h && x < w);
  ------------------
  |  |  140|  7.89M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 3.94M, False: 2.43k]
  |  |  |  Branch (140:30): [True: 3.94M, False: 195]
  |  |  |  Branch (140:68): [Folded, False: 3.94M]
  |  |  ------------------
  ------------------
   89|       |
   90|  3.94M|    switch (mode) {
   91|   165k|    case VERT_PRED:
  ------------------
  |  Branch (91:5): [True: 165k, False: 3.78M]
  ------------------
   92|   422k|    case HOR_PRED:
  ------------------
  |  Branch (92:5): [True: 257k, False: 3.69M]
  ------------------
   93|   485k|    case DIAG_DOWN_LEFT_PRED:
  ------------------
  |  Branch (93:5): [True: 63.0k, False: 3.88M]
  ------------------
   94|   540k|    case DIAG_DOWN_RIGHT_PRED:
  ------------------
  |  Branch (94:5): [True: 54.9k, False: 3.89M]
  ------------------
   95|   584k|    case VERT_RIGHT_PRED:
  ------------------
  |  Branch (95:5): [True: 44.0k, False: 3.90M]
  ------------------
   96|   644k|    case HOR_DOWN_PRED:
  ------------------
  |  Branch (96:5): [True: 60.0k, False: 3.88M]
  ------------------
   97|   778k|    case HOR_UP_PRED:
  ------------------
  |  Branch (97:5): [True: 133k, False: 3.81M]
  ------------------
   98|   852k|    case VERT_LEFT_PRED: {
  ------------------
  |  Branch (98:5): [True: 73.7k, False: 3.87M]
  ------------------
   99|   852k|        *angle = av1_mode_to_angle_map[mode - VERT_PRED] + 3 * *angle;
  100|       |
  101|   852k|        if (*angle <= 90)
  ------------------
  |  Branch (101:13): [True: 263k, False: 588k]
  ------------------
  102|   263k|            mode = *angle < 90 && have_top ? Z1_PRED : VERT_PRED;
  ------------------
  |  Branch (102:20): [True: 176k, False: 87.0k]
  |  Branch (102:35): [True: 132k, False: 43.9k]
  ------------------
  103|   588k|        else if (*angle < 180)
  ------------------
  |  Branch (103:18): [True: 257k, False: 330k]
  ------------------
  104|   257k|            mode = Z2_PRED;
  105|   330k|        else
  106|   330k|            mode = *angle > 180 && have_left ? Z3_PRED : HOR_PRED;
  ------------------
  |  Branch (106:20): [True: 195k, False: 135k]
  |  Branch (106:36): [True: 146k, False: 48.6k]
  ------------------
  107|   852k|        break;
  108|   778k|    }
  109|  1.86M|    case DC_PRED:
  ------------------
  |  Branch (109:5): [True: 1.86M, False: 2.08M]
  ------------------
  110|  2.50M|    case PAETH_PRED:
  ------------------
  |  Branch (110:5): [True: 639k, False: 3.30M]
  ------------------
  111|  2.50M|        mode = av1_mode_conv[mode][have_left][have_top];
  112|  2.50M|        break;
  113|   609k|    default:
  ------------------
  |  Branch (113:5): [True: 609k, False: 3.33M]
  ------------------
  114|   609k|        break;
  115|  3.94M|    }
  116|       |
  117|  3.95M|    const pixel *dst_top;
  118|  3.95M|    if (have_top &&
  ------------------
  |  Branch (118:9): [True: 2.61M, False: 1.33M]
  ------------------
  119|  2.61M|        (av1_intra_prediction_edges[mode].needs_top ||
  ------------------
  |  Branch (119:10): [True: 2.37M, False: 240k]
  ------------------
  120|   240k|         av1_intra_prediction_edges[mode].needs_topleft ||
  ------------------
  |  Branch (120:10): [True: 96.2k, False: 144k]
  ------------------
  121|   144k|         (av1_intra_prediction_edges[mode].needs_left && !have_left)))
  ------------------
  |  Branch (121:11): [True: 144k, False: 18.4E]
  |  Branch (121:58): [True: 77.1k, False: 67.2k]
  ------------------
  122|  2.55M|    {
  123|  2.55M|        if (prefilter_toplevel_sb_edge) {
  ------------------
  |  Branch (123:13): [True: 218k, False: 2.33M]
  ------------------
  124|   218k|            dst_top = &prefilter_toplevel_sb_edge[x * 4];
  125|  2.33M|        } else {
  126|  2.33M|            dst_top = &dst[-PXSTRIDE(stride)];
  127|  2.33M|        }
  128|  2.55M|    }
  129|       |
  130|  3.95M|    if (av1_intra_prediction_edges[mode].needs_left) {
  ------------------
  |  Branch (130:9): [True: 3.32M, False: 625k]
  ------------------
  131|  3.32M|        const int sz = th << 2;
  132|  3.32M|        pixel *const left = &topleft_out[-sz];
  133|       |
  134|  3.32M|        if (have_left) {
  ------------------
  |  Branch (134:13): [True: 2.99M, False: 330k]
  ------------------
  135|  2.99M|            const int px_have = imin(sz, (h - y) << 2);
  136|       |
  137|  23.4M|            for (int i = 0; i < px_have; i++)
  ------------------
  |  Branch (137:29): [True: 20.4M, False: 2.99M]
  ------------------
  138|  20.4M|                left[sz - 1 - i] = dst[PXSTRIDE(stride) * i - 1];
  139|  2.99M|            if (px_have < sz)
  ------------------
  |  Branch (139:17): [True: 266k, False: 2.72M]
  ------------------
  140|   266k|                pixel_set(left, left[sz - px_have], sz - px_have);
  141|  2.99M|        } else {
  142|   330k|            pixel_set(left, have_top ? *dst_top : ((1 << bitdepth) >> 1) + 1, sz);
  ------------------
  |  Branch (142:29): [True: 320k, False: 10.1k]
  ------------------
  143|   330k|        }
  144|       |
  145|  3.32M|        if (av1_intra_prediction_edges[mode].needs_bottomleft) {
  ------------------
  |  Branch (145:13): [True: 146k, False: 3.17M]
  ------------------
  146|   146k|            const int have_bottomleft = (!have_left || y + th >= h) ? 0 :
  ------------------
  |  Branch (146:42): [True: 18.4E, False: 146k]
  |  Branch (146:56): [True: 51.7k, False: 95.1k]
  ------------------
  147|   146k|                                        (edge_flags & EDGE_I444_LEFT_HAS_BOTTOM);
  148|       |
  149|   146k|            if (have_bottomleft) {
  ------------------
  |  Branch (149:17): [True: 22.9k, False: 123k]
  ------------------
  150|  22.9k|                const int px_have = imin(sz, (h - y - th) << 2);
  151|       |
  152|   203k|                for (int i = 0; i < px_have; i++)
  ------------------
  |  Branch (152:33): [True: 180k, False: 22.9k]
  ------------------
  153|   180k|                    left[-(i + 1)] = dst[(sz + i) * PXSTRIDE(stride) - 1];
  154|  22.9k|                if (px_have < sz)
  ------------------
  |  Branch (154:21): [True: 1.31k, False: 21.6k]
  ------------------
  155|  1.31k|                    pixel_set(left - sz, left[-px_have], sz - px_have);
  156|   123k|            } else {
  157|   123k|                pixel_set(left - sz, left[0], sz);
  158|   123k|            }
  159|   146k|        }
  160|  3.32M|    }
  161|       |
  162|  3.95M|    if (av1_intra_prediction_edges[mode].needs_top) {
  ------------------
  |  Branch (162:9): [True: 2.68M, False: 1.26M]
  ------------------
  163|  2.68M|        const int sz = tw << 2;
  164|  2.68M|        pixel *const top = &topleft_out[1];
  165|       |
  166|  2.68M|        if (have_top) {
  ------------------
  |  Branch (166:13): [True: 2.37M, False: 305k]
  ------------------
  167|  2.37M|            const int px_have = imin(sz, (w - x) << 2);
  168|  2.37M|            pixel_copy(top, dst_top, px_have);
  ------------------
  |  |   65|  2.37M|#define pixel_copy(a, b, c) memcpy(a, b, (c) << 1)
  ------------------
  169|  2.37M|            if (px_have < sz)
  ------------------
  |  Branch (169:17): [True: 245k, False: 2.13M]
  ------------------
  170|   245k|                pixel_set(top + px_have, top[px_have - 1], sz - px_have);
  171|  2.37M|        } else {
  172|   305k|            pixel_set(top, have_left ? dst[-1] : ((1 << bitdepth) >> 1) - 1, sz);
  ------------------
  |  Branch (172:28): [True: 297k, False: 7.91k]
  ------------------
  173|   305k|        }
  174|       |
  175|  2.68M|        if (av1_intra_prediction_edges[mode].needs_topright) {
  ------------------
  |  Branch (175:13): [True: 132k, False: 2.55M]
  ------------------
  176|   132k|            const int have_topright = (!have_top || x + tw >= w) ? 0 :
  ------------------
  |  Branch (176:40): [True: 18.4E, False: 132k]
  |  Branch (176:53): [True: 54.8k, False: 77.8k]
  ------------------
  177|   132k|                                      (edge_flags & EDGE_I444_TOP_HAS_RIGHT);
  178|       |
  179|   132k|            if (have_topright) {
  ------------------
  |  Branch (179:17): [True: 57.9k, False: 74.7k]
  ------------------
  180|  57.9k|                const int px_have = imin(sz, (w - x - tw) << 2);
  181|       |
  182|  57.9k|                pixel_copy(top + sz, &dst_top[sz], px_have);
  ------------------
  |  |   65|  57.9k|#define pixel_copy(a, b, c) memcpy(a, b, (c) << 1)
  ------------------
  183|  57.9k|                if (px_have < sz)
  ------------------
  |  Branch (183:21): [True: 1.77k, False: 56.1k]
  ------------------
  184|  1.77k|                    pixel_set(top + sz + px_have, top[sz + px_have - 1],
  185|  1.77k|                              sz - px_have);
  186|  74.7k|            } else {
  187|  74.7k|                pixel_set(top + sz, top[sz - 1], sz);
  188|  74.7k|            }
  189|   132k|        }
  190|  2.68M|    }
  191|       |
  192|  3.95M|    if (av1_intra_prediction_edges[mode].needs_topleft) {
  ------------------
  |  Branch (192:9): [True: 1.03M, False: 2.91M]
  ------------------
  193|  1.03M|        if (have_left)
  ------------------
  |  Branch (193:13): [True: 868k, False: 169k]
  ------------------
  194|   868k|            *topleft_out = have_top ? dst_top[-1] : dst[-1];
  ------------------
  |  Branch (194:28): [True: 713k, False: 154k]
  ------------------
  195|   169k|        else
  196|   169k|            *topleft_out = have_top ? *dst_top : (1 << bitdepth) >> 1;
  ------------------
  |  Branch (196:28): [True: 163k, False: 5.85k]
  ------------------
  197|       |
  198|  1.03M|        if (mode == Z2_PRED && tw + th >= 6 && filter_edge)
  ------------------
  |  Branch (198:13): [True: 257k, False: 780k]
  |  Branch (198:32): [True: 96.6k, False: 161k]
  |  Branch (198:48): [True: 38.0k, False: 58.6k]
  ------------------
  199|  38.0k|            *topleft_out = ((topleft_out[-1] + topleft_out[1]) * 5 +
  200|  38.0k|                            topleft_out[0] * 6 + 8) >> 4;
  201|  1.03M|    }
  202|       |
  203|  3.95M|    return mode;
  204|  3.94M|}

dav1d_intra_pred_dsp_init_8bpc:
  744|  7.82k|COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
  745|  7.82k|    c->intra_pred[DC_PRED      ] = ipred_dc_c;
  746|  7.82k|    c->intra_pred[DC_128_PRED  ] = ipred_dc_128_c;
  747|  7.82k|    c->intra_pred[TOP_DC_PRED  ] = ipred_dc_top_c;
  748|  7.82k|    c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c;
  749|  7.82k|    c->intra_pred[HOR_PRED     ] = ipred_h_c;
  750|  7.82k|    c->intra_pred[VERT_PRED    ] = ipred_v_c;
  751|  7.82k|    c->intra_pred[PAETH_PRED   ] = ipred_paeth_c;
  752|  7.82k|    c->intra_pred[SMOOTH_PRED  ] = ipred_smooth_c;
  753|  7.82k|    c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
  754|  7.82k|    c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
  755|  7.82k|    c->intra_pred[Z1_PRED      ] = ipred_z1_c;
  756|  7.82k|    c->intra_pred[Z2_PRED      ] = ipred_z2_c;
  757|  7.82k|    c->intra_pred[Z3_PRED      ] = ipred_z3_c;
  758|  7.82k|    c->intra_pred[FILTER_PRED  ] = ipred_filter_c;
  759|       |
  760|  7.82k|    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = cfl_ac_420_c;
  761|  7.82k|    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = cfl_ac_422_c;
  762|  7.82k|    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = cfl_ac_444_c;
  763|       |
  764|  7.82k|    c->cfl_pred[DC_PRED     ] = ipred_cfl_c;
  765|  7.82k|    c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c;
  766|  7.82k|    c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c;
  767|  7.82k|    c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c;
  768|       |
  769|  7.82k|    c->pal_pred = pal_pred_c;
  770|       |
  771|  7.82k|#if HAVE_ASM
  772|       |#if ARCH_AARCH64 || ARCH_ARM
  773|       |    intra_pred_dsp_init_arm(c);
  774|       |#elif ARCH_RISCV
  775|       |    intra_pred_dsp_init_riscv(c);
  776|       |#elif ARCH_X86
  777|       |    intra_pred_dsp_init_x86(c);
  778|       |#elif ARCH_LOONGARCH64
  779|       |    intra_pred_dsp_init_loongarch(c);
  780|       |#endif
  781|  7.82k|#endif
  782|  7.82k|}
dav1d_intra_pred_dsp_init_16bpc:
  744|  7.63k|COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
  745|  7.63k|    c->intra_pred[DC_PRED      ] = ipred_dc_c;
  746|  7.63k|    c->intra_pred[DC_128_PRED  ] = ipred_dc_128_c;
  747|  7.63k|    c->intra_pred[TOP_DC_PRED  ] = ipred_dc_top_c;
  748|  7.63k|    c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c;
  749|  7.63k|    c->intra_pred[HOR_PRED     ] = ipred_h_c;
  750|  7.63k|    c->intra_pred[VERT_PRED    ] = ipred_v_c;
  751|  7.63k|    c->intra_pred[PAETH_PRED   ] = ipred_paeth_c;
  752|  7.63k|    c->intra_pred[SMOOTH_PRED  ] = ipred_smooth_c;
  753|  7.63k|    c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
  754|  7.63k|    c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
  755|  7.63k|    c->intra_pred[Z1_PRED      ] = ipred_z1_c;
  756|  7.63k|    c->intra_pred[Z2_PRED      ] = ipred_z2_c;
  757|  7.63k|    c->intra_pred[Z3_PRED      ] = ipred_z3_c;
  758|  7.63k|    c->intra_pred[FILTER_PRED  ] = ipred_filter_c;
  759|       |
  760|  7.63k|    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = cfl_ac_420_c;
  761|  7.63k|    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = cfl_ac_422_c;
  762|  7.63k|    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = cfl_ac_444_c;
  763|       |
  764|  7.63k|    c->cfl_pred[DC_PRED     ] = ipred_cfl_c;
  765|  7.63k|    c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c;
  766|  7.63k|    c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c;
  767|  7.63k|    c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c;
  768|       |
  769|  7.63k|    c->pal_pred = pal_pred_c;
  770|       |
  771|  7.63k|#if HAVE_ASM
  772|       |#if ARCH_AARCH64 || ARCH_ARM
  773|       |    intra_pred_dsp_init_arm(c);
  774|       |#elif ARCH_RISCV
  775|       |    intra_pred_dsp_init_riscv(c);
  776|       |#elif ARCH_X86
  777|       |    intra_pred_dsp_init_x86(c);
  778|       |#elif ARCH_LOONGARCH64
  779|       |    intra_pred_dsp_init_loongarch(c);
  780|       |#endif
  781|  7.63k|#endif
  782|  7.63k|}

itx_1d.c:inv_dct4_1d_internal_c:
   68|  1.81M|{
   69|  1.81M|    assert(stride > 0);
  ------------------
  |  |  140|  1.81M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1.81M]
  |  |  |  Branch (140:68): [Folded, False: 1.81M]
  |  |  ------------------
  ------------------
   70|  1.81M|    const int in0 = c[0 * stride], in1 = c[1 * stride];
   71|       |
   72|  1.81M|    int t0, t1, t2, t3;
   73|  1.81M|    if (tx64) {
  ------------------
  |  Branch (73:9): [True: 901k, False: 916k]
  ------------------
   74|   901k|        t0 = t1 = (in0 * 181 + 128) >> 8;
   75|   901k|        t2 = (in1 * 1567 + 2048) >> 12;
   76|   901k|        t3 = (in1 * 3784 + 2048) >> 12;
   77|   916k|    } else {
   78|   916k|        const int in2 = c[2 * stride], in3 = c[3 * stride];
   79|       |
   80|   916k|        t0 = ((in0 + in2) * 181 + 128) >> 8;
   81|   916k|        t1 = ((in0 - in2) * 181 + 128) >> 8;
   82|   916k|        t2 = ((in1 *  1567         - in3 * (3784 - 4096) + 2048) >> 12) - in3;
   83|   916k|        t3 = ((in1 * (3784 - 4096) + in3 *  1567         + 2048) >> 12) + in1;
   84|   916k|    }
   85|       |
   86|  1.81M|    c[0 * stride] = CLIP(t0 + t3);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
   87|  1.81M|    c[1 * stride] = CLIP(t1 + t2);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
   88|  1.81M|    c[2 * stride] = CLIP(t1 - t2);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
   89|  1.81M|    c[3 * stride] = CLIP(t0 - t3);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
   90|  1.81M|}
itx_1d.c:inv_dct8_1d_internal_c:
  101|  1.81M|{
  102|  1.81M|    assert(stride > 0);
  ------------------
  |  |  140|  1.81M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1.81M]
  |  |  |  Branch (140:68): [Folded, False: 1.81M]
  |  |  ------------------
  ------------------
  103|  1.81M|    inv_dct4_1d_internal_c(c, stride << 1, min, max, tx64);
  104|       |
  105|  1.81M|    const int in1 = c[1 * stride], in3 = c[3 * stride];
  106|       |
  107|  1.81M|    int t4a, t5a, t6a, t7a;
  108|  1.81M|    if (tx64) {
  ------------------
  |  Branch (108:9): [True: 902k, False: 916k]
  ------------------
  109|   902k|        t4a = (in1 *   799 + 2048) >> 12;
  110|   902k|        t5a = (in3 * -2276 + 2048) >> 12;
  111|   902k|        t6a = (in3 *  3406 + 2048) >> 12;
  112|   902k|        t7a = (in1 *  4017 + 2048) >> 12;
  113|   916k|    } else {
  114|   916k|        const int in5 = c[5 * stride], in7 = c[7 * stride];
  115|       |
  116|   916k|        t4a = ((in1 *   799         - in7 * (4017 - 4096) + 2048) >> 12) - in7;
  117|   916k|        t5a =  (in5 *  1703         - in3 *  1138         + 1024) >> 11;
  118|   916k|        t6a =  (in5 *  1138         + in3 *  1703         + 1024) >> 11;
  119|   916k|        t7a = ((in1 * (4017 - 4096) + in7 *  799          + 2048) >> 12) + in1;
  120|   916k|    }
  121|       |
  122|  1.81M|    const int t4  = CLIP(t4a + t5a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  123|  1.81M|              t5a = CLIP(t4a - t5a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  124|  1.81M|    const int t7  = CLIP(t7a + t6a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  125|  1.81M|              t6a = CLIP(t7a - t6a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  126|       |
  127|  1.81M|    const int t5  = ((t6a - t5a) * 181 + 128) >> 8;
  128|  1.81M|    const int t6  = ((t6a + t5a) * 181 + 128) >> 8;
  129|       |
  130|  1.81M|    const int t0 = c[0 * stride];
  131|  1.81M|    const int t1 = c[2 * stride];
  132|  1.81M|    const int t2 = c[4 * stride];
  133|  1.81M|    const int t3 = c[6 * stride];
  134|       |
  135|  1.81M|    c[0 * stride] = CLIP(t0 + t7);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  136|  1.81M|    c[1 * stride] = CLIP(t1 + t6);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  137|  1.81M|    c[2 * stride] = CLIP(t2 + t5);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  138|  1.81M|    c[3 * stride] = CLIP(t3 + t4);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  139|  1.81M|    c[4 * stride] = CLIP(t3 - t4);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  140|  1.81M|    c[5 * stride] = CLIP(t2 - t5);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  141|  1.81M|    c[6 * stride] = CLIP(t1 - t6);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  142|  1.81M|    c[7 * stride] = CLIP(t0 - t7);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  143|  1.81M|}
itx_1d.c:inv_dct16_1d_c:
  242|   305k|{
  243|   305k|    inv_dct16_1d_internal_c(c, stride, min, max, 0);
  244|   305k|}
itx_1d.c:inv_dct16_1d_internal_c:
  154|  1.81M|{
  155|  1.81M|    assert(stride > 0);
  ------------------
  |  |  140|  1.81M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1.81M]
  |  |  |  Branch (140:68): [Folded, False: 1.81M]
  |  |  ------------------
  ------------------
  156|  1.81M|    inv_dct8_1d_internal_c(c, stride << 1, min, max, tx64);
  157|       |
  158|  1.81M|    const int in1 = c[1 * stride], in3 = c[3 * stride];
  159|  1.81M|    const int in5 = c[5 * stride], in7 = c[7 * stride];
  160|       |
  161|  1.81M|    int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
  162|  1.81M|    if (tx64) {
  ------------------
  |  Branch (162:9): [True: 903k, False: 914k]
  ------------------
  163|   903k|        t8a  = (in1 *   401 + 2048) >> 12;
  164|   903k|        t9a  = (in7 * -2598 + 2048) >> 12;
  165|   903k|        t10a = (in5 *  1931 + 2048) >> 12;
  166|   903k|        t11a = (in3 * -1189 + 2048) >> 12;
  167|   903k|        t12a = (in3 *  3920 + 2048) >> 12;
  168|   903k|        t13a = (in5 *  3612 + 2048) >> 12;
  169|   903k|        t14a = (in7 *  3166 + 2048) >> 12;
  170|   903k|        t15a = (in1 *  4076 + 2048) >> 12;
  171|   914k|    } else {
  172|   914k|        const int in9  = c[ 9 * stride], in11 = c[11 * stride];
  173|   914k|        const int in13 = c[13 * stride], in15 = c[15 * stride];
  174|       |
  175|   914k|        t8a  = ((in1  *   401         - in15 * (4076 - 4096) + 2048) >> 12) - in15;
  176|   914k|        t9a  =  (in9  *  1583         - in7  *  1299         + 1024) >> 11;
  177|   914k|        t10a = ((in5  *  1931         - in11 * (3612 - 4096) + 2048) >> 12) - in11;
  178|   914k|        t11a = ((in13 * (3920 - 4096) - in3  *  1189         + 2048) >> 12) + in13;
  179|   914k|        t12a = ((in13 *  1189         + in3  * (3920 - 4096) + 2048) >> 12) + in3;
  180|   914k|        t13a = ((in5  * (3612 - 4096) + in11 *  1931         + 2048) >> 12) + in5;
  181|   914k|        t14a =  (in9  *  1299         + in7  *  1583         + 1024) >> 11;
  182|   914k|        t15a = ((in1  * (4076 - 4096) + in15 *   401         + 2048) >> 12) + in1;
  183|   914k|    }
  184|       |
  185|  1.81M|    int t8  = CLIP(t8a  + t9a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  186|  1.81M|    int t9  = CLIP(t8a  - t9a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  187|  1.81M|    int t10 = CLIP(t11a - t10a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  188|  1.81M|    int t11 = CLIP(t11a + t10a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  189|  1.81M|    int t12 = CLIP(t12a + t13a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  190|  1.81M|    int t13 = CLIP(t12a - t13a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  191|  1.81M|    int t14 = CLIP(t15a - t14a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  192|  1.81M|    int t15 = CLIP(t15a + t14a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  193|       |
  194|  1.81M|    t9a  = ((  t14 *  1567         - t9  * (3784 - 4096)  + 2048) >> 12) - t9;
  195|  1.81M|    t14a = ((  t14 * (3784 - 4096) + t9  *  1567          + 2048) >> 12) + t14;
  196|  1.81M|    t10a = ((-(t13 * (3784 - 4096) + t10 *  1567)         + 2048) >> 12) - t13;
  197|  1.81M|    t13a = ((  t13 *  1567         - t10 * (3784 - 4096)  + 2048) >> 12) - t10;
  198|       |
  199|  1.81M|    t8a  = CLIP(t8   + t11);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  200|  1.81M|    t9   = CLIP(t9a  + t10a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  201|  1.81M|    t10  = CLIP(t9a  - t10a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  202|  1.81M|    t11a = CLIP(t8   - t11);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  203|  1.81M|    t12a = CLIP(t15  - t12);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  204|  1.81M|    t13  = CLIP(t14a - t13a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  205|  1.81M|    t14  = CLIP(t14a + t13a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  206|  1.81M|    t15a = CLIP(t15  + t12);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  207|       |
  208|  1.81M|    t10a = ((t13  - t10)  * 181 + 128) >> 8;
  209|  1.81M|    t13a = ((t13  + t10)  * 181 + 128) >> 8;
  210|  1.81M|    t11  = ((t12a - t11a) * 181 + 128) >> 8;
  211|  1.81M|    t12  = ((t12a + t11a) * 181 + 128) >> 8;
  212|       |
  213|  1.81M|    const int t0 = c[ 0 * stride];
  214|  1.81M|    const int t1 = c[ 2 * stride];
  215|  1.81M|    const int t2 = c[ 4 * stride];
  216|  1.81M|    const int t3 = c[ 6 * stride];
  217|  1.81M|    const int t4 = c[ 8 * stride];
  218|  1.81M|    const int t5 = c[10 * stride];
  219|  1.81M|    const int t6 = c[12 * stride];
  220|  1.81M|    const int t7 = c[14 * stride];
  221|       |
  222|  1.81M|    c[ 0 * stride] = CLIP(t0 + t15a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  223|  1.81M|    c[ 1 * stride] = CLIP(t1 + t14);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  224|  1.81M|    c[ 2 * stride] = CLIP(t2 + t13a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  225|  1.81M|    c[ 3 * stride] = CLIP(t3 + t12);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  226|  1.81M|    c[ 4 * stride] = CLIP(t4 + t11);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  227|  1.81M|    c[ 5 * stride] = CLIP(t5 + t10a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  228|  1.81M|    c[ 6 * stride] = CLIP(t6 + t9);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  229|  1.81M|    c[ 7 * stride] = CLIP(t7 + t8a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  230|  1.81M|    c[ 8 * stride] = CLIP(t7 - t8a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  231|  1.81M|    c[ 9 * stride] = CLIP(t6 - t9);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  232|  1.81M|    c[10 * stride] = CLIP(t5 - t10a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  233|  1.81M|    c[11 * stride] = CLIP(t4 - t11);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  234|  1.81M|    c[12 * stride] = CLIP(t3 - t12);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  235|  1.81M|    c[13 * stride] = CLIP(t2 - t13a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  236|  1.81M|    c[14 * stride] = CLIP(t1 - t14);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  237|  1.81M|    c[15 * stride] = CLIP(t0 - t15a);
  ------------------
  |  |   37|  1.81M|#define CLIP(a) iclip(a, min, max)
  ------------------
  238|  1.81M|}
itx_1d.c:inv_dct32_1d_c:
  432|   621k|{
  433|   621k|    inv_dct32_1d_internal_c(c, stride, min, max, 0);
  434|   621k|}
itx_1d.c:inv_dct32_1d_internal_c:
  249|  1.51M|{
  250|  1.51M|    assert(stride > 0);
  ------------------
  |  |  140|  1.51M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1.51M]
  |  |  |  Branch (140:68): [Folded, False: 1.51M]
  |  |  ------------------
  ------------------
  251|  1.51M|    inv_dct16_1d_internal_c(c, stride << 1, min, max, tx64);
  252|       |
  253|  1.51M|    const int in1  = c[ 1 * stride], in3  = c[ 3 * stride];
  254|  1.51M|    const int in5  = c[ 5 * stride], in7  = c[ 7 * stride];
  255|  1.51M|    const int in9  = c[ 9 * stride], in11 = c[11 * stride];
  256|  1.51M|    const int in13 = c[13 * stride], in15 = c[15 * stride];
  257|       |
  258|  1.51M|    int t16a, t17a, t18a, t19a, t20a, t21a, t22a, t23a;
  259|  1.51M|    int t24a, t25a, t26a, t27a, t28a, t29a, t30a, t31a;
  260|  1.51M|    if (tx64) {
  ------------------
  |  Branch (260:9): [True: 902k, False: 616k]
  ------------------
  261|   902k|        t16a = (in1  *   201 + 2048) >> 12;
  262|   902k|        t17a = (in15 * -2751 + 2048) >> 12;
  263|   902k|        t18a = (in9  *  1751 + 2048) >> 12;
  264|   902k|        t19a = (in7  * -1380 + 2048) >> 12;
  265|   902k|        t20a = (in5  *   995 + 2048) >> 12;
  266|   902k|        t21a = (in11 * -2106 + 2048) >> 12;
  267|   902k|        t22a = (in13 *  2440 + 2048) >> 12;
  268|   902k|        t23a = (in3  *  -601 + 2048) >> 12;
  269|   902k|        t24a = (in3  *  4052 + 2048) >> 12;
  270|   902k|        t25a = (in13 *  3290 + 2048) >> 12;
  271|   902k|        t26a = (in11 *  3513 + 2048) >> 12;
  272|   902k|        t27a = (in5  *  3973 + 2048) >> 12;
  273|   902k|        t28a = (in7  *  3857 + 2048) >> 12;
  274|   902k|        t29a = (in9  *  3703 + 2048) >> 12;
  275|   902k|        t30a = (in15 *  3035 + 2048) >> 12;
  276|   902k|        t31a = (in1  *  4091 + 2048) >> 12;
  277|   902k|    } else {
  278|   616k|        const int in17 = c[17 * stride], in19 = c[19 * stride];
  279|   616k|        const int in21 = c[21 * stride], in23 = c[23 * stride];
  280|   616k|        const int in25 = c[25 * stride], in27 = c[27 * stride];
  281|   616k|        const int in29 = c[29 * stride], in31 = c[31 * stride];
  282|       |
  283|   616k|        t16a = ((in1  *   201         - in31 * (4091 - 4096) + 2048) >> 12) - in31;
  284|   616k|        t17a = ((in17 * (3035 - 4096) - in15 *  2751         + 2048) >> 12) + in17;
  285|   616k|        t18a = ((in9  *  1751         - in23 * (3703 - 4096) + 2048) >> 12) - in23;
  286|   616k|        t19a = ((in25 * (3857 - 4096) - in7  *  1380         + 2048) >> 12) + in25;
  287|   616k|        t20a = ((in5  *   995         - in27 * (3973 - 4096) + 2048) >> 12) - in27;
  288|   616k|        t21a = ((in21 * (3513 - 4096) - in11 *  2106         + 2048) >> 12) + in21;
  289|   616k|        t22a =  (in13 *  1220         - in19 *  1645         + 1024) >> 11;
  290|   616k|        t23a = ((in29 * (4052 - 4096) - in3  *   601         + 2048) >> 12) + in29;
  291|   616k|        t24a = ((in29 *   601         + in3  * (4052 - 4096) + 2048) >> 12) + in3;
  292|   616k|        t25a =  (in13 *  1645         + in19 *  1220         + 1024) >> 11;
  293|   616k|        t26a = ((in21 *  2106         + in11 * (3513 - 4096) + 2048) >> 12) + in11;
  294|   616k|        t27a = ((in5  * (3973 - 4096) + in27 *   995         + 2048) >> 12) + in5;
  295|   616k|        t28a = ((in25 *  1380         + in7  * (3857 - 4096) + 2048) >> 12) + in7;
  296|   616k|        t29a = ((in9  * (3703 - 4096) + in23 *  1751         + 2048) >> 12) + in9;
  297|   616k|        t30a = ((in17 *  2751         + in15 * (3035 - 4096) + 2048) >> 12) + in15;
  298|   616k|        t31a = ((in1  * (4091 - 4096) + in31 *   201         + 2048) >> 12) + in1;
  299|   616k|    }
  300|       |
  301|  1.51M|    int t16 = CLIP(t16a + t17a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  302|  1.51M|    int t17 = CLIP(t16a - t17a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  303|  1.51M|    int t18 = CLIP(t19a - t18a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  304|  1.51M|    int t19 = CLIP(t19a + t18a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  305|  1.51M|    int t20 = CLIP(t20a + t21a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  306|  1.51M|    int t21 = CLIP(t20a - t21a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  307|  1.51M|    int t22 = CLIP(t23a - t22a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  308|  1.51M|    int t23 = CLIP(t23a + t22a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  309|  1.51M|    int t24 = CLIP(t24a + t25a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  310|  1.51M|    int t25 = CLIP(t24a - t25a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  311|  1.51M|    int t26 = CLIP(t27a - t26a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  312|  1.51M|    int t27 = CLIP(t27a + t26a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  313|  1.51M|    int t28 = CLIP(t28a + t29a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  314|  1.51M|    int t29 = CLIP(t28a - t29a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  315|  1.51M|    int t30 = CLIP(t31a - t30a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  316|  1.51M|    int t31 = CLIP(t31a + t30a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  317|       |
  318|  1.51M|    t17a = ((  t30 *   799         - t17 * (4017 - 4096)  + 2048) >> 12) - t17;
  319|  1.51M|    t30a = ((  t30 * (4017 - 4096) + t17 *   799          + 2048) >> 12) + t30;
  320|  1.51M|    t18a = ((-(t29 * (4017 - 4096) + t18 *   799)         + 2048) >> 12) - t29;
  321|  1.51M|    t29a = ((  t29 *   799         - t18 * (4017 - 4096)  + 2048) >> 12) - t18;
  322|  1.51M|    t21a =  (  t26 *  1703         - t21 *  1138          + 1024) >> 11;
  323|  1.51M|    t26a =  (  t26 *  1138         + t21 *  1703          + 1024) >> 11;
  324|  1.51M|    t22a =  (-(t25 *  1138         + t22 *  1703        ) + 1024) >> 11;
  325|  1.51M|    t25a =  (  t25 *  1703         - t22 *  1138          + 1024) >> 11;
  326|       |
  327|  1.51M|    t16a = CLIP(t16  + t19);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  328|  1.51M|    t17  = CLIP(t17a + t18a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  329|  1.51M|    t18  = CLIP(t17a - t18a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  330|  1.51M|    t19a = CLIP(t16  - t19);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  331|  1.51M|    t20a = CLIP(t23  - t20);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  332|  1.51M|    t21  = CLIP(t22a - t21a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  333|  1.51M|    t22  = CLIP(t22a + t21a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  334|  1.51M|    t23a = CLIP(t23  + t20);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  335|  1.51M|    t24a = CLIP(t24  + t27);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  336|  1.51M|    t25  = CLIP(t25a + t26a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  337|  1.51M|    t26  = CLIP(t25a - t26a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  338|  1.51M|    t27a = CLIP(t24  - t27);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  339|  1.51M|    t28a = CLIP(t31  - t28);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  340|  1.51M|    t29  = CLIP(t30a - t29a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  341|  1.51M|    t30  = CLIP(t30a + t29a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  342|  1.51M|    t31a = CLIP(t31  + t28);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  343|       |
  344|  1.51M|    t18a = ((  t29  *  1567         - t18  * (3784 - 4096)  + 2048) >> 12) - t18;
  345|  1.51M|    t29a = ((  t29  * (3784 - 4096) + t18  *  1567          + 2048) >> 12) + t29;
  346|  1.51M|    t19  = ((  t28a *  1567         - t19a * (3784 - 4096)  + 2048) >> 12) - t19a;
  347|  1.51M|    t28  = ((  t28a * (3784 - 4096) + t19a *  1567          + 2048) >> 12) + t28a;
  348|  1.51M|    t20  = ((-(t27a * (3784 - 4096) + t20a *  1567)         + 2048) >> 12) - t27a;
  349|  1.51M|    t27  = ((  t27a *  1567         - t20a * (3784 - 4096)  + 2048) >> 12) - t20a;
  350|  1.51M|    t21a = ((-(t26  * (3784 - 4096) + t21  *  1567)         + 2048) >> 12) - t26;
  351|  1.51M|    t26a = ((  t26  *  1567         - t21  * (3784 - 4096)  + 2048) >> 12) - t21;
  352|       |
  353|  1.51M|    t16  = CLIP(t16a + t23a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  354|  1.51M|    t17a = CLIP(t17  + t22);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  355|  1.51M|    t18  = CLIP(t18a + t21a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  356|  1.51M|    t19a = CLIP(t19  + t20);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  357|  1.51M|    t20a = CLIP(t19  - t20);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  358|  1.51M|    t21  = CLIP(t18a - t21a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  359|  1.51M|    t22a = CLIP(t17  - t22);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  360|  1.51M|    t23  = CLIP(t16a - t23a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  361|  1.51M|    t24  = CLIP(t31a - t24a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  362|  1.51M|    t25a = CLIP(t30  - t25);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  363|  1.51M|    t26  = CLIP(t29a - t26a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  364|  1.51M|    t27a = CLIP(t28  - t27);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  365|  1.51M|    t28a = CLIP(t28  + t27);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  366|  1.51M|    t29  = CLIP(t29a + t26a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  367|  1.51M|    t30a = CLIP(t30  + t25);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  368|  1.51M|    t31  = CLIP(t31a + t24a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  369|       |
  370|  1.51M|    t20  = ((t27a - t20a) * 181 + 128) >> 8;
  371|  1.51M|    t27  = ((t27a + t20a) * 181 + 128) >> 8;
  372|  1.51M|    t21a = ((t26  - t21 ) * 181 + 128) >> 8;
  373|  1.51M|    t26a = ((t26  + t21 ) * 181 + 128) >> 8;
  374|  1.51M|    t22  = ((t25a - t22a) * 181 + 128) >> 8;
  375|  1.51M|    t25  = ((t25a + t22a) * 181 + 128) >> 8;
  376|  1.51M|    t23a = ((t24  - t23 ) * 181 + 128) >> 8;
  377|  1.51M|    t24a = ((t24  + t23 ) * 181 + 128) >> 8;
  378|       |
  379|  1.51M|    const int t0  = c[ 0 * stride];
  380|  1.51M|    const int t1  = c[ 2 * stride];
  381|  1.51M|    const int t2  = c[ 4 * stride];
  382|  1.51M|    const int t3  = c[ 6 * stride];
  383|  1.51M|    const int t4  = c[ 8 * stride];
  384|  1.51M|    const int t5  = c[10 * stride];
  385|  1.51M|    const int t6  = c[12 * stride];
  386|  1.51M|    const int t7  = c[14 * stride];
  387|  1.51M|    const int t8  = c[16 * stride];
  388|  1.51M|    const int t9  = c[18 * stride];
  389|  1.51M|    const int t10 = c[20 * stride];
  390|  1.51M|    const int t11 = c[22 * stride];
  391|  1.51M|    const int t12 = c[24 * stride];
  392|  1.51M|    const int t13 = c[26 * stride];
  393|  1.51M|    const int t14 = c[28 * stride];
  394|  1.51M|    const int t15 = c[30 * stride];
  395|       |
  396|  1.51M|    c[ 0 * stride] = CLIP(t0  + t31);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  397|  1.51M|    c[ 1 * stride] = CLIP(t1  + t30a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  398|  1.51M|    c[ 2 * stride] = CLIP(t2  + t29);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  399|  1.51M|    c[ 3 * stride] = CLIP(t3  + t28a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  400|  1.51M|    c[ 4 * stride] = CLIP(t4  + t27);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  401|  1.51M|    c[ 5 * stride] = CLIP(t5  + t26a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  402|  1.51M|    c[ 6 * stride] = CLIP(t6  + t25);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  403|  1.51M|    c[ 7 * stride] = CLIP(t7  + t24a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  404|  1.51M|    c[ 8 * stride] = CLIP(t8  + t23a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  405|  1.51M|    c[ 9 * stride] = CLIP(t9  + t22);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  406|  1.51M|    c[10 * stride] = CLIP(t10 + t21a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  407|  1.51M|    c[11 * stride] = CLIP(t11 + t20);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  408|  1.51M|    c[12 * stride] = CLIP(t12 + t19a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  409|  1.51M|    c[13 * stride] = CLIP(t13 + t18);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  410|  1.51M|    c[14 * stride] = CLIP(t14 + t17a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  411|  1.51M|    c[15 * stride] = CLIP(t15 + t16);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  412|  1.51M|    c[16 * stride] = CLIP(t15 - t16);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  413|  1.51M|    c[17 * stride] = CLIP(t14 - t17a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  414|  1.51M|    c[18 * stride] = CLIP(t13 - t18);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  415|  1.51M|    c[19 * stride] = CLIP(t12 - t19a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  416|  1.51M|    c[20 * stride] = CLIP(t11 - t20);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  417|  1.51M|    c[21 * stride] = CLIP(t10 - t21a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  418|  1.51M|    c[22 * stride] = CLIP(t9  - t22);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  419|  1.51M|    c[23 * stride] = CLIP(t8  - t23a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  420|  1.51M|    c[24 * stride] = CLIP(t7  - t24a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  421|  1.51M|    c[25 * stride] = CLIP(t6  - t25);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  422|  1.51M|    c[26 * stride] = CLIP(t5  - t26a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  423|  1.51M|    c[27 * stride] = CLIP(t4  - t27);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  424|  1.51M|    c[28 * stride] = CLIP(t3  - t28a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  425|  1.51M|    c[29 * stride] = CLIP(t2  - t29);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  426|  1.51M|    c[30 * stride] = CLIP(t1  - t30a);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  427|  1.51M|    c[31 * stride] = CLIP(t0  - t31);
  ------------------
  |  |   37|  1.51M|#define CLIP(a) iclip(a, min, max)
  ------------------
  428|  1.51M|}
itx_1d.c:inv_dct64_1d_c:
  438|   902k|{
  439|   902k|    assert(stride > 0);
  ------------------
  |  |  140|   902k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 902k]
  |  |  |  Branch (140:68): [Folded, False: 902k]
  |  |  ------------------
  ------------------
  440|   902k|    inv_dct32_1d_internal_c(c, stride << 1, min, max, 1);
  441|       |
  442|   902k|    const int in1  = c[ 1 * stride], in3  = c[ 3 * stride];
  443|   902k|    const int in5  = c[ 5 * stride], in7  = c[ 7 * stride];
  444|   902k|    const int in9  = c[ 9 * stride], in11 = c[11 * stride];
  445|   902k|    const int in13 = c[13 * stride], in15 = c[15 * stride];
  446|   902k|    const int in17 = c[17 * stride], in19 = c[19 * stride];
  447|   902k|    const int in21 = c[21 * stride], in23 = c[23 * stride];
  448|   902k|    const int in25 = c[25 * stride], in27 = c[27 * stride];
  449|   902k|    const int in29 = c[29 * stride], in31 = c[31 * stride];
  450|       |
  451|   902k|    int t32a = (in1  *   101 + 2048) >> 12;
  452|   902k|    int t33a = (in31 * -2824 + 2048) >> 12;
  453|   902k|    int t34a = (in17 *  1660 + 2048) >> 12;
  454|   902k|    int t35a = (in15 * -1474 + 2048) >> 12;
  455|   902k|    int t36a = (in9  *   897 + 2048) >> 12;
  456|   902k|    int t37a = (in23 * -2191 + 2048) >> 12;
  457|   902k|    int t38a = (in25 *  2359 + 2048) >> 12;
  458|   902k|    int t39a = (in7  *  -700 + 2048) >> 12;
  459|   902k|    int t40a = (in5  *   501 + 2048) >> 12;
  460|   902k|    int t41a = (in27 * -2520 + 2048) >> 12;
  461|   902k|    int t42a = (in21 *  2019 + 2048) >> 12;
  462|   902k|    int t43a = (in11 * -1092 + 2048) >> 12;
  463|   902k|    int t44a = (in13 *  1285 + 2048) >> 12;
  464|   902k|    int t45a = (in19 * -1842 + 2048) >> 12;
  465|   902k|    int t46a = (in29 *  2675 + 2048) >> 12;
  466|   902k|    int t47a = (in3  *  -301 + 2048) >> 12;
  467|   902k|    int t48a = (in3  *  4085 + 2048) >> 12;
  468|   902k|    int t49a = (in29 *  3102 + 2048) >> 12;
  469|   902k|    int t50a = (in19 *  3659 + 2048) >> 12;
  470|   902k|    int t51a = (in13 *  3889 + 2048) >> 12;
  471|   902k|    int t52a = (in11 *  3948 + 2048) >> 12;
  472|   902k|    int t53a = (in21 *  3564 + 2048) >> 12;
  473|   902k|    int t54a = (in27 *  3229 + 2048) >> 12;
  474|   902k|    int t55a = (in5  *  4065 + 2048) >> 12;
  475|   902k|    int t56a = (in7  *  4036 + 2048) >> 12;
  476|   902k|    int t57a = (in25 *  3349 + 2048) >> 12;
  477|   902k|    int t58a = (in23 *  3461 + 2048) >> 12;
  478|   902k|    int t59a = (in9  *  3996 + 2048) >> 12;
  479|   902k|    int t60a = (in15 *  3822 + 2048) >> 12;
  480|   902k|    int t61a = (in17 *  3745 + 2048) >> 12;
  481|   902k|    int t62a = (in31 *  2967 + 2048) >> 12;
  482|   902k|    int t63a = (in1  *  4095 + 2048) >> 12;
  483|       |
  484|   902k|    int t32 = CLIP(t32a + t33a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  485|   902k|    int t33 = CLIP(t32a - t33a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  486|   902k|    int t34 = CLIP(t35a - t34a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  487|   902k|    int t35 = CLIP(t35a + t34a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  488|   902k|    int t36 = CLIP(t36a + t37a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  489|   902k|    int t37 = CLIP(t36a - t37a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  490|   902k|    int t38 = CLIP(t39a - t38a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  491|   902k|    int t39 = CLIP(t39a + t38a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  492|   902k|    int t40 = CLIP(t40a + t41a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  493|   902k|    int t41 = CLIP(t40a - t41a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  494|   902k|    int t42 = CLIP(t43a - t42a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  495|   902k|    int t43 = CLIP(t43a + t42a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  496|   902k|    int t44 = CLIP(t44a + t45a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  497|   902k|    int t45 = CLIP(t44a - t45a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  498|   902k|    int t46 = CLIP(t47a - t46a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  499|   902k|    int t47 = CLIP(t47a + t46a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  500|   902k|    int t48 = CLIP(t48a + t49a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  501|   902k|    int t49 = CLIP(t48a - t49a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  502|   902k|    int t50 = CLIP(t51a - t50a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  503|   902k|    int t51 = CLIP(t51a + t50a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  504|   902k|    int t52 = CLIP(t52a + t53a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  505|   902k|    int t53 = CLIP(t52a - t53a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  506|   902k|    int t54 = CLIP(t55a - t54a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  507|   902k|    int t55 = CLIP(t55a + t54a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  508|   902k|    int t56 = CLIP(t56a + t57a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  509|   902k|    int t57 = CLIP(t56a - t57a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  510|   902k|    int t58 = CLIP(t59a - t58a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  511|   902k|    int t59 = CLIP(t59a + t58a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  512|   902k|    int t60 = CLIP(t60a + t61a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  513|   902k|    int t61 = CLIP(t60a - t61a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  514|   902k|    int t62 = CLIP(t63a - t62a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  515|   902k|    int t63 = CLIP(t63a + t62a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  516|       |
  517|   902k|    t33a = ((t33 * (4096 - 4076) + t62 *   401         + 2048) >> 12) - t33;
  518|   902k|    t34a = ((t34 *  -401         + t61 * (4096 - 4076) + 2048) >> 12) - t61;
  519|   902k|    t37a =  (t37 * -1299         + t58 *  1583         + 1024) >> 11;
  520|   902k|    t38a =  (t38 * -1583         + t57 * -1299         + 1024) >> 11;
  521|   902k|    t41a = ((t41 * (4096 - 3612) + t54 *  1931         + 2048) >> 12) - t41;
  522|   902k|    t42a = ((t42 * -1931         + t53 * (4096 - 3612) + 2048) >> 12) - t53;
  523|   902k|    t45a = ((t45 * -1189         + t50 * (3920 - 4096) + 2048) >> 12) + t50;
  524|   902k|    t46a = ((t46 * (4096 - 3920) + t49 * -1189         + 2048) >> 12) - t46;
  525|   902k|    t49a = ((t46 * -1189         + t49 * (3920 - 4096) + 2048) >> 12) + t49;
  526|   902k|    t50a = ((t45 * (3920 - 4096) + t50 *  1189         + 2048) >> 12) + t45;
  527|   902k|    t53a = ((t42 * (4096 - 3612) + t53 *  1931         + 2048) >> 12) - t42;
  528|   902k|    t54a = ((t41 *  1931         + t54 * (3612 - 4096) + 2048) >> 12) + t54;
  529|   902k|    t57a =  (t38 * -1299         + t57 *  1583         + 1024) >> 11;
  530|   902k|    t58a =  (t37 *  1583         + t58 *  1299         + 1024) >> 11;
  531|   902k|    t61a = ((t34 * (4096 - 4076) + t61 *   401         + 2048) >> 12) - t34;
  532|   902k|    t62a = ((t33 *   401         + t62 * (4076 - 4096) + 2048) >> 12) + t62;
  533|       |
  534|   902k|    t32a = CLIP(t32  + t35);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  535|   902k|    t33  = CLIP(t33a + t34a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  536|   902k|    t34  = CLIP(t33a - t34a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  537|   902k|    t35a = CLIP(t32  - t35);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  538|   902k|    t36a = CLIP(t39  - t36);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  539|   902k|    t37  = CLIP(t38a - t37a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  540|   902k|    t38  = CLIP(t38a + t37a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  541|   902k|    t39a = CLIP(t39  + t36);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  542|   902k|    t40a = CLIP(t40  + t43);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  543|   902k|    t41  = CLIP(t41a + t42a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  544|   902k|    t42  = CLIP(t41a - t42a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  545|   902k|    t43a = CLIP(t40  - t43);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  546|   902k|    t44a = CLIP(t47  - t44);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  547|   902k|    t45  = CLIP(t46a - t45a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  548|   902k|    t46  = CLIP(t46a + t45a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  549|   902k|    t47a = CLIP(t47  + t44);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  550|   902k|    t48a = CLIP(t48  + t51);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  551|   902k|    t49  = CLIP(t49a + t50a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  552|   902k|    t50  = CLIP(t49a - t50a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  553|   902k|    t51a = CLIP(t48  - t51);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  554|   902k|    t52a = CLIP(t55  - t52);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  555|   902k|    t53  = CLIP(t54a - t53a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  556|   902k|    t54  = CLIP(t54a + t53a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  557|   902k|    t55a = CLIP(t55  + t52);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  558|   902k|    t56a = CLIP(t56  + t59);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  559|   902k|    t57  = CLIP(t57a + t58a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  560|   902k|    t58  = CLIP(t57a - t58a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  561|   902k|    t59a = CLIP(t56  - t59);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  562|   902k|    t60a = CLIP(t63  - t60);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  563|   902k|    t61  = CLIP(t62a - t61a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  564|   902k|    t62  = CLIP(t62a + t61a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  565|   902k|    t63a = CLIP(t63  + t60);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  566|       |
  567|   902k|    t34a = ((t34  * (4096 - 4017) + t61  *   799         + 2048) >> 12) - t34;
  568|   902k|    t35  = ((t35a * (4096 - 4017) + t60a *   799         + 2048) >> 12) - t35a;
  569|   902k|    t36  = ((t36a *  -799         + t59a * (4096 - 4017) + 2048) >> 12) - t59a;
  570|   902k|    t37a = ((t37  *  -799         + t58  * (4096 - 4017) + 2048) >> 12) - t58;
  571|   902k|    t42a =  (t42  * -1138         + t53  *  1703         + 1024) >> 11;
  572|   902k|    t43  =  (t43a * -1138         + t52a *  1703         + 1024) >> 11;
  573|   902k|    t44  =  (t44a * -1703         + t51a * -1138         + 1024) >> 11;
  574|   902k|    t45a =  (t45  * -1703         + t50  * -1138         + 1024) >> 11;
  575|   902k|    t50a =  (t45  * -1138         + t50  *  1703         + 1024) >> 11;
  576|   902k|    t51  =  (t44a * -1138         + t51a *  1703         + 1024) >> 11;
  577|   902k|    t52  =  (t43a *  1703         + t52a *  1138         + 1024) >> 11;
  578|   902k|    t53a =  (t42  *  1703         + t53  *  1138         + 1024) >> 11;
  579|   902k|    t58a = ((t37  * (4096 - 4017) + t58  *   799         + 2048) >> 12) - t37;
  580|   902k|    t59  = ((t36a * (4096 - 4017) + t59a *   799         + 2048) >> 12) - t36a;
  581|   902k|    t60  = ((t35a *   799         + t60a * (4017 - 4096) + 2048) >> 12) + t60a;
  582|   902k|    t61a = ((t34  *   799         + t61  * (4017 - 4096) + 2048) >> 12) + t61;
  583|       |
  584|   902k|    t32  = CLIP(t32a + t39a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  585|   902k|    t33a = CLIP(t33  + t38);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  586|   902k|    t34  = CLIP(t34a + t37a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  587|   902k|    t35a = CLIP(t35  + t36);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  588|   902k|    t36a = CLIP(t35  - t36);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  589|   902k|    t37  = CLIP(t34a - t37a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  590|   902k|    t38a = CLIP(t33  - t38);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  591|   902k|    t39  = CLIP(t32a - t39a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  592|   902k|    t40  = CLIP(t47a - t40a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  593|   902k|    t41a = CLIP(t46  - t41);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  594|   902k|    t42  = CLIP(t45a - t42a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  595|   902k|    t43a = CLIP(t44  - t43);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  596|   902k|    t44a = CLIP(t44  + t43);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  597|   902k|    t45  = CLIP(t45a + t42a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  598|   902k|    t46a = CLIP(t46  + t41);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  599|   902k|    t47  = CLIP(t47a + t40a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  600|   902k|    t48  = CLIP(t48a + t55a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  601|   902k|    t49a = CLIP(t49  + t54);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  602|   902k|    t50  = CLIP(t50a + t53a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  603|   902k|    t51a = CLIP(t51  + t52);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  604|   902k|    t52a = CLIP(t51  - t52);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  605|   902k|    t53  = CLIP(t50a - t53a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  606|   902k|    t54a = CLIP(t49  - t54);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  607|   902k|    t55  = CLIP(t48a - t55a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  608|   902k|    t56  = CLIP(t63a - t56a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  609|   902k|    t57a = CLIP(t62  - t57);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  610|   902k|    t58  = CLIP(t61a - t58a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  611|   902k|    t59a = CLIP(t60  - t59);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  612|   902k|    t60a = CLIP(t60  + t59);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  613|   902k|    t61  = CLIP(t61a + t58a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  614|   902k|    t62a = CLIP(t62  + t57);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  615|   902k|    t63  = CLIP(t63a + t56a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  616|       |
  617|   902k|    t36  = ((t36a * (4096 - 3784) + t59a *  1567         + 2048) >> 12) - t36a;
  618|   902k|    t37a = ((t37  * (4096 - 3784) + t58  *  1567         + 2048) >> 12) - t37;
  619|   902k|    t38  = ((t38a * (4096 - 3784) + t57a *  1567         + 2048) >> 12) - t38a;
  620|   902k|    t39a = ((t39  * (4096 - 3784) + t56  *  1567         + 2048) >> 12) - t39;
  621|   902k|    t40a = ((t40  * -1567         + t55  * (4096 - 3784) + 2048) >> 12) - t55;
  622|   902k|    t41  = ((t41a * -1567         + t54a * (4096 - 3784) + 2048) >> 12) - t54a;
  623|   902k|    t42a = ((t42  * -1567         + t53  * (4096 - 3784) + 2048) >> 12) - t53;
  624|   902k|    t43  = ((t43a * -1567         + t52a * (4096 - 3784) + 2048) >> 12) - t52a;
  625|   902k|    t52  = ((t43a * (4096 - 3784) + t52a *  1567         + 2048) >> 12) - t43a;
  626|   902k|    t53a = ((t42  * (4096 - 3784) + t53  *  1567         + 2048) >> 12) - t42;
  627|   902k|    t54  = ((t41a * (4096 - 3784) + t54a *  1567         + 2048) >> 12) - t41a;
  628|   902k|    t55a = ((t40  * (4096 - 3784) + t55  *  1567         + 2048) >> 12) - t40;
  629|   902k|    t56a = ((t39  *  1567         + t56  * (3784 - 4096) + 2048) >> 12) + t56;
  630|   902k|    t57  = ((t38a *  1567         + t57a * (3784 - 4096) + 2048) >> 12) + t57a;
  631|   902k|    t58a = ((t37  *  1567         + t58  * (3784 - 4096) + 2048) >> 12) + t58;
  632|   902k|    t59  = ((t36a *  1567         + t59a * (3784 - 4096) + 2048) >> 12) + t59a;
  633|       |
  634|   902k|    t32a = CLIP(t32  + t47);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  635|   902k|    t33  = CLIP(t33a + t46a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  636|   902k|    t34a = CLIP(t34  + t45);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  637|   902k|    t35  = CLIP(t35a + t44a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  638|   902k|    t36a = CLIP(t36  + t43);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  639|   902k|    t37  = CLIP(t37a + t42a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  640|   902k|    t38a = CLIP(t38  + t41);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  641|   902k|    t39  = CLIP(t39a + t40a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  642|   902k|    t40  = CLIP(t39a - t40a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  643|   902k|    t41a = CLIP(t38  - t41);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  644|   902k|    t42  = CLIP(t37a - t42a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  645|   902k|    t43a = CLIP(t36  - t43);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  646|   902k|    t44  = CLIP(t35a - t44a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  647|   902k|    t45a = CLIP(t34  - t45);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  648|   902k|    t46  = CLIP(t33a - t46a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  649|   902k|    t47a = CLIP(t32  - t47);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  650|   902k|    t48a = CLIP(t63  - t48);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  651|   902k|    t49  = CLIP(t62a - t49a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  652|   902k|    t50a = CLIP(t61  - t50);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  653|   902k|    t51  = CLIP(t60a - t51a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  654|   902k|    t52a = CLIP(t59  - t52);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  655|   902k|    t53  = CLIP(t58a - t53a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  656|   902k|    t54a = CLIP(t57  - t54);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  657|   902k|    t55  = CLIP(t56a - t55a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  658|   902k|    t56  = CLIP(t56a + t55a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  659|   902k|    t57a = CLIP(t57  + t54);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  660|   902k|    t58  = CLIP(t58a + t53a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  661|   902k|    t59a = CLIP(t59  + t52);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  662|   902k|    t60  = CLIP(t60a + t51a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  663|   902k|    t61a = CLIP(t61  + t50);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  664|   902k|    t62  = CLIP(t62a + t49a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  665|   902k|    t63a = CLIP(t63  + t48);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  666|       |
  667|   902k|    t40a = ((t55  - t40 ) * 181 + 128) >> 8;
  668|   902k|    t41  = ((t54a - t41a) * 181 + 128) >> 8;
  669|   902k|    t42a = ((t53  - t42 ) * 181 + 128) >> 8;
  670|   902k|    t43  = ((t52a - t43a) * 181 + 128) >> 8;
  671|   902k|    t44a = ((t51  - t44 ) * 181 + 128) >> 8;
  672|   902k|    t45  = ((t50a - t45a) * 181 + 128) >> 8;
  673|   902k|    t46a = ((t49  - t46 ) * 181 + 128) >> 8;
  674|   902k|    t47  = ((t48a - t47a) * 181 + 128) >> 8;
  675|   902k|    t48  = ((t47a + t48a) * 181 + 128) >> 8;
  676|   902k|    t49a = ((t46  + t49 ) * 181 + 128) >> 8;
  677|   902k|    t50  = ((t45a + t50a) * 181 + 128) >> 8;
  678|   902k|    t51a = ((t44  + t51 ) * 181 + 128) >> 8;
  679|   902k|    t52  = ((t43a + t52a) * 181 + 128) >> 8;
  680|   902k|    t53a = ((t42  + t53 ) * 181 + 128) >> 8;
  681|   902k|    t54  = ((t41a + t54a) * 181 + 128) >> 8;
  682|   902k|    t55a = ((t40  + t55 ) * 181 + 128) >> 8;
  683|       |
  684|   902k|    const int t0  = c[ 0 * stride];
  685|   902k|    const int t1  = c[ 2 * stride];
  686|   902k|    const int t2  = c[ 4 * stride];
  687|   902k|    const int t3  = c[ 6 * stride];
  688|   902k|    const int t4  = c[ 8 * stride];
  689|   902k|    const int t5  = c[10 * stride];
  690|   902k|    const int t6  = c[12 * stride];
  691|   902k|    const int t7  = c[14 * stride];
  692|   902k|    const int t8  = c[16 * stride];
  693|   902k|    const int t9  = c[18 * stride];
  694|   902k|    const int t10 = c[20 * stride];
  695|   902k|    const int t11 = c[22 * stride];
  696|   902k|    const int t12 = c[24 * stride];
  697|   902k|    const int t13 = c[26 * stride];
  698|   902k|    const int t14 = c[28 * stride];
  699|   902k|    const int t15 = c[30 * stride];
  700|   902k|    const int t16 = c[32 * stride];
  701|   902k|    const int t17 = c[34 * stride];
  702|   902k|    const int t18 = c[36 * stride];
  703|   902k|    const int t19 = c[38 * stride];
  704|   902k|    const int t20 = c[40 * stride];
  705|   902k|    const int t21 = c[42 * stride];
  706|   902k|    const int t22 = c[44 * stride];
  707|   902k|    const int t23 = c[46 * stride];
  708|   902k|    const int t24 = c[48 * stride];
  709|   902k|    const int t25 = c[50 * stride];
  710|   902k|    const int t26 = c[52 * stride];
  711|   902k|    const int t27 = c[54 * stride];
  712|   902k|    const int t28 = c[56 * stride];
  713|   902k|    const int t29 = c[58 * stride];
  714|   902k|    const int t30 = c[60 * stride];
  715|   902k|    const int t31 = c[62 * stride];
  716|       |
  717|   902k|    c[ 0 * stride] = CLIP(t0  + t63a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  718|   902k|    c[ 1 * stride] = CLIP(t1  + t62);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  719|   902k|    c[ 2 * stride] = CLIP(t2  + t61a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  720|   902k|    c[ 3 * stride] = CLIP(t3  + t60);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  721|   902k|    c[ 4 * stride] = CLIP(t4  + t59a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  722|   902k|    c[ 5 * stride] = CLIP(t5  + t58);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  723|   902k|    c[ 6 * stride] = CLIP(t6  + t57a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  724|   902k|    c[ 7 * stride] = CLIP(t7  + t56);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  725|   902k|    c[ 8 * stride] = CLIP(t8  + t55a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  726|   902k|    c[ 9 * stride] = CLIP(t9  + t54);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  727|   902k|    c[10 * stride] = CLIP(t10 + t53a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  728|   902k|    c[11 * stride] = CLIP(t11 + t52);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  729|   902k|    c[12 * stride] = CLIP(t12 + t51a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  730|   902k|    c[13 * stride] = CLIP(t13 + t50);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  731|   902k|    c[14 * stride] = CLIP(t14 + t49a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  732|   902k|    c[15 * stride] = CLIP(t15 + t48);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  733|   902k|    c[16 * stride] = CLIP(t16 + t47);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  734|   902k|    c[17 * stride] = CLIP(t17 + t46a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  735|   902k|    c[18 * stride] = CLIP(t18 + t45);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  736|   902k|    c[19 * stride] = CLIP(t19 + t44a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  737|   902k|    c[20 * stride] = CLIP(t20 + t43);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  738|   902k|    c[21 * stride] = CLIP(t21 + t42a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  739|   902k|    c[22 * stride] = CLIP(t22 + t41);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  740|   902k|    c[23 * stride] = CLIP(t23 + t40a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  741|   902k|    c[24 * stride] = CLIP(t24 + t39);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  742|   902k|    c[25 * stride] = CLIP(t25 + t38a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  743|   902k|    c[26 * stride] = CLIP(t26 + t37);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  744|   902k|    c[27 * stride] = CLIP(t27 + t36a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  745|   902k|    c[28 * stride] = CLIP(t28 + t35);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  746|   902k|    c[29 * stride] = CLIP(t29 + t34a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  747|   902k|    c[30 * stride] = CLIP(t30 + t33);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  748|   902k|    c[31 * stride] = CLIP(t31 + t32a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  749|   902k|    c[32 * stride] = CLIP(t31 - t32a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  750|   902k|    c[33 * stride] = CLIP(t30 - t33);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  751|   902k|    c[34 * stride] = CLIP(t29 - t34a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  752|   902k|    c[35 * stride] = CLIP(t28 - t35);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  753|   902k|    c[36 * stride] = CLIP(t27 - t36a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  754|   902k|    c[37 * stride] = CLIP(t26 - t37);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  755|   902k|    c[38 * stride] = CLIP(t25 - t38a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  756|   902k|    c[39 * stride] = CLIP(t24 - t39);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  757|   902k|    c[40 * stride] = CLIP(t23 - t40a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  758|   902k|    c[41 * stride] = CLIP(t22 - t41);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  759|   902k|    c[42 * stride] = CLIP(t21 - t42a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  760|   902k|    c[43 * stride] = CLIP(t20 - t43);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  761|   902k|    c[44 * stride] = CLIP(t19 - t44a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  762|   902k|    c[45 * stride] = CLIP(t18 - t45);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  763|   902k|    c[46 * stride] = CLIP(t17 - t46a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  764|   902k|    c[47 * stride] = CLIP(t16 - t47);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  765|   902k|    c[48 * stride] = CLIP(t15 - t48);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  766|   902k|    c[49 * stride] = CLIP(t14 - t49a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  767|   902k|    c[50 * stride] = CLIP(t13 - t50);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  768|   902k|    c[51 * stride] = CLIP(t12 - t51a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  769|   902k|    c[52 * stride] = CLIP(t11 - t52);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  770|   902k|    c[53 * stride] = CLIP(t10 - t53a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  771|   902k|    c[54 * stride] = CLIP(t9  - t54);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  772|   902k|    c[55 * stride] = CLIP(t8  - t55a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  773|   902k|    c[56 * stride] = CLIP(t7  - t56);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  774|   902k|    c[57 * stride] = CLIP(t6  - t57a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  775|   902k|    c[58 * stride] = CLIP(t5  - t58);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  776|   902k|    c[59 * stride] = CLIP(t4  - t59a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  777|   902k|    c[60 * stride] = CLIP(t3  - t60);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  778|   902k|    c[61 * stride] = CLIP(t2  - t61a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  779|   902k|    c[62 * stride] = CLIP(t1  - t62);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  780|   902k|    c[63 * stride] = CLIP(t0  - t63a);
  ------------------
  |  |   37|   902k|#define CLIP(a) iclip(a, min, max)
  ------------------
  781|   902k|}

dav1d_itx_dsp_init_8bpc:
  220|  7.82k|COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
  221|  7.82k|#define assign_itx_all_fn64(w, h, pfx) \
  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  224|       |
  225|  7.82k|#define assign_itx_all_fn32(w, h, pfx) \
  226|  7.82k|    assign_itx_all_fn64(w, h, pfx); \
  227|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  228|  7.82k|        inv_txfm_add_identity_identity_##w##x##h##_c
  229|       |
  230|  7.82k|#define assign_itx_all_fn16(w, h, pfx) \
  231|  7.82k|    assign_itx_all_fn32(w, h, pfx); \
  232|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  233|  7.82k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  234|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  235|  7.82k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  236|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  237|  7.82k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  238|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  239|  7.82k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  240|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  241|  7.82k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  242|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  243|  7.82k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  244|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  245|  7.82k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  246|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  247|  7.82k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  248|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  249|  7.82k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  250|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  251|  7.82k|        inv_txfm_add_identity_dct_##w##x##h##_c
  252|       |
  253|  7.82k|#define assign_itx_all_fn84(w, h, pfx) \
  254|  7.82k|    assign_itx_all_fn16(w, h, pfx); \
  255|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  256|  7.82k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  257|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  258|  7.82k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  259|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  260|  7.82k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  261|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  262|  7.82k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  263|  7.82k|
  264|  7.82k|#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
  265|  7.82k|  ARCH_AARCH64 || \
  266|  7.82k|  (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
  267|  7.82k|))
  268|  7.82k|    c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
  269|  7.82k|#endif
  270|  7.82k|    assign_itx_all_fn84( 4,  4, );
  ------------------
  |  |  254|  7.82k|    assign_itx_all_fn16(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  231|  7.82k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  226|  7.82k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  227|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  |  |  228|  7.82k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  232|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  |  |  233|  7.82k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  |  |  234|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  |  |  235|  7.82k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  |  |  236|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  |  |  237|  7.82k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  |  |  238|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  |  |  239|  7.82k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  |  |  240|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  |  |  241|  7.82k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  |  |  242|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  |  |  243|  7.82k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  |  |  244|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  |  |  245|  7.82k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  |  |  246|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  |  |  247|  7.82k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  |  |  248|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  |  |  249|  7.82k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  |  |  250|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  |  |  251|  7.82k|        inv_txfm_add_identity_dct_##w##x##h##_c
  |  |  ------------------
  |  |  255|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  |  |  256|  7.82k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  |  |  257|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  |  |  258|  7.82k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  |  |  259|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  |  |  260|  7.82k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  |  |  261|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  |  |  262|  7.82k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  ------------------
  271|  7.82k|    assign_itx_all_fn84( 4,  8, R);
  ------------------
  |  |  254|  7.82k|    assign_itx_all_fn16(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  231|  7.82k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  226|  7.82k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  227|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  |  |  228|  7.82k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  232|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  |  |  233|  7.82k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  |  |  234|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  |  |  235|  7.82k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  |  |  236|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  |  |  237|  7.82k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  |  |  238|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  |  |  239|  7.82k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  |  |  240|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  |  |  241|  7.82k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  |  |  242|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  |  |  243|  7.82k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  |  |  244|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  |  |  245|  7.82k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  |  |  246|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  |  |  247|  7.82k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  |  |  248|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  |  |  249|  7.82k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  |  |  250|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  |  |  251|  7.82k|        inv_txfm_add_identity_dct_##w##x##h##_c
  |  |  ------------------
  |  |  255|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  |  |  256|  7.82k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  |  |  257|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  |  |  258|  7.82k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  |  |  259|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  |  |  260|  7.82k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  |  |  261|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  |  |  262|  7.82k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  ------------------
  272|  7.82k|    assign_itx_all_fn84( 4, 16, R);
  ------------------
  |  |  254|  7.82k|    assign_itx_all_fn16(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  231|  7.82k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  226|  7.82k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  227|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  |  |  228|  7.82k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  232|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  |  |  233|  7.82k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  |  |  234|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  |  |  235|  7.82k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  |  |  236|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  |  |  237|  7.82k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  |  |  238|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  |  |  239|  7.82k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  |  |  240|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  |  |  241|  7.82k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  |  |  242|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  |  |  243|  7.82k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  |  |  244|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  |  |  245|  7.82k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  |  |  246|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  |  |  247|  7.82k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  |  |  248|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  |  |  249|  7.82k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  |  |  250|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  |  |  251|  7.82k|        inv_txfm_add_identity_dct_##w##x##h##_c
  |  |  ------------------
  |  |  255|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  |  |  256|  7.82k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  |  |  257|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  |  |  258|  7.82k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  |  |  259|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  |  |  260|  7.82k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  |  |  261|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  |  |  262|  7.82k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  ------------------
  273|  7.82k|    assign_itx_all_fn84( 8,  4, R);
  ------------------
  |  |  254|  7.82k|    assign_itx_all_fn16(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  231|  7.82k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  226|  7.82k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  227|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  |  |  228|  7.82k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  232|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  |  |  233|  7.82k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  |  |  234|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  |  |  235|  7.82k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  |  |  236|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  |  |  237|  7.82k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  |  |  238|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  |  |  239|  7.82k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  |  |  240|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  |  |  241|  7.82k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  |  |  242|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  |  |  243|  7.82k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  |  |  244|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  |  |  245|  7.82k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  |  |  246|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  |  |  247|  7.82k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  |  |  248|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  |  |  249|  7.82k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  |  |  250|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  |  |  251|  7.82k|        inv_txfm_add_identity_dct_##w##x##h##_c
  |  |  ------------------
  |  |  255|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  |  |  256|  7.82k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  |  |  257|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  |  |  258|  7.82k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  |  |  259|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  |  |  260|  7.82k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  |  |  261|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  |  |  262|  7.82k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  ------------------
  274|  7.82k|    assign_itx_all_fn84( 8,  8, );
  ------------------
  |  |  254|  7.82k|    assign_itx_all_fn16(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  231|  7.82k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  226|  7.82k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  227|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  |  |  228|  7.82k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  232|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  |  |  233|  7.82k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  |  |  234|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  |  |  235|  7.82k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  |  |  236|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  |  |  237|  7.82k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  |  |  238|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  |  |  239|  7.82k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  |  |  240|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  |  |  241|  7.82k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  |  |  242|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  |  |  243|  7.82k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  |  |  244|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  |  |  245|  7.82k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  |  |  246|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  |  |  247|  7.82k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  |  |  248|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  |  |  249|  7.82k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  |  |  250|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  |  |  251|  7.82k|        inv_txfm_add_identity_dct_##w##x##h##_c
  |  |  ------------------
  |  |  255|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  |  |  256|  7.82k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  |  |  257|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  |  |  258|  7.82k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  |  |  259|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  |  |  260|  7.82k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  |  |  261|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  |  |  262|  7.82k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  ------------------
  275|  7.82k|    assign_itx_all_fn84( 8, 16, R);
  ------------------
  |  |  254|  7.82k|    assign_itx_all_fn16(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  231|  7.82k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  226|  7.82k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  227|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  |  |  228|  7.82k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  232|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  |  |  233|  7.82k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  |  |  234|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  |  |  235|  7.82k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  |  |  236|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  |  |  237|  7.82k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  |  |  238|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  |  |  239|  7.82k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  |  |  240|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  |  |  241|  7.82k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  |  |  242|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  |  |  243|  7.82k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  |  |  244|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  |  |  245|  7.82k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  |  |  246|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  |  |  247|  7.82k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  |  |  248|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  |  |  249|  7.82k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  |  |  250|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  |  |  251|  7.82k|        inv_txfm_add_identity_dct_##w##x##h##_c
  |  |  ------------------
  |  |  255|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  |  |  256|  7.82k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  |  |  257|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  |  |  258|  7.82k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  |  |  259|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  |  |  260|  7.82k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  |  |  261|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  |  |  262|  7.82k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  ------------------
  276|  7.82k|    assign_itx_all_fn32( 8, 32, R);
  ------------------
  |  |  226|  7.82k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  ------------------
  |  |  227|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  228|  7.82k|        inv_txfm_add_identity_identity_##w##x##h##_c
  ------------------
  277|  7.82k|    assign_itx_all_fn84(16,  4, R);
  ------------------
  |  |  254|  7.82k|    assign_itx_all_fn16(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  231|  7.82k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  226|  7.82k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  227|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  |  |  228|  7.82k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  232|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  |  |  233|  7.82k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  |  |  234|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  |  |  235|  7.82k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  |  |  236|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  |  |  237|  7.82k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  |  |  238|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  |  |  239|  7.82k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  |  |  240|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  |  |  241|  7.82k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  |  |  242|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  |  |  243|  7.82k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  |  |  244|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  |  |  245|  7.82k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  |  |  246|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  |  |  247|  7.82k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  |  |  248|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  |  |  249|  7.82k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  |  |  250|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  |  |  251|  7.82k|        inv_txfm_add_identity_dct_##w##x##h##_c
  |  |  ------------------
  |  |  255|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  |  |  256|  7.82k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  |  |  257|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  |  |  258|  7.82k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  |  |  259|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  |  |  260|  7.82k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  |  |  261|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  |  |  262|  7.82k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  ------------------
  278|  7.82k|    assign_itx_all_fn84(16,  8, R);
  ------------------
  |  |  254|  7.82k|    assign_itx_all_fn16(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  231|  7.82k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  226|  7.82k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  227|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  |  |  228|  7.82k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  232|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  |  |  233|  7.82k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  |  |  234|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  |  |  235|  7.82k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  |  |  236|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  |  |  237|  7.82k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  |  |  238|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  |  |  239|  7.82k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  |  |  240|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  |  |  241|  7.82k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  |  |  242|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  |  |  243|  7.82k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  |  |  244|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  |  |  245|  7.82k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  |  |  246|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  |  |  247|  7.82k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  |  |  248|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  |  |  249|  7.82k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  |  |  250|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  |  |  251|  7.82k|        inv_txfm_add_identity_dct_##w##x##h##_c
  |  |  ------------------
  |  |  255|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  |  |  256|  7.82k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  |  |  257|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  |  |  258|  7.82k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  |  |  259|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  |  |  260|  7.82k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  |  |  261|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  |  |  262|  7.82k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  ------------------
  279|  7.82k|    assign_itx_all_fn16(16, 16, );
  ------------------
  |  |  231|  7.82k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  226|  7.82k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  227|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  228|  7.82k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  ------------------
  |  |  232|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  233|  7.82k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  234|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  235|  7.82k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  236|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  237|  7.82k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  238|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  239|  7.82k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  240|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  241|  7.82k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  242|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  243|  7.82k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  244|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  245|  7.82k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  246|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  247|  7.82k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  248|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  249|  7.82k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  250|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  251|  7.82k|        inv_txfm_add_identity_dct_##w##x##h##_c
  ------------------
  280|  7.82k|    assign_itx_all_fn32(16, 32, R);
  ------------------
  |  |  226|  7.82k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  ------------------
  |  |  227|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  228|  7.82k|        inv_txfm_add_identity_identity_##w##x##h##_c
  ------------------
  281|  7.82k|    assign_itx_all_fn64(16, 64, R);
  ------------------
  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  ------------------
  282|  7.82k|    assign_itx_all_fn32(32,  8, R);
  ------------------
  |  |  226|  7.82k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  ------------------
  |  |  227|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  228|  7.82k|        inv_txfm_add_identity_identity_##w##x##h##_c
  ------------------
  283|  7.82k|    assign_itx_all_fn32(32, 16, R);
  ------------------
  |  |  226|  7.82k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  ------------------
  |  |  227|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  228|  7.82k|        inv_txfm_add_identity_identity_##w##x##h##_c
  ------------------
  284|  7.82k|    assign_itx_all_fn32(32, 32, );
  ------------------
  |  |  226|  7.82k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  ------------------
  |  |  227|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  228|  7.82k|        inv_txfm_add_identity_identity_##w##x##h##_c
  ------------------
  285|  7.82k|    assign_itx_all_fn64(32, 64, R);
  ------------------
  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  ------------------
  286|  7.82k|    assign_itx_all_fn64(64, 16, R);
  ------------------
  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  ------------------
  287|  7.82k|    assign_itx_all_fn64(64, 32, R);
  ------------------
  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  ------------------
  288|  7.82k|    assign_itx_all_fn64(64, 64, );
  ------------------
  |  |  222|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  223|  7.82k|        inv_txfm_add_dct_dct_##w##x##h##_c
  ------------------
  289|       |
  290|  7.82k|    int all_simd = 0;
  291|  7.82k|#if HAVE_ASM
  292|       |#if ARCH_AARCH64 || ARCH_ARM
  293|       |    itx_dsp_init_arm(c, bpc, &all_simd);
  294|       |#endif
  295|       |#if ARCH_LOONGARCH64
  296|       |    itx_dsp_init_loongarch(c, bpc);
  297|       |#endif
  298|       |#if ARCH_PPC64LE
  299|       |    itx_dsp_init_ppc(c, bpc);
  300|       |#endif
  301|       |#if ARCH_RISCV
  302|       |    itx_dsp_init_riscv(c, bpc);
  303|       |#endif
  304|  7.82k|#if ARCH_X86
  305|  7.82k|    itx_dsp_init_x86(c, bpc, &all_simd);
  306|  7.82k|#endif
  307|  7.82k|#endif
  308|       |
  309|  7.82k|    if (!all_simd)
  ------------------
  |  Branch (309:9): [True: 0, False: 7.82k]
  ------------------
  310|      0|        dav1d_init_last_nonzero_col_from_eob_tables();
  311|  7.82k|}
itx_tmpl.c:inv_txfm_add_c:
   47|  54.1k|{
   48|  54.1k|    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
   49|  54.1k|    const int w = 4 * t_dim->w, h = 4 * t_dim->h;
   50|  54.1k|    const int has_dconly = txtp == DCT_DCT;
   51|  54.1k|    assert(w >= 4 && w <= 64);
  ------------------
  |  |  140|   108k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 54.1k, False: 0]
  |  |  |  Branch (140:30): [True: 54.1k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 54.1k]
  |  |  ------------------
  ------------------
   52|  54.1k|    assert(h >= 4 && h <= 64);
  ------------------
  |  |  140|   108k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 54.1k, False: 1]
  |  |  |  Branch (140:30): [True: 54.1k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 54.1k]
  |  |  ------------------
  ------------------
   53|  54.1k|    assert(eob >= 0);
  ------------------
  |  |  140|  54.1k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 54.1k]
  |  |  |  Branch (140:68): [Folded, False: 54.1k]
  |  |  ------------------
  ------------------
   54|       |
   55|  54.1k|    const int is_rect2 = w * 2 == h || h * 2 == w;
  ------------------
  |  Branch (55:26): [True: 23.9k, False: 30.1k]
  |  Branch (55:40): [True: 12.4k, False: 17.7k]
  ------------------
   56|  54.1k|    const int rnd = (1 << shift) >> 1;
   57|       |
   58|  54.1k|    if (eob < has_dconly) {
  ------------------
  |  Branch (58:9): [True: 13.6k, False: 40.5k]
  ------------------
   59|  13.6k|        int dc = coeff[0];
   60|  13.6k|        coeff[0] = 0;
   61|  13.6k|        if (is_rect2)
  ------------------
  |  Branch (61:13): [True: 7.46k, False: 6.19k]
  ------------------
   62|  7.46k|            dc = (dc * 181 + 128) >> 8;
   63|  13.6k|        dc = (dc * 181 + 128) >> 8;
   64|  13.6k|        dc = (dc + rnd) >> shift;
   65|  13.6k|        dc = (dc * 181 + 128 + 2048) >> 12;
   66|   563k|        for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
  ------------------
  |  |   53|   550k|#define PXSTRIDE(x) (x)
  ------------------
  |  Branch (66:25): [True: 550k, False: 13.6k]
  ------------------
   67|  21.3M|            for (int x = 0; x < w; x++)
  ------------------
  |  Branch (67:29): [True: 20.7M, False: 550k]
  ------------------
   68|  20.7M|                dst[x] = iclip_pixel(dst[x] + dc);
  ------------------
  |  |   49|  20.7M|#define iclip_pixel iclip_u8
  ------------------
   69|  13.6k|        return;
   70|  13.6k|    }
   71|       |
   72|  40.5k|    const uint8_t *const txtps = dav1d_tx1d_types[txtp];
   73|  40.5k|    const itx_1d_fn first_1d_fn = dav1d_tx1d_fns[t_dim->lw][txtps[0]];
   74|  40.5k|    const itx_1d_fn second_1d_fn = dav1d_tx1d_fns[t_dim->lh][txtps[1]];
   75|  40.5k|    const int sh = imin(h, 32), sw = imin(w, 32);
   76|  40.5k|#if BITDEPTH == 8
   77|  40.5k|    const int row_clip_min = INT16_MIN;
   78|  40.5k|    const int col_clip_min = INT16_MIN;
   79|       |#else
   80|       |    const int row_clip_min = (int) ((unsigned) ~bitdepth_max << 7);
   81|       |    const int col_clip_min = (int) ((unsigned) ~bitdepth_max << 5);
   82|       |#endif
   83|  40.5k|    const int row_clip_max = ~row_clip_min;
   84|  40.5k|    const int col_clip_max = ~col_clip_min;
   85|       |
   86|  40.5k|    int32_t tmp[64 * 64], *c = tmp;
   87|  40.5k|    int last_nonzero_col; // in first 1d itx
   88|  40.5k|    if (txtps[1] == IDENTITY && txtps[0] != IDENTITY) {
  ------------------
  |  Branch (88:9): [True: 0, False: 40.5k]
  |  Branch (88:33): [True: 0, False: 0]
  ------------------
   89|      0|        last_nonzero_col = imin(sh - 1, eob);
   90|  40.5k|    } else if (txtps[0] == IDENTITY && txtps[1] != IDENTITY) {
  ------------------
  |  Branch (90:16): [True: 0, False: 40.5k]
  |  Branch (90:40): [True: 0, False: 0]
  ------------------
   91|      0|        last_nonzero_col = eob >> (t_dim->lw + 2);
   92|  40.5k|    } else {
   93|  40.5k|        last_nonzero_col = dav1d_last_nonzero_col_from_eob[tx][eob];
   94|  40.5k|    }
   95|  40.5k|    assert(last_nonzero_col < sh);
  ------------------
  |  |  140|  40.5k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 40.5k]
  |  |  |  Branch (140:68): [Folded, False: 40.5k]
  |  |  ------------------
  ------------------
   96|   406k|    for (int y = 0; y <= last_nonzero_col; y++, c += w) {
  ------------------
  |  Branch (96:21): [True: 365k, False: 40.5k]
  ------------------
   97|   365k|        if (is_rect2)
  ------------------
  |  Branch (97:13): [True: 240k, False: 125k]
  ------------------
   98|  6.45M|            for (int x = 0; x < sw; x++)
  ------------------
  |  Branch (98:29): [True: 6.21M, False: 240k]
  ------------------
   99|  6.21M|                c[x] = (coeff[y + x * sh] * 181 + 128) >> 8;
  100|   125k|        else
  101|  4.10M|            for (int x = 0; x < sw; x++)
  ------------------
  |  Branch (101:29): [True: 3.98M, False: 125k]
  ------------------
  102|  3.98M|                c[x] = coeff[y + x * sh];
  103|   365k|        first_1d_fn(c, 1, row_clip_min, row_clip_max);
  104|   365k|    }
  105|  40.5k|    if (last_nonzero_col + 1 < sh)
  ------------------
  |  Branch (105:9): [True: 36.8k, False: 3.62k]
  ------------------
  106|  36.8k|        memset(c, 0, sizeof(*c) * (sh - last_nonzero_col - 1) * w);
  107|       |
  108|  40.5k|    memset(coeff, 0, sizeof(*coeff) * sw * sh);
  109|  43.2M|    for (int i = 0; i < w * sh; i++)
  ------------------
  |  Branch (109:21): [True: 43.2M, False: 40.5k]
  ------------------
  110|  43.2M|        tmp[i] = iclip((tmp[i] + rnd) >> shift, col_clip_min, col_clip_max);
  111|       |
  112|  1.49M|    for (int x = 0; x < w; x++)
  ------------------
  |  Branch (112:21): [True: 1.45M, False: 40.5k]
  ------------------
  113|  1.45M|        second_1d_fn(&tmp[x], w, col_clip_min, col_clip_max);
  114|       |
  115|  40.5k|    c = tmp;
  116|  1.73M|    for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
  ------------------
  |  |   53|  1.69M|#define PXSTRIDE(x) (x)
  ------------------
  |  Branch (116:21): [True: 1.69M, False: 40.5k]
  ------------------
  117|  62.7M|        for (int x = 0; x < w; x++)
  ------------------
  |  Branch (117:25): [True: 61.0M, False: 1.69M]
  ------------------
  118|  61.0M|            dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4));
  ------------------
  |  |   49|  61.0M|#define iclip_pixel iclip_u8
  ------------------
  119|  40.5k|}
itx_tmpl.c:inv_txfm_add_dct_dct_16x32_c:
  127|  14.6k|                                               HIGHBD_DECL_SUFFIX) \
  128|  14.6k|{ \
  129|  14.6k|    inv_txfm_add_c(dst, stride, coeff, eob, pfx##TX_##w##X##h, shift, type \
  130|  14.6k|                   HIGHBD_TAIL_SUFFIX); \
  131|  14.6k|}
itx_tmpl.c:inv_txfm_add_dct_dct_16x64_c:
  127|    260|                                               HIGHBD_DECL_SUFFIX) \
  128|    260|{ \
  129|    260|    inv_txfm_add_c(dst, stride, coeff, eob, pfx##TX_##w##X##h, shift, type \
  130|    260|                   HIGHBD_TAIL_SUFFIX); \
  131|    260|}
itx_tmpl.c:inv_txfm_add_dct_dct_32x16_c:
  127|  8.30k|                                               HIGHBD_DECL_SUFFIX) \
  128|  8.30k|{ \
  129|  8.30k|    inv_txfm_add_c(dst, stride, coeff, eob, pfx##TX_##w##X##h, shift, type \
  130|  8.30k|                   HIGHBD_TAIL_SUFFIX); \
  131|  8.30k|}
itx_tmpl.c:inv_txfm_add_dct_dct_32x32_c:
  127|  5.93k|                                               HIGHBD_DECL_SUFFIX) \
  128|  5.93k|{ \
  129|  5.93k|    inv_txfm_add_c(dst, stride, coeff, eob, pfx##TX_##w##X##h, shift, type \
  130|  5.93k|                   HIGHBD_TAIL_SUFFIX); \
  131|  5.93k|}
itx_tmpl.c:inv_txfm_add_dct_dct_32x64_c:
  127|  9.33k|                                               HIGHBD_DECL_SUFFIX) \
  128|  9.33k|{ \
  129|  9.33k|    inv_txfm_add_c(dst, stride, coeff, eob, pfx##TX_##w##X##h, shift, type \
  130|  9.33k|                   HIGHBD_TAIL_SUFFIX); \
  131|  9.33k|}
itx_tmpl.c:inv_txfm_add_dct_dct_64x16_c:
  127|    228|                                               HIGHBD_DECL_SUFFIX) \
  128|    228|{ \
  129|    228|    inv_txfm_add_c(dst, stride, coeff, eob, pfx##TX_##w##X##h, shift, type \
  130|    228|                   HIGHBD_TAIL_SUFFIX); \
  131|    228|}
itx_tmpl.c:inv_txfm_add_dct_dct_64x32_c:
  127|  4.14k|                                               HIGHBD_DECL_SUFFIX) \
  128|  4.14k|{ \
  129|  4.14k|    inv_txfm_add_c(dst, stride, coeff, eob, pfx##TX_##w##X##h, shift, type \
  130|  4.14k|                   HIGHBD_TAIL_SUFFIX); \
  131|  4.14k|}
itx_tmpl.c:inv_txfm_add_dct_dct_64x64_c:
  127|  11.2k|                                               HIGHBD_DECL_SUFFIX) \
  128|  11.2k|{ \
  129|  11.2k|    inv_txfm_add_c(dst, stride, coeff, eob, pfx##TX_##w##X##h, shift, type \
  130|  11.2k|                   HIGHBD_TAIL_SUFFIX); \
  131|  11.2k|}
dav1d_itx_dsp_init_16bpc:
  220|  7.63k|COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
  221|  7.63k|#define assign_itx_all_fn64(w, h, pfx) \
  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  224|       |
  225|  7.63k|#define assign_itx_all_fn32(w, h, pfx) \
  226|  7.63k|    assign_itx_all_fn64(w, h, pfx); \
  227|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  228|  7.63k|        inv_txfm_add_identity_identity_##w##x##h##_c
  229|       |
  230|  7.63k|#define assign_itx_all_fn16(w, h, pfx) \
  231|  7.63k|    assign_itx_all_fn32(w, h, pfx); \
  232|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  233|  7.63k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  234|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  235|  7.63k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  236|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  237|  7.63k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  238|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  239|  7.63k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  240|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  241|  7.63k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  242|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  243|  7.63k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  244|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  245|  7.63k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  246|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  247|  7.63k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  248|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  249|  7.63k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  250|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  251|  7.63k|        inv_txfm_add_identity_dct_##w##x##h##_c
  252|       |
  253|  7.63k|#define assign_itx_all_fn84(w, h, pfx) \
  254|  7.63k|    assign_itx_all_fn16(w, h, pfx); \
  255|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  256|  7.63k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  257|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  258|  7.63k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  259|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  260|  7.63k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  261|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  262|  7.63k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  263|  7.63k|
  264|  7.63k|#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
  265|  7.63k|  ARCH_AARCH64 || \
  266|  7.63k|  (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
  267|  7.63k|))
  268|  7.63k|    c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
  269|  7.63k|#endif
  270|  7.63k|    assign_itx_all_fn84( 4,  4, );
  ------------------
  |  |  254|  7.63k|    assign_itx_all_fn16(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  231|  7.63k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  226|  7.63k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  227|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  |  |  228|  7.63k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  232|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  |  |  233|  7.63k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  |  |  234|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  |  |  235|  7.63k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  |  |  236|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  |  |  237|  7.63k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  |  |  238|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  |  |  239|  7.63k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  |  |  240|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  |  |  241|  7.63k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  |  |  242|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  |  |  243|  7.63k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  |  |  244|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  |  |  245|  7.63k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  |  |  246|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  |  |  247|  7.63k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  |  |  248|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  |  |  249|  7.63k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  |  |  250|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  |  |  251|  7.63k|        inv_txfm_add_identity_dct_##w##x##h##_c
  |  |  ------------------
  |  |  255|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  |  |  256|  7.63k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  |  |  257|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  |  |  258|  7.63k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  |  |  259|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  |  |  260|  7.63k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  |  |  261|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  |  |  262|  7.63k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  ------------------
  271|  7.63k|    assign_itx_all_fn84( 4,  8, R);
  ------------------
  |  |  254|  7.63k|    assign_itx_all_fn16(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  231|  7.63k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  226|  7.63k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  227|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  |  |  228|  7.63k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  232|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  |  |  233|  7.63k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  |  |  234|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  |  |  235|  7.63k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  |  |  236|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  |  |  237|  7.63k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  |  |  238|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  |  |  239|  7.63k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  |  |  240|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  |  |  241|  7.63k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  |  |  242|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  |  |  243|  7.63k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  |  |  244|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  |  |  245|  7.63k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  |  |  246|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  |  |  247|  7.63k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  |  |  248|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  |  |  249|  7.63k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  |  |  250|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  |  |  251|  7.63k|        inv_txfm_add_identity_dct_##w##x##h##_c
  |  |  ------------------
  |  |  255|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  |  |  256|  7.63k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  |  |  257|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  |  |  258|  7.63k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  |  |  259|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  |  |  260|  7.63k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  |  |  261|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  |  |  262|  7.63k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  ------------------
  272|  7.63k|    assign_itx_all_fn84( 4, 16, R);
  ------------------
  |  |  254|  7.63k|    assign_itx_all_fn16(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  231|  7.63k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  226|  7.63k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  227|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  |  |  228|  7.63k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  232|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  |  |  233|  7.63k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  |  |  234|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  |  |  235|  7.63k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  |  |  236|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  |  |  237|  7.63k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  |  |  238|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  |  |  239|  7.63k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  |  |  240|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  |  |  241|  7.63k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  |  |  242|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  |  |  243|  7.63k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  |  |  244|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  |  |  245|  7.63k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  |  |  246|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  |  |  247|  7.63k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  |  |  248|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  |  |  249|  7.63k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  |  |  250|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  |  |  251|  7.63k|        inv_txfm_add_identity_dct_##w##x##h##_c
  |  |  ------------------
  |  |  255|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  |  |  256|  7.63k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  |  |  257|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  |  |  258|  7.63k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  |  |  259|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  |  |  260|  7.63k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  |  |  261|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  |  |  262|  7.63k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  ------------------
  273|  7.63k|    assign_itx_all_fn84( 8,  4, R);
  ------------------
  |  |  254|  7.63k|    assign_itx_all_fn16(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  231|  7.63k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  226|  7.63k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  227|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  |  |  228|  7.63k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  232|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  |  |  233|  7.63k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  |  |  234|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  |  |  235|  7.63k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  |  |  236|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  |  |  237|  7.63k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  |  |  238|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  |  |  239|  7.63k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  |  |  240|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  |  |  241|  7.63k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  |  |  242|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  |  |  243|  7.63k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  |  |  244|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  |  |  245|  7.63k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  |  |  246|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  |  |  247|  7.63k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  |  |  248|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  |  |  249|  7.63k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  |  |  250|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  |  |  251|  7.63k|        inv_txfm_add_identity_dct_##w##x##h##_c
  |  |  ------------------
  |  |  255|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  |  |  256|  7.63k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  |  |  257|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  |  |  258|  7.63k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  |  |  259|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  |  |  260|  7.63k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  |  |  261|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  |  |  262|  7.63k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  ------------------
  274|  7.63k|    assign_itx_all_fn84( 8,  8, );
  ------------------
  |  |  254|  7.63k|    assign_itx_all_fn16(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  231|  7.63k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  226|  7.63k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  227|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  |  |  228|  7.63k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  232|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  |  |  233|  7.63k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  |  |  234|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  |  |  235|  7.63k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  |  |  236|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  |  |  237|  7.63k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  |  |  238|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  |  |  239|  7.63k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  |  |  240|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  |  |  241|  7.63k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  |  |  242|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  |  |  243|  7.63k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  |  |  244|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  |  |  245|  7.63k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  |  |  246|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  |  |  247|  7.63k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  |  |  248|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  |  |  249|  7.63k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  |  |  250|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  |  |  251|  7.63k|        inv_txfm_add_identity_dct_##w##x##h##_c
  |  |  ------------------
  |  |  255|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  |  |  256|  7.63k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  |  |  257|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  |  |  258|  7.63k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  |  |  259|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  |  |  260|  7.63k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  |  |  261|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  |  |  262|  7.63k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  ------------------
  275|  7.63k|    assign_itx_all_fn84( 8, 16, R);
  ------------------
  |  |  254|  7.63k|    assign_itx_all_fn16(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  231|  7.63k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  226|  7.63k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  227|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  |  |  228|  7.63k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  232|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  |  |  233|  7.63k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  |  |  234|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  |  |  235|  7.63k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  |  |  236|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  |  |  237|  7.63k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  |  |  238|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  |  |  239|  7.63k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  |  |  240|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  |  |  241|  7.63k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  |  |  242|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  |  |  243|  7.63k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  |  |  244|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  |  |  245|  7.63k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  |  |  246|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  |  |  247|  7.63k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  |  |  248|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  |  |  249|  7.63k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  |  |  250|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  |  |  251|  7.63k|        inv_txfm_add_identity_dct_##w##x##h##_c
  |  |  ------------------
  |  |  255|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  |  |  256|  7.63k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  |  |  257|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  |  |  258|  7.63k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  |  |  259|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  |  |  260|  7.63k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  |  |  261|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  |  |  262|  7.63k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  ------------------
  276|  7.63k|    assign_itx_all_fn32( 8, 32, R);
  ------------------
  |  |  226|  7.63k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  ------------------
  |  |  227|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  228|  7.63k|        inv_txfm_add_identity_identity_##w##x##h##_c
  ------------------
  277|  7.63k|    assign_itx_all_fn84(16,  4, R);
  ------------------
  |  |  254|  7.63k|    assign_itx_all_fn16(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  231|  7.63k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  226|  7.63k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  227|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  |  |  228|  7.63k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  232|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  |  |  233|  7.63k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  |  |  234|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  |  |  235|  7.63k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  |  |  236|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  |  |  237|  7.63k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  |  |  238|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  |  |  239|  7.63k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  |  |  240|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  |  |  241|  7.63k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  |  |  242|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  |  |  243|  7.63k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  |  |  244|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  |  |  245|  7.63k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  |  |  246|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  |  |  247|  7.63k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  |  |  248|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  |  |  249|  7.63k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  |  |  250|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  |  |  251|  7.63k|        inv_txfm_add_identity_dct_##w##x##h##_c
  |  |  ------------------
  |  |  255|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  |  |  256|  7.63k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  |  |  257|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  |  |  258|  7.63k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  |  |  259|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  |  |  260|  7.63k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  |  |  261|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  |  |  262|  7.63k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  ------------------
  278|  7.63k|    assign_itx_all_fn84(16,  8, R);
  ------------------
  |  |  254|  7.63k|    assign_itx_all_fn16(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  231|  7.63k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  226|  7.63k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  227|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  |  |  228|  7.63k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  232|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  |  |  233|  7.63k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  |  |  234|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  |  |  235|  7.63k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  |  |  236|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  |  |  237|  7.63k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  |  |  238|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  |  |  239|  7.63k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  |  |  240|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  |  |  241|  7.63k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  |  |  242|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  |  |  243|  7.63k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  |  |  244|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  |  |  245|  7.63k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  |  |  246|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  |  |  247|  7.63k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  |  |  248|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  |  |  249|  7.63k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  |  |  250|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  |  |  251|  7.63k|        inv_txfm_add_identity_dct_##w##x##h##_c
  |  |  ------------------
  |  |  255|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
  |  |  256|  7.63k|        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
  |  |  257|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
  |  |  258|  7.63k|        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
  |  |  259|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
  |  |  260|  7.63k|        inv_txfm_add_adst_identity_##w##x##h##_c; \
  |  |  261|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
  |  |  262|  7.63k|        inv_txfm_add_identity_adst_##w##x##h##_c; \
  ------------------
  279|  7.63k|    assign_itx_all_fn16(16, 16, );
  ------------------
  |  |  231|  7.63k|    assign_itx_all_fn32(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  226|  7.63k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  |  |  ------------------
  |  |  |  |  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  |  |  ------------------
  |  |  |  |  227|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  |  |  228|  7.63k|        inv_txfm_add_identity_identity_##w##x##h##_c
  |  |  ------------------
  |  |  232|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
  |  |  233|  7.63k|        inv_txfm_add_adst_dct_##w##x##h##_c; \
  |  |  234|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
  |  |  235|  7.63k|        inv_txfm_add_dct_adst_##w##x##h##_c; \
  |  |  236|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
  |  |  237|  7.63k|        inv_txfm_add_adst_adst_##w##x##h##_c; \
  |  |  238|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
  |  |  239|  7.63k|        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
  |  |  240|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
  |  |  241|  7.63k|        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
  |  |  242|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
  |  |  243|  7.63k|        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
  |  |  244|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
  |  |  245|  7.63k|        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
  |  |  246|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
  |  |  247|  7.63k|        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
  |  |  248|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
  |  |  249|  7.63k|        inv_txfm_add_dct_identity_##w##x##h##_c; \
  |  |  250|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
  |  |  251|  7.63k|        inv_txfm_add_identity_dct_##w##x##h##_c
  ------------------
  280|  7.63k|    assign_itx_all_fn32(16, 32, R);
  ------------------
  |  |  226|  7.63k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  ------------------
  |  |  227|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  228|  7.63k|        inv_txfm_add_identity_identity_##w##x##h##_c
  ------------------
  281|  7.63k|    assign_itx_all_fn64(16, 64, R);
  ------------------
  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  ------------------
  282|  7.63k|    assign_itx_all_fn32(32,  8, R);
  ------------------
  |  |  226|  7.63k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  ------------------
  |  |  227|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  228|  7.63k|        inv_txfm_add_identity_identity_##w##x##h##_c
  ------------------
  283|  7.63k|    assign_itx_all_fn32(32, 16, R);
  ------------------
  |  |  226|  7.63k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  ------------------
  |  |  227|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  228|  7.63k|        inv_txfm_add_identity_identity_##w##x##h##_c
  ------------------
  284|  7.63k|    assign_itx_all_fn32(32, 32, );
  ------------------
  |  |  226|  7.63k|    assign_itx_all_fn64(w, h, pfx); \
  |  |  ------------------
  |  |  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  |  |  ------------------
  |  |  227|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
  |  |  228|  7.63k|        inv_txfm_add_identity_identity_##w##x##h##_c
  ------------------
  285|  7.63k|    assign_itx_all_fn64(32, 64, R);
  ------------------
  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  ------------------
  286|  7.63k|    assign_itx_all_fn64(64, 16, R);
  ------------------
  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  ------------------
  287|  7.63k|    assign_itx_all_fn64(64, 32, R);
  ------------------
  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  ------------------
  288|  7.63k|    assign_itx_all_fn64(64, 64, );
  ------------------
  |  |  222|  7.63k|    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
  |  |  223|  7.63k|        inv_txfm_add_dct_dct_##w##x##h##_c
  ------------------
  289|       |
  290|  7.63k|    int all_simd = 0;
  291|  7.63k|#if HAVE_ASM
  292|       |#if ARCH_AARCH64 || ARCH_ARM
  293|       |    itx_dsp_init_arm(c, bpc, &all_simd);
  294|       |#endif
  295|       |#if ARCH_LOONGARCH64
  296|       |    itx_dsp_init_loongarch(c, bpc);
  297|       |#endif
  298|       |#if ARCH_PPC64LE
  299|       |    itx_dsp_init_ppc(c, bpc);
  300|       |#endif
  301|       |#if ARCH_RISCV
  302|       |    itx_dsp_init_riscv(c, bpc);
  303|       |#endif
  304|  7.63k|#if ARCH_X86
  305|  7.63k|    itx_dsp_init_x86(c, bpc, &all_simd);
  306|  7.63k|#endif
  307|  7.63k|#endif
  308|       |
  309|  7.63k|    if (!all_simd)
  ------------------
  |  Branch (309:9): [True: 2.94k, False: 4.68k]
  ------------------
  310|  2.94k|        dav1d_init_last_nonzero_col_from_eob_tables();
  311|  7.63k|}

dav1d_copy_lpf_8bpc:
  106|  31.4k|{
  107|  31.4k|    const int have_tt = f->c->n_tc > 1;
  108|  31.4k|    const int resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
  109|  31.4k|    const int offset = 8 * !!sby;
  110|  31.4k|    const ptrdiff_t *const src_stride = f->cur.stride;
  111|  31.4k|    const ptrdiff_t *const lr_stride = f->sr_cur.p.stride;
  112|  31.4k|    const int tt_off = have_tt * sby * (4 << f->seq_hdr->sb128);
  113|  31.4k|    pixel *const dst[3] = {
  114|  31.4k|        f->lf.lr_lpf_line[0] + tt_off * PXSTRIDE(lr_stride[0]),
  ------------------
  |  |   53|  31.4k|#define PXSTRIDE(x) (x)
  ------------------
  115|  31.4k|        f->lf.lr_lpf_line[1] + tt_off * PXSTRIDE(lr_stride[1]),
  ------------------
  |  |   53|  31.4k|#define PXSTRIDE(x) (x)
  ------------------
  116|  31.4k|        f->lf.lr_lpf_line[2] + tt_off * PXSTRIDE(lr_stride[1])
  ------------------
  |  |   53|  31.4k|#define PXSTRIDE(x) (x)
  ------------------
  117|  31.4k|    };
  118|       |
  119|       |    // TODO Also check block level restore type to reduce copying.
  120|  31.4k|    const int restore_planes = f->lf.restore_planes;
  121|       |
  122|  31.4k|    if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_Y) {
  ------------------
  |  Branch (122:9): [True: 28.0k, False: 3.48k]
  |  Branch (122:29): [True: 2.80k, False: 677]
  ------------------
  123|  30.8k|        const int h = f->cur.p.h;
  124|  30.8k|        const int w = f->bw << 2;
  125|  30.8k|        const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1);
  126|  30.8k|        const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;
  127|  30.8k|        if (restore_planes & LR_RESTORE_Y || !resize)
  ------------------
  |  Branch (127:13): [True: 6.57k, False: 24.2k]
  |  Branch (127:46): [True: 23.5k, False: 728]
  ------------------
  128|  30.0k|            backup_lpf(f, dst[0], lr_stride[0],
  129|  30.0k|                       src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
  ------------------
  |  |   53|  30.0k|#define PXSTRIDE(x) (x)
  ------------------
  130|  30.0k|                       0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, 1);
  131|  30.8k|        if (have_tt && resize) {
  ------------------
  |  Branch (131:13): [True: 29.0k, False: 1.79k]
  |  Branch (131:24): [True: 801, False: 28.2k]
  ------------------
  132|    801|            const ptrdiff_t cdef_off_y = sby * 4 * PXSTRIDE(src_stride[0]);
  ------------------
  |  |   53|    801|#define PXSTRIDE(x) (x)
  ------------------
  133|    801|            backup_lpf(f, f->lf.cdef_lpf_line[0] + cdef_off_y, src_stride[0],
  134|    801|                       src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
  ------------------
  |  |   53|    801|#define PXSTRIDE(x) (x)
  ------------------
  135|    801|                       0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, 0);
  136|    801|        }
  137|  30.8k|    }
  138|  31.4k|    if ((f->seq_hdr->cdef || restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) &&
  ------------------
  |  Branch (138:10): [True: 28.0k, False: 3.48k]
  |  Branch (138:30): [True: 2.56k, False: 922]
  ------------------
  139|  30.5k|        f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400)
  ------------------
  |  Branch (139:9): [True: 20.3k, False: 10.2k]
  ------------------
  140|  20.3k|    {
  141|  20.3k|        const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
  142|  20.3k|        const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
  143|  20.3k|        const int h = (f->cur.p.h + ss_ver) >> ss_ver;
  144|  20.3k|        const int w = f->bw << (2 - ss_hor);
  145|  20.3k|        const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1);
  146|  20.3k|        const int offset_uv = offset >> ss_ver;
  147|  20.3k|        const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
  148|  20.3k|        const ptrdiff_t cdef_off_uv = sby * 4 * PXSTRIDE(src_stride[1]);
  ------------------
  |  |   53|  20.3k|#define PXSTRIDE(x) (x)
  ------------------
  149|  20.3k|        if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_U) {
  ------------------
  |  Branch (149:13): [True: 17.7k, False: 2.56k]
  |  Branch (149:33): [True: 1.74k, False: 813]
  ------------------
  150|  19.5k|            if (restore_planes & LR_RESTORE_U || !resize)
  ------------------
  |  Branch (150:17): [True: 4.17k, False: 15.3k]
  |  Branch (150:50): [True: 14.6k, False: 661]
  ------------------
  151|  18.8k|                backup_lpf(f, dst[1], lr_stride[1],
  152|  18.8k|                           src[1] - offset_uv * PXSTRIDE(src_stride[1]),
  ------------------
  |  |   53|  18.8k|#define PXSTRIDE(x) (x)
  ------------------
  153|  18.8k|                           src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe,
  154|  18.8k|                           row_h, w, h, ss_hor, 1);
  155|  19.5k|            if (have_tt && resize)
  ------------------
  |  Branch (155:17): [True: 18.2k, False: 1.22k]
  |  Branch (155:28): [True: 725, False: 17.5k]
  ------------------
  156|    725|                backup_lpf(f, f->lf.cdef_lpf_line[1] + cdef_off_uv, src_stride[1],
  157|    725|                           src[1] - offset_uv * PXSTRIDE(src_stride[1]),
  ------------------
  |  |   53|    725|#define PXSTRIDE(x) (x)
  ------------------
  158|    725|                           src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe,
  159|    725|                           row_h, w, h, ss_hor, 0);
  160|  19.5k|        }
  161|  20.3k|        if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_V) {
  ------------------
  |  Branch (161:13): [True: 17.7k, False: 2.56k]
  |  Branch (161:33): [True: 1.70k, False: 859]
  ------------------
  162|  19.4k|            if (restore_planes & LR_RESTORE_V || !resize)
  ------------------
  |  Branch (162:17): [True: 4.05k, False: 15.4k]
  |  Branch (162:50): [True: 14.7k, False: 635]
  ------------------
  163|  18.8k|                backup_lpf(f, dst[2], lr_stride[1],
  164|  18.8k|                           src[2] - offset_uv * PXSTRIDE(src_stride[1]),
  ------------------
  |  |   53|  18.8k|#define PXSTRIDE(x) (x)
  ------------------
  165|  18.8k|                           src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe,
  166|  18.8k|                           row_h, w, h, ss_hor, 1);
  167|  19.4k|            if (have_tt && resize)
  ------------------
  |  Branch (167:17): [True: 18.2k, False: 1.21k]
  |  Branch (167:28): [True: 724, False: 17.5k]
  ------------------
  168|    724|                backup_lpf(f, f->lf.cdef_lpf_line[2] + cdef_off_uv, src_stride[1],
  169|    724|                           src[2] - offset_uv * PXSTRIDE(src_stride[1]),
  ------------------
  |  |   53|    724|#define PXSTRIDE(x) (x)
  ------------------
  170|    724|                           src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe,
  171|    724|                           row_h, w, h, ss_hor, 0);
  172|  19.4k|        }
  173|  20.3k|    }
  174|  31.4k|}
dav1d_loopfilter_sbrow_cols_8bpc:
  316|  16.4k|{
  317|  16.4k|    int x, have_left;
  318|       |    // Don't filter outside the frame
  319|  16.4k|    const int is_sb64 = !f->seq_hdr->sb128;
  320|  16.4k|    const int starty4 = (sby & is_sb64) << 4;
  321|  16.4k|    const int sbsz = 32 >> is_sb64;
  322|  16.4k|    const int sbl2 = 5 - is_sb64;
  323|  16.4k|    const int halign = (f->bh + 31) & ~31;
  324|  16.4k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
  325|  16.4k|    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
  326|  16.4k|    const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
  327|  16.4k|    const unsigned vmax = 1U << vmask, hmax = 1U << hmask;
  328|  16.4k|    const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz);
  329|  16.4k|    const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;
  330|       |
  331|       |    // fix lpf strength at tile col boundaries
  332|  16.4k|    const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2];
  333|  16.4k|    const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)];
  334|  17.0k|    for (int tile_col = 1;; tile_col++) {
  335|  17.0k|        x = f->frame_hdr->tiling.col_start_sb[tile_col];
  336|  17.0k|        if ((x << sbl2) >= f->bw) break;
  ------------------
  |  Branch (336:13): [True: 16.4k, False: 623]
  ------------------
  337|    623|        const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor;
  ------------------
  |  Branch (337:25): [True: 191, False: 432]
  ------------------
  338|    623|        x >>= is_sb64;
  339|       |
  340|    623|        uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4];
  341|  15.3k|        for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) {
  ------------------
  |  Branch (341:51): [True: 14.7k, False: 623]
  ------------------
  342|  14.7k|            const int sidx = mask >= 0x10000U;
  343|  14.7k|            const unsigned smask = mask >> (sidx << 4);
  344|  14.7k|            const int idx = 2 * !!(y_hmask[2][sidx] & smask) +
  345|  14.7k|                                !!(y_hmask[1][sidx] & smask);
  346|  14.7k|            y_hmask[2][sidx] &= ~smask;
  347|  14.7k|            y_hmask[1][sidx] &= ~smask;
  348|  14.7k|            y_hmask[0][sidx] &= ~smask;
  349|  14.7k|            y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask;
  350|  14.7k|        }
  351|       |
  352|    623|        if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
  ------------------
  |  Branch (352:13): [True: 455, False: 168]
  ------------------
  353|    455|            uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4];
  354|  12.4k|            for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;
  ------------------
  |  Branch (354:68): [True: 11.9k, False: 455]
  ------------------
  355|  11.9k|                 y++, uv_mask <<= 1)
  356|  11.9k|            {
  357|  11.9k|                const int sidx = uv_mask >= vmax;
  358|  11.9k|                const unsigned smask = uv_mask >> (sidx << (4 - ss_ver));
  359|  11.9k|                const int idx = !!(uv_hmask[1][sidx] & smask);
  360|  11.9k|                uv_hmask[1][sidx] &= ~smask;
  361|  11.9k|                uv_hmask[0][sidx] &= ~smask;
  362|  11.9k|                uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])][sidx] |= smask;
  363|  11.9k|            }
  364|    455|        }
  365|    623|        lpf_y  += halign;
  366|    623|        lpf_uv += halign >> ss_ver;
  367|    623|    }
  368|       |
  369|       |    // fix lpf strength at tile row boundaries
  370|  16.4k|    if (start_of_tile_row) {
  ------------------
  |  Branch (370:9): [True: 284, False: 16.1k]
  ------------------
  371|    284|        const BlockContext *a;
  372|    284|        for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)];
  373|    631|             x < f->sb128w; x++, a++)
  ------------------
  |  Branch (373:14): [True: 347, False: 284]
  ------------------
  374|    347|        {
  375|    347|            uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4];
  376|    347|            const unsigned w = imin(32, f->w4 - (x << 5));
  377|  6.25k|            for (unsigned mask = 1, i = 0; i < w; mask <<= 1, i++) {
  ------------------
  |  Branch (377:44): [True: 5.90k, False: 347]
  ------------------
  378|  5.90k|                const int sidx = mask >= 0x10000U;
  379|  5.90k|                const unsigned smask = mask >> (sidx << 4);
  380|  5.90k|                const int idx = 2 * !!(y_vmask[2][sidx] & smask) +
  381|  5.90k|                                    !!(y_vmask[1][sidx] & smask);
  382|  5.90k|                y_vmask[2][sidx] &= ~smask;
  383|  5.90k|                y_vmask[1][sidx] &= ~smask;
  384|  5.90k|                y_vmask[0][sidx] &= ~smask;
  385|  5.90k|                y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask;
  386|  5.90k|            }
  387|       |
  388|    347|            if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
  ------------------
  |  Branch (388:17): [True: 214, False: 133]
  ------------------
  389|    214|                const unsigned cw = (w + ss_hor) >> ss_hor;
  390|    214|                uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver];
  391|  2.91k|                for (unsigned uv_mask = 1, i = 0; i < cw; uv_mask <<= 1, i++) {
  ------------------
  |  Branch (391:51): [True: 2.70k, False: 214]
  ------------------
  392|  2.70k|                    const int sidx = uv_mask >= hmax;
  393|  2.70k|                    const unsigned smask = uv_mask >> (sidx << (4 - ss_hor));
  394|  2.70k|                    const int idx = !!(uv_vmask[1][sidx] & smask);
  395|  2.70k|                    uv_vmask[1][sidx] &= ~smask;
  396|  2.70k|                    uv_vmask[0][sidx] &= ~smask;
  397|  2.70k|                    uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask;
  398|  2.70k|                }
  399|    214|            }
  400|    347|        }
  401|    284|    }
  402|       |
  403|  16.4k|    pixel *ptr;
  404|  16.4k|    uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
  405|  34.9k|    for (ptr = p[0], have_left = 0, x = 0; x < f->sb128w;
  ------------------
  |  Branch (405:44): [True: 18.5k, False: 16.4k]
  ------------------
  406|  18.5k|         x++, have_left = 1, ptr += 128, level_ptr += 32)
  407|  18.5k|    {
  408|  18.5k|        filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,
  409|  18.5k|                            lflvl[x].filter_y[0], ptr, f->cur.stride[0],
  410|  18.5k|                            imin(32, f->w4 - x * 32), starty4, endy4);
  411|  18.5k|    }
  412|       |
  413|  16.4k|    if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v)
  ------------------
  |  Branch (413:9): [True: 5.64k, False: 10.8k]
  |  Branch (413:46): [True: 4.92k, False: 719]
  ------------------
  414|  4.92k|        return;
  415|       |
  416|  11.5k|    ptrdiff_t uv_off;
  417|  11.5k|    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
  418|  24.6k|    for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w;
  ------------------
  |  Branch (418:44): [True: 13.1k, False: 11.5k]
  ------------------
  419|  13.1k|         x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
  420|  13.1k|    {
  421|  13.1k|        filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
  422|  13.1k|                             lflvl[x].filter_uv[0],
  423|  13.1k|                             &p[1][uv_off], &p[2][uv_off], f->cur.stride[1],
  424|  13.1k|                             (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
  425|  13.1k|                             starty4 >> ss_ver, uv_endy4, ss_ver);
  426|  13.1k|    }
  427|  11.5k|}
dav1d_loopfilter_sbrow_rows_8bpc:
  432|  16.4k|{
  433|  16.4k|    int x;
  434|       |    // Don't filter outside the frame
  435|  16.4k|    const int have_top = sby > 0;
  436|  16.4k|    const int is_sb64 = !f->seq_hdr->sb128;
  437|  16.4k|    const int starty4 = (sby & is_sb64) << 4;
  438|  16.4k|    const int sbsz = 32 >> is_sb64;
  439|  16.4k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
  440|  16.4k|    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
  441|  16.4k|    const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz);
  442|  16.4k|    const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;
  443|       |
  444|  16.4k|    pixel *ptr;
  445|  16.4k|    uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
  446|  34.8k|    for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {
  ------------------
  |  Branch (446:29): [True: 18.4k, False: 16.4k]
  ------------------
  447|  18.4k|        filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
  448|  18.4k|                            lflvl[x].filter_y[1], ptr, f->cur.stride[0],
  449|  18.4k|                            imin(32, f->w4 - x * 32), starty4, endy4);
  450|  18.4k|    }
  451|       |
  452|  16.4k|    if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v)
  ------------------
  |  Branch (452:9): [True: 5.61k, False: 10.8k]
  |  Branch (452:46): [True: 4.89k, False: 717]
  ------------------
  453|  4.89k|        return;
  454|       |
  455|  11.5k|    ptrdiff_t uv_off;
  456|  11.5k|    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
  457|  24.6k|    for (uv_off = 0, x = 0; x < f->sb128w;
  ------------------
  |  Branch (457:29): [True: 13.1k, False: 11.5k]
  ------------------
  458|  13.1k|         x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
  459|  13.1k|    {
  460|  13.1k|        filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride,
  461|  13.1k|                             lflvl[x].filter_uv[1],
  462|  13.1k|                             &p[1][uv_off], &p[2][uv_off], f->cur.stride[1],
  463|  13.1k|                             (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
  464|  13.1k|                             starty4 >> ss_ver, uv_endy4, ss_hor);
  465|  13.1k|    }
  466|  11.5k|}
lf_apply_tmpl.c:backup_lpf:
   47|   195k|{
   48|   195k|    const int cdef_backup = !lr_backup;
   49|   195k|    const int dst_w = f->frame_hdr->super_res.enabled ?
  ------------------
  |  Branch (49:23): [True: 53.6k, False: 141k]
  ------------------
   50|   141k|                      (f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w;
   51|       |
   52|       |    // The first stripe of the frame is shorter by 8 luma pixel rows.
   53|   195k|    int stripe_h = ((64 << (cdef_backup & sb128)) - 8 * !row) >> ss_ver;
   54|   195k|    src += (stripe_h - 2) * PXSTRIDE(src_stride);
  ------------------
  |  |   53|   195k|#define PXSTRIDE(x) (x)
  ------------------
   55|       |
   56|   195k|    if (f->c->n_tc == 1) {
  ------------------
  |  Branch (56:9): [True: 12.6k, False: 182k]
  ------------------
   57|  12.6k|        if (row) {
  ------------------
  |  Branch (57:13): [True: 10.5k, False: 2.07k]
  ------------------
   58|  10.5k|            const int top = 4 << sb128;
   59|       |            // Copy the top part of the stored loop filtered pixels from the
   60|       |            // previous sb row needed above the first stripe of this sb row.
   61|  10.5k|            pixel_copy(&dst[PXSTRIDE(dst_stride) *  0],
  ------------------
  |  |   47|  10.5k|#define pixel_copy memcpy
  ------------------
                          pixel_copy(&dst[PXSTRIDE(dst_stride) *  0],
  ------------------
  |  |   53|  10.5k|#define PXSTRIDE(x) (x)
  ------------------
   62|  10.5k|                       &dst[PXSTRIDE(dst_stride) *  top],      dst_w);
  ------------------
  |  |   53|  10.5k|#define PXSTRIDE(x) (x)
  ------------------
   63|  10.5k|            pixel_copy(&dst[PXSTRIDE(dst_stride) *  1],
  ------------------
  |  |   47|  10.5k|#define pixel_copy memcpy
  ------------------
                          pixel_copy(&dst[PXSTRIDE(dst_stride) *  1],
  ------------------
  |  |   53|  10.5k|#define PXSTRIDE(x) (x)
  ------------------
   64|  10.5k|                       &dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w);
  ------------------
  |  |   53|  10.5k|#define PXSTRIDE(x) (x)
  ------------------
   65|  10.5k|            pixel_copy(&dst[PXSTRIDE(dst_stride) *  2],
  ------------------
  |  |   47|  10.5k|#define pixel_copy memcpy
  ------------------
                          pixel_copy(&dst[PXSTRIDE(dst_stride) *  2],
  ------------------
  |  |   53|  10.5k|#define PXSTRIDE(x) (x)
  ------------------
   66|  10.5k|                       &dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w);
  ------------------
  |  |   53|  10.5k|#define PXSTRIDE(x) (x)
  ------------------
   67|  10.5k|            pixel_copy(&dst[PXSTRIDE(dst_stride) *  3],
  ------------------
  |  |   47|  10.5k|#define pixel_copy memcpy
  ------------------
                          pixel_copy(&dst[PXSTRIDE(dst_stride) *  3],
  ------------------
  |  |   53|  10.5k|#define PXSTRIDE(x) (x)
  ------------------
   68|  10.5k|                       &dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w);
  ------------------
  |  |   53|  10.5k|#define PXSTRIDE(x) (x)
  ------------------
   69|  10.5k|        }
   70|  12.6k|        dst += 4 * PXSTRIDE(dst_stride);
  ------------------
  |  |   53|  12.6k|#define PXSTRIDE(x) (x)
  ------------------
   71|  12.6k|    }
   72|       |
   73|   195k|    if (lr_backup && (f->frame_hdr->width[0] != f->frame_hdr->width[1])) {
  ------------------
  |  Branch (73:9): [True: 187k, False: 7.61k]
  |  Branch (73:22): [True: 4.83k, False: 183k]
  ------------------
   74|  9.95k|        while (row + stripe_h <= row_h) {
  ------------------
  |  Branch (74:16): [True: 5.12k, False: 4.83k]
  ------------------
   75|  5.12k|            const int n_lines = 4 - (row + stripe_h + 1 == h);
   76|  5.12k|            f->dsp->mc.resize(dst, dst_stride, src, src_stride,
   77|  5.12k|                              dst_w, n_lines, src_w, f->resize_step[ss_hor],
   78|  5.12k|                              f->resize_start[ss_hor] HIGHBD_CALL_SUFFIX);
   79|  5.12k|            row += stripe_h; // unmodified stripe_h for the 1st stripe
   80|  5.12k|            stripe_h = 64 >> ss_ver;
   81|  5.12k|            src += stripe_h * PXSTRIDE(src_stride);
  ------------------
  |  |   53|  5.12k|#define PXSTRIDE(x) (x)
  ------------------
   82|  5.12k|            dst += n_lines * PXSTRIDE(dst_stride);
  ------------------
  |  |   53|  5.12k|#define PXSTRIDE(x) (x)
  ------------------
   83|  5.12k|            if (n_lines == 3) {
  ------------------
  |  Branch (83:17): [True: 34, False: 5.08k]
  ------------------
   84|     34|                pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], dst_w);
  ------------------
  |  |   47|     34|#define pixel_copy memcpy
  ------------------
                              pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], dst_w);
  ------------------
  |  |   53|     34|#define PXSTRIDE(x) (x)
  ------------------
   85|     34|                dst += PXSTRIDE(dst_stride);
  ------------------
  |  |   53|     34|#define PXSTRIDE(x) (x)
  ------------------
   86|     34|            }
   87|  5.12k|        }
   88|   190k|    } else {
   89|   431k|        while (row + stripe_h <= row_h) {
  ------------------
  |  Branch (89:16): [True: 240k, False: 190k]
  ------------------
   90|   240k|            const int n_lines = 4 - (row + stripe_h + 1 == h);
   91|  1.20M|            for (int i = 0; i < 4; i++) {
  ------------------
  |  Branch (91:29): [True: 961k, False: 240k]
  ------------------
   92|   961k|                pixel_copy(dst, i == n_lines ? &dst[-PXSTRIDE(dst_stride)] :
  ------------------
  |  |   47|   961k|#define pixel_copy memcpy
  ------------------
                              pixel_copy(dst, i == n_lines ? &dst[-PXSTRIDE(dst_stride)] :
  ------------------
  |  |   53|    174|#define PXSTRIDE(x) (x)
  ------------------
  |  Branch (92:33): [True: 174, False: 961k]
  ------------------
   93|   961k|                                               src, src_w);
   94|   961k|                dst += PXSTRIDE(dst_stride);
  ------------------
  |  |   53|   961k|#define PXSTRIDE(x) (x)
  ------------------
   95|   961k|                src += PXSTRIDE(src_stride);
  ------------------
  |  |   53|   961k|#define PXSTRIDE(x) (x)
  ------------------
   96|   961k|            }
   97|   240k|            row += stripe_h; // unmodified stripe_h for the 1st stripe
   98|   240k|            stripe_h = 64 >> ss_ver;
   99|   240k|            src += (stripe_h - 4) * PXSTRIDE(src_stride);
  ------------------
  |  |   53|   240k|#define PXSTRIDE(x) (x)
  ------------------
  100|   240k|        }
  101|   190k|    }
  102|   195k|}
lf_apply_tmpl.c:filter_plane_cols_y:
  184|  86.8k|{
  185|  86.8k|    const Dav1dDSPContext *const dsp = f->dsp;
  186|       |
  187|       |    // filter edges between columns (e.g. block1 | block2)
  188|   527k|    for (int x = 0; x < w; x++) {
  ------------------
  |  Branch (188:21): [True: 440k, False: 86.8k]
  ------------------
  189|   440k|        if (!have_left && !x) continue;
  ------------------
  |  Branch (189:13): [True: 350k, False: 90.7k]
  |  Branch (189:27): [True: 82.4k, False: 267k]
  ------------------
  190|   358k|        uint32_t hmask[4];
  191|   358k|        if (!starty4) {
  ------------------
  |  Branch (191:13): [True: 251k, False: 107k]
  ------------------
  192|   251k|            hmask[0] = mask[x][0][0];
  193|   251k|            hmask[1] = mask[x][1][0];
  194|   251k|            hmask[2] = mask[x][2][0];
  195|   251k|            if (endy4 > 16) {
  ------------------
  |  Branch (195:17): [True: 67.6k, False: 183k]
  ------------------
  196|  67.6k|                hmask[0] |= (unsigned) mask[x][0][1] << 16;
  197|  67.6k|                hmask[1] |= (unsigned) mask[x][1][1] << 16;
  198|  67.6k|                hmask[2] |= (unsigned) mask[x][2][1] << 16;
  199|  67.6k|            }
  200|   251k|        } else {
  201|   107k|            hmask[0] = mask[x][0][1];
  202|   107k|            hmask[1] = mask[x][1][1];
  203|   107k|            hmask[2] = mask[x][2][1];
  204|   107k|        }
  205|   358k|        hmask[3] = 0;
  206|   358k|        dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls, hmask,
  207|   358k|                                     (const uint8_t(*)[4]) &lvl[x][0], b4_stride,
  208|   358k|                                     &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX);
  209|   358k|    }
  210|  86.8k|}
lf_apply_tmpl.c:filter_plane_cols_uv:
  251|  35.7k|{
  252|  35.7k|    const Dav1dDSPContext *const dsp = f->dsp;
  253|       |
  254|       |    // filter edges between columns (e.g. block1 | block2)
  255|   288k|    for (int x = 0; x < w; x++) {
  ------------------
  |  Branch (255:21): [True: 253k, False: 35.7k]
  ------------------
  256|   253k|        if (!have_left && !x) continue;
  ------------------
  |  Branch (256:13): [True: 193k, False: 59.5k]
  |  Branch (256:27): [True: 32.4k, False: 161k]
  ------------------
  257|   220k|        uint32_t hmask[3];
  258|   220k|        if (!starty4) {
  ------------------
  |  Branch (258:13): [True: 159k, False: 60.8k]
  ------------------
  259|   159k|            hmask[0] = mask[x][0][0];
  260|   159k|            hmask[1] = mask[x][1][0];
  261|   159k|            if (endy4 > (16 >> ss_ver)) {
  ------------------
  |  Branch (261:17): [True: 42.1k, False: 117k]
  ------------------
  262|  42.1k|                hmask[0] |= (unsigned) mask[x][0][1] << (16 >> ss_ver);
  263|  42.1k|                hmask[1] |= (unsigned) mask[x][1][1] << (16 >> ss_ver);
  264|  42.1k|            }
  265|   159k|        } else {
  266|  60.8k|            hmask[0] = mask[x][0][1];
  267|  60.8k|            hmask[1] = mask[x][1][1];
  268|  60.8k|        }
  269|   220k|        hmask[2] = 0;
  270|   220k|        dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls, hmask,
  271|   220k|                                     (const uint8_t(*)[4]) &lvl[x][2], b4_stride,
  272|   220k|                                     &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX);
  273|   220k|        dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls, hmask,
  274|   220k|                                     (const uint8_t(*)[4]) &lvl[x][3], b4_stride,
  275|   220k|                                     &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX);
  276|   220k|    }
  277|  35.7k|}
lf_apply_tmpl.c:filter_plane_rows_y:
  220|  86.7k|{
  221|  86.7k|    const Dav1dDSPContext *const dsp = f->dsp;
  222|       |
  223|       |    //                                 block1
  224|       |    // filter edges between rows (e.g. ------)
  225|       |    //                                 block2
  226|  1.78M|    for (int y = starty4; y < endy4;
  ------------------
  |  Branch (226:27): [True: 1.70M, False: 86.7k]
  ------------------
  227|  1.70M|         y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
  ------------------
  |  |   53|  1.70M|#define PXSTRIDE(x) (x)
  ------------------
  228|  1.70M|    {
  229|  1.70M|        if (!have_top && !y) continue;
  ------------------
  |  Branch (229:13): [True: 118k, False: 1.58M]
  |  Branch (229:26): [True: 9.33k, False: 108k]
  ------------------
  230|  1.69M|        const uint32_t vmask[4] = {
  231|  1.69M|            mask[y][0][0] | ((unsigned) mask[y][0][1] << 16),
  232|  1.69M|            mask[y][1][0] | ((unsigned) mask[y][1][1] << 16),
  233|  1.69M|            mask[y][2][0] | ((unsigned) mask[y][2][1] << 16),
  234|  1.69M|            0,
  235|  1.69M|        };
  236|  1.69M|        dsp->lf.loop_filter_sb[0][1](dst, ls, vmask,
  237|  1.69M|                                     (const uint8_t(*)[4]) &lvl[0][1], b4_stride,
  238|  1.69M|                                     &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX);
  239|  1.69M|    }
  240|  86.7k|}
lf_apply_tmpl.c:filter_plane_rows_uv:
  288|  35.7k|{
  289|  35.7k|    const Dav1dDSPContext *const dsp = f->dsp;
  290|  35.7k|    ptrdiff_t off_l = 0;
  291|       |
  292|       |    //                                 block1
  293|       |    // filter edges between rows (e.g. ------)
  294|       |    //                                 block2
  295|   697k|    for (int y = starty4; y < endy4;
  ------------------
  |  Branch (295:27): [True: 661k, False: 35.7k]
  ------------------
  296|   661k|         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
  ------------------
  |  |   53|   661k|#define PXSTRIDE(x) (x)
  ------------------
  297|   661k|    {
  298|   661k|        if (!have_top && !y) continue;
  ------------------
  |  Branch (298:13): [True: 72.5k, False: 588k]
  |  Branch (298:26): [True: 7.10k, False: 65.4k]
  ------------------
  299|   654k|        const uint32_t vmask[3] = {
  300|   654k|            mask[y][0][0] | ((unsigned) mask[y][0][1] << (16 >> ss_hor)),
  301|   654k|            mask[y][1][0] | ((unsigned) mask[y][1][1] << (16 >> ss_hor)),
  302|   654k|            0,
  303|   654k|        };
  304|   654k|        dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, vmask,
  305|   654k|                                     (const uint8_t(*)[4]) &lvl[0][2], b4_stride,
  306|   654k|                                     &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX);
  307|   654k|        dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, vmask,
  308|   654k|                                     (const uint8_t(*)[4]) &lvl[0][3], b4_stride,
  309|   654k|                                     &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX);
  310|   654k|    }
  311|  35.7k|}
dav1d_copy_lpf_16bpc:
  106|  74.0k|{
  107|  74.0k|    const int have_tt = f->c->n_tc > 1;
  108|  74.0k|    const int resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
  109|  74.0k|    const int offset = 8 * !!sby;
  110|  74.0k|    const ptrdiff_t *const src_stride = f->cur.stride;
  111|  74.0k|    const ptrdiff_t *const lr_stride = f->sr_cur.p.stride;
  112|  74.0k|    const int tt_off = have_tt * sby * (4 << f->seq_hdr->sb128);
  113|  74.0k|    pixel *const dst[3] = {
  114|  74.0k|        f->lf.lr_lpf_line[0] + tt_off * PXSTRIDE(lr_stride[0]),
  115|  74.0k|        f->lf.lr_lpf_line[1] + tt_off * PXSTRIDE(lr_stride[1]),
  116|  74.0k|        f->lf.lr_lpf_line[2] + tt_off * PXSTRIDE(lr_stride[1])
  117|  74.0k|    };
  118|       |
  119|       |    // TODO Also check block level restore type to reduce copying.
  120|  74.0k|    const int restore_planes = f->lf.restore_planes;
  121|       |
  122|  74.0k|    if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_Y) {
  ------------------
  |  Branch (122:9): [True: 66.9k, False: 7.14k]
  |  Branch (122:29): [True: 6.43k, False: 705]
  ------------------
  123|  73.3k|        const int h = f->cur.p.h;
  124|  73.3k|        const int w = f->bw << 2;
  125|  73.3k|        const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1);
  126|  73.3k|        const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;
  127|  73.3k|        if (restore_planes & LR_RESTORE_Y || !resize)
  ------------------
  |  Branch (127:13): [True: 53.8k, False: 19.5k]
  |  Branch (127:46): [True: 18.9k, False: 576]
  ------------------
  128|  72.7k|            backup_lpf(f, dst[0], lr_stride[0],
  129|  72.7k|                       src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
  130|  72.7k|                       0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, 1);
  131|  73.3k|        if (have_tt && resize) {
  ------------------
  |  Branch (131:13): [True: 69.4k, False: 3.85k]
  |  Branch (131:24): [True: 1.88k, False: 67.6k]
  ------------------
  132|  1.88k|            const ptrdiff_t cdef_off_y = sby * 4 * PXSTRIDE(src_stride[0]);
  133|  1.88k|            backup_lpf(f, f->lf.cdef_lpf_line[0] + cdef_off_y, src_stride[0],
  134|  1.88k|                       src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
  135|  1.88k|                       0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, 0);
  136|  1.88k|        }
  137|  73.3k|    }
  138|  74.0k|    if ((f->seq_hdr->cdef || restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) &&
  ------------------
  |  Branch (138:10): [True: 66.9k, False: 7.14k]
  |  Branch (138:30): [True: 4.80k, False: 2.33k]
  ------------------
  139|  71.7k|        f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400)
  ------------------
  |  Branch (139:9): [True: 24.6k, False: 47.1k]
  ------------------
  140|  24.6k|    {
  141|  24.6k|        const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
  142|  24.6k|        const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
  143|  24.6k|        const int h = (f->cur.p.h + ss_ver) >> ss_ver;
  144|  24.6k|        const int w = f->bw << (2 - ss_hor);
  145|  24.6k|        const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1);
  146|  24.6k|        const int offset_uv = offset >> ss_ver;
  147|  24.6k|        const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
  148|  24.6k|        const ptrdiff_t cdef_off_uv = sby * 4 * PXSTRIDE(src_stride[1]);
  149|  24.6k|        if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_U) {
  ------------------
  |  Branch (149:13): [True: 19.8k, False: 4.80k]
  |  Branch (149:33): [True: 4.39k, False: 413]
  ------------------
  150|  24.1k|            if (restore_planes & LR_RESTORE_U || !resize)
  ------------------
  |  Branch (150:17): [True: 13.9k, False: 10.2k]
  |  Branch (150:50): [True: 9.78k, False: 423]
  ------------------
  151|  23.7k|                backup_lpf(f, dst[1], lr_stride[1],
  152|  23.7k|                           src[1] - offset_uv * PXSTRIDE(src_stride[1]),
  153|  23.7k|                           src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe,
  154|  23.7k|                           row_h, w, h, ss_hor, 1);
  155|  24.1k|            if (have_tt && resize)
  ------------------
  |  Branch (155:17): [True: 21.8k, False: 2.37k]
  |  Branch (155:28): [True: 1.74k, False: 20.0k]
  ------------------
  156|  1.74k|                backup_lpf(f, f->lf.cdef_lpf_line[1] + cdef_off_uv, src_stride[1],
  157|  1.74k|                           src[1] - offset_uv * PXSTRIDE(src_stride[1]),
  158|  1.74k|                           src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe,
  159|  1.74k|                           row_h, w, h, ss_hor, 0);
  160|  24.1k|        }
  161|  24.6k|        if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_V) {
  ------------------
  |  Branch (161:13): [True: 19.8k, False: 4.80k]
  |  Branch (161:33): [True: 4.30k, False: 504]
  ------------------
  162|  24.1k|            if (restore_planes & LR_RESTORE_V || !resize)
  ------------------
  |  Branch (162:17): [True: 12.6k, False: 11.4k]
  |  Branch (162:50): [True: 10.9k, False: 498]
  ------------------
  163|  23.6k|                backup_lpf(f, dst[2], lr_stride[1],
  164|  23.6k|                           src[2] - offset_uv * PXSTRIDE(src_stride[1]),
  165|  23.6k|                           src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe,
  166|  23.6k|                           row_h, w, h, ss_hor, 1);
  167|  24.1k|            if (have_tt && resize)
  ------------------
  |  Branch (167:17): [True: 21.7k, False: 2.38k]
  |  Branch (167:28): [True: 1.73k, False: 19.9k]
  ------------------
  168|  1.73k|                backup_lpf(f, f->lf.cdef_lpf_line[2] + cdef_off_uv, src_stride[1],
  169|  1.73k|                           src[2] - offset_uv * PXSTRIDE(src_stride[1]),
  170|  1.73k|                           src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe,
  171|  1.73k|                           row_h, w, h, ss_hor, 0);
  172|  24.1k|        }
  173|  24.6k|    }
  174|  74.0k|}
dav1d_loopfilter_sbrow_cols_16bpc:
  316|  65.9k|{
  317|  65.9k|    int x, have_left;
  318|       |    // Don't filter outside the frame
  319|  65.9k|    const int is_sb64 = !f->seq_hdr->sb128;
  320|  65.9k|    const int starty4 = (sby & is_sb64) << 4;
  321|  65.9k|    const int sbsz = 32 >> is_sb64;
  322|  65.9k|    const int sbl2 = 5 - is_sb64;
  323|  65.9k|    const int halign = (f->bh + 31) & ~31;
  324|  65.9k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
  325|  65.9k|    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
  326|  65.9k|    const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
  327|  65.9k|    const unsigned vmax = 1U << vmask, hmax = 1U << hmask;
  328|  65.9k|    const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz);
  329|  65.9k|    const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;
  330|       |
  331|       |    // fix lpf strength at tile col boundaries
  332|  65.9k|    const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2];
  333|  65.9k|    const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)];
  334|  67.1k|    for (int tile_col = 1;; tile_col++) {
  335|  67.1k|        x = f->frame_hdr->tiling.col_start_sb[tile_col];
  336|  67.1k|        if ((x << sbl2) >= f->bw) break;
  ------------------
  |  Branch (336:13): [True: 65.9k, False: 1.19k]
  ------------------
  337|  1.19k|        const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor;
  ------------------
  |  Branch (337:25): [True: 854, False: 342]
  ------------------
  338|  1.19k|        x >>= is_sb64;
  339|       |
  340|  1.19k|        uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4];
  341|  19.8k|        for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) {
  ------------------
  |  Branch (341:51): [True: 18.6k, False: 1.19k]
  ------------------
  342|  18.6k|            const int sidx = mask >= 0x10000U;
  343|  18.6k|            const unsigned smask = mask >> (sidx << 4);
  344|  18.6k|            const int idx = 2 * !!(y_hmask[2][sidx] & smask) +
  345|  18.6k|                                !!(y_hmask[1][sidx] & smask);
  346|  18.6k|            y_hmask[2][sidx] &= ~smask;
  347|  18.6k|            y_hmask[1][sidx] &= ~smask;
  348|  18.6k|            y_hmask[0][sidx] &= ~smask;
  349|  18.6k|            y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask;
  350|  18.6k|        }
  351|       |
  352|  1.19k|        if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
  ------------------
  |  Branch (352:13): [True: 1.05k, False: 145]
  ------------------
  353|  1.05k|            uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4];
  354|  17.4k|            for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;
  ------------------
  |  Branch (354:68): [True: 16.3k, False: 1.05k]
  ------------------
  355|  16.3k|                 y++, uv_mask <<= 1)
  356|  16.3k|            {
  357|  16.3k|                const int sidx = uv_mask >= vmax;
  358|  16.3k|                const unsigned smask = uv_mask >> (sidx << (4 - ss_ver));
  359|  16.3k|                const int idx = !!(uv_hmask[1][sidx] & smask);
  360|  16.3k|                uv_hmask[1][sidx] &= ~smask;
  361|  16.3k|                uv_hmask[0][sidx] &= ~smask;
  362|  16.3k|                uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])][sidx] |= smask;
  363|  16.3k|            }
  364|  1.05k|        }
  365|  1.19k|        lpf_y  += halign;
  366|  1.19k|        lpf_uv += halign >> ss_ver;
  367|  1.19k|    }
  368|       |
  369|       |    // fix lpf strength at tile row boundaries
  370|  65.9k|    if (start_of_tile_row) {
  ------------------
  |  Branch (370:9): [True: 306, False: 65.6k]
  ------------------
  371|    306|        const BlockContext *a;
  372|    306|        for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)];
  373|    747|             x < f->sb128w; x++, a++)
  ------------------
  |  Branch (373:14): [True: 441, False: 306]
  ------------------
  374|    441|        {
  375|    441|            uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4];
  376|    441|            const unsigned w = imin(32, f->w4 - (x << 5));
  377|  8.28k|            for (unsigned mask = 1, i = 0; i < w; mask <<= 1, i++) {
  ------------------
  |  Branch (377:44): [True: 7.84k, False: 441]
  ------------------
  378|  7.84k|                const int sidx = mask >= 0x10000U;
  379|  7.84k|                const unsigned smask = mask >> (sidx << 4);
  380|  7.84k|                const int idx = 2 * !!(y_vmask[2][sidx] & smask) +
  381|  7.84k|                                    !!(y_vmask[1][sidx] & smask);
  382|  7.84k|                y_vmask[2][sidx] &= ~smask;
  383|  7.84k|                y_vmask[1][sidx] &= ~smask;
  384|  7.84k|                y_vmask[0][sidx] &= ~smask;
  385|  7.84k|                y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask;
  386|  7.84k|            }
  387|       |
  388|    441|            if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
  ------------------
  |  Branch (388:17): [True: 329, False: 112]
  ------------------
  389|    329|                const unsigned cw = (w + ss_hor) >> ss_hor;
  390|    329|                uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver];
  391|  6.41k|                for (unsigned uv_mask = 1, i = 0; i < cw; uv_mask <<= 1, i++) {
  ------------------
  |  Branch (391:51): [True: 6.08k, False: 329]
  ------------------
  392|  6.08k|                    const int sidx = uv_mask >= hmax;
  393|  6.08k|                    const unsigned smask = uv_mask >> (sidx << (4 - ss_hor));
  394|  6.08k|                    const int idx = !!(uv_vmask[1][sidx] & smask);
  395|  6.08k|                    uv_vmask[1][sidx] &= ~smask;
  396|  6.08k|                    uv_vmask[0][sidx] &= ~smask;
  397|  6.08k|                    uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask;
  398|  6.08k|                }
  399|    329|            }
  400|    441|        }
  401|    306|    }
  402|       |
  403|  65.9k|    pixel *ptr;
  404|  65.9k|    uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
  405|   134k|    for (ptr = p[0], have_left = 0, x = 0; x < f->sb128w;
  ------------------
  |  Branch (405:44): [True: 68.3k, False: 65.9k]
  ------------------
  406|  68.3k|         x++, have_left = 1, ptr += 128, level_ptr += 32)
  407|  68.3k|    {
  408|  68.3k|        filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,
  409|  68.3k|                            lflvl[x].filter_y[0], ptr, f->cur.stride[0],
  410|  68.3k|                            imin(32, f->w4 - x * 32), starty4, endy4);
  411|  68.3k|    }
  412|       |
  413|  65.9k|    if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v)
  ------------------
  |  Branch (413:9): [True: 46.0k, False: 19.8k]
  |  Branch (413:46): [True: 45.0k, False: 1.01k]
  ------------------
  414|  45.0k|        return;
  415|       |
  416|  20.9k|    ptrdiff_t uv_off;
  417|  20.9k|    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
  418|  43.5k|    for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w;
  ------------------
  |  Branch (418:44): [True: 22.6k, False: 20.9k]
  ------------------
  419|  22.6k|         x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
  420|  22.6k|    {
  421|  22.6k|        filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
  422|  22.6k|                             lflvl[x].filter_uv[0],
  423|  22.6k|                             &p[1][uv_off], &p[2][uv_off], f->cur.stride[1],
  424|  22.6k|                             (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
  425|  22.6k|                             starty4 >> ss_ver, uv_endy4, ss_ver);
  426|  22.6k|    }
  427|  20.9k|}
dav1d_loopfilter_sbrow_rows_16bpc:
  432|  65.9k|{
  433|  65.9k|    int x;
  434|       |    // Don't filter outside the frame
  435|  65.9k|    const int have_top = sby > 0;
  436|  65.9k|    const int is_sb64 = !f->seq_hdr->sb128;
  437|  65.9k|    const int starty4 = (sby & is_sb64) << 4;
  438|  65.9k|    const int sbsz = 32 >> is_sb64;
  439|  65.9k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
  440|  65.9k|    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
  441|  65.9k|    const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz);
  442|  65.9k|    const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;
  443|       |
  444|  65.9k|    pixel *ptr;
  445|  65.9k|    uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
  446|   134k|    for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {
  ------------------
  |  Branch (446:29): [True: 68.2k, False: 65.9k]
  ------------------
  447|  68.2k|        filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
  448|  68.2k|                            lflvl[x].filter_y[1], ptr, f->cur.stride[0],
  449|  68.2k|                            imin(32, f->w4 - x * 32), starty4, endy4);
  450|  68.2k|    }
  451|       |
  452|  65.9k|    if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v)
  ------------------
  |  Branch (452:9): [True: 46.0k, False: 19.8k]
  |  Branch (452:46): [True: 45.0k, False: 1.01k]
  ------------------
  453|  45.0k|        return;
  454|       |
  455|  20.8k|    ptrdiff_t uv_off;
  456|  20.8k|    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
  457|  43.4k|    for (uv_off = 0, x = 0; x < f->sb128w;
  ------------------
  |  Branch (457:29): [True: 22.5k, False: 20.8k]
  ------------------
  458|  22.5k|         x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
  459|  22.5k|    {
  460|  22.5k|        filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride,
  461|  22.5k|                             lflvl[x].filter_uv[1],
  462|  22.5k|                             &p[1][uv_off], &p[2][uv_off], f->cur.stride[1],
  463|  22.5k|                             (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
  464|  22.5k|                             starty4 >> ss_ver, uv_endy4, ss_hor);
  465|  22.5k|    }
  466|  20.8k|}

dav1d_create_lf_mask_intra:
  271|  1.03M|{
  272|  1.03M|    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
  273|  1.03M|    const int bw4 = imin(iw - bx, b_dim[0]);
  274|  1.03M|    const int bh4 = imin(ih - by, b_dim[1]);
  275|  1.03M|    const int bx4 = bx & 31;
  276|  1.03M|    const int by4 = by & 31;
  277|  1.03M|    assert(bw4 >= 0 && bh4 >= 0);
  ------------------
  |  |  140|  2.06M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 1.02M, False: 3.66k]
  |  |  |  Branch (140:30): [True: 1.02M, False: 721]
  |  |  |  Branch (140:68): [Folded, False: 1.03M]
  |  |  ------------------
  ------------------
  278|       |
  279|  1.03M|    if (bw4 && bh4) {
  ------------------
  |  Branch (279:9): [True: 1.02M, False: 11.8k]
  |  Branch (279:16): [True: 857k, False: 162k]
  ------------------
  280|   857k|        uint8_t (*level_cache_ptr)[4] = level_cache + by * b4_stride + bx;
  281|  3.92M|        for (int y = 0; y < bh4; y++) {
  ------------------
  |  Branch (281:25): [True: 3.06M, False: 857k]
  ------------------
  282|  14.3M|            for (int x = 0; x < bw4; x++) {
  ------------------
  |  Branch (282:29): [True: 11.3M, False: 3.06M]
  ------------------
  283|  11.3M|                level_cache_ptr[x][0] = filter_level[0][0][0];
  284|  11.3M|                level_cache_ptr[x][1] = filter_level[1][0][0];
  285|  11.3M|            }
  286|  3.06M|            level_cache_ptr += b4_stride;
  287|  3.06M|        }
  288|       |
  289|   857k|        mask_edges_intra(lflvl->filter_y, by4, bx4, bw4, bh4, ytx, ay, ly);
  290|   857k|    }
  291|       |
  292|  1.03M|    if (!auv) return;
  ------------------
  |  Branch (292:9): [True: 328k, False: 703k]
  ------------------
  293|       |
  294|   703k|    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
  295|   703k|    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
  296|   703k|    const int cbw4 = imin(((iw + ss_hor) >> ss_hor) - (bx >> ss_hor),
  297|   703k|                          (b_dim[0] + ss_hor) >> ss_hor);
  298|   703k|    const int cbh4 = imin(((ih + ss_ver) >> ss_ver) - (by >> ss_ver),
  299|   703k|                          (b_dim[1] + ss_ver) >> ss_ver);
  300|   703k|    assert(cbw4 >= 0 && cbh4 >= 0);
  ------------------
  |  |  140|  1.40M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 698k, False: 4.66k]
  |  |  |  Branch (140:30): [True: 698k, False: 381]
  |  |  |  Branch (140:68): [Folded, False: 703k]
  |  |  ------------------
  ------------------
  301|       |
  302|   703k|    if (!cbw4 || !cbh4) return;
  ------------------
  |  Branch (302:9): [True: 10.5k, False: 692k]
  |  Branch (302:18): [True: 126k, False: 566k]
  ------------------
  303|       |
  304|   569k|    const int cbx4 = bx4 >> ss_hor;
  305|   569k|    const int cby4 = by4 >> ss_ver;
  306|       |
  307|   569k|    uint8_t (*level_cache_ptr)[4] =
  308|   569k|        level_cache + (by >> ss_ver) * b4_stride + (bx >> ss_hor);
  309|  2.27M|    for (int y = 0; y < cbh4; y++) {
  ------------------
  |  Branch (309:21): [True: 1.70M, False: 569k]
  ------------------
  310|  8.74M|        for (int x = 0; x < cbw4; x++) {
  ------------------
  |  Branch (310:25): [True: 7.04M, False: 1.70M]
  ------------------
  311|  7.04M|            level_cache_ptr[x][2] = filter_level[2][0][0];
  312|  7.04M|            level_cache_ptr[x][3] = filter_level[3][0][0];
  313|  7.04M|        }
  314|  1.70M|        level_cache_ptr += b4_stride;
  315|  1.70M|    }
  316|       |
  317|   569k|    mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, 0, uvtx,
  318|   569k|                      auv, luv, ss_hor, ss_ver);
  319|   569k|}
dav1d_create_lf_mask_inter:
  334|  27.8k|{
  335|  27.8k|    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
  336|  27.8k|    const int bw4 = imin(iw - bx, b_dim[0]);
  337|  27.8k|    const int bh4 = imin(ih - by, b_dim[1]);
  338|  27.8k|    const int bx4 = bx & 31;
  339|  27.8k|    const int by4 = by & 31;
  340|  27.8k|    assert(bw4 >= 0 && bh4 >= 0);
  ------------------
  |  |  140|  55.6k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 27.8k, False: 13]
  |  |  |  Branch (140:30): [True: 27.8k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 27.8k]
  |  |  ------------------
  ------------------
  341|       |
  342|  27.8k|    if (bw4 && bh4) {
  ------------------
  |  Branch (342:9): [True: 27.4k, False: 337]
  |  Branch (342:16): [True: 27.1k, False: 347]
  ------------------
  343|  27.1k|        uint8_t (*level_cache_ptr)[4] = level_cache + by * b4_stride + bx;
  344|   110k|        for (int y = 0; y < bh4; y++) {
  ------------------
  |  Branch (344:25): [True: 83.2k, False: 27.1k]
  ------------------
  345|   488k|            for (int x = 0; x < bw4; x++) {
  ------------------
  |  Branch (345:29): [True: 405k, False: 83.2k]
  ------------------
  346|   405k|                level_cache_ptr[x][0] = filter_level[0][0][0];
  347|   405k|                level_cache_ptr[x][1] = filter_level[1][0][0];
  348|   405k|            }
  349|  83.2k|            level_cache_ptr += b4_stride;
  350|  83.2k|        }
  351|       |
  352|  27.1k|        mask_edges_inter(lflvl->filter_y, by4, bx4, bw4, bh4, skip,
  353|  27.1k|                         max_ytx, tx_masks, ay, ly);
  354|  27.1k|    }
  355|       |
  356|  27.8k|    if (!auv) return;
  ------------------
  |  Branch (356:9): [True: 4.87k, False: 22.9k]
  ------------------
  357|       |
  358|  22.9k|    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
  359|  22.9k|    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
  360|  22.9k|    const int cbw4 = imin(((iw + ss_hor) >> ss_hor) - (bx >> ss_hor),
  361|  22.9k|                          (b_dim[0] + ss_hor) >> ss_hor);
  362|  22.9k|    const int cbh4 = imin(((ih + ss_ver) >> ss_ver) - (by >> ss_ver),
  363|  22.9k|                          (b_dim[1] + ss_ver) >> ss_ver);
  364|  22.9k|    assert(cbw4 >= 0 && cbh4 >= 0);
  ------------------
  |  |  140|  45.8k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 22.9k, False: 1]
  |  |  |  Branch (140:30): [True: 22.9k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 22.9k]
  |  |  ------------------
  ------------------
  365|       |
  366|  22.9k|    if (!cbw4 || !cbh4) return;
  ------------------
  |  Branch (366:9): [True: 267, False: 22.6k]
  |  Branch (366:18): [True: 347, False: 22.3k]
  ------------------
  367|       |
  368|  22.3k|    const int cbx4 = bx4 >> ss_hor;
  369|  22.3k|    const int cby4 = by4 >> ss_ver;
  370|       |
  371|  22.3k|    uint8_t (*level_cache_ptr)[4] =
  372|  22.3k|        level_cache + (by >> ss_ver) * b4_stride + (bx >> ss_hor);
  373|  65.5k|    for (int y = 0; y < cbh4; y++) {
  ------------------
  |  Branch (373:21): [True: 43.2k, False: 22.3k]
  ------------------
  374|   163k|        for (int x = 0; x < cbw4; x++) {
  ------------------
  |  Branch (374:25): [True: 120k, False: 43.2k]
  ------------------
  375|   120k|            level_cache_ptr[x][2] = filter_level[2][0][0];
  376|   120k|            level_cache_ptr[x][3] = filter_level[3][0][0];
  377|   120k|        }
  378|  43.2k|        level_cache_ptr += b4_stride;
  379|  43.2k|    }
  380|       |
  381|  22.3k|    mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, skip, uvtx,
  382|  22.3k|                      auv, luv, ss_hor, ss_ver);
  383|  22.3k|}
dav1d_calc_eih:
  385|  17.4k|void dav1d_calc_eih(Av1FilterLUT *const lim_lut, const int filter_sharpness) {
  386|       |    // set E/I/H values from loopfilter level
  387|  17.4k|    const int sharp = filter_sharpness;
  388|  1.13M|    for (int level = 0; level < 64; level++) {
  ------------------
  |  Branch (388:25): [True: 1.11M, False: 17.4k]
  ------------------
  389|  1.11M|        int limit = level;
  390|       |
  391|  1.11M|        if (sharp > 0) {
  ------------------
  |  Branch (391:13): [True: 487k, False: 628k]
  ------------------
  392|   487k|            limit >>= (sharp + 3) >> 2;
  393|   487k|            limit = imin(limit, 9 - sharp);
  394|   487k|        }
  395|  1.11M|        limit = imax(limit, 1);
  396|       |
  397|  1.11M|        lim_lut->i[level] = limit;
  398|  1.11M|        lim_lut->e[level] = 2 * (level + 2) + limit;
  399|  1.11M|    }
  400|  17.4k|    lim_lut->sharp[0] = (sharp + 3) >> 2;
  401|  17.4k|    lim_lut->sharp[1] = sharp ? 9 - sharp : 0xff;
  ------------------
  |  Branch (401:25): [True: 7.61k, False: 9.82k]
  ------------------
  402|  17.4k|}
dav1d_calc_lf_values:
  441|  44.8k|{
  442|  44.8k|    const int n_seg = hdr->segmentation.enabled ? 8 : 1;
  ------------------
  |  Branch (442:23): [True: 24.1k, False: 20.7k]
  ------------------
  443|       |
  444|  44.8k|    if (!hdr->loopfilter.level_y[0] && !hdr->loopfilter.level_y[1]) {
  ------------------
  |  Branch (444:9): [True: 14.0k, False: 30.8k]
  |  Branch (444:40): [True: 12.6k, False: 1.44k]
  ------------------
  445|  12.6k|        memset(lflvl_values, 0, sizeof(*lflvl_values) * n_seg);
  446|  12.6k|        return;
  447|  12.6k|    }
  448|       |
  449|  32.2k|    const Dav1dLoopfilterModeRefDeltas *const mr_deltas =
  450|  32.2k|        hdr->loopfilter.mode_ref_delta_enabled ?
  ------------------
  |  Branch (450:9): [True: 21.3k, False: 10.8k]
  ------------------
  451|  32.2k|        &hdr->loopfilter.mode_ref_deltas : NULL;
  452|   213k|    for (int s = 0; s < n_seg; s++) {
  ------------------
  |  Branch (452:21): [True: 181k, False: 32.2k]
  ------------------
  453|   181k|        const Dav1dSegmentationData *const segd =
  454|   181k|            hdr->segmentation.enabled ? &hdr->segmentation.seg_data.d[s] : NULL;
  ------------------
  |  Branch (454:13): [True: 170k, False: 10.8k]
  ------------------
  455|       |
  456|   181k|        calc_lf_value(lflvl_values[s][0], hdr->loopfilter.level_y[0],
  457|   181k|                      lf_delta[0], segd ? segd->delta_lf_y_v : 0, mr_deltas);
  ------------------
  |  Branch (457:36): [True: 170k, False: 10.8k]
  ------------------
  458|   181k|        calc_lf_value(lflvl_values[s][1], hdr->loopfilter.level_y[1],
  459|   181k|                      lf_delta[hdr->delta.lf.multi ? 1 : 0],
  ------------------
  |  Branch (459:32): [True: 144k, False: 36.7k]
  ------------------
  460|   181k|                      segd ? segd->delta_lf_y_h : 0, mr_deltas);
  ------------------
  |  Branch (460:23): [True: 170k, False: 10.8k]
  ------------------
  461|   181k|        calc_lf_value_chroma(lflvl_values[s][2], hdr->loopfilter.level_u,
  462|   181k|                             lf_delta[hdr->delta.lf.multi ? 2 : 0],
  ------------------
  |  Branch (462:39): [True: 144k, False: 36.7k]
  ------------------
  463|   181k|                             segd ? segd->delta_lf_u : 0, mr_deltas);
  ------------------
  |  Branch (463:30): [True: 170k, False: 10.8k]
  ------------------
  464|   181k|        calc_lf_value_chroma(lflvl_values[s][3], hdr->loopfilter.level_v,
  465|   181k|                             lf_delta[hdr->delta.lf.multi ? 3 : 0],
  ------------------
  |  Branch (465:39): [True: 144k, False: 36.7k]
  ------------------
  466|   181k|                             segd ? segd->delta_lf_v : 0, mr_deltas);
  ------------------
  |  Branch (466:30): [True: 170k, False: 10.8k]
  ------------------
  467|   181k|    }
  468|  32.2k|}
lf_mask.c:mask_edges_intra:
  152|   857k|{
  153|   857k|    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
  154|   857k|    const int twl4 = t_dim->lw, thl4 = t_dim->lh;
  155|   857k|    const int twl4c = imin(2, twl4), thl4c = imin(2, thl4);
  156|   857k|    int y, x;
  157|       |
  158|       |    // left block edge
  159|   857k|    unsigned mask = 1U << by4;
  160|  3.92M|    for (y = 0; y < h4; y++, mask <<= 1) {
  ------------------
  |  Branch (160:17): [True: 3.06M, False: 857k]
  ------------------
  161|  3.06M|        const int sidx = mask >= 0x10000;
  162|  3.06M|        const unsigned smask = mask >> (sidx << 4);
  163|  3.06M|        masks[0][bx4][imin(twl4c, l[y])][sidx] |= smask;
  164|  3.06M|    }
  165|       |
  166|       |    // top block edge
  167|  3.44M|    for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) {
  ------------------
  |  Branch (167:35): [True: 2.59M, False: 857k]
  ------------------
  168|  2.59M|        const int sidx = mask >= 0x10000;
  169|  2.59M|        const unsigned smask = mask >> (sidx << 4);
  170|  2.59M|        masks[1][by4][imin(thl4c, a[x])][sidx] |= smask;
  171|  2.59M|    }
  172|       |
  173|       |    // inner (tx) left|right edges
  174|   857k|    const int hstep = t_dim->w;
  175|   857k|    unsigned t = 1U << by4;
  176|   857k|    unsigned inner = (unsigned) ((((uint64_t) t) << h4) - t);
  177|   857k|    unsigned inner1 = inner & 0xffff, inner2 = inner >> 16;
  178|   987k|    for (x = hstep; x < w4; x += hstep) {
  ------------------
  |  Branch (178:21): [True: 129k, False: 857k]
  ------------------
  179|   129k|        if (inner1) masks[0][bx4 + x][twl4c][0] |= inner1;
  ------------------
  |  Branch (179:13): [True: 98.9k, False: 30.8k]
  ------------------
  180|   129k|        if (inner2) masks[0][bx4 + x][twl4c][1] |= inner2;
  ------------------
  |  Branch (180:13): [True: 34.9k, False: 94.8k]
  ------------------
  181|   129k|    }
  182|       |
  183|       |    //            top
  184|       |    // inner (tx) --- edges
  185|       |    //           bottom
  186|   857k|    const int vstep = t_dim->h;
  187|   857k|    t = 1U << bx4;
  188|   857k|    inner = (unsigned) ((((uint64_t) t) << w4) - t);
  189|   857k|    inner1 = inner & 0xffff;
  190|   857k|    inner2 = inner >> 16;
  191|  1.15M|    for (y = vstep; y < h4; y += vstep) {
  ------------------
  |  Branch (191:21): [True: 299k, False: 857k]
  ------------------
  192|   299k|        if (inner1) masks[1][by4 + y][thl4c][0] |= inner1;
  ------------------
  |  Branch (192:13): [True: 251k, False: 47.5k]
  ------------------
  193|   299k|        if (inner2) masks[1][by4 + y][thl4c][1] |= inner2;
  ------------------
  |  Branch (193:13): [True: 51.2k, False: 248k]
  ------------------
  194|   299k|    }
  195|       |
  196|   857k|    dav1d_memset_likely_pow2(a, thl4c, w4);
  197|   857k|    dav1d_memset_likely_pow2(l, twl4c, h4);
  198|   857k|}
lf_mask.c:mask_edges_chroma:
  207|   589k|{
  208|   589k|    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
  209|   589k|    const int twl4 = t_dim->lw, thl4 = t_dim->lh;
  210|   589k|    const int twl4c = !!twl4, thl4c = !!thl4;
  211|   589k|    int y, x;
  212|   589k|    const int vbits = 4 - ss_ver, hbits = 4 - ss_hor;
  213|   589k|    const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
  214|   589k|    const unsigned vmax = 1 << vmask, hmax = 1 << hmask;
  215|       |
  216|       |    // left block edge
  217|   589k|    unsigned mask = 1U << cby4;
  218|  2.33M|    for (y = 0; y < ch4; y++, mask <<= 1) {
  ------------------
  |  Branch (218:17): [True: 1.74M, False: 589k]
  ------------------
  219|  1.74M|        const int sidx = mask >= vmax;
  220|  1.74M|        const unsigned smask = mask >> (sidx << vbits);
  221|  1.74M|        masks[0][cbx4][imin(twl4c, l[y])][sidx] |= smask;
  222|  1.74M|    }
  223|       |
  224|       |    // top block edge
  225|  2.34M|    for (x = 0, mask = 1U << cbx4; x < cw4; x++, mask <<= 1) {
  ------------------
  |  Branch (225:36): [True: 1.75M, False: 589k]
  ------------------
  226|  1.75M|        const int sidx = mask >= hmax;
  227|  1.75M|        const unsigned smask = mask >> (sidx << hbits);
  228|  1.75M|        masks[1][cby4][imin(thl4c, a[x])][sidx] |= smask;
  229|  1.75M|    }
  230|       |
  231|   589k|    if (!skip_inter) {
  ------------------
  |  Branch (231:9): [True: 580k, False: 9.54k]
  ------------------
  232|       |        // inner (tx) left|right edges
  233|   580k|        const int hstep = t_dim->w;
  234|   580k|        unsigned t = 1U << cby4;
  235|   580k|        unsigned inner = (unsigned) ((((uint64_t) t) << ch4) - t);
  236|   580k|        unsigned inner1 = inner & ((1 << vmask) - 1), inner2 = inner >> vmask;
  237|   619k|        for (x = hstep; x < cw4; x += hstep) {
  ------------------
  |  Branch (237:25): [True: 39.7k, False: 580k]
  ------------------
  238|  39.7k|            if (inner1) masks[0][cbx4 + x][twl4c][0] |= inner1;
  ------------------
  |  Branch (238:17): [True: 33.8k, False: 5.90k]
  ------------------
  239|  39.7k|            if (inner2) masks[0][cbx4 + x][twl4c][1] |= inner2;
  ------------------
  |  Branch (239:17): [True: 9.70k, False: 30.0k]
  ------------------
  240|  39.7k|        }
  241|       |
  242|       |        //            top
  243|       |        // inner (tx) --- edges
  244|       |        //           bottom
  245|   580k|        const int vstep = t_dim->h;
  246|   580k|        t = 1U << cbx4;
  247|   580k|        inner = (unsigned) ((((uint64_t) t) << cw4) - t);
  248|   580k|        inner1 = inner & ((1 << hmask) - 1), inner2 = inner >> hmask;
  249|   629k|        for (y = vstep; y < ch4; y += vstep) {
  ------------------
  |  Branch (249:25): [True: 49.1k, False: 580k]
  ------------------
  250|  49.1k|            if (inner1) masks[1][cby4 + y][thl4c][0] |= inner1;
  ------------------
  |  Branch (250:17): [True: 41.8k, False: 7.22k]
  ------------------
  251|  49.1k|            if (inner2) masks[1][cby4 + y][thl4c][1] |= inner2;
  ------------------
  |  Branch (251:17): [True: 10.8k, False: 38.2k]
  ------------------
  252|  49.1k|        }
  253|   580k|    }
  254|       |
  255|   589k|    dav1d_memset_likely_pow2(a, thl4c, cw4);
  256|   589k|    dav1d_memset_likely_pow2(l, twl4c, ch4);
  257|   589k|}
lf_mask.c:mask_edges_inter:
   85|  27.1k|{
   86|  27.1k|    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[max_tx];
   87|  27.1k|    int y, x;
   88|       |
   89|  27.1k|    ALIGN_STK_16(uint8_t, txa, 2 /* edge */, [2 /* txsz, step */][32 /* y */][32 /* x */]);
  ------------------
  |  |  100|  27.1k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  27.1k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
   90|  54.6k|    for (int y_off = 0, y = 0; y < h4; y += t_dim->h, y_off++)
  ------------------
  |  Branch (90:32): [True: 27.5k, False: 27.1k]
  ------------------
   91|  57.2k|        for (int x_off = 0, x = 0; x < w4; x += t_dim->w, x_off++)
  ------------------
  |  Branch (91:36): [True: 29.6k, False: 27.5k]
  ------------------
   92|  29.6k|            decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][y][x],
   93|  29.6k|                      max_tx, 0, y_off, x_off, tx_masks);
   94|       |
   95|       |    // left block edge
   96|  27.1k|    unsigned mask = 1U << by4;
   97|   110k|    for (y = 0; y < h4; y++, mask <<= 1) {
  ------------------
  |  Branch (97:17): [True: 83.3k, False: 27.1k]
  ------------------
   98|  83.3k|        const int sidx = mask >= 0x10000;
   99|  83.3k|        const unsigned smask = mask >> (sidx << 4);
  100|  83.3k|        masks[0][bx4][imin(txa[0][0][y][0], l[y])][sidx] |= smask;
  101|  83.3k|    }
  102|       |
  103|       |    // top block edge
  104|   117k|    for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) {
  ------------------
  |  Branch (104:35): [True: 90.2k, False: 27.1k]
  ------------------
  105|  90.2k|        const int sidx = mask >= 0x10000;
  106|  90.2k|        const unsigned smask = mask >> (sidx << 4);
  107|  90.2k|        masks[1][by4][imin(txa[1][0][0][x], a[x])][sidx] |= smask;
  108|  90.2k|    }
  109|       |
  110|  27.1k|    if (!skip) {
  ------------------
  |  Branch (110:9): [True: 13.4k, False: 13.6k]
  ------------------
  111|       |        // inner (tx) left|right edges
  112|  58.2k|        for (y = 0, mask = 1U << by4; y < h4; y++, mask <<= 1) {
  ------------------
  |  Branch (112:39): [True: 44.7k, False: 13.4k]
  ------------------
  113|  44.7k|            const int sidx = mask >= 0x10000U;
  114|  44.7k|            const unsigned smask = mask >> (sidx << 4);
  115|  44.7k|            int ltx = txa[0][0][y][0];
  116|  44.7k|            int step = txa[0][1][y][0];
  117|  55.8k|            for (x = step; x < w4; x += step) {
  ------------------
  |  Branch (117:28): [True: 11.0k, False: 44.7k]
  ------------------
  118|  11.0k|                const int rtx = txa[0][0][y][x];
  119|  11.0k|                masks[0][bx4 + x][imin(rtx, ltx)][sidx] |= smask;
  120|  11.0k|                ltx = rtx;
  121|  11.0k|                step = txa[0][1][y][x];
  122|  11.0k|            }
  123|  44.7k|        }
  124|       |
  125|       |        //            top
  126|       |        // inner (tx) --- edges
  127|       |        //           bottom
  128|  65.9k|        for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) {
  ------------------
  |  Branch (128:39): [True: 52.4k, False: 13.4k]
  ------------------
  129|  52.4k|            const int sidx = mask >= 0x10000U;
  130|  52.4k|            const unsigned smask = mask >> (sidx << 4);
  131|  52.4k|            int ttx = txa[1][0][0][x];
  132|  52.4k|            int step = txa[1][1][0][x];
  133|  62.9k|            for (y = step; y < h4; y += step) {
  ------------------
  |  Branch (133:28): [True: 10.4k, False: 52.4k]
  ------------------
  134|  10.4k|                const int btx = txa[1][0][y][x];
  135|  10.4k|                masks[1][by4 + y][imin(ttx, btx)][sidx] |= smask;
  136|  10.4k|                ttx = btx;
  137|  10.4k|                step = txa[1][1][y][x];
  138|  10.4k|            }
  139|  52.4k|        }
  140|  13.4k|    }
  141|       |
  142|   110k|    for (y = 0; y < h4; y++)
  ------------------
  |  Branch (142:17): [True: 83.3k, False: 27.1k]
  ------------------
  143|  83.3k|        l[y] = txa[0][0][y][w4 - 1];
  144|  27.1k|    memcpy(a, txa[1][0][h4 - 1], w4);
  145|  27.1k|}
lf_mask.c:decomp_tx:
   44|  40.8k|{
   45|  40.8k|    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[from];
   46|  40.8k|    const int is_split = (from == (int) TX_4X4 || depth > 1) ? 0 :
  ------------------
  |  Branch (46:27): [True: 8.35k, False: 32.5k]
  |  Branch (46:51): [True: 2.39k, False: 30.1k]
  ------------------
   47|  40.8k|        (tx_masks[depth] >> (y_off * 4 + x_off)) & 1;
   48|       |
   49|  40.8k|    if (is_split) {
  ------------------
  |  Branch (49:9): [True: 3.70k, False: 37.1k]
  ------------------
   50|  3.70k|        const enum RectTxfmSize sub = t_dim->sub;
   51|  3.70k|        const int htw4 = t_dim->w >> 1, hth4 = t_dim->h >> 1;
   52|       |
   53|  3.70k|        decomp_tx(txa, sub, depth + 1, y_off * 2 + 0, x_off * 2 + 0, tx_masks);
   54|  3.70k|        if (t_dim->w >= t_dim->h)
  ------------------
  |  Branch (54:13): [True: 2.97k, False: 736]
  ------------------
   55|  2.97k|            decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][0][htw4],
   56|  2.97k|                      sub, depth + 1, y_off * 2 + 0, x_off * 2 + 1, tx_masks);
   57|  3.70k|        if (t_dim->h >= t_dim->w) {
  ------------------
  |  Branch (57:13): [True: 2.64k, False: 1.06k]
  ------------------
   58|  2.64k|            decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][hth4][0],
   59|  2.64k|                      sub, depth + 1, y_off * 2 + 1, x_off * 2 + 0, tx_masks);
   60|  2.64k|            if (t_dim->w >= t_dim->h)
  ------------------
  |  Branch (60:17): [True: 1.90k, False: 734]
  ------------------
   61|  1.90k|                decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][hth4][htw4],
   62|  1.90k|                          sub, depth + 1, y_off * 2 + 1, x_off * 2 + 1, tx_masks);
   63|  2.64k|        }
   64|  37.1k|    } else {
   65|  37.1k|        const int lw = imin(2, t_dim->lw), lh = imin(2, t_dim->lh);
   66|       |
   67|  37.1k|#define set_ctx(rep_macro) \
   68|  37.1k|        for (int y = 0; y < t_dim->h; y++) { \
   69|  37.1k|            rep_macro(txa[0][0][y], 0, lw); \
   70|  37.1k|            rep_macro(txa[1][0][y], 0, lh); \
   71|  37.1k|            txa[0][1][y][0] = t_dim->w; \
   72|  37.1k|        }
   73|  37.1k|        case_set_upto16(t_dim->lw);
  ------------------
  |  |   80|  37.1k|    switch (var) { \
  |  |   81|  11.1k|    case 0: set_ctx(set_ctx1); break; \
  |  |  ------------------
  |  |  |  |   68|  26.8k|        for (int y = 0; y < t_dim->h; y++) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (68:25): [True: 15.6k, False: 11.1k]
  |  |  |  |  ------------------
  |  |  |  |   69|  15.6k|            rep_macro(txa[0][0][y], 0, lw); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   81|  15.6k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  15.6k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   70|  15.6k|            rep_macro(txa[1][0][y], 0, lh); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   81|  15.6k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|  15.6k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   71|  15.6k|            txa[0][1][y][0] = t_dim->w; \
  |  |  |  |   72|  15.6k|        }
  |  |  ------------------
  |  |  |  Branch (81:5): [True: 11.1k, False: 26.0k]
  |  |  ------------------
  |  |   82|  13.7k|    case 1: set_ctx(set_ctx2); break; \
  |  |  ------------------
  |  |  |  |   68|  48.2k|        for (int y = 0; y < t_dim->h; y++) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (68:25): [True: 34.4k, False: 13.7k]
  |  |  |  |  ------------------
  |  |  |  |   69|  34.4k|            rep_macro(txa[0][0][y], 0, lw); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   82|  34.4k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  34.4k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   70|  34.4k|            rep_macro(txa[1][0][y], 0, lh); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   82|  34.4k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  34.4k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   71|  34.4k|            txa[0][1][y][0] = t_dim->w; \
  |  |  |  |   72|  34.4k|        }
  |  |  ------------------
  |  |  |  Branch (82:5): [True: 13.7k, False: 23.4k]
  |  |  ------------------
  |  |   83|  8.94k|    case 2: set_ctx(set_ctx4); break; \
  |  |  ------------------
  |  |  |  |   68|  36.1k|        for (int y = 0; y < t_dim->h; y++) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (68:25): [True: 27.1k, False: 8.94k]
  |  |  |  |  ------------------
  |  |  |  |   69|  27.1k|            rep_macro(txa[0][0][y], 0, lw); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   83|  27.1k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  27.1k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   70|  27.1k|            rep_macro(txa[1][0][y], 0, lh); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   83|  27.1k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  27.1k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   71|  27.1k|            txa[0][1][y][0] = t_dim->w; \
  |  |  |  |   72|  27.1k|        }
  |  |  ------------------
  |  |  |  Branch (83:5): [True: 8.94k, False: 28.2k]
  |  |  ------------------
  |  |   84|  2.36k|    case 3: set_ctx(set_ctx8); break; \
  |  |  ------------------
  |  |  |  |   68|  15.7k|        for (int y = 0; y < t_dim->h; y++) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (68:25): [True: 13.4k, False: 2.36k]
  |  |  |  |  ------------------
  |  |  |  |   69|  13.4k|            rep_macro(txa[0][0][y], 0, lw); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  13.4k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  13.4k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   70|  13.4k|            rep_macro(txa[1][0][y], 0, lh); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  13.4k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  13.4k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   71|  13.4k|            txa[0][1][y][0] = t_dim->w; \
  |  |  |  |   72|  13.4k|        }
  |  |  ------------------
  |  |  |  Branch (84:5): [True: 2.36k, False: 34.8k]
  |  |  ------------------
  |  |   85|    966|    case 4: set_ctx(set_ctx16); break; \
  |  |  ------------------
  |  |  |  |   68|  13.3k|        for (int y = 0; y < t_dim->h; y++) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (68:25): [True: 12.4k, False: 966]
  |  |  |  |  ------------------
  |  |  |  |   69|  12.4k|            rep_macro(txa[0][0][y], 0, lw); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  12.4k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  12.4k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  12.4k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  12.4k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 12.4k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   70|  12.4k|            rep_macro(txa[1][0][y], 0, lh); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  12.4k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  12.4k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  12.4k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  12.4k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 12.4k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   71|  12.4k|            txa[0][1][y][0] = t_dim->w; \
  |  |  |  |   72|  12.4k|        }
  |  |  ------------------
  |  |  |  Branch (85:5): [True: 966, False: 36.2k]
  |  |  ------------------
  |  |   86|      0|    default: assert(0); \
  |  |  ------------------
  |  |  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, Folded]
  |  |  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (86:5): [True: 0, False: 37.1k]
  |  |  ------------------
  |  |   87|  37.1k|    }
  ------------------
   74|  37.1k|#undef set_ctx
   75|  37.1k|        dav1d_memset_pow2[t_dim->lw](txa[1][1][0], t_dim->h);
   76|  37.1k|    }
   77|  40.8k|}
lf_mask.c:calc_lf_value:
  408|   604k|{
  409|   604k|    const int base = iclip(iclip(base_lvl + lf_delta, 0, 63) + seg_delta, 0, 63);
  410|       |
  411|   604k|    if (!mr_delta) {
  ------------------
  |  Branch (411:9): [True: 124k, False: 480k]
  ------------------
  412|   124k|        memset(lflvl_values, base, sizeof(*lflvl_values) * 8);
  413|   480k|    } else {
  414|   480k|        const int sh = base >= 32;
  415|   480k|        lflvl_values[0][0] = lflvl_values[0][1] =
  416|   480k|            iclip(base + (mr_delta->ref_delta[0] * (1 << sh)), 0, 63);
  417|  3.72M|        for (int r = 1; r < 8; r++) {
  ------------------
  |  Branch (417:25): [True: 3.24M, False: 480k]
  ------------------
  418|  9.69M|            for (int m = 0; m < 2; m++) {
  ------------------
  |  Branch (418:29): [True: 6.44M, False: 3.24M]
  ------------------
  419|  6.44M|                const int delta =
  420|  6.44M|                    mr_delta->mode_delta[m] + mr_delta->ref_delta[r];
  421|  6.44M|                lflvl_values[r][m] = iclip(base + (delta * (1 << sh)), 0, 63);
  422|  6.44M|            }
  423|  3.24M|        }
  424|   480k|    }
  425|   604k|}
lf_mask.c:calc_lf_value_chroma:
  431|   361k|{
  432|   361k|    if (!base_lvl)
  ------------------
  |  Branch (432:9): [True: 114k, False: 247k]
  ------------------
  433|   114k|        memset(lflvl_values, 0, sizeof(*lflvl_values) * 8);
  434|   247k|    else
  435|   247k|        calc_lf_value(lflvl_values, base_lvl, lf_delta, seg_delta, mr_delta);
  436|   361k|}

dav1d_default_settings:
   71|  17.2k|COLD void dav1d_default_settings(Dav1dSettings *const s) {
   72|  17.2k|    s->n_threads = 0;
   73|  17.2k|    s->max_frame_delay = 0;
   74|  17.2k|    s->apply_grain = 1;
   75|  17.2k|    s->allocator.cookie = NULL;
   76|  17.2k|    s->allocator.alloc_picture_callback = dav1d_default_picture_alloc;
   77|  17.2k|    s->allocator.release_picture_callback = dav1d_default_picture_release;
   78|       |    s->logger.cookie = NULL;
   79|  17.2k|    s->logger.callback = dav1d_log_default_callback;
   80|  17.2k|    s->operating_point = 0;
   81|  17.2k|    s->all_layers = 1; // just until the tests are adjusted
   82|  17.2k|    s->frame_size_limit = 0;
   83|  17.2k|    s->strict_std_compliance = 0;
   84|  17.2k|    s->output_invisible_frames = 0;
   85|  17.2k|    s->inloop_filters = DAV1D_INLOOPFILTER_ALL;
   86|  17.2k|    s->decode_frame_type = DAV1D_DECODEFRAMETYPE_ALL;
   87|  17.2k|}
dav1d_open:
  138|  17.2k|COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
  139|  17.2k|    static pthread_once_t initted = PTHREAD_ONCE_INIT;
  140|  17.2k|    pthread_once(&initted, init_internal);
  141|       |
  142|  17.2k|    validate_input_or_ret(c_out != NULL, DAV1D_ERR(EINVAL));
  ------------------
  |  |   52|  17.2k|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:9): [True: 0, False: 17.2k]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
  143|  17.2k|    validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL));
  ------------------
  |  |   52|  17.2k|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:9): [True: 0, False: 17.2k]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
  144|  17.2k|    validate_input_or_ret(s->n_threads >= 0 &&
  ------------------
  |  |   52|  34.4k|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:11): [True: 17.2k, False: 0]
  |  |  |  Branch (52:11): [True: 17.2k, False: 0]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
  145|  17.2k|                          s->n_threads <= DAV1D_MAX_THREADS, DAV1D_ERR(EINVAL));
  146|  17.2k|    validate_input_or_ret(s->max_frame_delay >= 0 &&
  ------------------
  |  |   52|  34.4k|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:11): [True: 17.2k, False: 0]
  |  |  |  Branch (52:11): [True: 17.2k, False: 0]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
  147|  17.2k|                          s->max_frame_delay <= DAV1D_MAX_FRAME_DELAY, DAV1D_ERR(EINVAL));
  148|  17.2k|    validate_input_or_ret(s->allocator.alloc_picture_callback != NULL,
  ------------------
  |  |   52|  17.2k|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:9): [True: 0, False: 17.2k]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
  149|  17.2k|                          DAV1D_ERR(EINVAL));
  150|  17.2k|    validate_input_or_ret(s->allocator.release_picture_callback != NULL,
  ------------------
  |  |   52|  17.2k|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:9): [True: 0, False: 17.2k]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
  151|  17.2k|                          DAV1D_ERR(EINVAL));
  152|  17.2k|    validate_input_or_ret(s->operating_point >= 0 &&
  ------------------
  |  |   52|  34.4k|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:11): [True: 17.2k, False: 0]
  |  |  |  Branch (52:11): [True: 17.2k, False: 0]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
  153|  17.2k|                          s->operating_point <= 31, DAV1D_ERR(EINVAL));
  154|  17.2k|    validate_input_or_ret(s->decode_frame_type >= DAV1D_DECODEFRAMETYPE_ALL &&
  ------------------
  |  |   52|  34.4k|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:11): [True: 17.2k, False: 0]
  |  |  |  Branch (52:11): [True: 17.2k, False: 0]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
  155|  17.2k|                          s->decode_frame_type <= DAV1D_DECODEFRAMETYPE_KEY, DAV1D_ERR(EINVAL));
  156|       |
  157|  17.2k|    pthread_attr_t thread_attr;
  158|  17.2k|    if (pthread_attr_init(&thread_attr)) return DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  |  Branch (158:9): [True: 0, False: 17.2k]
  ------------------
  159|  17.2k|    size_t stack_size = 1024 * 1024 + get_stack_size_internal(&thread_attr);
  160|       |
  161|  17.2k|    pthread_attr_setstacksize(&thread_attr, stack_size);
  162|       |
  163|  17.2k|    Dav1dContext *const c = *c_out = dav1d_alloc_aligned(ALLOC_COMMON_CTX, sizeof(*c), 64);
  ------------------
  |  |  134|  17.2k|#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
  ------------------
  164|  17.2k|    if (!c) goto error;
  ------------------
  |  Branch (164:9): [True: 0, False: 17.2k]
  ------------------
  165|  17.2k|    memset(c, 0, sizeof(*c));
  166|       |
  167|  17.2k|    c->allocator = s->allocator;
  168|  17.2k|    c->logger = s->logger;
  169|  17.2k|    c->apply_grain = s->apply_grain;
  170|  17.2k|    c->operating_point = s->operating_point;
  171|  17.2k|    c->all_layers = s->all_layers;
  172|  17.2k|    c->frame_size_limit = s->frame_size_limit;
  173|  17.2k|    c->strict_std_compliance = s->strict_std_compliance;
  174|  17.2k|    c->output_invisible_frames = s->output_invisible_frames;
  175|  17.2k|    c->inloop_filters = s->inloop_filters;
  176|  17.2k|    c->decode_frame_type = s->decode_frame_type;
  177|       |
  178|  17.2k|    dav1d_data_props_set_defaults(&c->cached_error_props);
  179|       |
  180|  17.2k|    if (dav1d_mem_pool_init(ALLOC_OBU_HDR, &c->seq_hdr_pool) ||
  ------------------
  |  |  131|  34.4k|#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
  |  |  ------------------
  |  |  |  Branch (131:41): [True: 0, False: 17.2k]
  |  |  ------------------
  ------------------
  181|  17.2k|        dav1d_mem_pool_init(ALLOC_OBU_HDR, &c->frame_hdr_pool) ||
  ------------------
  |  |  131|  34.4k|#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
  |  |  ------------------
  |  |  |  Branch (131:41): [True: 0, False: 17.2k]
  |  |  ------------------
  ------------------
  182|  17.2k|        dav1d_mem_pool_init(ALLOC_SEGMAP, &c->segmap_pool) ||
  ------------------
  |  |  131|  34.4k|#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
  |  |  ------------------
  |  |  |  Branch (131:41): [True: 0, False: 17.2k]
  |  |  ------------------
  ------------------
  183|  17.2k|        dav1d_mem_pool_init(ALLOC_REFMVS, &c->refmvs_pool) ||
  ------------------
  |  |  131|  34.4k|#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
  |  |  ------------------
  |  |  |  Branch (131:41): [True: 0, False: 17.2k]
  |  |  ------------------
  ------------------
  184|  17.2k|        dav1d_mem_pool_init(ALLOC_PIC_CTX, &c->pic_ctx_pool) ||
  ------------------
  |  |  131|  34.4k|#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
  |  |  ------------------
  |  |  |  Branch (131:41): [True: 0, False: 17.2k]
  |  |  ------------------
  ------------------
  185|  17.2k|        dav1d_mem_pool_init(ALLOC_CDF, &c->cdf_pool))
  ------------------
  |  |  131|  17.2k|#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
  |  |  ------------------
  |  |  |  Branch (131:41): [True: 0, False: 17.2k]
  |  |  ------------------
  ------------------
  186|      0|    {
  187|      0|        goto error;
  188|      0|    }
  189|       |
  190|  17.2k|    if (c->allocator.alloc_picture_callback   == dav1d_default_picture_alloc &&
  ------------------
  |  Branch (190:9): [True: 17.2k, False: 0]
  ------------------
  191|  17.2k|        c->allocator.release_picture_callback == dav1d_default_picture_release)
  ------------------
  |  Branch (191:9): [True: 17.2k, False: 0]
  ------------------
  192|  17.2k|    {
  193|  17.2k|        if (c->allocator.cookie) goto error;
  ------------------
  |  Branch (193:13): [True: 0, False: 17.2k]
  ------------------
  194|  17.2k|        if (dav1d_mem_pool_init(ALLOC_PIC, &c->picture_pool)) goto error;
  ------------------
  |  |  131|  17.2k|#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
  |  |  ------------------
  |  |  |  Branch (131:41): [True: 0, False: 17.2k]
  |  |  ------------------
  ------------------
  195|  17.2k|        c->allocator.cookie = c->picture_pool;
  196|  17.2k|    } else if (c->allocator.alloc_picture_callback   == dav1d_default_picture_alloc ||
  ------------------
  |  Branch (196:16): [True: 0, False: 0]
  ------------------
  197|      0|               c->allocator.release_picture_callback == dav1d_default_picture_release)
  ------------------
  |  Branch (197:16): [True: 0, False: 0]
  ------------------
  198|      0|    {
  199|      0|        goto error;
  200|      0|    }
  201|       |
  202|       |    /* On 32-bit systems extremely large frame sizes can cause overflows in
  203|       |     * dav1d_decode_frame() malloc size calculations. Prevent that from occuring
  204|       |     * by enforcing a maximum frame size limit, chosen to roughly correspond to
  205|       |     * the largest size possible to decode without exhausting virtual memory. */
  206|  17.2k|    if (sizeof(size_t) < 8 && s->frame_size_limit - 1 >= 8192 * 8192) {
  ------------------
  |  Branch (206:9): [Folded, False: 17.2k]
  |  Branch (206:31): [True: 0, False: 0]
  ------------------
  207|      0|        c->frame_size_limit = 8192 * 8192;
  208|      0|        if (s->frame_size_limit)
  ------------------
  |  Branch (208:13): [True: 0, False: 0]
  ------------------
  209|      0|            dav1d_log(c, "Frame size limit reduced from %u to %u.\n",
  ------------------
  |  |   39|      0|#define dav1d_log dav1d_log
  ------------------
  210|      0|                      s->frame_size_limit, c->frame_size_limit);
  211|      0|    }
  212|       |
  213|  17.2k|    c->flush = &c->flush_mem;
  214|  17.2k|    atomic_init(c->flush, 0);
  215|       |
  216|  17.2k|    get_num_threads(c, s, &c->n_tc, &c->n_fc);
  217|       |
  218|  17.2k|    c->fc = dav1d_alloc_aligned(ALLOC_THREAD_CTX, sizeof(*c->fc) * c->n_fc, 32);
  ------------------
  |  |  134|  17.2k|#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
  ------------------
  219|  17.2k|    if (!c->fc) goto error;
  ------------------
  |  Branch (219:9): [True: 0, False: 17.2k]
  ------------------
  220|  17.2k|    memset(c->fc, 0, sizeof(*c->fc) * c->n_fc);
  221|       |
  222|  17.2k|    c->tc = dav1d_alloc_aligned(ALLOC_THREAD_CTX, sizeof(*c->tc) * c->n_tc, 64);
  ------------------
  |  |  134|  17.2k|#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
  ------------------
  223|  17.2k|    if (!c->tc) goto error;
  ------------------
  |  Branch (223:9): [True: 0, False: 17.2k]
  ------------------
  224|  17.2k|    memset(c->tc, 0, sizeof(*c->tc) * c->n_tc);
  225|  17.2k|    if (c->n_tc > 1) {
  ------------------
  |  Branch (225:9): [True: 16.3k, False: 912]
  ------------------
  226|  16.3k|        if (pthread_mutex_init(&c->task_thread.lock, NULL)) goto error;
  ------------------
  |  Branch (226:13): [True: 0, False: 16.3k]
  ------------------
  227|  16.3k|        if (pthread_cond_init(&c->task_thread.cond, NULL)) {
  ------------------
  |  Branch (227:13): [True: 0, False: 16.3k]
  ------------------
  228|      0|            pthread_mutex_destroy(&c->task_thread.lock);
  229|      0|            goto error;
  230|      0|        }
  231|  16.3k|        if (pthread_cond_init(&c->task_thread.delayed_fg.cond, NULL)) {
  ------------------
  |  Branch (231:13): [True: 0, False: 16.3k]
  ------------------
  232|      0|            pthread_cond_destroy(&c->task_thread.cond);
  233|      0|            pthread_mutex_destroy(&c->task_thread.lock);
  234|      0|            goto error;
  235|      0|        }
  236|  16.3k|        c->task_thread.cur = c->n_fc;
  237|  16.3k|        atomic_init(&c->task_thread.reset_task_cur, UINT_MAX);
  238|  16.3k|        atomic_init(&c->task_thread.cond_signaled, 0);
  239|  16.3k|        c->task_thread.inited = 1;
  240|  16.3k|    }
  241|       |
  242|  17.2k|    if (c->n_fc > 1) {
  ------------------
  |  Branch (242:9): [True: 0, False: 17.2k]
  ------------------
  243|      0|        const size_t out_delayed_sz = sizeof(*c->frame_thread.out_delayed) * c->n_fc;
  244|      0|        c->frame_thread.out_delayed =
  245|      0|            dav1d_malloc(ALLOC_THREAD_CTX, out_delayed_sz);
  ------------------
  |  |  132|      0|#define dav1d_malloc(type, sz) malloc(sz)
  ------------------
  246|      0|        if (!c->frame_thread.out_delayed) goto error;
  ------------------
  |  Branch (246:13): [True: 0, False: 0]
  ------------------
  247|      0|        memset(c->frame_thread.out_delayed, 0, out_delayed_sz);
  248|      0|    }
  249|  34.4k|    for (unsigned n = 0; n < c->n_fc; n++) {
  ------------------
  |  Branch (249:26): [True: 17.2k, False: 17.2k]
  ------------------
  250|  17.2k|        Dav1dFrameContext *const f = &c->fc[n];
  251|  17.2k|        if (c->n_tc > 1) {
  ------------------
  |  Branch (251:13): [True: 16.3k, False: 912]
  ------------------
  252|  16.3k|            if (pthread_mutex_init(&f->task_thread.lock, NULL)) goto error;
  ------------------
  |  Branch (252:17): [True: 0, False: 16.3k]
  ------------------
  253|  16.3k|            if (pthread_cond_init(&f->task_thread.cond, NULL)) {
  ------------------
  |  Branch (253:17): [True: 0, False: 16.3k]
  ------------------
  254|      0|                pthread_mutex_destroy(&f->task_thread.lock);
  255|      0|                goto error;
  256|      0|            }
  257|  16.3k|            if (pthread_mutex_init(&f->task_thread.pending_tasks.lock, NULL)) {
  ------------------
  |  Branch (257:17): [True: 0, False: 16.3k]
  ------------------
  258|      0|                pthread_cond_destroy(&f->task_thread.cond);
  259|      0|                pthread_mutex_destroy(&f->task_thread.lock);
  260|      0|                goto error;
  261|      0|            }
  262|  16.3k|        }
  263|  17.2k|        f->c = c;
  264|  17.2k|        f->task_thread.ttd = &c->task_thread;
  265|  17.2k|        f->lf.last_sharpness = -1;
  266|  17.2k|    }
  267|       |
  268|   553k|    for (unsigned m = 0; m < c->n_tc; m++) {
  ------------------
  |  Branch (268:26): [True: 535k, False: 17.2k]
  ------------------
  269|   535k|        Dav1dTaskContext *const t = &c->tc[m];
  270|   535k|        t->f = &c->fc[0];
  271|   535k|        t->task_thread.ttd = &c->task_thread;
  272|   535k|        t->c = c;
  273|   535k|        memset(t->cf_16bpc, 0, sizeof(t->cf_16bpc));
  274|   535k|        if (c->n_tc > 1) {
  ------------------
  |  Branch (274:13): [True: 535k, False: 912]
  ------------------
  275|   535k|            if (pthread_mutex_init(&t->task_thread.td.lock, NULL)) goto error;
  ------------------
  |  Branch (275:17): [True: 0, False: 535k]
  ------------------
  276|   535k|            if (pthread_cond_init(&t->task_thread.td.cond, NULL)) {
  ------------------
  |  Branch (276:17): [True: 0, False: 535k]
  ------------------
  277|      0|                pthread_mutex_destroy(&t->task_thread.td.lock);
  278|      0|                goto error;
  279|      0|            }
  280|   535k|            if (pthread_create(&t->task_thread.td.thread, &thread_attr, dav1d_worker_task, t)) {
  ------------------
  |  Branch (280:17): [True: 0, False: 535k]
  ------------------
  281|      0|                pthread_cond_destroy(&t->task_thread.td.cond);
  282|      0|                pthread_mutex_destroy(&t->task_thread.td.lock);
  283|      0|                goto error;
  284|      0|            }
  285|   535k|            t->task_thread.td.inited = 1;
  286|   535k|        }
  287|   535k|    }
  288|  17.2k|    dav1d_pal_dsp_init(&c->pal_dsp);
  289|  17.2k|    dav1d_refmvs_dsp_init(&c->refmvs_dsp);
  290|       |
  291|  17.2k|    pthread_attr_destroy(&thread_attr);
  292|       |
  293|  17.2k|    return 0;
  294|       |
  295|      0|error:
  296|      0|    if (c) close_internal(c_out, 0);
  ------------------
  |  Branch (296:9): [True: 0, False: 0]
  ------------------
  297|      0|    pthread_attr_destroy(&thread_attr);
  298|      0|    return DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  299|  17.2k|}
dav1d_send_data:
  437|  20.8k|{
  438|  20.8k|    validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
  ------------------
  |  |   52|  20.8k|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:9): [True: 0, False: 20.8k]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
  439|  20.8k|    validate_input_or_ret(in != NULL, DAV1D_ERR(EINVAL));
  ------------------
  |  |   52|  20.8k|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:9): [True: 0, False: 20.8k]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
  440|       |
  441|  20.8k|    if (in->data) {
  ------------------
  |  Branch (441:9): [True: 20.8k, False: 0]
  ------------------
  442|  20.8k|        validate_input_or_ret(in->sz > 0 && in->sz <= SIZE_MAX / 2, DAV1D_ERR(EINVAL));
  ------------------
  |  |   52|  41.7k|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:11): [True: 20.8k, False: 0]
  |  |  |  Branch (52:11): [True: 20.8k, False: 0]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
  443|  20.8k|        c->drain = 0;
  444|  20.8k|    }
  445|  20.8k|    if (c->in.data)
  ------------------
  |  Branch (445:9): [True: 0, False: 20.8k]
  ------------------
  446|      0|        return DAV1D_ERR(EAGAIN);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  447|  20.8k|    dav1d_data_ref(&c->in, in);
  448|       |
  449|  20.8k|    int res = gen_picture(c);
  450|  20.8k|    if (!res)
  ------------------
  |  Branch (450:9): [True: 10.0k, False: 10.8k]
  ------------------
  451|  10.0k|        dav1d_data_unref_internal(in);
  452|       |
  453|  20.8k|    return res;
  454|  20.8k|}
dav1d_get_picture:
  457|  19.2k|{
  458|  19.2k|    validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
  ------------------
  |  |   52|  19.2k|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:9): [True: 0, False: 19.2k]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
  459|  19.2k|    validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
  ------------------
  |  |   52|  19.2k|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:9): [True: 0, False: 19.2k]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
  460|       |
  461|  19.2k|    const int drain = c->drain;
  462|  19.2k|    c->drain = 1;
  463|       |
  464|  19.2k|    int res = gen_picture(c);
  465|  19.2k|    if (res < 0)
  ------------------
  |  Branch (465:9): [True: 1.44k, False: 17.7k]
  ------------------
  466|  1.44k|        return res;
  467|       |
  468|  17.7k|    if (c->cached_error) {
  ------------------
  |  Branch (468:9): [True: 0, False: 17.7k]
  ------------------
  469|      0|        const int res = c->cached_error;
  470|      0|        c->cached_error = 0;
  471|      0|        return res;
  472|      0|    }
  473|       |
  474|  17.7k|    if (output_picture_ready(c, c->n_fc == 1))
  ------------------
  |  Branch (474:9): [True: 9.21k, False: 8.56k]
  ------------------
  475|  9.21k|        return output_image(c, out);
  476|       |
  477|  8.56k|    if (c->n_fc > 1 && drain)
  ------------------
  |  Branch (477:9): [True: 0, False: 8.56k]
  |  Branch (477:24): [True: 0, False: 0]
  ------------------
  478|      0|        return drain_picture(c, out);
  479|       |
  480|  8.56k|    return DAV1D_ERR(EAGAIN);
  ------------------
  |  |   56|  8.56k|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  481|  8.56k|}
dav1d_apply_grain:
  485|    297|{
  486|    297|    validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
  ------------------
  |  |   52|    297|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:9): [True: 0, False: 297]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
  487|    297|    validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
  ------------------
  |  |   52|    297|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:9): [True: 0, False: 297]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
  488|    297|    validate_input_or_ret(in != NULL, DAV1D_ERR(EINVAL));
  ------------------
  |  |   52|    297|    if (!(x)) { \
  |  |  ------------------
  |  |  |  Branch (52:9): [True: 0, False: 297]
  |  |  ------------------
  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  ------------------
  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   54|      0|                    #x, __func__); \
  |  |   55|      0|        debug_abort(); \
  |  |  ------------------
  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   56|      0|        return r; \
  |  |   57|      0|    }
  ------------------
  489|       |
  490|    297|    if (!has_grain(in)) {
  ------------------
  |  Branch (490:9): [True: 0, False: 297]
  ------------------
  491|      0|        dav1d_picture_ref(out, in);
  492|      0|        return 0;
  493|      0|    }
  494|       |
  495|    297|    int res = dav1d_picture_alloc_copy(c, out, in->p.w, in);
  496|    297|    if (res < 0) goto error;
  ------------------
  |  Branch (496:9): [True: 0, False: 297]
  ------------------
  497|       |
  498|    297|    if (c->n_tc > 1) {
  ------------------
  |  Branch (498:9): [True: 242, False: 55]
  ------------------
  499|    242|        dav1d_task_delayed_fg(c, out, in);
  500|    242|    } else {
  501|     55|        switch (out->p.bpc) {
  502|      0|#if CONFIG_8BPC
  503|     16|        case 8:
  ------------------
  |  Branch (503:9): [True: 16, False: 39]
  ------------------
  504|     16|            dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in);
  505|     16|            break;
  506|      0|#endif
  507|      0|#if CONFIG_16BPC
  508|     29|        case 10:
  ------------------
  |  Branch (508:9): [True: 29, False: 26]
  ------------------
  509|     39|        case 12:
  ------------------
  |  Branch (509:9): [True: 10, False: 45]
  ------------------
  510|     39|            dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in);
  511|     39|            break;
  512|      0|#endif
  513|      0|        default: abort();
  ------------------
  |  Branch (513:9): [True: 0, False: 55]
  ------------------
  514|     55|        }
  515|     55|    }
  516|       |
  517|    297|    return 0;
  518|       |
  519|      0|error:
  520|      0|    dav1d_picture_unref_internal(out);
  521|      0|    return res;
  522|    297|}
dav1d_flush:
  524|  17.2k|void dav1d_flush(Dav1dContext *const c) {
  525|  17.2k|    dav1d_data_unref_internal(&c->in);
  526|  17.2k|    if (c->out.p.frame_hdr)
  ------------------
  |  Branch (526:9): [True: 0, False: 17.2k]
  ------------------
  527|      0|        dav1d_thread_picture_unref(&c->out);
  528|  17.2k|    if (c->cache.p.frame_hdr)
  ------------------
  |  Branch (528:9): [True: 498, False: 16.7k]
  ------------------
  529|    498|        dav1d_thread_picture_unref(&c->cache);
  530|       |
  531|  17.2k|    c->drain = 0;
  532|  17.2k|    c->cached_error = 0;
  533|       |
  534|   155k|    for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (534:21): [True: 137k, False: 17.2k]
  ------------------
  535|   137k|        if (c->refs[i].p.p.frame_hdr)
  ------------------
  |  Branch (535:13): [True: 55.7k, False: 82.0k]
  ------------------
  536|  55.7k|            dav1d_thread_picture_unref(&c->refs[i].p);
  537|   137k|        dav1d_ref_dec(&c->refs[i].segmap);
  538|   137k|        dav1d_ref_dec(&c->refs[i].refmvs);
  539|   137k|        dav1d_cdf_thread_unref(&c->cdf[i]);
  540|   137k|    }
  541|  17.2k|    c->frame_hdr = NULL;
  542|  17.2k|    c->seq_hdr = NULL;
  543|  17.2k|    dav1d_ref_dec(&c->seq_hdr_ref);
  544|       |
  545|  17.2k|    c->mastering_display = NULL;
  546|  17.2k|    c->content_light = NULL;
  547|  17.2k|    c->itut_t35 = NULL;
  548|  17.2k|    c->n_itut_t35 = 0;
  549|  17.2k|    dav1d_ref_dec(&c->mastering_display_ref);
  550|  17.2k|    dav1d_ref_dec(&c->content_light_ref);
  551|  17.2k|    dav1d_ref_dec(&c->itut_t35_ref);
  552|       |
  553|  17.2k|    dav1d_data_props_unref_internal(&c->cached_error_props);
  554|       |
  555|  17.2k|    if (c->n_fc == 1 && c->n_tc == 1) return;
  ------------------
  |  Branch (555:9): [True: 17.2k, False: 0]
  |  Branch (555:25): [True: 912, False: 16.3k]
  ------------------
  556|  17.2k|    atomic_store(c->flush, 1);
  557|       |
  558|  16.3k|    if (c->n_tc > 1) {
  ------------------
  |  Branch (558:9): [True: 16.3k, False: 0]
  ------------------
  559|  16.3k|        pthread_mutex_lock(&c->task_thread.lock);
  560|       |        // stop running tasks in worker threads
  561|   551k|        for (unsigned i = 0; i < c->n_tc; i++) {
  ------------------
  |  Branch (561:30): [True: 535k, False: 16.3k]
  ------------------
  562|   535k|            Dav1dTaskContext *const tc = &c->tc[i];
  563|   535k|            while (!tc->task_thread.flushed) {
  ------------------
  |  Branch (563:20): [True: 988, False: 535k]
  ------------------
  564|    988|                pthread_cond_wait(&tc->task_thread.td.cond, &c->task_thread.lock);
  565|    988|            }
  566|   535k|        }
  567|  32.6k|        for (unsigned i = 0; i < c->n_fc; i++) {
  ------------------
  |  Branch (567:30): [True: 16.3k, False: 16.3k]
  ------------------
  568|  16.3k|            c->fc[i].task_thread.task_head = NULL;
  569|  16.3k|            c->fc[i].task_thread.task_tail = NULL;
  570|  16.3k|            c->fc[i].task_thread.task_cur_prev = NULL;
  571|  16.3k|            c->fc[i].task_thread.pending_tasks.head = NULL;
  572|  16.3k|            c->fc[i].task_thread.pending_tasks.tail = NULL;
  573|  16.3k|            atomic_init(&c->fc[i].task_thread.pending_tasks.merge, 0);
  574|  16.3k|        }
  575|  16.3k|        atomic_init(&c->task_thread.first, 0);
  576|  16.3k|        c->task_thread.cur = c->n_fc;
  577|  16.3k|        atomic_store(&c->task_thread.reset_task_cur, UINT_MAX);
  578|  16.3k|        atomic_store(&c->task_thread.cond_signaled, 0);
  579|  16.3k|        pthread_mutex_unlock(&c->task_thread.lock);
  580|  16.3k|    }
  581|       |
  582|  16.3k|    if (c->n_fc > 1) {
  ------------------
  |  Branch (582:9): [True: 0, False: 16.3k]
  ------------------
  583|      0|        for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) {
  ------------------
  |  Branch (583:59): [True: 0, False: 0]
  ------------------
  584|      0|            if (next == c->n_fc) next = 0;
  ------------------
  |  Branch (584:17): [True: 0, False: 0]
  ------------------
  585|      0|            Dav1dFrameContext *const f = &c->fc[next];
  586|      0|            dav1d_decode_frame_exit(f, -1);
  587|      0|            f->n_tile_data = 0;
  588|      0|            f->task_thread.retval = 0;
  589|      0|            f->task_thread.error = 0;
  590|      0|            Dav1dThreadPicture *out_delayed = &c->frame_thread.out_delayed[next];
  591|      0|            if (out_delayed->p.frame_hdr) {
  ------------------
  |  Branch (591:17): [True: 0, False: 0]
  ------------------
  592|      0|                dav1d_thread_picture_unref(out_delayed);
  593|      0|            }
  594|      0|        }
  595|      0|        c->frame_thread.next = 0;
  596|      0|    }
  597|       |    atomic_store(c->flush, 0);
  598|  16.3k|}
dav1d_close:
  600|  17.2k|COLD void dav1d_close(Dav1dContext **const c_out) {
  601|  17.2k|    validate_input(c_out != NULL);
  ------------------
  |  |   59|  17.2k|#define validate_input(x) validate_input_or_ret(x, )
  |  |  ------------------
  |  |  |  |   52|  17.2k|    if (!(x)) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (52:9): [True: 0, False: 17.2k]
  |  |  |  |  ------------------
  |  |  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  |  |  ------------------
  |  |  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   54|      0|                    #x, __func__); \
  |  |  |  |   55|      0|        debug_abort(); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   56|      0|        return r; \
  |  |  |  |   57|      0|    }
  |  |  ------------------
  ------------------
  602|       |#if TRACK_HEAP_ALLOCATIONS
  603|       |    dav1d_log_alloc_stats(*c_out);
  604|       |#endif
  605|  17.2k|    close_internal(c_out, 1);
  606|  17.2k|}
dav1d_picture_unref:
  725|  15.0k|void dav1d_picture_unref(Dav1dPicture *const p) {
  726|  15.0k|    dav1d_picture_unref_internal(p);
  727|  15.0k|}
dav1d_data_wrap:
  738|  20.8k|{
  739|  20.8k|    return dav1d_data_wrap_internal(buf, ptr, sz, free_callback, user_data);
  740|  20.8k|}
dav1d_data_unref:
  754|  10.8k|void dav1d_data_unref(Dav1dData *const buf) {
  755|  10.8k|    dav1d_data_unref_internal(buf);
  756|  10.8k|}
lib.c:get_num_threads:
  109|  17.2k|{
  110|       |    /* ceil(sqrt(n)) */
  111|  17.2k|    static const uint8_t fc_lut[49] = {
  112|  17.2k|        1,                                     /*     1 */
  113|  17.2k|        2, 2, 2,                               /*  2- 4 */
  114|  17.2k|        3, 3, 3, 3, 3,                         /*  5- 9 */
  115|  17.2k|        4, 4, 4, 4, 4, 4, 4,                   /* 10-16 */
  116|  17.2k|        5, 5, 5, 5, 5, 5, 5, 5, 5,             /* 17-25 */
  117|  17.2k|        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,       /* 26-36 */
  118|  17.2k|        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 37-49 */
  119|  17.2k|    };
  120|  17.2k|    *n_tc = s->n_threads ? s->n_threads :
  ------------------
  |  Branch (120:13): [True: 17.2k, False: 0]
  ------------------
  121|  17.2k|        iclip(dav1d_num_logical_processors(c), 1, DAV1D_MAX_THREADS);
  ------------------
  |  |   46|      0|#define DAV1D_MAX_THREADS 256
  ------------------
  122|  17.2k|    *n_fc = s->max_frame_delay ? umin(s->max_frame_delay, *n_tc) :
  ------------------
  |  Branch (122:13): [True: 17.2k, False: 0]
  ------------------
  123|  17.2k|            *n_tc < 50 ? fc_lut[*n_tc - 1] : 8; // min(8, ceil(sqrt(n)))
  ------------------
  |  Branch (123:13): [True: 0, False: 0]
  ------------------
  124|  17.2k|}
lib.c:init_internal:
   53|      1|static COLD void init_internal(void) {
   54|      1|    dav1d_init_cpu();
   55|      1|    dav1d_init_ii_wedge_masks();
   56|      1|    dav1d_init_intra_edge_tree();
   57|      1|    dav1d_init_qm_tables();
   58|      1|    dav1d_init_thread();
  ------------------
  |  |  144|      1|#define dav1d_init_thread() do {} while (0)
  |  |  ------------------
  |  |  |  Branch (144:42): [Folded, False: 1]
  |  |  ------------------
  ------------------
   59|      1|}
lib.c:get_stack_size_internal:
   92|  17.2k|static COLD size_t get_stack_size_internal(const pthread_attr_t *const thread_attr) {
   93|  17.2k|#if defined(__linux__) && HAVE_DLSYM && defined(__GLIBC__)
   94|       |    /* glibc has an issue where the size of the TLS is subtracted from the stack
   95|       |     * size instead of allocated separately. As a result the specified stack
   96|       |     * size may be insufficient when used in an application with large amounts
   97|       |     * of TLS data. The following is a workaround to compensate for that.
   98|       |     * See https://sourceware.org/bugzilla/show_bug.cgi?id=11787 */
   99|  17.2k|    size_t (*const get_minstack)(const pthread_attr_t*) =
  100|  17.2k|        dlsym(RTLD_DEFAULT, "__pthread_get_minstack");
  101|  17.2k|    if (get_minstack)
  ------------------
  |  Branch (101:9): [True: 17.2k, False: 0]
  ------------------
  102|  17.2k|        return get_minstack(thread_attr) - PTHREAD_STACK_MIN;
  103|      0|#endif
  104|      0|    return 0;
  105|  17.2k|}
lib.c:gen_picture:
  411|  40.1k|{
  412|  40.1k|    Dav1dData *const in = &c->in;
  413|       |
  414|  40.1k|    if (output_picture_ready(c, 0))
  ------------------
  |  Branch (414:9): [True: 8.33k, False: 31.7k]
  ------------------
  415|  8.33k|        return 0;
  416|       |
  417|  72.1k|    while (in->sz > 0) {
  ------------------
  |  Branch (417:12): [True: 61.4k, False: 10.6k]
  ------------------
  418|  61.4k|        const ptrdiff_t res = dav1d_parse_obus(c, in);
  419|  61.4k|        if (res < 0) {
  ------------------
  |  Branch (419:13): [True: 12.3k, False: 49.1k]
  ------------------
  420|  12.3k|            dav1d_data_unref_internal(in);
  421|  49.1k|        } else {
  422|  49.1k|            assert((size_t)res <= in->sz);
  ------------------
  |  |  140|  49.1k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 49.1k]
  |  |  |  Branch (140:68): [Folded, False: 49.1k]
  |  |  ------------------
  ------------------
  423|  49.1k|            in->sz -= res;
  424|  49.1k|            in->data += res;
  425|  49.1k|            if (!in->sz) dav1d_data_unref_internal(in);
  ------------------
  |  Branch (425:17): [True: 8.56k, False: 40.5k]
  ------------------
  426|  49.1k|        }
  427|  61.4k|        if (output_picture_ready(c, 0))
  ------------------
  |  Branch (427:13): [True: 8.78k, False: 52.6k]
  ------------------
  428|  8.78k|            break;
  429|  52.6k|        if (res < 0)
  ------------------
  |  Branch (429:13): [True: 12.3k, False: 40.3k]
  ------------------
  430|  12.3k|            return (int)res;
  431|  52.6k|    }
  432|       |
  433|  19.4k|    return 0;
  434|  31.7k|}
lib.c:output_picture_ready:
  330|   119k|static int output_picture_ready(Dav1dContext *const c, const int drain) {
  331|   119k|    if (c->cached_error) return 1;
  ------------------
  |  Branch (331:9): [True: 0, False: 119k]
  ------------------
  332|   119k|    if (!c->all_layers && c->max_spatial_id) {
  ------------------
  |  Branch (332:9): [True: 96.1k, False: 23.2k]
  |  Branch (332:27): [True: 6.40k, False: 89.7k]
  ------------------
  333|  6.40k|        if (c->out.p.data[0] && c->cache.p.data[0]) {
  ------------------
  |  Branch (333:13): [True: 1.28k, False: 5.12k]
  |  Branch (333:33): [True: 358, False: 927]
  ------------------
  334|    358|            if (c->max_spatial_id == c->cache.p.frame_hdr->spatial_id ||
  ------------------
  |  Branch (334:17): [True: 0, False: 358]
  ------------------
  335|    358|                c->out.flags & PICTURE_FLAG_NEW_TEMPORAL_UNIT)
  ------------------
  |  Branch (335:17): [True: 199, False: 159]
  ------------------
  336|    199|                return 1;
  337|    159|            dav1d_thread_picture_unref(&c->cache);
  338|    159|            dav1d_thread_picture_move_ref(&c->cache, &c->out);
  339|    159|            return 0;
  340|  6.04k|        } else if (c->cache.p.data[0] && drain) {
  ------------------
  |  Branch (340:20): [True: 1.72k, False: 4.32k]
  |  Branch (340:42): [True: 429, False: 1.29k]
  ------------------
  341|    429|            return 1;
  342|  5.61k|        } else if (c->out.p.data[0]) {
  ------------------
  |  Branch (342:20): [True: 927, False: 4.69k]
  ------------------
  343|    927|            dav1d_thread_picture_move_ref(&c->cache, &c->out);
  344|    927|            return 0;
  345|    927|        }
  346|  6.40k|    }
  347|       |
  348|   117k|    return !!c->out.p.data[0];
  349|   119k|}
lib.c:output_image:
  310|  9.21k|{
  311|  9.21k|    int res = 0;
  312|       |
  313|  9.21k|    Dav1dThreadPicture *const in = (c->all_layers || !c->max_spatial_id)
  ------------------
  |  Branch (313:37): [True: 2.40k, False: 6.81k]
  |  Branch (313:54): [True: 6.31k, False: 497]
  ------------------
  314|  9.21k|                                   ? &c->out : &c->cache;
  315|  9.21k|    if (!c->apply_grain || !has_grain(&in->p)) {
  ------------------
  |  Branch (315:9): [True: 0, False: 9.21k]
  |  Branch (315:28): [True: 8.91k, False: 297]
  ------------------
  316|  8.91k|        dav1d_picture_move_ref(out, &in->p);
  317|  8.91k|        dav1d_thread_picture_unref(in);
  318|  8.91k|        goto end;
  319|  8.91k|    }
  320|       |
  321|    297|    res = dav1d_apply_grain(c, out, &in->p);
  322|    297|    dav1d_thread_picture_unref(in);
  323|  9.21k|end:
  324|  9.21k|    if (!c->all_layers && c->max_spatial_id && c->out.p.data[0]) {
  ------------------
  |  Branch (324:9): [True: 6.81k, False: 2.40k]
  |  Branch (324:27): [True: 497, False: 6.31k]
  |  Branch (324:48): [True: 68, False: 429]
  ------------------
  325|     68|        dav1d_thread_picture_move_ref(in, &c->out);
  326|     68|    }
  327|  9.21k|    return res;
  328|    297|}
lib.c:has_grain:
  302|  9.51k|{
  303|  9.51k|    const Dav1dFilmGrainData *fgdata = &pic->frame_hdr->film_grain.data;
  304|  9.51k|    return fgdata->num_y_points || fgdata->num_uv_points[0] ||
  ------------------
  |  Branch (304:12): [True: 438, False: 9.07k]
  |  Branch (304:36): [True: 8, False: 9.06k]
  ------------------
  305|  9.06k|           fgdata->num_uv_points[1] || (fgdata->clip_to_restricted_range &&
  ------------------
  |  Branch (305:12): [True: 8, False: 9.05k]
  |  Branch (305:41): [True: 148, False: 8.91k]
  ------------------
  306|    148|                                        fgdata->chroma_scaling_from_luma);
  ------------------
  |  Branch (306:41): [True: 140, False: 8]
  ------------------
  307|  9.51k|}
lib.c:close_internal:
  608|  17.2k|static COLD void close_internal(Dav1dContext **const c_out, int flush) {
  609|  17.2k|    Dav1dContext *const c = *c_out;
  610|  17.2k|    if (!c) return;
  ------------------
  |  Branch (610:9): [True: 0, False: 17.2k]
  ------------------
  611|       |
  612|  17.2k|    if (flush) dav1d_flush(c);
  ------------------
  |  Branch (612:9): [True: 17.2k, False: 0]
  ------------------
  613|       |
  614|  17.2k|    if (c->tc) {
  ------------------
  |  Branch (614:9): [True: 17.2k, False: 0]
  ------------------
  615|  17.2k|        struct TaskThreadData *ttd = &c->task_thread;
  616|  17.2k|        if (ttd->inited) {
  ------------------
  |  Branch (616:13): [True: 16.3k, False: 912]
  ------------------
  617|  16.3k|            pthread_mutex_lock(&ttd->lock);
  618|   551k|            for (unsigned n = 0; n < c->n_tc && c->tc[n].task_thread.td.inited; n++)
  ------------------
  |  Branch (618:34): [True: 535k, False: 16.3k]
  |  Branch (618:49): [True: 535k, False: 0]
  ------------------
  619|   535k|                c->tc[n].task_thread.die = 1;
  620|  16.3k|            pthread_cond_broadcast(&ttd->cond);
  621|  16.3k|            pthread_mutex_unlock(&ttd->lock);
  622|   551k|            for (unsigned n = 0; n < c->n_tc; n++) {
  ------------------
  |  Branch (622:34): [True: 535k, False: 16.3k]
  ------------------
  623|   535k|                Dav1dTaskContext *const pf = &c->tc[n];
  624|   535k|                if (!pf->task_thread.td.inited) break;
  ------------------
  |  Branch (624:21): [True: 0, False: 535k]
  ------------------
  625|   535k|                pthread_join(pf->task_thread.td.thread, NULL);
  626|   535k|                pthread_cond_destroy(&pf->task_thread.td.cond);
  627|   535k|                pthread_mutex_destroy(&pf->task_thread.td.lock);
  628|   535k|            }
  629|  16.3k|            pthread_cond_destroy(&ttd->delayed_fg.cond);
  630|  16.3k|            pthread_cond_destroy(&ttd->cond);
  631|  16.3k|            pthread_mutex_destroy(&ttd->lock);
  632|  16.3k|        }
  633|  17.2k|        dav1d_free_aligned(c->tc);
  ------------------
  |  |  136|  17.2k|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
  634|  17.2k|    }
  635|       |
  636|  34.4k|    for (unsigned n = 0; c->fc && n < c->n_fc; n++) {
  ------------------
  |  Branch (636:26): [True: 34.4k, False: 0]
  |  Branch (636:35): [True: 17.2k, False: 17.2k]
  ------------------
  637|  17.2k|        Dav1dFrameContext *const f = &c->fc[n];
  638|       |
  639|       |        // clean-up threading stuff
  640|  17.2k|        if (c->n_fc > 1) {
  ------------------
  |  Branch (640:13): [True: 0, False: 17.2k]
  ------------------
  641|      0|            dav1d_free(f->tile_thread.lowest_pixel_mem);
  ------------------
  |  |  135|      0|#define dav1d_free(ptr) free(ptr)
  ------------------
  642|      0|            dav1d_free(f->frame_thread.b);
  ------------------
  |  |  135|      0|#define dav1d_free(ptr) free(ptr)
  ------------------
  643|      0|            dav1d_free_aligned(f->frame_thread.cbi);
  ------------------
  |  |  136|      0|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
  644|      0|            dav1d_free_aligned(f->frame_thread.pal_idx);
  ------------------
  |  |  136|      0|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
  645|      0|            dav1d_free_aligned(f->frame_thread.cf);
  ------------------
  |  |  136|      0|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
  646|      0|            dav1d_free(f->frame_thread.tile_start_off);
  ------------------
  |  |  135|      0|#define dav1d_free(ptr) free(ptr)
  ------------------
  647|      0|            dav1d_free_aligned(f->frame_thread.pal);
  ------------------
  |  |  136|      0|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
  648|      0|        }
  649|  17.2k|        if (c->n_tc > 1) {
  ------------------
  |  Branch (649:13): [True: 16.3k, False: 912]
  ------------------
  650|  16.3k|            pthread_mutex_destroy(&f->task_thread.pending_tasks.lock);
  651|  16.3k|            pthread_cond_destroy(&f->task_thread.cond);
  652|  16.3k|            pthread_mutex_destroy(&f->task_thread.lock);
  653|  16.3k|        }
  654|  17.2k|        dav1d_free(f->frame_thread.frame_progress);
  ------------------
  |  |  135|  17.2k|#define dav1d_free(ptr) free(ptr)
  ------------------
  655|  17.2k|        dav1d_free(f->task_thread.tasks);
  ------------------
  |  |  135|  17.2k|#define dav1d_free(ptr) free(ptr)
  ------------------
  656|  17.2k|        dav1d_free(f->task_thread.tile_tasks[0]);
  ------------------
  |  |  135|  17.2k|#define dav1d_free(ptr) free(ptr)
  ------------------
  657|  17.2k|        dav1d_free_aligned(f->ts);
  ------------------
  |  |  136|  17.2k|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
  658|  17.2k|        dav1d_free_aligned(f->ipred_edge[0]);
  ------------------
  |  |  136|  17.2k|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
  659|  17.2k|        dav1d_free(f->a);
  ------------------
  |  |  135|  17.2k|#define dav1d_free(ptr) free(ptr)
  ------------------
  660|  17.2k|        dav1d_free(f->tile);
  ------------------
  |  |  135|  17.2k|#define dav1d_free(ptr) free(ptr)
  ------------------
  661|  17.2k|        dav1d_free(f->lf.mask);
  ------------------
  |  |  135|  17.2k|#define dav1d_free(ptr) free(ptr)
  ------------------
  662|  17.2k|        dav1d_free(f->lf.level);
  ------------------
  |  |  135|  17.2k|#define dav1d_free(ptr) free(ptr)
  ------------------
  663|  17.2k|        dav1d_free(f->lf.lr_mask);
  ------------------
  |  |  135|  17.2k|#define dav1d_free(ptr) free(ptr)
  ------------------
  664|  17.2k|        dav1d_free(f->lf.tx_lpf_right_edge[0]);
  ------------------
  |  |  135|  17.2k|#define dav1d_free(ptr) free(ptr)
  ------------------
  665|  17.2k|        dav1d_free(f->lf.start_of_tile_row);
  ------------------
  |  |  135|  17.2k|#define dav1d_free(ptr) free(ptr)
  ------------------
  666|  17.2k|        dav1d_free_aligned(f->rf.r);
  ------------------
  |  |  136|  17.2k|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
  667|  17.2k|        dav1d_free_aligned(f->lf.cdef_line_buf);
  ------------------
  |  |  136|  17.2k|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
  668|  17.2k|        dav1d_free_aligned(f->lf.lr_line_buf);
  ------------------
  |  |  136|  17.2k|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
  669|  17.2k|    }
  670|  17.2k|    dav1d_free_aligned(c->fc);
  ------------------
  |  |  136|  17.2k|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
  671|  17.2k|    if (c->n_fc > 1 && c->frame_thread.out_delayed) {
  ------------------
  |  Branch (671:9): [True: 0, False: 17.2k]
  |  Branch (671:24): [True: 0, False: 0]
  ------------------
  672|      0|        for (unsigned n = 0; n < c->n_fc; n++)
  ------------------
  |  Branch (672:30): [True: 0, False: 0]
  ------------------
  673|      0|            if (c->frame_thread.out_delayed[n].p.frame_hdr)
  ------------------
  |  Branch (673:17): [True: 0, False: 0]
  ------------------
  674|      0|                dav1d_thread_picture_unref(&c->frame_thread.out_delayed[n]);
  675|      0|        dav1d_free(c->frame_thread.out_delayed);
  ------------------
  |  |  135|      0|#define dav1d_free(ptr) free(ptr)
  ------------------
  676|      0|    }
  677|  17.2k|    for (int n = 0; n < c->n_tile_data; n++)
  ------------------
  |  Branch (677:21): [True: 37, False: 17.2k]
  ------------------
  678|     37|        dav1d_data_unref_internal(&c->tile[n].data);
  679|  17.2k|    dav1d_free(c->tile);
  ------------------
  |  |  135|  17.2k|#define dav1d_free(ptr) free(ptr)
  ------------------
  680|   155k|    for (int n = 0; n < 8; n++) {
  ------------------
  |  Branch (680:21): [True: 137k, False: 17.2k]
  ------------------
  681|   137k|        dav1d_cdf_thread_unref(&c->cdf[n]);
  682|   137k|        if (c->refs[n].p.p.frame_hdr)
  ------------------
  |  Branch (682:13): [True: 0, False: 137k]
  ------------------
  683|      0|            dav1d_thread_picture_unref(&c->refs[n].p);
  684|   137k|        dav1d_ref_dec(&c->refs[n].refmvs);
  685|   137k|        dav1d_ref_dec(&c->refs[n].segmap);
  686|   137k|    }
  687|  17.2k|    dav1d_ref_dec(&c->seq_hdr_ref);
  688|  17.2k|    dav1d_ref_dec(&c->frame_hdr_ref);
  689|       |
  690|  17.2k|    dav1d_ref_dec(&c->mastering_display_ref);
  691|  17.2k|    dav1d_ref_dec(&c->content_light_ref);
  692|  17.2k|    dav1d_ref_dec(&c->itut_t35_ref);
  693|       |
  694|  17.2k|    dav1d_mem_pool_end(c->seq_hdr_pool);
  695|  17.2k|    dav1d_mem_pool_end(c->frame_hdr_pool);
  696|  17.2k|    dav1d_mem_pool_end(c->segmap_pool);
  697|  17.2k|    dav1d_mem_pool_end(c->refmvs_pool);
  698|  17.2k|    dav1d_mem_pool_end(c->cdf_pool);
  699|  17.2k|    dav1d_mem_pool_end(c->picture_pool);
  700|  17.2k|    dav1d_mem_pool_end(c->pic_ctx_pool);
  701|       |
  702|  17.2k|    dav1d_freep_aligned(c_out);
  703|  17.2k|}

dav1d_log:
   46|  8.13k|COLD void dav1d_log(Dav1dContext *const c, const char *const format, ...) {
   47|  8.13k|    assert(c != NULL);
  ------------------
  |  |  140|  8.13k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 8.13k]
  |  |  |  Branch (140:68): [Folded, False: 8.13k]
  |  |  ------------------
  ------------------
   48|       |
   49|  8.13k|    if (!c->logger.callback)
  ------------------
  |  Branch (49:9): [True: 0, False: 8.13k]
  ------------------
   50|      0|        return;
   51|       |
   52|  8.13k|    va_list ap;
   53|  8.13k|    va_start(ap, format);
   54|  8.13k|    c->logger.callback(c->logger.cookie, format, ap);
   55|       |    va_end(ap);
   56|  8.13k|}

dav1d_loop_filter_dsp_init_8bpc:
  259|  7.82k|COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
  260|  7.82k|    c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;
  261|  7.82k|    c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;
  262|  7.82k|    c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
  263|  7.82k|    c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
  264|       |
  265|  7.82k|#if HAVE_ASM
  266|       |#if ARCH_AARCH64 || ARCH_ARM
  267|       |    loop_filter_dsp_init_arm(c);
  268|       |#elif ARCH_LOONGARCH64
  269|       |    loop_filter_dsp_init_loongarch(c);
  270|       |#elif ARCH_PPC64LE
  271|       |    loop_filter_dsp_init_ppc(c);
  272|       |#elif ARCH_X86
  273|       |    loop_filter_dsp_init_x86(c);
  274|  7.82k|#endif
  275|  7.82k|#endif
  276|  7.82k|}
dav1d_loop_filter_dsp_init_16bpc:
  259|  7.63k|COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
  260|  7.63k|    c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;
  261|  7.63k|    c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;
  262|  7.63k|    c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
  263|  7.63k|    c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
  264|       |
  265|  7.63k|#if HAVE_ASM
  266|       |#if ARCH_AARCH64 || ARCH_ARM
  267|       |    loop_filter_dsp_init_arm(c);
  268|       |#elif ARCH_LOONGARCH64
  269|       |    loop_filter_dsp_init_loongarch(c);
  270|       |#elif ARCH_PPC64LE
  271|       |    loop_filter_dsp_init_ppc(c);
  272|       |#elif ARCH_X86
  273|       |    loop_filter_dsp_init_x86(c);
  274|  7.63k|#endif
  275|  7.63k|#endif
  276|  7.63k|}

dav1d_loop_restoration_dsp_init_8bpc:
 1367|  7.82k|{
 1368|  7.82k|    c->wiener[0] = c->wiener[1] = wiener_c;
 1369|  7.82k|    c->sgr[0] = sgr_5x5_c;
 1370|  7.82k|    c->sgr[1] = sgr_3x3_c;
 1371|  7.82k|    c->sgr[2] = sgr_mix_c;
 1372|       |
 1373|  7.82k|#if HAVE_ASM
 1374|       |#if ARCH_AARCH64 || ARCH_ARM
 1375|       |    loop_restoration_dsp_init_arm(c, bpc);
 1376|       |#elif ARCH_LOONGARCH64
 1377|       |    loop_restoration_dsp_init_loongarch(c, bpc);
 1378|       |#elif ARCH_PPC64LE
 1379|       |    loop_restoration_dsp_init_ppc(c, bpc);
 1380|       |#elif ARCH_X86
 1381|       |    loop_restoration_dsp_init_x86(c, bpc);
 1382|  7.82k|#endif
 1383|  7.82k|#endif
 1384|  7.82k|}
looprestoration_tmpl.c:sgr_5x5_c:
  830|  6.38k|{
  831|  6.38k|    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,);
  ------------------
  |  |  100|  6.38k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  6.38k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  832|  6.38k|    ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 5 + 16,);
  ------------------
  |  |  100|  6.38k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  6.38k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  833|  6.38k|    int32_t *sumsq_ptrs[5], *sumsq_rows[5];
  834|  6.38k|    coef *sum_ptrs[5], *sum_rows[5];
  835|  38.2k|    for (int i = 0; i < 5; i++) {
  ------------------
  |  Branch (835:21): [True: 31.9k, False: 6.38k]
  ------------------
  836|  31.9k|        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
  ------------------
  |  |  685|  31.9k|#define BUF_STRIDE (384 + 16)
  ------------------
  837|  31.9k|        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
  ------------------
  |  |  685|  31.9k|#define BUF_STRIDE (384 + 16)
  ------------------
  838|  31.9k|    }
  839|       |
  840|  6.38k|    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,);
  ------------------
  |  |  100|  6.38k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  6.38k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  841|  6.38k|    ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 2 + 16,);
  ------------------
  |  |  100|  6.38k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  6.38k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  842|  6.38k|    int32_t *A_ptrs[2];
  843|  6.38k|    coef *B_ptrs[2];
  844|  19.1k|    for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (844:21): [True: 12.7k, False: 6.38k]
  ------------------
  845|  12.7k|        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
  ------------------
  |  |  685|  12.7k|#define BUF_STRIDE (384 + 16)
  ------------------
  846|  12.7k|        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
  ------------------
  |  |  685|  12.7k|#define BUF_STRIDE (384 + 16)
  ------------------
  847|  12.7k|    }
  848|  6.38k|    const pixel *src = dst;
  849|  6.38k|    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
  ------------------
  |  |   53|  6.38k|#define PXSTRIDE(x) (x)
  ------------------
  850|       |
  851|  6.38k|    if (edges & LR_HAVE_TOP) {
  ------------------
  |  Branch (851:9): [True: 6.04k, False: 341]
  ------------------
  852|  6.04k|        sumsq_ptrs[0] = sumsq_rows[0];
  853|  6.04k|        sumsq_ptrs[1] = sumsq_rows[0];
  854|  6.04k|        sumsq_ptrs[2] = sumsq_rows[1];
  855|  6.04k|        sumsq_ptrs[3] = sumsq_rows[2];
  856|  6.04k|        sumsq_ptrs[4] = sumsq_rows[3];
  857|  6.04k|        sum_ptrs[0] = sum_rows[0];
  858|  6.04k|        sum_ptrs[1] = sum_rows[0];
  859|  6.04k|        sum_ptrs[2] = sum_rows[1];
  860|  6.04k|        sum_ptrs[3] = sum_rows[2];
  861|  6.04k|        sum_ptrs[4] = sum_rows[3];
  862|       |
  863|  6.04k|        sgr_box5_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges);
  864|  6.04k|        lpf += PXSTRIDE(stride);
  ------------------
  |  |   53|  6.04k|#define PXSTRIDE(x) (x)
  ------------------
  865|  6.04k|        sgr_box5_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges);
  866|       |
  867|  6.04k|        sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges);
  868|  6.04k|        left++;
  869|  6.04k|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|  6.04k|#define PXSTRIDE(x) (x)
  ------------------
  870|       |
  871|  6.04k|        if (--h <= 0)
  ------------------
  |  Branch (871:13): [True: 12, False: 6.02k]
  ------------------
  872|     12|            goto vert_1;
  873|       |
  874|  6.02k|        sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges);
  875|  6.02k|        left++;
  876|  6.02k|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|  6.02k|#define PXSTRIDE(x) (x)
  ------------------
  877|  6.02k|        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
  878|  6.02k|                      w, params->sgr.s0, BITDEPTH_MAX);
  ------------------
  |  |   59|  6.02k|#define BITDEPTH_MAX 0xff
  ------------------
  879|  6.02k|        rotate(A_ptrs, B_ptrs, 2);
  880|       |
  881|  6.02k|        if (--h <= 0)
  ------------------
  |  Branch (881:13): [True: 8, False: 6.02k]
  ------------------
  882|      8|            goto vert_2;
  883|       |
  884|       |        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
  885|       |        // one of them to point at the previously unused rows[4].
  886|  6.02k|        sumsq_ptrs[3] = sumsq_rows[4];
  887|  6.02k|        sum_ptrs[3] = sum_rows[4];
  888|  6.02k|    } else {
  889|    341|        sumsq_ptrs[0] = sumsq_rows[0];
  890|    341|        sumsq_ptrs[1] = sumsq_rows[0];
  891|    341|        sumsq_ptrs[2] = sumsq_rows[0];
  892|    341|        sumsq_ptrs[3] = sumsq_rows[0];
  893|    341|        sumsq_ptrs[4] = sumsq_rows[0];
  894|    341|        sum_ptrs[0] = sum_rows[0];
  895|    341|        sum_ptrs[1] = sum_rows[0];
  896|    341|        sum_ptrs[2] = sum_rows[0];
  897|    341|        sum_ptrs[3] = sum_rows[0];
  898|    341|        sum_ptrs[4] = sum_rows[0];
  899|       |
  900|    341|        sgr_box5_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges);
  901|    341|        left++;
  902|    341|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|    341|#define PXSTRIDE(x) (x)
  ------------------
  903|       |
  904|    341|        if (--h <= 0)
  ------------------
  |  Branch (904:13): [True: 18, False: 323]
  ------------------
  905|     18|            goto vert_1;
  906|       |
  907|    323|        sumsq_ptrs[4] = sumsq_rows[1];
  908|    323|        sum_ptrs[4] = sum_rows[1];
  909|       |
  910|    323|        sgr_box5_row_h(sumsq_rows[1], sum_rows[1], left, src, w, edges);
  911|    323|        left++;
  912|    323|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|    323|#define PXSTRIDE(x) (x)
  ------------------
  913|       |
  914|    323|        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
  915|    323|                      w, params->sgr.s0, BITDEPTH_MAX);
  ------------------
  |  |   59|    323|#define BITDEPTH_MAX 0xff
  ------------------
  916|    323|        rotate(A_ptrs, B_ptrs, 2);
  917|       |
  918|    323|        if (--h <= 0)
  ------------------
  |  Branch (918:13): [True: 18, False: 305]
  ------------------
  919|     18|            goto vert_2;
  920|       |
  921|    305|        sumsq_ptrs[3] = sumsq_rows[2];
  922|    305|        sumsq_ptrs[4] = sumsq_rows[3];
  923|    305|        sum_ptrs[3] = sum_rows[2];
  924|    305|        sum_ptrs[4] = sum_rows[3];
  925|       |
  926|    305|        sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges);
  927|    305|        left++;
  928|    305|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|    305|#define PXSTRIDE(x) (x)
  ------------------
  929|       |
  930|    305|        if (--h <= 0)
  ------------------
  |  Branch (930:13): [True: 2, False: 303]
  ------------------
  931|      2|            goto odd;
  932|       |
  933|    303|        sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges);
  934|    303|        left++;
  935|    303|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|    303|#define PXSTRIDE(x) (x)
  ------------------
  936|       |
  937|    303|        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
  938|    303|                      w, params->sgr.s0, BITDEPTH_MAX);
  ------------------
  |  |   59|    303|#define BITDEPTH_MAX 0xff
  ------------------
  939|    303|        sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
  940|    303|                    w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
  941|       |
  942|    303|        if (--h <= 0)
  ------------------
  |  Branch (942:13): [True: 0, False: 303]
  ------------------
  943|      0|            goto vert_2;
  944|       |
  945|       |        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
  946|       |        // one of them to point at the previously unused rows[4].
  947|    303|        sumsq_ptrs[3] = sumsq_rows[4];
  948|    303|        sum_ptrs[3] = sum_rows[4];
  949|    303|    }
  950|       |
  951|   168k|    do {
  952|   168k|        sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], left, src, w, edges);
  953|   168k|        left++;
  954|   168k|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|   168k|#define PXSTRIDE(x) (x)
  ------------------
  955|       |
  956|   168k|        if (--h <= 0)
  ------------------
  |  Branch (956:13): [True: 88, False: 168k]
  ------------------
  957|     88|            goto odd;
  958|       |
  959|   168k|        sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], left, src, w, edges);
  960|   168k|        left++;
  961|   168k|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|   168k|#define PXSTRIDE(x) (x)
  ------------------
  962|       |
  963|   168k|        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
  964|   168k|                      w, params->sgr.s0, BITDEPTH_MAX);
  ------------------
  |  |   59|   168k|#define BITDEPTH_MAX 0xff
  ------------------
  965|   168k|        sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
  966|   168k|                    w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
  967|   168k|    } while (--h > 0);
  ------------------
  |  Branch (967:14): [True: 161k, False: 6.23k]
  ------------------
  968|       |
  969|  6.23k|    if (!(edges & LR_HAVE_BOTTOM))
  ------------------
  |  Branch (969:9): [True: 202, False: 6.03k]
  ------------------
  970|    202|        goto vert_2;
  971|       |
  972|  6.03k|    sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], NULL, lpf_bottom, w, edges);
  973|  6.03k|    lpf_bottom += PXSTRIDE(stride);
  ------------------
  |  |   53|  6.03k|#define PXSTRIDE(x) (x)
  ------------------
  974|  6.03k|    sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], NULL, lpf_bottom, w, edges);
  975|       |
  976|  6.26k|output_2:
  977|  6.26k|    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
  978|  6.26k|                  w, params->sgr.s0, BITDEPTH_MAX);
  ------------------
  |  |   59|  6.26k|#define BITDEPTH_MAX 0xff
  ------------------
  979|  6.26k|    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
  980|  6.26k|                w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
  981|  6.26k|    return;
  982|       |
  983|    228|vert_2:
  984|       |    // Duplicate the last row twice more
  985|    228|    sumsq_ptrs[3] = sumsq_ptrs[2];
  986|    228|    sumsq_ptrs[4] = sumsq_ptrs[2];
  987|    228|    sum_ptrs[3] = sum_ptrs[2];
  988|    228|    sum_ptrs[4] = sum_ptrs[2];
  989|    228|    goto output_2;
  990|       |
  991|     90|odd:
  992|       |    // Copy the last row as padding once
  993|     90|    sumsq_ptrs[4] = sumsq_ptrs[3];
  994|     90|    sum_ptrs[4] = sum_ptrs[3];
  995|       |
  996|     90|    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
  997|     90|                  w, params->sgr.s0, BITDEPTH_MAX);
  ------------------
  |  |   59|     90|#define BITDEPTH_MAX 0xff
  ------------------
  998|     90|    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
  999|     90|                w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
 1000|       |
 1001|    120|output_1:
 1002|       |    // Duplicate the last row twice more
 1003|    120|    sumsq_ptrs[3] = sumsq_ptrs[2];
 1004|    120|    sumsq_ptrs[4] = sumsq_ptrs[2];
 1005|    120|    sum_ptrs[3] = sum_ptrs[2];
 1006|    120|    sum_ptrs[4] = sum_ptrs[2];
 1007|       |
 1008|    120|    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
 1009|    120|                  w, params->sgr.s0, BITDEPTH_MAX);
  ------------------
  |  |   59|    120|#define BITDEPTH_MAX 0xff
  ------------------
 1010|       |    // Output only one row
 1011|    120|    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
 1012|    120|                w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
 1013|    120|    return;
 1014|       |
 1015|     30|vert_1:
 1016|       |    // Copy the last row as padding once
 1017|     30|    sumsq_ptrs[4] = sumsq_ptrs[3];
 1018|     30|    sum_ptrs[4] = sum_ptrs[3];
 1019|       |
 1020|     30|    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
 1021|     30|                  w, params->sgr.s0, BITDEPTH_MAX);
  ------------------
  |  |   59|     30|#define BITDEPTH_MAX 0xff
  ------------------
 1022|     30|    rotate(A_ptrs, B_ptrs, 2);
 1023|       |
 1024|     30|    goto output_1;
 1025|     90|}
looprestoration_tmpl.c:sgr_box5_row_h:
  441|  2.05M|{
  442|  2.05M|    sumsq++;
  443|  2.05M|    sum++;
  444|  2.05M|    int a = edges & LR_HAVE_LEFT ? (left ? left[0][1] : src[-3]) : src[0];
  ------------------
  |  Branch (444:13): [True: 24.8k, False: 2.03M]
  |  Branch (444:37): [True: 23.7k, False: 1.08k]
  ------------------
  445|  2.05M|    int b = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0];
  ------------------
  |  Branch (445:13): [True: 24.8k, False: 2.03M]
  |  Branch (445:37): [True: 23.7k, False: 1.08k]
  ------------------
  446|  2.05M|    int c = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0];
  ------------------
  |  Branch (446:13): [True: 24.8k, False: 2.03M]
  |  Branch (446:37): [True: 23.7k, False: 1.08k]
  ------------------
  447|  2.05M|    int d = src[0];
  448|  23.9M|    for (int x = -1; x < w + 1; x++) {
  ------------------
  |  Branch (448:22): [True: 21.9M, False: 2.05M]
  ------------------
  449|  21.9M|        int e = (x + 2 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 2] : src[w - 1];
  ------------------
  |  Branch (449:18): [True: 15.7M, False: 6.18M]
  |  Branch (449:31): [True: 85.6k, False: 6.09M]
  ------------------
  450|  21.9M|        sum[x] = a + b + c + d + e;
  451|  21.9M|        sumsq[x] = a * a + b * b + c * c + d * d + e * e;
  452|  21.9M|        a = b;
  453|  21.9M|        b = c;
  454|  21.9M|        c = d;
  455|  21.9M|        d = e;
  456|  21.9M|    }
  457|  2.05M|}
looprestoration_tmpl.c:sgr_box5_vert:
  537|  1.00M|{
  538|  1.00M|    sgr_box5_row_v(sumsq, sum, sumsq_out, sum_out, w);
  539|  1.00M|    sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 25, 164);
  540|  1.00M|    rotate5_x2(sumsq, sum);
  541|  1.00M|}
looprestoration_tmpl.c:sgr_box5_row_v:
  488|  1.00M|{
  489|  11.7M|    for (int x = 0; x < w + 2; x++) {
  ------------------
  |  Branch (489:21): [True: 10.7M, False: 1.00M]
  ------------------
  490|  10.7M|        int sq_a = sumsq[0][x];
  491|  10.7M|        int sq_b = sumsq[1][x];
  492|  10.7M|        int sq_c = sumsq[2][x];
  493|  10.7M|        int sq_d = sumsq[3][x];
  494|  10.7M|        int sq_e = sumsq[4][x];
  495|  10.7M|        int s_a = sum[0][x];
  496|  10.7M|        int s_b = sum[1][x];
  497|  10.7M|        int s_c = sum[2][x];
  498|  10.7M|        int s_d = sum[3][x];
  499|  10.7M|        int s_e = sum[4][x];
  500|  10.7M|        sumsq_out[x] = sq_a + sq_b + sq_c + sq_d + sq_e;
  501|  10.7M|        sum_out[x] = s_a + s_b + s_c + s_d + s_e;
  502|  10.7M|    }
  503|  1.00M|}
looprestoration_tmpl.c:sgr_calc_row_ab:
  507|  3.29M|{
  508|  3.29M|    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
  ------------------
  |  |   58|  3.29M|#define bitdepth_from_max(x) 8
  ------------------
  509|  34.6M|    for (int i = 0; i < w + 2; i++) {
  ------------------
  |  Branch (509:21): [True: 31.3M, False: 3.29M]
  ------------------
  510|  31.3M|        const int a =
  511|  31.3M|            (AA[i] + ((1 << (2 * bitdepth_min_8)) >> 1)) >> (2 * bitdepth_min_8);
  512|  31.3M|        const int b =
  513|  31.3M|            (BB[i] + ((1 << bitdepth_min_8) >> 1)) >> bitdepth_min_8;
  514|       |
  515|  31.3M|        const unsigned p = imax(a * n - b * b, 0);
  516|  31.3M|        const unsigned z = (p * s + (1 << 19)) >> 20;
  517|  31.3M|        const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];
  518|       |
  519|       |        // This is where we invert A and B, so that B is of size coef.
  520|  31.3M|        AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
  521|  31.3M|        BB[i] = x;
  522|  31.3M|    }
  523|  3.29M|}
looprestoration_tmpl.c:rotate5_x2:
  402|   986k|{
  403|   986k|    int32_t *tmp32[2];
  404|   986k|    coef *tmpc[2];
  405|  2.95M|    for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (405:21): [True: 1.97M, False: 986k]
  ------------------
  406|  1.97M|        tmp32[i] = sumsq_ptrs[i];
  407|  1.97M|        tmpc[i] = sum_ptrs[i];
  408|  1.97M|    }
  409|  3.93M|    for (int i = 0; i < 3; i++) {
  ------------------
  |  Branch (409:21): [True: 2.94M, False: 986k]
  ------------------
  410|  2.94M|        sumsq_ptrs[i] = sumsq_ptrs[i + 2];
  411|  2.94M|        sum_ptrs[i] = sum_ptrs[i + 2];
  412|  2.94M|    }
  413|  2.95M|    for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (413:21): [True: 1.97M, False: 986k]
  ------------------
  414|  1.97M|        sumsq_ptrs[3 + i] = tmp32[i];
  415|  1.97M|        sum_ptrs[3 + i] = tmpc[i];
  416|  1.97M|    }
  417|   986k|}
looprestoration_tmpl.c:rotate:
  390|  5.37M|{
  391|  5.37M|    int32_t *tmp32 = sumsq_ptrs[0];
  392|  5.37M|    coef *tmpc = sum_ptrs[0];
  393|  16.7M|    for (int i = 0; i < n - 1; i++) {
  ------------------
  |  Branch (393:21): [True: 11.3M, False: 5.37M]
  ------------------
  394|  11.3M|        sumsq_ptrs[i] = sumsq_ptrs[i + 1];
  395|  11.3M|        sum_ptrs[i] = sum_ptrs[i + 1];
  396|  11.3M|    }
  397|  5.37M|    sumsq_ptrs[n - 1] = tmp32;
  398|  5.37M|    sum_ptrs[n - 1] = tmpc;
  399|  5.37M|}
looprestoration_tmpl.c:sgr_finish2:
  645|   175k|{
  646|   175k|    ALIGN_STK_16(coef, tmp, 2*FILTER_OUT_STRIDE,);
  ------------------
  |  |  100|   175k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|   175k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  647|       |
  648|   175k|    sgr_finish_filter2(tmp, *dst, stride, A_ptrs, B_ptrs, w, h);
  649|   175k|    sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
  650|   175k|    *dst += PXSTRIDE(stride);
  ------------------
  |  |   53|   175k|#define PXSTRIDE(x) (x)
  ------------------
  651|   175k|    if (h > 1) {
  ------------------
  |  Branch (651:9): [True: 174k, False: 390]
  ------------------
  652|   174k|        sgr_weighted_row1(*dst, tmp + FILTER_OUT_STRIDE, w, w1 HIGHBD_TAIL_SUFFIX);
  ------------------
  |  |  572|   174k|#define FILTER_OUT_STRIDE (384)
  ------------------
  653|   174k|        *dst += PXSTRIDE(stride);
  ------------------
  |  |   53|   174k|#define PXSTRIDE(x) (x)
  ------------------
  654|   174k|    }
  655|   175k|    rotate(A_ptrs, B_ptrs, 2);
  656|   175k|}
looprestoration_tmpl.c:sgr_finish_filter2:
  579|   959k|{
  580|   959k|#define SIX_NEIGHBORS(P, i)\
  581|   959k|    ((P[0][i]     + P[1][i]) * 6 +   \
  582|   959k|     (P[0][i - 1] + P[1][i - 1] +    \
  583|   959k|      P[0][i + 1] + P[1][i + 1]) * 5)
  584|  9.26M|    for (int i = 0; i < w; i++) {
  ------------------
  |  Branch (584:21): [True: 8.30M, False: 959k]
  ------------------
  585|  8.30M|        const int a = SIX_NEIGHBORS(B_ptrs, i + 1);
  ------------------
  |  |  581|  8.30M|    ((P[0][i]     + P[1][i]) * 6 +   \
  |  |  582|  8.30M|     (P[0][i - 1] + P[1][i - 1] +    \
  |  |  583|  8.30M|      P[0][i + 1] + P[1][i + 1]) * 5)
  ------------------
  586|  8.30M|        const int b = SIX_NEIGHBORS(A_ptrs, i + 1);
  ------------------
  |  |  581|  8.30M|    ((P[0][i]     + P[1][i]) * 6 +   \
  |  |  582|  8.30M|     (P[0][i - 1] + P[1][i - 1] +    \
  |  |  583|  8.30M|      P[0][i + 1] + P[1][i + 1]) * 5)
  ------------------
  587|  8.30M|        tmp[i] = (b - a * src[i] + (1 << 8)) >> 9;
  588|  8.30M|    }
  589|   959k|    if (h <= 1)
  ------------------
  |  Branch (589:9): [True: 494, False: 958k]
  ------------------
  590|    494|        return;
  591|   958k|    tmp += FILTER_OUT_STRIDE;
  ------------------
  |  |  572|   958k|#define FILTER_OUT_STRIDE (384)
  ------------------
  592|   958k|    src += PXSTRIDE(src_stride);
  ------------------
  |  |   53|   958k|#define PXSTRIDE(x) (x)
  ------------------
  593|   958k|    const int32_t *A = &A_ptrs[1][1];
  594|   958k|    const coef *B = &B_ptrs[1][1];
  595|  9.30M|    for (int i = 0; i < w; i++) {
  ------------------
  |  Branch (595:21): [True: 8.34M, False: 958k]
  ------------------
  596|  8.34M|        const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
  597|  8.34M|        const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
  598|  8.34M|        tmp[i] = (b - a * src[i] + (1 << 7)) >> 8;
  599|  8.34M|    }
  600|   958k|#undef SIX_NEIGHBORS
  601|   958k|}
looprestoration_tmpl.c:sgr_weighted_row1:
  605|   993k|{
  606|  9.76M|    for (int i = 0; i < w; i++) {
  ------------------
  |  Branch (606:21): [True: 8.76M, False: 993k]
  ------------------
  607|  8.76M|        const int v = w1 * t1[i];
  608|  8.76M|        dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10)) >> 11));
  ------------------
  |  |   49|  8.76M|#define iclip_pixel iclip_u8
  ------------------
  609|  8.76M|    }
  610|   993k|}
looprestoration_tmpl.c:sgr_3x3_c:
  684|  11.7k|{
  685|  11.7k|#define BUF_STRIDE (384 + 16)
  686|  11.7k|    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,);
  ------------------
  |  |  100|  11.7k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  11.7k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  687|  11.7k|    ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 3 + 16,);
  ------------------
  |  |  100|  11.7k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  11.7k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  688|  11.7k|    int32_t *sumsq_ptrs[3], *sumsq_rows[3];
  689|  11.7k|    coef *sum_ptrs[3], *sum_rows[3];
  690|  46.8k|    for (int i = 0; i < 3; i++) {
  ------------------
  |  Branch (690:21): [True: 35.1k, False: 11.7k]
  ------------------
  691|  35.1k|        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
  ------------------
  |  |  685|  35.1k|#define BUF_STRIDE (384 + 16)
  ------------------
  692|  35.1k|        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
  ------------------
  |  |  685|  35.1k|#define BUF_STRIDE (384 + 16)
  ------------------
  693|  35.1k|    }
  694|       |
  695|  11.7k|    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,);
  ------------------
  |  |  100|  11.7k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  11.7k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  696|  11.7k|    ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 3 + 16,);
  ------------------
  |  |  100|  11.7k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  11.7k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  697|  11.7k|    int32_t *A_ptrs[3];
  698|  11.7k|    coef *B_ptrs[3];
  699|  46.8k|    for (int i = 0; i < 3; i++) {
  ------------------
  |  Branch (699:21): [True: 35.1k, False: 11.7k]
  ------------------
  700|  35.1k|        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
  ------------------
  |  |  685|  35.1k|#define BUF_STRIDE (384 + 16)
  ------------------
  701|  35.1k|        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
  ------------------
  |  |  685|  35.1k|#define BUF_STRIDE (384 + 16)
  ------------------
  702|  35.1k|    }
  703|  11.7k|    const pixel *src = dst;
  704|  11.7k|    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
  ------------------
  |  |   53|  11.7k|#define PXSTRIDE(x) (x)
  ------------------
  705|       |
  706|  11.7k|    if (edges & LR_HAVE_TOP) {
  ------------------
  |  Branch (706:9): [True: 11.3k, False: 389]
  ------------------
  707|  11.3k|        sumsq_ptrs[0] = sumsq_rows[0];
  708|  11.3k|        sumsq_ptrs[1] = sumsq_rows[1];
  709|  11.3k|        sumsq_ptrs[2] = sumsq_rows[2];
  710|  11.3k|        sum_ptrs[0] = sum_rows[0];
  711|  11.3k|        sum_ptrs[1] = sum_rows[1];
  712|  11.3k|        sum_ptrs[2] = sum_rows[2];
  713|       |
  714|  11.3k|        sgr_box3_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges);
  715|  11.3k|        lpf += PXSTRIDE(stride);
  ------------------
  |  |   53|  11.3k|#define PXSTRIDE(x) (x)
  ------------------
  716|  11.3k|        sgr_box3_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges);
  717|       |
  718|  11.3k|        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
  719|  11.3k|                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
  ------------------
  |  |   59|  11.3k|#define BITDEPTH_MAX 0xff
  ------------------
  720|  11.3k|        left++;
  721|  11.3k|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|  11.3k|#define PXSTRIDE(x) (x)
  ------------------
  722|  11.3k|        rotate(A_ptrs, B_ptrs, 3);
  723|       |
  724|  11.3k|        if (--h <= 0)
  ------------------
  |  Branch (724:13): [True: 20, False: 11.2k]
  ------------------
  725|     20|            goto vert_1;
  726|       |
  727|  11.2k|        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
  728|  11.2k|                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
  ------------------
  |  |   59|  11.2k|#define BITDEPTH_MAX 0xff
  ------------------
  729|  11.2k|        left++;
  730|  11.2k|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|  11.2k|#define PXSTRIDE(x) (x)
  ------------------
  731|  11.2k|        rotate(A_ptrs, B_ptrs, 3);
  732|       |
  733|  11.2k|        if (--h <= 0)
  ------------------
  |  Branch (733:13): [True: 6, False: 11.2k]
  ------------------
  734|      6|            goto vert_2;
  735|  11.2k|    } else {
  736|    389|        sumsq_ptrs[0] = sumsq_rows[0];
  737|    389|        sumsq_ptrs[1] = sumsq_rows[0];
  738|    389|        sumsq_ptrs[2] = sumsq_rows[0];
  739|    389|        sum_ptrs[0] = sum_rows[0];
  740|    389|        sum_ptrs[1] = sum_rows[0];
  741|    389|        sum_ptrs[2] = sum_rows[0];
  742|       |
  743|    389|        sgr_box3_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges);
  744|    389|        left++;
  745|    389|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|    389|#define PXSTRIDE(x) (x)
  ------------------
  746|       |
  747|    389|        sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
  748|    389|                      w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|    389|#define BITDEPTH_MAX 0xff
  ------------------
  749|    389|        rotate(A_ptrs, B_ptrs, 3);
  750|       |
  751|    389|        if (--h <= 0)
  ------------------
  |  Branch (751:13): [True: 24, False: 365]
  ------------------
  752|     24|            goto vert_1;
  753|       |
  754|    365|        sumsq_ptrs[2] = sumsq_rows[1];
  755|    365|        sum_ptrs[2] = sum_rows[1];
  756|       |
  757|    365|        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
  758|    365|                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
  ------------------
  |  |   59|    365|#define BITDEPTH_MAX 0xff
  ------------------
  759|    365|        left++;
  760|    365|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|    365|#define PXSTRIDE(x) (x)
  ------------------
  761|    365|        rotate(A_ptrs, B_ptrs, 3);
  762|       |
  763|    365|        if (--h <= 0)
  ------------------
  |  Branch (763:13): [True: 34, False: 331]
  ------------------
  764|     34|            goto vert_2;
  765|       |
  766|    331|        sumsq_ptrs[2] = sumsq_rows[2];
  767|    331|        sum_ptrs[2] = sum_rows[2];
  768|    331|    }
  769|       |
  770|   624k|    do {
  771|   624k|        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
  772|   624k|                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
  ------------------
  |  |   59|   624k|#define BITDEPTH_MAX 0xff
  ------------------
  773|   624k|        left++;
  774|   624k|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|   624k|#define PXSTRIDE(x) (x)
  ------------------
  775|       |
  776|   624k|        sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
  777|   624k|                    w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
  778|   624k|    } while (--h > 0);
  ------------------
  |  Branch (778:14): [True: 612k, False: 11.6k]
  ------------------
  779|       |
  780|  11.6k|    if (!(edges & LR_HAVE_BOTTOM))
  ------------------
  |  Branch (780:9): [True: 200, False: 11.4k]
  ------------------
  781|    200|        goto vert_2;
  782|       |
  783|  11.4k|    sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
  784|  11.4k|                NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
  ------------------
  |  |   59|  11.4k|#define BITDEPTH_MAX 0xff
  ------------------
  785|  11.4k|    lpf_bottom += PXSTRIDE(stride);
  ------------------
  |  |   53|  11.4k|#define PXSTRIDE(x) (x)
  ------------------
  786|       |
  787|  11.4k|    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
  788|  11.4k|                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
  789|       |
  790|  11.4k|    sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
  791|  11.4k|                NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
  ------------------
  |  |   59|  11.4k|#define BITDEPTH_MAX 0xff
  ------------------
  792|       |
  793|  11.4k|    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
  794|  11.4k|                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
  795|  11.4k|    return;
  796|       |
  797|    240|vert_2:
  798|    240|    sumsq_ptrs[2] = sumsq_ptrs[1];
  799|    240|    sum_ptrs[2] = sum_ptrs[1];
  800|    240|    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
  801|    240|                  w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|    240|#define BITDEPTH_MAX 0xff
  ------------------
  802|       |
  803|    240|    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
  804|    240|                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
  805|       |
  806|    284|output_1:
  807|    284|    sumsq_ptrs[2] = sumsq_ptrs[1];
  808|    284|    sum_ptrs[2] = sum_ptrs[1];
  809|    284|    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
  810|    284|                  w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|    284|#define BITDEPTH_MAX 0xff
  ------------------
  811|       |
  812|    284|    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
  813|    284|                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
  814|    284|    return;
  815|       |
  816|     44|vert_1:
  817|     44|    sumsq_ptrs[2] = sumsq_ptrs[1];
  818|     44|    sum_ptrs[2] = sum_ptrs[1];
  819|     44|    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
  820|     44|                  w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|     44|#define BITDEPTH_MAX 0xff
  ------------------
  821|     44|    rotate(A_ptrs, B_ptrs, 3);
  822|     44|    goto output_1;
  823|    240|}
looprestoration_tmpl.c:sgr_box3_row_h:
  423|  2.33M|{
  424|  2.33M|    sumsq++;
  425|  2.33M|    sum++;
  426|  2.33M|    int a = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0];
  ------------------
  |  Branch (426:13): [True: 26.9k, False: 2.31M]
  |  Branch (426:37): [True: 25.6k, False: 1.24k]
  ------------------
  427|  2.33M|    int b = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0];
  ------------------
  |  Branch (427:13): [True: 26.9k, False: 2.31M]
  |  Branch (427:37): [True: 25.6k, False: 1.24k]
  ------------------
  428|  26.7M|    for (int x = -1; x < w + 1; x++) {
  ------------------
  |  Branch (428:22): [True: 24.4M, False: 2.33M]
  ------------------
  429|  24.4M|        int c = (x + 1 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 1] : src[w - 1];
  ------------------
  |  Branch (429:18): [True: 19.7M, False: 4.73M]
  |  Branch (429:31): [True: 66.5k, False: 4.67M]
  ------------------
  430|  24.4M|        sum[x] = a + b + c;
  431|  24.4M|        sumsq[x] = a * a + b * b + c * c;
  432|  24.4M|        a = b;
  433|  24.4M|        b = c;
  434|  24.4M|    }
  435|  2.33M|}
looprestoration_tmpl.c:sgr_box3_hv:
  550|   668k|{
  551|   668k|    sgr_box3_row_h(sumsq[2], sum[2], left, src, w, edges);
  552|   668k|    sgr_box3_vert(sumsq, sum, AA, BB, w, s, bitdepth_max);
  553|   668k|}
looprestoration_tmpl.c:sgr_box3_vert:
  528|  2.29M|{
  529|  2.29M|    sgr_box3_row_v(sumsq, sum, sumsq_out, sum_out, w);
  530|  2.29M|    sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 9, 455);
  531|  2.29M|    rotate(sumsq, sum, 3);
  532|  2.29M|}
looprestoration_tmpl.c:sgr_box3_row_v:
  472|  2.28M|{
  473|  26.1M|    for (int x = 0; x < w + 2; x++) {
  ------------------
  |  Branch (473:21): [True: 23.8M, False: 2.28M]
  ------------------
  474|  23.8M|        int sq_a = sumsq[0][x];
  475|  23.8M|        int sq_b = sumsq[1][x];
  476|  23.8M|        int sq_c = sumsq[2][x];
  477|  23.8M|        int s_a = sum[0][x];
  478|  23.8M|        int s_b = sum[1][x];
  479|  23.8M|        int s_c = sum[2][x];
  480|  23.8M|        sumsq_out[x] = sq_a + sq_b + sq_c;
  481|  23.8M|        sum_out[x] = s_a + s_b + s_c;
  482|  23.8M|    }
  483|  2.28M|}
looprestoration_tmpl.c:sgr_finish1:
  631|   644k|{
  632|       |    // Only one single row, no stride needed
  633|   644k|    ALIGN_STK_16(coef, tmp, 384,);
  ------------------
  |  |  100|   644k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|   644k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  634|       |
  635|   644k|    sgr_finish_filter_row1(tmp, *dst, A_ptrs, B_ptrs, w);
  636|   644k|    sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
  637|   644k|    *dst += PXSTRIDE(stride);
  ------------------
  |  |   53|   644k|#define PXSTRIDE(x) (x)
  ------------------
  638|   644k|    rotate(A_ptrs, B_ptrs, 3);
  639|   644k|}
looprestoration_tmpl.c:sgr_finish_filter_row1:
  559|  2.19M|{
  560|  2.19M|#define EIGHT_NEIGHBORS(P, i)\
  561|  2.19M|    ((P[1][i] + P[1][i - 1] + P[1][i + 1] + P[0][i] + P[2][i]) * 4 + \
  562|  2.19M|     (P[0][i - 1] + P[2][i - 1] +                           \
  563|  2.19M|      P[0][i + 1] + P[2][i + 1]) * 3)
  564|  20.7M|    for (int i = 0; i < w; i++) {
  ------------------
  |  Branch (564:21): [True: 18.6M, False: 2.19M]
  ------------------
  565|  18.6M|        const int a = EIGHT_NEIGHBORS(B_ptrs, i + 1);
  ------------------
  |  |  561|  18.6M|    ((P[1][i] + P[1][i - 1] + P[1][i + 1] + P[0][i] + P[2][i]) * 4 + \
  |  |  562|  18.6M|     (P[0][i - 1] + P[2][i - 1] +                           \
  |  |  563|  18.6M|      P[0][i + 1] + P[2][i + 1]) * 3)
  ------------------
  566|  18.6M|        const int b = EIGHT_NEIGHBORS(A_ptrs, i + 1);
  ------------------
  |  |  561|  18.6M|    ((P[1][i] + P[1][i - 1] + P[1][i + 1] + P[0][i] + P[2][i]) * 4 + \
  |  |  562|  18.6M|     (P[0][i - 1] + P[2][i - 1] +                           \
  |  |  563|  18.6M|      P[0][i + 1] + P[2][i + 1]) * 3)
  ------------------
  567|  18.6M|        tmp[i] = (b - a * src[i] + (1 << 8)) >> 9;
  568|  18.6M|    }
  569|  2.19M|#undef EIGHT_NEIGHBORS
  570|  2.19M|}
looprestoration_tmpl.c:sgr_mix_c:
 1032|  28.3k|{
 1033|  28.3k|    ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,);
  ------------------
  |  |  100|  28.3k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  28.3k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
 1034|  28.3k|    ALIGN_STK_16(coef, sum5_buf, BUF_STRIDE * 5 + 16,);
  ------------------
  |  |  100|  28.3k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  28.3k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
 1035|  28.3k|    int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
 1036|  28.3k|    coef *sum5_ptrs[5], *sum5_rows[5];
 1037|   170k|    for (int i = 0; i < 5; i++) {
  ------------------
  |  Branch (1037:21): [True: 141k, False: 28.3k]
  ------------------
 1038|   141k|        sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
  ------------------
  |  |  685|   141k|#define BUF_STRIDE (384 + 16)
  ------------------
 1039|   141k|        sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
  ------------------
  |  |  685|   141k|#define BUF_STRIDE (384 + 16)
  ------------------
 1040|   141k|    }
 1041|  28.3k|    ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,);
  ------------------
  |  |  100|  28.3k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  28.3k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
 1042|  28.3k|    ALIGN_STK_16(coef, sum3_buf, BUF_STRIDE * 3 + 16,);
  ------------------
  |  |  100|  28.3k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  28.3k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
 1043|  28.3k|    int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
 1044|  28.3k|    coef *sum3_ptrs[3], *sum3_rows[3];
 1045|   113k|    for (int i = 0; i < 3; i++) {
  ------------------
  |  Branch (1045:21): [True: 85.1k, False: 28.3k]
  ------------------
 1046|  85.1k|        sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
  ------------------
  |  |  685|  85.1k|#define BUF_STRIDE (384 + 16)
  ------------------
 1047|  85.1k|        sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
  ------------------
  |  |  685|  85.1k|#define BUF_STRIDE (384 + 16)
  ------------------
 1048|  85.1k|    }
 1049|       |
 1050|  28.3k|    ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,);
  ------------------
  |  |  100|  28.3k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  28.3k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
 1051|  28.3k|    ALIGN_STK_16(coef, B5_buf, BUF_STRIDE * 2 + 16,);
  ------------------
  |  |  100|  28.3k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  28.3k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
 1052|  28.3k|    int32_t *A5_ptrs[2];
 1053|  28.3k|    coef *B5_ptrs[2];
 1054|  85.1k|    for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (1054:21): [True: 56.7k, False: 28.3k]
  ------------------
 1055|  56.7k|        A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
  ------------------
  |  |  685|  56.7k|#define BUF_STRIDE (384 + 16)
  ------------------
 1056|  56.7k|        B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
  ------------------
  |  |  685|  56.7k|#define BUF_STRIDE (384 + 16)
  ------------------
 1057|  56.7k|    }
 1058|  28.3k|    ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,);
  ------------------
  |  |  100|  28.3k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  28.3k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
 1059|  28.3k|    ALIGN_STK_16(coef, B3_buf, BUF_STRIDE * 4 + 16,);
  ------------------
  |  |  100|  28.3k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  28.3k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
 1060|  28.3k|    int32_t *A3_ptrs[4];
 1061|  28.3k|    coef *B3_ptrs[4];
 1062|   141k|    for (int i = 0; i < 4; i++) {
  ------------------
  |  Branch (1062:21): [True: 113k, False: 28.3k]
  ------------------
 1063|   113k|        A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
  ------------------
  |  |  685|   113k|#define BUF_STRIDE (384 + 16)
  ------------------
 1064|   113k|        B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
  ------------------
  |  |  685|   113k|#define BUF_STRIDE (384 + 16)
  ------------------
 1065|   113k|    }
 1066|  28.3k|    const pixel *src = dst;
 1067|  28.3k|    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
  ------------------
  |  |   53|  28.3k|#define PXSTRIDE(x) (x)
  ------------------
 1068|       |
 1069|  28.3k|    if (edges & LR_HAVE_TOP) {
  ------------------
  |  Branch (1069:9): [True: 27.5k, False: 825]
  ------------------
 1070|  27.5k|        sumsq5_ptrs[0] = sumsq5_rows[0];
 1071|  27.5k|        sumsq5_ptrs[1] = sumsq5_rows[0];
 1072|  27.5k|        sumsq5_ptrs[2] = sumsq5_rows[1];
 1073|  27.5k|        sumsq5_ptrs[3] = sumsq5_rows[2];
 1074|  27.5k|        sumsq5_ptrs[4] = sumsq5_rows[3];
 1075|  27.5k|        sum5_ptrs[0] = sum5_rows[0];
 1076|  27.5k|        sum5_ptrs[1] = sum5_rows[0];
 1077|  27.5k|        sum5_ptrs[2] = sum5_rows[1];
 1078|  27.5k|        sum5_ptrs[3] = sum5_rows[2];
 1079|  27.5k|        sum5_ptrs[4] = sum5_rows[3];
 1080|       |
 1081|  27.5k|        sumsq3_ptrs[0] = sumsq3_rows[0];
 1082|  27.5k|        sumsq3_ptrs[1] = sumsq3_rows[1];
 1083|  27.5k|        sumsq3_ptrs[2] = sumsq3_rows[2];
 1084|  27.5k|        sum3_ptrs[0] = sum3_rows[0];
 1085|  27.5k|        sum3_ptrs[1] = sum3_rows[1];
 1086|  27.5k|        sum3_ptrs[2] = sum3_rows[2];
 1087|       |
 1088|  27.5k|        sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0],
 1089|  27.5k|                        sumsq5_rows[0], sum5_rows[0],
 1090|  27.5k|                        NULL, lpf, w, edges);
 1091|  27.5k|        lpf += PXSTRIDE(stride);
  ------------------
  |  |   53|  27.5k|#define PXSTRIDE(x) (x)
  ------------------
 1092|  27.5k|        sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1],
 1093|  27.5k|                        sumsq5_rows[1], sum5_rows[1],
 1094|  27.5k|                        NULL, lpf, w, edges);
 1095|       |
 1096|  27.5k|        sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2],
 1097|  27.5k|                        sumsq5_rows[2], sum5_rows[2],
 1098|  27.5k|                        left, src, w, edges);
 1099|  27.5k|        left++;
 1100|  27.5k|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|  27.5k|#define PXSTRIDE(x) (x)
  ------------------
 1101|       |
 1102|  27.5k|        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
 1103|  27.5k|                      w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|  27.5k|#define BITDEPTH_MAX 0xff
  ------------------
 1104|  27.5k|        rotate(A3_ptrs, B3_ptrs, 4);
 1105|       |
 1106|  27.5k|        if (--h <= 0)
  ------------------
  |  Branch (1106:13): [True: 38, False: 27.5k]
  ------------------
 1107|     38|            goto vert_1;
 1108|       |
 1109|  27.5k|        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
 1110|  27.5k|                        sumsq5_rows[3], sum5_rows[3],
 1111|  27.5k|                        left, src, w, edges);
 1112|  27.5k|        left++;
 1113|  27.5k|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|  27.5k|#define PXSTRIDE(x) (x)
  ------------------
 1114|  27.5k|        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
 1115|  27.5k|                      w, params->sgr.s0, BITDEPTH_MAX);
  ------------------
  |  |   59|  27.5k|#define BITDEPTH_MAX 0xff
  ------------------
 1116|  27.5k|        rotate(A5_ptrs, B5_ptrs, 2);
 1117|  27.5k|        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
 1118|  27.5k|                      w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|  27.5k|#define BITDEPTH_MAX 0xff
  ------------------
 1119|  27.5k|        rotate(A3_ptrs, B3_ptrs, 4);
 1120|       |
 1121|  27.5k|        if (--h <= 0)
  ------------------
  |  Branch (1121:13): [True: 6, False: 27.5k]
  ------------------
 1122|      6|            goto vert_2;
 1123|       |
 1124|       |        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
 1125|       |        // one of them to point at the previously unused rows[4].
 1126|  27.5k|        sumsq5_ptrs[3] = sumsq5_rows[4];
 1127|  27.5k|        sum5_ptrs[3] = sum5_rows[4];
 1128|  27.5k|    } else {
 1129|    825|        sumsq5_ptrs[0] = sumsq5_rows[0];
 1130|    825|        sumsq5_ptrs[1] = sumsq5_rows[0];
 1131|    825|        sumsq5_ptrs[2] = sumsq5_rows[0];
 1132|    825|        sumsq5_ptrs[3] = sumsq5_rows[0];
 1133|    825|        sumsq5_ptrs[4] = sumsq5_rows[0];
 1134|    825|        sum5_ptrs[0] = sum5_rows[0];
 1135|    825|        sum5_ptrs[1] = sum5_rows[0];
 1136|    825|        sum5_ptrs[2] = sum5_rows[0];
 1137|    825|        sum5_ptrs[3] = sum5_rows[0];
 1138|    825|        sum5_ptrs[4] = sum5_rows[0];
 1139|       |
 1140|    825|        sumsq3_ptrs[0] = sumsq3_rows[0];
 1141|    825|        sumsq3_ptrs[1] = sumsq3_rows[0];
 1142|    825|        sumsq3_ptrs[2] = sumsq3_rows[0];
 1143|    825|        sum3_ptrs[0] = sum3_rows[0];
 1144|    825|        sum3_ptrs[1] = sum3_rows[0];
 1145|    825|        sum3_ptrs[2] = sum3_rows[0];
 1146|       |
 1147|    825|        sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0],
 1148|    825|                        sumsq5_rows[0], sum5_rows[0],
 1149|    825|                        left, src, w, edges);
 1150|    825|        left++;
 1151|    825|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|    825|#define PXSTRIDE(x) (x)
  ------------------
 1152|       |
 1153|    825|        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
 1154|    825|                      w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|    825|#define BITDEPTH_MAX 0xff
  ------------------
 1155|    825|        rotate(A3_ptrs, B3_ptrs, 4);
 1156|       |
 1157|    825|        if (--h <= 0)
  ------------------
  |  Branch (1157:13): [True: 58, False: 767]
  ------------------
 1158|     58|            goto vert_1;
 1159|       |
 1160|    767|        sumsq5_ptrs[4] = sumsq5_rows[1];
 1161|    767|        sum5_ptrs[4] = sum5_rows[1];
 1162|       |
 1163|    767|        sumsq3_ptrs[2] = sumsq3_rows[1];
 1164|    767|        sum3_ptrs[2] = sum3_rows[1];
 1165|       |
 1166|    767|        sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1],
 1167|    767|                        sumsq5_rows[1], sum5_rows[1],
 1168|    767|                        left, src, w, edges);
 1169|    767|        left++;
 1170|    767|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|    767|#define PXSTRIDE(x) (x)
  ------------------
 1171|       |
 1172|    767|        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
 1173|    767|                      w, params->sgr.s0, BITDEPTH_MAX);
  ------------------
  |  |   59|    767|#define BITDEPTH_MAX 0xff
  ------------------
 1174|    767|        rotate(A5_ptrs, B5_ptrs, 2);
 1175|    767|        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
 1176|    767|                      w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|    767|#define BITDEPTH_MAX 0xff
  ------------------
 1177|    767|        rotate(A3_ptrs, B3_ptrs, 4);
 1178|       |
 1179|    767|        if (--h <= 0)
  ------------------
  |  Branch (1179:13): [True: 34, False: 733]
  ------------------
 1180|     34|            goto vert_2;
 1181|       |
 1182|    733|        sumsq5_ptrs[3] = sumsq5_rows[2];
 1183|    733|        sumsq5_ptrs[4] = sumsq5_rows[3];
 1184|    733|        sum5_ptrs[3] = sum5_rows[2];
 1185|    733|        sum5_ptrs[4] = sum5_rows[3];
 1186|       |
 1187|    733|        sumsq3_ptrs[2] = sumsq3_rows[2];
 1188|    733|        sum3_ptrs[2] = sum3_rows[2];
 1189|       |
 1190|    733|        sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2],
 1191|    733|                        sumsq5_rows[2], sum5_rows[2],
 1192|    733|                        left, src, w, edges);
 1193|    733|        left++;
 1194|    733|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|    733|#define PXSTRIDE(x) (x)
  ------------------
 1195|       |
 1196|    733|        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
 1197|    733|                      w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|    733|#define BITDEPTH_MAX 0xff
  ------------------
 1198|    733|        rotate(A3_ptrs, B3_ptrs, 4);
 1199|       |
 1200|    733|        if (--h <= 0)
  ------------------
  |  Branch (1200:13): [True: 2, False: 731]
  ------------------
 1201|      2|            goto odd;
 1202|       |
 1203|    731|        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
 1204|    731|                        sumsq5_rows[3], sum5_rows[3],
 1205|    731|                        left, src, w, edges);
 1206|    731|        left++;
 1207|    731|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|    731|#define PXSTRIDE(x) (x)
  ------------------
 1208|       |
 1209|    731|        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
 1210|    731|                      w, params->sgr.s0, BITDEPTH_MAX);
  ------------------
  |  |   59|    731|#define BITDEPTH_MAX 0xff
  ------------------
 1211|    731|        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
 1212|    731|                      w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|    731|#define BITDEPTH_MAX 0xff
  ------------------
 1213|    731|        sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
 1214|    731|                       w, 2, params->sgr.w0, params->sgr.w1
 1215|    731|                       HIGHBD_TAIL_SUFFIX);
 1216|       |
 1217|    731|        if (--h <= 0)
  ------------------
  |  Branch (1217:13): [True: 2, False: 729]
  ------------------
 1218|      2|            goto vert_2;
 1219|       |
 1220|       |        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
 1221|       |        // one of them to point at the previously unused rows[4].
 1222|    729|        sumsq5_ptrs[3] = sumsq5_rows[4];
 1223|    729|        sum5_ptrs[3] = sum5_rows[4];
 1224|    729|    }
 1225|       |
 1226|   763k|    do {
 1227|   763k|        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
 1228|   763k|                        sumsq5_ptrs[3], sum5_ptrs[3],
 1229|   763k|                        left, src, w, edges);
 1230|   763k|        left++;
 1231|   763k|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|   763k|#define PXSTRIDE(x) (x)
  ------------------
 1232|       |
 1233|   763k|        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
 1234|   763k|                      w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|   763k|#define BITDEPTH_MAX 0xff
  ------------------
 1235|   763k|        rotate(A3_ptrs, B3_ptrs, 4);
 1236|       |
 1237|   763k|        if (--h <= 0)
  ------------------
  |  Branch (1237:13): [True: 276, False: 763k]
  ------------------
 1238|    276|            goto odd;
 1239|       |
 1240|   763k|        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
 1241|   763k|                        sumsq5_ptrs[4], sum5_ptrs[4],
 1242|   763k|                        left, src, w, edges);
 1243|   763k|        left++;
 1244|   763k|        src += PXSTRIDE(stride);
  ------------------
  |  |   53|   763k|#define PXSTRIDE(x) (x)
  ------------------
 1245|       |
 1246|   763k|        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
 1247|   763k|                      w, params->sgr.s0, BITDEPTH_MAX);
  ------------------
  |  |   59|   763k|#define BITDEPTH_MAX 0xff
  ------------------
 1248|   763k|        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
 1249|   763k|                      w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|   763k|#define BITDEPTH_MAX 0xff
  ------------------
 1250|   763k|        sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
 1251|   763k|                       w, 2, params->sgr.w0, params->sgr.w1
 1252|   763k|                       HIGHBD_TAIL_SUFFIX);
 1253|   763k|    } while (--h > 0);
  ------------------
  |  Branch (1253:14): [True: 735k, False: 27.9k]
  ------------------
 1254|       |
 1255|  27.9k|    if (!(edges & LR_HAVE_BOTTOM))
  ------------------
  |  Branch (1255:9): [True: 132, False: 27.8k]
  ------------------
 1256|    132|        goto vert_2;
 1257|       |
 1258|  27.8k|    sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
 1259|  27.8k|                    sumsq5_ptrs[3], sum5_ptrs[3],
 1260|  27.8k|                    NULL, lpf_bottom, w, edges);
 1261|  27.8k|    lpf_bottom += PXSTRIDE(stride);
  ------------------
  |  |   53|  27.8k|#define PXSTRIDE(x) (x)
  ------------------
 1262|  27.8k|    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
 1263|  27.8k|                  w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|  27.8k|#define BITDEPTH_MAX 0xff
  ------------------
 1264|  27.8k|    rotate(A3_ptrs, B3_ptrs, 4);
 1265|       |
 1266|  27.8k|    sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
 1267|  27.8k|                    sumsq5_ptrs[4], sum5_ptrs[4],
 1268|  27.8k|                    NULL, lpf_bottom, w, edges);
 1269|       |
 1270|  28.0k|output_2:
 1271|  28.0k|    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
 1272|  28.0k|                  w, params->sgr.s0, BITDEPTH_MAX);
  ------------------
  |  |   59|  28.0k|#define BITDEPTH_MAX 0xff
  ------------------
 1273|  28.0k|    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
 1274|  28.0k|                  w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|  28.0k|#define BITDEPTH_MAX 0xff
  ------------------
 1275|  28.0k|    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
 1276|  28.0k|                   w, 2, params->sgr.w0, params->sgr.w1
 1277|  28.0k|                   HIGHBD_TAIL_SUFFIX);
 1278|  28.0k|    return;
 1279|       |
 1280|    174|vert_2:
 1281|       |    // Duplicate the last row twice more
 1282|    174|    sumsq5_ptrs[3] = sumsq5_ptrs[2];
 1283|    174|    sumsq5_ptrs[4] = sumsq5_ptrs[2];
 1284|    174|    sum5_ptrs[3] = sum5_ptrs[2];
 1285|    174|    sum5_ptrs[4] = sum5_ptrs[2];
 1286|       |
 1287|    174|    sumsq3_ptrs[2] = sumsq3_ptrs[1];
 1288|    174|    sum3_ptrs[2] = sum3_ptrs[1];
 1289|    174|    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
 1290|    174|                  w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|    174|#define BITDEPTH_MAX 0xff
  ------------------
 1291|    174|    rotate(A3_ptrs, B3_ptrs, 4);
 1292|       |
 1293|    174|    sumsq3_ptrs[2] = sumsq3_ptrs[1];
 1294|    174|    sum3_ptrs[2] = sum3_ptrs[1];
 1295|       |
 1296|    174|    goto output_2;
 1297|       |
 1298|    278|odd:
 1299|       |    // Copy the last row as padding once
 1300|    278|    sumsq5_ptrs[4] = sumsq5_ptrs[3];
 1301|    278|    sum5_ptrs[4] = sum5_ptrs[3];
 1302|       |
 1303|    278|    sumsq3_ptrs[2] = sumsq3_ptrs[1];
 1304|    278|    sum3_ptrs[2] = sum3_ptrs[1];
 1305|       |
 1306|    278|    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
 1307|    278|                  w, params->sgr.s0, BITDEPTH_MAX);
  ------------------
  |  |   59|    278|#define BITDEPTH_MAX 0xff
  ------------------
 1308|    278|    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
 1309|    278|                  w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|    278|#define BITDEPTH_MAX 0xff
  ------------------
 1310|    278|    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
 1311|    278|                   w, 2, params->sgr.w0, params->sgr.w1
 1312|    278|                   HIGHBD_TAIL_SUFFIX);
 1313|       |
 1314|    374|output_1:
 1315|       |    // Duplicate the last row twice more
 1316|    374|    sumsq5_ptrs[3] = sumsq5_ptrs[2];
 1317|    374|    sumsq5_ptrs[4] = sumsq5_ptrs[2];
 1318|    374|    sum5_ptrs[3] = sum5_ptrs[2];
 1319|    374|    sum5_ptrs[4] = sum5_ptrs[2];
 1320|       |
 1321|    374|    sumsq3_ptrs[2] = sumsq3_ptrs[1];
 1322|    374|    sum3_ptrs[2] = sum3_ptrs[1];
 1323|       |
 1324|    374|    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
 1325|    374|                  w, params->sgr.s0, BITDEPTH_MAX);
  ------------------
  |  |   59|    374|#define BITDEPTH_MAX 0xff
  ------------------
 1326|    374|    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
 1327|    374|                  w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|    374|#define BITDEPTH_MAX 0xff
  ------------------
 1328|    374|    rotate(A3_ptrs, B3_ptrs, 4);
 1329|       |    // Output only one row
 1330|    374|    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
 1331|    374|                   w, 1, params->sgr.w0, params->sgr.w1
 1332|    374|                   HIGHBD_TAIL_SUFFIX);
 1333|    374|    return;
 1334|       |
 1335|     96|vert_1:
 1336|       |    // Copy the last row as padding once
 1337|     96|    sumsq5_ptrs[4] = sumsq5_ptrs[3];
 1338|     96|    sum5_ptrs[4] = sum5_ptrs[3];
 1339|       |
 1340|     96|    sumsq3_ptrs[2] = sumsq3_ptrs[1];
 1341|     96|    sum3_ptrs[2] = sum3_ptrs[1];
 1342|       |
 1343|     96|    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
 1344|     96|                  w, params->sgr.s0, BITDEPTH_MAX);
  ------------------
  |  |   59|     96|#define BITDEPTH_MAX 0xff
  ------------------
 1345|     96|    rotate(A5_ptrs, B5_ptrs, 2);
 1346|     96|    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
 1347|     96|                  w, params->sgr.s1, BITDEPTH_MAX);
  ------------------
  |  |   59|     96|#define BITDEPTH_MAX 0xff
  ------------------
 1348|     96|    rotate(A3_ptrs, B3_ptrs, 4);
 1349|       |
 1350|     96|    goto output_1;
 1351|    278|}
looprestoration_tmpl.c:sgr_box35_row_h:
  464|  1.67M|{
  465|  1.67M|    sgr_box3_row_h(sumsq3, sum3, left, src, w, edges);
  466|  1.67M|    sgr_box5_row_h(sumsq5, sum5, left, src, w, edges);
  467|  1.67M|}
looprestoration_tmpl.c:sgr_finish_mix:
  663|   785k|{
  664|   785k|    ALIGN_STK_16(coef, tmp5, 2*FILTER_OUT_STRIDE,);
  ------------------
  |  |  100|   785k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|   785k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  665|   785k|    ALIGN_STK_16(coef, tmp3, 2*FILTER_OUT_STRIDE,);
  ------------------
  |  |  100|   785k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|   785k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  666|       |
  667|   785k|    sgr_finish_filter2(tmp5, *dst, stride, A5_ptrs, B5_ptrs, w, h);
  668|   785k|    sgr_finish_filter_row1(tmp3, *dst, A3_ptrs, B3_ptrs, w);
  669|   785k|    if (h > 1)
  ------------------
  |  Branch (669:9): [True: 791k, False: 18.4E]
  ------------------
  670|   791k|        sgr_finish_filter_row1(tmp3 + FILTER_OUT_STRIDE, *dst + PXSTRIDE(stride),
  ------------------
  |  |  572|   791k|#define FILTER_OUT_STRIDE (384)
  ------------------
                      sgr_finish_filter_row1(tmp3 + FILTER_OUT_STRIDE, *dst + PXSTRIDE(stride),
  ------------------
  |  |   53|   791k|#define PXSTRIDE(x) (x)
  ------------------
  671|   791k|                               &A3_ptrs[1], &B3_ptrs[1], w);
  672|   785k|    sgr_weighted2(*dst, stride, tmp5, tmp3, w, h, w0, w1 HIGHBD_TAIL_SUFFIX);
  673|   785k|    *dst += h*PXSTRIDE(stride);
  ------------------
  |  |   53|   785k|#define PXSTRIDE(x) (x)
  ------------------
  674|   785k|    rotate(A5_ptrs, B5_ptrs, 2);
  675|   785k|    rotate(A3_ptrs, B3_ptrs, 4);
  676|   785k|}
looprestoration_tmpl.c:sgr_weighted2:
  616|   793k|{
  617|  2.37M|    for (int j = 0; j < h; j++) {
  ------------------
  |  Branch (617:21): [True: 1.58M, False: 793k]
  ------------------
  618|  14.9M|        for (int i = 0; i < w; i++) {
  ------------------
  |  Branch (618:25): [True: 13.3M, False: 1.58M]
  ------------------
  619|  13.3M|            const int v = w0 * t1[i] + w1 * t2[i];
  620|  13.3M|            dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10)) >> 11));
  ------------------
  |  |   49|  13.3M|#define iclip_pixel iclip_u8
  ------------------
  621|  13.3M|        }
  622|  1.58M|        dst += PXSTRIDE(dst_stride);
  ------------------
  |  |   53|  1.58M|#define PXSTRIDE(x) (x)
  ------------------
  623|  1.58M|        t1 += FILTER_OUT_STRIDE;
  ------------------
  |  |  572|  1.58M|#define FILTER_OUT_STRIDE (384)
  ------------------
  624|  1.58M|        t2 += FILTER_OUT_STRIDE;
  ------------------
  |  |  572|  1.58M|#define FILTER_OUT_STRIDE (384)
  ------------------
  625|  1.58M|    }
  626|   793k|}
dav1d_loop_restoration_dsp_init_16bpc:
 1367|  7.63k|{
 1368|  7.63k|    c->wiener[0] = c->wiener[1] = wiener_c;
 1369|  7.63k|    c->sgr[0] = sgr_5x5_c;
 1370|  7.63k|    c->sgr[1] = sgr_3x3_c;
 1371|  7.63k|    c->sgr[2] = sgr_mix_c;
 1372|       |
 1373|  7.63k|#if HAVE_ASM
 1374|       |#if ARCH_AARCH64 || ARCH_ARM
 1375|       |    loop_restoration_dsp_init_arm(c, bpc);
 1376|       |#elif ARCH_LOONGARCH64
 1377|       |    loop_restoration_dsp_init_loongarch(c, bpc);
 1378|       |#elif ARCH_PPC64LE
 1379|       |    loop_restoration_dsp_init_ppc(c, bpc);
 1380|       |#elif ARCH_X86
 1381|       |    loop_restoration_dsp_init_x86(c, bpc);
 1382|  7.63k|#endif
 1383|  7.63k|#endif
 1384|  7.63k|}

dav1d_lr_sbrow_8bpc:
  170|  7.80k|{
  171|  7.80k|    const int offset_y = 8 * !!sby;
  172|  7.80k|    const ptrdiff_t *const dst_stride = f->sr_cur.p.stride;
  173|  7.80k|    const int restore_planes = f->lf.restore_planes;
  174|  7.80k|    const int not_last = sby + 1 < f->sbh;
  175|       |
  176|  7.80k|    if (restore_planes & LR_RESTORE_Y) {
  ------------------
  |  Branch (176:9): [True: 6.46k, False: 1.34k]
  ------------------
  177|  6.46k|        const int h = f->sr_cur.p.p.h;
  178|  6.46k|        const int w = f->sr_cur.p.p.w;
  179|  6.46k|        const int next_row_y = (sby + 1) << (6 + f->seq_hdr->sb128);
  180|  6.46k|        const int row_h = imin(next_row_y - 8 * not_last, h);
  181|  6.46k|        const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset_y;
  182|  6.46k|        lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w,
  ------------------
  |  |   53|  6.46k|#define PXSTRIDE(x) (x)
  ------------------
  183|  6.46k|                 h, row_h, 0);
  184|  6.46k|    }
  185|  7.80k|    if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
  ------------------
  |  Branch (185:9): [True: 5.31k, False: 2.48k]
  ------------------
  186|  5.31k|        const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
  187|  5.31k|        const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
  188|  5.31k|        const int h = (f->sr_cur.p.p.h + ss_ver) >> ss_ver;
  189|  5.31k|        const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
  190|  5.31k|        const int next_row_y = (sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128);
  191|  5.31k|        const int row_h = imin(next_row_y - (8 >> ss_ver) * not_last, h);
  192|  5.31k|        const int offset_uv = offset_y >> ss_ver;
  193|  5.31k|        const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
  194|  5.31k|        if (restore_planes & LR_RESTORE_U)
  ------------------
  |  Branch (194:13): [True: 4.07k, False: 1.24k]
  ------------------
  195|  4.07k|            lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
  ------------------
  |  |   53|  4.07k|#define PXSTRIDE(x) (x)
  ------------------
  196|  4.07k|                     w, h, row_h, 1);
  197|       |
  198|  5.31k|        if (restore_planes & LR_RESTORE_V)
  ------------------
  |  Branch (198:13): [True: 3.95k, False: 1.36k]
  ------------------
  199|  3.95k|            lr_sbrow(f, dst[2] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
  ------------------
  |  |   53|  3.95k|#define PXSTRIDE(x) (x)
  ------------------
  200|  3.95k|                     w, h, row_h, 2);
  201|  5.31k|    }
  202|  7.80k|}
lr_apply_tmpl.c:lr_sbrow:
  109|  94.6k|{
  110|  94.6k|    const int chroma = !!plane;
  111|  94.6k|    const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
  112|  94.6k|    const int ss_hor = chroma & (f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444);
  113|  94.6k|    const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma];
  114|       |
  115|  94.6k|    const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!plane];
  116|  94.6k|    const int unit_size = 1 << unit_size_log2;
  117|  94.6k|    const int half_unit_size = unit_size >> 1;
  118|  94.6k|    const int max_unit_size = unit_size + half_unit_size;
  119|       |
  120|       |    // Y coordinate of the sbrow (y is 8 luma pixel rows above row_y)
  121|  94.6k|    const int row_y = y + ((8 >> ss_ver) * !!y);
  122|       |
  123|       |    // FIXME This is an ugly hack to lookup the proper AV1Filter unit for
  124|       |    // chroma planes. Question: For Multithreaded decoding, is it better
  125|       |    // to store the chroma LR information with collocated Luma information?
  126|       |    // In other words. For a chroma restoration unit locate at 128,128 and
  127|       |    // with a 4:2:0 chroma subsampling, do we store the filter information at
  128|       |    // the AV1Filter unit located at (128,128) or (256,256)
  129|       |    // TODO Support chroma subsampling.
  130|  94.6k|    const int shift_hor = 7 - ss_hor;
  131|       |
  132|       |    /* maximum sbrow height is 128 + 8 rows offset */
  133|  94.6k|    ALIGN_STK_16(pixel, pre_lr_border, 2, [128 + 8][4]);
  ------------------
  |  |  100|  94.6k|    ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
  |  |  ------------------
  |  |  |  |   86|  94.6k|    line __attribute__((aligned(align)))
  |  |  ------------------
  ------------------
  134|  94.6k|    const Av1RestorationUnit *lr[2];
  135|       |
  136|  94.6k|    enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT;
  ------------------
  |  Branch (136:31): [True: 87.5k, False: 7.04k]
  ------------------
  137|       |
  138|  94.6k|    int aligned_unit_pos = row_y & ~(unit_size - 1);
  139|  94.6k|    if (aligned_unit_pos && aligned_unit_pos + half_unit_size > h)
  ------------------
  |  Branch (139:9): [True: 85.3k, False: 9.22k]
  |  Branch (139:29): [True: 563, False: 84.8k]
  ------------------
  140|    563|        aligned_unit_pos -= unit_size;
  141|  94.6k|    aligned_unit_pos <<= ss_ver;
  142|  94.6k|    const int sb_idx = (aligned_unit_pos >> 7) * f->sr_sb128w;
  143|  94.6k|    const int unit_idx = ((aligned_unit_pos >> 6) & 1) << 1;
  144|  94.6k|    lr[0] = &f->lf.lr_mask[sb_idx].lr[plane][unit_idx];
  145|  94.6k|    int restore = lr[0]->type != DAV1D_RESTORATION_NONE;
  146|  94.6k|    int x = 0, bit = 0;
  147|   101k|    for (; x + max_unit_size <= w; p += unit_size, edges |= LR_HAVE_LEFT, bit ^= 1) {
  ------------------
  |  Branch (147:12): [True: 6.47k, False: 94.6k]
  ------------------
  148|  6.47k|        const int next_x = x + unit_size;
  149|  6.47k|        const int next_u_idx = unit_idx + ((next_x >> (shift_hor - 1)) & 1);
  150|  6.47k|        lr[!bit] =
  151|  6.47k|            &f->lf.lr_mask[sb_idx + (next_x >> shift_hor)].lr[plane][next_u_idx];
  152|  6.47k|        const int restore_next = lr[!bit]->type != DAV1D_RESTORATION_NONE;
  153|  6.47k|        if (restore_next)
  ------------------
  |  Branch (153:13): [True: 3.98k, False: 2.49k]
  ------------------
  154|  3.98k|            backup4xU(pre_lr_border[bit], p + unit_size - 4, p_stride, row_h - y);
  155|  6.47k|        if (restore)
  ------------------
  |  Branch (155:13): [True: 4.15k, False: 2.32k]
  ------------------
  156|  4.15k|            lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_size, row_h,
  157|  4.15k|                      lr[bit], edges);
  158|  6.47k|        x = next_x;
  159|  6.47k|        restore = restore_next;
  160|  6.47k|    }
  161|  94.6k|    if (restore) {
  ------------------
  |  Branch (161:9): [True: 55.7k, False: 38.8k]
  ------------------
  162|  55.7k|        edges &= ~LR_HAVE_RIGHT;
  163|  55.7k|        const int unit_w = w - x;
  164|  55.7k|        lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr[bit], edges);
  165|  55.7k|    }
  166|  94.6k|}
lr_apply_tmpl.c:backup4xU:
  102|  3.98k|{
  103|   171k|    for (; u > 0; u--, dst++, src += PXSTRIDE(src_stride))
  ------------------
  |  |   53|   167k|#define PXSTRIDE(x) (x)
  ------------------
  |  Branch (103:12): [True: 167k, False: 3.98k]
  ------------------
  104|   167k|        pixel_copy(dst, src, 4);
  ------------------
  |  |   47|   167k|#define pixel_copy memcpy
  ------------------
  105|  3.98k|}
lr_apply_tmpl.c:lr_stripe:
   40|  59.9k|{
   41|  59.9k|    const Dav1dDSPContext *const dsp = f->dsp;
   42|  59.9k|    const int chroma = !!plane;
   43|  59.9k|    const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
   44|  59.9k|    const ptrdiff_t stride = f->sr_cur.p.stride[chroma];
   45|  59.9k|    const int sby = (y + (y ? 8 << ss_ver : 0)) >> (6 - ss_ver + f->seq_hdr->sb128);
  ------------------
  |  Branch (45:27): [True: 53.1k, False: 6.75k]
  ------------------
   46|  59.9k|    const int have_tt = f->c->n_tc > 1;
   47|  59.9k|    const pixel *lpf = f->lf.lr_lpf_line[plane] +
   48|  59.9k|        have_tt * (sby * (4 << f->seq_hdr->sb128) - 4) * PXSTRIDE(stride) + x;
  ------------------
  |  |   53|  59.9k|#define PXSTRIDE(x) (x)
  ------------------
   49|       |
   50|       |    // The first stripe of the frame is shorter by 8 luma pixel rows.
   51|  59.9k|    int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
   52|       |
   53|  59.9k|    looprestorationfilter_fn lr_fn;
   54|  59.9k|    LooprestorationParams params;
   55|  59.9k|    if (lr->type == DAV1D_RESTORATION_WIENER) {
  ------------------
  |  Branch (55:9): [True: 16.8k, False: 43.0k]
  ------------------
   56|  16.8k|        int16_t (*const filter)[8] = params.filter;
   57|  16.8k|        filter[0][0] = filter[0][6] = lr->filter_h[0];
   58|  16.8k|        filter[0][1] = filter[0][5] = lr->filter_h[1];
   59|  16.8k|        filter[0][2] = filter[0][4] = lr->filter_h[2];
   60|  16.8k|        filter[0][3] = -(filter[0][0] + filter[0][1] + filter[0][2]) * 2;
   61|       |#if BITDEPTH != 8
   62|       |        /* For 8-bit SIMD it's beneficial to handle the +128 separately
   63|       |         * in order to avoid overflows. */
   64|       |        filter[0][3] += 128;
   65|       |#endif
   66|       |
   67|  16.8k|        filter[1][0] = filter[1][6] = lr->filter_v[0];
   68|  16.8k|        filter[1][1] = filter[1][5] = lr->filter_v[1];
   69|  16.8k|        filter[1][2] = filter[1][4] = lr->filter_v[2];
   70|  16.8k|        filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;
   71|       |
   72|  16.8k|        lr_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])];
   73|  43.0k|    } else {
   74|  43.0k|        assert(lr->type >= DAV1D_RESTORATION_SGRPROJ);
  ------------------
  |  |  140|  43.0k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 43.0k]
  |  |  |  Branch (140:68): [Folded, False: 43.0k]
  |  |  ------------------
  ------------------
   75|  43.0k|        const int sgr_idx = lr->type - DAV1D_RESTORATION_SGRPROJ;
   76|  43.0k|        const uint16_t *const sgr_params = dav1d_sgr_params[sgr_idx];
   77|  43.0k|        params.sgr.s0 = sgr_params[0];
   78|  43.0k|        params.sgr.s1 = sgr_params[1];
   79|  43.0k|        params.sgr.w0 = lr->sgr_weights[0];
   80|  43.0k|        params.sgr.w1 = 128 - (lr->sgr_weights[0] + lr->sgr_weights[1]);
   81|       |
   82|  43.0k|        lr_fn = dsp->lr.sgr[!!sgr_params[0] + !!sgr_params[1] * 2 - 1];
   83|  43.0k|    }
   84|       |
   85|  84.3k|    while (y + stripe_h <= row_h) {
  ------------------
  |  Branch (85:12): [True: 84.3k, False: 13]
  ------------------
   86|       |        // Change the HAVE_BOTTOM bit in edges to (sby + 1 != f->sbh || y + stripe_h != row_h)
   87|  84.3k|        edges ^= (-(sby + 1 != f->sbh || y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
  ------------------
  |  Branch (87:21): [True: 78.6k, False: 5.72k]
  |  Branch (87:42): [True: 895, False: 4.82k]
  ------------------
   88|  84.3k|        lr_fn(p, stride, left, lpf, unit_w, stripe_h, &params, edges HIGHBD_CALL_SUFFIX);
   89|       |
   90|  84.3k|        left += stripe_h;
   91|  84.3k|        y += stripe_h;
   92|  84.3k|        p += stripe_h * PXSTRIDE(stride);
  ------------------
  |  |   53|  84.3k|#define PXSTRIDE(x) (x)
  ------------------
   93|  84.3k|        edges |= LR_HAVE_TOP;
   94|  84.3k|        stripe_h = imin(64 >> ss_ver, row_h - y);
   95|  84.3k|        if (stripe_h == 0) break;
  ------------------
  |  Branch (95:13): [True: 59.8k, False: 24.4k]
  ------------------
   96|  24.4k|        lpf += 4 * PXSTRIDE(stride);
  ------------------
  |  |   53|  24.4k|#define PXSTRIDE(x) (x)
  ------------------
   97|  24.4k|    }
   98|  59.9k|}
dav1d_lr_sbrow_16bpc:
  170|  55.5k|{
  171|  55.5k|    const int offset_y = 8 * !!sby;
  172|  55.5k|    const ptrdiff_t *const dst_stride = f->sr_cur.p.stride;
  173|  55.5k|    const int restore_planes = f->lf.restore_planes;
  174|  55.5k|    const int not_last = sby + 1 < f->sbh;
  175|       |
  176|  55.5k|    if (restore_planes & LR_RESTORE_Y) {
  ------------------
  |  Branch (176:9): [True: 53.6k, False: 1.90k]
  ------------------
  177|  53.6k|        const int h = f->sr_cur.p.p.h;
  178|  53.6k|        const int w = f->sr_cur.p.p.w;
  179|  53.6k|        const int next_row_y = (sby + 1) << (6 + f->seq_hdr->sb128);
  180|  53.6k|        const int row_h = imin(next_row_y - 8 * not_last, h);
  181|  53.6k|        const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset_y;
  182|  53.6k|        lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w,
  183|  53.6k|                 h, row_h, 0);
  184|  53.6k|    }
  185|  55.5k|    if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
  ------------------
  |  Branch (185:9): [True: 15.0k, False: 40.5k]
  ------------------
  186|  15.0k|        const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
  187|  15.0k|        const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
  188|  15.0k|        const int h = (f->sr_cur.p.p.h + ss_ver) >> ss_ver;
  189|  15.0k|        const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
  190|  15.0k|        const int next_row_y = (sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128);
  191|  15.0k|        const int row_h = imin(next_row_y - (8 >> ss_ver) * not_last, h);
  192|  15.0k|        const int offset_uv = offset_y >> ss_ver;
  193|  15.0k|        const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
  194|  15.0k|        if (restore_planes & LR_RESTORE_U)
  ------------------
  |  Branch (194:13): [True: 13.8k, False: 1.18k]
  ------------------
  195|  13.8k|            lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
  196|  13.8k|                     w, h, row_h, 1);
  197|       |
  198|  15.0k|        if (restore_planes & LR_RESTORE_V)
  ------------------
  |  Branch (198:13): [True: 12.5k, False: 2.49k]
  ------------------
  199|  12.5k|            lr_sbrow(f, dst[2] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
  200|  12.5k|                     w, h, row_h, 2);
  201|  15.0k|    }
  202|  55.5k|}

dav1d_mc_dsp_init_8bpc:
  960|  7.82k|COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
  961|  7.82k|#define init_mc_fns(type, name) do { \
  962|  7.82k|    c->mc        [type] = put_##name##_c; \
  963|  7.82k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  964|  7.82k|    c->mct       [type] = prep_##name##_c; \
  965|  7.82k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  966|  7.82k|} while (0)
  967|       |
  968|  7.82k|    init_mc_fns(FILTER_2D_8TAP_REGULAR,        8tap_regular);
  ------------------
  |  |  961|  7.82k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.82k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.82k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.82k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.82k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.82k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.82k]
  |  |  ------------------
  ------------------
  969|  7.82k|    init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth);
  ------------------
  |  |  961|  7.82k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.82k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.82k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.82k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.82k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.82k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.82k]
  |  |  ------------------
  ------------------
  970|  7.82k|    init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp);
  ------------------
  |  |  961|  7.82k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.82k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.82k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.82k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.82k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.82k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.82k]
  |  |  ------------------
  ------------------
  971|  7.82k|    init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular);
  ------------------
  |  |  961|  7.82k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.82k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.82k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.82k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.82k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.82k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.82k]
  |  |  ------------------
  ------------------
  972|  7.82k|    init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth);
  ------------------
  |  |  961|  7.82k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.82k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.82k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.82k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.82k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.82k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.82k]
  |  |  ------------------
  ------------------
  973|  7.82k|    init_mc_fns(FILTER_2D_8TAP_SHARP,          8tap_sharp);
  ------------------
  |  |  961|  7.82k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.82k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.82k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.82k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.82k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.82k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.82k]
  |  |  ------------------
  ------------------
  974|  7.82k|    init_mc_fns(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular);
  ------------------
  |  |  961|  7.82k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.82k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.82k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.82k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.82k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.82k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.82k]
  |  |  ------------------
  ------------------
  975|  7.82k|    init_mc_fns(FILTER_2D_8TAP_SMOOTH,         8tap_smooth);
  ------------------
  |  |  961|  7.82k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.82k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.82k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.82k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.82k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.82k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.82k]
  |  |  ------------------
  ------------------
  976|  7.82k|    init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp);
  ------------------
  |  |  961|  7.82k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.82k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.82k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.82k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.82k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.82k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.82k]
  |  |  ------------------
  ------------------
  977|  7.82k|    init_mc_fns(FILTER_2D_BILINEAR,            bilin);
  ------------------
  |  |  961|  7.82k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.82k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.82k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.82k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.82k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.82k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.82k]
  |  |  ------------------
  ------------------
  978|       |
  979|  7.82k|    c->avg      = avg_c;
  980|  7.82k|    c->w_avg    = w_avg_c;
  981|  7.82k|    c->mask     = mask_c;
  982|  7.82k|    c->blend    = blend_c;
  983|  7.82k|    c->blend_v  = blend_v_c;
  984|  7.82k|    c->blend_h  = blend_h_c;
  985|  7.82k|    c->w_mask[0] = w_mask_444_c;
  986|  7.82k|    c->w_mask[1] = w_mask_422_c;
  987|  7.82k|    c->w_mask[2] = w_mask_420_c;
  988|  7.82k|    c->warp8x8  = warp_affine_8x8_c;
  989|  7.82k|    c->warp8x8t = warp_affine_8x8t_c;
  990|  7.82k|    c->emu_edge = emu_edge_c;
  991|  7.82k|    c->resize   = resize_c;
  992|       |
  993|  7.82k|#if HAVE_ASM
  994|       |#if ARCH_AARCH64 || ARCH_ARM
  995|       |    mc_dsp_init_arm(c);
  996|       |#elif ARCH_LOONGARCH64
  997|       |    mc_dsp_init_loongarch(c);
  998|       |#elif ARCH_PPC64LE
  999|       |    mc_dsp_init_ppc(c);
 1000|       |#elif ARCH_RISCV
 1001|       |    mc_dsp_init_riscv(c);
 1002|       |#elif ARCH_X86
 1003|       |    mc_dsp_init_x86(c);
 1004|  7.82k|#endif
 1005|  7.82k|#endif
 1006|  7.82k|}
dav1d_mc_dsp_init_16bpc:
  960|  7.63k|COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
  961|  7.63k|#define init_mc_fns(type, name) do { \
  962|  7.63k|    c->mc        [type] = put_##name##_c; \
  963|  7.63k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  964|  7.63k|    c->mct       [type] = prep_##name##_c; \
  965|  7.63k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  966|  7.63k|} while (0)
  967|       |
  968|  7.63k|    init_mc_fns(FILTER_2D_8TAP_REGULAR,        8tap_regular);
  ------------------
  |  |  961|  7.63k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.63k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.63k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.63k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.63k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.63k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.63k]
  |  |  ------------------
  ------------------
  969|  7.63k|    init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth);
  ------------------
  |  |  961|  7.63k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.63k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.63k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.63k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.63k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.63k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.63k]
  |  |  ------------------
  ------------------
  970|  7.63k|    init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp);
  ------------------
  |  |  961|  7.63k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.63k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.63k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.63k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.63k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.63k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.63k]
  |  |  ------------------
  ------------------
  971|  7.63k|    init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular);
  ------------------
  |  |  961|  7.63k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.63k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.63k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.63k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.63k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.63k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.63k]
  |  |  ------------------
  ------------------
  972|  7.63k|    init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth);
  ------------------
  |  |  961|  7.63k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.63k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.63k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.63k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.63k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.63k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.63k]
  |  |  ------------------
  ------------------
  973|  7.63k|    init_mc_fns(FILTER_2D_8TAP_SHARP,          8tap_sharp);
  ------------------
  |  |  961|  7.63k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.63k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.63k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.63k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.63k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.63k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.63k]
  |  |  ------------------
  ------------------
  974|  7.63k|    init_mc_fns(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular);
  ------------------
  |  |  961|  7.63k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.63k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.63k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.63k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.63k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.63k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.63k]
  |  |  ------------------
  ------------------
  975|  7.63k|    init_mc_fns(FILTER_2D_8TAP_SMOOTH,         8tap_smooth);
  ------------------
  |  |  961|  7.63k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.63k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.63k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.63k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.63k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.63k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.63k]
  |  |  ------------------
  ------------------
  976|  7.63k|    init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp);
  ------------------
  |  |  961|  7.63k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.63k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.63k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.63k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.63k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.63k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.63k]
  |  |  ------------------
  ------------------
  977|  7.63k|    init_mc_fns(FILTER_2D_BILINEAR,            bilin);
  ------------------
  |  |  961|  7.63k|#define init_mc_fns(type, name) do { \
  |  |  962|  7.63k|    c->mc        [type] = put_##name##_c; \
  |  |  963|  7.63k|    c->mc_scaled [type] = put_##name##_scaled_c; \
  |  |  964|  7.63k|    c->mct       [type] = prep_##name##_c; \
  |  |  965|  7.63k|    c->mct_scaled[type] = prep_##name##_scaled_c; \
  |  |  966|  7.63k|} while (0)
  |  |  ------------------
  |  |  |  Branch (966:10): [Folded, False: 7.63k]
  |  |  ------------------
  ------------------
  978|       |
  979|  7.63k|    c->avg      = avg_c;
  980|  7.63k|    c->w_avg    = w_avg_c;
  981|  7.63k|    c->mask     = mask_c;
  982|  7.63k|    c->blend    = blend_c;
  983|  7.63k|    c->blend_v  = blend_v_c;
  984|  7.63k|    c->blend_h  = blend_h_c;
  985|  7.63k|    c->w_mask[0] = w_mask_444_c;
  986|  7.63k|    c->w_mask[1] = w_mask_422_c;
  987|  7.63k|    c->w_mask[2] = w_mask_420_c;
  988|  7.63k|    c->warp8x8  = warp_affine_8x8_c;
  989|  7.63k|    c->warp8x8t = warp_affine_8x8t_c;
  990|  7.63k|    c->emu_edge = emu_edge_c;
  991|  7.63k|    c->resize   = resize_c;
  992|       |
  993|  7.63k|#if HAVE_ASM
  994|       |#if ARCH_AARCH64 || ARCH_ARM
  995|       |    mc_dsp_init_arm(c);
  996|       |#elif ARCH_LOONGARCH64
  997|       |    mc_dsp_init_loongarch(c);
  998|       |#elif ARCH_PPC64LE
  999|       |    mc_dsp_init_ppc(c);
 1000|       |#elif ARCH_RISCV
 1001|       |    mc_dsp_init_riscv(c);
 1002|       |#elif ARCH_X86
 1003|       |    mc_dsp_init_x86(c);
 1004|  7.63k|#endif
 1005|  7.63k|#endif
 1006|  7.63k|}

dav1d_mem_pool_push:
  224|   107k|void dav1d_mem_pool_push(Dav1dMemPool *const pool, Dav1dMemPoolBuffer *const buf) {
  225|   107k|    pthread_mutex_lock(&pool->lock);
  226|   107k|    const int ref_cnt = --pool->ref_cnt;
  227|   107k|    if (!pool->end) {
  ------------------
  |  Branch (227:9): [True: 107k, False: 0]
  ------------------
  228|   107k|        buf->next = pool->buf;
  229|   107k|        pool->buf = buf;
  230|   107k|        pthread_mutex_unlock(&pool->lock);
  231|   107k|        assert(ref_cnt > 0);
  ------------------
  |  |  140|   107k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 107k]
  |  |  |  Branch (140:68): [Folded, False: 107k]
  |  |  ------------------
  ------------------
  232|   107k|    } else {
  233|      0|        pthread_mutex_unlock(&pool->lock);
  234|      0|        dav1d_free_aligned(buf->data);
  ------------------
  |  |  136|      0|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
  235|      0|        if (!ref_cnt) mem_pool_destroy(pool);
  ------------------
  |  Branch (235:13): [True: 0, False: 0]
  ------------------
  236|      0|    }
  237|   107k|}
dav1d_mem_pool_pop:
  239|   107k|Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *const pool, const size_t size) {
  240|   107k|    assert(!(size & (sizeof(void*) - 1)));
  ------------------
  |  |  140|   107k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 107k]
  |  |  |  Branch (140:68): [Folded, False: 107k]
  |  |  ------------------
  ------------------
  241|   107k|    pthread_mutex_lock(&pool->lock);
  242|   107k|    Dav1dMemPoolBuffer *buf = pool->buf;
  243|   107k|    pool->ref_cnt++;
  244|   107k|    uint8_t *data;
  245|   107k|    if (buf) {
  ------------------
  |  Branch (245:9): [True: 3.78k, False: 104k]
  ------------------
  246|  3.78k|        pool->buf = buf->next;
  247|  3.78k|        pthread_mutex_unlock(&pool->lock);
  248|  3.78k|        data = buf->data;
  249|  3.78k|        if ((uintptr_t)buf - (uintptr_t)data != size) {
  ------------------
  |  Branch (249:13): [True: 158, False: 3.62k]
  ------------------
  250|       |            /* Reallocate if the size has changed */
  251|    158|            dav1d_free_aligned(data);
  ------------------
  |  |  136|    158|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
  252|    158|            goto alloc;
  253|    158|        }
  254|       |#if TRACK_HEAP_ALLOCATIONS
  255|       |        dav1d_track_reuse(pool->type);
  256|       |#endif
  257|   104k|    } else {
  258|   104k|        pthread_mutex_unlock(&pool->lock);
  259|   104k|alloc:
  260|   104k|        data = dav1d_alloc_aligned(pool->type,
  ------------------
  |  |  134|   104k|#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
  ------------------
  261|   104k|                                   size + sizeof(Dav1dMemPoolBuffer), 64);
  262|   104k|        if (!data) {
  ------------------
  |  Branch (262:13): [True: 0, False: 104k]
  ------------------
  263|      0|            pthread_mutex_lock(&pool->lock);
  264|      0|            const int ref_cnt = --pool->ref_cnt;
  265|      0|            pthread_mutex_unlock(&pool->lock);
  266|      0|            if (!ref_cnt) mem_pool_destroy(pool);
  ------------------
  |  Branch (266:17): [True: 0, False: 0]
  ------------------
  267|      0|            return NULL;
  268|      0|        }
  269|   104k|        buf = (Dav1dMemPoolBuffer*)(data + size);
  270|   104k|        buf->data = data;
  271|   104k|    }
  272|       |
  273|   107k|    return buf;
  274|   107k|}
dav1d_mem_pool_init:
  278|   120k|{
  279|   120k|    Dav1dMemPool *const pool = dav1d_malloc(ALLOC_COMMON_CTX,
  ------------------
  |  |  132|   120k|#define dav1d_malloc(type, sz) malloc(sz)
  ------------------
  280|   120k|                                            sizeof(Dav1dMemPool));
  281|   120k|    if (pool) {
  ------------------
  |  Branch (281:9): [True: 120k, False: 0]
  ------------------
  282|   120k|        if (!pthread_mutex_init(&pool->lock, NULL)) {
  ------------------
  |  Branch (282:13): [True: 120k, False: 0]
  ------------------
  283|   120k|            pool->buf = NULL;
  284|   120k|            pool->ref_cnt = 1;
  285|   120k|            pool->end = 0;
  286|       |#if TRACK_HEAP_ALLOCATIONS
  287|       |            pool->type = type;
  288|       |#endif
  289|   120k|            *ppool = pool;
  290|   120k|            return 0;
  291|   120k|        }
  292|      0|        dav1d_free(pool);
  ------------------
  |  |  135|      0|#define dav1d_free(ptr) free(ptr)
  ------------------
  293|      0|    }
  294|      0|    *ppool = NULL;
  295|      0|    return DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  296|   120k|}
dav1d_mem_pool_end:
  298|   120k|COLD void dav1d_mem_pool_end(Dav1dMemPool *const pool) {
  299|   120k|    if (pool) {
  ------------------
  |  Branch (299:9): [True: 120k, False: 0]
  ------------------
  300|   120k|        pthread_mutex_lock(&pool->lock);
  301|   120k|        Dav1dMemPoolBuffer *buf = pool->buf;
  302|   120k|        const int ref_cnt = --pool->ref_cnt;
  303|   120k|        pool->buf = NULL;
  304|   120k|        pool->end = 1;
  305|   120k|        pthread_mutex_unlock(&pool->lock);
  306|       |
  307|   224k|        while (buf) {
  ------------------
  |  Branch (307:16): [True: 104k, False: 120k]
  ------------------
  308|   104k|            void *const data = buf->data;
  309|   104k|            buf = buf->next;
  310|   104k|            dav1d_free_aligned(data);
  ------------------
  |  |  136|   104k|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
  311|   104k|        }
  312|   120k|        if (!ref_cnt) mem_pool_destroy(pool);
  ------------------
  |  Branch (312:13): [True: 120k, False: 0]
  ------------------
  313|   120k|    }
  314|   120k|}
mem.c:mem_pool_destroy:
  219|   120k|static COLD void mem_pool_destroy(Dav1dMemPool *const pool) {
  220|   120k|    pthread_mutex_destroy(&pool->lock);
  221|   120k|    dav1d_free(pool);
  ------------------
  |  |  135|   120k|#define dav1d_free(ptr) free(ptr)
  ------------------
  222|   120k|}

lib.c:dav1d_alloc_aligned_internal:
   89|  51.6k|static inline void *dav1d_alloc_aligned_internal(const size_t sz, const size_t align) {
   90|  51.6k|    assert(!(align & (align - 1)));
  ------------------
  |  |  140|  51.6k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 51.6k]
  |  |  |  Branch (140:68): [Folded, False: 51.6k]
  |  |  ------------------
  ------------------
   91|       |#ifdef _WIN32
   92|       |    return _aligned_malloc(sz, align);
   93|       |#elif HAVE_POSIX_MEMALIGN
   94|  51.6k|    void *ptr;
   95|  51.6k|    if (posix_memalign(&ptr, align, sz)) return NULL;
  ------------------
  |  Branch (95:9): [True: 0, False: 51.6k]
  ------------------
   96|  51.6k|    return ptr;
   97|       |#elif HAVE_MEMALIGN
   98|       |    return memalign(align, sz);
   99|       |#elif HAVE_ALIGNED_ALLOC
  100|       |    // The C11 standard specifies that the size parameter
  101|       |    // must be an integral multiple of alignment.
  102|       |    return aligned_alloc(align, ROUND_UP(sz, align));
  103|       |#else
  104|       |    void *const buf = malloc(sz + align + sizeof(void *));
  105|       |    if (!buf) return NULL;
  106|       |
  107|       |    void *const ptr = (void *)(((uintptr_t)buf + sizeof(void *) + align - 1) & ~(align - 1));
  108|       |    ((void **)ptr)[-1] = buf;
  109|       |    return ptr;
  110|       |#endif
  111|  51.6k|}
lib.c:dav1d_free_aligned_internal:
  113|   137k|static inline void dav1d_free_aligned_internal(void *ptr) {
  114|       |#ifdef _WIN32
  115|       |    _aligned_free(ptr);
  116|       |#elif HAVE_POSIX_MEMALIGN || HAVE_MEMALIGN || HAVE_ALIGNED_ALLOC
  117|       |    free(ptr);
  118|       |#else
  119|       |    if (ptr) free(((void **)ptr)[-1]);
  120|       |#endif
  121|   137k|}
lib.c:dav1d_freep_aligned:
  144|  17.2k|static inline void dav1d_freep_aligned(void *ptr) {
  145|  17.2k|    void **mem = (void **) ptr;
  146|  17.2k|    if (*mem) {
  ------------------
  |  Branch (146:9): [True: 17.2k, False: 0]
  ------------------
  147|  17.2k|        dav1d_free_aligned(*mem);
  ------------------
  |  |  136|  17.2k|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
  148|       |        *mem = NULL;
  149|  17.2k|    }
  150|  17.2k|}
mem.c:dav1d_free_aligned_internal:
  113|   104k|static inline void dav1d_free_aligned_internal(void *ptr) {
  114|       |#ifdef _WIN32
  115|       |    _aligned_free(ptr);
  116|       |#elif HAVE_POSIX_MEMALIGN || HAVE_MEMALIGN || HAVE_ALIGNED_ALLOC
  117|       |    free(ptr);
  118|       |#else
  119|       |    if (ptr) free(((void **)ptr)[-1]);
  120|       |#endif
  121|   104k|}
mem.c:dav1d_alloc_aligned_internal:
   89|   104k|static inline void *dav1d_alloc_aligned_internal(const size_t sz, const size_t align) {
   90|   104k|    assert(!(align & (align - 1)));
  ------------------
  |  |  140|   104k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 104k]
  |  |  |  Branch (140:68): [Folded, False: 104k]
  |  |  ------------------
  ------------------
   91|       |#ifdef _WIN32
   92|       |    return _aligned_malloc(sz, align);
   93|       |#elif HAVE_POSIX_MEMALIGN
   94|   104k|    void *ptr;
   95|   104k|    if (posix_memalign(&ptr, align, sz)) return NULL;
  ------------------
  |  Branch (95:9): [True: 0, False: 104k]
  ------------------
   96|   104k|    return ptr;
   97|       |#elif HAVE_MEMALIGN
   98|       |    return memalign(align, sz);
   99|       |#elif HAVE_ALIGNED_ALLOC
  100|       |    // The C11 standard specifies that the size parameter
  101|       |    // must be an integral multiple of alignment.
  102|       |    return aligned_alloc(align, ROUND_UP(sz, align));
  103|       |#else
  104|       |    void *const buf = malloc(sz + align + sizeof(void *));
  105|       |    if (!buf) return NULL;
  106|       |
  107|       |    void *const ptr = (void *)(((uintptr_t)buf + sizeof(void *) + align - 1) & ~(align - 1));
  108|       |    ((void **)ptr)[-1] = buf;
  109|       |    return ptr;
  110|       |#endif
  111|   104k|}
ref.c:dav1d_alloc_aligned_internal:
   89|     27|static inline void *dav1d_alloc_aligned_internal(const size_t sz, const size_t align) {
   90|     27|    assert(!(align & (align - 1)));
  ------------------
  |  |  140|     27|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 27]
  |  |  |  Branch (140:68): [Folded, False: 27]
  |  |  ------------------
  ------------------
   91|       |#ifdef _WIN32
   92|       |    return _aligned_malloc(sz, align);
   93|       |#elif HAVE_POSIX_MEMALIGN
   94|     27|    void *ptr;
   95|     27|    if (posix_memalign(&ptr, align, sz)) return NULL;
  ------------------
  |  Branch (95:9): [True: 0, False: 27]
  ------------------
   96|     27|    return ptr;
   97|       |#elif HAVE_MEMALIGN
   98|       |    return memalign(align, sz);
   99|       |#elif HAVE_ALIGNED_ALLOC
  100|       |    // The C11 standard specifies that the size parameter
  101|       |    // must be an integral multiple of alignment.
  102|       |    return aligned_alloc(align, ROUND_UP(sz, align));
  103|       |#else
  104|       |    void *const buf = malloc(sz + align + sizeof(void *));
  105|       |    if (!buf) return NULL;
  106|       |
  107|       |    void *const ptr = (void *)(((uintptr_t)buf + sizeof(void *) + align - 1) & ~(align - 1));
  108|       |    ((void **)ptr)[-1] = buf;
  109|       |    return ptr;
  110|       |#endif
  111|     27|}
ref.c:dav1d_free_aligned_internal:
  113|     27|static inline void dav1d_free_aligned_internal(void *ptr) {
  114|       |#ifdef _WIN32
  115|       |    _aligned_free(ptr);
  116|       |#elif HAVE_POSIX_MEMALIGN || HAVE_MEMALIGN || HAVE_ALIGNED_ALLOC
  117|       |    free(ptr);
  118|       |#else
  119|       |    if (ptr) free(((void **)ptr)[-1]);
  120|       |#endif
  121|     27|}
refmvs.c:dav1d_free_aligned_internal:
  113|  6.45k|static inline void dav1d_free_aligned_internal(void *ptr) {
  114|       |#ifdef _WIN32
  115|       |    _aligned_free(ptr);
  116|       |#elif HAVE_POSIX_MEMALIGN || HAVE_MEMALIGN || HAVE_ALIGNED_ALLOC
  117|       |    free(ptr);
  118|       |#else
  119|       |    if (ptr) free(((void **)ptr)[-1]);
  120|       |#endif
  121|  6.45k|}
refmvs.c:dav1d_alloc_aligned_internal:
   89|  6.45k|static inline void *dav1d_alloc_aligned_internal(const size_t sz, const size_t align) {
   90|  6.45k|    assert(!(align & (align - 1)));
  ------------------
  |  |  140|  6.45k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 6.45k]
  |  |  |  Branch (140:68): [Folded, False: 6.45k]
  |  |  ------------------
  ------------------
   91|       |#ifdef _WIN32
   92|       |    return _aligned_malloc(sz, align);
   93|       |#elif HAVE_POSIX_MEMALIGN
   94|  6.45k|    void *ptr;
   95|  6.45k|    if (posix_memalign(&ptr, align, sz)) return NULL;
  ------------------
  |  Branch (95:9): [True: 0, False: 6.45k]
  ------------------
   96|  6.45k|    return ptr;
   97|       |#elif HAVE_MEMALIGN
   98|       |    return memalign(align, sz);
   99|       |#elif HAVE_ALIGNED_ALLOC
  100|       |    // The C11 standard specifies that the size parameter
  101|       |    // must be an integral multiple of alignment.
  102|       |    return aligned_alloc(align, ROUND_UP(sz, align));
  103|       |#else
  104|       |    void *const buf = malloc(sz + align + sizeof(void *));
  105|       |    if (!buf) return NULL;
  106|       |
  107|       |    void *const ptr = (void *)(((uintptr_t)buf + sizeof(void *) + align - 1) & ~(align - 1));
  108|       |    ((void **)ptr)[-1] = buf;
  109|       |    return ptr;
  110|       |#endif
  111|  6.45k|}
decode.c:dav1d_free_aligned_internal:
  113|  63.1k|static inline void dav1d_free_aligned_internal(void *ptr) {
  114|       |#ifdef _WIN32
  115|       |    _aligned_free(ptr);
  116|       |#elif HAVE_POSIX_MEMALIGN || HAVE_MEMALIGN || HAVE_ALIGNED_ALLOC
  117|       |    free(ptr);
  118|       |#else
  119|       |    if (ptr) free(((void **)ptr)[-1]);
  120|       |#endif
  121|  63.1k|}
decode.c:dav1d_alloc_aligned_internal:
   89|  63.1k|static inline void *dav1d_alloc_aligned_internal(const size_t sz, const size_t align) {
   90|  63.1k|    assert(!(align & (align - 1)));
  ------------------
  |  |  140|  63.1k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 63.1k]
  |  |  |  Branch (140:68): [Folded, False: 63.1k]
  |  |  ------------------
  ------------------
   91|       |#ifdef _WIN32
   92|       |    return _aligned_malloc(sz, align);
   93|       |#elif HAVE_POSIX_MEMALIGN
   94|  63.1k|    void *ptr;
   95|  63.1k|    if (posix_memalign(&ptr, align, sz)) return NULL;
  ------------------
  |  Branch (95:9): [True: 0, False: 63.1k]
  ------------------
   96|  63.1k|    return ptr;
   97|       |#elif HAVE_MEMALIGN
   98|       |    return memalign(align, sz);
   99|       |#elif HAVE_ALIGNED_ALLOC
  100|       |    // The C11 standard specifies that the size parameter
  101|       |    // must be an integral multiple of alignment.
  102|       |    return aligned_alloc(align, ROUND_UP(sz, align));
  103|       |#else
  104|       |    void *const buf = malloc(sz + align + sizeof(void *));
  105|       |    if (!buf) return NULL;
  106|       |
  107|       |    void *const ptr = (void *)(((uintptr_t)buf + sizeof(void *) + align - 1) & ~(align - 1));
  108|       |    ((void **)ptr)[-1] = buf;
  109|       |    return ptr;
  110|       |#endif
  111|  63.1k|}

dav1d_msac_decode_subexp:
   62|   201k|{
   63|   201k|    assert(n >> k == 8);
  ------------------
  |  |  140|   201k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 201k]
  |  |  |  Branch (140:68): [Folded, False: 201k]
  |  |  ------------------
  ------------------
   64|       |
   65|   201k|    unsigned a = 0;
   66|   201k|    if (dav1d_msac_decode_bool_equi(s)) {
  ------------------
  |  |   53|   201k|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
  |  Branch (66:9): [True: 106k, False: 95.1k]
  ------------------
   67|   106k|        if (dav1d_msac_decode_bool_equi(s))
  ------------------
  |  |   53|   106k|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
  |  Branch (67:13): [True: 59.4k, False: 46.6k]
  ------------------
   68|  59.4k|            k += dav1d_msac_decode_bool_equi(s) + 1;
  ------------------
  |  |   53|  59.4k|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
   69|   106k|        a = 1 << k;
   70|   106k|    }
   71|   201k|    const unsigned v = dav1d_msac_decode_bools(s, k) + a;
   72|   201k|    return ref * 2 <= n ? inv_recenter(ref, v) :
  ------------------
  |  Branch (72:12): [True: 112k, False: 88.3k]
  ------------------
   73|   201k|                          n - 1 - inv_recenter(n - 1 - ref, v);
   74|   201k|}
dav1d_msac_init:
  206|  32.2k|{
  207|  32.2k|    s->buf_pos = data;
  208|  32.2k|    s->buf_end = data + sz;
  209|  32.2k|    s->dif = 0;
  210|  32.2k|    s->rng = 0x8000;
  211|  32.2k|    s->cnt = -15;
  212|  32.2k|    s->allow_update_cdf = !disable_cdf_update_flag;
  213|  32.2k|    ctx_refill(s);
  214|       |
  215|  32.2k|#if ARCH_X86_64 && HAVE_ASM
  216|  32.2k|    s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
  217|       |
  218|  32.2k|    msac_init_x86(s);
  219|  32.2k|#endif
  220|  32.2k|}
msac.c:ctx_refill:
   41|  32.2k|static inline void ctx_refill(MsacContext *const s) {
   42|  32.2k|    const uint8_t *buf_pos = s->buf_pos;
   43|  32.2k|    const uint8_t *buf_end = s->buf_end;
   44|  32.2k|    int c = EC_WIN_SIZE - s->cnt - 24;
  ------------------
  |  |   39|  32.2k|#define EC_WIN_SIZE (sizeof(ec_win) << 3)
  ------------------
   45|  32.2k|    ec_win dif = s->dif;
   46|   206k|    do {
   47|   206k|        if (buf_pos >= buf_end) {
  ------------------
  |  Branch (47:13): [True: 5.98k, False: 200k]
  ------------------
   48|       |            // set remaining bits to 1;
   49|  5.98k|            dif |= ~(~(ec_win)0xff << c);
   50|  5.98k|            break;
   51|  5.98k|        }
   52|   200k|        dif |= (ec_win)(*buf_pos++ ^ 0xff) << c;
   53|   200k|        c -= 8;
   54|   200k|    } while (c >= 0);
  ------------------
  |  Branch (54:14): [True: 173k, False: 26.2k]
  ------------------
   55|  32.2k|    s->dif = dif;
   56|  32.2k|    s->cnt = EC_WIN_SIZE - c - 24;
  ------------------
  |  |   39|  32.2k|#define EC_WIN_SIZE (sizeof(ec_win) << 3)
  ------------------
   57|  32.2k|    s->buf_pos = buf_pos;
   58|  32.2k|}

decode.c:dav1d_msac_decode_bools:
   94|   369k|static inline unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) {
   95|   369k|    unsigned v = 0;
   96|   928k|    while (n--)
  ------------------
  |  Branch (96:12): [True: 559k, False: 369k]
  ------------------
   97|   559k|        v = (v << 1) | dav1d_msac_decode_bool_equi(s);
  ------------------
  |  |   53|   559k|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
   98|   369k|    return v;
   99|   369k|}
decode.c:dav1d_msac_decode_uniform:
  101|  31.2k|static inline int dav1d_msac_decode_uniform(MsacContext *const s, const unsigned n) {
  102|  31.2k|    assert(n > 0);
  ------------------
  |  |  140|  31.2k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 31.2k]
  |  |  |  Branch (140:68): [Folded, False: 31.2k]
  |  |  ------------------
  ------------------
  103|  31.2k|    const int l = ulog2(n) + 1;
  104|  31.2k|    assert(l > 1);
  ------------------
  |  |  140|  31.2k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 31.2k]
  |  |  |  Branch (140:68): [Folded, False: 31.2k]
  |  |  ------------------
  ------------------
  105|  31.2k|    const unsigned m = (1 << l) - n;
  106|  31.2k|    const unsigned v = dav1d_msac_decode_bools(s, l - 1);
  107|  31.2k|    return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(s);
  ------------------
  |  |   53|  6.99k|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
  |  Branch (107:12): [True: 24.2k, False: 6.99k]
  ------------------
  108|  31.2k|}
msac.c:dav1d_msac_decode_bools:
   94|   201k|static inline unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) {
   95|   201k|    unsigned v = 0;
   96|   851k|    while (n--)
  ------------------
  |  Branch (96:12): [True: 650k, False: 201k]
  ------------------
   97|   650k|        v = (v << 1) | dav1d_msac_decode_bool_equi(s);
  ------------------
  |  |   53|   650k|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
   98|   201k|    return v;
   99|   201k|}
recon_tmpl.c:dav1d_msac_decode_bools:
   94|  1.92M|static inline unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) {
   95|  1.92M|    unsigned v = 0;
   96|  6.46M|    while (n--)
  ------------------
  |  Branch (96:12): [True: 4.54M, False: 1.92M]
  ------------------
   97|  4.54M|        v = (v << 1) | dav1d_msac_decode_bool_equi(s);
  ------------------
  |  |   53|  4.54M|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
   98|  1.92M|    return v;
   99|  1.92M|}

dav1d_parse_obus:
 1169|  61.4k|ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
 1170|  61.4k|    GetBits gb;
 1171|  61.4k|    int res;
 1172|       |
 1173|  61.4k|    dav1d_init_get_bits(&gb, in->data, in->sz);
 1174|       |
 1175|       |    // obu header
 1176|  61.4k|    const int obu_forbidden_bit = dav1d_get_bit(&gb);
 1177|  61.4k|    if (c->strict_std_compliance && obu_forbidden_bit) goto error;
  ------------------
  |  Branch (1177:9): [True: 0, False: 61.4k]
  |  Branch (1177:37): [True: 0, False: 0]
  ------------------
 1178|  61.4k|    const enum Dav1dObuType type = dav1d_get_bits(&gb, 4);
 1179|  61.4k|    const int has_extension = dav1d_get_bit(&gb);
 1180|  61.4k|    const int has_length_field = dav1d_get_bit(&gb);
 1181|  61.4k|    dav1d_get_bit(&gb); // reserved
 1182|       |
 1183|  61.4k|    int temporal_id = 0, spatial_id = 0;
 1184|  61.4k|    if (has_extension) {
  ------------------
  |  Branch (1184:9): [True: 6.79k, False: 54.6k]
  ------------------
 1185|  6.79k|        temporal_id = dav1d_get_bits(&gb, 3);
 1186|  6.79k|        spatial_id = dav1d_get_bits(&gb, 2);
 1187|  6.79k|        dav1d_get_bits(&gb, 3); // reserved
 1188|  6.79k|    }
 1189|       |
 1190|  61.4k|    if (has_length_field) {
  ------------------
  |  Branch (1190:9): [True: 54.7k, False: 6.70k]
  ------------------
 1191|  54.7k|        const size_t len = dav1d_get_uleb128(&gb);
 1192|  54.7k|        if (len > (size_t)(gb.ptr_end - gb.ptr)) goto error;
  ------------------
  |  Branch (1192:13): [True: 580, False: 54.1k]
  ------------------
 1193|  54.1k|        gb.ptr_end = gb.ptr + len;
 1194|  54.1k|    }
 1195|  60.8k|    if (gb.error) goto error;
  ------------------
  |  Branch (1195:9): [True: 179, False: 60.6k]
  ------------------
 1196|       |
 1197|       |    // We must have read a whole number of bytes at this point (1 byte
 1198|       |    // for the header and whole bytes at a time when reading the
 1199|       |    // leb128 length field).
 1200|  60.6k|    assert(gb.bits_left == 0);
  ------------------
  |  |  140|  60.6k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 60.6k]
  |  |  |  Branch (140:68): [Folded, False: 60.6k]
  |  |  ------------------
  ------------------
 1201|       |
 1202|       |    // skip obu not belonging to the selected temporal/spatial layer
 1203|  60.6k|    if (type != DAV1D_OBU_SEQ_HDR && type != DAV1D_OBU_TD &&
  ------------------
  |  Branch (1203:9): [True: 42.9k, False: 17.7k]
  |  Branch (1203:38): [True: 29.0k, False: 13.9k]
  ------------------
 1204|  29.0k|        has_extension && c->operating_point_idc != 0)
  ------------------
  |  Branch (1204:9): [True: 5.94k, False: 23.0k]
  |  Branch (1204:26): [True: 1.37k, False: 4.57k]
  ------------------
 1205|  1.37k|    {
 1206|  1.37k|        const int in_temporal_layer = (c->operating_point_idc >> temporal_id) & 1;
 1207|  1.37k|        const int in_spatial_layer = (c->operating_point_idc >> (spatial_id + 8)) & 1;
 1208|  1.37k|        if (!in_temporal_layer || !in_spatial_layer)
  ------------------
  |  Branch (1208:13): [True: 275, False: 1.09k]
  |  Branch (1208:35): [True: 90, False: 1.00k]
  ------------------
 1209|    365|            return gb.ptr_end - gb.ptr_start;
 1210|  1.37k|    }
 1211|       |
 1212|  60.3k|    switch (type) {
 1213|  17.7k|    case DAV1D_OBU_SEQ_HDR: {
  ------------------
  |  Branch (1213:5): [True: 17.7k, False: 42.5k]
  ------------------
 1214|  17.7k|        Dav1dRef *ref = dav1d_ref_create_using_pool(c->seq_hdr_pool,
 1215|  17.7k|                                                    sizeof(Dav1dSequenceHeader));
 1216|  17.7k|        if (!ref) return DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  |  Branch (1216:13): [True: 0, False: 17.7k]
  ------------------
 1217|  17.7k|        Dav1dSequenceHeader *seq_hdr = ref->data;
 1218|  17.7k|        if ((res = parse_seq_hdr(seq_hdr, &gb, c->strict_std_compliance)) < 0) {
  ------------------
  |  Branch (1218:13): [True: 900, False: 16.8k]
  ------------------
 1219|    900|            dav1d_log(c, "Error parsing sequence header\n");
  ------------------
  |  |   39|    900|#define dav1d_log dav1d_log
  ------------------
 1220|    900|            dav1d_ref_dec(&ref);
 1221|    900|            goto error;
 1222|    900|        }
 1223|       |
 1224|  16.8k|        const int op_idx =
 1225|  16.8k|            c->operating_point < seq_hdr->num_operating_points ? c->operating_point : 0;
  ------------------
  |  Branch (1225:13): [True: 16.8k, False: 0]
  ------------------
 1226|  16.8k|        c->operating_point_idc = seq_hdr->operating_points[op_idx].idc;
 1227|  16.8k|        const unsigned spatial_mask = c->operating_point_idc >> 8;
 1228|  16.8k|        c->max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0;
  ------------------
  |  Branch (1228:29): [True: 3.36k, False: 13.4k]
  ------------------
 1229|       |
 1230|       |        // If we have read a sequence header which is different from
 1231|       |        // the old one, this is a new video sequence and can't use any
 1232|       |        // previous state. Free that state.
 1233|       |
 1234|  16.8k|        if (!c->seq_hdr) {
  ------------------
  |  Branch (1234:13): [True: 15.9k, False: 915]
  ------------------
 1235|  15.9k|            c->frame_hdr = NULL;
 1236|  15.9k|            c->frame_flags |= PICTURE_FLAG_NEW_SEQUENCE;
 1237|       |        // see 7.5, operating_parameter_info is allowed to change in
 1238|       |        // sequence headers of a single sequence
 1239|  15.9k|        } else if (memcmp(seq_hdr, c->seq_hdr, offsetof(Dav1dSequenceHeader, operating_parameter_info))) {
  ------------------
  |  Branch (1239:20): [True: 471, False: 444]
  ------------------
 1240|    471|            c->frame_hdr = NULL;
 1241|    471|            c->mastering_display = NULL;
 1242|    471|            c->content_light = NULL;
 1243|    471|            dav1d_ref_dec(&c->mastering_display_ref);
 1244|    471|            dav1d_ref_dec(&c->content_light_ref);
 1245|  4.23k|            for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (1245:29): [True: 3.76k, False: 471]
  ------------------
 1246|  3.76k|                if (c->refs[i].p.p.frame_hdr)
  ------------------
  |  Branch (1246:21): [True: 3.07k, False: 694]
  ------------------
 1247|  3.07k|                    dav1d_thread_picture_unref(&c->refs[i].p);
 1248|  3.76k|                dav1d_ref_dec(&c->refs[i].segmap);
 1249|  3.76k|                dav1d_ref_dec(&c->refs[i].refmvs);
 1250|  3.76k|                dav1d_cdf_thread_unref(&c->cdf[i]);
 1251|  3.76k|            }
 1252|    471|            c->frame_flags |= PICTURE_FLAG_NEW_SEQUENCE;
 1253|       |        // If operating_parameter_info changed, signal it
 1254|    471|        } else if (memcmp(seq_hdr->operating_parameter_info, c->seq_hdr->operating_parameter_info,
  ------------------
  |  Branch (1254:20): [True: 0, False: 444]
  ------------------
 1255|    444|                          sizeof(seq_hdr->operating_parameter_info)))
 1256|      0|        {
 1257|      0|            c->frame_flags |= PICTURE_FLAG_NEW_OP_PARAMS_INFO;
 1258|      0|        }
 1259|  16.8k|        dav1d_ref_dec(&c->seq_hdr_ref);
 1260|  16.8k|        c->seq_hdr_ref = ref;
 1261|  16.8k|        c->seq_hdr = seq_hdr;
 1262|  16.8k|        break;
 1263|  17.7k|    }
 1264|    214|    case DAV1D_OBU_REDUNDANT_FRAME_HDR:
  ------------------
  |  Branch (1264:5): [True: 214, False: 60.0k]
  ------------------
 1265|    214|        if (c->frame_hdr) break;
  ------------------
  |  Branch (1265:13): [True: 35, False: 179]
  ------------------
 1266|       |        // fall-through
 1267|  22.2k|    case DAV1D_OBU_FRAME:
  ------------------
  |  Branch (1267:5): [True: 22.0k, False: 38.2k]
  ------------------
 1268|  23.1k|    case DAV1D_OBU_FRAME_HDR:
  ------------------
  |  Branch (1268:5): [True: 931, False: 59.3k]
  ------------------
 1269|  23.1k|        if (!c->seq_hdr) goto error;
  ------------------
  |  Branch (1269:13): [True: 85, False: 23.0k]
  ------------------
 1270|  23.0k|        if (!c->frame_hdr_ref) {
  ------------------
  |  Branch (1270:13): [True: 22.3k, False: 665]
  ------------------
 1271|  22.3k|            c->frame_hdr_ref = dav1d_ref_create_using_pool(c->frame_hdr_pool,
 1272|  22.3k|                                                           sizeof(Dav1dFrameHeader));
 1273|  22.3k|            if (!c->frame_hdr_ref) return DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  |  Branch (1273:17): [True: 0, False: 22.3k]
  ------------------
 1274|  22.3k|        }
 1275|       |#ifndef NDEBUG
 1276|       |        // ensure that the reference is writable
 1277|       |        assert(dav1d_ref_is_writable(c->frame_hdr_ref));
 1278|       |#endif
 1279|  23.0k|        c->frame_hdr = c->frame_hdr_ref->data;
 1280|  23.0k|        memset(c->frame_hdr, 0, sizeof(*c->frame_hdr));
 1281|  23.0k|        c->frame_hdr->temporal_id = temporal_id;
 1282|  23.0k|        c->frame_hdr->spatial_id = spatial_id;
 1283|  23.0k|        if ((res = parse_frame_hdr(c, &gb)) < 0) {
  ------------------
  |  Branch (1283:13): [True: 146, False: 22.9k]
  ------------------
 1284|    146|            c->frame_hdr = NULL;
 1285|    146|            goto error;
 1286|    146|        }
 1287|  22.9k|        for (int n = 0; n < c->n_tile_data; n++)
  ------------------
  |  Branch (1287:25): [True: 18, False: 22.9k]
  ------------------
 1288|     18|            dav1d_data_unref_internal(&c->tile[n].data);
 1289|  22.9k|        c->n_tile_data = 0;
 1290|  22.9k|        c->n_tiles = 0;
 1291|  22.9k|        if (type != DAV1D_OBU_FRAME) {
  ------------------
  |  Branch (1291:13): [True: 1.03k, False: 21.8k]
  ------------------
 1292|       |            // This is actually a frame header OBU so read the
 1293|       |            // trailing bit and check for overrun.
 1294|  1.03k|            if (check_trailing_bits(&gb, c->strict_std_compliance) < 0) {
  ------------------
  |  Branch (1294:17): [True: 95, False: 944]
  ------------------
 1295|     95|                c->frame_hdr = NULL;
 1296|     95|                goto error;
 1297|     95|            }
 1298|  1.03k|        }
 1299|       |
 1300|  22.8k|        if (c->frame_size_limit && (int64_t)c->frame_hdr->width[1] *
  ------------------
  |  Branch (1300:13): [True: 22.8k, False: 0]
  |  Branch (1300:36): [True: 37, False: 22.7k]
  ------------------
 1301|  22.8k|            c->frame_hdr->height > c->frame_size_limit)
 1302|     37|        {
 1303|     37|            dav1d_log(c, "Frame size %dx%d exceeds limit %u\n", c->frame_hdr->width[1],
  ------------------
  |  |   39|     37|#define dav1d_log dav1d_log
  ------------------
 1304|     37|                      c->frame_hdr->height, c->frame_size_limit);
 1305|     37|            c->frame_hdr = NULL;
 1306|     37|            return DAV1D_ERR(ERANGE);
  ------------------
  |  |   56|     37|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
 1307|     37|        }
 1308|       |
 1309|  22.7k|        if (type != DAV1D_OBU_FRAME)
  ------------------
  |  Branch (1309:13): [True: 937, False: 21.8k]
  ------------------
 1310|    937|            break;
 1311|       |        // OBU_FRAMEs shouldn't be signaled with show_existing_frame
 1312|  21.8k|        if (c->frame_hdr->show_existing_frame) {
  ------------------
  |  Branch (1312:13): [True: 34, False: 21.8k]
  ------------------
 1313|     34|            c->frame_hdr = NULL;
 1314|     34|            goto error;
 1315|     34|        }
 1316|       |
 1317|       |        // This is the frame header at the start of a frame OBU.
 1318|       |        // There's no trailing bit at the end to skip, but we do need
 1319|       |        // to align to the next byte.
 1320|  21.8k|        dav1d_bytealign_get_bits(&gb);
 1321|       |        // fall-through
 1322|  21.8k|    case DAV1D_OBU_TILE_GRP: {
  ------------------
  |  Branch (1322:5): [True: 67, False: 60.2k]
  ------------------
 1323|  21.8k|        if (!c->frame_hdr) goto error;
  ------------------
  |  Branch (1323:13): [True: 35, False: 21.8k]
  ------------------
 1324|  21.8k|        if (c->n_tile_data_alloc < c->n_tile_data + 1) {
  ------------------
  |  Branch (1324:13): [True: 15.5k, False: 6.25k]
  ------------------
 1325|  15.5k|            if ((c->n_tile_data + 1) > INT_MAX / (int)sizeof(*c->tile)) goto error;
  ------------------
  |  Branch (1325:17): [True: 0, False: 15.5k]
  ------------------
 1326|  15.5k|            struct Dav1dTileGroup *tile = dav1d_realloc(ALLOC_TILE, c->tile,
  ------------------
  |  |  133|  15.5k|#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz)
  ------------------
 1327|  15.5k|                                                        (c->n_tile_data + 1) * sizeof(*c->tile));
 1328|  15.5k|            if (!tile) goto error;
  ------------------
  |  Branch (1328:17): [True: 0, False: 15.5k]
  ------------------
 1329|  15.5k|            c->tile = tile;
 1330|  15.5k|            memset(c->tile + c->n_tile_data, 0, sizeof(*c->tile));
 1331|  15.5k|            c->n_tile_data_alloc = c->n_tile_data + 1;
 1332|  15.5k|        }
 1333|  21.8k|        parse_tile_hdr(c, &gb);
 1334|       |        // Align to the next byte boundary and check for overrun.
 1335|  21.8k|        dav1d_bytealign_get_bits(&gb);
 1336|  21.8k|        if (gb.error) goto error;
  ------------------
  |  Branch (1336:13): [True: 257, False: 21.5k]
  ------------------
 1337|       |
 1338|  21.5k|        dav1d_data_ref(&c->tile[c->n_tile_data].data, in);
 1339|  21.5k|        c->tile[c->n_tile_data].data.data = gb.ptr;
 1340|  21.5k|        c->tile[c->n_tile_data].data.sz = (size_t)(gb.ptr_end - gb.ptr);
 1341|       |        // ensure tile groups are in order and sane, see 6.10.1
 1342|  21.5k|        if (c->tile[c->n_tile_data].start > c->tile[c->n_tile_data].end ||
  ------------------
  |  Branch (1342:13): [True: 13, False: 21.5k]
  ------------------
 1343|  21.5k|            c->tile[c->n_tile_data].start != c->n_tiles)
  ------------------
  |  Branch (1343:13): [True: 10, False: 21.5k]
  ------------------
 1344|     23|        {
 1345|     49|            for (int i = 0; i <= c->n_tile_data; i++)
  ------------------
  |  Branch (1345:29): [True: 26, False: 23]
  ------------------
 1346|     26|                dav1d_data_unref_internal(&c->tile[i].data);
 1347|     23|            c->n_tile_data = 0;
 1348|     23|            c->n_tiles = 0;
 1349|     23|            goto error;
 1350|     23|        }
 1351|  21.5k|        c->n_tiles += 1 + c->tile[c->n_tile_data].end -
 1352|  21.5k|                          c->tile[c->n_tile_data].start;
 1353|  21.5k|        c->n_tile_data++;
 1354|  21.5k|        break;
 1355|  21.5k|    }
 1356|    704|    case DAV1D_OBU_METADATA: {
  ------------------
  |  Branch (1356:5): [True: 704, False: 59.6k]
  ------------------
 1357|    704|#define DEBUG_OBU_METADATA 0
 1358|       |#if DEBUG_OBU_METADATA
 1359|       |        const uint8_t *const init_ptr = gb.ptr;
 1360|       |#endif
 1361|       |        // obu metadta type field
 1362|    704|        const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb);
 1363|    704|        if (gb.error) goto error;
  ------------------
  |  Branch (1363:13): [True: 18, False: 686]
  ------------------
 1364|       |
 1365|    686|        switch (meta_type) {
 1366|     13|        case OBU_META_HDR_CLL: {
  ------------------
  |  Branch (1366:9): [True: 13, False: 673]
  ------------------
 1367|     13|            Dav1dRef *ref = dav1d_ref_create(ALLOC_OBU_META,
  ------------------
  |  |   49|     13|#define dav1d_ref_create(type, size) dav1d_ref_create(size)
  ------------------
 1368|     13|                                             sizeof(Dav1dContentLightLevel));
 1369|     13|            if (!ref) return DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  |  Branch (1369:17): [True: 0, False: 13]
  ------------------
 1370|     13|            Dav1dContentLightLevel *const content_light = ref->data;
 1371|       |
 1372|     13|            content_light->max_content_light_level = dav1d_get_bits(&gb, 16);
 1373|       |#if DEBUG_OBU_METADATA
 1374|       |            printf("CLLOBU: max-content-light-level: %d [off=%td]\n",
 1375|       |                   content_light->max_content_light_level,
 1376|       |                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
 1377|       |#endif
 1378|     13|            content_light->max_frame_average_light_level = dav1d_get_bits(&gb, 16);
 1379|       |#if DEBUG_OBU_METADATA
 1380|       |            printf("CLLOBU: max-frame-average-light-level: %d [off=%td]\n",
 1381|       |                   content_light->max_frame_average_light_level,
 1382|       |                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
 1383|       |#endif
 1384|       |
 1385|     13|            if (check_trailing_bits(&gb, c->strict_std_compliance) < 0) {
  ------------------
  |  Branch (1385:17): [True: 3, False: 10]
  ------------------
 1386|      3|                dav1d_ref_dec(&ref);
 1387|      3|                goto error;
 1388|      3|            }
 1389|       |
 1390|     10|            dav1d_ref_dec(&c->content_light_ref);
 1391|     10|            c->content_light = content_light;
 1392|     10|            c->content_light_ref = ref;
 1393|     10|            break;
 1394|     13|        }
 1395|     14|        case OBU_META_HDR_MDCV: {
  ------------------
  |  Branch (1395:9): [True: 14, False: 672]
  ------------------
 1396|     14|            Dav1dRef *ref = dav1d_ref_create(ALLOC_OBU_META,
  ------------------
  |  |   49|     14|#define dav1d_ref_create(type, size) dav1d_ref_create(size)
  ------------------
 1397|     14|                                             sizeof(Dav1dMasteringDisplay));
 1398|     14|            if (!ref) return DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  |  Branch (1398:17): [True: 0, False: 14]
  ------------------
 1399|     14|            Dav1dMasteringDisplay *const mastering_display = ref->data;
 1400|       |
 1401|     56|            for (int i = 0; i < 3; i++) {
  ------------------
  |  Branch (1401:29): [True: 42, False: 14]
  ------------------
 1402|     42|                mastering_display->primaries[i][0] = dav1d_get_bits(&gb, 16);
 1403|     42|                mastering_display->primaries[i][1] = dav1d_get_bits(&gb, 16);
 1404|       |#if DEBUG_OBU_METADATA
 1405|       |                printf("MDCVOBU: primaries[%d]: (%d, %d) [off=%td]\n", i,
 1406|       |                       mastering_display->primaries[i][0],
 1407|       |                       mastering_display->primaries[i][1],
 1408|       |                       (gb.ptr - init_ptr) * 8 - gb.bits_left);
 1409|       |#endif
 1410|     42|            }
 1411|     14|            mastering_display->white_point[0] = dav1d_get_bits(&gb, 16);
 1412|       |#if DEBUG_OBU_METADATA
 1413|       |            printf("MDCVOBU: white-point-x: %d [off=%td]\n",
 1414|       |                   mastering_display->white_point[0],
 1415|       |                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
 1416|       |#endif
 1417|     14|            mastering_display->white_point[1] = dav1d_get_bits(&gb, 16);
 1418|       |#if DEBUG_OBU_METADATA
 1419|       |            printf("MDCVOBU: white-point-y: %d [off=%td]\n",
 1420|       |                   mastering_display->white_point[1],
 1421|       |                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
 1422|       |#endif
 1423|     14|            mastering_display->max_luminance = dav1d_get_bits(&gb, 32);
 1424|       |#if DEBUG_OBU_METADATA
 1425|       |            printf("MDCVOBU: max-luminance: %d [off=%td]\n",
 1426|       |                   mastering_display->max_luminance,
 1427|       |                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
 1428|       |#endif
 1429|     14|            mastering_display->min_luminance = dav1d_get_bits(&gb, 32);
 1430|       |#if DEBUG_OBU_METADATA
 1431|       |            printf("MDCVOBU: min-luminance: %d [off=%td]\n",
 1432|       |                   mastering_display->min_luminance,
 1433|       |                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
 1434|       |#endif
 1435|     14|            if (check_trailing_bits(&gb, c->strict_std_compliance) < 0) {
  ------------------
  |  Branch (1435:17): [True: 3, False: 11]
  ------------------
 1436|      3|                dav1d_ref_dec(&ref);
 1437|      3|                goto error;
 1438|      3|            }
 1439|       |
 1440|     11|            dav1d_ref_dec(&c->mastering_display_ref);
 1441|     11|            c->mastering_display = mastering_display;
 1442|     11|            c->mastering_display_ref = ref;
 1443|     11|            break;
 1444|     14|        }
 1445|     23|        case OBU_META_ITUT_T35: {
  ------------------
  |  Branch (1445:9): [True: 23, False: 663]
  ------------------
 1446|     23|            ptrdiff_t payload_size = gb.ptr_end - gb.ptr;
 1447|       |            // Don't take into account all the trailing bits for payload_size
 1448|     39|            while (payload_size > 0 && !gb.ptr[payload_size - 1])
  ------------------
  |  Branch (1448:20): [True: 36, False: 3]
  |  Branch (1448:40): [True: 16, False: 20]
  ------------------
 1449|     16|                payload_size--; // trailing_zero_bit x 8
 1450|     23|            payload_size--; // trailing_one_bit + trailing_zero_bit x 7
 1451|       |
 1452|     23|            int country_code_extension_byte = 0;
 1453|     23|            const int country_code = dav1d_get_bits(&gb, 8);
 1454|     23|            payload_size--;
 1455|     23|            if (country_code == 0xFF) {
  ------------------
  |  Branch (1455:17): [True: 2, False: 21]
  ------------------
 1456|      2|                country_code_extension_byte = dav1d_get_bits(&gb, 8);
 1457|      2|                payload_size--;
 1458|      2|            }
 1459|       |
 1460|     23|            if (payload_size <= 0 || gb.ptr[payload_size] != 0x80) {
  ------------------
  |  Branch (1460:17): [True: 9, False: 14]
  |  Branch (1460:38): [True: 9, False: 5]
  ------------------
 1461|     18|                dav1d_log(c, "Malformed ITU-T T.35 metadata message format\n");
  ------------------
  |  |   39|     18|#define dav1d_log dav1d_log
  ------------------
 1462|     18|                break;
 1463|     18|            }
 1464|       |
 1465|      5|            if ((c->n_itut_t35 + 1) > INT_MAX / (int)sizeof(*c->itut_t35)) goto error;
  ------------------
  |  Branch (1465:17): [True: 0, False: 5]
  ------------------
 1466|      5|            struct Dav1dITUTT35 *itut_t35 = dav1d_realloc(ALLOC_OBU_META, c->itut_t35,
  ------------------
  |  |  133|      5|#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz)
  ------------------
 1467|      5|                                                          (c->n_itut_t35 + 1) * sizeof(*c->itut_t35));
 1468|      5|            if (!itut_t35) goto error;
  ------------------
  |  Branch (1468:17): [True: 0, False: 5]
  ------------------
 1469|      5|            c->itut_t35 = itut_t35;
 1470|      5|            memset(c->itut_t35 + c->n_itut_t35, 0, sizeof(*c->itut_t35));
 1471|       |
 1472|      5|            struct itut_t35_ctx_context *itut_t35_ctx;
 1473|      5|            if (!c->n_itut_t35) {
  ------------------
  |  Branch (1473:17): [True: 5, False: 0]
  ------------------
 1474|      5|                assert(!c->itut_t35_ref);
  ------------------
  |  |  140|      5|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 5]
  |  |  |  Branch (140:68): [Folded, False: 5]
  |  |  ------------------
  ------------------
 1475|      5|                itut_t35_ctx = dav1d_malloc(ALLOC_OBU_META, sizeof(struct itut_t35_ctx_context));
  ------------------
  |  |  132|      5|#define dav1d_malloc(type, sz) malloc(sz)
  ------------------
 1476|      5|                if (!itut_t35_ctx) goto error;
  ------------------
  |  Branch (1476:21): [True: 0, False: 5]
  ------------------
 1477|      5|                c->itut_t35_ref = dav1d_ref_init(&itut_t35_ctx->ref, c->itut_t35,
 1478|      5|                                                 dav1d_picture_free_itut_t35, itut_t35_ctx, 0);
 1479|      5|            } else {
 1480|      0|                assert(c->itut_t35_ref && atomic_load(&c->itut_t35_ref->ref_cnt) == 1);
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 0, False: 0]
  |  |  |  Branch (140:30): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1481|      0|                itut_t35_ctx = c->itut_t35_ref->user_data;
 1482|      0|                c->itut_t35_ref->const_data = (uint8_t *)c->itut_t35;
 1483|      0|            }
 1484|      5|            itut_t35_ctx->itut_t35 = c->itut_t35;
 1485|      5|            itut_t35_ctx->n_itut_t35 = c->n_itut_t35 + 1;
 1486|       |
 1487|      5|            Dav1dITUTT35 *const itut_t35_metadata = &c->itut_t35[c->n_itut_t35];
 1488|      5|            itut_t35_metadata->payload = dav1d_malloc(ALLOC_OBU_META, payload_size);
  ------------------
  |  |  132|      5|#define dav1d_malloc(type, sz) malloc(sz)
  ------------------
 1489|      5|            if (!itut_t35_metadata->payload) goto error;
  ------------------
  |  Branch (1489:17): [True: 0, False: 5]
  ------------------
 1490|       |
 1491|      5|            itut_t35_metadata->country_code = country_code;
 1492|      5|            itut_t35_metadata->country_code_extension_byte = country_code_extension_byte;
 1493|      5|            itut_t35_metadata->payload_size = payload_size;
 1494|       |
 1495|       |            // We know that we've read a whole number of bytes and that the
 1496|       |            // payload is within the OBU boundaries, so just use memcpy()
 1497|      5|            assert(gb.bits_left == 0);
  ------------------
  |  |  140|      5|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 5]
  |  |  |  Branch (140:68): [Folded, False: 5]
  |  |  ------------------
  ------------------
 1498|      5|            memcpy(itut_t35_metadata->payload, gb.ptr, payload_size);
 1499|       |
 1500|      5|            c->n_itut_t35++;
 1501|      5|            break;
 1502|      5|        }
 1503|     10|        case OBU_META_SCALABILITY:
  ------------------
  |  Branch (1503:9): [True: 10, False: 676]
  ------------------
 1504|     16|        case OBU_META_TIMECODE:
  ------------------
  |  Branch (1504:9): [True: 6, False: 680]
  ------------------
 1505|       |            // ignore metadata OBUs we don't care about
 1506|     16|            break;
 1507|    620|        default:
  ------------------
  |  Branch (1507:9): [True: 620, False: 66]
  ------------------
 1508|       |            // print a warning but don't fail for unknown types
 1509|    620|            if (meta_type > 31) // Types 6 to 31 are "Unregistered user private", so ignore them.
  ------------------
  |  Branch (1509:17): [True: 127, False: 493]
  ------------------
 1510|    127|                dav1d_log(c, "Unknown Metadata OBU type %d\n", meta_type);
  ------------------
  |  |   39|    127|#define dav1d_log dav1d_log
  ------------------
 1511|    620|            break;
 1512|    686|        }
 1513|       |
 1514|    680|        break;
 1515|    686|    }
 1516|  13.9k|    case DAV1D_OBU_TD:
  ------------------
  |  Branch (1516:5): [True: 13.9k, False: 46.4k]
  ------------------
 1517|  13.9k|        c->frame_flags |= PICTURE_FLAG_NEW_TEMPORAL_UNIT;
 1518|  13.9k|        break;
 1519|    178|    case DAV1D_OBU_PADDING:
  ------------------
  |  Branch (1519:5): [True: 178, False: 60.1k]
  ------------------
 1520|       |        // ignore OBUs we don't care about
 1521|    178|        break;
 1522|  4.53k|    default:
  ------------------
  |  Branch (1522:5): [True: 4.53k, False: 55.7k]
  ------------------
 1523|       |        // print a warning but don't fail for unknown types
 1524|  4.53k|        dav1d_log(c, "Unknown OBU type %d of size %td\n", type, gb.ptr_end - gb.ptr);
  ------------------
  |  |   39|  4.53k|#define dav1d_log dav1d_log
  ------------------
 1525|  4.53k|        break;
 1526|  60.3k|    }
 1527|       |
 1528|  58.6k|    if (c->seq_hdr && c->frame_hdr) {
  ------------------
  |  Branch (1528:9): [True: 46.0k, False: 12.5k]
  |  Branch (1528:23): [True: 23.0k, False: 23.0k]
  ------------------
 1529|  23.0k|        if (c->frame_hdr->show_existing_frame) {
  ------------------
  |  Branch (1529:13): [True: 278, False: 22.7k]
  ------------------
 1530|    278|            if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr) goto error;
  ------------------
  |  Branch (1530:17): [True: 7, False: 271]
  ------------------
 1531|    271|            switch (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type) {
 1532|    176|            case DAV1D_FRAME_TYPE_INTER:
  ------------------
  |  Branch (1532:13): [True: 176, False: 95]
  ------------------
 1533|    176|            case DAV1D_FRAME_TYPE_SWITCH:
  ------------------
  |  Branch (1533:13): [True: 0, False: 271]
  ------------------
 1534|    176|                if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_REFERENCE)
  ------------------
  |  Branch (1534:21): [True: 0, False: 176]
  ------------------
 1535|      0|                    goto skip;
 1536|    176|                break;
 1537|    176|            case DAV1D_FRAME_TYPE_INTRA:
  ------------------
  |  Branch (1537:13): [True: 2, False: 269]
  ------------------
 1538|      2|                if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_INTRA)
  ------------------
  |  Branch (1538:21): [True: 0, False: 2]
  ------------------
 1539|      0|                    goto skip;
 1540|       |                // fall-through
 1541|     95|            default:
  ------------------
  |  Branch (1541:13): [True: 93, False: 178]
  ------------------
 1542|     95|                break;
 1543|    271|            }
 1544|    271|            if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) goto error;
  ------------------
  |  Branch (1544:17): [True: 0, False: 271]
  ------------------
 1545|    271|            if (c->strict_std_compliance &&
  ------------------
  |  Branch (1545:17): [True: 0, False: 271]
  ------------------
 1546|      0|                !c->refs[c->frame_hdr->existing_frame_idx].p.showable)
  ------------------
  |  Branch (1546:17): [True: 0, False: 0]
  ------------------
 1547|      0|            {
 1548|      0|                goto error;
 1549|      0|            }
 1550|    271|            if (c->n_fc == 1) {
  ------------------
  |  Branch (1550:17): [True: 271, False: 0]
  ------------------
 1551|    271|                dav1d_thread_picture_ref(&c->out,
 1552|    271|                                         &c->refs[c->frame_hdr->existing_frame_idx].p);
 1553|    271|                dav1d_picture_copy_props(&c->out.p,
 1554|    271|                                         c->content_light, c->content_light_ref,
 1555|    271|                                         c->mastering_display, c->mastering_display_ref,
 1556|    271|                                         c->itut_t35, c->itut_t35_ref, c->n_itut_t35,
 1557|    271|                                         &in->m);
 1558|       |                // Must be removed from the context after being attached to the frame
 1559|    271|                dav1d_ref_dec(&c->itut_t35_ref);
 1560|    271|                c->itut_t35 = NULL;
 1561|    271|                c->n_itut_t35 = 0;
 1562|    271|                c->event_flags |= dav1d_picture_get_event_flags(&c->refs[c->frame_hdr->existing_frame_idx].p);
 1563|    271|            } else {
 1564|      0|                pthread_mutex_lock(&c->task_thread.lock);
 1565|       |                // need to append this to the frame output queue
 1566|      0|                const unsigned next = c->frame_thread.next++;
 1567|      0|                if (c->frame_thread.next == c->n_fc)
  ------------------
  |  Branch (1567:21): [True: 0, False: 0]
  ------------------
 1568|      0|                    c->frame_thread.next = 0;
 1569|       |
 1570|      0|                Dav1dFrameContext *const f = &c->fc[next];
 1571|      0|                while (f->n_tile_data > 0)
  ------------------
  |  Branch (1571:24): [True: 0, False: 0]
  ------------------
 1572|      0|                    pthread_cond_wait(&f->task_thread.cond,
 1573|      0|                                      &f->task_thread.ttd->lock);
 1574|      0|                Dav1dThreadPicture *const out_delayed =
 1575|      0|                    &c->frame_thread.out_delayed[next];
 1576|      0|                if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
  ------------------
  |  Branch (1576:21): [True: 0, False: 0]
  |  Branch (1576:47): [True: 0, False: 0]
  ------------------
 1577|      0|                    unsigned first = atomic_load(&c->task_thread.first);
 1578|      0|                    if (first + 1U < c->n_fc)
  ------------------
  |  Branch (1578:25): [True: 0, False: 0]
  ------------------
 1579|      0|                        atomic_fetch_add(&c->task_thread.first, 1U);
 1580|      0|                    else
 1581|      0|                        atomic_store(&c->task_thread.first, 0);
 1582|      0|                    atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
 1583|      0|                                                   &first, UINT_MAX);
 1584|      0|                    if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
  ------------------
  |  Branch (1584:25): [True: 0, False: 0]
  |  Branch (1584:47): [True: 0, False: 0]
  ------------------
 1585|      0|                        c->task_thread.cur--;
 1586|      0|                }
 1587|      0|                const int error = f->task_thread.retval;
 1588|      0|                if (error) {
  ------------------
  |  Branch (1588:21): [True: 0, False: 0]
  ------------------
 1589|      0|                    c->cached_error = error;
 1590|      0|                    f->task_thread.retval = 0;
 1591|      0|                    dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m);
 1592|      0|                    dav1d_thread_picture_unref(out_delayed);
 1593|      0|                } else if (out_delayed->p.data[0]) {
  ------------------
  |  Branch (1593:28): [True: 0, False: 0]
  ------------------
 1594|      0|                    const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
 1595|      0|                                                                   memory_order_relaxed);
 1596|      0|                    if ((out_delayed->visible || c->output_invisible_frames) &&
  ------------------
  |  Branch (1596:26): [True: 0, False: 0]
  |  Branch (1596:50): [True: 0, False: 0]
  ------------------
 1597|      0|                        progress != FRAME_ERROR)
  ------------------
  |  |   35|      0|#define FRAME_ERROR (UINT_MAX - 1)
  ------------------
  |  Branch (1597:25): [True: 0, False: 0]
  ------------------
 1598|      0|                    {
 1599|      0|                        dav1d_thread_picture_ref(&c->out, out_delayed);
 1600|      0|                        c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
 1601|      0|                    }
 1602|      0|                    dav1d_thread_picture_unref(out_delayed);
 1603|      0|                }
 1604|      0|                dav1d_thread_picture_ref(out_delayed,
 1605|      0|                                         &c->refs[c->frame_hdr->existing_frame_idx].p);
 1606|      0|                out_delayed->visible = 1;
 1607|      0|                dav1d_picture_copy_props(&out_delayed->p,
 1608|      0|                                         c->content_light, c->content_light_ref,
 1609|      0|                                         c->mastering_display, c->mastering_display_ref,
 1610|      0|                                         c->itut_t35, c->itut_t35_ref, c->n_itut_t35,
 1611|      0|                                         &in->m);
 1612|       |                // Must be removed from the context after being attached to the frame
 1613|      0|                dav1d_ref_dec(&c->itut_t35_ref);
 1614|      0|                c->itut_t35 = NULL;
 1615|      0|                c->n_itut_t35 = 0;
 1616|       |
 1617|      0|                pthread_mutex_unlock(&c->task_thread.lock);
 1618|      0|            }
 1619|    271|            if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) {
  ------------------
  |  Branch (1619:17): [True: 93, False: 178]
  ------------------
 1620|     93|                const int r = c->frame_hdr->existing_frame_idx;
 1621|     93|                c->refs[r].p.showable = 0;
 1622|    837|                for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (1622:33): [True: 744, False: 93]
  ------------------
 1623|    744|                    if (i == r) continue;
  ------------------
  |  Branch (1623:25): [True: 93, False: 651]
  ------------------
 1624|       |
 1625|    651|                    if (c->refs[i].p.p.frame_hdr)
  ------------------
  |  Branch (1625:25): [True: 639, False: 12]
  ------------------
 1626|    639|                        dav1d_thread_picture_unref(&c->refs[i].p);
 1627|    651|                    dav1d_thread_picture_ref(&c->refs[i].p, &c->refs[r].p);
 1628|       |
 1629|    651|                    dav1d_cdf_thread_unref(&c->cdf[i]);
 1630|    651|                    dav1d_cdf_thread_ref(&c->cdf[i], &c->cdf[r]);
 1631|       |
 1632|    651|                    dav1d_ref_dec(&c->refs[i].segmap);
 1633|    651|                    c->refs[i].segmap = c->refs[r].segmap;
 1634|    651|                    if (c->refs[r].segmap)
  ------------------
  |  Branch (1634:25): [True: 112, False: 539]
  ------------------
 1635|    112|                        dav1d_ref_inc(c->refs[r].segmap);
 1636|    651|                    dav1d_ref_dec(&c->refs[i].refmvs);
 1637|    651|                }
 1638|     93|            }
 1639|    271|            c->frame_hdr = NULL;
 1640|  22.7k|        } else if (c->n_tiles == c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows) {
  ------------------
  |  Branch (1640:20): [True: 21.5k, False: 1.22k]
  ------------------
 1641|  21.5k|            switch (c->frame_hdr->frame_type) {
 1642|  4.44k|            case DAV1D_FRAME_TYPE_INTER:
  ------------------
  |  Branch (1642:13): [True: 4.44k, False: 17.0k]
  ------------------
 1643|  5.06k|            case DAV1D_FRAME_TYPE_SWITCH:
  ------------------
  |  Branch (1643:13): [True: 624, False: 20.8k]
  ------------------
 1644|  5.06k|                if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_REFERENCE ||
  ------------------
  |  Branch (1644:21): [True: 0, False: 5.06k]
  ------------------
 1645|  5.06k|                    (c->decode_frame_type == DAV1D_DECODEFRAMETYPE_REFERENCE &&
  ------------------
  |  Branch (1645:22): [True: 0, False: 5.06k]
  ------------------
 1646|      0|                     !c->frame_hdr->refresh_frame_flags))
  ------------------
  |  Branch (1646:22): [True: 0, False: 0]
  ------------------
 1647|      0|                    goto skip;
 1648|  5.06k|                break;
 1649|  5.06k|            case DAV1D_FRAME_TYPE_INTRA:
  ------------------
  |  Branch (1649:13): [True: 164, False: 21.3k]
  ------------------
 1650|    164|                if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_INTRA ||
  ------------------
  |  Branch (1650:21): [True: 0, False: 164]
  ------------------
 1651|    164|                    (c->decode_frame_type == DAV1D_DECODEFRAMETYPE_REFERENCE &&
  ------------------
  |  Branch (1651:22): [True: 0, False: 164]
  ------------------
 1652|      0|                     !c->frame_hdr->refresh_frame_flags))
  ------------------
  |  Branch (1652:22): [True: 0, False: 0]
  ------------------
 1653|      0|                    goto skip;
 1654|       |                // fall-through
 1655|  16.4k|            default:
  ------------------
  |  Branch (1655:13): [True: 16.2k, False: 5.22k]
  ------------------
 1656|  16.4k|                break;
 1657|  21.5k|            }
 1658|  21.5k|            if (!c->n_tile_data)
  ------------------
  |  Branch (1658:17): [True: 0, False: 21.5k]
  ------------------
 1659|      0|                goto error;
 1660|  21.5k|            if ((res = dav1d_submit_frame(c)) < 0)
  ------------------
  |  Branch (1660:17): [True: 9.90k, False: 11.6k]
  ------------------
 1661|  9.90k|                return res;
 1662|  11.6k|            assert(!c->n_tile_data);
  ------------------
  |  |  140|  11.6k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 11.6k]
  |  |  |  Branch (140:68): [Folded, False: 11.6k]
  |  |  ------------------
  ------------------
 1663|  11.6k|            c->frame_hdr = NULL;
 1664|  11.6k|            c->n_tiles = 0;
 1665|  11.6k|        }
 1666|  23.0k|    }
 1667|       |
 1668|  48.7k|    return gb.ptr_end - gb.ptr_start;
 1669|       |
 1670|      0|skip:
 1671|       |    // update refs with only the headers in case we skip the frame
 1672|      0|    for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (1672:21): [True: 0, False: 0]
  ------------------
 1673|      0|        if (c->frame_hdr->refresh_frame_flags & (1 << i)) {
  ------------------
  |  Branch (1673:13): [True: 0, False: 0]
  ------------------
 1674|      0|            dav1d_thread_picture_unref(&c->refs[i].p);
 1675|      0|            c->refs[i].p.p.frame_hdr = c->frame_hdr;
 1676|      0|            c->refs[i].p.p.seq_hdr = c->seq_hdr;
 1677|      0|            c->refs[i].p.p.frame_hdr_ref = c->frame_hdr_ref;
 1678|      0|            c->refs[i].p.p.seq_hdr_ref = c->seq_hdr_ref;
 1679|      0|            dav1d_ref_inc(c->frame_hdr_ref);
 1680|      0|            dav1d_ref_inc(c->seq_hdr_ref);
 1681|      0|        }
 1682|      0|    }
 1683|       |
 1684|      0|    dav1d_ref_dec(&c->frame_hdr_ref);
 1685|      0|    c->frame_hdr = NULL;
 1686|      0|    c->n_tiles = 0;
 1687|       |
 1688|      0|    return gb.ptr_end - gb.ptr_start;
 1689|       |
 1690|  2.36k|error:
 1691|  2.36k|    dav1d_data_props_copy(&c->cached_error_props, &in->m);
 1692|  2.36k|    dav1d_log(c, gb.error ? "Overrun in OBU bit buffer\n" :
  ------------------
  |  |   39|  2.36k|#define dav1d_log dav1d_log
  ------------------
  |  Branch (1692:18): [True: 1.32k, False: 1.04k]
  ------------------
 1693|  2.36k|                            "Error parsing OBU data\n");
 1694|  2.36k|    return DAV1D_ERR(EINVAL);
  ------------------
  |  |   56|  2.36k|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
 1695|  58.6k|}
obu.c:parse_seq_hdr:
   75|  17.7k|{
   76|  17.7k|#define DEBUG_SEQ_HDR 0
   77|       |
   78|       |#if DEBUG_SEQ_HDR
   79|       |    const unsigned init_bit_pos = dav1d_get_bits_pos(gb);
   80|       |#endif
   81|       |
   82|  17.7k|    memset(hdr, 0, sizeof(*hdr));
   83|  17.7k|    hdr->profile = dav1d_get_bits(gb, 3);
   84|  17.7k|    if (hdr->profile > 2) goto error;
  ------------------
  |  Branch (84:9): [True: 33, False: 17.7k]
  ------------------
   85|       |#if DEBUG_SEQ_HDR
   86|       |    printf("SEQHDR: post-profile: off=%u\n",
   87|       |           dav1d_get_bits_pos(gb) - init_bit_pos);
   88|       |#endif
   89|       |
   90|  17.7k|    hdr->still_picture = dav1d_get_bit(gb);
   91|  17.7k|    hdr->reduced_still_picture_header = dav1d_get_bit(gb);
   92|  17.7k|    if (hdr->reduced_still_picture_header && !hdr->still_picture) goto error;
  ------------------
  |  Branch (92:9): [True: 10.2k, False: 7.44k]
  |  Branch (92:46): [True: 28, False: 10.2k]
  ------------------
   93|       |#if DEBUG_SEQ_HDR
   94|       |    printf("SEQHDR: post-stillpicture_flags: off=%u\n",
   95|       |           dav1d_get_bits_pos(gb) - init_bit_pos);
   96|       |#endif
   97|       |
   98|  17.6k|    if (hdr->reduced_still_picture_header) {
  ------------------
  |  Branch (98:9): [True: 10.2k, False: 7.44k]
  ------------------
   99|  10.2k|        hdr->num_operating_points = 1;
  100|  10.2k|        hdr->operating_points[0].major_level = dav1d_get_bits(gb, 3);
  101|  10.2k|        hdr->operating_points[0].minor_level = dav1d_get_bits(gb, 2);
  102|  10.2k|        hdr->operating_points[0].initial_display_delay = 10;
  103|  10.2k|    } else {
  104|  7.44k|        hdr->timing_info_present = dav1d_get_bit(gb);
  105|  7.44k|        if (hdr->timing_info_present) {
  ------------------
  |  Branch (105:13): [True: 543, False: 6.90k]
  ------------------
  106|    543|            hdr->num_units_in_tick = dav1d_get_bits(gb, 32);
  107|    543|            hdr->time_scale = dav1d_get_bits(gb, 32);
  108|    543|            if (strict_std_compliance && (!hdr->num_units_in_tick || !hdr->time_scale))
  ------------------
  |  Branch (108:17): [True: 0, False: 543]
  |  Branch (108:43): [True: 0, False: 0]
  |  Branch (108:70): [True: 0, False: 0]
  ------------------
  109|      0|                goto error;
  110|    543|            hdr->equal_picture_interval = dav1d_get_bit(gb);
  111|    543|            if (hdr->equal_picture_interval) {
  ------------------
  |  Branch (111:17): [True: 310, False: 233]
  ------------------
  112|    310|                const unsigned num_ticks_per_picture = dav1d_get_vlc(gb);
  113|    310|                if (num_ticks_per_picture == UINT32_MAX)
  ------------------
  |  Branch (113:21): [True: 9, False: 301]
  ------------------
  114|      9|                    goto error;
  115|    301|                hdr->num_ticks_per_picture = num_ticks_per_picture + 1;
  116|    301|            }
  117|       |
  118|    534|            hdr->decoder_model_info_present = dav1d_get_bit(gb);
  119|    534|            if (hdr->decoder_model_info_present) {
  ------------------
  |  Branch (119:17): [True: 310, False: 224]
  ------------------
  120|    310|                hdr->encoder_decoder_buffer_delay_length = dav1d_get_bits(gb, 5) + 1;
  121|    310|                hdr->num_units_in_decoding_tick = dav1d_get_bits(gb, 32);
  122|    310|                if (strict_std_compliance && !hdr->num_units_in_decoding_tick)
  ------------------
  |  Branch (122:21): [True: 0, False: 310]
  |  Branch (122:46): [True: 0, False: 0]
  ------------------
  123|      0|                    goto error;
  124|    310|                hdr->buffer_removal_delay_length = dav1d_get_bits(gb, 5) + 1;
  125|    310|                hdr->frame_presentation_delay_length = dav1d_get_bits(gb, 5) + 1;
  126|    310|            }
  127|    534|        }
  128|       |#if DEBUG_SEQ_HDR
  129|       |        printf("SEQHDR: post-timinginfo: off=%u\n",
  130|       |               dav1d_get_bits_pos(gb) - init_bit_pos);
  131|       |#endif
  132|       |
  133|  7.43k|        hdr->display_model_info_present = dav1d_get_bit(gb);
  134|  7.43k|        hdr->num_operating_points = dav1d_get_bits(gb, 5) + 1;
  135|  22.4k|        for (int i = 0; i < hdr->num_operating_points; i++) {
  ------------------
  |  Branch (135:25): [True: 15.3k, False: 7.13k]
  ------------------
  136|  15.3k|            struct Dav1dSequenceHeaderOperatingPoint *const op =
  137|  15.3k|                &hdr->operating_points[i];
  138|  15.3k|            op->idc = dav1d_get_bits(gb, 12);
  139|  15.3k|            if (op->idc && (!(op->idc & 0xff) || !(op->idc & 0xf00)))
  ------------------
  |  Branch (139:17): [True: 10.5k, False: 4.78k]
  |  Branch (139:29): [True: 58, False: 10.5k]
  |  Branch (139:50): [True: 246, False: 10.2k]
  ------------------
  140|    304|                goto error;
  141|  15.0k|            op->major_level = 2 + dav1d_get_bits(gb, 3);
  142|  15.0k|            op->minor_level = dav1d_get_bits(gb, 2);
  143|  15.0k|            if (op->major_level > 3)
  ------------------
  |  Branch (143:17): [True: 2.97k, False: 12.0k]
  ------------------
  144|  2.97k|                op->tier = dav1d_get_bit(gb);
  145|  15.0k|            if (hdr->decoder_model_info_present) {
  ------------------
  |  Branch (145:17): [True: 3.04k, False: 12.0k]
  ------------------
  146|  3.04k|                op->decoder_model_param_present = dav1d_get_bit(gb);
  147|  3.04k|                if (op->decoder_model_param_present) {
  ------------------
  |  Branch (147:21): [True: 977, False: 2.06k]
  ------------------
  148|    977|                    struct Dav1dSequenceHeaderOperatingParameterInfo *const opi =
  149|    977|                        &hdr->operating_parameter_info[i];
  150|    977|                    opi->decoder_buffer_delay =
  151|    977|                        dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length);
  152|    977|                    opi->encoder_buffer_delay =
  153|    977|                        dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length);
  154|    977|                    opi->low_delay_mode = dav1d_get_bit(gb);
  155|    977|                }
  156|  3.04k|            }
  157|  15.0k|            if (hdr->display_model_info_present)
  ------------------
  |  Branch (157:17): [True: 3.05k, False: 11.9k]
  ------------------
  158|  3.05k|                op->display_model_param_present = dav1d_get_bit(gb);
  159|  15.0k|            op->initial_display_delay =
  160|  15.0k|                op->display_model_param_present ? dav1d_get_bits(gb, 4) + 1 : 10;
  ------------------
  |  Branch (160:17): [True: 1.16k, False: 13.8k]
  ------------------
  161|  15.0k|        }
  162|       |#if DEBUG_SEQ_HDR
  163|       |        printf("SEQHDR: post-operating-points: off=%u\n",
  164|       |               dav1d_get_bits_pos(gb) - init_bit_pos);
  165|       |#endif
  166|  7.43k|    }
  167|       |
  168|  17.3k|    hdr->width_n_bits = dav1d_get_bits(gb, 4) + 1;
  169|  17.3k|    hdr->height_n_bits = dav1d_get_bits(gb, 4) + 1;
  170|  17.3k|    hdr->max_width = dav1d_get_bits(gb, hdr->width_n_bits) + 1;
  171|  17.3k|    hdr->max_height = dav1d_get_bits(gb, hdr->height_n_bits) + 1;
  172|       |#if DEBUG_SEQ_HDR
  173|       |    printf("SEQHDR: post-size: off=%u\n",
  174|       |           dav1d_get_bits_pos(gb) - init_bit_pos);
  175|       |#endif
  176|  17.3k|    if (!hdr->reduced_still_picture_header) {
  ------------------
  |  Branch (176:9): [True: 7.13k, False: 10.2k]
  ------------------
  177|  7.13k|        hdr->frame_id_numbers_present = dav1d_get_bit(gb);
  178|  7.13k|        if (hdr->frame_id_numbers_present) {
  ------------------
  |  Branch (178:13): [True: 480, False: 6.65k]
  ------------------
  179|    480|            hdr->delta_frame_id_n_bits = dav1d_get_bits(gb, 4) + 2;
  180|    480|            hdr->frame_id_n_bits = dav1d_get_bits(gb, 3) + hdr->delta_frame_id_n_bits + 1;
  181|    480|        }
  182|  7.13k|    }
  183|       |#if DEBUG_SEQ_HDR
  184|       |    printf("SEQHDR: post-frame-id-numbers-present: off=%u\n",
  185|       |           dav1d_get_bits_pos(gb) - init_bit_pos);
  186|       |#endif
  187|       |
  188|  17.3k|    hdr->sb128 = dav1d_get_bit(gb);
  189|  17.3k|    hdr->filter_intra = dav1d_get_bit(gb);
  190|  17.3k|    hdr->intra_edge_filter = dav1d_get_bit(gb);
  191|  17.3k|    if (hdr->reduced_still_picture_header) {
  ------------------
  |  Branch (191:9): [True: 10.2k, False: 7.13k]
  ------------------
  192|  10.2k|        hdr->screen_content_tools = DAV1D_ADAPTIVE;
  193|  10.2k|        hdr->force_integer_mv = DAV1D_ADAPTIVE;
  194|  10.2k|    } else {
  195|  7.13k|        hdr->inter_intra = dav1d_get_bit(gb);
  196|  7.13k|        hdr->masked_compound = dav1d_get_bit(gb);
  197|  7.13k|        hdr->warped_motion = dav1d_get_bit(gb);
  198|  7.13k|        hdr->dual_filter = dav1d_get_bit(gb);
  199|  7.13k|        hdr->order_hint = dav1d_get_bit(gb);
  200|  7.13k|        if (hdr->order_hint) {
  ------------------
  |  Branch (200:13): [True: 5.61k, False: 1.52k]
  ------------------
  201|  5.61k|            hdr->jnt_comp = dav1d_get_bit(gb);
  202|  5.61k|            hdr->ref_frame_mvs = dav1d_get_bit(gb);
  203|  5.61k|        }
  204|  7.13k|        hdr->screen_content_tools = dav1d_get_bit(gb) ? DAV1D_ADAPTIVE : dav1d_get_bit(gb);
  ------------------
  |  Branch (204:37): [True: 5.49k, False: 1.63k]
  ------------------
  205|       |    #if DEBUG_SEQ_HDR
  206|       |        printf("SEQHDR: post-screentools: off=%u\n",
  207|       |               dav1d_get_bits_pos(gb) - init_bit_pos);
  208|       |    #endif
  209|  7.13k|        hdr->force_integer_mv = hdr->screen_content_tools ?
  ------------------
  |  Branch (209:33): [True: 6.30k, False: 826]
  ------------------
  210|  6.30k|                                dav1d_get_bit(gb) ? DAV1D_ADAPTIVE : dav1d_get_bit(gb) : 2;
  ------------------
  |  Branch (210:33): [True: 5.33k, False: 977]
  ------------------
  211|  7.13k|        if (hdr->order_hint)
  ------------------
  |  Branch (211:13): [True: 5.61k, False: 1.52k]
  ------------------
  212|  5.61k|            hdr->order_hint_n_bits = dav1d_get_bits(gb, 3) + 1;
  213|  7.13k|    }
  214|  17.3k|    hdr->super_res = dav1d_get_bit(gb);
  215|  17.3k|    hdr->cdef = dav1d_get_bit(gb);
  216|  17.3k|    hdr->restoration = dav1d_get_bit(gb);
  217|       |#if DEBUG_SEQ_HDR
  218|       |    printf("SEQHDR: post-featurebits: off=%u\n",
  219|       |           dav1d_get_bits_pos(gb) - init_bit_pos);
  220|       |#endif
  221|       |
  222|  17.3k|    hdr->hbd = dav1d_get_bit(gb);
  223|  17.3k|    if (hdr->profile == 2 && hdr->hbd)
  ------------------
  |  Branch (223:9): [True: 3.88k, False: 13.4k]
  |  Branch (223:30): [True: 3.40k, False: 478]
  ------------------
  224|  3.40k|        hdr->hbd += dav1d_get_bit(gb);
  225|  17.3k|    if (hdr->profile != 1)
  ------------------
  |  Branch (225:9): [True: 9.40k, False: 7.96k]
  ------------------
  226|  9.40k|        hdr->monochrome = dav1d_get_bit(gb);
  227|  17.3k|    hdr->color_description_present = dav1d_get_bit(gb);
  228|  17.3k|    if (hdr->color_description_present) {
  ------------------
  |  Branch (228:9): [True: 7.47k, False: 9.89k]
  ------------------
  229|  7.47k|        hdr->pri = dav1d_get_bits(gb, 8);
  230|  7.47k|        hdr->trc = dav1d_get_bits(gb, 8);
  231|  7.47k|        hdr->mtrx = dav1d_get_bits(gb, 8);
  232|  9.89k|    } else {
  233|  9.89k|        hdr->pri = DAV1D_COLOR_PRI_UNKNOWN;
  234|  9.89k|        hdr->trc = DAV1D_TRC_UNKNOWN;
  235|  9.89k|        hdr->mtrx = DAV1D_MC_UNKNOWN;
  236|  9.89k|    }
  237|  17.3k|    if (hdr->monochrome) {
  ------------------
  |  Branch (237:9): [True: 3.87k, False: 13.4k]
  ------------------
  238|  3.87k|        hdr->color_range = dav1d_get_bit(gb);
  239|  3.87k|        hdr->layout = DAV1D_PIXEL_LAYOUT_I400;
  240|  3.87k|        hdr->ss_hor = hdr->ss_ver = 1;
  241|  3.87k|        hdr->chr = DAV1D_CHR_UNKNOWN;
  242|  13.4k|    } else if (hdr->pri == DAV1D_COLOR_PRI_BT709 &&
  ------------------
  |  Branch (242:16): [True: 3.20k, False: 10.2k]
  ------------------
  243|  3.20k|               hdr->trc == DAV1D_TRC_SRGB &&
  ------------------
  |  Branch (243:16): [True: 1.56k, False: 1.64k]
  ------------------
  244|  1.56k|               hdr->mtrx == DAV1D_MC_IDENTITY)
  ------------------
  |  Branch (244:16): [True: 7, False: 1.55k]
  ------------------
  245|      7|    {
  246|      7|        hdr->layout = DAV1D_PIXEL_LAYOUT_I444;
  247|      7|        hdr->color_range = 1;
  248|      7|        if (hdr->profile != 1 && !(hdr->profile == 2 && hdr->hbd == 2))
  ------------------
  |  Branch (248:13): [True: 2, False: 5]
  |  Branch (248:36): [True: 0, False: 2]
  |  Branch (248:57): [True: 0, False: 0]
  ------------------
  249|      2|            goto error;
  250|  13.4k|    } else {
  251|  13.4k|        hdr->color_range = dav1d_get_bit(gb);
  252|  13.4k|        switch (hdr->profile) {
  ------------------
  |  Branch (252:17): [True: 13.4k, False: 0]
  ------------------
  253|  3.26k|        case 0: hdr->layout = DAV1D_PIXEL_LAYOUT_I420;
  ------------------
  |  Branch (253:9): [True: 3.26k, False: 10.2k]
  ------------------
  254|  3.26k|                hdr->ss_hor = hdr->ss_ver = 1;
  255|  3.26k|                break;
  256|  7.95k|        case 1: hdr->layout = DAV1D_PIXEL_LAYOUT_I444;
  ------------------
  |  Branch (256:9): [True: 7.95k, False: 5.52k]
  ------------------
  257|  7.95k|                break;
  258|  2.26k|        case 2:
  ------------------
  |  Branch (258:9): [True: 2.26k, False: 11.2k]
  ------------------
  259|  2.26k|            if (hdr->hbd == 2) {
  ------------------
  |  Branch (259:17): [True: 1.81k, False: 450]
  ------------------
  260|  1.81k|                hdr->ss_hor = dav1d_get_bit(gb);
  261|  1.81k|                if (hdr->ss_hor)
  ------------------
  |  Branch (261:21): [True: 424, False: 1.38k]
  ------------------
  262|    424|                    hdr->ss_ver = dav1d_get_bit(gb);
  263|  1.81k|            } else
  264|    450|                hdr->ss_hor = 1;
  265|  2.26k|            hdr->layout = hdr->ss_hor ?
  ------------------
  |  Branch (265:27): [True: 874, False: 1.38k]
  ------------------
  266|    874|                          hdr->ss_ver ? DAV1D_PIXEL_LAYOUT_I420 :
  ------------------
  |  Branch (266:27): [True: 156, False: 718]
  ------------------
  267|    874|                                        DAV1D_PIXEL_LAYOUT_I422 :
  268|  2.26k|                                        DAV1D_PIXEL_LAYOUT_I444;
  269|  2.26k|            break;
  270|  13.4k|        }
  271|  13.4k|        hdr->chr = (hdr->ss_hor & hdr->ss_ver) ?
  ------------------
  |  Branch (271:20): [True: 3.42k, False: 10.0k]
  ------------------
  272|  10.0k|                   dav1d_get_bits(gb, 2) : DAV1D_CHR_UNKNOWN;
  273|  13.4k|    }
  274|  17.3k|    if (strict_std_compliance &&
  ------------------
  |  Branch (274:9): [True: 0, False: 17.3k]
  ------------------
  275|      0|        hdr->mtrx == DAV1D_MC_IDENTITY && hdr->layout != DAV1D_PIXEL_LAYOUT_I444)
  ------------------
  |  Branch (275:9): [True: 0, False: 0]
  |  Branch (275:43): [True: 0, False: 0]
  ------------------
  276|      0|    {
  277|      0|        goto error;
  278|      0|    }
  279|  17.3k|    if (!hdr->monochrome)
  ------------------
  |  Branch (279:9): [True: 13.4k, False: 3.87k]
  ------------------
  280|  13.4k|        hdr->separate_uv_delta_q = dav1d_get_bit(gb);
  281|       |#if DEBUG_SEQ_HDR
  282|       |    printf("SEQHDR: post-colorinfo: off=%u\n",
  283|       |           dav1d_get_bits_pos(gb) - init_bit_pos);
  284|       |#endif
  285|       |
  286|  17.3k|    hdr->film_grain_present = dav1d_get_bit(gb);
  287|       |#if DEBUG_SEQ_HDR
  288|       |    printf("SEQHDR: post-filmgrain: off=%u\n",
  289|       |           dav1d_get_bits_pos(gb) - init_bit_pos);
  290|       |#endif
  291|       |
  292|       |    // We needn't bother flushing the OBU here: we'll check we didn't
  293|       |    // overrun in the caller and will then discard gb, so there's no
  294|       |    // point in setting its position properly.
  295|       |
  296|  17.3k|    return check_trailing_bits(gb, strict_std_compliance);
  297|       |
  298|    376|error:
  299|    376|    return DAV1D_ERR(EINVAL);
  ------------------
  |  |   56|    376|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  300|  17.3k|}
obu.c:parse_frame_hdr:
  409|  23.0k|static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
  410|  23.0k|#define DEBUG_FRAME_HDR 0
  411|       |
  412|       |#if DEBUG_FRAME_HDR
  413|       |    const uint8_t *const init_ptr = gb->ptr;
  414|       |#endif
  415|  23.0k|    const Dav1dSequenceHeader *const seqhdr = c->seq_hdr;
  416|  23.0k|    Dav1dFrameHeader *const hdr = c->frame_hdr;
  417|       |
  418|  23.0k|    if (!seqhdr->reduced_still_picture_header)
  ------------------
  |  Branch (418:9): [True: 12.3k, False: 10.7k]
  ------------------
  419|  12.3k|        hdr->show_existing_frame = dav1d_get_bit(gb);
  420|       |#if DEBUG_FRAME_HDR
  421|       |    printf("HDR: post-show_existing_frame: off=%td\n",
  422|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
  423|       |#endif
  424|  23.0k|    if (hdr->show_existing_frame) {
  ------------------
  |  Branch (424:9): [True: 330, False: 22.7k]
  ------------------
  425|    330|        hdr->existing_frame_idx = dav1d_get_bits(gb, 3);
  426|    330|        if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval)
  ------------------
  |  Branch (426:13): [True: 6, False: 324]
  |  Branch (426:51): [True: 3, False: 3]
  ------------------
  427|      3|            hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length);
  428|    330|        if (seqhdr->frame_id_numbers_present) {
  ------------------
  |  Branch (428:13): [True: 20, False: 310]
  ------------------
  429|     20|            hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits);
  430|     20|            Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->existing_frame_idx].p.p.frame_hdr;
  431|     20|            if (!ref_frame_hdr || ref_frame_hdr->frame_id != hdr->frame_id) goto error;
  ------------------
  |  Branch (431:17): [True: 7, False: 13]
  |  Branch (431:35): [True: 11, False: 2]
  ------------------
  432|     20|        }
  433|    312|        return 0;
  434|    330|    }
  435|       |
  436|  22.7k|    if (seqhdr->reduced_still_picture_header) {
  ------------------
  |  Branch (436:9): [True: 10.7k, False: 11.9k]
  ------------------
  437|  10.7k|        hdr->frame_type = DAV1D_FRAME_TYPE_KEY;
  438|  10.7k|        hdr->show_frame = 1;
  439|  11.9k|    } else {
  440|  11.9k|        hdr->frame_type = dav1d_get_bits(gb, 2);
  441|  11.9k|        hdr->show_frame = dav1d_get_bit(gb);
  442|  11.9k|    }
  443|  22.7k|    if (hdr->show_frame) {
  ------------------
  |  Branch (443:9): [True: 19.1k, False: 3.59k]
  ------------------
  444|  19.1k|        if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval)
  ------------------
  |  Branch (444:13): [True: 18, False: 19.1k]
  |  Branch (444:51): [True: 13, False: 5]
  ------------------
  445|     13|            hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length);
  446|  19.1k|        hdr->showable_frame = hdr->frame_type != DAV1D_FRAME_TYPE_KEY;
  447|  19.1k|    } else
  448|  3.59k|        hdr->showable_frame = dav1d_get_bit(gb);
  449|  22.7k|    hdr->error_resilient_mode =
  450|  22.7k|        (hdr->frame_type == DAV1D_FRAME_TYPE_KEY && hdr->show_frame) ||
  ------------------
  |  Branch (450:10): [True: 17.2k, False: 5.48k]
  |  Branch (450:53): [True: 16.9k, False: 309]
  ------------------
  451|  5.78k|        hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ||
  ------------------
  |  Branch (451:9): [True: 678, False: 5.11k]
  ------------------
  452|  5.11k|        seqhdr->reduced_still_picture_header || dav1d_get_bit(gb);
  ------------------
  |  Branch (452:9): [True: 0, False: 5.11k]
  |  Branch (452:49): [True: 137, False: 4.97k]
  ------------------
  453|       |#if DEBUG_FRAME_HDR
  454|       |    printf("HDR: post-frametype_bits: off=%td\n",
  455|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
  456|       |#endif
  457|  22.7k|    hdr->disable_cdf_update = dav1d_get_bit(gb);
  458|  22.7k|    hdr->allow_screen_content_tools = seqhdr->screen_content_tools == DAV1D_ADAPTIVE ?
  ------------------
  |  Branch (458:39): [True: 20.7k, False: 1.98k]
  ------------------
  459|  20.7k|                                      dav1d_get_bit(gb) : seqhdr->screen_content_tools;
  460|  22.7k|    if (hdr->allow_screen_content_tools)
  ------------------
  |  Branch (460:9): [True: 9.89k, False: 12.8k]
  ------------------
  461|  9.89k|        hdr->force_integer_mv = seqhdr->force_integer_mv == DAV1D_ADAPTIVE ?
  ------------------
  |  Branch (461:33): [True: 8.56k, False: 1.32k]
  ------------------
  462|  8.56k|                                dav1d_get_bit(gb) : seqhdr->force_integer_mv;
  463|       |
  464|  22.7k|    if (IS_KEY_OR_INTRA(hdr))
  ------------------
  |  |   43|  22.7k|    (!IS_INTER_OR_SWITCH(frame_header))
  |  |  ------------------
  |  |  |  |   36|  22.7k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (43:5): [True: 17.4k, False: 5.29k]
  |  |  ------------------
  ------------------
  465|  17.4k|        hdr->force_integer_mv = 1;
  466|       |
  467|  22.7k|    if (seqhdr->frame_id_numbers_present)
  ------------------
  |  Branch (467:9): [True: 420, False: 22.3k]
  ------------------
  468|    420|        hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits);
  469|       |
  470|  22.7k|    if (!seqhdr->reduced_still_picture_header)
  ------------------
  |  Branch (470:9): [True: 11.9k, False: 10.7k]
  ------------------
  471|  11.9k|        hdr->frame_size_override = hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 1 : dav1d_get_bit(gb);
  ------------------
  |  Branch (471:36): [True: 678, False: 11.3k]
  ------------------
  472|       |#if DEBUG_FRAME_HDR
  473|       |    printf("HDR: post-frame_size_override_flag: off=%td\n",
  474|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
  475|       |#endif
  476|  22.7k|    if (seqhdr->order_hint)
  ------------------
  |  Branch (476:9): [True: 10.4k, False: 12.2k]
  ------------------
  477|  10.4k|        hdr->frame_offset = dav1d_get_bits(gb, seqhdr->order_hint_n_bits);
  478|  22.7k|    hdr->primary_ref_frame = !hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) ?
  ------------------
  |  |   36|  4.97k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 4.58k, False: 390]
  |  |  ------------------
  ------------------
  |  Branch (478:30): [True: 4.97k, False: 17.7k]
  ------------------
  479|  18.1k|                             dav1d_get_bits(gb, 3) : DAV1D_PRIMARY_REF_NONE;
  ------------------
  |  |   45|  40.8k|#define DAV1D_PRIMARY_REF_NONE 7
  ------------------
  480|       |
  481|  22.7k|    if (seqhdr->decoder_model_info_present) {
  ------------------
  |  Branch (481:9): [True: 32, False: 22.6k]
  ------------------
  482|     32|        hdr->buffer_removal_time_present = dav1d_get_bit(gb);
  483|     32|        if (hdr->buffer_removal_time_present) {
  ------------------
  |  Branch (483:13): [True: 24, False: 8]
  ------------------
  484|    242|            for (int i = 0; i < c->seq_hdr->num_operating_points; i++) {
  ------------------
  |  Branch (484:29): [True: 218, False: 24]
  ------------------
  485|    218|                const struct Dav1dSequenceHeaderOperatingPoint *const seqop = &seqhdr->operating_points[i];
  486|    218|                struct Dav1dFrameHeaderOperatingPoint *const op = &hdr->operating_points[i];
  487|    218|                if (seqop->decoder_model_param_present) {
  ------------------
  |  Branch (487:21): [True: 112, False: 106]
  ------------------
  488|    112|                    int in_temporal_layer = (seqop->idc >> hdr->temporal_id) & 1;
  489|    112|                    int in_spatial_layer  = (seqop->idc >> (hdr->spatial_id + 8)) & 1;
  490|    112|                    if (!seqop->idc || (in_temporal_layer && in_spatial_layer))
  ------------------
  |  Branch (490:25): [True: 0, False: 112]
  |  Branch (490:41): [True: 62, False: 50]
  |  Branch (490:62): [True: 31, False: 31]
  ------------------
  491|     31|                        op->buffer_removal_time = dav1d_get_bits(gb, seqhdr->buffer_removal_delay_length);
  492|    112|                }
  493|    218|            }
  494|     24|        }
  495|     32|    }
  496|       |
  497|  22.7k|    if (IS_KEY_OR_INTRA(hdr)) {
  ------------------
  |  |   43|  22.7k|    (!IS_INTER_OR_SWITCH(frame_header))
  |  |  ------------------
  |  |  |  |   36|  22.7k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (43:5): [True: 17.4k, False: 5.29k]
  |  |  ------------------
  ------------------
  498|  17.4k|        hdr->refresh_frame_flags = (hdr->frame_type == DAV1D_FRAME_TYPE_KEY &&
  ------------------
  |  Branch (498:37): [True: 17.2k, False: 182]
  ------------------
  499|  17.2k|                                    hdr->show_frame) ? 0xff : dav1d_get_bits(gb, 8);
  ------------------
  |  Branch (499:37): [True: 16.9k, False: 309]
  ------------------
  500|  17.4k|        if (hdr->refresh_frame_flags != 0xff && hdr->error_resilient_mode && seqhdr->order_hint)
  ------------------
  |  Branch (500:13): [True: 489, False: 16.9k]
  |  Branch (500:49): [True: 101, False: 388]
  |  Branch (500:78): [True: 63, False: 38]
  ------------------
  501|    567|            for (int i = 0; i < 8; i++)
  ------------------
  |  Branch (501:29): [True: 504, False: 63]
  ------------------
  502|    504|                dav1d_get_bits(gb, seqhdr->order_hint_n_bits);
  503|  17.4k|        if (c->strict_std_compliance &&
  ------------------
  |  Branch (503:13): [True: 0, False: 17.4k]
  ------------------
  504|      0|            hdr->frame_type == DAV1D_FRAME_TYPE_INTRA && hdr->refresh_frame_flags == 0xff)
  ------------------
  |  Branch (504:13): [True: 0, False: 0]
  |  Branch (504:58): [True: 0, False: 0]
  ------------------
  505|      0|        {
  506|      0|            goto error;
  507|      0|        }
  508|  17.4k|        if (read_frame_size(c, gb, 0) < 0) goto error;
  ------------------
  |  Branch (508:13): [True: 0, False: 17.4k]
  ------------------
  509|  17.4k|        if (hdr->allow_screen_content_tools && !hdr->super_res.enabled)
  ------------------
  |  Branch (509:13): [True: 8.04k, False: 9.38k]
  |  Branch (509:48): [True: 7.64k, False: 397]
  ------------------
  510|  7.64k|            hdr->allow_intrabc = dav1d_get_bit(gb);
  511|  17.4k|    } else {
  512|  5.29k|        hdr->refresh_frame_flags = hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 0xff :
  ------------------
  |  Branch (512:36): [True: 678, False: 4.62k]
  ------------------
  513|  5.29k|                                   dav1d_get_bits(gb, 8);
  514|  5.29k|        if (hdr->error_resilient_mode && seqhdr->order_hint)
  ------------------
  |  Branch (514:13): [True: 714, False: 4.58k]
  |  Branch (514:42): [True: 553, False: 161]
  ------------------
  515|  4.97k|            for (int i = 0; i < 8; i++)
  ------------------
  |  Branch (515:29): [True: 4.42k, False: 553]
  ------------------
  516|  4.42k|                dav1d_get_bits(gb, seqhdr->order_hint_n_bits);
  517|  5.29k|        if (seqhdr->order_hint) {
  ------------------
  |  Branch (517:13): [True: 4.89k, False: 405]
  ------------------
  518|  4.89k|            hdr->frame_ref_short_signaling = dav1d_get_bit(gb);
  519|  4.89k|            if (hdr->frame_ref_short_signaling) {
  ------------------
  |  Branch (519:17): [True: 669, False: 4.22k]
  ------------------
  520|    669|                hdr->refidx[0] = dav1d_get_bits(gb, 3);
  521|    669|                hdr->refidx[1] = hdr->refidx[2] = -1;
  522|    669|                hdr->refidx[3] = dav1d_get_bits(gb, 3);
  523|       |
  524|       |                /* +1 allows for unconditional stores, as unused
  525|       |                 * values can be dumped into frame_offset[-1]. */
  526|    669|                int frame_offset_mem[8+1];
  527|    669|                int *const frame_offset = &frame_offset_mem[1];
  528|    669|                int earliest_ref = -1;
  529|  5.93k|                for (int i = 0, earliest_offset = INT_MAX; i < 8; i++) {
  ------------------
  |  Branch (529:60): [True: 5.27k, False: 657]
  ------------------
  530|  5.27k|                    const Dav1dFrameHeader *const refhdr = c->refs[i].p.p.frame_hdr;
  531|  5.27k|                    if (!refhdr) goto error;
  ------------------
  |  Branch (531:25): [True: 12, False: 5.26k]
  ------------------
  532|  5.26k|                    const int diff = get_poc_diff(seqhdr->order_hint_n_bits,
  533|  5.26k|                                                  refhdr->frame_offset,
  534|  5.26k|                                                  hdr->frame_offset);
  535|  5.26k|                    frame_offset[i] = diff;
  536|  5.26k|                    if (diff < earliest_offset) {
  ------------------
  |  Branch (536:25): [True: 730, False: 4.53k]
  ------------------
  537|    730|                        earliest_offset = diff;
  538|    730|                        earliest_ref = i;
  539|    730|                    }
  540|  5.26k|                }
  541|    657|                frame_offset[hdr->refidx[0]] = INT_MIN; // = reference frame is used
  542|    657|                frame_offset[hdr->refidx[3]] = INT_MIN;
  543|    657|                assert(earliest_ref >= 0);
  ------------------
  |  |  140|    657|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 657]
  |  |  |  Branch (140:68): [Folded, False: 657]
  |  |  ------------------
  ------------------
  544|       |
  545|    657|                int refidx = -1;
  546|  5.91k|                for (int i = 0, latest_offset = 0; i < 8; i++) {
  ------------------
  |  Branch (546:52): [True: 5.25k, False: 657]
  ------------------
  547|  5.25k|                    const int hint = frame_offset[i];
  548|  5.25k|                    if (hint >= latest_offset) {
  ------------------
  |  Branch (548:25): [True: 1.27k, False: 3.98k]
  ------------------
  549|  1.27k|                        latest_offset = hint;
  550|  1.27k|                        refidx = i;
  551|  1.27k|                    }
  552|  5.25k|                }
  553|    657|                frame_offset[refidx] = INT_MIN;
  554|    657|                hdr->refidx[6] = refidx;
  555|       |
  556|  1.97k|                for (int i = 4; i < 6; i++) {
  ------------------
  |  Branch (556:33): [True: 1.31k, False: 657]
  ------------------
  557|       |                    /* Unsigned compares to handle negative values. */
  558|  1.31k|                    unsigned earliest_offset = UINT8_MAX;
  559|  1.31k|                    refidx = -1;
  560|  11.8k|                    for (int j = 0; j < 8; j++) {
  ------------------
  |  Branch (560:37): [True: 10.5k, False: 1.31k]
  ------------------
  561|  10.5k|                        const unsigned hint = frame_offset[j];
  562|  10.5k|                        if (hint < earliest_offset) {
  ------------------
  |  Branch (562:29): [True: 482, False: 10.0k]
  ------------------
  563|    482|                            earliest_offset = hint;
  564|    482|                            refidx = j;
  565|    482|                        }
  566|  10.5k|                    }
  567|  1.31k|                    frame_offset[refidx] = INT_MIN;
  568|  1.31k|                    hdr->refidx[i] = refidx;
  569|  1.31k|                }
  570|       |
  571|  4.59k|                for (int i = 1; i < 7; i++) {
  ------------------
  |  Branch (571:33): [True: 3.94k, False: 657]
  ------------------
  572|  3.94k|                    refidx = hdr->refidx[i];
  573|  3.94k|                    if (refidx < 0) {
  ------------------
  |  Branch (573:25): [True: 2.55k, False: 1.39k]
  ------------------
  574|  2.55k|                        unsigned latest_offset = ~UINT8_MAX;
  575|  22.9k|                        for (int j = 0; j < 8; j++) {
  ------------------
  |  Branch (575:41): [True: 20.4k, False: 2.55k]
  ------------------
  576|  20.4k|                            const unsigned hint = frame_offset[j];
  577|  20.4k|                            if (hint >= latest_offset) {
  ------------------
  |  Branch (577:33): [True: 8.57k, False: 11.8k]
  ------------------
  578|  8.57k|                                latest_offset = hint;
  579|  8.57k|                                refidx = j;
  580|  8.57k|                            }
  581|  20.4k|                        }
  582|  2.55k|                        frame_offset[refidx] = INT_MIN;
  583|  2.55k|                        hdr->refidx[i] = refidx >= 0 ? refidx : earliest_ref;
  ------------------
  |  Branch (583:42): [True: 2.19k, False: 355]
  ------------------
  584|  2.55k|                    }
  585|  3.94k|                }
  586|    657|            }
  587|  4.89k|        }
  588|  42.0k|        for (int i = 0; i < 7; i++) {
  ------------------
  |  Branch (588:25): [True: 36.8k, False: 5.25k]
  ------------------
  589|  36.8k|            if (!hdr->frame_ref_short_signaling)
  ------------------
  |  Branch (589:17): [True: 32.2k, False: 4.58k]
  ------------------
  590|  32.2k|                hdr->refidx[i] = dav1d_get_bits(gb, 3);
  591|  36.8k|            if (seqhdr->frame_id_numbers_present) {
  ------------------
  |  Branch (591:17): [True: 49, False: 36.7k]
  ------------------
  592|     49|                const unsigned delta_ref_frame_id = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits) + 1;
  593|     49|                const unsigned ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id) & ((1 << seqhdr->frame_id_n_bits) - 1);
  594|     49|                Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->refidx[i]].p.p.frame_hdr;
  595|     49|                if (!ref_frame_hdr || ref_frame_hdr->frame_id != ref_frame_id) goto error;
  ------------------
  |  Branch (595:21): [True: 12, False: 37]
  |  Branch (595:39): [True: 21, False: 16]
  ------------------
  596|     49|            }
  597|  36.8k|        }
  598|  5.25k|        const int use_ref = !hdr->error_resilient_mode &&
  ------------------
  |  Branch (598:29): [True: 4.55k, False: 696]
  ------------------
  599|  4.55k|                            hdr->frame_size_override;
  ------------------
  |  Branch (599:29): [True: 206, False: 4.35k]
  ------------------
  600|  5.25k|        if (read_frame_size(c, gb, use_ref) < 0) goto error;
  ------------------
  |  Branch (600:13): [True: 5, False: 5.24k]
  ------------------
  601|  5.24k|        if (!hdr->force_integer_mv)
  ------------------
  |  Branch (601:13): [True: 4.08k, False: 1.16k]
  ------------------
  602|  4.08k|            hdr->hp = dav1d_get_bit(gb);
  603|  5.24k|        hdr->subpel_filter_mode = dav1d_get_bit(gb) ? DAV1D_FILTER_SWITCHABLE :
  ------------------
  |  Branch (603:35): [True: 1.57k, False: 3.67k]
  ------------------
  604|  5.24k|                                                      dav1d_get_bits(gb, 2);
  605|  5.24k|        hdr->switchable_motion_mode = dav1d_get_bit(gb);
  606|  5.24k|        if (!hdr->error_resilient_mode && seqhdr->ref_frame_mvs &&
  ------------------
  |  Branch (606:13): [True: 4.55k, False: 696]
  |  Branch (606:43): [True: 4.15k, False: 402]
  ------------------
  607|  4.15k|            seqhdr->order_hint && IS_INTER_OR_SWITCH(hdr))
  ------------------
  |  |   36|  4.15k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 4.15k, False: 0]
  |  |  ------------------
  ------------------
  |  Branch (607:13): [True: 4.15k, False: 0]
  ------------------
  608|  4.15k|        {
  609|  4.15k|            hdr->use_ref_frame_mvs = dav1d_get_bit(gb);
  610|  4.15k|        }
  611|  5.24k|    }
  612|       |#if DEBUG_FRAME_HDR
  613|       |    printf("HDR: post-frametype-specific-bits: off=%td\n",
  614|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
  615|       |#endif
  616|       |
  617|  22.6k|    if (!seqhdr->reduced_still_picture_header && !hdr->disable_cdf_update)
  ------------------
  |  Branch (617:9): [True: 11.9k, False: 10.7k]
  |  Branch (617:50): [True: 10.7k, False: 1.18k]
  ------------------
  618|  10.7k|        hdr->refresh_context = !dav1d_get_bit(gb);
  619|       |#if DEBUG_FRAME_HDR
  620|       |    printf("HDR: post-refresh_context: off=%td\n",
  621|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
  622|       |#endif
  623|       |
  624|       |    // tile data
  625|  22.6k|    hdr->tiling.uniform = dav1d_get_bit(gb);
  626|  22.6k|    const int sbsz_min1 = (64 << seqhdr->sb128) - 1;
  627|  22.6k|    const int sbsz_log2 = 6 + seqhdr->sb128;
  628|  22.6k|    const int sbw = (hdr->width[0] + sbsz_min1) >> sbsz_log2;
  629|  22.6k|    const int sbh = (hdr->height + sbsz_min1) >> sbsz_log2;
  630|  22.6k|    const int max_tile_width_sb = 4096 >> sbsz_log2;
  631|  22.6k|    const int max_tile_area_sb = 4096 * 2304 >> (2 * sbsz_log2);
  632|  22.6k|    hdr->tiling.min_log2_cols = tile_log2(max_tile_width_sb, sbw);
  633|  22.6k|    hdr->tiling.max_log2_cols = tile_log2(1, imin(sbw, DAV1D_MAX_TILE_COLS));
  ------------------
  |  |   41|  22.6k|#define DAV1D_MAX_TILE_COLS 64
  ------------------
  634|  22.6k|    hdr->tiling.max_log2_rows = tile_log2(1, imin(sbh, DAV1D_MAX_TILE_ROWS));
  ------------------
  |  |   42|  22.6k|#define DAV1D_MAX_TILE_ROWS 64
  ------------------
  635|  22.6k|    const int min_log2_tiles = imax(tile_log2(max_tile_area_sb, sbw * sbh),
  636|  22.6k|                              hdr->tiling.min_log2_cols);
  637|  22.6k|    if (hdr->tiling.uniform) {
  ------------------
  |  Branch (637:9): [True: 16.5k, False: 6.17k]
  ------------------
  638|  16.5k|        for (hdr->tiling.log2_cols = hdr->tiling.min_log2_cols;
  639|  17.3k|             hdr->tiling.log2_cols < hdr->tiling.max_log2_cols && dav1d_get_bit(gb);
  ------------------
  |  Branch (639:14): [True: 9.47k, False: 7.86k]
  |  Branch (639:67): [True: 827, False: 8.64k]
  ------------------
  640|  16.5k|             hdr->tiling.log2_cols++) ;
  641|  16.5k|        const int tile_w = 1 + ((sbw - 1) >> hdr->tiling.log2_cols);
  642|  16.5k|        hdr->tiling.cols = 0;
  643|  38.0k|        for (int sbx = 0; sbx < sbw; sbx += tile_w, hdr->tiling.cols++)
  ------------------
  |  Branch (643:27): [True: 21.5k, False: 16.5k]
  ------------------
  644|  21.5k|            hdr->tiling.col_start_sb[hdr->tiling.cols] = sbx;
  645|  16.5k|        hdr->tiling.min_log2_rows =
  646|  16.5k|            imax(min_log2_tiles - hdr->tiling.log2_cols, 0);
  647|       |
  648|  16.5k|        for (hdr->tiling.log2_rows = hdr->tiling.min_log2_rows;
  649|  17.9k|             hdr->tiling.log2_rows < hdr->tiling.max_log2_rows && dav1d_get_bit(gb);
  ------------------
  |  Branch (649:14): [True: 9.83k, False: 8.08k]
  |  Branch (649:67): [True: 1.40k, False: 8.42k]
  ------------------
  650|  16.5k|             hdr->tiling.log2_rows++) ;
  651|  16.5k|        const int tile_h = 1 + ((sbh - 1) >> hdr->tiling.log2_rows);
  652|  16.5k|        hdr->tiling.rows = 0;
  653|  35.4k|        for (int sby = 0; sby < sbh; sby += tile_h, hdr->tiling.rows++)
  ------------------
  |  Branch (653:27): [True: 18.9k, False: 16.5k]
  ------------------
  654|  18.9k|            hdr->tiling.row_start_sb[hdr->tiling.rows] = sby;
  655|  16.5k|    } else {
  656|  6.17k|        hdr->tiling.cols = 0;
  657|  6.17k|        int widest_tile = 0, max_tile_area_sb = sbw * sbh;
  658|  14.6k|        for (int sbx = 0; sbx < sbw && hdr->tiling.cols < DAV1D_MAX_TILE_COLS; hdr->tiling.cols++) {
  ------------------
  |  |   41|  8.47k|#define DAV1D_MAX_TILE_COLS 64
  ------------------
  |  Branch (658:27): [True: 8.47k, False: 6.15k]
  |  Branch (658:40): [True: 8.46k, False: 11]
  ------------------
  659|  8.46k|            const int tile_width_sb = imin(sbw - sbx, max_tile_width_sb);
  660|  8.46k|            const int tile_w = (tile_width_sb > 1) ? 1 + dav1d_get_uniform(gb, tile_width_sb) : 1;
  ------------------
  |  Branch (660:32): [True: 3.18k, False: 5.27k]
  ------------------
  661|  8.46k|            hdr->tiling.col_start_sb[hdr->tiling.cols] = sbx;
  662|  8.46k|            sbx += tile_w;
  663|  8.46k|            widest_tile = imax(widest_tile, tile_w);
  664|  8.46k|        }
  665|  6.17k|        hdr->tiling.log2_cols = tile_log2(1, hdr->tiling.cols);
  666|  6.17k|        if (min_log2_tiles) max_tile_area_sb >>= min_log2_tiles + 1;
  ------------------
  |  Branch (666:13): [True: 91, False: 6.07k]
  ------------------
  667|  6.17k|        const int max_tile_height_sb = imax(max_tile_area_sb / widest_tile, 1);
  668|       |
  669|  6.17k|        hdr->tiling.rows = 0;
  670|  18.9k|        for (int sby = 0; sby < sbh && hdr->tiling.rows < DAV1D_MAX_TILE_ROWS; hdr->tiling.rows++) {
  ------------------
  |  |   42|  12.7k|#define DAV1D_MAX_TILE_ROWS 64
  ------------------
  |  Branch (670:27): [True: 12.7k, False: 6.15k]
  |  Branch (670:40): [True: 12.7k, False: 14]
  ------------------
  671|  12.7k|            const int tile_height_sb = imin(sbh - sby, max_tile_height_sb);
  672|  12.7k|            const int tile_h = (tile_height_sb > 1) ? 1 + dav1d_get_uniform(gb, tile_height_sb) : 1;
  ------------------
  |  Branch (672:32): [True: 8.09k, False: 4.69k]
  ------------------
  673|  12.7k|            hdr->tiling.row_start_sb[hdr->tiling.rows] = sby;
  674|  12.7k|            sby += tile_h;
  675|  12.7k|        }
  676|  6.17k|        hdr->tiling.log2_rows = tile_log2(1, hdr->tiling.rows);
  677|  6.17k|    }
  678|  22.6k|    hdr->tiling.col_start_sb[hdr->tiling.cols] = sbw;
  679|  22.6k|    hdr->tiling.row_start_sb[hdr->tiling.rows] = sbh;
  680|  22.6k|    if (hdr->tiling.log2_cols || hdr->tiling.log2_rows) {
  ------------------
  |  Branch (680:9): [True: 2.62k, False: 20.0k]
  |  Branch (680:34): [True: 2.54k, False: 17.5k]
  ------------------
  681|  5.16k|        hdr->tiling.update = dav1d_get_bits(gb, hdr->tiling.log2_cols + hdr->tiling.log2_rows);
  682|  5.16k|        if (hdr->tiling.update >= hdr->tiling.cols * hdr->tiling.rows)
  ------------------
  |  Branch (682:13): [True: 10, False: 5.15k]
  ------------------
  683|     10|            goto error;
  684|  5.15k|        hdr->tiling.n_bytes = dav1d_get_bits(gb, 2) + 1;
  685|  5.15k|    }
  686|       |#if DEBUG_FRAME_HDR
  687|       |    printf("HDR: post-tiling: off=%td\n",
  688|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
  689|       |#endif
  690|       |
  691|       |    // quant data
  692|  22.6k|    hdr->quant.yac = dav1d_get_bits(gb, 8);
  693|  22.6k|    if (dav1d_get_bit(gb))
  ------------------
  |  Branch (693:9): [True: 5.25k, False: 17.4k]
  ------------------
  694|  5.25k|        hdr->quant.ydc_delta = dav1d_get_sbits(gb, 7);
  695|  22.6k|    if (!seqhdr->monochrome) {
  ------------------
  |  Branch (695:9): [True: 18.0k, False: 4.61k]
  ------------------
  696|       |        // If the sequence header says that delta_q might be different
  697|       |        // for U, V, we must check whether it actually is for this
  698|       |        // frame.
  699|  18.0k|        const int diff_uv_delta = seqhdr->separate_uv_delta_q ? dav1d_get_bit(gb) : 0;
  ------------------
  |  Branch (699:35): [True: 2.98k, False: 15.0k]
  ------------------
  700|  18.0k|        if (dav1d_get_bit(gb))
  ------------------
  |  Branch (700:13): [True: 3.41k, False: 14.6k]
  ------------------
  701|  3.41k|            hdr->quant.udc_delta = dav1d_get_sbits(gb, 7);
  702|  18.0k|        if (dav1d_get_bit(gb))
  ------------------
  |  Branch (702:13): [True: 4.02k, False: 14.0k]
  ------------------
  703|  4.02k|            hdr->quant.uac_delta = dav1d_get_sbits(gb, 7);
  704|  18.0k|        if (diff_uv_delta) {
  ------------------
  |  Branch (704:13): [True: 801, False: 17.2k]
  ------------------
  705|    801|            if (dav1d_get_bit(gb))
  ------------------
  |  Branch (705:17): [True: 396, False: 405]
  ------------------
  706|    396|                hdr->quant.vdc_delta = dav1d_get_sbits(gb, 7);
  707|    801|            if (dav1d_get_bit(gb))
  ------------------
  |  Branch (707:17): [True: 362, False: 439]
  ------------------
  708|    362|                hdr->quant.vac_delta = dav1d_get_sbits(gb, 7);
  709|  17.2k|        } else {
  710|  17.2k|            hdr->quant.vdc_delta = hdr->quant.udc_delta;
  711|  17.2k|            hdr->quant.vac_delta = hdr->quant.uac_delta;
  712|  17.2k|        }
  713|  18.0k|    }
  714|       |#if DEBUG_FRAME_HDR
  715|       |    printf("HDR: post-quant: off=%td\n",
  716|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
  717|       |#endif
  718|  22.6k|    hdr->quant.qm = dav1d_get_bit(gb);
  719|  22.6k|    if (hdr->quant.qm) {
  ------------------
  |  Branch (719:9): [True: 6.36k, False: 16.3k]
  ------------------
  720|  6.36k|        hdr->quant.qm_y = dav1d_get_bits(gb, 4);
  721|  6.36k|        hdr->quant.qm_u = dav1d_get_bits(gb, 4);
  722|  6.36k|        hdr->quant.qm_v = seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 4) :
  ------------------
  |  Branch (722:27): [True: 1.22k, False: 5.14k]
  ------------------
  723|  6.36k|                                                        hdr->quant.qm_u;
  724|  6.36k|    }
  725|       |#if DEBUG_FRAME_HDR
  726|       |    printf("HDR: post-qm: off=%td\n",
  727|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
  728|       |#endif
  729|       |
  730|       |    // segmentation data
  731|  22.6k|    hdr->segmentation.enabled = dav1d_get_bit(gb);
  732|  22.6k|    if (hdr->segmentation.enabled) {
  ------------------
  |  Branch (732:9): [True: 5.08k, False: 17.5k]
  ------------------
  733|  5.08k|        if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
  ------------------
  |  |   45|  5.08k|#define DAV1D_PRIMARY_REF_NONE 7
  ------------------
  |  Branch (733:13): [True: 4.47k, False: 609]
  ------------------
  734|  4.47k|            hdr->segmentation.update_map = 1;
  735|  4.47k|            hdr->segmentation.update_data = 1;
  736|  4.47k|        } else {
  737|    609|            hdr->segmentation.update_map = dav1d_get_bit(gb);
  738|    609|            if (hdr->segmentation.update_map)
  ------------------
  |  Branch (738:17): [True: 341, False: 268]
  ------------------
  739|    341|                hdr->segmentation.temporal = dav1d_get_bit(gb);
  740|    609|            hdr->segmentation.update_data = dav1d_get_bit(gb);
  741|    609|        }
  742|       |
  743|  5.08k|        if (hdr->segmentation.update_data) {
  ------------------
  |  Branch (743:13): [True: 4.65k, False: 429]
  ------------------
  744|  4.65k|            hdr->segmentation.seg_data.last_active_segid = -1;
  745|  41.8k|            for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) {
  ------------------
  |  |   43|  41.8k|#define DAV1D_MAX_SEGMENTS 8
  ------------------
  |  Branch (745:29): [True: 37.2k, False: 4.65k]
  ------------------
  746|  37.2k|                Dav1dSegmentationData *const seg =
  747|  37.2k|                    &hdr->segmentation.seg_data.d[i];
  748|  37.2k|                if (dav1d_get_bit(gb)) {
  ------------------
  |  Branch (748:21): [True: 17.0k, False: 20.1k]
  ------------------
  749|  17.0k|                    seg->delta_q = dav1d_get_sbits(gb, 9);
  750|  17.0k|                    hdr->segmentation.seg_data.last_active_segid = i;
  751|  17.0k|                }
  752|  37.2k|                if (dav1d_get_bit(gb)) {
  ------------------
  |  Branch (752:21): [True: 15.5k, False: 21.6k]
  ------------------
  753|  15.5k|                    seg->delta_lf_y_v = dav1d_get_sbits(gb, 7);
  754|  15.5k|                    hdr->segmentation.seg_data.last_active_segid = i;
  755|  15.5k|                }
  756|  37.2k|                if (dav1d_get_bit(gb)) {
  ------------------
  |  Branch (756:21): [True: 17.9k, False: 19.2k]
  ------------------
  757|  17.9k|                    seg->delta_lf_y_h = dav1d_get_sbits(gb, 7);
  758|  17.9k|                    hdr->segmentation.seg_data.last_active_segid = i;
  759|  17.9k|                }
  760|  37.2k|                if (dav1d_get_bit(gb)) {
  ------------------
  |  Branch (760:21): [True: 18.0k, False: 19.1k]
  ------------------
  761|  18.0k|                    seg->delta_lf_u = dav1d_get_sbits(gb, 7);
  762|  18.0k|                    hdr->segmentation.seg_data.last_active_segid = i;
  763|  18.0k|                }
  764|  37.2k|                if (dav1d_get_bit(gb)) {
  ------------------
  |  Branch (764:21): [True: 17.8k, False: 19.3k]
  ------------------
  765|  17.8k|                    seg->delta_lf_v = dav1d_get_sbits(gb, 7);
  766|  17.8k|                    hdr->segmentation.seg_data.last_active_segid = i;
  767|  17.8k|                }
  768|  37.2k|                if (dav1d_get_bit(gb)) {
  ------------------
  |  Branch (768:21): [True: 17.6k, False: 19.5k]
  ------------------
  769|  17.6k|                    seg->ref = dav1d_get_bits(gb, 3);
  770|  17.6k|                    hdr->segmentation.seg_data.last_active_segid = i;
  771|  17.6k|                    hdr->segmentation.seg_data.preskip = 1;
  772|  19.5k|                } else {
  773|  19.5k|                    seg->ref = -1;
  774|  19.5k|                }
  775|  37.2k|                if ((seg->skip = dav1d_get_bit(gb))) {
  ------------------
  |  Branch (775:21): [True: 14.8k, False: 22.4k]
  ------------------
  776|  14.8k|                    hdr->segmentation.seg_data.last_active_segid = i;
  777|  14.8k|                    hdr->segmentation.seg_data.preskip = 1;
  778|  14.8k|                }
  779|  37.2k|                if ((seg->globalmv = dav1d_get_bit(gb))) {
  ------------------
  |  Branch (779:21): [True: 14.6k, False: 22.6k]
  ------------------
  780|  14.6k|                    hdr->segmentation.seg_data.last_active_segid = i;
  781|  14.6k|                    hdr->segmentation.seg_data.preskip = 1;
  782|  14.6k|                }
  783|  37.2k|            }
  784|  4.65k|        } else {
  785|       |            // segmentation.update_data was false so we should copy
  786|       |            // segmentation data from the reference frame.
  787|    429|            assert(hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
  ------------------
  |  |  140|    429|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 429]
  |  |  |  Branch (140:68): [Folded, False: 429]
  |  |  ------------------
  ------------------
  788|    429|            const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
  789|    429|            if (!c->refs[pri_ref].p.p.frame_hdr) goto error;
  ------------------
  |  Branch (789:17): [True: 4, False: 425]
  ------------------
  790|    425|            hdr->segmentation.seg_data =
  791|    425|                c->refs[pri_ref].p.p.frame_hdr->segmentation.seg_data;
  792|    425|        }
  793|  17.5k|    } else {
  794|   158k|        for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++)
  ------------------
  |  |   43|   158k|#define DAV1D_MAX_SEGMENTS 8
  ------------------
  |  Branch (794:25): [True: 140k, False: 17.5k]
  ------------------
  795|   140k|            hdr->segmentation.seg_data.d[i].ref = -1;
  796|  17.5k|    }
  797|       |#if DEBUG_FRAME_HDR
  798|       |    printf("HDR: post-segmentation: off=%td\n",
  799|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
  800|       |#endif
  801|       |
  802|       |    // delta q
  803|  22.6k|    if (hdr->quant.yac) {
  ------------------
  |  Branch (803:9): [True: 19.7k, False: 2.86k]
  ------------------
  804|  19.7k|        hdr->delta.q.present = dav1d_get_bit(gb);
  805|  19.7k|        if (hdr->delta.q.present) {
  ------------------
  |  Branch (805:13): [True: 5.74k, False: 14.0k]
  ------------------
  806|  5.74k|            hdr->delta.q.res_log2 = dav1d_get_bits(gb, 2);
  807|  5.74k|            if (!hdr->allow_intrabc) {
  ------------------
  |  Branch (807:17): [True: 4.51k, False: 1.23k]
  ------------------
  808|  4.51k|                hdr->delta.lf.present = dav1d_get_bit(gb);
  809|  4.51k|                if (hdr->delta.lf.present) {
  ------------------
  |  Branch (809:21): [True: 2.01k, False: 2.49k]
  ------------------
  810|  2.01k|                    hdr->delta.lf.res_log2 = dav1d_get_bits(gb, 2);
  811|  2.01k|                    hdr->delta.lf.multi = dav1d_get_bit(gb);
  812|  2.01k|                }
  813|  4.51k|            }
  814|  5.74k|        }
  815|  19.7k|    }
  816|       |#if DEBUG_FRAME_HDR
  817|       |    printf("HDR: post-delta_q_lf_flags: off=%td\n",
  818|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
  819|       |#endif
  820|       |
  821|       |    // derive lossless flags
  822|  22.6k|    const int delta_lossless = !hdr->quant.ydc_delta && !hdr->quant.udc_delta &&
  ------------------
  |  Branch (822:32): [True: 17.7k, False: 4.87k]
  |  Branch (822:57): [True: 16.0k, False: 1.70k]
  ------------------
  823|  16.0k|        !hdr->quant.uac_delta && !hdr->quant.vdc_delta && !hdr->quant.vac_delta;
  ------------------
  |  Branch (823:9): [True: 14.4k, False: 1.63k]
  |  Branch (823:34): [True: 14.3k, False: 50]
  |  Branch (823:59): [True: 14.3k, False: 20]
  ------------------
  824|  22.6k|    hdr->all_lossless = 1;
  825|   203k|    for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) {
  ------------------
  |  |   43|   203k|#define DAV1D_MAX_SEGMENTS 8
  ------------------
  |  Branch (825:21): [True: 181k, False: 22.6k]
  ------------------
  826|   181k|        hdr->segmentation.qidx[i] = hdr->segmentation.enabled ?
  ------------------
  |  Branch (826:37): [True: 40.6k, False: 140k]
  ------------------
  827|  40.6k|            iclip_u8(hdr->quant.yac + hdr->segmentation.seg_data.d[i].delta_q) :
  828|   181k|            hdr->quant.yac;
  829|   181k|        hdr->segmentation.lossless[i] =
  830|   181k|            !hdr->segmentation.qidx[i] && delta_lossless;
  ------------------
  |  Branch (830:13): [True: 26.0k, False: 155k]
  |  Branch (830:43): [True: 21.7k, False: 4.33k]
  ------------------
  831|   181k|        hdr->all_lossless &= hdr->segmentation.lossless[i];
  832|   181k|    }
  833|       |
  834|       |    // loopfilter
  835|  22.6k|    if (hdr->all_lossless || hdr->allow_intrabc) {
  ------------------
  |  Branch (835:9): [True: 2.53k, False: 20.1k]
  |  Branch (835:30): [True: 3.17k, False: 16.9k]
  ------------------
  836|  5.71k|        hdr->loopfilter.mode_ref_delta_enabled = 1;
  837|  5.71k|        hdr->loopfilter.mode_ref_delta_update = 1;
  838|  5.71k|        hdr->loopfilter.mode_ref_deltas = default_mode_ref_deltas;
  839|  16.9k|    } else {
  840|  16.9k|        hdr->loopfilter.level_y[0] = dav1d_get_bits(gb, 6);
  841|  16.9k|        hdr->loopfilter.level_y[1] = dav1d_get_bits(gb, 6);
  842|  16.9k|        if (!seqhdr->monochrome &&
  ------------------
  |  Branch (842:13): [True: 14.4k, False: 2.50k]
  ------------------
  843|  14.4k|            (hdr->loopfilter.level_y[0] || hdr->loopfilter.level_y[1]))
  ------------------
  |  Branch (843:14): [True: 8.38k, False: 6.05k]
  |  Branch (843:44): [True: 944, False: 5.11k]
  ------------------
  844|  9.33k|        {
  845|  9.33k|            hdr->loopfilter.level_u = dav1d_get_bits(gb, 6);
  846|  9.33k|            hdr->loopfilter.level_v = dav1d_get_bits(gb, 6);
  847|  9.33k|        }
  848|  16.9k|        hdr->loopfilter.sharpness = dav1d_get_bits(gb, 3);
  849|       |
  850|  16.9k|        if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
  ------------------
  |  |   45|  16.9k|#define DAV1D_PRIMARY_REF_NONE 7
  ------------------
  |  Branch (850:13): [True: 15.0k, False: 1.89k]
  ------------------
  851|  15.0k|            hdr->loopfilter.mode_ref_deltas = default_mode_ref_deltas;
  852|  15.0k|        } else {
  853|  1.89k|            const int ref = hdr->refidx[hdr->primary_ref_frame];
  854|  1.89k|            if (!c->refs[ref].p.p.frame_hdr) goto error;
  ------------------
  |  Branch (854:17): [True: 4, False: 1.89k]
  ------------------
  855|  1.89k|            hdr->loopfilter.mode_ref_deltas =
  856|  1.89k|                c->refs[ref].p.p.frame_hdr->loopfilter.mode_ref_deltas;
  857|  1.89k|        }
  858|  16.9k|        hdr->loopfilter.mode_ref_delta_enabled = dav1d_get_bit(gb);
  859|  16.9k|        if (hdr->loopfilter.mode_ref_delta_enabled) {
  ------------------
  |  Branch (859:13): [True: 9.27k, False: 7.67k]
  ------------------
  860|  9.27k|            hdr->loopfilter.mode_ref_delta_update = dav1d_get_bit(gb);
  861|  9.27k|            if (hdr->loopfilter.mode_ref_delta_update) {
  ------------------
  |  Branch (861:17): [True: 2.18k, False: 7.08k]
  ------------------
  862|  19.7k|                for (int i = 0; i < 8; i++)
  ------------------
  |  Branch (862:33): [True: 17.5k, False: 2.18k]
  ------------------
  863|  17.5k|                    if (dav1d_get_bit(gb))
  ------------------
  |  Branch (863:25): [True: 8.64k, False: 8.86k]
  ------------------
  864|  8.64k|                        hdr->loopfilter.mode_ref_deltas.ref_delta[i] =
  865|  8.64k|                            dav1d_get_sbits(gb, 7);
  866|  6.56k|                for (int i = 0; i < 2; i++)
  ------------------
  |  Branch (866:33): [True: 4.37k, False: 2.18k]
  ------------------
  867|  4.37k|                    if (dav1d_get_bit(gb))
  ------------------
  |  Branch (867:25): [True: 2.06k, False: 2.31k]
  ------------------
  868|  2.06k|                        hdr->loopfilter.mode_ref_deltas.mode_delta[i] =
  869|  2.06k|                            dav1d_get_sbits(gb, 7);
  870|  2.18k|            }
  871|  9.27k|        }
  872|  16.9k|    }
  873|       |#if DEBUG_FRAME_HDR
  874|       |    printf("HDR: post-lpf: off=%td\n",
  875|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
  876|       |#endif
  877|       |
  878|       |    // cdef
  879|  22.6k|    if (!hdr->all_lossless && seqhdr->cdef && !hdr->allow_intrabc) {
  ------------------
  |  Branch (879:9): [True: 20.1k, False: 2.53k]
  |  Branch (879:31): [True: 13.1k, False: 6.96k]
  |  Branch (879:47): [True: 11.6k, False: 1.47k]
  ------------------
  880|  11.6k|        hdr->cdef.damping = dav1d_get_bits(gb, 2) + 3;
  881|  11.6k|        hdr->cdef.n_bits = dav1d_get_bits(gb, 2);
  882|  34.2k|        for (int i = 0; i < (1 << hdr->cdef.n_bits); i++) {
  ------------------
  |  Branch (882:25): [True: 22.5k, False: 11.6k]
  ------------------
  883|  22.5k|            hdr->cdef.y_strength[i] = dav1d_get_bits(gb, 6);
  884|  22.5k|            if (!seqhdr->monochrome)
  ------------------
  |  Branch (884:17): [True: 17.5k, False: 5.02k]
  ------------------
  885|  17.5k|                hdr->cdef.uv_strength[i] = dav1d_get_bits(gb, 6);
  886|  22.5k|        }
  887|  11.6k|    }
  888|       |#if DEBUG_FRAME_HDR
  889|       |    printf("HDR: post-cdef: off=%td\n",
  890|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
  891|       |#endif
  892|       |
  893|       |    // restoration
  894|  22.6k|    if ((!hdr->all_lossless || hdr->super_res.enabled) &&
  ------------------
  |  Branch (894:10): [True: 20.1k, False: 2.53k]
  |  Branch (894:32): [True: 71, False: 2.46k]
  ------------------
  895|  20.1k|        seqhdr->restoration && !hdr->allow_intrabc)
  ------------------
  |  Branch (895:9): [True: 8.94k, False: 11.2k]
  |  Branch (895:32): [True: 7.12k, False: 1.81k]
  ------------------
  896|  7.12k|    {
  897|  7.12k|        hdr->restoration.type[0] = dav1d_get_bits(gb, 2);
  898|  7.12k|        if (!seqhdr->monochrome) {
  ------------------
  |  Branch (898:13): [True: 5.36k, False: 1.75k]
  ------------------
  899|  5.36k|            hdr->restoration.type[1] = dav1d_get_bits(gb, 2);
  900|  5.36k|            hdr->restoration.type[2] = dav1d_get_bits(gb, 2);
  901|  5.36k|        }
  902|       |
  903|  7.12k|        if (hdr->restoration.type[0] || hdr->restoration.type[1] ||
  ------------------
  |  Branch (903:13): [True: 4.52k, False: 2.60k]
  |  Branch (903:41): [True: 504, False: 2.09k]
  ------------------
  904|  2.09k|            hdr->restoration.type[2])
  ------------------
  |  Branch (904:13): [True: 244, False: 1.85k]
  ------------------
  905|  5.26k|        {
  906|       |            // Log2 of the restoration unit size.
  907|  5.26k|            hdr->restoration.unit_size[0] = 6 + seqhdr->sb128;
  908|  5.26k|            if (dav1d_get_bit(gb)) {
  ------------------
  |  Branch (908:17): [True: 2.49k, False: 2.77k]
  ------------------
  909|  2.49k|                hdr->restoration.unit_size[0]++;
  910|  2.49k|                if (!seqhdr->sb128)
  ------------------
  |  Branch (910:21): [True: 1.54k, False: 947]
  ------------------
  911|  1.54k|                    hdr->restoration.unit_size[0] += dav1d_get_bit(gb);
  912|  2.49k|            }
  913|  5.26k|            hdr->restoration.unit_size[1] = hdr->restoration.unit_size[0];
  914|  5.26k|            if ((hdr->restoration.type[1] || hdr->restoration.type[2]) &&
  ------------------
  |  Branch (914:18): [True: 3.06k, False: 2.20k]
  |  Branch (914:46): [True: 833, False: 1.36k]
  ------------------
  915|  3.90k|                seqhdr->ss_hor == 1 && seqhdr->ss_ver == 1)
  ------------------
  |  Branch (915:17): [True: 721, False: 3.18k]
  |  Branch (915:40): [True: 509, False: 212]
  ------------------
  916|    509|            {
  917|    509|                hdr->restoration.unit_size[1] -= dav1d_get_bit(gb);
  918|    509|            }
  919|  5.26k|        } else {
  920|  1.85k|            hdr->restoration.unit_size[0] = 8;
  921|  1.85k|        }
  922|  7.12k|    }
  923|       |#if DEBUG_FRAME_HDR
  924|       |    printf("HDR: post-restoration: off=%td\n",
  925|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
  926|       |#endif
  927|       |
  928|  22.6k|    if (!hdr->all_lossless)
  ------------------
  |  Branch (928:9): [True: 20.1k, False: 2.53k]
  ------------------
  929|  20.1k|        hdr->txfm_mode = dav1d_get_bit(gb) ? DAV1D_TX_SWITCHABLE : DAV1D_TX_LARGEST;
  ------------------
  |  Branch (929:26): [True: 7.21k, False: 12.9k]
  ------------------
  930|       |#if DEBUG_FRAME_HDR
  931|       |    printf("HDR: post-txfmmode: off=%td\n",
  932|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
  933|       |#endif
  934|  22.6k|    if (IS_INTER_OR_SWITCH(hdr))
  ------------------
  |  |   36|  22.6k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 5.23k, False: 17.4k]
  |  |  ------------------
  ------------------
  935|  5.23k|        hdr->switchable_comp_refs = dav1d_get_bit(gb);
  936|       |#if DEBUG_FRAME_HDR
  937|       |    printf("HDR: post-refmode: off=%td\n",
  938|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
  939|       |#endif
  940|  22.6k|    if (hdr->switchable_comp_refs && IS_INTER_OR_SWITCH(hdr) && seqhdr->order_hint) {
  ------------------
  |  |   36|  24.5k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 1.86k, False: 0]
  |  |  ------------------
  ------------------
  |  Branch (940:9): [True: 1.86k, False: 20.7k]
  |  Branch (940:65): [True: 1.67k, False: 189]
  ------------------
  941|  1.67k|        const int poc = hdr->frame_offset;
  942|  1.67k|        int off_before = -1, off_after = -1;
  943|  1.67k|        int off_before_idx, off_after_idx;
  944|  13.3k|        for (int i = 0; i < 7; i++) {
  ------------------
  |  Branch (944:25): [True: 11.6k, False: 1.67k]
  ------------------
  945|  11.6k|            if (!c->refs[hdr->refidx[i]].p.p.frame_hdr) goto error;
  ------------------
  |  Branch (945:17): [True: 5, False: 11.6k]
  ------------------
  946|  11.6k|            const int refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
  947|       |
  948|  11.6k|            const int diff = get_poc_diff(seqhdr->order_hint_n_bits, refpoc, poc);
  949|  11.6k|            if (diff > 0) {
  ------------------
  |  Branch (949:17): [True: 1.05k, False: 10.6k]
  ------------------
  950|  1.05k|                if (off_after < 0 || get_poc_diff(seqhdr->order_hint_n_bits,
  ------------------
  |  Branch (950:21): [True: 270, False: 782]
  |  Branch (950:38): [True: 16, False: 766]
  ------------------
  951|    782|                                                  off_after, refpoc) > 0)
  952|    286|                {
  953|    286|                    off_after = refpoc;
  954|    286|                    off_after_idx = i;
  955|    286|                }
  956|  10.6k|            } else if (diff < 0 && (off_before < 0 ||
  ------------------
  |  Branch (956:24): [True: 9.87k, False: 762]
  |  Branch (956:37): [True: 1.48k, False: 8.38k]
  ------------------
  957|  8.38k|                                    get_poc_diff(seqhdr->order_hint_n_bits,
  ------------------
  |  Branch (957:37): [True: 34, False: 8.35k]
  ------------------
  958|  8.38k|                                                 refpoc, off_before) > 0))
  959|  1.52k|            {
  960|  1.52k|                off_before = refpoc;
  961|  1.52k|                off_before_idx = i;
  962|  1.52k|            }
  963|  11.6k|        }
  964|       |
  965|  1.67k|        if ((off_before | off_after) >= 0) {
  ------------------
  |  Branch (965:13): [True: 146, False: 1.52k]
  ------------------
  966|    146|            hdr->skip_mode_refs[0] = imin(off_before_idx, off_after_idx);
  967|    146|            hdr->skip_mode_refs[1] = imax(off_before_idx, off_after_idx);
  968|    146|            hdr->skip_mode_allowed = 1;
  969|  1.52k|        } else if (off_before >= 0) {
  ------------------
  |  Branch (969:20): [True: 1.34k, False: 183]
  ------------------
  970|  1.34k|            int off_before2 = -1;
  971|  1.34k|            int off_before2_idx;
  972|  10.7k|            for (int i = 0; i < 7; i++) {
  ------------------
  |  Branch (972:29): [True: 9.38k, False: 1.34k]
  ------------------
  973|  9.38k|                if (!c->refs[hdr->refidx[i]].p.p.frame_hdr) goto error;
  ------------------
  |  Branch (973:21): [True: 0, False: 9.38k]
  ------------------
  974|  9.38k|                const int refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
  975|  9.38k|                if (get_poc_diff(seqhdr->order_hint_n_bits,
  ------------------
  |  Branch (975:21): [True: 384, False: 9.00k]
  ------------------
  976|  9.38k|                                 refpoc, off_before) < 0) {
  977|    384|                    if (off_before2 < 0 || get_poc_diff(seqhdr->order_hint_n_bits,
  ------------------
  |  Branch (977:25): [True: 76, False: 308]
  |  Branch (977:44): [True: 24, False: 284]
  ------------------
  978|    308|                                                        refpoc, off_before2) > 0)
  979|    100|                    {
  980|    100|                        off_before2 = refpoc;
  981|    100|                        off_before2_idx = i;
  982|    100|                    }
  983|    384|                }
  984|  9.38k|            }
  985|       |
  986|  1.34k|            if (off_before2 >= 0) {
  ------------------
  |  Branch (986:17): [True: 76, False: 1.26k]
  ------------------
  987|     76|                hdr->skip_mode_refs[0] = imin(off_before_idx, off_before2_idx);
  988|     76|                hdr->skip_mode_refs[1] = imax(off_before_idx, off_before2_idx);
  989|     76|                hdr->skip_mode_allowed = 1;
  990|     76|            }
  991|  1.34k|        }
  992|  1.67k|    }
  993|  22.6k|    if (hdr->skip_mode_allowed)
  ------------------
  |  Branch (993:9): [True: 222, False: 22.4k]
  ------------------
  994|    222|        hdr->skip_mode_enabled = dav1d_get_bit(gb);
  995|       |#if DEBUG_FRAME_HDR
  996|       |    printf("HDR: post-extskip: off=%td\n",
  997|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
  998|       |#endif
  999|  22.6k|    if (!hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) && seqhdr->warped_motion)
  ------------------
  |  |   36|  27.5k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 4.54k, False: 390]
  |  |  ------------------
  ------------------
  |  Branch (999:9): [True: 4.93k, False: 17.7k]
  |  Branch (999:66): [True: 3.51k, False: 1.02k]
  ------------------
 1000|  3.51k|        hdr->warp_motion = dav1d_get_bit(gb);
 1001|       |#if DEBUG_FRAME_HDR
 1002|       |    printf("HDR: post-warpmotionbit: off=%td\n",
 1003|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
 1004|       |#endif
 1005|  22.6k|    hdr->reduced_txtp_set = dav1d_get_bit(gb);
 1006|       |#if DEBUG_FRAME_HDR
 1007|       |    printf("HDR: post-reducedtxtpset: off=%td\n",
 1008|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
 1009|       |#endif
 1010|       |
 1011|   181k|    for (int i = 0; i < 7; i++)
  ------------------
  |  Branch (1011:21): [True: 158k, False: 22.6k]
  ------------------
 1012|   158k|        hdr->gmv[i] = dav1d_default_wm_params;
 1013|       |
 1014|  22.6k|    if (IS_INTER_OR_SWITCH(hdr)) {
  ------------------
  |  |   36|  22.6k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (36:5): [True: 5.23k, False: 17.4k]
  |  |  ------------------
  ------------------
 1015|  41.8k|        for (int i = 0; i < 7; i++) {
  ------------------
  |  Branch (1015:25): [True: 36.6k, False: 5.23k]
  ------------------
 1016|  36.6k|            hdr->gmv[i].type = !dav1d_get_bit(gb) ? DAV1D_WM_TYPE_IDENTITY :
  ------------------
  |  Branch (1016:32): [True: 31.7k, False: 4.84k]
  ------------------
 1017|  36.6k|                                dav1d_get_bit(gb) ? DAV1D_WM_TYPE_ROT_ZOOM :
  ------------------
  |  Branch (1017:33): [True: 2.68k, False: 2.16k]
  ------------------
 1018|  4.84k|                                dav1d_get_bit(gb) ? DAV1D_WM_TYPE_TRANSLATION :
  ------------------
  |  Branch (1018:33): [True: 915, False: 1.24k]
  ------------------
 1019|  2.16k|                                                    DAV1D_WM_TYPE_AFFINE;
 1020|       |
 1021|  36.6k|            if (hdr->gmv[i].type == DAV1D_WM_TYPE_IDENTITY) continue;
  ------------------
  |  Branch (1021:17): [True: 31.7k, False: 4.84k]
  ------------------
 1022|       |
 1023|  4.84k|            const Dav1dWarpedMotionParams *ref_gmv;
 1024|  4.84k|            if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
  ------------------
  |  |   45|  4.84k|#define DAV1D_PRIMARY_REF_NONE 7
  ------------------
  |  Branch (1024:17): [True: 2.40k, False: 2.44k]
  ------------------
 1025|  2.40k|                ref_gmv = &dav1d_default_wm_params;
 1026|  2.44k|            } else {
 1027|  2.44k|                const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
 1028|  2.44k|                if (!c->refs[pri_ref].p.p.frame_hdr) goto error;
  ------------------
  |  Branch (1028:21): [True: 3, False: 2.43k]
  ------------------
 1029|  2.43k|                ref_gmv = &c->refs[pri_ref].p.p.frame_hdr->gmv[i];
 1030|  2.43k|            }
 1031|  4.83k|            int32_t *const mat = hdr->gmv[i].matrix;
 1032|  4.83k|            const int32_t *const ref_mat = ref_gmv->matrix;
 1033|  4.83k|            int bits, shift;
 1034|       |
 1035|  4.83k|            if (hdr->gmv[i].type >= DAV1D_WM_TYPE_ROT_ZOOM) {
  ------------------
  |  Branch (1035:17): [True: 3.92k, False: 915]
  ------------------
 1036|  3.92k|                mat[2] = (1 << 16) + 2 *
 1037|  3.92k|                    dav1d_get_bits_subexp(gb, (ref_mat[2] - (1 << 16)) >> 1, 12);
 1038|  3.92k|                mat[3] = 2 * dav1d_get_bits_subexp(gb, ref_mat[3] >> 1, 12);
 1039|       |
 1040|  3.92k|                bits = 12;
 1041|  3.92k|                shift = 10;
 1042|  3.92k|            } else {
 1043|    915|                bits = 9 - !hdr->hp;
 1044|    915|                shift = 13 + !hdr->hp;
 1045|    915|            }
 1046|       |
 1047|  4.83k|            if (hdr->gmv[i].type == DAV1D_WM_TYPE_AFFINE) {
  ------------------
  |  Branch (1047:17): [True: 1.24k, False: 3.59k]
  ------------------
 1048|  1.24k|                mat[4] = 2 * dav1d_get_bits_subexp(gb, ref_mat[4] >> 1, 12);
 1049|  1.24k|                mat[5] = (1 << 16) + 2 *
 1050|  1.24k|                    dav1d_get_bits_subexp(gb, (ref_mat[5] - (1 << 16)) >> 1, 12);
 1051|  3.59k|            } else {
 1052|  3.59k|                mat[4] = -mat[3];
 1053|  3.59k|                mat[5] = mat[2];
 1054|  3.59k|            }
 1055|       |
 1056|  4.83k|            mat[0] = dav1d_get_bits_subexp(gb, ref_mat[0] >> shift, bits) * (1 << shift);
 1057|  4.83k|            mat[1] = dav1d_get_bits_subexp(gb, ref_mat[1] >> shift, bits) * (1 << shift);
 1058|  4.83k|        }
 1059|  5.23k|    }
 1060|       |#if DEBUG_FRAME_HDR
 1061|       |    printf("HDR: post-gmv: off=%td\n",
 1062|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
 1063|       |#endif
 1064|       |
 1065|  22.6k|    if (seqhdr->film_grain_present && (hdr->show_frame || hdr->showable_frame)) {
  ------------------
  |  Branch (1065:9): [True: 2.62k, False: 20.0k]
  |  Branch (1065:40): [True: 2.34k, False: 282]
  |  Branch (1065:59): [True: 190, False: 92]
  ------------------
 1066|  2.53k|        hdr->film_grain.present = dav1d_get_bit(gb);
 1067|  2.53k|        if (hdr->film_grain.present) {
  ------------------
  |  Branch (1067:13): [True: 486, False: 2.04k]
  ------------------
 1068|    486|            const unsigned seed = dav1d_get_bits(gb, 16);
 1069|    486|            hdr->film_grain.update = hdr->frame_type != DAV1D_FRAME_TYPE_INTER || dav1d_get_bit(gb);
  ------------------
  |  Branch (1069:38): [True: 452, False: 34]
  |  Branch (1069:83): [True: 4, False: 30]
  ------------------
 1070|    486|            if (!hdr->film_grain.update) {
  ------------------
  |  Branch (1070:17): [True: 30, False: 456]
  ------------------
 1071|     30|                const int refidx = dav1d_get_bits(gb, 3);
 1072|     30|                int i;
 1073|    131|                for (i = 0; i < 7; i++)
  ------------------
  |  Branch (1073:29): [True: 128, False: 3]
  ------------------
 1074|    128|                    if (hdr->refidx[i] == refidx)
  ------------------
  |  Branch (1074:25): [True: 27, False: 101]
  ------------------
 1075|     27|                        break;
 1076|     30|                if (i == 7 || !c->refs[refidx].p.p.frame_hdr) goto error;
  ------------------
  |  Branch (1076:21): [True: 3, False: 27]
  |  Branch (1076:31): [True: 2, False: 25]
  ------------------
 1077|     25|                hdr->film_grain.data = c->refs[refidx].p.p.frame_hdr->film_grain.data;
 1078|     25|                hdr->film_grain.data.seed = seed;
 1079|    456|            } else {
 1080|    456|                Dav1dFilmGrainData *const fgd = &hdr->film_grain.data;
 1081|    456|                fgd->seed = seed;
 1082|       |
 1083|    456|                fgd->num_y_points = dav1d_get_bits(gb, 4);
 1084|    456|                if (fgd->num_y_points > 14) goto error;
  ------------------
  |  Branch (1084:21): [True: 3, False: 453]
  ------------------
 1085|    992|                for (int i = 0; i < fgd->num_y_points; i++) {
  ------------------
  |  Branch (1085:33): [True: 557, False: 435]
  ------------------
 1086|    557|                    fgd->y_points[i][0] = dav1d_get_bits(gb, 8);
 1087|    557|                    if (i && fgd->y_points[i - 1][0] >= fgd->y_points[i][0])
  ------------------
  |  Branch (1087:25): [True: 249, False: 308]
  |  Branch (1087:30): [True: 18, False: 231]
  ------------------
 1088|     18|                        goto error;
 1089|    539|                    fgd->y_points[i][1] = dav1d_get_bits(gb, 8);
 1090|    539|                }
 1091|       |
 1092|    435|                if (!seqhdr->monochrome)
  ------------------
  |  Branch (1092:21): [True: 333, False: 102]
  ------------------
 1093|    333|                    fgd->chroma_scaling_from_luma = dav1d_get_bit(gb);
 1094|    435|                if (seqhdr->monochrome || fgd->chroma_scaling_from_luma ||
  ------------------
  |  Branch (1094:21): [True: 102, False: 333]
  |  Branch (1094:43): [True: 234, False: 99]
  ------------------
 1095|     99|                    (seqhdr->ss_ver == 1 && seqhdr->ss_hor == 1 && !fgd->num_y_points))
  ------------------
  |  Branch (1095:22): [True: 20, False: 79]
  |  Branch (1095:45): [True: 20, False: 0]
  |  Branch (1095:68): [True: 13, False: 7]
  ------------------
 1096|    349|                {
 1097|    349|                    fgd->num_uv_points[0] = fgd->num_uv_points[1] = 0;
 1098|    349|                } else for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1098:41): [True: 155, False: 62]
  ------------------
 1099|    155|                    fgd->num_uv_points[pl] = dav1d_get_bits(gb, 4);
 1100|    155|                    if (fgd->num_uv_points[pl] > 10) goto error;
  ------------------
  |  Branch (1100:25): [True: 10, False: 145]
  ------------------
 1101|    276|                    for (int i = 0; i < fgd->num_uv_points[pl]; i++) {
  ------------------
  |  Branch (1101:37): [True: 145, False: 131]
  ------------------
 1102|    145|                        fgd->uv_points[pl][i][0] = dav1d_get_bits(gb, 8);
 1103|    145|                        if (i && fgd->uv_points[pl][i - 1][0] >= fgd->uv_points[pl][i][0])
  ------------------
  |  Branch (1103:29): [True: 59, False: 86]
  |  Branch (1103:34): [True: 14, False: 45]
  ------------------
 1104|     14|                            goto error;
 1105|    131|                        fgd->uv_points[pl][i][1] = dav1d_get_bits(gb, 8);
 1106|    131|                    }
 1107|    145|                }
 1108|       |
 1109|    411|                if (seqhdr->ss_hor == 1 && seqhdr->ss_ver == 1 &&
  ------------------
  |  Branch (1109:21): [True: 204, False: 207]
  |  Branch (1109:44): [True: 195, False: 9]
  ------------------
 1110|    195|                    !!fgd->num_uv_points[0] != !!fgd->num_uv_points[1])
  ------------------
  |  Branch (1110:21): [True: 2, False: 193]
  ------------------
 1111|      2|                {
 1112|      2|                    goto error;
 1113|      2|                }
 1114|       |
 1115|    409|                fgd->scaling_shift = dav1d_get_bits(gb, 2) + 8;
 1116|    409|                fgd->ar_coeff_lag = dav1d_get_bits(gb, 2);
 1117|    409|                const int num_y_pos = 2 * fgd->ar_coeff_lag * (fgd->ar_coeff_lag + 1);
 1118|    409|                if (fgd->num_y_points)
  ------------------
  |  Branch (1118:21): [True: 278, False: 131]
  ------------------
 1119|  2.45k|                    for (int i = 0; i < num_y_pos; i++)
  ------------------
  |  Branch (1119:37): [True: 2.17k, False: 278]
  ------------------
 1120|  2.17k|                        fgd->ar_coeffs_y[i] = dav1d_get_bits(gb, 8) - 128;
 1121|  1.22k|                for (int pl = 0; pl < 2; pl++)
  ------------------
  |  Branch (1121:34): [True: 818, False: 409]
  ------------------
 1122|    818|                    if (fgd->num_uv_points[pl] || fgd->chroma_scaling_from_luma) {
  ------------------
  |  Branch (1122:25): [True: 65, False: 753]
  |  Branch (1122:51): [True: 468, False: 285]
  ------------------
 1123|    533|                        const int num_uv_pos = num_y_pos + !!fgd->num_y_points;
 1124|  5.18k|                        for (int i = 0; i < num_uv_pos; i++)
  ------------------
  |  Branch (1124:41): [True: 4.65k, False: 533]
  ------------------
 1125|  4.65k|                            fgd->ar_coeffs_uv[pl][i] = dav1d_get_bits(gb, 8) - 128;
 1126|    533|                        if (!fgd->num_y_points)
  ------------------
  |  Branch (1126:29): [True: 182, False: 351]
  ------------------
 1127|    182|                            fgd->ar_coeffs_uv[pl][num_uv_pos] = 0;
 1128|    533|                    }
 1129|    409|                fgd->ar_coeff_shift = dav1d_get_bits(gb, 2) + 6;
 1130|    409|                fgd->grain_scale_shift = dav1d_get_bits(gb, 2);
 1131|  1.22k|                for (int pl = 0; pl < 2; pl++)
  ------------------
  |  Branch (1131:34): [True: 818, False: 409]
  ------------------
 1132|    818|                    if (fgd->num_uv_points[pl]) {
  ------------------
  |  Branch (1132:25): [True: 65, False: 753]
  ------------------
 1133|     65|                        fgd->uv_mult[pl] = dav1d_get_bits(gb, 8) - 128;
 1134|     65|                        fgd->uv_luma_mult[pl] = dav1d_get_bits(gb, 8) - 128;
 1135|     65|                        fgd->uv_offset[pl] = dav1d_get_bits(gb, 9) - 256;
 1136|     65|                    }
 1137|    409|                fgd->overlap_flag = dav1d_get_bit(gb);
 1138|    409|                fgd->clip_to_restricted_range = dav1d_get_bit(gb);
 1139|    409|            }
 1140|    486|        }
 1141|  2.53k|    }
 1142|       |#if DEBUG_FRAME_HDR
 1143|       |    printf("HDR: post-filmgrain: off=%td\n",
 1144|       |           (gb->ptr - init_ptr) * 8 - gb->bits_left);
 1145|       |#endif
 1146|       |
 1147|  22.6k|    return 0;
 1148|       |
 1149|    146|error:
 1150|    146|    dav1d_log(c, "Error parsing frame header\n");
  ------------------
  |  |   39|    146|#define dav1d_log dav1d_log
  ------------------
 1151|    146|    return DAV1D_ERR(EINVAL);
  ------------------
  |  |   56|    146|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
 1152|  22.6k|}
obu.c:read_frame_size:
  343|  22.6k|{
  344|  22.6k|    const Dav1dSequenceHeader *const seqhdr = c->seq_hdr;
  345|  22.6k|    Dav1dFrameHeader *const hdr = c->frame_hdr;
  346|       |
  347|  22.6k|    if (use_ref) {
  ------------------
  |  Branch (347:9): [True: 206, False: 22.4k]
  ------------------
  348|    630|        for (int i = 0; i < 7; i++) {
  ------------------
  |  Branch (348:25): [True: 606, False: 24]
  ------------------
  349|    606|            if (dav1d_get_bit(gb)) {
  ------------------
  |  Branch (349:17): [True: 182, False: 424]
  ------------------
  350|    182|                const Dav1dThreadPicture *const ref =
  351|    182|                    &c->refs[c->frame_hdr->refidx[i]].p;
  352|    182|                if (!ref->p.frame_hdr) return -1;
  ------------------
  |  Branch (352:21): [True: 5, False: 177]
  ------------------
  353|    177|                hdr->width[1] = ref->p.frame_hdr->width[1];
  354|    177|                hdr->height = ref->p.frame_hdr->height;
  355|    177|                hdr->render_width = ref->p.frame_hdr->render_width;
  356|    177|                hdr->render_height = ref->p.frame_hdr->render_height;
  357|    177|                hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bit(gb);
  ------------------
  |  Branch (357:42): [True: 22, False: 155]
  |  Branch (357:63): [True: 14, False: 8]
  ------------------
  358|    177|                if (hdr->super_res.enabled) {
  ------------------
  |  Branch (358:21): [True: 14, False: 163]
  ------------------
  359|     14|                    const int d = hdr->super_res.width_scale_denominator =
  360|     14|                        9 + dav1d_get_bits(gb, 3);
  361|     14|                    hdr->width[0] = imax((hdr->width[1] * 8 + (d >> 1)) / d,
  362|     14|                                         imin(16, hdr->width[1]));
  363|    163|                } else {
  364|    163|                    hdr->super_res.width_scale_denominator = 8;
  365|    163|                    hdr->width[0] = hdr->width[1];
  366|    163|                }
  367|    177|                return 0;
  368|    182|            }
  369|    606|        }
  370|    206|    }
  371|       |
  372|  22.5k|    if (hdr->frame_size_override) {
  ------------------
  |  Branch (372:9): [True: 1.75k, False: 20.7k]
  ------------------
  373|  1.75k|        hdr->width[1] = dav1d_get_bits(gb, seqhdr->width_n_bits) + 1;
  374|  1.75k|        hdr->height = dav1d_get_bits(gb, seqhdr->height_n_bits) + 1;
  375|  20.7k|    } else {
  376|  20.7k|        hdr->width[1] = seqhdr->max_width;
  377|  20.7k|        hdr->height = seqhdr->max_height;
  378|  20.7k|    }
  379|  22.5k|    hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bit(gb);
  ------------------
  |  Branch (379:30): [True: 5.33k, False: 17.1k]
  |  Branch (379:51): [True: 1.91k, False: 3.41k]
  ------------------
  380|  22.5k|    if (hdr->super_res.enabled) {
  ------------------
  |  Branch (380:9): [True: 1.91k, False: 20.5k]
  ------------------
  381|  1.91k|        const int d = hdr->super_res.width_scale_denominator = 9 + dav1d_get_bits(gb, 3);
  382|  1.91k|        hdr->width[0] = imax((hdr->width[1] * 8 + (d >> 1)) / d, imin(16, hdr->width[1]));
  383|  20.5k|    } else {
  384|  20.5k|        hdr->super_res.width_scale_denominator = 8;
  385|  20.5k|        hdr->width[0] = hdr->width[1];
  386|  20.5k|    }
  387|  22.5k|    hdr->have_render_size = dav1d_get_bit(gb);
  388|  22.5k|    if (hdr->have_render_size) {
  ------------------
  |  Branch (388:9): [True: 4.26k, False: 18.2k]
  ------------------
  389|  4.26k|        hdr->render_width = dav1d_get_bits(gb, 16) + 1;
  390|  4.26k|        hdr->render_height = dav1d_get_bits(gb, 16) + 1;
  391|  18.2k|    } else {
  392|  18.2k|        hdr->render_width = hdr->width[1];
  393|  18.2k|        hdr->render_height = hdr->height;
  394|  18.2k|    }
  395|  22.5k|    return 0;
  396|  22.6k|}
obu.c:tile_log2:
  398|   103k|static inline int tile_log2(const int sz, const int tgt) {
  399|   103k|    int k;
  400|   167k|    for (k = 0; (sz << k) < tgt; k++) ;
  ------------------
  |  Branch (400:17): [True: 64.0k, False: 103k]
  ------------------
  401|   103k|    return k;
  402|   103k|}
obu.c:check_trailing_bits:
   50|  18.4k|{
   51|  18.4k|    const int trailing_one_bit = dav1d_get_bit(gb);
   52|       |
   53|  18.4k|    if (gb->error)
  ------------------
  |  Branch (53:9): [True: 625, False: 17.8k]
  ------------------
   54|    625|        return DAV1D_ERR(EINVAL);
  ------------------
  |  |   56|    625|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
   55|       |
   56|  17.8k|    if (!strict_std_compliance)
  ------------------
  |  Branch (56:9): [True: 17.8k, False: 0]
  ------------------
   57|  17.8k|        return 0;
   58|       |
   59|      0|    if (!trailing_one_bit || gb->state)
  ------------------
  |  Branch (59:9): [True: 0, False: 0]
  |  Branch (59:30): [True: 0, False: 0]
  ------------------
   60|      0|        return DAV1D_ERR(EINVAL);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
   61|       |
   62|      0|    ptrdiff_t size = gb->ptr_end - gb->ptr;
   63|      0|    while (size > 0 && gb->ptr[size - 1] == 0)
  ------------------
  |  Branch (63:12): [True: 0, False: 0]
  |  Branch (63:24): [True: 0, False: 0]
  ------------------
   64|      0|        size--;
   65|       |
   66|      0|    if (size)
  ------------------
  |  Branch (66:9): [True: 0, False: 0]
  ------------------
   67|      0|        return DAV1D_ERR(EINVAL);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
   68|       |
   69|      0|    return 0;
   70|      0|}
obu.c:parse_tile_hdr:
 1154|  21.8k|static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) {
 1155|  21.8k|    const int n_tiles = c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows;
 1156|  21.8k|    const int have_tile_pos = n_tiles > 1 ? dav1d_get_bit(gb) : 0;
  ------------------
  |  Branch (1156:31): [True: 4.78k, False: 17.0k]
  ------------------
 1157|       |
 1158|  21.8k|    if (have_tile_pos) {
  ------------------
  |  Branch (1158:9): [True: 772, False: 21.0k]
  ------------------
 1159|    772|        const int n_bits = c->frame_hdr->tiling.log2_cols +
 1160|    772|                           c->frame_hdr->tiling.log2_rows;
 1161|    772|        c->tile[c->n_tile_data].start = dav1d_get_bits(gb, n_bits);
 1162|    772|        c->tile[c->n_tile_data].end = dav1d_get_bits(gb, n_bits);
 1163|  21.0k|    } else {
 1164|  21.0k|        c->tile[c->n_tile_data].start = 0;
 1165|  21.0k|        c->tile[c->n_tile_data].end = n_tiles - 1;
 1166|  21.0k|    }
 1167|  21.8k|}

dav1d_pal_dsp_init:
   71|  17.2k|COLD void dav1d_pal_dsp_init(Dav1dPalDSPContext *const c) {
   72|  17.2k|    c->pal_idx_finish = pal_idx_finish_c;
   73|       |
   74|  17.2k|#if HAVE_ASM
   75|       |#if ARCH_RISCV
   76|       |    pal_dsp_init_riscv(c);
   77|       |#elif ARCH_X86
   78|       |    pal_dsp_init_x86(c);
   79|  17.2k|#endif
   80|  17.2k|#endif
   81|  17.2k|}

dav1d_default_picture_alloc:
   46|  22.9k|int dav1d_default_picture_alloc(Dav1dPicture *const p, void *const cookie) {
   47|  22.9k|    assert(sizeof(Dav1dMemPoolBuffer) <= DAV1D_PICTURE_ALIGNMENT);
  ------------------
  |  |  140|  22.9k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [Folded, False: 22.9k]
  |  |  |  Branch (140:68): [Folded, False: 22.9k]
  |  |  ------------------
  ------------------
   48|  22.9k|    const int hbd = p->p.bpc > 8;
   49|  22.9k|    const int aligned_w = (p->p.w + 127) & ~127;
   50|  22.9k|    const int aligned_h = (p->p.h + 127) & ~127;
   51|  22.9k|    const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
   52|  22.9k|    const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
   53|  22.9k|    const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
   54|  22.9k|    ptrdiff_t y_stride = aligned_w << hbd;
   55|  22.9k|    ptrdiff_t uv_stride = has_chroma ? y_stride >> ss_hor : 0;
  ------------------
  |  Branch (55:27): [True: 18.2k, False: 4.78k]
  ------------------
   56|       |    /* Due to how mapping of addresses to sets works in most L1 and L2 cache
   57|       |     * implementations, strides of multiples of certain power-of-two numbers
   58|       |     * may cause multiple rows of the same superblock to map to the same set,
   59|       |     * causing evictions of previous rows resulting in a reduction in cache
   60|       |     * hit rate. Avoid that by slightly padding the stride when necessary. */
   61|  22.9k|    if (!(y_stride & 1023))
  ------------------
  |  Branch (61:9): [True: 592, False: 22.3k]
  ------------------
   62|    592|        y_stride += DAV1D_PICTURE_ALIGNMENT;
  ------------------
  |  |   44|    592|#define DAV1D_PICTURE_ALIGNMENT 64
  ------------------
   63|  22.9k|    if (!(uv_stride & 1023) && has_chroma)
  ------------------
  |  Branch (63:9): [True: 4.98k, False: 18.0k]
  |  Branch (63:32): [True: 198, False: 4.78k]
  ------------------
   64|    198|        uv_stride += DAV1D_PICTURE_ALIGNMENT;
  ------------------
  |  |   44|    198|#define DAV1D_PICTURE_ALIGNMENT 64
  ------------------
   65|  22.9k|    p->stride[0] = y_stride;
   66|  22.9k|    p->stride[1] = uv_stride;
   67|  22.9k|    const size_t y_sz = y_stride * aligned_h;
   68|  22.9k|    const size_t uv_sz = uv_stride * (aligned_h >> ss_ver);
   69|  22.9k|    const size_t pic_size = y_sz + 2 * uv_sz;
   70|       |
   71|  22.9k|    Dav1dMemPoolBuffer *const buf = dav1d_mem_pool_pop(cookie, pic_size +
   72|  22.9k|                                                       DAV1D_PICTURE_ALIGNMENT -
  ------------------
  |  |   44|  22.9k|#define DAV1D_PICTURE_ALIGNMENT 64
  ------------------
   73|  22.9k|                                                       sizeof(Dav1dMemPoolBuffer));
   74|  22.9k|    if (!buf) return DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  |  Branch (74:9): [True: 0, False: 22.9k]
  ------------------
   75|  22.9k|    p->allocator_data = buf;
   76|       |
   77|  22.9k|    uint8_t *const data = buf->data;
   78|  22.9k|    p->data[0] = data;
   79|  22.9k|    p->data[1] = has_chroma ? data + y_sz : NULL;
  ------------------
  |  Branch (79:18): [True: 18.2k, False: 4.78k]
  ------------------
   80|  22.9k|    p->data[2] = has_chroma ? data + y_sz + uv_sz : NULL;
  ------------------
  |  Branch (80:18): [True: 18.2k, False: 4.78k]
  ------------------
   81|       |
   82|  22.9k|    return 0;
   83|  22.9k|}
dav1d_default_picture_release:
   85|  22.9k|void dav1d_default_picture_release(Dav1dPicture *const p, void *const cookie) {
   86|  22.9k|    dav1d_mem_pool_push(cookie, p->allocator_data);
   87|  22.9k|}
dav1d_picture_free_itut_t35:
  105|      5|void dav1d_picture_free_itut_t35(const uint8_t *const data, void *const user_data) {
  106|      5|    struct itut_t35_ctx_context *itut_t35_ctx = user_data;
  107|       |
  108|     10|    for (size_t i = 0; i < itut_t35_ctx->n_itut_t35; i++)
  ------------------
  |  Branch (108:24): [True: 5, False: 5]
  ------------------
  109|      5|        dav1d_free(itut_t35_ctx->itut_t35[i].payload);
  ------------------
  |  |  135|      5|#define dav1d_free(ptr) free(ptr)
  ------------------
  110|      5|    dav1d_free(itut_t35_ctx->itut_t35);
  ------------------
  |  |  135|      5|#define dav1d_free(ptr) free(ptr)
  ------------------
  111|      5|    dav1d_free(itut_t35_ctx);
  ------------------
  |  |  135|      5|#define dav1d_free(ptr) free(ptr)
  ------------------
  112|      5|}
dav1d_picture_copy_props:
  172|  19.8k|{
  173|  19.8k|    dav1d_data_props_copy(&p->m, props);
  174|       |
  175|  19.8k|    dav1d_ref_dec(&p->content_light_ref);
  176|  19.8k|    p->content_light_ref = content_light_ref;
  177|  19.8k|    p->content_light = content_light;
  178|  19.8k|    if (content_light_ref) dav1d_ref_inc(content_light_ref);
  ------------------
  |  Branch (178:9): [True: 4, False: 19.8k]
  ------------------
  179|       |
  180|  19.8k|    dav1d_ref_dec(&p->mastering_display_ref);
  181|  19.8k|    p->mastering_display_ref = mastering_display_ref;
  182|  19.8k|    p->mastering_display = mastering_display;
  183|  19.8k|    if (mastering_display_ref) dav1d_ref_inc(mastering_display_ref);
  ------------------
  |  Branch (183:9): [True: 8, False: 19.8k]
  ------------------
  184|       |
  185|  19.8k|    dav1d_ref_dec(&p->itut_t35_ref);
  186|  19.8k|    p->itut_t35_ref = itut_t35_ref;
  187|  19.8k|    p->itut_t35 = itut_t35;
  188|  19.8k|    p->n_itut_t35 = n_itut_t35;
  189|  19.8k|    if (itut_t35_ref) dav1d_ref_inc(itut_t35_ref);
  ------------------
  |  Branch (189:9): [True: 0, False: 19.8k]
  ------------------
  190|  19.8k|}
dav1d_thread_picture_alloc:
  194|  21.4k|{
  195|  21.4k|    Dav1dThreadPicture *const p = &f->sr_cur;
  196|       |
  197|  21.4k|    const int res = picture_alloc(c, &p->p, f->frame_hdr->width[1], f->frame_hdr->height,
  198|  21.4k|                                  f->seq_hdr, f->seq_hdr_ref,
  199|  21.4k|                                  f->frame_hdr, f->frame_hdr_ref,
  200|  21.4k|                                  bpc, &f->tile[0].data.m, &c->allocator,
  201|  21.4k|                                  (void **) &p->progress);
  202|  21.4k|    if (res) return res;
  ------------------
  |  Branch (202:9): [True: 0, False: 21.4k]
  ------------------
  203|       |
  204|       |    // Don't clear these flags from c->frame_flags if the frame is not going to be output.
  205|       |    // This way they will be added to the next visible frame too.
  206|  21.4k|    const int flags_mask = ((f->frame_hdr->show_frame || c->output_invisible_frames) &&
  ------------------
  |  Branch (206:30): [True: 18.1k, False: 3.38k]
  |  Branch (206:58): [True: 0, False: 3.38k]
  ------------------
  207|  18.1k|                            c->max_spatial_id == f->frame_hdr->spatial_id)
  ------------------
  |  Branch (207:29): [True: 13.0k, False: 5.04k]
  ------------------
  208|  21.4k|                           ? 0 : (PICTURE_FLAG_NEW_SEQUENCE | PICTURE_FLAG_NEW_OP_PARAMS_INFO);
  209|  21.4k|    p->flags = c->frame_flags;
  210|  21.4k|    c->frame_flags &= flags_mask;
  211|       |
  212|  21.4k|    p->visible = f->frame_hdr->show_frame;
  213|  21.4k|    p->showable = f->frame_hdr->showable_frame;
  214|       |
  215|  21.4k|    if (p->visible) {
  ------------------
  |  Branch (215:9): [True: 18.1k, False: 3.38k]
  ------------------
  216|       |        // Only add HDR10+ and T35 metadata when show frame flag is enabled
  217|  18.1k|        dav1d_picture_copy_props(&p->p, c->content_light, c->content_light_ref,
  218|  18.1k|                                 c->mastering_display, c->mastering_display_ref,
  219|  18.1k|                                 c->itut_t35, c->itut_t35_ref, c->n_itut_t35,
  220|  18.1k|                                 &f->tile[0].data.m);
  221|       |
  222|       |        // Must be removed from the context after being attached to the frame
  223|  18.1k|        dav1d_ref_dec(&c->itut_t35_ref);
  224|  18.1k|        c->itut_t35 = NULL;
  225|  18.1k|        c->n_itut_t35 = 0;
  226|  18.1k|    } else {
  227|  3.38k|        dav1d_data_props_copy(&p->p.m, &f->tile[0].data.m);
  228|  3.38k|    }
  229|       |
  230|  21.4k|    if (c->n_fc > 1) {
  ------------------
  |  Branch (230:9): [True: 0, False: 21.4k]
  ------------------
  231|      0|        atomic_init(&p->progress[0], 0);
  232|       |        atomic_init(&p->progress[1], 0);
  233|      0|    }
  234|  21.4k|    return res;
  235|  21.4k|}
dav1d_picture_alloc_copy:
  239|  1.49k|{
  240|  1.49k|    Dav1dMemPoolBuffer *const buf = (Dav1dMemPoolBuffer *)src->ref->const_data;
  241|  1.49k|    struct pic_ctx_context *const pic_ctx = buf->data;
  242|  1.49k|    const int res = picture_alloc(c, dst, w, src->p.h,
  243|  1.49k|                                  src->seq_hdr, src->seq_hdr_ref,
  244|  1.49k|                                  src->frame_hdr, src->frame_hdr_ref,
  245|  1.49k|                                  src->p.bpc, &src->m, &pic_ctx->allocator,
  246|  1.49k|                                  NULL);
  247|  1.49k|    if (res) return res;
  ------------------
  |  Branch (247:9): [True: 0, False: 1.49k]
  ------------------
  248|       |
  249|  1.49k|    dav1d_picture_copy_props(dst, src->content_light, src->content_light_ref,
  250|  1.49k|                             src->mastering_display, src->mastering_display_ref,
  251|  1.49k|                             src->itut_t35, src->itut_t35_ref, src->n_itut_t35,
  252|  1.49k|                             &src->m);
  253|       |
  254|  1.49k|    return 0;
  255|  1.49k|}
dav1d_picture_ref:
  257|   217k|void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
  258|   217k|    assert(dst != NULL);
  ------------------
  |  |  140|   217k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 217k]
  |  |  |  Branch (140:68): [Folded, False: 217k]
  |  |  ------------------
  ------------------
  259|   217k|    assert(dst->data[0] == NULL);
  ------------------
  |  |  140|   217k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 217k]
  |  |  |  Branch (140:68): [Folded, False: 217k]
  |  |  ------------------
  ------------------
  260|   217k|    assert(src != NULL);
  ------------------
  |  |  140|   217k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 217k]
  |  |  |  Branch (140:68): [Folded, False: 217k]
  |  |  ------------------
  ------------------
  261|       |
  262|   217k|    if (src->ref) {
  ------------------
  |  Branch (262:9): [True: 217k, False: 0]
  ------------------
  263|   217k|        assert(src->data[0] != NULL);
  ------------------
  |  |  140|   217k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 217k]
  |  |  |  Branch (140:68): [Folded, False: 217k]
  |  |  ------------------
  ------------------
  264|   217k|        dav1d_ref_inc(src->ref);
  265|   217k|    }
  266|   217k|    if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref);
  ------------------
  |  Branch (266:9): [True: 217k, False: 0]
  ------------------
  267|   217k|    if (src->seq_hdr_ref) dav1d_ref_inc(src->seq_hdr_ref);
  ------------------
  |  Branch (267:9): [True: 217k, False: 0]
  ------------------
  268|   217k|    if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
  ------------------
  |  Branch (268:9): [True: 0, False: 217k]
  ------------------
  269|   217k|    if (src->content_light_ref) dav1d_ref_inc(src->content_light_ref);
  ------------------
  |  Branch (269:9): [True: 40, False: 217k]
  ------------------
  270|   217k|    if (src->mastering_display_ref) dav1d_ref_inc(src->mastering_display_ref);
  ------------------
  |  Branch (270:9): [True: 38, False: 217k]
  ------------------
  271|   217k|    if (src->itut_t35_ref) dav1d_ref_inc(src->itut_t35_ref);
  ------------------
  |  Branch (271:9): [True: 0, False: 217k]
  ------------------
  272|   217k|    *dst = *src;
  273|   217k|}
dav1d_picture_move_ref:
  275|  10.0k|void dav1d_picture_move_ref(Dav1dPicture *const dst, Dav1dPicture *const src) {
  276|  10.0k|    assert(dst != NULL);
  ------------------
  |  |  140|  10.0k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 10.0k]
  |  |  |  Branch (140:68): [Folded, False: 10.0k]
  |  |  ------------------
  ------------------
  277|  10.0k|    assert(dst->data[0] == NULL);
  ------------------
  |  |  140|  10.0k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 10.0k]
  |  |  |  Branch (140:68): [Folded, False: 10.0k]
  |  |  ------------------
  ------------------
  278|  10.0k|    assert(src != NULL);
  ------------------
  |  |  140|  10.0k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 10.0k]
  |  |  |  Branch (140:68): [Folded, False: 10.0k]
  |  |  ------------------
  ------------------
  279|       |
  280|  10.0k|    if (src->ref)
  ------------------
  |  Branch (280:9): [True: 10.0k, False: 0]
  ------------------
  281|  10.0k|        assert(src->data[0] != NULL);
  ------------------
  |  |  140|  10.0k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 10.0k]
  |  |  |  Branch (140:68): [Folded, False: 10.0k]
  |  |  ------------------
  ------------------
  282|       |
  283|  10.0k|    *dst = *src;
  284|  10.0k|    memset(src, 0, sizeof(*src));
  285|  10.0k|}
dav1d_thread_picture_ref:
  289|   197k|{
  290|   197k|    dav1d_picture_ref(&dst->p, &src->p);
  291|   197k|    dst->visible = src->visible;
  292|   197k|    dst->showable = src->showable;
  293|   197k|    dst->progress = src->progress;
  294|   197k|    dst->flags = src->flags;
  295|   197k|}
dav1d_thread_picture_move_ref:
  299|  1.15k|{
  300|  1.15k|    dav1d_picture_move_ref(&dst->p, &src->p);
  301|  1.15k|    dst->visible = src->visible;
  302|  1.15k|    dst->showable = src->showable;
  303|  1.15k|    dst->progress = src->progress;
  304|  1.15k|    dst->flags = src->flags;
  305|  1.15k|    memset(src, 0, sizeof(*src));
  306|  1.15k|}
dav1d_picture_unref_internal:
  308|   326k|void dav1d_picture_unref_internal(Dav1dPicture *const p) {
  309|   326k|    validate_input(p != NULL);
  ------------------
  |  |   59|   326k|#define validate_input(x) validate_input_or_ret(x, )
  |  |  ------------------
  |  |  |  |   52|   326k|    if (!(x)) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (52:9): [True: 0, False: 326k]
  |  |  |  |  ------------------
  |  |  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  |  |  ------------------
  |  |  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   54|      0|                    #x, __func__); \
  |  |  |  |   55|      0|        debug_abort(); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   56|      0|        return r; \
  |  |  |  |   57|      0|    }
  |  |  ------------------
  ------------------
  310|       |
  311|   326k|    if (p->ref) {
  ------------------
  |  Branch (311:9): [True: 240k, False: 85.9k]
  ------------------
  312|   240k|        validate_input(p->data[0] != NULL);
  ------------------
  |  |   59|   240k|#define validate_input(x) validate_input_or_ret(x, )
  |  |  ------------------
  |  |  |  |   52|   240k|    if (!(x)) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (52:9): [True: 0, False: 240k]
  |  |  |  |  ------------------
  |  |  |  |   53|      0|        debug_print("Input validation check \'%s\' failed in %s!\n", \
  |  |  |  |  ------------------
  |  |  |  |  |  |   35|      0|#define debug_print(...) do {} while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (35:39): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   54|      0|                    #x, __func__); \
  |  |  |  |   55|      0|        debug_abort(); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|      0|#define debug_abort() do {} while (0)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  Branch (36:36): [Folded, False: 0]
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   56|      0|        return r; \
  |  |  |  |   57|      0|    }
  |  |  ------------------
  ------------------
  313|   240k|        dav1d_ref_dec(&p->ref);
  314|   240k|    }
  315|   326k|    dav1d_ref_dec(&p->seq_hdr_ref);
  316|   326k|    dav1d_ref_dec(&p->frame_hdr_ref);
  317|   326k|    dav1d_ref_dec(&p->m.user_data.ref);
  318|   326k|    dav1d_ref_dec(&p->content_light_ref);
  319|   326k|    dav1d_ref_dec(&p->mastering_display_ref);
  320|   326k|    dav1d_ref_dec(&p->itut_t35_ref);
  321|   326k|    memset(p, 0, sizeof(*p));
  322|   326k|    dav1d_data_props_set_defaults(&p->m);
  323|   326k|}
dav1d_thread_picture_unref:
  325|   260k|void dav1d_thread_picture_unref(Dav1dThreadPicture *const p) {
  326|   260k|    dav1d_picture_unref_internal(&p->p);
  327|       |
  328|       |    p->progress = NULL;
  329|   260k|}
dav1d_picture_get_event_flags:
  331|  18.3k|enum Dav1dEventFlags dav1d_picture_get_event_flags(const Dav1dThreadPicture *const p) {
  332|  18.3k|    if (!p->flags)
  ------------------
  |  Branch (332:9): [True: 744, False: 17.6k]
  ------------------
  333|    744|        return 0;
  334|       |
  335|  17.6k|    enum Dav1dEventFlags flags = 0;
  336|  17.6k|    if (p->flags & PICTURE_FLAG_NEW_SEQUENCE)
  ------------------
  |  Branch (336:9): [True: 16.4k, False: 1.15k]
  ------------------
  337|  16.4k|       flags |= DAV1D_EVENT_FLAG_NEW_SEQUENCE;
  338|  17.6k|    if (p->flags & PICTURE_FLAG_NEW_OP_PARAMS_INFO)
  ------------------
  |  Branch (338:9): [True: 0, False: 17.6k]
  ------------------
  339|      0|       flags |= DAV1D_EVENT_FLAG_NEW_OP_PARAMS_INFO;
  340|       |
  341|  17.6k|    return flags;
  342|  18.3k|}
picture.c:picture_alloc:
  123|  22.9k|{
  124|  22.9k|    if (p->data[0]) {
  ------------------
  |  Branch (124:9): [True: 0, False: 22.9k]
  ------------------
  125|      0|        dav1d_log(c, "Picture already allocated!\n");
  ------------------
  |  |   39|      0|#define dav1d_log dav1d_log
  ------------------
  126|      0|        return -1;
  127|      0|    }
  128|  22.9k|    assert(bpc > 0 && bpc <= 16);
  ------------------
  |  |  140|  45.9k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 22.9k, False: 0]
  |  |  |  Branch (140:30): [True: 22.9k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 22.9k]
  |  |  ------------------
  ------------------
  129|       |
  130|  22.9k|    size_t extra = c->n_fc > 1 ? sizeof(atomic_int) * 2 : 0;
  ------------------
  |  Branch (130:20): [True: 0, False: 22.9k]
  ------------------
  131|  22.9k|    Dav1dMemPoolBuffer *buf = dav1d_mem_pool_pop(c->pic_ctx_pool,
  132|  22.9k|                                                 extra + sizeof(struct pic_ctx_context));
  133|  22.9k|    if (buf == NULL)
  ------------------
  |  Branch (133:9): [True: 0, False: 22.9k]
  ------------------
  134|      0|        return DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  135|       |
  136|  22.9k|    struct pic_ctx_context *pic_ctx = buf->data;
  137|       |
  138|  22.9k|    p->p.w = w;
  139|  22.9k|    p->p.h = h;
  140|  22.9k|    p->seq_hdr = seq_hdr;
  141|  22.9k|    p->frame_hdr = frame_hdr;
  142|  22.9k|    p->p.layout = seq_hdr->layout;
  143|  22.9k|    p->p.bpc = bpc;
  144|  22.9k|    dav1d_data_props_set_defaults(&p->m);
  145|  22.9k|    const int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie);
  146|  22.9k|    if (res < 0) {
  ------------------
  |  Branch (146:9): [True: 0, False: 22.9k]
  ------------------
  147|      0|        dav1d_mem_pool_push(c->pic_ctx_pool, buf);
  148|      0|        return res;
  149|      0|    }
  150|       |
  151|  22.9k|    pic_ctx->allocator = *p_allocator;
  152|  22.9k|    pic_ctx->pic = *p;
  153|  22.9k|    p->ref = dav1d_ref_init(&pic_ctx->ref, buf, free_buffer, c->pic_ctx_pool, 0);
  154|       |
  155|  22.9k|    p->seq_hdr_ref = seq_hdr_ref;
  156|  22.9k|    if (seq_hdr_ref) dav1d_ref_inc(seq_hdr_ref);
  ------------------
  |  Branch (156:9): [True: 22.9k, False: 0]
  ------------------
  157|       |
  158|  22.9k|    p->frame_hdr_ref = frame_hdr_ref;
  159|  22.9k|    if (frame_hdr_ref) dav1d_ref_inc(frame_hdr_ref);
  ------------------
  |  Branch (159:9): [True: 22.9k, False: 0]
  ------------------
  160|       |
  161|  22.9k|    if (extra && extra_ptr)
  ------------------
  |  Branch (161:9): [True: 0, False: 22.9k]
  |  Branch (161:18): [True: 0, False: 0]
  ------------------
  162|      0|        *extra_ptr = &pic_ctx->extra_data;
  163|       |
  164|  22.9k|    return 0;
  165|  22.9k|}
picture.c:free_buffer:
   96|  22.9k|static void free_buffer(const uint8_t *const data, void *const user_data) {
   97|  22.9k|    Dav1dMemPoolBuffer *buf = (Dav1dMemPoolBuffer *)data;
   98|  22.9k|    struct pic_ctx_context *pic_ctx = buf->data;
   99|       |
  100|  22.9k|    pic_ctx->allocator.release_picture_callback(&pic_ctx->pic,
  101|  22.9k|                                                pic_ctx->allocator.cookie);
  102|  22.9k|    dav1d_mem_pool_push(user_data, buf);
  103|  22.9k|}

dav1d_init_qm_tables:
 1648|      1|COLD void dav1d_init_qm_tables(void) {
 1649|       |    // This function is guaranteed to be called only once
 1650|       |
 1651|     16|    for (int i = 0; i < 15; i++)
  ------------------
  |  Branch (1651:21): [True: 15, False: 1]
  ------------------
 1652|     45|        for (int j = 0; j < 2; j++) {
  ------------------
  |  Branch (1652:25): [True: 30, False: 15]
  ------------------
 1653|       |            // note that the w/h in the assignment is inverted, this is on purpose
 1654|       |            // because we store coefficients transposed
 1655|     30|            dav1d_qm_tbl[i][j][RTX_4X8  ] = qm_tbl_8x4[i][j];
 1656|     30|            dav1d_qm_tbl[i][j][RTX_8X4  ] = qm_tbl_4x8[i][j];
 1657|     30|            dav1d_qm_tbl[i][j][RTX_4X16 ] = qm_tbl_16x4[i][j];
 1658|     30|            dav1d_qm_tbl[i][j][RTX_16X4 ] = qm_tbl_4x16[i][j];
 1659|     30|            dav1d_qm_tbl[i][j][RTX_8X16 ] = qm_tbl_16x8[i][j];
 1660|     30|            dav1d_qm_tbl[i][j][RTX_16X8 ] = qm_tbl_8x16[i][j];
 1661|     30|            dav1d_qm_tbl[i][j][RTX_8X32 ] = qm_tbl_32x8[i][j];
 1662|     30|            dav1d_qm_tbl[i][j][RTX_32X8 ] = qm_tbl_8x32[i][j];
 1663|     30|            dav1d_qm_tbl[i][j][RTX_16X32] = qm_tbl_32x16[i][j];
 1664|     30|            dav1d_qm_tbl[i][j][RTX_32X16] = qm_tbl_16x32[i][j];
 1665|       |
 1666|     30|            dav1d_qm_tbl[i][j][ TX_4X4  ] = qm_tbl_4x4[i][j];
 1667|     30|            dav1d_qm_tbl[i][j][ TX_8X8  ] = qm_tbl_8x8[i][j];
 1668|     30|            dav1d_qm_tbl[i][j][ TX_16X16] = qm_tbl_16x16[i][j];
 1669|     30|            dav1d_qm_tbl[i][j][ TX_32X32] = qm_tbl_32x32[i][j];
 1670|       |
 1671|     30|            untriangle(qm_tbl_32x32[i][j], qm_tbl_32x32_t[i][j], 32);
 1672|     30|            subsample(qm_tbl_4x4[i][j],   &qm_tbl_32x32[i][j][32*3+3], 32, 8, 8);
 1673|     30|            subsample(qm_tbl_8x4[i][j],   &qm_tbl_32x16[i][j][32*1+1], 16, 4, 4);
 1674|     30|            subsample(qm_tbl_8x8[i][j],   &qm_tbl_32x32[i][j][32*1+1], 32, 4, 4);
 1675|     30|            subsample(qm_tbl_16x4[i][j],  &qm_tbl_32x16[i][j][32*1+0], 16, 2, 4);
 1676|     30|            subsample(qm_tbl_16x8[i][j],  &qm_tbl_32x16[i][j][32*0+0], 16, 2, 2);
 1677|     30|            subsample(qm_tbl_16x16[i][j], &qm_tbl_32x32[i][j][32*0+0], 32, 2, 2);
 1678|     30|            subsample(qm_tbl_32x8[i][j],  &qm_tbl_32x16[i][j][32*0+0], 16, 1, 2);
 1679|     30|            transpose(qm_tbl_4x8[i][j], qm_tbl_8x4[i][j], 8, 4);
 1680|     30|            transpose(qm_tbl_4x16[i][j], qm_tbl_16x4[i][j], 16, 4);
 1681|     30|            transpose(qm_tbl_8x16[i][j], qm_tbl_16x8[i][j], 16, 8);
 1682|     30|            transpose(qm_tbl_8x32[i][j], qm_tbl_32x8[i][j], 32, 8);
 1683|     30|            transpose(qm_tbl_16x32[i][j], qm_tbl_32x16[i][j], 32, 16);
 1684|       |
 1685|     30|            dav1d_qm_tbl[i][j][ TX_64X64] = dav1d_qm_tbl[i][j][ TX_32X32];
 1686|     30|            dav1d_qm_tbl[i][j][RTX_64X32] = dav1d_qm_tbl[i][j][ TX_32X32];
 1687|     30|            dav1d_qm_tbl[i][j][RTX_64X16] = dav1d_qm_tbl[i][j][RTX_32X16];
 1688|     30|            dav1d_qm_tbl[i][j][RTX_32X64] = dav1d_qm_tbl[i][j][ TX_32X32];
 1689|     30|            dav1d_qm_tbl[i][j][RTX_16X64] = dav1d_qm_tbl[i][j][RTX_16X32];
 1690|     30|        }
 1691|       |
 1692|       |    // dav1d_qm_tbl[15][*][*] == NULL
 1693|      1|}
qm.c:untriangle:
 1635|     30|static void untriangle(uint8_t *dst, const uint8_t *src, const int sz) {
 1636|    990|    for (int y = 0; y < sz; y++) {
  ------------------
  |  Branch (1636:21): [True: 960, False: 30]
  ------------------
 1637|    960|        memcpy(dst, src, y + 1);
 1638|    960|        const uint8_t *src_ptr = &src[y];
 1639|  15.8k|        for (int x = y + 1; x < sz; x++) {
  ------------------
  |  Branch (1639:29): [True: 14.8k, False: 960]
  ------------------
 1640|  14.8k|            src_ptr += x;
 1641|  14.8k|            dst[x] = *src_ptr;
 1642|  14.8k|        }
 1643|    960|        dst += sz;
 1644|    960|        src += y + 1;
 1645|    960|    }
 1646|     30|}
qm.c:subsample:
 1621|    210|{
 1622|  1.77k|    for (int y = 0; y < h; y += vstep)
  ------------------
  |  Branch (1622:21): [True: 1.56k, False: 210]
  ------------------
 1623|  26.0k|        for (int x = 0; x < 32; x += hstep)
  ------------------
  |  Branch (1623:25): [True: 24.4k, False: 1.56k]
  ------------------
 1624|  24.4k|            *dst++ = src[y * 32 + x];
 1625|    210|}
qm.c:transpose:
 1629|    150|{
 1630|  1.35k|    for (int y = 0, y_off = 0; y < h; y++, y_off += w)
  ------------------
  |  Branch (1630:32): [True: 1.20k, False: 150]
  ------------------
 1631|  30.9k|        for (int x = 0, x_off = 0; x < w; x++, x_off += h)
  ------------------
  |  Branch (1631:36): [True: 29.7k, False: 1.20k]
  ------------------
 1632|  29.7k|            dst[x_off + y] = src[y_off + x];
 1633|    150|}

dav1d_recon_b_intra_8bpc:
 1179|   888k|{
 1180|   888k|    Dav1dTileState *const ts = t->ts;
 1181|   888k|    const Dav1dFrameContext *const f = t->f;
 1182|   888k|    const Dav1dDSPContext *const dsp = f->dsp;
 1183|   888k|    const int bx4 = t->bx & 31, by4 = t->by & 31;
 1184|   888k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 1185|   888k|    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
 1186|   888k|    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
 1187|   888k|    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
 1188|   888k|    const int bw4 = b_dim[0], bh4 = b_dim[1];
 1189|   888k|    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
 1190|   888k|    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
 1191|   888k|    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
  ------------------
  |  Branch (1191:28): [True: 791k, False: 97.0k]
  ------------------
 1192|   791k|                           (bw4 > ss_hor || t->bx & 1) &&
  ------------------
  |  Branch (1192:29): [True: 763k, False: 27.5k]
  |  Branch (1192:45): [True: 13.7k, False: 13.8k]
  ------------------
 1193|   777k|                           (bh4 > ss_ver || t->by & 1);
  ------------------
  |  Branch (1193:29): [True: 759k, False: 17.9k]
  |  Branch (1193:45): [True: 8.96k, False: 8.95k]
  ------------------
 1194|   888k|    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
 1195|   888k|    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
 1196|       |
 1197|       |    // coefficient coding
 1198|   888k|    pixel *const edge = bitfn(t->scratch.edge) + 128;
  ------------------
  |  |   51|   888k|#define bitfn(x) x##_8bpc
  ------------------
 1199|   888k|    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
 1200|       |
 1201|   888k|    const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10;
 1202|       |
 1203|  1.78M|    for (int init_y = 0; init_y < h4; init_y += 16) {
  ------------------
  |  Branch (1203:26): [True: 891k, False: 896k]
  ------------------
 1204|   891k|        const int sub_h4 = imin(h4, 16 + init_y);
 1205|   891k|        const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
 1206|  1.79M|        for (int init_x = 0; init_x < w4; init_x += 16) {
  ------------------
  |  Branch (1206:30): [True: 897k, False: 900k]
  ------------------
 1207|   897k|            if (b->pal_sz[0]) {
  ------------------
  |  Branch (1207:17): [True: 10.9k, False: 886k]
  ------------------
 1208|  10.9k|                pixel *dst = ((pixel *) f->cur.data[0]) +
 1209|  10.9k|                             4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
  ------------------
  |  |   53|  10.9k|#define PXSTRIDE(x) (x)
  ------------------
 1210|  10.9k|                const uint8_t *pal_idx;
 1211|  10.9k|                if (t->frame_thread.pass) {
  ------------------
  |  Branch (1211:21): [True: 0, False: 10.9k]
  ------------------
 1212|      0|                    const int p = t->frame_thread.pass & 1;
 1213|      0|                    assert(ts->frame_thread[p].pal_idx);
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1214|      0|                    pal_idx = ts->frame_thread[p].pal_idx;
 1215|      0|                    ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
 1216|  10.9k|                } else {
 1217|  10.9k|                    pal_idx = t->scratch.pal_idx_y;
 1218|  10.9k|                }
 1219|  10.9k|                const pixel *const pal = t->frame_thread.pass ?
  ------------------
  |  Branch (1219:42): [True: 0, False: 10.9k]
  ------------------
 1220|      0|                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
 1221|      0|                                        ((t->bx >> 1) + (t->by & 1))][0] :
 1222|  10.9k|                    bytefn(t->scratch.pal)[0];
  ------------------
  |  |   87|  10.9k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  10.9k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 1223|  10.9k|                f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
 1224|  10.9k|                                       pal_idx, bw4 * 4, bh4 * 4);
 1225|  10.9k|                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   34|  10.9k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 10.9k]
  |  |  ------------------
  |  |   35|  10.9k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  10.9k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                              if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1226|      0|                    hex_dump(dst, PXSTRIDE(f->cur.stride[0]),
  ------------------
  |  |   53|      0|#define PXSTRIDE(x) (x)
  ------------------
 1227|      0|                             bw4 * 4, bh4 * 4, "y-pal-pred");
 1228|  10.9k|            }
 1229|       |
 1230|   897k|            const int intra_flags = (sm_flag(t->a, bx4) |
 1231|   897k|                                     sm_flag(&t->l, by4) |
 1232|   897k|                                     intra_edge_filter_flag);
 1233|   897k|            const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
  ------------------
  |  Branch (1233:35): [True: 8.61k, False: 888k]
  |  Branch (1233:58): [True: 4.40k, False: 884k]
  ------------------
 1234|   888k|                              intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
 1235|   897k|            const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
  ------------------
  |  Branch (1235:35): [True: 8.62k, False: 888k]
  |  Branch (1235:48): [True: 4.40k, False: 883k]
  ------------------
 1236|   888k|                              intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
 1237|   897k|            int y, x;
 1238|   897k|            const int sub_w4 = imin(w4, init_x + 16);
 1239|  2.09M|            for (y = init_y, t->by += init_y; y < sub_h4;
  ------------------
  |  Branch (1239:47): [True: 1.18M, False: 905k]
  ------------------
 1240|  1.19M|                 y += t_dim->h, t->by += t_dim->h)
 1241|  1.18M|            {
 1242|  1.18M|                pixel *dst = ((pixel *) f->cur.data[0]) +
 1243|  1.18M|                               4 * (t->by * PXSTRIDE(f->cur.stride[0]) +
  ------------------
  |  |   53|  1.18M|#define PXSTRIDE(x) (x)
  ------------------
 1244|  1.18M|                                    t->bx + init_x);
 1245|  3.10M|                for (x = init_x, t->bx += init_x; x < sub_w4;
  ------------------
  |  Branch (1245:51): [True: 1.90M, False: 1.19M]
  ------------------
 1246|  1.91M|                     x += t_dim->w, t->bx += t_dim->w)
 1247|  1.90M|                {
 1248|  1.90M|                    if (b->pal_sz[0]) goto skip_y_pred;
  ------------------
  |  Branch (1248:25): [True: 41.3k, False: 1.86M]
  ------------------
 1249|       |
 1250|  1.86M|                    int angle = b->y_angle;
 1251|  1.86M|                    const enum EdgeFlags edge_flags =
 1252|  1.86M|                        (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
  ------------------
  |  Branch (1252:28): [True: 745k, False: 1.11M]
  |  Branch (1252:42): [True: 221k, False: 896k]
  |  Branch (1252:57): [True: 477k, False: 495k]
  ------------------
 1253|  1.38M|                             0 : EDGE_I444_TOP_HAS_RIGHT) |
 1254|  1.86M|                        ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
  ------------------
  |  Branch (1254:27): [True: 691k, False: 1.17M]
  |  Branch (1254:42): [True: 668k, False: 503k]
  |  Branch (1254:56): [True: 458k, False: 210k]
  ------------------
 1255|  1.15M|                             0 : EDGE_I444_LEFT_HAS_BOTTOM);
 1256|  1.86M|                    const pixel *top_sb_edge = NULL;
 1257|  1.86M|                    if (!(t->by & (f->sb_step - 1))) {
  ------------------
  |  Branch (1257:25): [True: 645k, False: 1.21M]
  ------------------
 1258|   645k|                        top_sb_edge = f->ipred_edge[0];
 1259|   645k|                        const int sby = t->by >> f->sb_shift;
 1260|   645k|                        top_sb_edge += f->sb128w * 128 * (sby - 1);
 1261|   645k|                    }
 1262|  1.86M|                    const enum IntraPredMode m =
 1263|  1.86M|                        bytefn(dav1d_prepare_intra_edges)(t->bx,
  ------------------
  |  |   87|  1.86M|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  1.86M|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 1264|  1.86M|                                                          t->bx > ts->tiling.col_start,
 1265|  1.86M|                                                          t->by,
 1266|  1.86M|                                                          t->by > ts->tiling.row_start,
 1267|  1.86M|                                                          ts->tiling.col_end,
 1268|  1.86M|                                                          ts->tiling.row_end,
 1269|  1.86M|                                                          edge_flags, dst,
 1270|  1.86M|                                                          f->cur.stride[0], top_sb_edge,
 1271|  1.86M|                                                          b->y_mode, &angle,
 1272|  1.86M|                                                          t_dim->w, t_dim->h,
 1273|  1.86M|                                                          f->seq_hdr->intra_edge_filter,
 1274|  1.86M|                                                          edge HIGHBD_CALL_SUFFIX);
 1275|  1.86M|                    dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
 1276|  1.86M|                                             t_dim->w * 4, t_dim->h * 4,
 1277|  1.86M|                                             angle | intra_flags,
 1278|  1.86M|                                             4 * f->bw - 4 * t->bx,
 1279|  1.86M|                                             4 * f->bh - 4 * t->by
 1280|  1.86M|                                             HIGHBD_CALL_SUFFIX);
 1281|       |
 1282|  1.86M|                    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   34|  1.86M|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 1.86M]
  |  |  ------------------
  |  |   35|  1.86M|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  1.86M|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                  if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1283|      0|                        hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
 1284|      0|                                 t_dim->h * 4, 2, "l");
 1285|      0|                        hex_dump(edge, 0, 1, 1, "tl");
 1286|      0|                        hex_dump(edge + 1, t_dim->w * 4,
 1287|      0|                                 t_dim->w * 4, 2, "t");
 1288|      0|                        hex_dump(dst, f->cur.stride[0],
 1289|      0|                                 t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
 1290|      0|                    }
 1291|       |
 1292|  1.91M|                skip_y_pred: {}
 1293|  1.91M|                    if (!b->skip) {
  ------------------
  |  Branch (1293:25): [True: 896k, False: 1.01M]
  ------------------
 1294|   896k|                        coef *cf;
 1295|   896k|                        int eob;
 1296|   896k|                        enum TxfmType txtp;
 1297|   896k|                        if (t->frame_thread.pass) {
  ------------------
  |  Branch (1297:29): [True: 0, False: 896k]
  ------------------
 1298|      0|                            const int p = t->frame_thread.pass & 1;
 1299|      0|                            const int cbi = *ts->frame_thread[p].cbi++;
 1300|      0|                            cf = ts->frame_thread[p].cf;
 1301|      0|                            ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
 1302|      0|                            eob  = cbi >> 5;
 1303|      0|                            txtp = cbi & 0x1f;
 1304|   896k|                        } else {
 1305|   896k|                            uint8_t cf_ctx;
 1306|   896k|                            cf = bitfn(t->cf);
  ------------------
  |  |   51|   896k|#define bitfn(x) x##_8bpc
  ------------------
 1307|   896k|                            eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
 1308|   896k|                                               &t->l.lcoef[by4 + y], b->tx, bs,
 1309|   896k|                                               b, 1, 0, cf, &txtp, &cf_ctx);
 1310|   896k|                            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|   896k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 896k]
  |  |  ------------------
  |  |   35|   896k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   896k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1311|      0|                                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
 1312|      0|                                       b->tx, txtp, eob, ts->msac.rng);
 1313|   896k|                            dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
 1314|   896k|                            dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
 1315|   896k|                        }
 1316|   896k|                        if (eob >= 0) {
  ------------------
  |  Branch (1316:29): [True: 470k, False: 426k]
  ------------------
 1317|   470k|                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   34|   470k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 470k]
  |  |  ------------------
  |  |   35|   470k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   470k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                          if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1318|      0|                                coef_dump(cf, imin(t_dim->h, 8) * 4,
 1319|      0|                                          imin(t_dim->w, 8) * 4, 3, "dq");
 1320|   470k|                            dsp->itx.itxfm_add[b->tx]
 1321|   470k|                                              [txtp](dst,
 1322|   470k|                                                     f->cur.stride[0],
 1323|   470k|                                                     cf, eob HIGHBD_CALL_SUFFIX);
 1324|   470k|                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   34|   470k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 470k]
  |  |  ------------------
  |  |   35|   470k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   470k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                          if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1325|      0|                                hex_dump(dst, f->cur.stride[0],
 1326|      0|                                         t_dim->w * 4, t_dim->h * 4, "recon");
 1327|   470k|                        }
 1328|  1.01M|                    } else if (!t->frame_thread.pass) {
  ------------------
  |  Branch (1328:32): [True: 1.01M, False: 18.4E]
  ------------------
 1329|  1.01M|                        dav1d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4 + x], 0x40);
 1330|  1.01M|                        dav1d_memset_pow2[t_dim->lh](&t->l.lcoef[by4 + y], 0x40);
 1331|  1.01M|                    }
 1332|  1.91M|                    dst += 4 * t_dim->w;
 1333|  1.91M|                }
 1334|  1.19M|                t->bx -= x;
 1335|  1.19M|            }
 1336|   905k|            t->by -= y;
 1337|       |
 1338|   905k|            if (!has_chroma) continue;
  ------------------
  |  Branch (1338:17): [True: 113k, False: 791k]
  ------------------
 1339|       |
 1340|   791k|            const ptrdiff_t stride = f->cur.stride[1];
 1341|       |
 1342|   791k|            if (b->uv_mode == CFL_PRED) {
  ------------------
  |  Branch (1342:17): [True: 419k, False: 371k]
  ------------------
 1343|   419k|                assert(!init_x && !init_y);
  ------------------
  |  |  140|   839k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 419k, False: 314]
  |  |  |  Branch (140:30): [True: 419k, False: 24]
  |  |  |  Branch (140:68): [Folded, False: 419k]
  |  |  ------------------
  ------------------
 1344|       |
 1345|   419k|                int16_t *const ac = t->scratch.ac;
 1346|   419k|                pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) +
 1347|   419k|                                 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]);
  ------------------
  |  |   53|   419k|#define PXSTRIDE(x) (x)
  ------------------
 1348|   419k|                const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
 1349|   419k|                                              (t->by >> ss_ver) * PXSTRIDE(stride));
  ------------------
  |  |   53|   419k|#define PXSTRIDE(x) (x)
  ------------------
 1350|   419k|                pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off,
 1351|   419k|                                           ((pixel *) f->cur.data[2]) + uv_off };
 1352|       |
 1353|   419k|                const int furthest_r =
 1354|   419k|                    ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
 1355|   419k|                const int furthest_b =
 1356|   419k|                    ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
 1357|   419k|                dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0],
 1358|   419k|                                                         cbw4 - (furthest_r >> ss_hor),
 1359|   419k|                                                         cbh4 - (furthest_b >> ss_ver),
 1360|   419k|                                                         cbw4 * 4, cbh4 * 4);
 1361|  1.26M|                for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1361:34): [True: 841k, False: 419k]
  ------------------
 1362|   841k|                    if (!b->cfl_alpha[pl]) continue;
  ------------------
  |  Branch (1362:25): [True: 29.2k, False: 811k]
  ------------------
 1363|   811k|                    int angle = 0;
 1364|   811k|                    const pixel *top_sb_edge = NULL;
 1365|   811k|                    if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
  ------------------
  |  Branch (1365:25): [True: 422k, False: 389k]
  ------------------
 1366|   422k|                        top_sb_edge = f->ipred_edge[pl + 1];
 1367|   422k|                        const int sby = t->by >> f->sb_shift;
 1368|   422k|                        top_sb_edge += f->sb128w * 128 * (sby - 1);
 1369|   422k|                    }
 1370|   811k|                    const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
 1371|   811k|                    const int xstart = ts->tiling.col_start >> ss_hor;
 1372|   811k|                    const int ystart = ts->tiling.row_start >> ss_ver;
 1373|   811k|                    const enum IntraPredMode m =
 1374|   811k|                        bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
  ------------------
  |  |   87|   811k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|   811k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 1375|   811k|                                                          ypos, ypos > ystart,
 1376|   811k|                                                          ts->tiling.col_end >> ss_hor,
 1377|   811k|                                                          ts->tiling.row_end >> ss_ver,
 1378|   811k|                                                          0, uv_dst[pl], stride,
 1379|   811k|                                                          top_sb_edge, DC_PRED, &angle,
 1380|   811k|                                                          uv_t_dim->w, uv_t_dim->h, 0,
 1381|   811k|                                                          edge HIGHBD_CALL_SUFFIX);
 1382|   811k|                    dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
 1383|   811k|                                           uv_t_dim->w * 4,
 1384|   811k|                                           uv_t_dim->h * 4,
 1385|   811k|                                           ac, b->cfl_alpha[pl]
 1386|   811k|                                           HIGHBD_CALL_SUFFIX);
 1387|   811k|                }
 1388|   419k|                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   34|   419k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 419k]
  |  |  ------------------
  |  |   35|   419k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   419k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                              if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1389|      0|                    ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
 1390|      0|                    hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
 1391|      0|                    hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
 1392|      0|                }
 1393|   419k|            } else if (b->pal_sz[1]) {
  ------------------
  |  Branch (1393:24): [True: 3.84k, False: 367k]
  ------------------
 1394|  3.84k|                const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
 1395|  3.84k|                                              (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
  ------------------
  |  |   53|  3.84k|#define PXSTRIDE(x) (x)
  ------------------
 1396|  3.84k|                const pixel (*pal)[8];
 1397|  3.84k|                const uint8_t *pal_idx;
 1398|  3.84k|                if (t->frame_thread.pass) {
  ------------------
  |  Branch (1398:21): [True: 0, False: 3.84k]
  ------------------
 1399|      0|                    const int p = t->frame_thread.pass & 1;
 1400|      0|                    assert(ts->frame_thread[p].pal_idx);
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1401|      0|                    pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
 1402|      0|                                              ((t->bx >> 1) + (t->by & 1))];
 1403|      0|                    pal_idx = ts->frame_thread[p].pal_idx;
 1404|      0|                    ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
 1405|  3.84k|                } else {
 1406|  3.84k|                    pal = bytefn(t->scratch.pal);
  ------------------
  |  |   87|  3.84k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  3.84k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 1407|  3.84k|                    pal_idx = t->scratch.pal_idx_uv;
 1408|  3.84k|                }
 1409|       |
 1410|  3.84k|                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
 1411|  3.84k|                                       f->cur.stride[1], pal[1],
 1412|  3.84k|                                       pal_idx, cbw4 * 4, cbh4 * 4);
 1413|  3.84k|                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff,
 1414|  3.84k|                                       f->cur.stride[1], pal[2],
 1415|  3.84k|                                       pal_idx, cbw4 * 4, cbh4 * 4);
 1416|  3.84k|                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   34|  3.84k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 3.84k]
  |  |  ------------------
  |  |   35|  3.84k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  3.84k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                              if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1417|      0|                    hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff,
 1418|      0|                             PXSTRIDE(f->cur.stride[1]),
  ------------------
  |  |   53|      0|#define PXSTRIDE(x) (x)
  ------------------
 1419|      0|                             cbw4 * 4, cbh4 * 4, "u-pal-pred");
 1420|      0|                    hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff,
 1421|      0|                             PXSTRIDE(f->cur.stride[1]),
  ------------------
  |  |   53|      0|#define PXSTRIDE(x) (x)
  ------------------
 1422|      0|                             cbw4 * 4, cbh4 * 4, "v-pal-pred");
 1423|      0|                }
 1424|  3.84k|            }
 1425|       |
 1426|   791k|            const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
 1427|   791k|                                 sm_uv_flag(&t->l, cby4);
 1428|   791k|            const int uv_sb_has_tr =
 1429|   791k|                ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
  ------------------
  |  Branch (1429:17): [True: 7.77k, False: 783k]
  |  Branch (1429:55): [True: 3.49k, False: 779k]
  ------------------
 1430|   783k|                intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1));
 1431|   791k|            const int uv_sb_has_bl =
 1432|   791k|                init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
  ------------------
  |  Branch (1432:17): [True: 7.77k, False: 783k]
  |  Branch (1432:30): [True: 3.49k, False: 779k]
  ------------------
 1433|   783k|                intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1));
 1434|   791k|            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
 1435|  2.35M|            for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1435:30): [True: 1.56M, False: 791k]
  ------------------
 1436|  3.51M|                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
  ------------------
  |  Branch (1436:61): [True: 1.94M, False: 1.56M]
  ------------------
 1437|  1.94M|                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
 1438|  1.94M|                {
 1439|  1.94M|                    pixel *dst = ((pixel *) f->cur.data[1 + pl]) +
 1440|  1.94M|                                   4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
  ------------------
  |  |   53|  1.94M|#define PXSTRIDE(x) (x)
  ------------------
 1441|  1.94M|                                        ((t->bx + init_x) >> ss_hor));
 1442|  4.88M|                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
  ------------------
  |  Branch (1442:65): [True: 2.93M, False: 1.94M]
  ------------------
 1443|  2.93M|                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
 1444|  2.93M|                    {
 1445|  2.93M|                        if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
  ------------------
  |  Branch (1445:30): [True: 835k, False: 2.09M]
  |  Branch (1445:56): [True: 806k, False: 28.7k]
  ------------------
 1446|  2.13M|                            b->pal_sz[1])
  ------------------
  |  Branch (1446:29): [True: 18.4k, False: 2.11M]
  ------------------
 1447|   825k|                        {
 1448|   825k|                            goto skip_uv_pred;
 1449|   825k|                        }
 1450|       |
 1451|  2.10M|                        int angle = b->uv_angle;
 1452|       |                        // this probably looks weird because we're using
 1453|       |                        // luma flags in a chroma loop, but that's because
 1454|       |                        // prepare_intra_edges() expects luma flags as input
 1455|  2.10M|                        const enum EdgeFlags edge_flags =
 1456|  2.10M|                            (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
  ------------------
  |  Branch (1456:32): [True: 1.02M, False: 1.08M]
  |  Branch (1456:58): [True: 148k, False: 933k]
  ------------------
 1457|  1.18M|                              (x + uv_t_dim->w >= sub_cw4)) ?
  ------------------
  |  Branch (1457:31): [True: 513k, False: 671k]
  ------------------
 1458|  1.59M|                                 0 : EDGE_I444_TOP_HAS_RIGHT) |
 1459|  2.10M|                            ((x > (init_x >> ss_hor) ||
  ------------------
  |  Branch (1459:31): [True: 970k, False: 1.13M]
  ------------------
 1460|  1.13M|                              (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
  ------------------
  |  Branch (1460:32): [True: 620k, False: 518k]
  |  Branch (1460:49): [True: 323k, False: 296k]
  ------------------
 1461|  1.30M|                                 0 : EDGE_I444_LEFT_HAS_BOTTOM);
 1462|  2.10M|                        const pixel *top_sb_edge = NULL;
 1463|  2.10M|                        if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
  ------------------
  |  Branch (1463:29): [True: 750k, False: 1.35M]
  ------------------
 1464|   750k|                            top_sb_edge = f->ipred_edge[1 + pl];
 1465|   750k|                            const int sby = t->by >> f->sb_shift;
 1466|   750k|                            top_sb_edge += f->sb128w * 128 * (sby - 1);
 1467|   750k|                        }
 1468|  2.10M|                        const enum IntraPredMode uv_mode =
 1469|  2.10M|                             b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
  ------------------
  |  Branch (1469:30): [True: 29.2k, False: 2.07M]
  ------------------
 1470|  2.10M|                        const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
 1471|  2.10M|                        const int xstart = ts->tiling.col_start >> ss_hor;
 1472|  2.10M|                        const int ystart = ts->tiling.row_start >> ss_ver;
 1473|  2.10M|                        const enum IntraPredMode m =
 1474|  2.10M|                            bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
  ------------------
  |  |   87|  2.10M|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  2.10M|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 1475|  2.10M|                                                              ypos, ypos > ystart,
 1476|  2.10M|                                                              ts->tiling.col_end >> ss_hor,
 1477|  2.10M|                                                              ts->tiling.row_end >> ss_ver,
 1478|  2.10M|                                                              edge_flags, dst, stride,
 1479|  2.10M|                                                              top_sb_edge, uv_mode,
 1480|  2.10M|                                                              &angle, uv_t_dim->w,
 1481|  2.10M|                                                              uv_t_dim->h,
 1482|  2.10M|                                                              f->seq_hdr->intra_edge_filter,
 1483|  2.10M|                                                              edge HIGHBD_CALL_SUFFIX);
 1484|  2.10M|                        angle |= intra_edge_filter_flag;
 1485|  2.10M|                        dsp->ipred.intra_pred[m](dst, stride, edge,
 1486|  2.10M|                                                 uv_t_dim->w * 4,
 1487|  2.10M|                                                 uv_t_dim->h * 4,
 1488|  2.10M|                                                 angle | sm_uv_fl,
 1489|  2.10M|                                                 (4 * f->bw + ss_hor -
 1490|  2.10M|                                                  4 * (t->bx & ~ss_hor)) >> ss_hor,
 1491|  2.10M|                                                 (4 * f->bh + ss_ver -
 1492|  2.10M|                                                  4 * (t->by & ~ss_ver)) >> ss_ver
 1493|  2.10M|                                                 HIGHBD_CALL_SUFFIX);
 1494|  2.10M|                        if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   34|  2.10M|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 2.10M]
  |  |  ------------------
  |  |   35|  2.10M|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  2.10M|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                      if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1495|      0|                            hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
 1496|      0|                                     uv_t_dim->h * 4, 2, "l");
 1497|      0|                            hex_dump(edge, 0, 1, 1, "tl");
 1498|      0|                            hex_dump(edge + 1, uv_t_dim->w * 4,
 1499|      0|                                     uv_t_dim->w * 4, 2, "t");
 1500|      0|                            hex_dump(dst, stride, uv_t_dim->w * 4,
 1501|      0|                                     uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
  ------------------
  |  Branch (1501:55): [True: 0, False: 0]
  ------------------
 1502|      0|                        }
 1503|       |
 1504|  2.93M|                    skip_uv_pred: {}
 1505|  2.93M|                        if (!b->skip) {
  ------------------
  |  Branch (1505:29): [True: 1.23M, False: 1.70M]
  ------------------
 1506|  1.23M|                            enum TxfmType txtp;
 1507|  1.23M|                            int eob;
 1508|  1.23M|                            coef *cf;
 1509|  1.23M|                            if (t->frame_thread.pass) {
  ------------------
  |  Branch (1509:33): [True: 0, False: 1.23M]
  ------------------
 1510|      0|                                const int p = t->frame_thread.pass & 1;
 1511|      0|                                const int cbi = *ts->frame_thread[p].cbi++;
 1512|      0|                                cf = ts->frame_thread[p].cf;
 1513|      0|                                ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
 1514|      0|                                eob  = cbi >> 5;
 1515|      0|                                txtp = cbi & 0x1f;
 1516|  1.23M|                            } else {
 1517|  1.23M|                                uint8_t cf_ctx;
 1518|  1.23M|                                cf = bitfn(t->cf);
  ------------------
  |  |   51|  1.23M|#define bitfn(x) x##_8bpc
  ------------------
 1519|  1.23M|                                eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
 1520|  1.23M|                                                   &t->l.ccoef[pl][cby4 + y],
 1521|  1.23M|                                                   b->uvtx, bs, b, 1, 1 + pl, cf,
 1522|  1.23M|                                                   &txtp, &cf_ctx);
 1523|  1.23M|                                if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  1.23M|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 1.23M]
  |  |  ------------------
  |  |   35|  1.23M|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  1.23M|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1524|      0|                                    printf("Post-uv-cf-blk[pl=%d,tx=%d,"
 1525|      0|                                           "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
 1526|      0|                                           pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
 1527|  1.23M|                                int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
 1528|  1.23M|                                int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
 1529|  1.23M|                                dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
 1530|  1.23M|                                dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
 1531|  1.23M|                            }
 1532|  1.23M|                            if (eob >= 0) {
  ------------------
  |  Branch (1532:33): [True: 590k, False: 643k]
  ------------------
 1533|   590k|                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   34|   590k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 590k]
  |  |  ------------------
  |  |   35|   590k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   590k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                              if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1534|      0|                                    coef_dump(cf, uv_t_dim->h * 4,
 1535|      0|                                              uv_t_dim->w * 4, 3, "dq");
 1536|   590k|                                dsp->itx.itxfm_add[b->uvtx]
 1537|   590k|                                                  [txtp](dst, stride,
 1538|   590k|                                                         cf, eob HIGHBD_CALL_SUFFIX);
 1539|   590k|                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   34|   590k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 590k]
  |  |  ------------------
  |  |   35|   590k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   590k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                              if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1540|      0|                                    hex_dump(dst, stride, uv_t_dim->w * 4,
 1541|      0|                                             uv_t_dim->h * 4, "recon");
 1542|   590k|                            }
 1543|  1.70M|                        } else if (!t->frame_thread.pass) {
  ------------------
  |  Branch (1543:36): [True: 1.70M, False: 18.4E]
  ------------------
 1544|  1.70M|                            dav1d_memset_pow2[uv_t_dim->lw](&t->a->ccoef[pl][cbx4 + x], 0x40);
 1545|  1.70M|                            dav1d_memset_pow2[uv_t_dim->lh](&t->l.ccoef[pl][cby4 + y], 0x40);
 1546|  1.70M|                        }
 1547|  2.93M|                        dst += uv_t_dim->w * 4;
 1548|  2.93M|                    }
 1549|  1.94M|                    t->bx -= x << ss_hor;
 1550|  1.94M|                }
 1551|  1.56M|                t->by -= y << ss_ver;
 1552|  1.56M|            }
 1553|   791k|        }
 1554|   891k|    }
 1555|   888k|}
dav1d_recon_b_inter_8bpc:
 1559|   609k|{
 1560|   609k|    Dav1dTileState *const ts = t->ts;
 1561|   609k|    const Dav1dFrameContext *const f = t->f;
 1562|   609k|    const Dav1dDSPContext *const dsp = f->dsp;
 1563|   609k|    const int bx4 = t->bx & 31, by4 = t->by & 31;
 1564|   609k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 1565|   609k|    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
 1566|   609k|    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
 1567|   609k|    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
 1568|   609k|    const int bw4 = b_dim[0], bh4 = b_dim[1];
 1569|   609k|    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
 1570|   609k|    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
  ------------------
  |  Branch (1570:28): [True: 585k, False: 23.4k]
  ------------------
 1571|   585k|                           (bw4 > ss_hor || t->bx & 1) &&
  ------------------
  |  Branch (1571:29): [True: 566k, False: 19.2k]
  |  Branch (1571:45): [True: 9.66k, False: 9.62k]
  ------------------
 1572|   575k|                           (bh4 > ss_ver || t->by & 1);
  ------------------
  |  Branch (1572:29): [True: 561k, False: 14.1k]
  |  Branch (1572:45): [True: 7.08k, False: 7.09k]
  ------------------
 1573|   609k|    const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
  ------------------
  |  Branch (1573:32): [True: 11.1k, False: 597k]
  ------------------
 1574|   609k|                               DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout;
 1575|   609k|    int res;
 1576|       |
 1577|       |    // prediction
 1578|   609k|    const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
 1579|   609k|    pixel *dst = ((pixel *) f->cur.data[0]) +
 1580|   609k|        4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
  ------------------
  |  |   53|   609k|#define PXSTRIDE(x) (x)
  ------------------
 1581|   609k|    const ptrdiff_t uvdstoff =
 1582|   609k|        4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
  ------------------
  |  |   53|   609k|#define PXSTRIDE(x) (x)
  ------------------
 1583|   609k|    if (IS_KEY_OR_INTRA(f->frame_hdr)) {
  ------------------
  |  |   43|   609k|    (!IS_INTER_OR_SWITCH(frame_header))
  |  |  ------------------
  |  |  |  |   36|   609k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (43:5): [True: 536k, False: 72.5k]
  |  |  ------------------
  ------------------
 1584|       |        // intrabc
 1585|   536k|        assert(!f->frame_hdr->super_res.enabled);
  ------------------
  |  |  140|   536k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 536k]
  |  |  |  Branch (140:68): [Folded, False: 536k]
  |  |  ------------------
  ------------------
 1586|   536k|        res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
 1587|   536k|                 b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
 1588|   536k|        if (res) return res;
  ------------------
  |  Branch (1588:13): [True: 0, False: 536k]
  ------------------
 1589|  1.55M|        if (has_chroma) for (int pl = 1; pl < 3; pl++) {
  ------------------
  |  Branch (1589:13): [True: 521k, False: 15.1k]
  |  Branch (1589:42): [True: 1.03M, False: 521k]
  ------------------
 1590|  1.03M|            res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1],
 1591|  1.03M|                     bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
 1592|  1.03M|                     t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0],
 1593|  1.03M|                     &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
 1594|  1.03M|            if (res) return res;
  ------------------
  |  Branch (1594:17): [True: 0, False: 1.03M]
  ------------------
 1595|  1.03M|        }
 1596|   536k|    } else if (b->comp_type == COMP_INTER_NONE) {
  ------------------
  |  Branch (1596:16): [True: 53.1k, False: 19.3k]
  ------------------
 1597|  53.1k|        const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
 1598|  53.1k|        const enum Filter2d filter_2d = b->filter2d;
 1599|       |
 1600|  53.1k|        if (imin(bw4, bh4) > 1 &&
  ------------------
  |  Branch (1600:13): [True: 30.4k, False: 22.7k]
  ------------------
 1601|  30.4k|            ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
  ------------------
  |  Branch (1601:15): [True: 6.75k, False: 23.6k]
  |  Branch (1601:44): [True: 1.06k, False: 5.69k]
  ------------------
 1602|  29.3k|             (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
  ------------------
  |  Branch (1602:15): [True: 3.06k, False: 26.3k]
  |  Branch (1602:44): [True: 2.75k, False: 309]
  ------------------
 1603|  3.81k|        {
 1604|  3.81k|            res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp,
 1605|  3.81k|                              b->motion_mode == MM_WARP ? &t->warpmv :
  ------------------
  |  Branch (1605:31): [True: 2.75k, False: 1.06k]
  ------------------
 1606|  3.81k|                                  &f->frame_hdr->gmv[b->ref[0]]);
 1607|  3.81k|            if (res) return res;
  ------------------
  |  Branch (1607:17): [True: 0, False: 3.81k]
  ------------------
 1608|  49.3k|        } else {
 1609|  49.3k|            res = mc(t, dst, NULL, f->cur.stride[0],
 1610|  49.3k|                     bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d);
 1611|  49.3k|            if (res) return res;
  ------------------
  |  Branch (1611:17): [True: 0, False: 49.3k]
  ------------------
 1612|  49.3k|            if (b->motion_mode == MM_OBMC) {
  ------------------
  |  Branch (1612:17): [True: 11.0k, False: 38.2k]
  ------------------
 1613|  11.0k|                res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4);
 1614|  11.0k|                if (res) return res;
  ------------------
  |  Branch (1614:21): [True: 0, False: 11.0k]
  ------------------
 1615|  11.0k|            }
 1616|  49.3k|        }
 1617|  53.1k|        if (b->interintra_type) {
  ------------------
  |  Branch (1617:13): [True: 1.55k, False: 51.6k]
  ------------------
 1618|  1.55k|            pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
  ------------------
  |  |   51|  1.55k|#define bitfn(x) x##_8bpc
  ------------------
 1619|  1.55k|            enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
  ------------------
  |  Branch (1619:36): [True: 304, False: 1.25k]
  ------------------
 1620|  1.25k|                                   SMOOTH_PRED : b->interintra_mode;
 1621|  1.55k|            pixel *const tmp = bitfn(t->scratch.interintra);
  ------------------
  |  |   51|  1.55k|#define bitfn(x) x##_8bpc
  ------------------
 1622|  1.55k|            int angle = 0;
 1623|  1.55k|            const pixel *top_sb_edge = NULL;
 1624|  1.55k|            if (!(t->by & (f->sb_step - 1))) {
  ------------------
  |  Branch (1624:17): [True: 416, False: 1.14k]
  ------------------
 1625|    416|                top_sb_edge = f->ipred_edge[0];
 1626|    416|                const int sby = t->by >> f->sb_shift;
 1627|    416|                top_sb_edge += f->sb128w * 128 * (sby - 1);
 1628|    416|            }
 1629|  1.55k|            m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
  ------------------
  |  |   87|  1.55k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  1.55k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 1630|  1.55k|                                                  t->by, t->by > ts->tiling.row_start,
 1631|  1.55k|                                                  ts->tiling.col_end, ts->tiling.row_end,
 1632|  1.55k|                                                  0, dst, f->cur.stride[0], top_sb_edge,
 1633|  1.55k|                                                  m, &angle, bw4, bh4, 0, tl_edge
 1634|  1.55k|                                                  HIGHBD_CALL_SUFFIX);
 1635|  1.55k|            dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
 1636|  1.55k|                                     tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
 1637|  1.55k|                                     HIGHBD_CALL_SUFFIX);
 1638|  1.55k|            dsp->mc.blend(dst, f->cur.stride[0], tmp,
 1639|  1.55k|                          bw4 * 4, bh4 * 4, II_MASK(0, bs, b));
  ------------------
  |  |   83|  1.55k|    ((const uint8_t*)((uintptr_t)&dav1d_masks + \
  |  |   84|  1.55k|    (size_t)((b)->interintra_type == INTER_INTRA_BLEND ? \
  |  |  ------------------
  |  |  |  Branch (84:14): [True: 1.12k, False: 431]
  |  |  ------------------
  |  |   85|  1.55k|    dav1d_masks.offsets[c][(bs)-BS_32x32].ii[(b)->interintra_mode] : \
  |  |   86|  1.55k|    dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[0][(b)->wedge_idx]) * 8))
  ------------------
 1640|  1.55k|        }
 1641|       |
 1642|  53.1k|        if (!has_chroma) goto skip_inter_chroma_pred;
  ------------------
  |  Branch (1642:13): [True: 12.2k, False: 40.8k]
  ------------------
 1643|       |
 1644|       |        // sub8x8 derivation
 1645|  40.8k|        int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
  ------------------
  |  Branch (1645:25): [True: 4.32k, False: 36.5k]
  |  Branch (1645:42): [True: 3.94k, False: 32.6k]
  ------------------
 1646|  40.8k|        refmvs_block *const *r;
 1647|  40.8k|        if (is_sub8x8) {
  ------------------
  |  Branch (1647:13): [True: 8.27k, False: 32.6k]
  ------------------
 1648|  8.27k|            assert(ss_hor == 1);
  ------------------
  |  |  140|  8.27k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 8.27k]
  |  |  |  Branch (140:68): [Folded, False: 8.27k]
  |  |  ------------------
  ------------------
 1649|  8.27k|            r = &t->rt.r[(t->by & 31) + 5];
 1650|  8.27k|            if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
  ------------------
  |  Branch (1650:17): [True: 4.32k, False: 3.94k]
  ------------------
 1651|  8.27k|            if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
  ------------------
  |  Branch (1651:17): [True: 5.35k, False: 2.91k]
  ------------------
 1652|  8.27k|            if (bw4 == 1 && bh4 == ss_ver)
  ------------------
  |  Branch (1652:17): [True: 4.32k, False: 3.94k]
  |  Branch (1652:29): [True: 1.40k, False: 2.91k]
  ------------------
 1653|  1.40k|                is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
 1654|  8.27k|        }
 1655|       |
 1656|       |        // chroma prediction
 1657|  40.8k|        if (is_sub8x8) {
  ------------------
  |  Branch (1657:13): [True: 7.98k, False: 32.8k]
  ------------------
 1658|  7.98k|            assert(ss_hor == 1);
  ------------------
  |  |  140|  7.98k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 7.98k]
  |  |  |  Branch (140:68): [Folded, False: 7.98k]
  |  |  ------------------
  ------------------
 1659|  7.98k|            ptrdiff_t h_off = 0, v_off = 0;
 1660|  7.98k|            if (bw4 == 1 && bh4 == ss_ver) {
  ------------------
  |  Branch (1660:17): [True: 4.16k, False: 3.81k]
  |  Branch (1660:29): [True: 1.32k, False: 2.84k]
  ------------------
 1661|  3.96k|                for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1661:34): [True: 2.64k, False: 1.32k]
  ------------------
 1662|  2.64k|                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
 1663|  2.64k|                             NULL, f->cur.stride[1],
 1664|  2.64k|                             bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
 1665|  2.64k|                             r[-1][t->bx - 1].mv.mv[0],
 1666|  2.64k|                             &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1],
 1667|  2.64k|                             r[-1][t->bx - 1].ref.ref[0] - 1,
 1668|  2.64k|                             t->frame_thread.pass != 2 ? t->tl_4x4_filter :
  ------------------
  |  Branch (1668:30): [True: 2.64k, False: 0]
  ------------------
 1669|  2.64k|                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
 1670|  2.64k|                    if (res) return res;
  ------------------
  |  Branch (1670:25): [True: 0, False: 2.64k]
  ------------------
 1671|  2.64k|                }
 1672|  1.32k|                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
  ------------------
  |  |   53|  1.32k|#define PXSTRIDE(x) (x)
  ------------------
 1673|  1.32k|                h_off = 2;
 1674|  1.32k|            }
 1675|  7.98k|            if (bw4 == 1) {
  ------------------
  |  Branch (1675:17): [True: 4.16k, False: 3.81k]
  ------------------
 1676|  4.16k|                const enum Filter2d left_filter_2d =
 1677|  4.16k|                    dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
 1678|  12.5k|                for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1678:34): [True: 8.33k, False: 4.16k]
  ------------------
 1679|  8.33k|                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL,
 1680|  8.33k|                             f->cur.stride[1], bw4, bh4, t->bx - 1,
 1681|  8.33k|                             t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0],
 1682|  8.33k|                             &f->refp[r[0][t->bx - 1].ref.ref[0] - 1],
 1683|  8.33k|                             r[0][t->bx - 1].ref.ref[0] - 1,
 1684|  8.33k|                             t->frame_thread.pass != 2 ? left_filter_2d :
  ------------------
  |  Branch (1684:30): [True: 8.33k, False: 0]
  ------------------
 1685|  8.33k|                                 f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
 1686|  8.33k|                    if (res) return res;
  ------------------
  |  Branch (1686:25): [True: 0, False: 8.33k]
  ------------------
 1687|  8.33k|                }
 1688|  4.16k|                h_off = 2;
 1689|  4.16k|            }
 1690|  7.98k|            if (bh4 == ss_ver) {
  ------------------
  |  Branch (1690:17): [True: 5.13k, False: 2.84k]
  ------------------
 1691|  5.13k|                const enum Filter2d top_filter_2d =
 1692|  5.13k|                    dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
 1693|  15.4k|                for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1693:34): [True: 10.2k, False: 5.13k]
  ------------------
 1694|  10.2k|                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL,
 1695|  10.2k|                             f->cur.stride[1], bw4, bh4, t->bx, t->by - 1,
 1696|  10.2k|                             1 + pl, r[-1][t->bx].mv.mv[0],
 1697|  10.2k|                             &f->refp[r[-1][t->bx].ref.ref[0] - 1],
 1698|  10.2k|                             r[-1][t->bx].ref.ref[0] - 1,
 1699|  10.2k|                             t->frame_thread.pass != 2 ? top_filter_2d :
  ------------------
  |  Branch (1699:30): [True: 10.2k, False: 0]
  ------------------
 1700|  10.2k|                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
 1701|  10.2k|                    if (res) return res;
  ------------------
  |  Branch (1701:25): [True: 0, False: 10.2k]
  ------------------
 1702|  10.2k|                }
 1703|  5.13k|                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
  ------------------
  |  |   53|  5.13k|#define PXSTRIDE(x) (x)
  ------------------
 1704|  5.13k|            }
 1705|  23.9k|            for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1705:30): [True: 15.9k, False: 7.98k]
  ------------------
 1706|  15.9k|                res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1],
 1707|  15.9k|                         bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0],
 1708|  15.9k|                         refp, b->ref[0], filter_2d);
 1709|  15.9k|                if (res) return res;
  ------------------
  |  Branch (1709:21): [True: 0, False: 15.9k]
  ------------------
 1710|  15.9k|            }
 1711|  32.8k|        } else {
 1712|  32.8k|            if (imin(cbw4, cbh4) > 1 &&
  ------------------
  |  Branch (1712:17): [True: 11.9k, False: 20.9k]
  ------------------
 1713|  11.9k|                ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
  ------------------
  |  Branch (1713:19): [True: 2.05k, False: 9.89k]
  |  Branch (1713:48): [True: 364, False: 1.69k]
  ------------------
 1714|  11.5k|                 (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
  ------------------
  |  Branch (1714:19): [True: 882, False: 10.7k]
  |  Branch (1714:48): [True: 813, False: 69]
  ------------------
 1715|  1.17k|            {
 1716|  3.53k|                for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1716:34): [True: 2.35k, False: 1.17k]
  ------------------
 1717|  2.35k|                    res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL,
 1718|  2.35k|                                      f->cur.stride[1], b_dim, 1 + pl, refp,
 1719|  2.35k|                                      b->motion_mode == MM_WARP ? &t->warpmv :
  ------------------
  |  Branch (1719:39): [True: 1.62k, False: 728]
  ------------------
 1720|  2.35k|                                          &f->frame_hdr->gmv[b->ref[0]]);
 1721|  2.35k|                    if (res) return res;
  ------------------
  |  Branch (1721:25): [True: 0, False: 2.35k]
  ------------------
 1722|  2.35k|                }
 1723|  31.7k|            } else {
 1724|  95.1k|                for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1724:34): [True: 63.4k, False: 31.7k]
  ------------------
 1725|  63.4k|                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
 1726|  63.4k|                             NULL, f->cur.stride[1],
 1727|  63.4k|                             bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
 1728|  63.4k|                             t->bx & ~ss_hor, t->by & ~ss_ver,
 1729|  63.4k|                             1 + pl, b->mv[0], refp, b->ref[0], filter_2d);
 1730|  63.4k|                    if (res) return res;
  ------------------
  |  Branch (1730:25): [True: 0, False: 63.4k]
  ------------------
 1731|  63.4k|                    if (b->motion_mode == MM_OBMC) {
  ------------------
  |  Branch (1731:25): [True: 21.8k, False: 41.5k]
  ------------------
 1732|  21.8k|                        res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
 1733|  21.8k|                                   f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
 1734|  21.8k|                        if (res) return res;
  ------------------
  |  Branch (1734:29): [True: 0, False: 21.8k]
  ------------------
 1735|  21.8k|                    }
 1736|  63.4k|                }
 1737|  31.7k|            }
 1738|  32.8k|            if (b->interintra_type) {
  ------------------
  |  Branch (1738:17): [True: 1.54k, False: 31.3k]
  ------------------
 1739|       |                // FIXME for 8x32 with 4:2:2 subsampling, this probably does
 1740|       |                // the wrong thing since it will select 4x16, not 4x32, as a
 1741|       |                // transform size...
 1742|  1.54k|                const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b);
  ------------------
  |  |   83|  1.54k|    ((const uint8_t*)((uintptr_t)&dav1d_masks + \
  |  |   84|  1.54k|    (size_t)((b)->interintra_type == INTER_INTRA_BLEND ? \
  |  |  ------------------
  |  |  |  Branch (84:14): [True: 1.12k, False: 424]
  |  |  ------------------
  |  |   85|  1.54k|    dav1d_masks.offsets[c][(bs)-BS_32x32].ii[(b)->interintra_mode] : \
  |  |   86|  1.54k|    dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[0][(b)->wedge_idx]) * 8))
  ------------------
 1743|       |
 1744|  4.63k|                for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1744:34): [True: 3.09k, False: 1.54k]
  ------------------
 1745|  3.09k|                    pixel *const tmp = bitfn(t->scratch.interintra);
  ------------------
  |  |   51|  3.09k|#define bitfn(x) x##_8bpc
  ------------------
 1746|  3.09k|                    pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
  ------------------
  |  |   51|  3.09k|#define bitfn(x) x##_8bpc
  ------------------
 1747|  3.09k|                    enum IntraPredMode m =
 1748|  3.09k|                        b->interintra_mode == II_SMOOTH_PRED ?
  ------------------
  |  Branch (1748:25): [True: 602, False: 2.48k]
  ------------------
 1749|  2.48k|                        SMOOTH_PRED : b->interintra_mode;
 1750|  3.09k|                    int angle = 0;
 1751|  3.09k|                    pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
 1752|  3.09k|                    const pixel *top_sb_edge = NULL;
 1753|  3.09k|                    if (!(t->by & (f->sb_step - 1))) {
  ------------------
  |  Branch (1753:25): [True: 820, False: 2.27k]
  ------------------
 1754|    820|                        top_sb_edge = f->ipred_edge[pl + 1];
 1755|    820|                        const int sby = t->by >> f->sb_shift;
 1756|    820|                        top_sb_edge += f->sb128w * 128 * (sby - 1);
 1757|    820|                    }
 1758|  3.09k|                    m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
  ------------------
  |  |   87|  3.09k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  3.09k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 1759|  3.09k|                                                          (t->bx >> ss_hor) >
 1760|  3.09k|                                                              (ts->tiling.col_start >> ss_hor),
 1761|  3.09k|                                                          t->by >> ss_ver,
 1762|  3.09k|                                                          (t->by >> ss_ver) >
 1763|  3.09k|                                                              (ts->tiling.row_start >> ss_ver),
 1764|  3.09k|                                                          ts->tiling.col_end >> ss_hor,
 1765|  3.09k|                                                          ts->tiling.row_end >> ss_ver,
 1766|  3.09k|                                                          0, uvdst, f->cur.stride[1],
 1767|  3.09k|                                                          top_sb_edge, m,
 1768|  3.09k|                                                          &angle, cbw4, cbh4, 0, tl_edge
 1769|  3.09k|                                                          HIGHBD_CALL_SUFFIX);
 1770|  3.09k|                    dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
 1771|  3.09k|                                             tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
 1772|  3.09k|                                             HIGHBD_CALL_SUFFIX);
 1773|  3.09k|                    dsp->mc.blend(uvdst, f->cur.stride[1], tmp,
 1774|  3.09k|                                  cbw4 * 4, cbh4 * 4, ii_mask);
 1775|  3.09k|                }
 1776|  1.54k|            }
 1777|  32.8k|        }
 1778|       |
 1779|  53.1k|    skip_inter_chroma_pred: {}
 1780|  53.1k|        t->tl_4x4_filter = filter_2d;
 1781|  53.1k|    } else {
 1782|  19.3k|        const enum Filter2d filter_2d = b->filter2d;
 1783|       |        // Maximum super block size is 128x128
 1784|  19.3k|        int16_t (*tmp)[128 * 128] = t->scratch.compinter;
 1785|  19.3k|        int jnt_weight;
 1786|  19.3k|        uint8_t *const seg_mask = t->scratch.seg_mask;
 1787|  19.3k|        const uint8_t *mask;
 1788|       |
 1789|  42.7k|        for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (1789:25): [True: 23.4k, False: 19.3k]
  ------------------
 1790|  23.4k|            const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
 1791|       |
 1792|  23.4k|            if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
  ------------------
  |  Branch (1792:17): [True: 2.45k, False: 20.9k]
  |  Branch (1792:55): [True: 452, False: 2.00k]
  ------------------
 1793|    452|                res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
 1794|    452|                                  &f->frame_hdr->gmv[b->ref[i]]);
 1795|    452|                if (res) return res;
  ------------------
  |  Branch (1795:21): [True: 0, False: 452]
  ------------------
 1796|  22.9k|            } else {
 1797|  22.9k|                res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
 1798|  22.9k|                         b->mv[i], refp, b->ref[i], filter_2d);
 1799|  22.9k|                if (res) return res;
  ------------------
  |  Branch (1799:21): [True: 0, False: 22.9k]
  ------------------
 1800|  22.9k|            }
 1801|  23.4k|        }
 1802|  19.3k|        switch (b->comp_type) {
  ------------------
  |  Branch (1802:17): [True: 11.7k, False: 7.66k]
  ------------------
 1803|  7.96k|        case COMP_INTER_AVG:
  ------------------
  |  Branch (1803:9): [True: 7.96k, False: 11.4k]
  ------------------
 1804|  7.96k|            dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1],
 1805|  7.96k|                        bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX);
 1806|  7.96k|            break;
 1807|  2.05k|        case COMP_INTER_WEIGHTED_AVG:
  ------------------
  |  Branch (1807:9): [True: 2.05k, False: 17.3k]
  ------------------
 1808|  2.05k|            jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
 1809|  2.05k|            dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1],
 1810|  2.05k|                          bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX);
 1811|  2.05k|            break;
 1812|  1.28k|        case COMP_INTER_SEG:
  ------------------
  |  Branch (1812:9): [True: 1.28k, False: 18.0k]
  ------------------
 1813|  1.28k|            dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0],
 1814|  1.28k|                                           tmp[b->mask_sign], tmp[!b->mask_sign],
 1815|  1.28k|                                           bw4 * 4, bh4 * 4, seg_mask,
 1816|  1.28k|                                           b->mask_sign HIGHBD_CALL_SUFFIX);
 1817|  1.28k|            mask = seg_mask;
 1818|  1.28k|            break;
 1819|    403|        case COMP_INTER_WEDGE:
  ------------------
  |  Branch (1819:9): [True: 403, False: 18.9k]
  ------------------
 1820|    403|            mask = WEDGE_MASK(0, bs, 0, b->wedge_idx);
  ------------------
  |  |   89|    403|    ((const uint8_t*)((uintptr_t)&dav1d_masks + \
  |  |   90|    403|    (size_t)dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[sign][idx] * 8))
  ------------------
 1821|    403|            dsp->mc.mask(dst, f->cur.stride[0],
 1822|    403|                         tmp[b->mask_sign], tmp[!b->mask_sign],
 1823|    403|                         bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX);
 1824|    403|            if (has_chroma)
  ------------------
  |  Branch (1824:17): [True: 391, False: 12]
  ------------------
 1825|    391|                mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx);
  ------------------
  |  |   89|    391|    ((const uint8_t*)((uintptr_t)&dav1d_masks + \
  |  |   90|    391|    (size_t)dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[sign][idx] * 8))
  ------------------
 1826|    403|            break;
 1827|  19.3k|        }
 1828|       |
 1829|       |        // chroma
 1830|  34.7k|        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1830:13): [True: 11.5k, False: 130]
  |  Branch (1830:42): [True: 23.1k, False: 11.5k]
  ------------------
 1831|  69.3k|            for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (1831:29): [True: 46.2k, False: 23.1k]
  ------------------
 1832|  46.2k|                const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
 1833|  46.2k|                if (b->inter_mode == GLOBALMV_GLOBALMV &&
  ------------------
  |  Branch (1833:21): [True: 4.82k, False: 41.4k]
  ------------------
 1834|  4.82k|                    imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
  ------------------
  |  Branch (1834:21): [True: 2.02k, False: 2.80k]
  |  Branch (1834:45): [True: 354, False: 1.66k]
  ------------------
 1835|    354|                {
 1836|    354|                    res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor,
 1837|    354|                                      b_dim, 1 + pl,
 1838|    354|                                      refp, &f->frame_hdr->gmv[b->ref[i]]);
 1839|    354|                    if (res) return res;
  ------------------
  |  Branch (1839:25): [True: 0, False: 354]
  ------------------
 1840|  45.8k|                } else {
 1841|  45.8k|                    res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
 1842|  45.8k|                             1 + pl, b->mv[i], refp, b->ref[i], filter_2d);
 1843|  45.8k|                    if (res) return res;
  ------------------
  |  Branch (1843:25): [True: 0, False: 45.8k]
  ------------------
 1844|  45.8k|                }
 1845|  46.2k|            }
 1846|  23.1k|            pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
 1847|  23.1k|            switch (b->comp_type) {
  ------------------
  |  Branch (1847:21): [True: 23.1k, False: 18.4E]
  ------------------
 1848|  15.7k|            case COMP_INTER_AVG:
  ------------------
  |  Branch (1848:13): [True: 15.7k, False: 7.32k]
  ------------------
 1849|  15.7k|                dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
 1850|  15.7k|                            bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver
 1851|  15.7k|                            HIGHBD_CALL_SUFFIX);
 1852|  15.7k|                break;
 1853|  4.09k|            case COMP_INTER_WEIGHTED_AVG:
  ------------------
  |  Branch (1853:13): [True: 4.09k, False: 19.0k]
  ------------------
 1854|  4.09k|                dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
 1855|  4.09k|                              bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight
 1856|  4.09k|                              HIGHBD_CALL_SUFFIX);
 1857|  4.09k|                break;
 1858|    782|            case COMP_INTER_WEDGE:
  ------------------
  |  Branch (1858:13): [True: 782, False: 22.3k]
  ------------------
 1859|  3.26k|            case COMP_INTER_SEG:
  ------------------
  |  Branch (1859:13): [True: 2.48k, False: 20.6k]
  ------------------
 1860|  3.26k|                dsp->mc.mask(uvdst, f->cur.stride[1],
 1861|  3.26k|                             tmp[b->mask_sign], tmp[!b->mask_sign],
 1862|  3.26k|                             bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask
 1863|  3.26k|                             HIGHBD_CALL_SUFFIX);
 1864|  3.26k|                break;
 1865|  23.1k|            }
 1866|  23.1k|        }
 1867|  11.6k|    }
 1868|       |
 1869|   601k|    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   34|   601k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 601k]
  |  |  ------------------
  |  |   35|   601k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   601k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                  if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1870|      0|        hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
 1871|      0|        if (has_chroma) {
  ------------------
  |  Branch (1871:13): [True: 0, False: 0]
  ------------------
 1872|      0|            hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1],
 1873|      0|                     cbw4 * 4, cbh4 * 4, "u-pred");
 1874|      0|            hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1],
 1875|      0|                     cbw4 * 4, cbh4 * 4, "v-pred");
 1876|      0|        }
 1877|      0|    }
 1878|       |
 1879|   601k|    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
 1880|       |
 1881|   601k|    if (b->skip) {
  ------------------
  |  Branch (1881:9): [True: 549k, False: 52.2k]
  ------------------
 1882|       |        // reset coef contexts
 1883|   549k|        BlockContext *const a = t->a;
 1884|   549k|        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
 1885|   549k|        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
 1886|   549k|        if (has_chroma) {
  ------------------
  |  Branch (1886:13): [True: 528k, False: 20.8k]
  ------------------
 1887|   528k|            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
 1888|   528k|            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
 1889|   528k|            memset_cw(&a->ccoef[0][cbx4], 0x40);
 1890|   528k|            memset_cw(&a->ccoef[1][cbx4], 0x40);
 1891|   528k|            memset_ch(&t->l.ccoef[0][cby4], 0x40);
 1892|   528k|            memset_ch(&t->l.ccoef[1][cby4], 0x40);
 1893|   528k|        }
 1894|   549k|        return 0;
 1895|   549k|    }
 1896|       |
 1897|  52.2k|    const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
 1898|  52.2k|    const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
 1899|  52.2k|    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
 1900|       |
 1901|   112k|    for (int init_y = 0; init_y < bh4; init_y += 16) {
  ------------------
  |  Branch (1901:26): [True: 59.8k, False: 52.2k]
  ------------------
 1902|   120k|        for (int init_x = 0; init_x < bw4; init_x += 16) {
  ------------------
  |  Branch (1902:30): [True: 60.3k, False: 59.8k]
  ------------------
 1903|       |            // coefficient coding & inverse transforms
 1904|  60.3k|            int y_off = !!init_y, y;
 1905|  60.3k|            dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y;
  ------------------
  |  |   53|  60.3k|#define PXSTRIDE(x) (x)
  ------------------
 1906|   131k|            for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
  ------------------
  |  Branch (1906:47): [True: 71.1k, False: 60.3k]
  ------------------
 1907|  71.1k|                 y += ytx->h, y_off++)
 1908|  71.1k|            {
 1909|  71.1k|                int x, x_off = !!init_x;
 1910|   208k|                for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
  ------------------
  |  Branch (1910:51): [True: 137k, False: 71.1k]
  ------------------
 1911|   137k|                     x += ytx->w, x_off++)
 1912|   137k|                {
 1913|   137k|                    read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
 1914|   137k|                                   x_off, y_off, &dst[x * 4]);
 1915|   137k|                    t->bx += ytx->w;
 1916|   137k|                }
 1917|  71.1k|                dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h;
  ------------------
  |  |   53|  71.1k|#define PXSTRIDE(x) (x)
  ------------------
 1918|  71.1k|                t->bx -= x;
 1919|  71.1k|                t->by += ytx->h;
 1920|  71.1k|            }
 1921|  60.3k|            dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y;
  ------------------
  |  |   53|  60.3k|#define PXSTRIDE(x) (x)
  ------------------
 1922|  60.3k|            t->by -= y;
 1923|       |
 1924|       |            // chroma coefs and inverse transform
 1925|   145k|            if (has_chroma) for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1925:17): [True: 48.3k, False: 11.9k]
  |  Branch (1925:46): [True: 96.7k, False: 48.3k]
  ------------------
 1926|  96.7k|                pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff +
 1927|  96.7k|                    (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver);
  ------------------
  |  |   53|  96.7k|#define PXSTRIDE(x) (x)
  ------------------
 1928|  96.7k|                for (y = init_y >> ss_ver, t->by += init_y;
 1929|   206k|                     y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
  ------------------
  |  Branch (1929:22): [True: 109k, False: 96.7k]
  ------------------
 1930|   109k|                {
 1931|   109k|                    int x;
 1932|   109k|                    for (x = init_x >> ss_hor, t->bx += init_x;
 1933|   281k|                         x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
  ------------------
  |  Branch (1933:26): [True: 172k, False: 109k]
  ------------------
 1934|   172k|                    {
 1935|   172k|                        coef *cf;
 1936|   172k|                        int eob;
 1937|   172k|                        enum TxfmType txtp;
 1938|   172k|                        if (t->frame_thread.pass) {
  ------------------
  |  Branch (1938:29): [True: 0, False: 172k]
  ------------------
 1939|      0|                            const int p = t->frame_thread.pass & 1;
 1940|      0|                            const int cbi = *ts->frame_thread[p].cbi++;
 1941|      0|                            cf = ts->frame_thread[p].cf;
 1942|      0|                            ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
 1943|      0|                            eob  = cbi >> 5;
 1944|      0|                            txtp = cbi & 0x1f;
 1945|   172k|                        } else {
 1946|   172k|                            uint8_t cf_ctx;
 1947|   172k|                            cf = bitfn(t->cf);
  ------------------
  |  |   51|   172k|#define bitfn(x) x##_8bpc
  ------------------
 1948|   172k|                            txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
 1949|   172k|                                                        bx4 + (x << ss_hor)];
 1950|   172k|                            eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
 1951|   172k|                                               &t->l.ccoef[pl][cby4 + y],
 1952|   172k|                                               b->uvtx, bs, b, 0, 1 + pl,
 1953|   172k|                                               cf, &txtp, &cf_ctx);
 1954|   172k|                            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|   172k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 172k]
  |  |  ------------------
  |  |   35|   172k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   172k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1955|      0|                                printf("Post-uv-cf-blk[pl=%d,tx=%d,"
 1956|      0|                                       "txtp=%d,eob=%d]: r=%d\n",
 1957|      0|                                       pl, b->uvtx, txtp, eob, ts->msac.rng);
 1958|   172k|                            int ctw = imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor);
 1959|   172k|                            int cth = imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver);
 1960|   172k|                            dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
 1961|   172k|                            dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
 1962|   172k|                        }
 1963|   172k|                        if (eob >= 0) {
  ------------------
  |  Branch (1963:29): [True: 68.3k, False: 103k]
  ------------------
 1964|  68.3k|                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   34|  68.3k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 68.3k]
  |  |  ------------------
  |  |   35|  68.3k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  68.3k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                          if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1965|      0|                                coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
 1966|  68.3k|                            dsp->itx.itxfm_add[b->uvtx]
 1967|  68.3k|                                              [txtp](&uvdst[4 * x],
 1968|  68.3k|                                                     f->cur.stride[1],
 1969|  68.3k|                                                     cf, eob HIGHBD_CALL_SUFFIX);
 1970|  68.3k|                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   34|  68.3k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 68.3k]
  |  |  ------------------
  |  |   35|  68.3k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  68.3k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                          if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1971|      0|                                hex_dump(&uvdst[4 * x], f->cur.stride[1],
 1972|      0|                                         uvtx->w * 4, uvtx->h * 4, "recon");
 1973|  68.3k|                        }
 1974|   172k|                        t->bx += uvtx->w << ss_hor;
 1975|   172k|                    }
 1976|   109k|                    uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h;
  ------------------
  |  |   53|   109k|#define PXSTRIDE(x) (x)
  ------------------
 1977|   109k|                    t->bx -= x << ss_hor;
 1978|   109k|                    t->by += uvtx->h << ss_ver;
 1979|   109k|                }
 1980|  96.7k|                t->by -= y << ss_ver;
 1981|  96.7k|            }
 1982|  60.3k|        }
 1983|  59.8k|    }
 1984|  52.2k|    return 0;
 1985|   601k|}
dav1d_filter_sbrow_deblock_cols_8bpc:
 1987|  17.4k|void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
 1988|  17.4k|    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) ||
  ------------------
  |  Branch (1988:9): [True: 0, False: 17.4k]
  ------------------
 1989|  17.4k|        (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1]))
  ------------------
  |  Branch (1989:10): [True: 1.75k, False: 15.7k]
  |  Branch (1989:50): [True: 1.01k, False: 739]
  ------------------
 1990|  1.01k|    {
 1991|  1.01k|        return;
 1992|  1.01k|    }
 1993|  16.4k|    const int y = sby * f->sb_step * 4;
 1994|  16.4k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 1995|  16.4k|    pixel *const p[3] = {
 1996|  16.4k|        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
  ------------------
  |  |   53|  16.4k|#define PXSTRIDE(x) (x)
  ------------------
 1997|  16.4k|        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
  ------------------
  |  |   53|  16.4k|#define PXSTRIDE(x) (x)
  ------------------
 1998|  16.4k|        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
  ------------------
  |  |   53|  16.4k|#define PXSTRIDE(x) (x)
  ------------------
 1999|  16.4k|    };
 2000|  16.4k|    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
 2001|  16.4k|    bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
  ------------------
  |  |   87|  16.4k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  16.4k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2002|  16.4k|                                        f->lf.start_of_tile_row[sby]);
 2003|  16.4k|}
dav1d_filter_sbrow_deblock_rows_8bpc:
 2005|  35.0k|void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
 2006|  35.0k|    const int y = sby * f->sb_step * 4;
 2007|  35.0k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 2008|  35.0k|    pixel *const p[3] = {
 2009|  35.0k|        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
  ------------------
  |  |   53|  35.0k|#define PXSTRIDE(x) (x)
  ------------------
 2010|  35.0k|        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
  ------------------
  |  |   53|  35.0k|#define PXSTRIDE(x) (x)
  ------------------
 2011|  35.0k|        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
  ------------------
  |  |   53|  35.0k|#define PXSTRIDE(x) (x)
  ------------------
 2012|  35.0k|    };
 2013|  35.0k|    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
 2014|  35.0k|    if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK &&
  ------------------
  |  Branch (2014:9): [True: 35.0k, False: 0]
  ------------------
 2015|  35.0k|        (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
  ------------------
  |  Branch (2015:10): [True: 15.6k, False: 19.4k]
  |  Branch (2015:49): [True: 739, False: 18.6k]
  ------------------
 2016|  16.4k|    {
 2017|  16.4k|        bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
  ------------------
  |  |   87|  16.4k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  16.4k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2018|  16.4k|    }
 2019|  35.0k|    if (f->seq_hdr->cdef || f->lf.restore_planes) {
  ------------------
  |  Branch (2019:9): [True: 28.0k, False: 7.09k]
  |  Branch (2019:29): [True: 3.48k, False: 3.61k]
  ------------------
 2020|       |        // Store loop filtered pixels required by CDEF / LR
 2021|  31.4k|        bytefn(dav1d_copy_lpf)(f, p, sby);
  ------------------
  |  |   87|  31.4k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  31.4k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2022|  31.4k|    }
 2023|  35.0k|}
dav1d_filter_sbrow_cdef_8bpc:
 2025|  27.9k|void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
 2026|  27.9k|    const Dav1dFrameContext *const f = tc->f;
 2027|  27.9k|    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return;
  ------------------
  |  Branch (2027:9): [True: 0, False: 27.9k]
  ------------------
 2028|  27.9k|    const int sbsz = f->sb_step;
 2029|  27.9k|    const int y = sby * sbsz * 4;
 2030|  27.9k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 2031|  27.9k|    pixel *const p[3] = {
 2032|  27.9k|        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
  ------------------
  |  |   53|  27.9k|#define PXSTRIDE(x) (x)
  ------------------
 2033|  27.9k|        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
  ------------------
  |  |   53|  27.9k|#define PXSTRIDE(x) (x)
  ------------------
 2034|  27.9k|        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
  ------------------
  |  |   53|  27.9k|#define PXSTRIDE(x) (x)
  ------------------
 2035|  27.9k|    };
 2036|  27.9k|    Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
 2037|  27.9k|    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
 2038|  27.9k|    const int start = sby * sbsz;
 2039|  27.9k|    if (sby) {
  ------------------
  |  Branch (2039:9): [True: 21.6k, False: 6.28k]
  ------------------
 2040|  21.6k|        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 2041|  21.6k|        pixel *p_up[3] = {
 2042|  21.6k|            p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
  ------------------
  |  |   53|  21.6k|#define PXSTRIDE(x) (x)
  ------------------
 2043|  21.6k|            p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
  ------------------
  |  |   53|  21.6k|#define PXSTRIDE(x) (x)
  ------------------
 2044|  21.6k|            p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
  ------------------
  |  |   53|  21.6k|#define PXSTRIDE(x) (x)
  ------------------
 2045|  21.6k|        };
 2046|  21.6k|        bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby);
  ------------------
  |  |   87|  21.6k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  21.6k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2047|  21.6k|    }
 2048|  27.9k|    const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
 2049|  27.9k|    const int end = imin(start + n_blks, f->bh);
 2050|  27.9k|    bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby);
  ------------------
  |  |   87|  27.9k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  27.9k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2051|  27.9k|}
dav1d_filter_sbrow_resize_8bpc:
 2053|    964|void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
 2054|    964|    const int sbsz = f->sb_step;
 2055|    964|    const int y = sby * sbsz * 4;
 2056|    964|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 2057|    964|    const pixel *const p[3] = {
 2058|    964|        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
  ------------------
  |  |   53|    964|#define PXSTRIDE(x) (x)
  ------------------
 2059|    964|        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
  ------------------
  |  |   53|    964|#define PXSTRIDE(x) (x)
  ------------------
 2060|    964|        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
  ------------------
  |  |   53|    964|#define PXSTRIDE(x) (x)
  ------------------
 2061|    964|    };
 2062|    964|    pixel *const sr_p[3] = {
 2063|    964|        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
  ------------------
  |  |   53|    964|#define PXSTRIDE(x) (x)
  ------------------
 2064|    964|        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
  ------------------
  |  |   53|    964|#define PXSTRIDE(x) (x)
  ------------------
 2065|    964|        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
  ------------------
  |  |   53|    964|#define PXSTRIDE(x) (x)
  ------------------
 2066|    964|    };
 2067|    964|    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
 2068|  3.66k|    for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
  ------------------
  |  Branch (2068:22): [True: 2.69k, False: 964]
  ------------------
 2069|  2.69k|        const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
  ------------------
  |  Branch (2069:28): [True: 1.73k, False: 964]
  |  Branch (2069:34): [True: 176, False: 1.55k]
  ------------------
 2070|  2.69k|        const int h_start = 8 * !!sby >> ss_ver;
 2071|  2.69k|        const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
 2072|  2.69k|        pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
  ------------------
  |  |   53|  2.69k|#define PXSTRIDE(x) (x)
  ------------------
 2073|  2.69k|        const ptrdiff_t src_stride = f->cur.stride[!!pl];
 2074|  2.69k|        const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);
  ------------------
  |  |   53|  2.69k|#define PXSTRIDE(x) (x)
  ------------------
 2075|  2.69k|        const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver;
 2076|  2.69k|        const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
  ------------------
  |  Branch (2076:28): [True: 1.73k, False: 964]
  |  Branch (2076:34): [True: 180, False: 1.55k]
  ------------------
 2077|  2.69k|        const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
 2078|  2.69k|        const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
 2079|  2.69k|        const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
 2080|       |
 2081|  2.69k|        f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
 2082|  2.69k|                          imin(img_h, h_end) + h_start, src_w,
 2083|  2.69k|                          f->resize_step[!!pl], f->resize_start[!!pl]
 2084|  2.69k|                          HIGHBD_CALL_SUFFIX);
 2085|  2.69k|    }
 2086|    964|}
dav1d_filter_sbrow_lr_8bpc:
 2088|  7.80k|void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
 2089|  7.80k|    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return;
  ------------------
  |  Branch (2089:9): [True: 0, False: 7.80k]
  ------------------
 2090|  7.80k|    const int y = sby * f->sb_step * 4;
 2091|  7.80k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 2092|  7.80k|    pixel *const sr_p[3] = {
 2093|  7.80k|        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
  ------------------
  |  |   53|  7.80k|#define PXSTRIDE(x) (x)
  ------------------
 2094|  7.80k|        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
  ------------------
  |  |   53|  7.80k|#define PXSTRIDE(x) (x)
  ------------------
 2095|  7.80k|        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
  ------------------
  |  |   53|  7.80k|#define PXSTRIDE(x) (x)
  ------------------
 2096|  7.80k|    };
 2097|  7.80k|    bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
  ------------------
  |  |   87|  7.80k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  7.80k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2098|  7.80k|}
dav1d_filter_sbrow_8bpc:
 2100|  2.28k|void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
 2101|  2.28k|    bytefn(dav1d_filter_sbrow_deblock_cols)(f, sby);
  ------------------
  |  |   87|  2.28k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  2.28k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2102|  2.28k|    bytefn(dav1d_filter_sbrow_deblock_rows)(f, sby);
  ------------------
  |  |   87|  2.28k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  2.28k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2103|  2.28k|    if (f->seq_hdr->cdef)
  ------------------
  |  Branch (2103:9): [True: 1.72k, False: 557]
  ------------------
 2104|  1.72k|        bytefn(dav1d_filter_sbrow_cdef)(f->c->tc, sby);
  ------------------
  |  |   87|  1.72k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  1.72k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2105|  2.28k|    if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
  ------------------
  |  Branch (2105:9): [True: 101, False: 2.18k]
  ------------------
 2106|    101|        bytefn(dav1d_filter_sbrow_resize)(f, sby);
  ------------------
  |  |   87|    101|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|    101|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2107|  2.28k|    if (f->lf.restore_planes)
  ------------------
  |  Branch (2107:9): [True: 410, False: 1.87k]
  ------------------
 2108|    410|        bytefn(dav1d_filter_sbrow_lr)(f, sby);
  ------------------
  |  |   87|    410|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|    410|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2109|  2.28k|}
dav1d_backup_ipred_edge_8bpc:
 2111|  47.5k|void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
 2112|  47.5k|    const Dav1dFrameContext *const f = t->f;
 2113|  47.5k|    Dav1dTileState *const ts = t->ts;
 2114|  47.5k|    const int sby = t->by >> f->sb_shift;
 2115|  47.5k|    const int sby_off = f->sb128w * 128 * sby;
 2116|  47.5k|    const int x_off = ts->tiling.col_start;
 2117|       |
 2118|  47.5k|    const pixel *const y =
 2119|  47.5k|        ((const pixel *) f->cur.data[0]) + x_off * 4 +
 2120|  47.5k|                    ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]);
  ------------------
  |  |   53|  47.5k|#define PXSTRIDE(x) (x)
  ------------------
 2121|  47.5k|    pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
  ------------------
  |  |   47|  47.5k|#define pixel_copy memcpy
  ------------------
 2122|  47.5k|               4 * (ts->tiling.col_end - x_off));
 2123|       |
 2124|  47.5k|    if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
  ------------------
  |  Branch (2124:9): [True: 35.3k, False: 12.2k]
  ------------------
 2125|  35.3k|        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 2126|  35.3k|        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
 2127|       |
 2128|  35.3k|        const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
 2129|  35.3k|            (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]);
  ------------------
  |  |   53|  35.3k|#define PXSTRIDE(x) (x)
  ------------------
 2130|   105k|        for (int pl = 1; pl <= 2; pl++)
  ------------------
  |  Branch (2130:26): [True: 70.6k, False: 35.3k]
  ------------------
 2131|  70.6k|            pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
  ------------------
  |  |   47|  70.6k|#define pixel_copy memcpy
  ------------------
 2132|  70.6k|                       &((const pixel *) f->cur.data[pl])[uv_off],
 2133|  70.6k|                       4 * (ts->tiling.col_end - x_off) >> ss_hor);
 2134|  35.3k|    }
 2135|  47.5k|}
dav1d_copy_pal_block_y_8bpc:
 2141|  10.9k|{
 2142|  10.9k|    const Dav1dFrameContext *const f = t->f;
 2143|  10.9k|    pixel *const pal = t->frame_thread.pass ?
  ------------------
  |  Branch (2143:24): [True: 0, False: 10.9k]
  ------------------
 2144|      0|        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
 2145|      0|                            ((t->bx >> 1) + (t->by & 1))][0] :
 2146|  10.9k|        bytefn(t->scratch.pal)[0];
  ------------------
  |  |   87|  10.9k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  10.9k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2147|  54.3k|    for (int x = 0; x < bw4; x++)
  ------------------
  |  Branch (2147:21): [True: 43.3k, False: 10.9k]
  ------------------
 2148|  43.3k|        memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel));
  ------------------
  |  |   87|  43.3k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  43.3k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2149|  44.4k|    for (int y = 0; y < bh4; y++)
  ------------------
  |  Branch (2149:21): [True: 33.4k, False: 10.9k]
  ------------------
 2150|  33.4k|        memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel));
  ------------------
  |  |   87|  33.4k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  33.4k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2151|  10.9k|}
dav1d_copy_pal_block_uv_8bpc:
 2157|  3.84k|{
 2158|  3.84k|    const Dav1dFrameContext *const f = t->f;
 2159|  3.84k|    const pixel (*const pal)[8] = t->frame_thread.pass ?
  ------------------
  |  Branch (2159:35): [True: 0, False: 3.84k]
  ------------------
 2160|      0|        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
 2161|      0|                            ((t->bx >> 1) + (t->by & 1))] :
 2162|  3.84k|        bytefn(t->scratch.pal);
  ------------------
  |  |   87|  3.84k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  3.84k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2163|       |    // see aomedia bug 2183 for why we use luma coordinates here
 2164|  11.5k|    for (int pl = 1; pl <= 2; pl++) {
  ------------------
  |  Branch (2164:22): [True: 7.68k, False: 3.84k]
  ------------------
 2165|  40.9k|        for (int x = 0; x < bw4; x++)
  ------------------
  |  Branch (2165:25): [True: 33.2k, False: 7.68k]
  ------------------
 2166|  33.2k|            memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel));
  ------------------
  |  |   87|  33.2k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  33.2k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2167|  33.0k|        for (int y = 0; y < bh4; y++)
  ------------------
  |  Branch (2167:25): [True: 25.3k, False: 7.68k]
  ------------------
 2168|  25.3k|            memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel));
  ------------------
  |  |   87|  25.3k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  25.3k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2169|  7.68k|    }
 2170|  3.84k|}
dav1d_read_pal_plane_8bpc:
 2175|  14.8k|{
 2176|  14.8k|    Dav1dTileState *const ts = t->ts;
 2177|  14.8k|    const Dav1dFrameContext *const f = t->f;
 2178|  14.8k|    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
  ------------------
  |  |   48|  14.8k|#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
  ------------------
 2179|  14.8k|                                           ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
 2180|  14.8k|    pixel cache[16], used_cache[8];
 2181|  14.8k|    int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
  ------------------
  |  Branch (2181:19): [True: 3.84k, False: 10.9k]
  ------------------
 2182|  14.8k|    int n_cache = 0;
 2183|       |    // don't reuse above palette outside SB64 boundaries
 2184|  14.8k|    int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
  ------------------
  |  Branch (2184:19): [True: 7.73k, False: 7.08k]
  |  Branch (2184:30): [True: 1.46k, False: 6.26k]
  ------------------
 2185|  14.8k|    const pixel *l = bytefn(t->al_pal)[1][by4][pl];
  ------------------
  |  |   87|  14.8k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  14.8k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2186|  14.8k|    const pixel *a = bytefn(t->al_pal)[0][bx4][pl];
  ------------------
  |  |   87|  14.8k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  14.8k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2187|       |
 2188|       |    // fill/sort cache
 2189|  24.5k|    while (l_cache && a_cache) {
  ------------------
  |  Branch (2189:12): [True: 14.4k, False: 10.0k]
  |  Branch (2189:23): [True: 9.70k, False: 4.73k]
  ------------------
 2190|  9.70k|        if (*l < *a) {
  ------------------
  |  Branch (2190:13): [True: 3.79k, False: 5.91k]
  ------------------
 2191|  3.79k|            if (!n_cache || cache[n_cache - 1] != *l)
  ------------------
  |  Branch (2191:17): [True: 726, False: 3.06k]
  |  Branch (2191:29): [True: 3.01k, False: 46]
  ------------------
 2192|  3.74k|                cache[n_cache++] = *l;
 2193|  3.79k|            l++;
 2194|  3.79k|            l_cache--;
 2195|  5.91k|        } else {
 2196|  5.91k|            if (*a == *l) {
  ------------------
  |  Branch (2196:17): [True: 2.41k, False: 3.50k]
  ------------------
 2197|  2.41k|                l++;
 2198|  2.41k|                l_cache--;
 2199|  2.41k|            }
 2200|  5.91k|            if (!n_cache || cache[n_cache - 1] != *a)
  ------------------
  |  Branch (2200:17): [True: 1.08k, False: 4.83k]
  |  Branch (2200:29): [True: 4.62k, False: 203]
  ------------------
 2201|  5.71k|                cache[n_cache++] = *a;
 2202|  5.91k|            a++;
 2203|  5.91k|            a_cache--;
 2204|  5.91k|        }
 2205|  9.70k|    }
 2206|  14.8k|    if (l_cache) {
  ------------------
  |  Branch (2206:9): [True: 4.73k, False: 10.0k]
  ------------------
 2207|  16.1k|        do {
 2208|  16.1k|            if (!n_cache || cache[n_cache - 1] != *l)
  ------------------
  |  Branch (2208:17): [True: 3.95k, False: 12.2k]
  |  Branch (2208:29): [True: 9.64k, False: 2.59k]
  ------------------
 2209|  13.6k|                cache[n_cache++] = *l;
 2210|  16.1k|            l++;
 2211|  16.1k|        } while (--l_cache > 0);
  ------------------
  |  Branch (2211:18): [True: 11.4k, False: 4.73k]
  ------------------
 2212|  10.0k|    } else if (a_cache) {
  ------------------
  |  Branch (2212:16): [True: 2.66k, False: 7.41k]
  ------------------
 2213|  10.4k|        do {
 2214|  10.4k|            if (!n_cache || cache[n_cache - 1] != *a)
  ------------------
  |  Branch (2214:17): [True: 1.89k, False: 8.58k]
  |  Branch (2214:29): [True: 6.46k, False: 2.12k]
  ------------------
 2215|  8.35k|                cache[n_cache++] = *a;
 2216|  10.4k|            a++;
 2217|  10.4k|        } while (--a_cache > 0);
  ------------------
  |  Branch (2217:18): [True: 7.81k, False: 2.66k]
  ------------------
 2218|  2.66k|    }
 2219|       |
 2220|       |    // find reused cache entries
 2221|  14.8k|    int i = 0;
 2222|  42.9k|    for (int n = 0; n < n_cache && i < pal_sz; n++)
  ------------------
  |  Branch (2222:21): [True: 29.3k, False: 13.6k]
  |  Branch (2222:36): [True: 28.0k, False: 1.21k]
  ------------------
 2223|  28.0k|        if (dav1d_msac_decode_bool_equi(&ts->msac))
  ------------------
  |  |   53|  28.0k|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
  |  Branch (2223:13): [True: 14.3k, False: 13.7k]
  ------------------
 2224|  14.3k|            used_cache[i++] = cache[n];
 2225|  14.8k|    const int n_used_cache = i;
 2226|       |
 2227|       |    // parse new entries
 2228|  14.8k|    pixel *const pal = t->frame_thread.pass ?
  ------------------
  |  Branch (2228:24): [True: 0, False: 14.8k]
  ------------------
 2229|      0|        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
 2230|      0|                            ((t->bx >> 1) + (t->by & 1))][pl] :
 2231|  14.8k|        bytefn(t->scratch.pal)[pl];
  ------------------
  |  |   87|  14.8k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  14.8k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2232|  14.8k|    if (i < pal_sz) {
  ------------------
  |  Branch (2232:9): [True: 12.8k, False: 1.94k]
  ------------------
 2233|  18.4E|        const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
  ------------------
  |  Branch (2233:25): [True: 12.8k, Folded]
  ------------------
 2234|  12.8k|        int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc);
 2235|       |
 2236|  12.8k|        if (i < pal_sz) {
  ------------------
  |  Branch (2236:13): [True: 10.5k, False: 2.29k]
  ------------------
 2237|  10.5k|            int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
 2238|  10.5k|            const int max = (1 << bpc) - 1;
 2239|       |
 2240|  21.7k|            do {
 2241|  21.7k|                const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
 2242|  21.7k|                prev = pal[i++] = imin(prev + delta + !pl, max);
 2243|  21.7k|                if (prev + !pl >= max) {
  ------------------
  |  Branch (2243:21): [True: 4.70k, False: 17.0k]
  ------------------
 2244|  12.7k|                    for (; i < pal_sz; i++)
  ------------------
  |  Branch (2244:28): [True: 8.00k, False: 4.70k]
  ------------------
 2245|  8.00k|                        pal[i] = max;
 2246|  4.70k|                    break;
 2247|  4.70k|                }
 2248|  17.0k|                bits = imin(bits, 1 + ulog2(max - prev - !pl));
 2249|  17.0k|            } while (i < pal_sz);
  ------------------
  |  Branch (2249:22): [True: 11.1k, False: 5.87k]
  ------------------
 2250|  10.5k|        }
 2251|       |
 2252|       |        // merge cache+new entries
 2253|  12.8k|        int n = 0, m = n_used_cache;
 2254|  64.7k|        for (i = 0; i < pal_sz; i++) {
  ------------------
  |  Branch (2254:21): [True: 51.8k, False: 12.8k]
  ------------------
 2255|  51.8k|            if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
  ------------------
  |  Branch (2255:17): [True: 15.5k, False: 36.2k]
  |  Branch (2255:38): [True: 3.58k, False: 12.0k]
  |  Branch (2255:53): [True: 5.67k, False: 6.34k]
  ------------------
 2256|  9.25k|                pal[i] = used_cache[n++];
 2257|  42.6k|            } else {
 2258|  42.6k|                assert(m < pal_sz);
  ------------------
  |  |  140|  42.6k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 42.6k]
  |  |  |  Branch (140:68): [Folded, False: 42.6k]
  |  |  ------------------
  ------------------
 2259|  42.6k|                pal[i] = pal[m++];
 2260|  42.6k|            }
 2261|  51.8k|        }
 2262|  12.8k|    } else {
 2263|  1.94k|        memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
 2264|  1.94k|    }
 2265|       |
 2266|  14.8k|    if (DEBUG_BLOCK_INFO) {
  ------------------
  |  |   34|  14.8k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 14.8k]
  |  |  ------------------
  |  |   35|  14.8k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  14.8k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 2267|      0|        printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
 2268|      0|               pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
 2269|      0|        for (int n = 0; n < n_cache; n++)
  ------------------
  |  Branch (2269:25): [True: 0, False: 0]
  ------------------
 2270|      0|            printf("%c%02x", n ? ' ' : '[', cache[n]);
  ------------------
  |  Branch (2270:30): [True: 0, False: 0]
  ------------------
 2271|      0|        printf("%s, pal=", n_cache ? "]" : "[]");
  ------------------
  |  Branch (2271:28): [True: 0, False: 0]
  ------------------
 2272|      0|        for (int n = 0; n < pal_sz; n++)
  ------------------
  |  Branch (2272:25): [True: 0, False: 0]
  ------------------
 2273|      0|            printf("%c%02x", n ? ' ' : '[', pal[n]);
  ------------------
  |  Branch (2273:30): [True: 0, False: 0]
  ------------------
 2274|      0|        printf("]\n");
 2275|      0|    }
 2276|  14.8k|}
dav1d_read_pal_uv_8bpc:
 2280|  3.84k|{
 2281|  3.84k|    bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4);
  ------------------
  |  |   87|  3.84k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  3.84k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2282|       |
 2283|       |    // V pal coding
 2284|  3.84k|    Dav1dTileState *const ts = t->ts;
 2285|  3.84k|    const Dav1dFrameContext *const f = t->f;
 2286|  3.84k|    pixel *const pal = t->frame_thread.pass ?
  ------------------
  |  Branch (2286:24): [True: 0, False: 3.84k]
  ------------------
 2287|      0|        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
 2288|      0|                            ((t->bx >> 1) + (t->by & 1))][2] :
 2289|  3.84k|        bytefn(t->scratch.pal)[2];
  ------------------
  |  |   87|  3.84k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   51|  3.84k|#define bitfn(x) x##_8bpc
  |  |  ------------------
  ------------------
 2290|  3.84k|    const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
  ------------------
  |  Branch (2290:21): [True: 3.84k, Folded]
  ------------------
 2291|  3.84k|    if (dav1d_msac_decode_bool_equi(&ts->msac)) {
  ------------------
  |  |   53|  3.84k|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
  |  Branch (2291:9): [True: 1.38k, False: 2.46k]
  ------------------
 2292|  1.38k|        const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2);
 2293|  1.38k|        int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc);
 2294|  1.38k|        const int max = (1 << bpc) - 1;
 2295|  5.95k|        for (int i = 1; i < b->pal_sz[1]; i++) {
  ------------------
  |  Branch (2295:25): [True: 4.57k, False: 1.38k]
  ------------------
 2296|  4.57k|            int delta = dav1d_msac_decode_bools(&ts->msac, bits);
 2297|  4.57k|            if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
  ------------------
  |  |   53|  4.38k|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
  |  Branch (2297:17): [True: 4.38k, False: 191]
  |  Branch (2297:26): [True: 2.37k, False: 2.00k]
  ------------------
 2298|  4.57k|            prev = pal[i] = (prev + delta) & max;
 2299|  4.57k|        }
 2300|  2.46k|    } else {
 2301|  9.61k|        for (int i = 0; i < b->pal_sz[1]; i++)
  ------------------
  |  Branch (2301:25): [True: 7.15k, False: 2.46k]
  ------------------
 2302|  7.15k|            pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc);
 2303|  2.46k|    }
 2304|  3.84k|    if (DEBUG_BLOCK_INFO) {
  ------------------
  |  |   34|  3.84k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 3.84k]
  |  |  ------------------
  |  |   35|  3.84k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  3.84k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 2305|      0|        printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
 2306|      0|        for (int n = 0; n < b->pal_sz[1]; n++)
  ------------------
  |  Branch (2306:25): [True: 0, False: 0]
  ------------------
 2307|      0|            printf("%c%02x", n ? ' ' : '[', pal[n]);
  ------------------
  |  Branch (2307:30): [True: 0, False: 0]
  ------------------
 2308|      0|        printf("]\n");
 2309|      0|    }
 2310|  3.84k|}
recon_tmpl.c:read_coef_tree:
  736|   288k|{
  737|   288k|    const Dav1dFrameContext *const f = t->f;
  738|   288k|    Dav1dTileState *const ts = t->ts;
  739|   288k|    const Dav1dDSPContext *const dsp = f->dsp;
  740|   288k|    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
  741|   288k|    const int txw = t_dim->w, txh = t_dim->h;
  742|       |
  743|       |    /* y_off can be larger than 3 since lossless blocks use TX_4X4 but can't
  744|       |     * be splitted. Aviods an undefined left shift. */
  745|   288k|    if (depth < 2 && tx_split[depth] &&
  ------------------
  |  Branch (745:9): [True: 271k, False: 16.8k]
  |  Branch (745:22): [True: 20.4k, False: 250k]
  ------------------
  746|  20.4k|        tx_split[depth] & (1 << (y_off * 4 + x_off)))
  ------------------
  |  Branch (746:9): [True: 16.1k, False: 4.31k]
  ------------------
  747|  16.1k|    {
  748|  16.1k|        const enum RectTxfmSize sub = t_dim->sub;
  749|  16.1k|        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
  750|  16.1k|        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
  751|       |
  752|  16.1k|        read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
  753|  16.1k|                       x_off * 2 + 0, y_off * 2 + 0, dst);
  754|  16.1k|        t->bx += txsw;
  755|  16.1k|        if (txw >= txh && t->bx < f->bw)
  ------------------
  |  Branch (755:13): [True: 12.8k, False: 3.22k]
  |  Branch (755:27): [True: 12.2k, False: 648]
  ------------------
  756|  12.2k|            read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
  757|  12.2k|                           y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
  ------------------
  |  Branch (757:43): [True: 12.2k, False: 0]
  ------------------
  758|  16.1k|        t->bx -= txsw;
  759|  16.1k|        t->by += txsh;
  760|  16.1k|        if (txh >= txw && t->by < f->bh) {
  ------------------
  |  Branch (760:13): [True: 11.1k, False: 4.99k]
  |  Branch (760:27): [True: 10.4k, False: 628]
  ------------------
  761|  10.4k|            if (dst)
  ------------------
  |  Branch (761:17): [True: 10.4k, False: 0]
  ------------------
  762|  10.4k|                dst += 4 * txsh * PXSTRIDE(f->cur.stride[0]);
  ------------------
  |  |   53|  10.4k|#define PXSTRIDE(x) (x)
  ------------------
  763|  10.4k|            read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
  764|  10.4k|                           x_off * 2 + 0, y_off * 2 + 1, dst);
  765|  10.4k|            t->bx += txsw;
  766|  10.4k|            if (txw >= txh && t->bx < f->bw)
  ------------------
  |  Branch (766:17): [True: 7.28k, False: 3.20k]
  |  Branch (766:31): [True: 6.68k, False: 604]
  ------------------
  767|  6.68k|                read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
  768|  6.68k|                               y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
  ------------------
  |  Branch (768:47): [True: 6.68k, False: 0]
  ------------------
  769|  10.4k|            t->bx -= txsw;
  770|  10.4k|        }
  771|  16.1k|        t->by -= txsh;
  772|   271k|    } else {
  773|   271k|        const int bx4 = t->bx & 31, by4 = t->by & 31;
  774|   271k|        enum TxfmType txtp;
  775|   271k|        uint8_t cf_ctx;
  776|   271k|        int eob;
  777|   271k|        coef *cf;
  778|       |
  779|   271k|        if (t->frame_thread.pass) {
  ------------------
  |  Branch (779:13): [True: 0, False: 271k]
  ------------------
  780|      0|            const int p = t->frame_thread.pass & 1;
  781|      0|            assert(ts->frame_thread[p].cf);
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
  782|      0|            cf = ts->frame_thread[p].cf;
  783|      0|            ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
  784|   271k|        } else {
  785|   271k|            cf = bitfn(t->cf);
  ------------------
  |  |   51|   271k|#define bitfn(x) x##_8bpc
  ------------------
  786|   271k|        }
  787|   271k|        if (t->frame_thread.pass != 2) {
  ------------------
  |  Branch (787:13): [True: 271k, False: 18.4E]
  ------------------
  788|   271k|            eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
  789|   271k|                               ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
  790|   271k|            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|   271k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 271k]
  |  |  ------------------
  |  |   35|   271k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   271k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  791|      0|                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
  792|      0|                       ytx, txtp, eob, ts->msac.rng);
  793|   271k|            dav1d_memset_likely_pow2(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
  794|   271k|            dav1d_memset_likely_pow2(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
  795|   271k|#define set_ctx(rep_macro) \
  796|   271k|            for (int y = 0; y < txh; y++) { \
  797|   271k|                rep_macro(txtp_map, 0, txtp); \
  798|   271k|                txtp_map += 32; \
  799|   271k|            }
  800|   271k|            uint8_t *txtp_map = &t->scratch.txtp_map[by4 * 32 + bx4];
  801|   272k|            case_set_upto16(t_dim->lw);
  ------------------
  |  |   80|   271k|    switch (var) { \
  |  |   81|   189k|    case 0: set_ctx(set_ctx1); break; \
  |  |  ------------------
  |  |  |  |  796|   389k|            for (int y = 0; y < txh; y++) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (796:29): [True: 200k, False: 189k]
  |  |  |  |  ------------------
  |  |  |  |  797|   200k|                rep_macro(txtp_map, 0, txtp); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   81|   200k|    case 0: set_ctx(set_ctx1); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   56|   200k|    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  798|   200k|                txtp_map += 32; \
  |  |  |  |  799|   200k|            }
  |  |  ------------------
  |  |  |  Branch (81:5): [True: 189k, False: 82.1k]
  |  |  ------------------
  |  |   82|  40.5k|    case 1: set_ctx(set_ctx2); break; \
  |  |  ------------------
  |  |  |  |  796|   131k|            for (int y = 0; y < txh; y++) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (796:29): [True: 90.5k, False: 40.5k]
  |  |  |  |  ------------------
  |  |  |  |  797|  90.5k|                rep_macro(txtp_map, 0, txtp); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   82|  90.5k|    case 1: set_ctx(set_ctx2); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   58|  90.5k|    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  798|  90.5k|                txtp_map += 32; \
  |  |  |  |  799|  90.5k|            }
  |  |  ------------------
  |  |  |  Branch (82:5): [True: 40.5k, False: 231k]
  |  |  ------------------
  |  |   83|  27.3k|    case 2: set_ctx(set_ctx4); break; \
  |  |  ------------------
  |  |  |  |  796|   116k|            for (int y = 0; y < txh; y++) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (796:29): [True: 89.1k, False: 27.3k]
  |  |  |  |  ------------------
  |  |  |  |  797|  89.1k|                rep_macro(txtp_map, 0, txtp); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   83|  89.1k|    case 2: set_ctx(set_ctx4); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   60|  89.1k|    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  798|  89.1k|                txtp_map += 32; \
  |  |  |  |  799|  89.1k|            }
  |  |  ------------------
  |  |  |  Branch (83:5): [True: 27.3k, False: 244k]
  |  |  ------------------
  |  |   84|  9.43k|    case 3: set_ctx(set_ctx8); break; \
  |  |  ------------------
  |  |  |  |  796|  68.3k|            for (int y = 0; y < txh; y++) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (796:29): [True: 58.9k, False: 9.43k]
  |  |  |  |  ------------------
  |  |  |  |  797|  58.9k|                rep_macro(txtp_map, 0, txtp); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  58.9k|    case 3: set_ctx(set_ctx8); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   62|  58.9k|    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  798|  58.9k|                txtp_map += 32; \
  |  |  |  |  799|  58.9k|            }
  |  |  ------------------
  |  |  |  Branch (84:5): [True: 9.43k, False: 262k]
  |  |  ------------------
  |  |   85|  4.96k|    case 4: set_ctx(set_ctx16); break; \
  |  |  ------------------
  |  |  |  |  796|  67.6k|            for (int y = 0; y < txh; y++) { \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (796:29): [True: 62.6k, False: 4.96k]
  |  |  |  |  ------------------
  |  |  |  |  797|  62.6k|                rep_macro(txtp_map, 0, txtp); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  62.6k|    case 4: set_ctx(set_ctx16); break; \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   63|  62.6k|#define set_ctx16(var, off, val) do { \
  |  |  |  |  |  |  |  |   64|  62.6k|        memset(&(var)[off], val, 16); \
  |  |  |  |  |  |  |  |   65|  62.6k|    } while (0)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  Branch (65:14): [Folded, False: 62.6k]
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  798|  62.6k|                txtp_map += 32; \
  |  |  |  |  799|  62.6k|            }
  |  |  ------------------
  |  |  |  Branch (85:5): [True: 4.96k, False: 266k]
  |  |  ------------------
  |  |   86|      0|    default: assert(0); \
  |  |  ------------------
  |  |  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:28): [True: 0, Folded]
  |  |  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (86:5): [True: 0, False: 271k]
  |  |  ------------------
  |  |   87|   271k|    }
  ------------------
  802|   272k|#undef set_ctx
  803|   272k|            if (t->frame_thread.pass == 1)
  ------------------
  |  Branch (803:17): [True: 0, False: 272k]
  ------------------
  804|      0|                *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
  805|  18.4E|        } else {
  806|  18.4E|            const int cbi = *ts->frame_thread[0].cbi++;
  807|  18.4E|            eob  = cbi >> 5;
  808|  18.4E|            txtp = cbi & 0x1f;
  809|  18.4E|        }
  810|   272k|        if (!(t->frame_thread.pass & 1)) {
  ------------------
  |  Branch (810:13): [True: 272k, False: 18.4E]
  ------------------
  811|   272k|            assert(dst);
  ------------------
  |  |  140|   272k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 272k]
  |  |  |  Branch (140:68): [Folded, False: 272k]
  |  |  ------------------
  ------------------
  812|   272k|            if (eob >= 0) {
  ------------------
  |  Branch (812:17): [True: 179k, False: 92.2k]
  ------------------
  813|   179k|                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   34|   179k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 179k]
  |  |  ------------------
  |  |   35|   179k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   179k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                              if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
  814|      0|                    coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
  815|   179k|                dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.stride[0], cf, eob
  816|   179k|                                              HIGHBD_CALL_SUFFIX);
  817|   179k|                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   34|   179k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 179k]
  |  |  ------------------
  |  |   35|   179k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   179k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                              if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
  818|      0|                    hex_dump(dst, f->cur.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
  819|   179k|            }
  820|   272k|        }
  821|   272k|    }
  822|   288k|}
recon_tmpl.c:decode_coefs:
  327|  4.49M|{
  328|  4.49M|    Dav1dTileState *const ts = t->ts;
  329|  4.49M|    const int chroma = !!plane;
  330|  4.49M|    const Dav1dFrameContext *const f = t->f;
  331|  4.49M|    const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
  332|  4.49M|    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
  333|  4.49M|    const int dbg = DEBUG_BLOCK_INFO && plane && 0;
  ------------------
  |  |   34|  4.49M|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 4.49M]
  |  |  ------------------
  |  |   35|  4.49M|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  4.49M|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  |  Branch (333:41): [True: 0, False: 0]
  |  Branch (333:50): [Folded, False: 0]
  ------------------
  334|       |
  335|  4.49M|    if (dbg)
  ------------------
  |  Branch (335:9): [Folded, False: 4.49M]
  ------------------
  336|      0|        printf("Start: r=%d\n", ts->msac.rng);
  337|       |
  338|       |    // does this block have any non-zero coefficients
  339|  4.49M|    const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
  340|  4.49M|    const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  4.49M|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  341|  4.49M|                             ts->cdf.coef.skip[t_dim->ctx][sctx]);
  342|  4.49M|    if (dbg)
  ------------------
  |  Branch (342:9): [Folded, False: 4.49M]
  ------------------
  343|      0|        printf("Post-non-zero[%d][%d][%d]: r=%d\n",
  344|      0|               t_dim->ctx, sctx, all_skip, ts->msac.rng);
  345|  4.49M|    if (all_skip) {
  ------------------
  |  Branch (345:9): [True: 2.20M, False: 2.29M]
  ------------------
  346|  2.20M|        *res_ctx = 0x40;
  347|  2.20M|        *txtp = lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */
  348|  2.20M|        return -1;
  349|  2.20M|    }
  350|       |
  351|       |    // transform type (chroma: derived, luma: explicitly coded)
  352|  2.29M|    if (lossless) {
  ------------------
  |  Branch (352:9): [True: 1.11M, False: 1.17M]
  ------------------
  353|  1.11M|        assert(t_dim->max == TX_4X4);
  ------------------
  |  |  140|  1.11M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1.11M]
  |  |  |  Branch (140:68): [Folded, False: 1.11M]
  |  |  ------------------
  ------------------
  354|  1.11M|        *txtp = WHT_WHT;
  355|  1.17M|    } else if (t_dim->max + intra >= TX_64X64) {
  ------------------
  |  Branch (355:16): [True: 253k, False: 921k]
  ------------------
  356|   253k|        *txtp = DCT_DCT;
  357|   921k|    } else if (chroma) {
  ------------------
  |  Branch (357:16): [True: 234k, False: 687k]
  ------------------
  358|       |        // inferred from either the luma txtp (inter) or a LUT (intra)
  359|   234k|        *txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
  ------------------
  |  Branch (359:17): [True: 203k, False: 30.2k]
  ------------------
  360|   234k|                        get_uv_inter_txtp(t_dim, *txtp);
  361|   687k|    } else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) {
  ------------------
  |  Branch (361:16): [True: 21.5k, False: 665k]
  ------------------
  362|       |        // In libaom, lossless is checked by a literal qidx == 0, but not all
  363|       |        // such blocks are actually lossless. The remainder gets an implicit
  364|       |        // transform type (for luma)
  365|  21.5k|        *txtp = DCT_DCT;
  366|   665k|    } else {
  367|   665k|        unsigned idx;
  368|   665k|        if (intra) {
  ------------------
  |  Branch (368:13): [True: 598k, False: 66.9k]
  ------------------
  369|   598k|            const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
  ------------------
  |  Branch (369:54): [True: 99.9k, False: 498k]
  ------------------
  370|   498k|                dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
  371|   598k|            if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
  ------------------
  |  Branch (371:17): [True: 151k, False: 447k]
  |  Branch (371:51): [True: 60.2k, False: 387k]
  ------------------
  372|   211k|                idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
  ------------------
  |  |   48|   211k|#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
  ------------------
  373|   211k|                          ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4);
  374|   211k|                *txtp = dav1d_tx_types_per_set[idx + 0];
  375|   387k|            } else {
  376|   387k|                idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
  ------------------
  |  |   48|   387k|#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
  ------------------
  377|   387k|                          ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6);
  378|   387k|                *txtp = dav1d_tx_types_per_set[idx + 5];
  379|   387k|            }
  380|   598k|            if (dbg)
  ------------------
  |  Branch (380:17): [Folded, False: 598k]
  ------------------
  381|      0|                printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n",
  382|      0|                       tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng);
  383|   598k|        } else {
  384|  66.9k|            if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) {
  ------------------
  |  Branch (384:17): [True: 2.49k, False: 64.4k]
  |  Branch (384:51): [True: 8.03k, False: 56.3k]
  ------------------
  385|  26.7k|                idx = dav1d_msac_decode_bool_adapt(&ts->msac,
  ------------------
  |  |   52|  26.7k|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  386|  26.7k|                          ts->cdf.m.txtp_inter3[t_dim->min]);
  387|  26.7k|                *txtp = (idx - 1) & IDTX; /* idx ? DCT_DCT : IDTX */
  388|  40.1k|            } else if (t_dim->min == TX_16X16) {
  ------------------
  |  Branch (388:24): [True: 7.10k, False: 33.0k]
  ------------------
  389|  7.10k|                idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
  ------------------
  |  |   57|  7.10k|#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
  ------------------
  390|  7.10k|                          ts->cdf.m.txtp_inter2, 11);
  391|  7.10k|                *txtp = dav1d_tx_types_per_set[idx + 12];
  392|  33.0k|            } else {
  393|  33.0k|                idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
  ------------------
  |  |   57|  33.0k|#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
  ------------------
  394|  33.0k|                          ts->cdf.m.txtp_inter1[t_dim->min], 15);
  395|  33.0k|                *txtp = dav1d_tx_types_per_set[idx + 24];
  396|  33.0k|            }
  397|  66.9k|            if (dbg)
  ------------------
  |  Branch (397:17): [Folded, False: 66.9k]
  ------------------
  398|      0|                printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n",
  399|      0|                       tx, t_dim->min, idx, *txtp, ts->msac.rng);
  400|  66.9k|        }
  401|   665k|    }
  402|       |
  403|       |    // find end-of-block (eob)
  404|  2.29M|    int eob;
  405|  2.29M|    const int slw = imin(t_dim->lw, TX_32X32), slh = imin(t_dim->lh, TX_32X32);
  406|  2.29M|    const int tx2dszctx = slw + slh;
  407|  2.29M|    const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
  408|  2.29M|    const int is_1d = tx_class != TX_CLASS_2D;
  409|  2.29M|    switch (tx2dszctx) {
  ------------------
  |  Branch (409:13): [True: 2.31M, False: 18.4E]
  ------------------
  410|      0|#define case_sz(sz, bin, ns, is_1d) \
  411|      0|    case sz: { \
  412|      0|        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
  413|      0|        eob = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
  414|      0|        break; \
  415|      0|    }
  416|  1.30M|    case_sz(0,   16,  8, [is_1d]);
  ------------------
  |  |  411|  1.30M|    case sz: { \
  |  |  ------------------
  |  |  |  Branch (411:5): [True: 1.30M, False: 991k]
  |  |  ------------------
  |  |  412|  1.30M|        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
  |  |  413|  1.30M|        eob = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
  |  |  ------------------
  |  |  |  |   48|  1.30M|#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
  |  |  ------------------
  |  |  414|  1.30M|        break; \
  |  |  415|  1.30M|    }
  ------------------
  417|   163k|    case_sz(1,   32,  8, [is_1d]);
  ------------------
  |  |  411|   163k|    case sz: { \
  |  |  ------------------
  |  |  |  Branch (411:5): [True: 163k, False: 2.13M]
  |  |  ------------------
  |  |  412|   163k|        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
  |  |  413|   163k|        eob = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
  |  |  ------------------
  |  |  |  |   48|   163k|#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
  |  |  ------------------
  |  |  414|   163k|        break; \
  |  |  415|   163k|    }
  ------------------
  418|   295k|    case_sz(2,   64,  8, [is_1d]);
  ------------------
  |  |  411|   295k|    case sz: { \
  |  |  ------------------
  |  |  |  Branch (411:5): [True: 295k, False: 1.99M]
  |  |  ------------------
  |  |  412|   295k|        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
  |  |  413|   295k|        eob = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
  |  |  ------------------
  |  |  |  |   48|   295k|#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
  |  |  ------------------
  |  |  414|   295k|        break; \
  |  |  415|   295k|    }
  ------------------
  419|   164k|    case_sz(3,  128,  8, [is_1d]);
  ------------------
  |  |  411|   164k|    case sz: { \
  |  |  ------------------
  |  |  |  Branch (411:5): [True: 164k, False: 2.13M]
  |  |  ------------------
  |  |  412|   164k|        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
  |  |  413|   164k|        eob = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
  |  |  ------------------
  |  |  |  |   48|   164k|#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
  |  |  ------------------
  |  |  414|   164k|        break; \
  |  |  415|   164k|    }
  ------------------
  420|   138k|    case_sz(4,  256, 16, [is_1d]);
  ------------------
  |  |  411|   138k|    case sz: { \
  |  |  ------------------
  |  |  |  Branch (411:5): [True: 138k, False: 2.15M]
  |  |  ------------------
  |  |  412|   138k|        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
  |  |  413|   138k|        eob = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
  |  |  ------------------
  |  |  |  |   57|   138k|#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
  |  |  ------------------
  |  |  414|   138k|        break; \
  |  |  415|   138k|    }
  ------------------
  421|   105k|    case_sz(5,  512, 16,        );
  ------------------
  |  |  411|   105k|    case sz: { \
  |  |  ------------------
  |  |  |  Branch (411:5): [True: 105k, False: 2.18M]
  |  |  ------------------
  |  |  412|   105k|        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
  |  |  413|   105k|        eob = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
  |  |  ------------------
  |  |  |  |   57|   105k|#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
  |  |  ------------------
  |  |  414|   105k|        break; \
  |  |  415|   105k|    }
  ------------------
  422|   140k|    case_sz(6, 1024, 16,        );
  ------------------
  |  |  411|   140k|    case sz: { \
  |  |  ------------------
  |  |  |  Branch (411:5): [True: 140k, False: 2.15M]
  |  |  ------------------
  |  |  412|   140k|        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
  |  |  413|   140k|        eob = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
  |  |  ------------------
  |  |  |  |   57|   140k|#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
  |  |  ------------------
  |  |  414|   140k|        break; \
  |  |  415|   140k|    }
  ------------------
  423|  2.29M|#undef case_sz
  424|  2.29M|    }
  425|  2.30M|    if (dbg)
  ------------------
  |  Branch (425:9): [Folded, False: 2.30M]
  ------------------
  426|      0|        printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
  427|      0|               16 << tx2dszctx, chroma, is_1d, eob, ts->msac.rng);
  428|  2.30M|    if (eob > 1) {
  ------------------
  |  Branch (428:9): [True: 1.79M, False: 514k]
  ------------------
  429|  1.79M|        const int eob_bin = eob - 2;
  430|  1.79M|        uint16_t *const eob_hi_bit_cdf =
  431|  1.79M|            ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
  432|  1.79M|        const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
  ------------------
  |  |   52|  1.79M|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  433|  1.79M|        if (dbg)
  ------------------
  |  Branch (433:13): [Folded, False: 1.79M]
  ------------------
  434|      0|            printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
  435|      0|                   t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
  436|  1.79M|        eob = ((eob_hi_bit | 2) << eob_bin) | dav1d_msac_decode_bools(&ts->msac, eob_bin);
  437|  1.79M|        if (dbg)
  ------------------
  |  Branch (437:13): [Folded, False: 1.79M]
  ------------------
  438|      0|            printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
  439|  1.79M|    }
  440|  2.30M|    assert(eob >= 0);
  ------------------
  |  |  140|  2.30M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 2.30M]
  |  |  |  Branch (140:68): [Folded, False: 2.30M]
  |  |  ------------------
  ------------------
  441|       |
  442|       |    // base tokens
  443|  2.30M|    uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
  444|  2.30M|    uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
  445|  2.30M|    unsigned rc, dc_tok;
  446|       |
  447|  2.30M|    if (eob) {
  ------------------
  |  Branch (447:9): [True: 1.88M, False: 421k]
  ------------------
  448|  1.88M|        uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
  449|  1.88M|        uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok
  450|       |
  451|       |        /* eob */
  452|  1.88M|        unsigned ctx = 1 + (eob > 2 << tx2dszctx) + (eob > 4 << tx2dszctx);
  453|  1.88M|        int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
  ------------------
  |  |   47|  1.88M|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  ------------------
  454|  1.88M|        int tok = eob_tok + 1;
  455|  1.88M|        int level_tok = tok * 0x41;
  456|  1.88M|        unsigned mag;
  457|       |
  458|  1.88M|#define DECODE_COEFS_CLASS(tx_class) \
  459|  1.88M|        unsigned x, y; \
  460|  1.88M|        uint8_t *level; \
  461|  1.88M|        if (tx_class == TX_CLASS_2D) \
  462|  1.88M|            rc = scan[eob], x = rc >> shift, y = rc & mask; \
  463|  1.88M|        else if (tx_class == TX_CLASS_H) \
  464|       |            /* Transposing reduces the stride and padding requirements */ \
  465|  1.88M|            x = eob & mask, y = eob >> shift, rc = eob; \
  466|  1.88M|        else /* tx_class == TX_CLASS_V */ \
  467|  1.88M|            x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
  468|  1.88M|        if (dbg) \
  469|  1.88M|            printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
  470|  1.88M|                   t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
  471|  1.88M|        if (eob_tok == 2) { \
  472|  1.88M|            ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \
  473|  1.88M|            tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
  474|  1.88M|            level_tok = tok + (3 << 6); \
  475|  1.88M|            if (dbg) \
  476|  1.88M|                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
  477|  1.88M|                       imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
  478|  1.88M|                       ts->msac.rng); \
  479|  1.88M|        } \
  480|  1.88M|        cf[rc] = tok << 11; \
  481|  1.88M|        if (tx_class == TX_CLASS_2D) \
  482|  1.88M|            level = levels + rc; \
  483|  1.88M|        else \
  484|  1.88M|            level = levels + x * stride + y; \
  485|  1.88M|        *level = (uint8_t) level_tok; \
  486|  1.88M|        for (int i = eob - 1; i > 0; i--) { /* ac */ \
  487|  1.88M|            unsigned rc_i; \
  488|  1.88M|            if (tx_class == TX_CLASS_2D) \
  489|  1.88M|                rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
  490|  1.88M|            else if (tx_class == TX_CLASS_H) \
  491|  1.88M|                x = i & mask, y = i >> shift, rc_i = i; \
  492|  1.88M|            else /* tx_class == TX_CLASS_V */ \
  493|  1.88M|                x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
  494|  1.88M|            assert(x < 32 && y < 32); \
  495|  1.88M|            if (tx_class == TX_CLASS_2D) \
  496|  1.88M|                level = levels + rc_i; \
  497|  1.88M|            else \
  498|  1.88M|                level = levels + x * stride + y; \
  499|  1.88M|            ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
  500|  1.88M|            if (tx_class == TX_CLASS_2D) \
  501|  1.88M|                y |= x; \
  502|  1.88M|            tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
  503|  1.88M|            if (dbg) \
  504|  1.88M|                printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
  505|  1.88M|                       t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
  506|  1.88M|            if (tok == 3) { \
  507|  1.88M|                mag &= 63; \
  508|  1.88M|                ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
  509|  1.88M|                      (mag > 12 ? 6 : (mag + 1) >> 1); \
  510|  1.88M|                tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
  511|  1.88M|                if (dbg) \
  512|  1.88M|                    printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
  513|  1.88M|                           imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \
  514|  1.88M|                           ts->msac.rng); \
  515|  1.88M|                *level = (uint8_t) (tok + (3 << 6)); \
  516|  1.88M|                cf[rc_i] = (tok << 11) | rc; \
  517|  1.88M|                rc = rc_i; \
  518|  1.88M|            } else { \
  519|       |                /* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
  520|  1.88M|                tok *= 0x17ff41; \
  521|  1.88M|                *level = (uint8_t) tok; \
  522|       |                /* tok ? (tok << 11) | rc : 0 */ \
  523|  1.88M|                tok = (tok >> 9) & (rc + ~0x7ffu); \
  524|  1.88M|                if (tok) rc = rc_i; \
  525|  1.88M|                cf[rc_i] = tok; \
  526|  1.88M|            } \
  527|  1.88M|        } \
  528|       |        /* dc */ \
  529|  1.88M|        ctx = (tx_class == TX_CLASS_2D) ? 0 : \
  530|  1.88M|            get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
  531|  1.88M|        dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
  532|  1.88M|        if (dbg) \
  533|  1.88M|            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
  534|  1.88M|                   t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
  535|  1.88M|        if (dc_tok == 3) { \
  536|  1.88M|            if (tx_class == TX_CLASS_2D) \
  537|  1.88M|                mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
  538|  1.88M|                      levels[1 * stride + 1]; \
  539|  1.88M|            mag &= 63; \
  540|  1.88M|            ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
  541|  1.88M|            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
  542|  1.88M|            if (dbg) \
  543|  1.88M|                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
  544|  1.88M|                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
  545|  1.88M|        } \
  546|  1.88M|        break
  547|       |
  548|  1.88M|        const uint16_t *scan;
  549|  1.88M|        switch (tx_class) {
  550|  1.81M|        case TX_CLASS_2D: {
  ------------------
  |  Branch (550:9): [True: 1.81M, False: 73.2k]
  ------------------
  551|  1.81M|            const unsigned nonsquare_tx = tx >= RTX_4X8;
  552|  1.81M|            const uint8_t (*const lo_ctx_offsets)[5] =
  553|  1.81M|                dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
  554|  1.81M|            scan = dav1d_scans[tx];
  555|  1.81M|            const ptrdiff_t stride = 4 << slh;
  556|  1.81M|            const unsigned shift = slh + 2, shift2 = 0;
  557|  1.81M|            const unsigned mask = (4 << slh) - 1;
  558|  1.81M|            memset(levels, 0, stride * ((4 << slw) + 2));
  559|  1.81M|            DECODE_COEFS_CLASS(TX_CLASS_2D);
  ------------------
  |  |  459|  1.81M|        unsigned x, y; \
  |  |  460|  1.81M|        uint8_t *level; \
  |  |  461|  1.81M|        if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (461:13): [True: 1.81M, Folded]
  |  |  ------------------
  |  |  462|  1.81M|            rc = scan[eob], x = rc >> shift, y = rc & mask; \
  |  |  463|  18.4E|        else if (tx_class == TX_CLASS_H) \
  |  |  ------------------
  |  |  |  Branch (463:18): [Folded, False: 18.4E]
  |  |  ------------------
  |  |  464|  18.4E|            /* Transposing reduces the stride and padding requirements */ \
  |  |  465|  18.4E|            x = eob & mask, y = eob >> shift, rc = eob; \
  |  |  466|  18.4E|        else /* tx_class == TX_CLASS_V */ \
  |  |  467|  18.4E|            x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
  |  |  468|  1.81M|        if (dbg) \
  |  |  ------------------
  |  |  |  Branch (468:13): [Folded, False: 1.81M]
  |  |  ------------------
  |  |  469|  1.81M|            printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
  |  |  470|      0|                   t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
  |  |  471|  1.81M|        if (eob_tok == 2) { \
  |  |  ------------------
  |  |  |  Branch (471:13): [True: 75.0k, False: 1.73M]
  |  |  ------------------
  |  |  472|  75.0k|            ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \
  |  |  ------------------
  |  |  |  Branch (472:19): [True: 59.7k, False: 15.3k]
  |  |  |  Branch (472:20): [True: 75.0k, Folded]
  |  |  ------------------
  |  |  473|  75.0k|            tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
  |  |  ------------------
  |  |  |  |   49|  75.0k|#define dav1d_msac_decode_hi_tok         dav1d_msac_decode_hi_tok_sse2
  |  |  ------------------
  |  |  474|  75.0k|            level_tok = tok + (3 << 6); \
  |  |  475|  75.0k|            if (dbg) \
  |  |  ------------------
  |  |  |  Branch (475:17): [Folded, False: 75.0k]
  |  |  ------------------
  |  |  476|  75.0k|                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
  |  |  477|      0|                       imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
  |  |  478|      0|                       ts->msac.rng); \
  |  |  479|  75.0k|        } \
  |  |  480|  1.81M|        cf[rc] = tok << 11; \
  |  |  481|  1.81M|        if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (481:13): [True: 1.81M, Folded]
  |  |  ------------------
  |  |  482|  1.81M|            level = levels + rc; \
  |  |  483|  1.81M|        else \
  |  |  484|  18.4E|            level = levels + x * stride + y; \
  |  |  485|  1.81M|        *level = (uint8_t) level_tok; \
  |  |  486|  37.8M|        for (int i = eob - 1; i > 0; i--) { /* ac */ \
  |  |  ------------------
  |  |  |  Branch (486:31): [True: 36.0M, False: 1.81M]
  |  |  ------------------
  |  |  487|  36.0M|            unsigned rc_i; \
  |  |  488|  36.0M|            if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (488:17): [True: 36.0M, Folded]
  |  |  ------------------
  |  |  489|  36.0M|                rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
  |  |  490|  18.4E|            else if (tx_class == TX_CLASS_H) \
  |  |  ------------------
  |  |  |  Branch (490:22): [Folded, False: 18.4E]
  |  |  ------------------
  |  |  491|  18.4E|                x = i & mask, y = i >> shift, rc_i = i; \
  |  |  492|  18.4E|            else /* tx_class == TX_CLASS_V */ \
  |  |  493|  18.4E|                x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
  |  |  494|  36.0M|            assert(x < 32 && y < 32); \
  |  |  ------------------
  |  |  |  |  140|  72.1M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:30): [True: 36.0M, False: 18.4E]
  |  |  |  |  |  Branch (140:30): [True: 36.0M, False: 18.4E]
  |  |  |  |  |  Branch (140:68): [Folded, False: 36.0M]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  495|  36.0M|            if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (495:17): [True: 36.1M, Folded]
  |  |  ------------------
  |  |  496|  36.1M|                level = levels + rc_i; \
  |  |  497|  36.0M|            else \
  |  |  498|  18.4E|                level = levels + x * stride + y; \
  |  |  499|  36.0M|            ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
  |  |  500|  36.0M|            if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (500:17): [True: 35.9M, Folded]
  |  |  ------------------
  |  |  501|  36.0M|                y |= x; \
  |  |  502|  36.0M|            tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
  |  |  ------------------
  |  |  |  |   47|  36.0M|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  |  |  ------------------
  |  |  503|  36.0M|            if (dbg) \
  |  |  ------------------
  |  |  |  Branch (503:17): [Folded, False: 36.0M]
  |  |  ------------------
  |  |  504|  36.0M|                printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
  |  |  505|      0|                       t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
  |  |  506|  36.0M|            if (tok == 3) { \
  |  |  ------------------
  |  |  |  Branch (506:17): [True: 3.98M, False: 32.0M]
  |  |  ------------------
  |  |  507|  3.98M|                mag &= 63; \
  |  |  508|  3.98M|                ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
  |  |  ------------------
  |  |  |  Branch (508:24): [True: 3.07M, False: 911k]
  |  |  ------------------
  |  |  509|  3.98M|                      (mag > 12 ? 6 : (mag + 1) >> 1); \
  |  |  ------------------
  |  |  |  Branch (509:24): [True: 915k, False: 3.07M]
  |  |  ------------------
  |  |  510|  3.98M|                tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
  |  |  ------------------
  |  |  |  |   49|  3.98M|#define dav1d_msac_decode_hi_tok         dav1d_msac_decode_hi_tok_sse2
  |  |  ------------------
  |  |  511|  3.98M|                if (dbg) \
  |  |  ------------------
  |  |  |  Branch (511:21): [Folded, False: 3.98M]
  |  |  ------------------
  |  |  512|  3.98M|                    printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
  |  |  513|      0|                           imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \
  |  |  514|      0|                           ts->msac.rng); \
  |  |  515|  3.98M|                *level = (uint8_t) (tok + (3 << 6)); \
  |  |  516|  3.98M|                cf[rc_i] = (tok << 11) | rc; \
  |  |  517|  3.98M|                rc = rc_i; \
  |  |  518|  32.0M|            } else { \
  |  |  519|  32.0M|                /* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
  |  |  520|  32.0M|                tok *= 0x17ff41; \
  |  |  521|  32.0M|                *level = (uint8_t) tok; \
  |  |  522|  32.0M|                /* tok ? (tok << 11) | rc : 0 */ \
  |  |  523|  32.0M|                tok = (tok >> 9) & (rc + ~0x7ffu); \
  |  |  524|  32.0M|                if (tok) rc = rc_i; \
  |  |  ------------------
  |  |  |  Branch (524:21): [True: 11.4M, False: 20.6M]
  |  |  ------------------
  |  |  525|  32.0M|                cf[rc_i] = tok; \
  |  |  526|  32.0M|            } \
  |  |  527|  36.0M|        } \
  |  |  528|  1.81M|        /* dc */ \
  |  |  529|  1.81M|        ctx = (tx_class == TX_CLASS_2D) ? 0 : \
  |  |  ------------------
  |  |  |  Branch (529:15): [True: 1.81M, Folded]
  |  |  ------------------
  |  |  530|  1.81M|            get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
  |  |  531|  1.81M|        dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
  |  |  ------------------
  |  |  |  |   47|  1.81M|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  |  |  ------------------
  |  |  532|  1.81M|        if (dbg) \
  |  |  ------------------
  |  |  |  Branch (532:13): [Folded, False: 1.81M]
  |  |  ------------------
  |  |  533|  1.81M|            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
  |  |  534|      0|                   t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
  |  |  535|  1.81M|        if (dc_tok == 3) { \
  |  |  ------------------
  |  |  |  Branch (535:13): [True: 752k, False: 1.06M]
  |  |  ------------------
  |  |  536|   752k|            if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (536:17): [True: 752k, Folded]
  |  |  ------------------
  |  |  537|   752k|                mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
  |  |  538|   752k|                      levels[1 * stride + 1]; \
  |  |  539|   752k|            mag &= 63; \
  |  |  540|   752k|            ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
  |  |  ------------------
  |  |  |  Branch (540:19): [True: 100k, False: 651k]
  |  |  ------------------
  |  |  541|   752k|            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
  |  |  ------------------
  |  |  |  |   49|   752k|#define dav1d_msac_decode_hi_tok         dav1d_msac_decode_hi_tok_sse2
  |  |  ------------------
  |  |  542|   752k|            if (dbg) \
  |  |  ------------------
  |  |  |  Branch (542:17): [Folded, False: 752k]
  |  |  ------------------
  |  |  543|   752k|                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
  |  |  544|      0|                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
  |  |  545|   752k|        } \
  |  |  546|  1.81M|        break
  ------------------
  560|  1.81M|        }
  561|  48.6k|        case TX_CLASS_H: {
  ------------------
  |  Branch (561:9): [True: 48.6k, False: 1.83M]
  ------------------
  562|  48.6k|            const uint8_t (*const lo_ctx_offsets)[5] = NULL;
  563|  48.6k|            const ptrdiff_t stride = 16;
  564|  48.6k|            const unsigned shift = slh + 2, shift2 = 0;
  565|  48.6k|            const unsigned mask = (4 << slh) - 1;
  566|  48.6k|            memset(levels, 0, stride * ((4 << slh) + 2));
  567|  48.6k|            DECODE_COEFS_CLASS(TX_CLASS_H);
  ------------------
  |  |  459|  48.6k|        unsigned x, y; \
  |  |  460|  48.6k|        uint8_t *level; \
  |  |  461|  48.6k|        if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (461:13): [Folded, False: 48.6k]
  |  |  ------------------
  |  |  462|  48.6k|            rc = scan[eob], x = rc >> shift, y = rc & mask; \
  |  |  463|  48.6k|        else if (tx_class == TX_CLASS_H) \
  |  |  ------------------
  |  |  |  Branch (463:18): [True: 48.6k, Folded]
  |  |  ------------------
  |  |  464|  48.6k|            /* Transposing reduces the stride and padding requirements */ \
  |  |  465|  48.6k|            x = eob & mask, y = eob >> shift, rc = eob; \
  |  |  466|  48.6k|        else /* tx_class == TX_CLASS_V */ \
  |  |  467|  48.6k|            x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
  |  |  468|  48.6k|        if (dbg) \
  |  |  ------------------
  |  |  |  Branch (468:13): [Folded, False: 48.6k]
  |  |  ------------------
  |  |  469|  48.6k|            printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
  |  |  470|      0|                   t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
  |  |  471|  48.6k|        if (eob_tok == 2) { \
  |  |  ------------------
  |  |  |  Branch (471:13): [True: 2.66k, False: 45.9k]
  |  |  ------------------
  |  |  472|  2.66k|            ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \
  |  |  ------------------
  |  |  |  Branch (472:19): [True: 2.60k, False: 52]
  |  |  |  Branch (472:20): [Folded, False: 2.66k]
  |  |  ------------------
  |  |  473|  2.66k|            tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
  |  |  ------------------
  |  |  |  |   49|  2.66k|#define dav1d_msac_decode_hi_tok         dav1d_msac_decode_hi_tok_sse2
  |  |  ------------------
  |  |  474|  2.66k|            level_tok = tok + (3 << 6); \
  |  |  475|  2.66k|            if (dbg) \
  |  |  ------------------
  |  |  |  Branch (475:17): [Folded, False: 2.66k]
  |  |  ------------------
  |  |  476|  2.66k|                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
  |  |  477|      0|                       imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
  |  |  478|      0|                       ts->msac.rng); \
  |  |  479|  2.66k|        } \
  |  |  480|  48.6k|        cf[rc] = tok << 11; \
  |  |  481|  48.6k|        if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (481:13): [Folded, False: 48.6k]
  |  |  ------------------
  |  |  482|  48.6k|            level = levels + rc; \
  |  |  483|  48.6k|        else \
  |  |  484|  48.6k|            level = levels + x * stride + y; \
  |  |  485|  48.6k|        *level = (uint8_t) level_tok; \
  |  |  486|  1.03M|        for (int i = eob - 1; i > 0; i--) { /* ac */ \
  |  |  ------------------
  |  |  |  Branch (486:31): [True: 982k, False: 48.6k]
  |  |  ------------------
  |  |  487|   982k|            unsigned rc_i; \
  |  |  488|   982k|            if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (488:17): [Folded, False: 982k]
  |  |  ------------------
  |  |  489|   982k|                rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
  |  |  490|   982k|            else if (tx_class == TX_CLASS_H) \
  |  |  ------------------
  |  |  |  Branch (490:22): [True: 982k, Folded]
  |  |  ------------------
  |  |  491|   982k|                x = i & mask, y = i >> shift, rc_i = i; \
  |  |  492|   982k|            else /* tx_class == TX_CLASS_V */ \
  |  |  493|   982k|                x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
  |  |  494|   982k|            assert(x < 32 && y < 32); \
  |  |  ------------------
  |  |  |  |  140|  1.96M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:30): [True: 982k, False: 18.4E]
  |  |  |  |  |  Branch (140:30): [True: 982k, False: 18.4E]
  |  |  |  |  |  Branch (140:68): [Folded, False: 982k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  495|   982k|            if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (495:17): [Folded, False: 982k]
  |  |  ------------------
  |  |  496|   982k|                level = levels + rc_i; \
  |  |  497|   982k|            else \
  |  |  498|   982k|                level = levels + x * stride + y; \
  |  |  499|   982k|            ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
  |  |  500|   982k|            if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (500:17): [Folded, False: 982k]
  |  |  ------------------
  |  |  501|   982k|                y |= x; \
  |  |  502|   982k|            tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
  |  |  ------------------
  |  |  |  |   47|   982k|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  |  |  ------------------
  |  |  503|   982k|            if (dbg) \
  |  |  ------------------
  |  |  |  Branch (503:17): [Folded, False: 982k]
  |  |  ------------------
  |  |  504|   982k|                printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
  |  |  505|      0|                       t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
  |  |  506|   982k|            if (tok == 3) { \
  |  |  ------------------
  |  |  |  Branch (506:17): [True: 61.3k, False: 921k]
  |  |  ------------------
  |  |  507|  61.3k|                mag &= 63; \
  |  |  508|  61.3k|                ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
  |  |  ------------------
  |  |  |  Branch (508:24): [True: 40.4k, False: 20.9k]
  |  |  ------------------
  |  |  509|  61.3k|                      (mag > 12 ? 6 : (mag + 1) >> 1); \
  |  |  ------------------
  |  |  |  Branch (509:24): [True: 9.76k, False: 51.6k]
  |  |  ------------------
  |  |  510|  61.3k|                tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
  |  |  ------------------
  |  |  |  |   49|  61.3k|#define dav1d_msac_decode_hi_tok         dav1d_msac_decode_hi_tok_sse2
  |  |  ------------------
  |  |  511|  61.3k|                if (dbg) \
  |  |  ------------------
  |  |  |  Branch (511:21): [Folded, False: 61.3k]
  |  |  ------------------
  |  |  512|  61.3k|                    printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
  |  |  513|      0|                           imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \
  |  |  514|      0|                           ts->msac.rng); \
  |  |  515|  61.3k|                *level = (uint8_t) (tok + (3 << 6)); \
  |  |  516|  61.3k|                cf[rc_i] = (tok << 11) | rc; \
  |  |  517|  61.3k|                rc = rc_i; \
  |  |  518|   921k|            } else { \
  |  |  519|   921k|                /* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
  |  |  520|   921k|                tok *= 0x17ff41; \
  |  |  521|   921k|                *level = (uint8_t) tok; \
  |  |  522|   921k|                /* tok ? (tok << 11) | rc : 0 */ \
  |  |  523|   921k|                tok = (tok >> 9) & (rc + ~0x7ffu); \
  |  |  524|   921k|                if (tok) rc = rc_i; \
  |  |  ------------------
  |  |  |  Branch (524:21): [True: 260k, False: 661k]
  |  |  ------------------
  |  |  525|   921k|                cf[rc_i] = tok; \
  |  |  526|   921k|            } \
  |  |  527|   982k|        } \
  |  |  528|  48.6k|        /* dc */ \
  |  |  529|  48.6k|        ctx = (tx_class == TX_CLASS_2D) ? 0 : \
  |  |  ------------------
  |  |  |  Branch (529:15): [Folded, False: 48.6k]
  |  |  ------------------
  |  |  530|  48.6k|            get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
  |  |  531|  48.6k|        dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
  |  |  ------------------
  |  |  |  |   47|  48.6k|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  |  |  ------------------
  |  |  532|  48.6k|        if (dbg) \
  |  |  ------------------
  |  |  |  Branch (532:13): [Folded, False: 48.6k]
  |  |  ------------------
  |  |  533|  48.6k|            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
  |  |  534|      0|                   t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
  |  |  535|  48.6k|        if (dc_tok == 3) { \
  |  |  ------------------
  |  |  |  Branch (535:13): [True: 7.66k, False: 40.9k]
  |  |  ------------------
  |  |  536|  7.66k|            if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (536:17): [Folded, False: 7.66k]
  |  |  ------------------
  |  |  537|  7.66k|                mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
  |  |  538|      0|                      levels[1 * stride + 1]; \
  |  |  539|  7.66k|            mag &= 63; \
  |  |  540|  7.66k|            ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
  |  |  ------------------
  |  |  |  Branch (540:19): [True: 1.95k, False: 5.70k]
  |  |  ------------------
  |  |  541|  7.66k|            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
  |  |  ------------------
  |  |  |  |   49|  7.66k|#define dav1d_msac_decode_hi_tok         dav1d_msac_decode_hi_tok_sse2
  |  |  ------------------
  |  |  542|  7.66k|            if (dbg) \
  |  |  ------------------
  |  |  |  Branch (542:17): [Folded, False: 7.66k]
  |  |  ------------------
  |  |  543|  7.66k|                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
  |  |  544|      0|                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
  |  |  545|  7.66k|        } \
  |  |  546|  48.6k|        break
  ------------------
  568|  48.6k|        }
  569|  24.5k|        case TX_CLASS_V: {
  ------------------
  |  Branch (569:9): [True: 24.5k, False: 1.86M]
  ------------------
  570|  24.5k|            const uint8_t (*const lo_ctx_offsets)[5] = NULL;
  571|  24.5k|            const ptrdiff_t stride = 16;
  572|  24.5k|            const unsigned shift = slw + 2, shift2 = slh + 2;
  573|  24.5k|            const unsigned mask = (4 << slw) - 1;
  574|  24.5k|            memset(levels, 0, stride * ((4 << slw) + 2));
  575|  24.5k|            DECODE_COEFS_CLASS(TX_CLASS_V);
  ------------------
  |  |  459|  24.5k|        unsigned x, y; \
  |  |  460|  24.5k|        uint8_t *level; \
  |  |  461|  24.5k|        if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (461:13): [Folded, False: 24.5k]
  |  |  ------------------
  |  |  462|  24.5k|            rc = scan[eob], x = rc >> shift, y = rc & mask; \
  |  |  463|  24.5k|        else if (tx_class == TX_CLASS_H) \
  |  |  ------------------
  |  |  |  Branch (463:18): [Folded, False: 24.5k]
  |  |  ------------------
  |  |  464|  24.5k|            /* Transposing reduces the stride and padding requirements */ \
  |  |  465|  24.5k|            x = eob & mask, y = eob >> shift, rc = eob; \
  |  |  466|  24.5k|        else /* tx_class == TX_CLASS_V */ \
  |  |  467|  24.5k|            x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
  |  |  468|  24.5k|        if (dbg) \
  |  |  ------------------
  |  |  |  Branch (468:13): [Folded, False: 24.5k]
  |  |  ------------------
  |  |  469|  24.5k|            printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
  |  |  470|      0|                   t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
  |  |  471|  24.5k|        if (eob_tok == 2) { \
  |  |  ------------------
  |  |  |  Branch (471:13): [True: 1.25k, False: 23.3k]
  |  |  ------------------
  |  |  472|  1.25k|            ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \
  |  |  ------------------
  |  |  |  Branch (472:19): [True: 1.22k, False: 32]
  |  |  |  Branch (472:20): [Folded, False: 1.25k]
  |  |  ------------------
  |  |  473|  1.25k|            tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
  |  |  ------------------
  |  |  |  |   49|  1.25k|#define dav1d_msac_decode_hi_tok         dav1d_msac_decode_hi_tok_sse2
  |  |  ------------------
  |  |  474|  1.25k|            level_tok = tok + (3 << 6); \
  |  |  475|  1.25k|            if (dbg) \
  |  |  ------------------
  |  |  |  Branch (475:17): [Folded, False: 1.25k]
  |  |  ------------------
  |  |  476|  1.25k|                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
  |  |  477|      0|                       imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
  |  |  478|      0|                       ts->msac.rng); \
  |  |  479|  1.25k|        } \
  |  |  480|  24.5k|        cf[rc] = tok << 11; \
  |  |  481|  24.5k|        if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (481:13): [Folded, False: 24.5k]
  |  |  ------------------
  |  |  482|  24.5k|            level = levels + rc; \
  |  |  483|  24.5k|        else \
  |  |  484|  24.5k|            level = levels + x * stride + y; \
  |  |  485|  24.5k|        *level = (uint8_t) level_tok; \
  |  |  486|   526k|        for (int i = eob - 1; i > 0; i--) { /* ac */ \
  |  |  ------------------
  |  |  |  Branch (486:31): [True: 501k, False: 24.5k]
  |  |  ------------------
  |  |  487|   501k|            unsigned rc_i; \
  |  |  488|   501k|            if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (488:17): [Folded, False: 501k]
  |  |  ------------------
  |  |  489|   501k|                rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
  |  |  490|   501k|            else if (tx_class == TX_CLASS_H) \
  |  |  ------------------
  |  |  |  Branch (490:22): [Folded, False: 501k]
  |  |  ------------------
  |  |  491|   501k|                x = i & mask, y = i >> shift, rc_i = i; \
  |  |  492|   501k|            else /* tx_class == TX_CLASS_V */ \
  |  |  493|   501k|                x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
  |  |  494|   501k|            assert(x < 32 && y < 32); \
  |  |  ------------------
  |  |  |  |  140|  1.00M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (140:30): [True: 501k, False: 28]
  |  |  |  |  |  Branch (140:30): [True: 501k, False: 18.4E]
  |  |  |  |  |  Branch (140:68): [Folded, False: 501k]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  495|   501k|            if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (495:17): [Folded, False: 501k]
  |  |  ------------------
  |  |  496|   501k|                level = levels + rc_i; \
  |  |  497|   501k|            else \
  |  |  498|   501k|                level = levels + x * stride + y; \
  |  |  499|   501k|            ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
  |  |  500|   501k|            if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (500:17): [Folded, False: 501k]
  |  |  ------------------
  |  |  501|   501k|                y |= x; \
  |  |  502|   501k|            tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
  |  |  ------------------
  |  |  |  |   47|   501k|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  |  |  ------------------
  |  |  503|   501k|            if (dbg) \
  |  |  ------------------
  |  |  |  Branch (503:17): [Folded, False: 501k]
  |  |  ------------------
  |  |  504|   501k|                printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
  |  |  505|      0|                       t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
  |  |  506|   501k|            if (tok == 3) { \
  |  |  ------------------
  |  |  |  Branch (506:17): [True: 25.0k, False: 476k]
  |  |  ------------------
  |  |  507|  25.0k|                mag &= 63; \
  |  |  508|  25.0k|                ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
  |  |  ------------------
  |  |  |  Branch (508:24): [True: 15.2k, False: 9.79k]
  |  |  ------------------
  |  |  509|  25.0k|                      (mag > 12 ? 6 : (mag + 1) >> 1); \
  |  |  ------------------
  |  |  |  Branch (509:24): [True: 3.79k, False: 21.2k]
  |  |  ------------------
  |  |  510|  25.0k|                tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
  |  |  ------------------
  |  |  |  |   49|  25.0k|#define dav1d_msac_decode_hi_tok         dav1d_msac_decode_hi_tok_sse2
  |  |  ------------------
  |  |  511|  25.0k|                if (dbg) \
  |  |  ------------------
  |  |  |  Branch (511:21): [Folded, False: 25.0k]
  |  |  ------------------
  |  |  512|  25.0k|                    printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
  |  |  513|      0|                           imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \
  |  |  514|      0|                           ts->msac.rng); \
  |  |  515|  25.0k|                *level = (uint8_t) (tok + (3 << 6)); \
  |  |  516|  25.0k|                cf[rc_i] = (tok << 11) | rc; \
  |  |  517|  25.0k|                rc = rc_i; \
  |  |  518|   476k|            } else { \
  |  |  519|   476k|                /* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
  |  |  520|   476k|                tok *= 0x17ff41; \
  |  |  521|   476k|                *level = (uint8_t) tok; \
  |  |  522|   476k|                /* tok ? (tok << 11) | rc : 0 */ \
  |  |  523|   476k|                tok = (tok >> 9) & (rc + ~0x7ffu); \
  |  |  524|   476k|                if (tok) rc = rc_i; \
  |  |  ------------------
  |  |  |  Branch (524:21): [True: 125k, False: 350k]
  |  |  ------------------
  |  |  525|   476k|                cf[rc_i] = tok; \
  |  |  526|   476k|            } \
  |  |  527|   501k|        } \
  |  |  528|  24.5k|        /* dc */ \
  |  |  529|  24.5k|        ctx = (tx_class == TX_CLASS_2D) ? 0 : \
  |  |  ------------------
  |  |  |  Branch (529:15): [Folded, False: 24.5k]
  |  |  ------------------
  |  |  530|  24.5k|            get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
  |  |  531|  24.5k|        dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
  |  |  ------------------
  |  |  |  |   47|  24.5k|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  |  |  ------------------
  |  |  532|  24.5k|        if (dbg) \
  |  |  ------------------
  |  |  |  Branch (532:13): [Folded, False: 24.5k]
  |  |  ------------------
  |  |  533|  24.5k|            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
  |  |  534|      0|                   t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
  |  |  535|  24.5k|        if (dc_tok == 3) { \
  |  |  ------------------
  |  |  |  Branch (535:13): [True: 3.08k, False: 21.4k]
  |  |  ------------------
  |  |  536|  3.08k|            if (tx_class == TX_CLASS_2D) \
  |  |  ------------------
  |  |  |  Branch (536:17): [Folded, False: 3.08k]
  |  |  ------------------
  |  |  537|  3.08k|                mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
  |  |  538|      0|                      levels[1 * stride + 1]; \
  |  |  539|  3.08k|            mag &= 63; \
  |  |  540|  3.08k|            ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
  |  |  ------------------
  |  |  |  Branch (540:19): [True: 625, False: 2.45k]
  |  |  ------------------
  |  |  541|  3.08k|            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
  |  |  ------------------
  |  |  |  |   49|  3.08k|#define dav1d_msac_decode_hi_tok         dav1d_msac_decode_hi_tok_sse2
  |  |  ------------------
  |  |  542|  3.08k|            if (dbg) \
  |  |  ------------------
  |  |  |  Branch (542:17): [Folded, False: 3.08k]
  |  |  ------------------
  |  |  543|  3.08k|                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
  |  |  544|      0|                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
  |  |  545|  3.08k|        } \
  |  |  546|  24.5k|        break
  ------------------
  576|  24.5k|        }
  577|      0|#undef DECODE_COEFS_CLASS
  578|      0|        default: assert(0);
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, Folded]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
  |  Branch (578:9): [True: 0, False: 1.88M]
  ------------------
  579|  1.88M|        }
  580|  1.88M|    } else { // dc-only
  581|   421k|        int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2);
  ------------------
  |  |   47|   421k|#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
  ------------------
  582|   421k|        dc_tok = 1 + tok_br;
  583|   421k|        if (dbg)
  ------------------
  |  Branch (583:13): [Folded, False: 421k]
  ------------------
  584|      0|            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
  585|      0|                   t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng);
  586|   421k|        if (tok_br == 2) {
  ------------------
  |  Branch (586:13): [True: 42.6k, False: 379k]
  ------------------
  587|  42.6k|            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]);
  ------------------
  |  |   49|  42.6k|#define dav1d_msac_decode_hi_tok         dav1d_msac_decode_hi_tok_sse2
  ------------------
  588|  42.6k|            if (dbg)
  ------------------
  |  Branch (588:17): [Folded, False: 42.6k]
  ------------------
  589|      0|                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
  590|      0|                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
  591|  42.6k|        }
  592|   421k|        rc = 0;
  593|   421k|    }
  594|       |
  595|       |    // residual and sign
  596|  2.30M|    const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
  597|  2.30M|    const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
  ------------------
  |  Branch (597:35): [True: 1.06M, False: 1.24M]
  ------------------
  598|  2.30M|    const int dq_shift = imax(0, t_dim->ctx - 2);
  599|  2.30M|    const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
  ------------------
  |  Branch (599:36): [True: 1.23M, Folded]
  ------------------
  600|  2.30M|    unsigned cul_level, dc_sign_level;
  601|       |
  602|  2.30M|    if (!dc_tok) {
  ------------------
  |  Branch (602:9): [True: 404k, False: 1.90M]
  ------------------
  603|   404k|        cul_level = 0;
  604|   404k|        dc_sign_level = 1 << 6;
  605|   404k|        if (qm_tbl) goto ac_qm;
  ------------------
  |  Branch (605:13): [True: 69.5k, False: 335k]
  ------------------
  606|   335k|        goto ac_noqm;
  607|   404k|    }
  608|       |
  609|  1.90M|    const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
  610|  1.90M|    uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
  611|  1.90M|    const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
  ------------------
  |  |   52|  1.90M|#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_sse2
  ------------------
  612|  1.90M|    if (dbg)
  ------------------
  |  Branch (612:9): [Folded, False: 1.90M]
  ------------------
  613|      0|        printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
  614|      0|               chroma, dc_sign_ctx, dc_sign, ts->msac.rng);
  615|       |
  616|  1.90M|    int dc_dq = dq_tbl[0];
  617|  1.90M|    dc_sign_level = (dc_sign - 1) & (2 << 6);
  618|       |
  619|  1.90M|    if (qm_tbl) {
  ------------------
  |  Branch (619:9): [True: 364k, False: 1.53M]
  ------------------
  620|   364k|        dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5;
  621|       |
  622|   364k|        if (dc_tok == 15) {
  ------------------
  |  Branch (622:13): [True: 16.3k, False: 348k]
  ------------------
  623|  16.3k|            dc_tok = read_golomb(&ts->msac) + 15;
  624|  16.3k|            if (dbg)
  ------------------
  |  Branch (624:17): [Folded, False: 16.3k]
  ------------------
  625|      0|                printf("Post-dc_residual[%d->%d]: r=%d\n",
  626|      0|                       dc_tok - 15, dc_tok, ts->msac.rng);
  627|       |
  628|  16.3k|            dc_tok &= 0xfffff;
  629|  16.3k|            dc_dq = (dc_dq * dc_tok) & 0xffffff;
  630|   348k|        } else {
  631|   348k|            dc_dq *= dc_tok;
  632|   348k|            assert(dc_dq <= 0xffffff);
  ------------------
  |  |  140|   348k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 348k]
  |  |  |  Branch (140:68): [Folded, False: 348k]
  |  |  ------------------
  ------------------
  633|   348k|        }
  634|   364k|        cul_level = dc_tok;
  635|   364k|        dc_dq >>= dq_shift;
  636|   364k|        dc_dq = umin(dc_dq, cf_max + dc_sign);
  637|   364k|        cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
  ------------------
  |  Branch (637:25): [True: 187k, False: 177k]
  ------------------
  638|       |
  639|   584k|        if (rc) ac_qm: {
  ------------------
  |  Branch (639:13): [True: 257k, False: 106k]
  ------------------
  640|   584k|            const unsigned ac_dq = dq_tbl[1];
  641|  4.74M|            do {
  642|  4.74M|                const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
  ------------------
  |  |   53|  4.74M|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
  643|  4.74M|                if (dbg)
  ------------------
  |  Branch (643:21): [Folded, False: 4.74M]
  ------------------
  644|      0|                    printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
  645|  4.74M|                const unsigned rc_tok = cf[rc];
  646|  4.74M|                unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
  647|  4.74M|                int dq_sat;
  648|       |
  649|  4.74M|                if (rc_tok >= (15 << 11)) {
  ------------------
  |  Branch (649:21): [True: 151k, False: 4.59M]
  ------------------
  650|   151k|                    tok = read_golomb(&ts->msac) + 15;
  651|   151k|                    if (dbg)
  ------------------
  |  Branch (651:25): [Folded, False: 151k]
  ------------------
  652|      0|                        printf("Post-residual[%d=%d->%d]: r=%d\n",
  653|      0|                               rc, tok - 15, tok, ts->msac.rng);
  654|       |
  655|   151k|                    tok &= 0xfffff;
  656|   151k|                    dq = (dq * tok) & 0xffffff;
  657|  4.59M|                } else {
  658|  4.59M|                    tok = rc_tok >> 11;
  659|  4.59M|                    dq *= tok;
  660|  4.59M|                    assert(dq <= 0xffffff);
  ------------------
  |  |  140|  4.59M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 4.59M]
  |  |  |  Branch (140:68): [Folded, False: 4.59M]
  |  |  ------------------
  ------------------
  661|  4.59M|                }
  662|  4.74M|                cul_level += tok;
  663|  4.74M|                dq >>= dq_shift;
  664|  4.74M|                dq_sat = umin(dq, cf_max + sign);
  665|  4.74M|                cf[rc] = (coef) (sign ? -dq_sat : dq_sat);
  ------------------
  |  Branch (665:34): [True: 2.42M, False: 2.31M]
  ------------------
  666|       |
  667|  4.74M|                rc = rc_tok & 0x3ff;
  668|  4.74M|            } while (rc);
  ------------------
  |  Branch (668:22): [True: 4.42M, False: 326k]
  ------------------
  669|   584k|        }
  670|  1.53M|    } else {
  671|       |        // non-qmatrix is the common case and allows for additional optimizations
  672|  1.53M|        if (dc_tok == 15) {
  ------------------
  |  Branch (672:13): [True: 87.8k, False: 1.44M]
  ------------------
  673|  87.8k|            dc_tok = read_golomb(&ts->msac) + 15;
  674|  87.8k|            if (dbg)
  ------------------
  |  Branch (674:17): [Folded, False: 87.8k]
  ------------------
  675|      0|                printf("Post-dc_residual[%d->%d]: r=%d\n",
  676|      0|                       dc_tok - 15, dc_tok, ts->msac.rng);
  677|       |
  678|  87.8k|            dc_tok &= 0xfffff;
  679|  87.8k|            dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
  680|  87.8k|            dc_dq = umin(dc_dq, cf_max + dc_sign);
  681|  1.44M|        } else {
  682|  1.44M|            dc_dq = ((dc_dq * dc_tok) >> dq_shift);
  683|  1.44M|            assert(dc_dq <= cf_max);
  ------------------
  |  |  140|  1.44M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1.44M]
  |  |  |  Branch (140:68): [Folded, False: 1.44M]
  |  |  ------------------
  ------------------
  684|  1.44M|        }
  685|  1.53M|        cul_level = dc_tok;
  686|  1.53M|        cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
  ------------------
  |  Branch (686:25): [True: 814k, False: 723k]
  ------------------
  687|       |
  688|  2.78M|        if (rc) ac_noqm: {
  ------------------
  |  Branch (688:13): [True: 1.22M, False: 312k]
  ------------------
  689|  2.78M|            const unsigned ac_dq = dq_tbl[1];
  690|  13.4M|            do {
  691|  13.4M|                const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
  ------------------
  |  |   53|  13.4M|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
  692|  13.4M|                if (dbg)
  ------------------
  |  Branch (692:21): [Folded, False: 13.4M]
  ------------------
  693|      0|                    printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
  694|  13.4M|                const unsigned rc_tok = cf[rc];
  695|  13.4M|                unsigned tok;
  696|  13.4M|                int dq;
  697|       |
  698|       |                // residual
  699|  13.4M|                if (rc_tok >= (15 << 11)) {
  ------------------
  |  Branch (699:21): [True: 348k, False: 13.0M]
  ------------------
  700|   348k|                    tok = read_golomb(&ts->msac) + 15;
  701|   348k|                    if (dbg)
  ------------------
  |  Branch (701:25): [Folded, False: 348k]
  ------------------
  702|      0|                        printf("Post-residual[%d=%d->%d]: r=%d\n",
  703|      0|                               rc, tok - 15, tok, ts->msac.rng);
  704|       |
  705|       |                    // coefficient parsing, see 5.11.39
  706|   348k|                    tok &= 0xfffff;
  707|       |
  708|       |                    // dequant, see 7.12.3
  709|   348k|                    dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
  710|   348k|                    dq = umin(dq, cf_max + sign);
  711|  13.0M|                } else {
  712|       |                    // cannot exceed cf_max, so we can avoid the clipping
  713|  13.0M|                    tok = rc_tok >> 11;
  714|  13.0M|                    dq = ((ac_dq * tok) >> dq_shift);
  715|  13.0M|                    assert(dq <= cf_max);
  ------------------
  |  |  140|  13.0M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 13.0M]
  |  |  |  Branch (140:68): [Folded, False: 13.0M]
  |  |  ------------------
  ------------------
  716|  13.0M|                }
  717|  13.4M|                cul_level += tok;
  718|  13.4M|                cf[rc] = (coef) (sign ? -dq : dq);
  ------------------
  |  Branch (718:34): [True: 6.78M, False: 6.64M]
  ------------------
  719|       |
  720|  13.4M|                rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
  721|  13.4M|            } while (rc);
  ------------------
  |  Branch (721:22): [True: 11.8M, False: 1.55M]
  ------------------
  722|  2.78M|        }
  723|  1.53M|    }
  724|       |
  725|       |    // context
  726|  2.30M|    *res_ctx = umin(cul_level, 63) | dc_sign_level;
  727|       |
  728|  2.30M|    return eob;
  729|  1.90M|}
recon_tmpl.c:get_skip_ctx:
   65|  4.49M|{
   66|  4.49M|    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
   67|       |
   68|  4.49M|    if (chroma) {
  ------------------
  |  Branch (68:9): [True: 2.49M, False: 2.00M]
  ------------------
   69|  2.49M|        const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
   70|  2.49M|        const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
   71|  2.49M|        const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
  ------------------
  |  Branch (71:33): [True: 1.32M, False: 1.16M]
  |  Branch (71:45): [True: 2.31M, False: 177k]
  |  Branch (71:59): [True: 357k, False: 1.95M]
  ------------------
   72|  1.16M|                                b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
  ------------------
  |  Branch (72:33): [True: 54.4k, False: 1.11M]
  |  Branch (72:45): [True: 949k, False: 216k]
  |  Branch (72:59): [True: 265k, False: 683k]
  ------------------
   73|  2.49M|        unsigned ca, cl;
   74|       |
   75|  2.49M|#define MERGE_CTX(dir, type, no_val) \
   76|  2.49M|        c##dir = *(const type *) dir != no_val; \
   77|  2.49M|        break
   78|       |
   79|  2.49M|        switch (t_dim->lw) {
   80|       |        /* For some reason the MSVC CRT _wassert() function is not flagged as
   81|       |         * __declspec(noreturn), so when using those headers the compiler will
   82|       |         * expect execution to continue after an assertion has been triggered
   83|       |         * and will therefore complain about the use of uninitialized variables
   84|       |         * when compiled in debug mode if we put the default case at the end. */
   85|      0|        default: assert(0); /* fall-through */
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, Folded]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
  |  Branch (85:9): [True: 0, False: 2.49M]
  ------------------
   86|  1.32M|        case TX_4X4:   MERGE_CTX(a, uint8_t,  0x40);
  ------------------
  |  |   76|  1.32M|        c##dir = *(const type *) dir != no_val; \
  |  |   77|  1.32M|        break
  ------------------
  |  Branch (86:9): [True: 1.32M, False: 1.17M]
  ------------------
   87|   395k|        case TX_8X8:   MERGE_CTX(a, uint16_t, 0x4040);
  ------------------
  |  |   76|   395k|        c##dir = *(const type *) dir != no_val; \
  |  |   77|   395k|        break
  ------------------
  |  Branch (87:9): [True: 395k, False: 2.09M]
  ------------------
   88|   313k|        case TX_16X16: MERGE_CTX(a, uint32_t, 0x40404040U);
  ------------------
  |  |   76|   313k|        c##dir = *(const type *) dir != no_val; \
  |  |   77|   313k|        break
  ------------------
  |  Branch (88:9): [True: 313k, False: 2.17M]
  ------------------
   89|   465k|        case TX_32X32: MERGE_CTX(a, uint64_t, 0x4040404040404040ULL);
  ------------------
  |  |   76|   465k|        c##dir = *(const type *) dir != no_val; \
  |  |   77|   465k|        break
  ------------------
  |  Branch (89:9): [True: 465k, False: 2.02M]
  ------------------
   90|  2.49M|        }
   91|  2.49M|        switch (t_dim->lh) {
   92|      0|        default: assert(0); /* fall-through */
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, Folded]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
  |  Branch (92:9): [True: 0, False: 2.49M]
  ------------------
   93|  1.39M|        case TX_4X4:   MERGE_CTX(l, uint8_t,  0x40);
  ------------------
  |  |   76|  1.39M|        c##dir = *(const type *) dir != no_val; \
  |  |   77|  1.39M|        break
  ------------------
  |  Branch (93:9): [True: 1.39M, False: 1.09M]
  ------------------
   94|   462k|        case TX_8X8:   MERGE_CTX(l, uint16_t, 0x4040);
  ------------------
  |  |   76|   462k|        c##dir = *(const type *) dir != no_val; \
  |  |   77|   462k|        break
  ------------------
  |  Branch (94:9): [True: 462k, False: 2.03M]
  ------------------
   95|   246k|        case TX_16X16: MERGE_CTX(l, uint32_t, 0x40404040U);
  ------------------
  |  |   76|   246k|        c##dir = *(const type *) dir != no_val; \
  |  |   77|   246k|        break
  ------------------
  |  Branch (95:9): [True: 246k, False: 2.24M]
  ------------------
   96|   390k|        case TX_32X32: MERGE_CTX(l, uint64_t, 0x4040404040404040ULL);
  ------------------
  |  |   76|   390k|        c##dir = *(const type *) dir != no_val; \
  |  |   77|   390k|        break
  ------------------
  |  Branch (96:9): [True: 390k, False: 2.10M]
  ------------------
   97|  2.49M|        }
   98|  2.49M|#undef MERGE_CTX
   99|       |
  100|  2.49M|        return 7 + not_one_blk * 3 + ca + cl;
  101|  2.49M|    } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
  ------------------
  |  Branch (101:16): [True: 730k, False: 1.27M]
  |  Branch (101:41): [True: 654k, False: 76.5k]
  ------------------
  102|   654k|        return 0;
  103|  1.34M|    } else {
  104|  1.34M|        unsigned la, ll;
  105|       |
  106|  1.34M|#define MERGE_CTX(dir, type, tx) \
  107|  1.34M|        if (tx == TX_64X64) { \
  108|  1.34M|            uint64_t tmp = *(const uint64_t *) dir; \
  109|  1.34M|            tmp |= *(const uint64_t *) &dir[8]; \
  110|  1.34M|            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
  111|  1.34M|        } else \
  112|  1.34M|            l##dir = *(const type *) dir; \
  113|  1.34M|        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
  114|  1.34M|        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
  115|  1.34M|        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
  116|  1.34M|        break
  117|       |
  118|  1.34M|        switch (t_dim->lw) {
  119|      0|        default: assert(0); /* fall-through */
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, Folded]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
  |  Branch (119:9): [True: 0, False: 1.34M]
  ------------------
  120|   924k|        case TX_4X4:   MERGE_CTX(a, uint8_t,  TX_4X4);
  ------------------
  |  |  107|   924k|        if (tx == TX_64X64) { \
  |  |  ------------------
  |  |  |  Branch (107:13): [Folded, False: 924k]
  |  |  ------------------
  |  |  108|      0|            uint64_t tmp = *(const uint64_t *) dir; \
  |  |  109|      0|            tmp |= *(const uint64_t *) &dir[8]; \
  |  |  110|      0|            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
  |  |  111|      0|        } else \
  |  |  112|   924k|            l##dir = *(const type *) dir; \
  |  |  113|   924k|        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
  |  |  ------------------
  |  |  |  Branch (113:13): [Folded, False: 924k]
  |  |  ------------------
  |  |  114|   924k|        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
  |  |  ------------------
  |  |  |  Branch (114:13): [Folded, False: 924k]
  |  |  ------------------
  |  |  115|   924k|        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
  |  |  ------------------
  |  |  |  Branch (115:13): [Folded, False: 924k]
  |  |  ------------------
  |  |  116|   924k|        break
  ------------------
  |  Branch (120:9): [True: 924k, False: 425k]
  ------------------
  121|   262k|        case TX_8X8:   MERGE_CTX(a, uint16_t, TX_8X8);
  ------------------
  |  |  107|   262k|        if (tx == TX_64X64) { \
  |  |  ------------------
  |  |  |  Branch (107:13): [Folded, False: 262k]
  |  |  ------------------
  |  |  108|      0|            uint64_t tmp = *(const uint64_t *) dir; \
  |  |  109|      0|            tmp |= *(const uint64_t *) &dir[8]; \
  |  |  110|      0|            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
  |  |  111|      0|        } else \
  |  |  112|   262k|            l##dir = *(const type *) dir; \
  |  |  113|   262k|        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
  |  |  ------------------
  |  |  |  Branch (113:13): [Folded, False: 262k]
  |  |  ------------------
  |  |  114|   262k|        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
  |  |  ------------------
  |  |  |  Branch (114:13): [Folded, False: 262k]
  |  |  ------------------
  |  |  115|   262k|        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
  |  |  ------------------
  |  |  |  Branch (115:13): [True: 262k, Folded]
  |  |  ------------------
  |  |  116|   262k|        break
  ------------------
  |  Branch (121:9): [True: 262k, False: 1.08M]
  ------------------
  122|   120k|        case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
  ------------------
  |  |  107|   120k|        if (tx == TX_64X64) { \
  |  |  ------------------
  |  |  |  Branch (107:13): [Folded, False: 120k]
  |  |  ------------------
  |  |  108|      0|            uint64_t tmp = *(const uint64_t *) dir; \
  |  |  109|      0|            tmp |= *(const uint64_t *) &dir[8]; \
  |  |  110|      0|            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
  |  |  111|      0|        } else \
  |  |  112|   120k|            l##dir = *(const type *) dir; \
  |  |  113|   120k|        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
  |  |  ------------------
  |  |  |  Branch (113:13): [Folded, False: 120k]
  |  |  ------------------
  |  |  114|   120k|        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
  |  |  ------------------
  |  |  |  Branch (114:13): [True: 120k, Folded]
  |  |  ------------------
  |  |  115|   120k|        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
  |  |  ------------------
  |  |  |  Branch (115:13): [True: 120k, Folded]
  |  |  ------------------
  |  |  116|   120k|        break
  ------------------
  |  Branch (122:9): [True: 120k, False: 1.22M]
  ------------------
  123|  16.5k|        case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32);
  ------------------
  |  |  107|  16.5k|        if (tx == TX_64X64) { \
  |  |  ------------------
  |  |  |  Branch (107:13): [Folded, False: 16.5k]
  |  |  ------------------
  |  |  108|      0|            uint64_t tmp = *(const uint64_t *) dir; \
  |  |  109|      0|            tmp |= *(const uint64_t *) &dir[8]; \
  |  |  110|      0|            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
  |  |  111|      0|        } else \
  |  |  112|  16.5k|            l##dir = *(const type *) dir; \
  |  |  113|  16.5k|        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
  |  |  ------------------
  |  |  |  Branch (113:13): [True: 16.5k, Folded]
  |  |  ------------------
  |  |  114|  16.5k|        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
  |  |  ------------------
  |  |  |  Branch (114:13): [True: 16.5k, Folded]
  |  |  ------------------
  |  |  115|  16.5k|        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
  |  |  ------------------
  |  |  |  Branch (115:13): [True: 16.5k, Folded]
  |  |  ------------------
  |  |  116|  16.5k|        break
  ------------------
  |  Branch (123:9): [True: 16.5k, False: 1.33M]
  ------------------
  124|  38.6k|        case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64);
  ------------------
  |  |  107|  38.6k|        if (tx == TX_64X64) { \
  |  |  ------------------
  |  |  |  Branch (107:13): [True: 38.6k, Folded]
  |  |  ------------------
  |  |  108|  38.6k|            uint64_t tmp = *(const uint64_t *) dir; \
  |  |  109|  38.6k|            tmp |= *(const uint64_t *) &dir[8]; \
  |  |  110|  38.6k|            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
  |  |  111|  38.6k|        } else \
  |  |  112|  18.4E|            l##dir = *(const type *) dir; \
  |  |  113|  38.6k|        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
  |  |  ------------------
  |  |  |  Branch (113:13): [Folded, False: 38.6k]
  |  |  ------------------
  |  |  114|  38.6k|        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
  |  |  ------------------
  |  |  |  Branch (114:13): [True: 38.6k, Folded]
  |  |  ------------------
  |  |  115|  38.6k|        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
  |  |  ------------------
  |  |  |  Branch (115:13): [True: 38.6k, Folded]
  |  |  ------------------
  |  |  116|  38.6k|        break
  ------------------
  |  Branch (124:9): [True: 38.6k, False: 1.31M]
  ------------------
  125|  1.34M|        }
  126|  1.36M|        switch (t_dim->lh) {
  127|      0|        default: assert(0); /* fall-through */
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, Folded]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
  |  Branch (127:9): [True: 0, False: 1.36M]
  ------------------
  128|   929k|        case TX_4X4:   MERGE_CTX(l, uint8_t,  TX_4X4);
  ------------------
  |  |  107|   929k|        if (tx == TX_64X64) { \
  |  |  ------------------
  |  |  |  Branch (107:13): [Folded, False: 929k]
  |  |  ------------------
  |  |  108|      0|            uint64_t tmp = *(const uint64_t *) dir; \
  |  |  109|      0|            tmp |= *(const uint64_t *) &dir[8]; \
  |  |  110|      0|            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
  |  |  111|      0|        } else \
  |  |  112|   929k|            l##dir = *(const type *) dir; \
  |  |  113|   929k|        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
  |  |  ------------------
  |  |  |  Branch (113:13): [Folded, False: 929k]
  |  |  ------------------
  |  |  114|   929k|        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
  |  |  ------------------
  |  |  |  Branch (114:13): [Folded, False: 929k]
  |  |  ------------------
  |  |  115|   929k|        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
  |  |  ------------------
  |  |  |  Branch (115:13): [Folded, False: 929k]
  |  |  ------------------
  |  |  116|   929k|        break
  ------------------
  |  Branch (128:9): [True: 929k, False: 431k]
  ------------------
  129|   257k|        case TX_8X8:   MERGE_CTX(l, uint16_t, TX_8X8);
  ------------------
  |  |  107|   257k|        if (tx == TX_64X64) { \
  |  |  ------------------
  |  |  |  Branch (107:13): [Folded, False: 257k]
  |  |  ------------------
  |  |  108|      0|            uint64_t tmp = *(const uint64_t *) dir; \
  |  |  109|      0|            tmp |= *(const uint64_t *) &dir[8]; \
  |  |  110|      0|            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
  |  |  111|      0|        } else \
  |  |  112|   257k|            l##dir = *(const type *) dir; \
  |  |  113|   257k|        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
  |  |  ------------------
  |  |  |  Branch (113:13): [Folded, False: 257k]
  |  |  ------------------
  |  |  114|   257k|        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
  |  |  ------------------
  |  |  |  Branch (114:13): [Folded, False: 257k]
  |  |  ------------------
  |  |  115|   257k|        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
  |  |  ------------------
  |  |  |  Branch (115:13): [True: 257k, Folded]
  |  |  ------------------
  |  |  116|   257k|        break
  ------------------
  |  Branch (129:9): [True: 257k, False: 1.10M]
  ------------------
  130|   119k|        case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
  ------------------
  |  |  107|   119k|        if (tx == TX_64X64) { \
  |  |  ------------------
  |  |  |  Branch (107:13): [Folded, False: 119k]
  |  |  ------------------
  |  |  108|      0|            uint64_t tmp = *(const uint64_t *) dir; \
  |  |  109|      0|            tmp |= *(const uint64_t *) &dir[8]; \
  |  |  110|      0|            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
  |  |  111|      0|        } else \
  |  |  112|   119k|            l##dir = *(const type *) dir; \
  |  |  113|   119k|        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
  |  |  ------------------
  |  |  |  Branch (113:13): [Folded, False: 119k]
  |  |  ------------------
  |  |  114|   119k|        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
  |  |  ------------------
  |  |  |  Branch (114:13): [True: 119k, Folded]
  |  |  ------------------
  |  |  115|   119k|        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
  |  |  ------------------
  |  |  |  Branch (115:13): [True: 119k, Folded]
  |  |  ------------------
  |  |  116|   119k|        break
  ------------------
  |  Branch (130:9): [True: 119k, False: 1.24M]
  ------------------
  131|  16.5k|        case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32);
  ------------------
  |  |  107|  16.5k|        if (tx == TX_64X64) { \
  |  |  ------------------
  |  |  |  Branch (107:13): [Folded, False: 16.5k]
  |  |  ------------------
  |  |  108|      0|            uint64_t tmp = *(const uint64_t *) dir; \
  |  |  109|      0|            tmp |= *(const uint64_t *) &dir[8]; \
  |  |  110|      0|            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
  |  |  111|      0|        } else \
  |  |  112|  16.5k|            l##dir = *(const type *) dir; \
  |  |  113|  16.5k|        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
  |  |  ------------------
  |  |  |  Branch (113:13): [True: 16.5k, Folded]
  |  |  ------------------
  |  |  114|  16.5k|        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
  |  |  ------------------
  |  |  |  Branch (114:13): [True: 16.5k, Folded]
  |  |  ------------------
  |  |  115|  16.5k|        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
  |  |  ------------------
  |  |  |  Branch (115:13): [True: 16.5k, Folded]
  |  |  ------------------
  |  |  116|  16.5k|        break
  ------------------
  |  Branch (131:9): [True: 16.5k, False: 1.34M]
  ------------------
  132|  38.6k|        case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64);
  ------------------
  |  |  107|  38.6k|        if (tx == TX_64X64) { \
  |  |  ------------------
  |  |  |  Branch (107:13): [True: 38.6k, Folded]
  |  |  ------------------
  |  |  108|  38.6k|            uint64_t tmp = *(const uint64_t *) dir; \
  |  |  109|  38.6k|            tmp |= *(const uint64_t *) &dir[8]; \
  |  |  110|  38.6k|            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
  |  |  111|  38.6k|        } else \
  |  |  112|  38.6k|            l##dir = *(const type *) dir; \
  |  |  113|  38.6k|        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
  |  |  ------------------
  |  |  |  Branch (113:13): [Folded, False: 38.6k]
  |  |  ------------------
  |  |  114|  38.6k|        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
  |  |  ------------------
  |  |  |  Branch (114:13): [True: 38.6k, Folded]
  |  |  ------------------
  |  |  115|  38.6k|        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
  |  |  ------------------
  |  |  |  Branch (115:13): [True: 38.6k, Folded]
  |  |  ------------------
  |  |  116|  38.6k|        break
  ------------------
  |  Branch (132:9): [True: 38.6k, False: 1.32M]
  ------------------
  133|  1.36M|        }
  134|  1.36M|#undef MERGE_CTX
  135|       |
  136|  1.36M|        return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)];
  137|  1.36M|    }
  138|  4.49M|}
recon_tmpl.c:get_lo_ctx:
  304|  37.4M|{
  305|  37.4M|    unsigned mag = levels[0 * stride + 1] + levels[1 * stride + 0];
  306|  37.4M|    unsigned offset;
  307|  37.4M|    if (tx_class == TX_CLASS_2D) {
  ------------------
  |  Branch (307:9): [True: 36.1M, False: 1.31M]
  ------------------
  308|  36.1M|        mag += levels[1 * stride + 1];
  309|  36.1M|        *hi_mag = mag;
  310|  36.1M|        mag += levels[0 * stride + 2] + levels[2 * stride + 0];
  311|  36.1M|        offset = ctx_offsets[umin(y, 4)][umin(x, 4)];
  312|  36.1M|    } else {
  313|  1.31M|        mag += levels[0 * stride + 2];
  314|  1.31M|        *hi_mag = mag;
  315|  1.31M|        mag += levels[0 * stride + 3] + levels[0 * stride + 4];
  316|  1.31M|        offset = 26 + (y > 1 ? 10 : y * 5);
  ------------------
  |  Branch (316:24): [True: 767k, False: 548k]
  ------------------
  317|  1.31M|    }
  318|  37.4M|    return offset + (mag > 512 ? 4 : (mag + 64) >> 7);
  ------------------
  |  Branch (318:22): [True: 3.30M, False: 34.1M]
  ------------------
  319|  37.4M|}
recon_tmpl.c:get_dc_sign_ctx:
  143|  1.90M|{
  144|  1.90M|    uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL;
  145|  1.90M|    int s;
  146|       |
  147|  1.90M|#if ARCH_X86_64 && defined(__GNUC__)
  148|       |    /* Coerce compilers into producing better code. For some reason
  149|       |     * every x86-64 compiler is awful at handling 64-bit constants. */
  150|  1.90M|    __asm__("" : "+r"(mask), "+r"(mul));
  151|  1.90M|#endif
  152|       |
  153|  1.90M|    switch(tx) {
  154|      0|    default: assert(0); /* fall-through */
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, Folded]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
  |  Branch (154:5): [True: 0, False: 1.90M]
  ------------------
  155|  1.08M|    case TX_4X4: {
  ------------------
  |  Branch (155:5): [True: 1.08M, False: 823k]
  ------------------
  156|  1.08M|        int t = *(const uint8_t *) a >> 6;
  157|  1.08M|        t    += *(const uint8_t *) l >> 6;
  158|  1.08M|        s = t - 1 - 1;
  159|  1.08M|        break;
  160|      0|    }
  161|   208k|    case TX_8X8: {
  ------------------
  |  Branch (161:5): [True: 208k, False: 1.69M]
  ------------------
  162|   208k|        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
  163|   208k|        t         += *(const uint16_t *) l & (uint32_t) mask;
  164|   208k|        t *= 0x04040404U;
  165|   208k|        s = (int) (t >> 24) - 2 - 2;
  166|   208k|        break;
  167|      0|    }
  168|  93.8k|    case TX_16X16: {
  ------------------
  |  Branch (168:5): [True: 93.8k, False: 1.81M]
  ------------------
  169|  93.8k|        uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6;
  170|  93.8k|        t         += (*(const uint32_t *) l & (uint32_t) mask) >> 6;
  171|  93.8k|        t *= (uint32_t) mul;
  172|  93.8k|        s = (int) (t >> 24) - 4 - 4;
  173|  93.8k|        break;
  174|      0|    }
  175|  66.2k|    case TX_32X32: {
  ------------------
  |  Branch (175:5): [True: 66.2k, False: 1.83M]
  ------------------
  176|  66.2k|        uint64_t t = (*(const uint64_t *) a & mask) >> 6;
  177|  66.2k|        t         += (*(const uint64_t *) l & mask) >> 6;
  178|  66.2k|        t *= mul;
  179|  66.2k|        s = (int) (t >> 56) - 8 - 8;
  180|  66.2k|        break;
  181|      0|    }
  182|  24.4k|    case TX_64X64: {
  ------------------
  |  Branch (182:5): [True: 24.4k, False: 1.88M]
  ------------------
  183|  24.4k|        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
  184|  24.4k|        t         += (*(const uint64_t *) &a[8] & mask) >> 6;
  185|  24.4k|        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
  186|  24.4k|        t         += (*(const uint64_t *) &l[8] & mask) >> 6;
  187|  24.4k|        t *= mul;
  188|  24.4k|        s = (int) (t >> 56) - 16 - 16;
  189|  24.4k|        break;
  190|      0|    }
  191|  49.5k|    case RTX_4X8: {
  ------------------
  |  Branch (191:5): [True: 49.5k, False: 1.85M]
  ------------------
  192|  49.5k|        uint32_t t = *(const uint8_t  *) a & (uint32_t) mask;
  193|  49.5k|        t         += *(const uint16_t *) l & (uint32_t) mask;
  194|  49.5k|        t *= 0x04040404U;
  195|  49.5k|        s = (int) (t >> 24) - 1 - 2;
  196|  49.5k|        break;
  197|      0|    }
  198|  76.3k|    case RTX_8X4: {
  ------------------
  |  Branch (198:5): [True: 76.3k, False: 1.82M]
  ------------------
  199|  76.3k|        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
  200|  76.3k|        t         += *(const uint8_t  *) l & (uint32_t) mask;
  201|  76.3k|        t *= 0x04040404U;
  202|  76.3k|        s = (int) (t >> 24) - 2 - 1;
  203|  76.3k|        break;
  204|      0|    }
  205|  35.3k|    case RTX_8X16: {
  ------------------
  |  Branch (205:5): [True: 35.3k, False: 1.87M]
  ------------------
  206|  35.3k|        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
  207|  35.3k|        t         += *(const uint32_t *) l & (uint32_t) mask;
  208|  35.3k|        t = (t >> 6) * (uint32_t) mul;
  209|  35.3k|        s = (int) (t >> 24) - 2 - 4;
  210|  35.3k|        break;
  211|      0|    }
  212|  92.2k|    case RTX_16X8: {
  ------------------
  |  Branch (212:5): [True: 92.2k, False: 1.81M]
  ------------------
  213|  92.2k|        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
  214|  92.2k|        t         += *(const uint16_t *) l & (uint32_t) mask;
  215|  92.2k|        t = (t >> 6) * (uint32_t) mul;
  216|  92.2k|        s = (int) (t >> 24) - 4 - 2;
  217|  92.2k|        break;
  218|      0|    }
  219|  26.1k|    case RTX_16X32: {
  ------------------
  |  Branch (219:5): [True: 26.1k, False: 1.87M]
  ------------------
  220|  26.1k|        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
  221|  26.1k|        t         += *(const uint64_t *) l & mask;
  222|  26.1k|        t = (t >> 6) * mul;
  223|  26.1k|        s = (int) (t >> 56) - 4 - 8;
  224|  26.1k|        break;
  225|      0|    }
  226|  61.9k|    case RTX_32X16: {
  ------------------
  |  Branch (226:5): [True: 61.9k, False: 1.84M]
  ------------------
  227|  61.9k|        uint64_t t = *(const uint64_t *) a & mask;
  228|  61.9k|        t         += *(const uint32_t *) l & (uint32_t) mask;
  229|  61.9k|        t = (t >> 6) * mul;
  230|  61.9k|        s = (int) (t >> 56) - 8 - 4;
  231|  61.9k|        break;
  232|      0|    }
  233|  12.3k|    case RTX_32X64: {
  ------------------
  |  Branch (233:5): [True: 12.3k, False: 1.89M]
  ------------------
  234|  12.3k|        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
  235|  12.3k|        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
  236|  12.3k|        t         += (*(const uint64_t *) &l[8] & mask) >> 6;
  237|  12.3k|        t *= mul;
  238|  12.3k|        s = (int) (t >> 56) - 8 - 16;
  239|  12.3k|        break;
  240|      0|    }
  241|  24.2k|    case RTX_64X32: {
  ------------------
  |  Branch (241:5): [True: 24.2k, False: 1.88M]
  ------------------
  242|  24.2k|        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
  243|  24.2k|        t         += (*(const uint64_t *) &a[8] & mask) >> 6;
  244|  24.2k|        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
  245|  24.2k|        t *= mul;
  246|  24.2k|        s = (int) (t >> 56) - 16 - 8;
  247|  24.2k|        break;
  248|      0|    }
  249|  9.41k|    case RTX_4X16: {
  ------------------
  |  Branch (249:5): [True: 9.41k, False: 1.89M]
  ------------------
  250|  9.41k|        uint32_t t = *(const uint8_t  *) a & (uint32_t) mask;
  251|  9.41k|        t         += *(const uint32_t *) l & (uint32_t) mask;
  252|  9.41k|        t = (t >> 6) * (uint32_t) mul;
  253|  9.41k|        s = (int) (t >> 24) - 1 - 4;
  254|  9.41k|        break;
  255|      0|    }
  256|  22.5k|    case RTX_16X4: {
  ------------------
  |  Branch (256:5): [True: 22.5k, False: 1.88M]
  ------------------
  257|  22.5k|        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
  258|  22.5k|        t         += *(const uint8_t  *) l & (uint32_t) mask;
  259|  22.5k|        t = (t >> 6) * (uint32_t) mul;
  260|  22.5k|        s = (int) (t >> 24) - 4 - 1;
  261|  22.5k|        break;
  262|      0|    }
  263|  7.82k|    case RTX_8X32: {
  ------------------
  |  Branch (263:5): [True: 7.82k, False: 1.89M]
  ------------------
  264|  7.82k|        uint64_t t = *(const uint16_t *) a & (uint32_t) mask;
  265|  7.82k|        t         += *(const uint64_t *) l & mask;
  266|  7.82k|        t = (t >> 6) * mul;
  267|  7.82k|        s = (int) (t >> 56) - 2 - 8;
  268|  7.82k|        break;
  269|      0|    }
  270|  12.2k|    case RTX_32X8: {
  ------------------
  |  Branch (270:5): [True: 12.2k, False: 1.89M]
  ------------------
  271|  12.2k|        uint64_t t = *(const uint64_t *) a & mask;
  272|  12.2k|        t         += *(const uint16_t *) l & (uint32_t) mask;
  273|  12.2k|        t = (t >> 6) * mul;
  274|  12.2k|        s = (int) (t >> 56) - 8 - 2;
  275|  12.2k|        break;
  276|      0|    }
  277|  1.40k|    case RTX_16X64: {
  ------------------
  |  Branch (277:5): [True: 1.40k, False: 1.90M]
  ------------------
  278|  1.40k|        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
  279|  1.40k|        t         += *(const uint64_t *) &l[0] & mask;
  280|  1.40k|        t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6);
  281|  1.40k|        t *= mul;
  282|  1.40k|        s = (int) (t >> 56) - 4 - 16;
  283|  1.40k|        break;
  284|      0|    }
  285|  1.40k|    case RTX_64X16: {
  ------------------
  |  Branch (285:5): [True: 1.40k, False: 1.90M]
  ------------------
  286|  1.40k|        uint64_t t = *(const uint64_t *) &a[0] & mask;
  287|  1.40k|        t         += *(const uint32_t *) l & (uint32_t) mask;
  288|  1.40k|        t = (t >> 6) + ((*(const uint64_t *) &a[8] & mask) >> 6);
  289|  1.40k|        t *= mul;
  290|  1.40k|        s = (int) (t >> 56) - 16 - 4;
  291|  1.40k|        break;
  292|      0|    }
  293|  1.90M|    }
  294|       |
  295|  1.90M|    return (s != 0) + (s > 0);
  296|  1.90M|}
recon_tmpl.c:read_golomb:
   49|   603k|static inline unsigned read_golomb(MsacContext *const msac) {
   50|   603k|    int len = 0;
   51|   603k|    unsigned val = 1;
   52|       |
   53|  1.14M|    while (!dav1d_msac_decode_bool_equi(msac) && len < 32) len++;
  ------------------
  |  |   53|  1.14M|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
  |  Branch (53:12): [True: 538k, False: 603k]
  |  Branch (53:50): [True: 538k, False: 18.4E]
  ------------------
   54|  1.14M|    while (len--) val = (val << 1) + dav1d_msac_decode_bool_equi(msac);
  ------------------
  |  |   53|   538k|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
  |  Branch (54:12): [True: 538k, False: 603k]
  ------------------
   55|       |
   56|   603k|    return val - 1;
   57|   603k|}
recon_tmpl.c:mc:
  944|  3.02M|{
  945|  3.02M|    assert((dst8 != NULL) ^ (dst16 != NULL));
  ------------------
  |  |  140|  3.02M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 3.02M]
  |  |  |  Branch (140:68): [Folded, False: 3.02M]
  |  |  ------------------
  ------------------
  946|  3.02M|    const Dav1dFrameContext *const f = t->f;
  947|  3.02M|    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
  ------------------
  |  Branch (947:24): [True: 1.97M, False: 1.04M]
  |  Branch (947:32): [True: 246k, False: 1.73M]
  ------------------
  948|  3.02M|    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
  ------------------
  |  Branch (948:24): [True: 1.97M, False: 1.04M]
  |  Branch (948:32): [True: 247k, False: 1.73M]
  ------------------
  949|  3.02M|    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
  950|  3.02M|    const int mvx = mv.x, mvy = mv.y;
  951|  3.02M|    const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
  952|  3.02M|    ptrdiff_t ref_stride = refp->p.stride[!!pl];
  953|  3.02M|    const pixel *ref;
  954|       |
  955|  3.02M|    if (refp->p.p.w == f->cur.p.w && refp->p.p.h == f->cur.p.h) {
  ------------------
  |  Branch (955:9): [True: 3.00M, False: 11.7k]
  |  Branch (955:38): [True: 3.01M, False: 18.4E]
  ------------------
  956|  3.01M|        const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
  957|  3.01M|        const int dy = by * v_mul + (mvy >> (3 + ss_ver));
  958|  3.01M|        int w, h;
  959|       |
  960|  3.01M|        if (refp->p.data[0] != f->cur.data[0]) { // i.e. not for intrabc
  ------------------
  |  Branch (960:13): [True: 332k, False: 2.68M]
  ------------------
  961|   332k|            w = (f->cur.p.w + ss_hor) >> ss_hor;
  962|   332k|            h = (f->cur.p.h + ss_ver) >> ss_ver;
  963|  2.68M|        } else {
  964|  2.68M|            w = f->bw * 4 >> ss_hor;
  965|  2.68M|            h = f->bh * 4 >> ss_ver;
  966|  2.68M|        }
  967|  3.01M|        if (dx < !!mx * 3 || dy < !!my * 3 ||
  ------------------
  |  Branch (967:13): [True: 15.8k, False: 2.99M]
  |  Branch (967:30): [True: 39.5k, False: 2.95M]
  ------------------
  968|  2.95M|            dx + bw4 * h_mul + !!mx * 4 > w ||
  ------------------
  |  Branch (968:13): [True: 47.4k, False: 2.91M]
  ------------------
  969|  2.91M|            dy + bh4 * v_mul + !!my * 4 > h)
  ------------------
  |  Branch (969:13): [True: 124k, False: 2.78M]
  ------------------
  970|   222k|        {
  971|   222k|            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
  ------------------
  |  |   51|   222k|#define bitfn(x) x##_8bpc
  ------------------
  972|   222k|            f->dsp->mc.emu_edge(bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7,
  973|   222k|                                w, h, dx - !!mx * 3, dy - !!my * 3,
  974|   222k|                                emu_edge_buf, 192 * sizeof(pixel),
  975|   222k|                                refp->p.data[pl], ref_stride);
  976|   222k|            ref = &emu_edge_buf[192 * !!my * 3 + !!mx * 3];
  977|   222k|            ref_stride = 192 * sizeof(pixel);
  978|  2.79M|        } else {
  979|  2.79M|            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
  ------------------
  |  |   53|  2.79M|#define PXSTRIDE(x) (x)
  ------------------
  980|  2.79M|        }
  981|       |
  982|  3.01M|        if (dst8 != NULL) {
  ------------------
  |  Branch (982:13): [True: 2.92M, False: 88.3k]
  ------------------
  983|  2.92M|            f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,
  984|  2.92M|                                     bh4 * v_mul, mx << !ss_hor, my << !ss_ver
  985|  2.92M|                                     HIGHBD_CALL_SUFFIX);
  986|  2.92M|        } else {
  987|  88.3k|            f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,
  988|  88.3k|                                      bh4 * v_mul, mx << !ss_hor, my << !ss_ver
  989|  88.3k|                                      HIGHBD_CALL_SUFFIX);
  990|  88.3k|        }
  991|  3.01M|    } else {
  992|  6.61k|        assert(refp != &f->sr_cur);
  ------------------
  |  |  140|  6.61k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 6.61k]
  |  |  |  Branch (140:68): [Folded, False: 6.61k]
  |  |  ------------------
  ------------------
  993|       |
  994|  6.61k|        const int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver);
  995|  6.61k|        const int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor);
  996|  6.61k|#define scale_mv(res, val, scale) do { \
  997|  6.61k|            const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \
  998|  6.61k|            res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32;     \
  999|  6.61k|        } while (0)
 1000|  6.61k|        int pos_y, pos_x;
 1001|  6.61k|        scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale);
  ------------------
  |  |  996|  6.61k|#define scale_mv(res, val, scale) do { \
  |  |  997|  6.61k|            const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \
  |  |  998|  6.61k|            res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32;     \
  |  |  999|  6.61k|        } while (0)
  |  |  ------------------
  |  |  |  Branch (999:18): [Folded, False: 6.61k]
  |  |  ------------------
  ------------------
 1002|  6.61k|        scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale);
  ------------------
  |  |  996|  6.61k|#define scale_mv(res, val, scale) do { \
  |  |  997|  6.61k|            const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \
  |  |  998|  6.61k|            res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32;     \
  |  |  999|  6.61k|        } while (0)
  |  |  ------------------
  |  |  |  Branch (999:18): [Folded, False: 6.61k]
  |  |  ------------------
  ------------------
 1003|  6.61k|#undef scale_mv
 1004|  6.61k|        const int left = pos_x >> 10;
 1005|  6.61k|        const int top = pos_y >> 10;
 1006|  6.61k|        const int right =
 1007|  6.61k|            ((pos_x + (bw4 * h_mul - 1) * f->svc[refidx][0].step) >> 10) + 1;
 1008|  6.61k|        const int bottom =
 1009|  6.61k|            ((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1;
 1010|       |
 1011|  6.61k|        if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  6.61k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 6.61k]
  |  |  ------------------
  |  |   35|  6.61k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  6.61k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1012|      0|            printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n",
 1013|      0|                   left, top, orig_pos_x, f->svc[refidx][0].scale, refidx,
 1014|      0|                   right-left, bottom-top,
 1015|      0|                   f->svc[refidx][0].step, f->svc[refidx][1].step);
 1016|       |
 1017|  6.61k|        const int w = (refp->p.p.w + ss_hor) >> ss_hor;
 1018|  6.61k|        const int h = (refp->p.p.h + ss_ver) >> ss_ver;
 1019|  13.7k|        if (left < 3 || top < 3 || right + 4 > w || bottom + 4 > h) {
  ------------------
  |  Branch (1019:13): [True: 18.4E, False: 10.0k]
  |  Branch (1019:25): [True: 4.84k, False: 5.21k]
  |  Branch (1019:36): [True: 1.98k, False: 3.23k]
  |  Branch (1019:53): [True: 1.56k, False: 1.66k]
  ------------------
 1020|  13.7k|            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
  ------------------
  |  |   51|  13.7k|#define bitfn(x) x##_8bpc
  ------------------
 1021|  13.7k|            f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7,
 1022|  13.7k|                                w, h, left - 3, top - 3,
 1023|  13.7k|                                emu_edge_buf, 320 * sizeof(pixel),
 1024|  13.7k|                                refp->p.data[pl], ref_stride);
 1025|  13.7k|            ref = &emu_edge_buf[320 * 3 + 3];
 1026|  13.7k|            ref_stride = 320 * sizeof(pixel);
 1027|  13.7k|            if (DEBUG_BLOCK_INFO) printf("Emu\n");
  ------------------
  |  |   34|  13.7k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 13.7k]
  |  |  ------------------
  |  |   35|  13.7k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  13.7k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1028|  18.4E|        } else {
 1029|  18.4E|            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left;
  ------------------
  |  |   53|  18.4E|#define PXSTRIDE(x) (x)
  ------------------
 1030|  18.4E|        }
 1031|       |
 1032|  11.8k|        if (dst8 != NULL) {
  ------------------
  |  Branch (1032:13): [True: 11.8k, False: 18.4E]
  ------------------
 1033|  11.8k|            f->dsp->mc.mc_scaled[filter_2d](dst8, dst_stride, ref, ref_stride,
 1034|  11.8k|                                            bw4 * h_mul, bh4 * v_mul,
 1035|  11.8k|                                            pos_x & 0x3ff, pos_y & 0x3ff,
 1036|  11.8k|                                            f->svc[refidx][0].step,
 1037|  11.8k|                                            f->svc[refidx][1].step
 1038|  11.8k|                                            HIGHBD_CALL_SUFFIX);
 1039|  18.4E|        } else {
 1040|  18.4E|            f->dsp->mc.mct_scaled[filter_2d](dst16, ref, ref_stride,
 1041|  18.4E|                                             bw4 * h_mul, bh4 * v_mul,
 1042|  18.4E|                                             pos_x & 0x3ff, pos_y & 0x3ff,
 1043|  18.4E|                                             f->svc[refidx][0].step,
 1044|  18.4E|                                             f->svc[refidx][1].step
 1045|  18.4E|                                             HIGHBD_CALL_SUFFIX);
 1046|  18.4E|        }
 1047|  6.61k|    }
 1048|       |
 1049|  3.02M|    return 0;
 1050|  3.02M|}
recon_tmpl.c:warp_affine:
 1120|  10.8k|{
 1121|  10.8k|    assert((dst8 != NULL) ^ (dst16 != NULL));
  ------------------
  |  |  140|  10.8k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 10.8k]
  |  |  |  Branch (140:68): [Folded, False: 10.8k]
  |  |  ------------------
  ------------------
 1122|  10.8k|    const Dav1dFrameContext *const f = t->f;
 1123|  10.8k|    const Dav1dDSPContext *const dsp = f->dsp;
 1124|  10.8k|    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
  ------------------
  |  Branch (1124:24): [True: 4.12k, False: 6.68k]
  |  Branch (1124:32): [True: 3.70k, False: 422]
  ------------------
 1125|  10.8k|    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
  ------------------
  |  Branch (1125:24): [True: 4.12k, False: 6.68k]
  |  Branch (1125:32): [True: 3.74k, False: 378]
  ------------------
 1126|  10.8k|    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
 1127|  10.8k|    assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
  ------------------
  |  |  140|  21.6k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 10.8k, False: 0]
  |  |  |  Branch (140:30): [True: 10.8k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 10.8k]
  |  |  ------------------
  ------------------
 1128|  10.8k|    const int32_t *const mat = wmp->matrix;
 1129|  10.8k|    const int width = (refp->p.p.w + ss_hor) >> ss_hor;
 1130|  10.8k|    const int height = (refp->p.p.h + ss_ver) >> ss_ver;
 1131|       |
 1132|  42.1k|    for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
  ------------------
  |  Branch (1132:21): [True: 31.3k, False: 10.8k]
  ------------------
 1133|  31.3k|        const int src_y = t->by * 4 + ((y + 4) << ss_ver);
 1134|  31.3k|        const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0];
 1135|  31.3k|        const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
 1136|  86.0k|        for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
  ------------------
  |  Branch (1136:25): [True: 54.7k, False: 31.3k]
  ------------------
 1137|       |            // calculate transformation relative to center of 8x8 block in
 1138|       |            // luma pixel units
 1139|  54.7k|            const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
 1140|  54.7k|            const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor;
 1141|  54.7k|            const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
 1142|       |
 1143|  54.7k|            const int dx = (int) (mvx >> 16) - 4;
 1144|  54.7k|            const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
 1145|  54.7k|                                                   wmp->u.p.beta  * 7) & ~0x3f;
 1146|  54.7k|            const int dy = (int) (mvy >> 16) - 4;
 1147|  54.7k|            const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
 1148|  54.7k|                                                   wmp->u.p.delta * 4) & ~0x3f;
 1149|       |
 1150|  54.7k|            const pixel *ref_ptr;
 1151|  54.7k|            ptrdiff_t ref_stride = refp->p.stride[!!pl];
 1152|       |
 1153|  54.7k|            if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
  ------------------
  |  Branch (1153:17): [True: 2.58k, False: 52.1k]
  |  Branch (1153:27): [True: 6.93k, False: 45.2k]
  |  Branch (1153:49): [True: 4.13k, False: 41.0k]
  |  Branch (1153:59): [True: 2.43k, False: 38.6k]
  ------------------
 1154|  16.1k|                pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
  ------------------
  |  |   51|  16.1k|#define bitfn(x) x##_8bpc
  ------------------
 1155|  16.1k|                f->dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3,
 1156|  16.1k|                                    emu_edge_buf, 32 * sizeof(pixel),
 1157|  16.1k|                                    refp->p.data[pl], ref_stride);
 1158|  16.1k|                ref_ptr = &emu_edge_buf[32 * 3 + 3];
 1159|  16.1k|                ref_stride = 32 * sizeof(pixel);
 1160|  38.6k|            } else {
 1161|  38.6k|                ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
  ------------------
  |  |   53|  38.6k|#define PXSTRIDE(x) (x)
  ------------------
 1162|  38.6k|            }
 1163|  54.7k|            if (dst16 != NULL)
  ------------------
  |  Branch (1163:17): [True: 7.79k, False: 46.9k]
  ------------------
 1164|  7.79k|                dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
 1165|  7.79k|                                 wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
 1166|  46.9k|            else
 1167|  46.9k|                dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
 1168|  46.9k|                                wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
 1169|  54.7k|        }
 1170|  31.3k|        if (dst8) dst8  += 8 * PXSTRIDE(dstride);
  ------------------
  |  |   53|  29.1k|#define PXSTRIDE(x) (x)
  ------------------
  |  Branch (1170:13): [True: 29.1k, False: 2.16k]
  ------------------
 1171|  2.16k|        else      dst16 += 8 * dstride;
 1172|  31.3k|    }
 1173|  10.8k|    return 0;
 1174|  10.8k|}
recon_tmpl.c:obmc:
 1056|  42.3k|{
 1057|  42.3k|    assert(!(t->bx & 1) && !(t->by & 1));
  ------------------
  |  |  140|  84.6k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 42.3k, False: 0]
  |  |  |  Branch (140:30): [True: 42.3k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 42.3k]
  |  |  ------------------
  ------------------
 1058|  42.3k|    const Dav1dFrameContext *const f = t->f;
 1059|  42.3k|    /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5];
 1060|  42.3k|    pixel *const lap = bitfn(t->scratch.lap);
  ------------------
  |  |   51|  42.3k|#define bitfn(x) x##_8bpc
  ------------------
 1061|  42.3k|    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
  ------------------
  |  Branch (1061:24): [True: 28.1k, False: 14.2k]
  |  Branch (1061:32): [True: 27.1k, False: 1.00k]
  ------------------
 1062|  42.3k|    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
  ------------------
  |  Branch (1062:24): [True: 28.1k, False: 14.2k]
  |  Branch (1062:32): [True: 27.1k, False: 970]
  ------------------
 1063|  42.3k|    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
 1064|  42.3k|    int res;
 1065|       |
 1066|  42.3k|    if (t->by > t->ts->tiling.row_start &&
  ------------------
  |  Branch (1066:9): [True: 33.2k, False: 9.13k]
  ------------------
 1067|  33.2k|        (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
  ------------------
  |  Branch (1067:10): [True: 11.1k, False: 22.0k]
  |  Branch (1067:17): [True: 6.32k, False: 15.7k]
  ------------------
 1068|  17.4k|    {
 1069|  37.0k|        for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
  ------------------
  |  Branch (1069:32): [True: 19.5k, False: 17.4k]
  |  Branch (1069:42): [True: 19.5k, False: 37]
  ------------------
 1070|       |            // only odd blocks are considered for overlap handling, hence +1
 1071|  19.5k|            const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
 1072|  19.5k|            const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
 1073|  19.5k|            const int step4 = iclip(a_b_dim[0], 2, 16);
 1074|       |
 1075|  19.5k|            if (a_r->ref.ref[0] > 0) {
  ------------------
  |  Branch (1075:17): [True: 18.8k, False: 673]
  ------------------
 1076|  18.8k|                const int ow4 = imin(step4, b_dim[0]);
 1077|  18.8k|                const int oh4 = imin(b_dim[1], 16) >> 1;
 1078|  18.8k|                res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
 1079|  18.8k|                         t->bx + x, t->by, pl, a_r->mv.mv[0],
 1080|  18.8k|                         &f->refp[a_r->ref.ref[0] - 1], a_r->ref.ref[0] - 1,
 1081|  18.8k|                         dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
 1082|  18.8k|                if (res) return res;
  ------------------
  |  Branch (1082:21): [True: 0, False: 18.8k]
  ------------------
 1083|  18.8k|                f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,
 1084|  18.8k|                                   h_mul * ow4, v_mul * oh4);
 1085|  18.8k|                i++;
 1086|  18.8k|            }
 1087|  19.5k|            x += step4;
 1088|  19.5k|        }
 1089|  17.4k|    }
 1090|       |
 1091|  42.3k|    if (t->bx > t->ts->tiling.col_start)
  ------------------
  |  Branch (1091:9): [True: 39.2k, False: 3.10k]
  ------------------
 1092|  82.4k|        for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
  ------------------
  |  Branch (1092:32): [True: 43.4k, False: 39.0k]
  |  Branch (1092:42): [True: 43.2k, False: 197]
  ------------------
 1093|       |            // only odd blocks are considered for overlap handling, hence +1
 1094|  43.2k|            const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
 1095|  43.2k|            const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
 1096|  43.2k|            const int step4 = iclip(l_b_dim[1], 2, 16);
 1097|       |
 1098|  43.2k|            if (l_r->ref.ref[0] > 0) {
  ------------------
  |  Branch (1098:17): [True: 42.2k, False: 998]
  ------------------
 1099|  42.2k|                const int ow4 = imin(b_dim[0], 16) >> 1;
 1100|  42.2k|                const int oh4 = imin(step4, b_dim[1]);
 1101|  42.2k|                res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
 1102|  42.2k|                         t->bx, t->by + y, pl, l_r->mv.mv[0],
 1103|  42.2k|                         &f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1,
 1104|  42.2k|                         dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
 1105|  42.2k|                if (res) return res;
  ------------------
  |  Branch (1105:21): [True: 0, False: 42.2k]
  ------------------
 1106|  42.2k|                f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
  ------------------
  |  |   53|  42.2k|#define PXSTRIDE(x) (x)
  ------------------
 1107|  42.2k|                                   dst_stride, lap, h_mul * ow4, v_mul * oh4);
 1108|  42.2k|                i++;
 1109|  42.2k|            }
 1110|  43.2k|            y += step4;
 1111|  43.2k|        }
 1112|  42.3k|    return 0;
 1113|  42.3k|}
dav1d_recon_b_intra_16bpc:
 1179|  1.07M|{
 1180|  1.07M|    Dav1dTileState *const ts = t->ts;
 1181|  1.07M|    const Dav1dFrameContext *const f = t->f;
 1182|  1.07M|    const Dav1dDSPContext *const dsp = f->dsp;
 1183|  1.07M|    const int bx4 = t->bx & 31, by4 = t->by & 31;
 1184|  1.07M|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 1185|  1.07M|    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
 1186|  1.07M|    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
 1187|  1.07M|    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
 1188|  1.07M|    const int bw4 = b_dim[0], bh4 = b_dim[1];
 1189|  1.07M|    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
 1190|  1.07M|    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
 1191|  1.07M|    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
  ------------------
  |  Branch (1191:28): [True: 745k, False: 325k]
  ------------------
 1192|   745k|                           (bw4 > ss_hor || t->bx & 1) &&
  ------------------
  |  Branch (1192:29): [True: 717k, False: 28.3k]
  |  Branch (1192:45): [True: 14.2k, False: 14.1k]
  ------------------
 1193|   731k|                           (bh4 > ss_ver || t->by & 1);
  ------------------
  |  Branch (1193:29): [True: 712k, False: 18.8k]
  |  Branch (1193:45): [True: 9.44k, False: 9.43k]
  ------------------
 1194|  1.07M|    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
 1195|  1.07M|    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
 1196|       |
 1197|       |    // coefficient coding
 1198|  1.07M|    pixel *const edge = bitfn(t->scratch.edge) + 128;
  ------------------
  |  |   77|  1.07M|#define bitfn(x) x##_16bpc
  ------------------
 1199|  1.07M|    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
 1200|       |
 1201|  1.07M|    const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10;
 1202|       |
 1203|  2.15M|    for (int init_y = 0; init_y < h4; init_y += 16) {
  ------------------
  |  Branch (1203:26): [True: 1.08M, False: 1.06M]
  ------------------
 1204|  1.08M|        const int sub_h4 = imin(h4, 16 + init_y);
 1205|  1.08M|        const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
 1206|  2.17M|        for (int init_x = 0; init_x < w4; init_x += 16) {
  ------------------
  |  Branch (1206:30): [True: 1.09M, False: 1.08M]
  ------------------
 1207|  1.09M|            if (b->pal_sz[0]) {
  ------------------
  |  Branch (1207:17): [True: 13.0k, False: 1.07M]
  ------------------
 1208|  13.0k|                pixel *dst = ((pixel *) f->cur.data[0]) +
 1209|  13.0k|                             4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
 1210|  13.0k|                const uint8_t *pal_idx;
 1211|  13.0k|                if (t->frame_thread.pass) {
  ------------------
  |  Branch (1211:21): [True: 0, False: 13.0k]
  ------------------
 1212|      0|                    const int p = t->frame_thread.pass & 1;
 1213|      0|                    assert(ts->frame_thread[p].pal_idx);
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1214|      0|                    pal_idx = ts->frame_thread[p].pal_idx;
 1215|      0|                    ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
 1216|  13.0k|                } else {
 1217|  13.0k|                    pal_idx = t->scratch.pal_idx_y;
 1218|  13.0k|                }
 1219|  13.0k|                const pixel *const pal = t->frame_thread.pass ?
  ------------------
  |  Branch (1219:42): [True: 0, False: 13.0k]
  ------------------
 1220|      0|                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
 1221|      0|                                        ((t->bx >> 1) + (t->by & 1))][0] :
 1222|  13.0k|                    bytefn(t->scratch.pal)[0];
  ------------------
  |  |   87|  13.0k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  13.0k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 1223|  13.0k|                f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
 1224|  13.0k|                                       pal_idx, bw4 * 4, bh4 * 4);
 1225|  13.0k|                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   34|  13.0k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 13.0k]
  |  |  ------------------
  |  |   35|  13.0k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  13.0k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                              if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1226|      0|                    hex_dump(dst, PXSTRIDE(f->cur.stride[0]),
 1227|      0|                             bw4 * 4, bh4 * 4, "y-pal-pred");
 1228|  13.0k|            }
 1229|       |
 1230|  1.09M|            const int intra_flags = (sm_flag(t->a, bx4) |
 1231|  1.09M|                                     sm_flag(&t->l, by4) |
 1232|  1.09M|                                     intra_edge_filter_flag);
 1233|  1.09M|            const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
  ------------------
  |  Branch (1233:35): [True: 9.15k, False: 1.08M]
  |  Branch (1233:58): [True: 13.2k, False: 1.06M]
  ------------------
 1234|  1.08M|                              intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
 1235|  1.09M|            const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
  ------------------
  |  Branch (1235:35): [True: 9.16k, False: 1.08M]
  |  Branch (1235:48): [True: 13.2k, False: 1.06M]
  ------------------
 1236|  1.08M|                              intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
 1237|  1.09M|            int y, x;
 1238|  1.09M|            const int sub_w4 = imin(w4, init_x + 16);
 1239|  2.49M|            for (y = init_y, t->by += init_y; y < sub_h4;
  ------------------
  |  Branch (1239:47): [True: 1.40M, False: 1.09M]
  ------------------
 1240|  1.40M|                 y += t_dim->h, t->by += t_dim->h)
 1241|  1.40M|            {
 1242|  1.40M|                pixel *dst = ((pixel *) f->cur.data[0]) +
 1243|  1.40M|                               4 * (t->by * PXSTRIDE(f->cur.stride[0]) +
 1244|  1.40M|                                    t->bx + init_x);
 1245|  3.29M|                for (x = init_x, t->bx += init_x; x < sub_w4;
  ------------------
  |  Branch (1245:51): [True: 1.89M, False: 1.40M]
  ------------------
 1246|  1.89M|                     x += t_dim->w, t->bx += t_dim->w)
 1247|  1.89M|                {
 1248|  1.89M|                    if (b->pal_sz[0]) goto skip_y_pred;
  ------------------
  |  Branch (1248:25): [True: 21.5k, False: 1.87M]
  ------------------
 1249|       |
 1250|  1.87M|                    int angle = b->y_angle;
 1251|  1.87M|                    const enum EdgeFlags edge_flags =
 1252|  1.87M|                        (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
  ------------------
  |  Branch (1252:28): [True: 605k, False: 1.26M]
  |  Branch (1252:42): [True: 237k, False: 1.03M]
  |  Branch (1252:57): [True: 516k, False: 329k]
  ------------------
 1253|  1.36M|                             0 : EDGE_I444_TOP_HAS_RIGHT) |
 1254|  1.87M|                        ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
  ------------------
  |  Branch (1254:27): [True: 490k, False: 1.38M]
  |  Branch (1254:42): [True: 674k, False: 710k]
  |  Branch (1254:56): [True: 500k, False: 173k]
  ------------------
 1255|   991k|                             0 : EDGE_I444_LEFT_HAS_BOTTOM);
 1256|  1.87M|                    const pixel *top_sb_edge = NULL;
 1257|  1.87M|                    if (!(t->by & (f->sb_step - 1))) {
  ------------------
  |  Branch (1257:25): [True: 635k, False: 1.24M]
  ------------------
 1258|   635k|                        top_sb_edge = f->ipred_edge[0];
 1259|   635k|                        const int sby = t->by >> f->sb_shift;
 1260|   635k|                        top_sb_edge += f->sb128w * 128 * (sby - 1);
 1261|   635k|                    }
 1262|  1.87M|                    const enum IntraPredMode m =
 1263|  1.87M|                        bytefn(dav1d_prepare_intra_edges)(t->bx,
  ------------------
  |  |   87|  1.87M|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  1.87M|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 1264|  1.87M|                                                          t->bx > ts->tiling.col_start,
 1265|  1.87M|                                                          t->by,
 1266|  1.87M|                                                          t->by > ts->tiling.row_start,
 1267|  1.87M|                                                          ts->tiling.col_end,
 1268|  1.87M|                                                          ts->tiling.row_end,
 1269|  1.87M|                                                          edge_flags, dst,
 1270|  1.87M|                                                          f->cur.stride[0], top_sb_edge,
 1271|  1.87M|                                                          b->y_mode, &angle,
 1272|  1.87M|                                                          t_dim->w, t_dim->h,
 1273|  1.87M|                                                          f->seq_hdr->intra_edge_filter,
 1274|  1.87M|                                                          edge HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|  1.87M|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1275|  1.87M|                    dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
 1276|  1.87M|                                             t_dim->w * 4, t_dim->h * 4,
 1277|  1.87M|                                             angle | intra_flags,
 1278|  1.87M|                                             4 * f->bw - 4 * t->bx,
 1279|  1.87M|                                             4 * f->bh - 4 * t->by
 1280|  1.87M|                                             HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|  1.87M|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1281|       |
 1282|  1.87M|                    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   34|  1.87M|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 1.87M]
  |  |  ------------------
  |  |   35|  1.87M|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  1.87M|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                  if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1283|      0|                        hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
 1284|      0|                                 t_dim->h * 4, 2, "l");
 1285|      0|                        hex_dump(edge, 0, 1, 1, "tl");
 1286|      0|                        hex_dump(edge + 1, t_dim->w * 4,
 1287|      0|                                 t_dim->w * 4, 2, "t");
 1288|      0|                        hex_dump(dst, f->cur.stride[0],
 1289|      0|                                 t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
 1290|      0|                    }
 1291|       |
 1292|  1.89M|                skip_y_pred: {}
 1293|  1.89M|                    if (!b->skip) {
  ------------------
  |  Branch (1293:25): [True: 846k, False: 1.05M]
  ------------------
 1294|   846k|                        coef *cf;
 1295|   846k|                        int eob;
 1296|   846k|                        enum TxfmType txtp;
 1297|   846k|                        if (t->frame_thread.pass) {
  ------------------
  |  Branch (1297:29): [True: 0, False: 846k]
  ------------------
 1298|      0|                            const int p = t->frame_thread.pass & 1;
 1299|      0|                            const int cbi = *ts->frame_thread[p].cbi++;
 1300|      0|                            cf = ts->frame_thread[p].cf;
 1301|      0|                            ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
 1302|      0|                            eob  = cbi >> 5;
 1303|      0|                            txtp = cbi & 0x1f;
 1304|   846k|                        } else {
 1305|   846k|                            uint8_t cf_ctx;
 1306|   846k|                            cf = bitfn(t->cf);
  ------------------
  |  |   77|   846k|#define bitfn(x) x##_16bpc
  ------------------
 1307|   846k|                            eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
 1308|   846k|                                               &t->l.lcoef[by4 + y], b->tx, bs,
 1309|   846k|                                               b, 1, 0, cf, &txtp, &cf_ctx);
 1310|   846k|                            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|   846k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 846k]
  |  |  ------------------
  |  |   35|   846k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   846k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1311|      0|                                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
 1312|      0|                                       b->tx, txtp, eob, ts->msac.rng);
 1313|   846k|                            dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
 1314|   846k|                            dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
 1315|   846k|                        }
 1316|   846k|                        if (eob >= 0) {
  ------------------
  |  Branch (1316:29): [True: 549k, False: 296k]
  ------------------
 1317|   549k|                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   34|   549k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 549k]
  |  |  ------------------
  |  |   35|   549k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   549k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                          if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1318|      0|                                coef_dump(cf, imin(t_dim->h, 8) * 4,
 1319|      0|                                          imin(t_dim->w, 8) * 4, 3, "dq");
 1320|   549k|                            dsp->itx.itxfm_add[b->tx]
 1321|   549k|                                              [txtp](dst,
 1322|   549k|                                                     f->cur.stride[0],
 1323|   549k|                                                     cf, eob HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|   549k|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1324|   549k|                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   34|   549k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 549k]
  |  |  ------------------
  |  |   35|   549k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   549k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                          if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1325|      0|                                hex_dump(dst, f->cur.stride[0],
 1326|      0|                                         t_dim->w * 4, t_dim->h * 4, "recon");
 1327|   549k|                        }
 1328|  1.05M|                    } else if (!t->frame_thread.pass) {
  ------------------
  |  Branch (1328:32): [True: 1.05M, False: 18.4E]
  ------------------
 1329|  1.05M|                        dav1d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4 + x], 0x40);
 1330|  1.05M|                        dav1d_memset_pow2[t_dim->lh](&t->l.lcoef[by4 + y], 0x40);
 1331|  1.05M|                    }
 1332|  1.89M|                    dst += 4 * t_dim->w;
 1333|  1.89M|                }
 1334|  1.40M|                t->bx -= x;
 1335|  1.40M|            }
 1336|  1.09M|            t->by -= y;
 1337|       |
 1338|  1.09M|            if (!has_chroma) continue;
  ------------------
  |  Branch (1338:17): [True: 356k, False: 736k]
  ------------------
 1339|       |
 1340|   736k|            const ptrdiff_t stride = f->cur.stride[1];
 1341|       |
 1342|   736k|            if (b->uv_mode == CFL_PRED) {
  ------------------
  |  Branch (1342:17): [True: 378k, False: 357k]
  ------------------
 1343|   378k|                assert(!init_x && !init_y);
  ------------------
  |  |  140|   757k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 378k, False: 26]
  |  |  |  Branch (140:30): [True: 378k, False: 18.4E]
  |  |  |  Branch (140:68): [Folded, False: 378k]
  |  |  ------------------
  ------------------
 1344|       |
 1345|   378k|                int16_t *const ac = t->scratch.ac;
 1346|   378k|                pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) +
 1347|   378k|                                 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]);
 1348|   378k|                const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
 1349|   378k|                                              (t->by >> ss_ver) * PXSTRIDE(stride));
 1350|   378k|                pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off,
 1351|   378k|                                           ((pixel *) f->cur.data[2]) + uv_off };
 1352|       |
 1353|   378k|                const int furthest_r =
 1354|   378k|                    ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
 1355|   378k|                const int furthest_b =
 1356|   378k|                    ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
 1357|   378k|                dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0],
 1358|   378k|                                                         cbw4 - (furthest_r >> ss_hor),
 1359|   378k|                                                         cbh4 - (furthest_b >> ss_ver),
 1360|   378k|                                                         cbw4 * 4, cbh4 * 4);
 1361|  1.13M|                for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1361:34): [True: 757k, False: 378k]
  ------------------
 1362|   757k|                    if (!b->cfl_alpha[pl]) continue;
  ------------------
  |  Branch (1362:25): [True: 34.5k, False: 723k]
  ------------------
 1363|   723k|                    int angle = 0;
 1364|   723k|                    const pixel *top_sb_edge = NULL;
 1365|   723k|                    if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
  ------------------
  |  Branch (1365:25): [True: 373k, False: 349k]
  ------------------
 1366|   373k|                        top_sb_edge = f->ipred_edge[pl + 1];
 1367|   373k|                        const int sby = t->by >> f->sb_shift;
 1368|   373k|                        top_sb_edge += f->sb128w * 128 * (sby - 1);
 1369|   373k|                    }
 1370|   723k|                    const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
 1371|   723k|                    const int xstart = ts->tiling.col_start >> ss_hor;
 1372|   723k|                    const int ystart = ts->tiling.row_start >> ss_ver;
 1373|   723k|                    const enum IntraPredMode m =
 1374|   723k|                        bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
  ------------------
  |  |   87|   723k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|   723k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 1375|   723k|                                                          ypos, ypos > ystart,
 1376|   723k|                                                          ts->tiling.col_end >> ss_hor,
 1377|   723k|                                                          ts->tiling.row_end >> ss_ver,
 1378|   723k|                                                          0, uv_dst[pl], stride,
 1379|   723k|                                                          top_sb_edge, DC_PRED, &angle,
 1380|   723k|                                                          uv_t_dim->w, uv_t_dim->h, 0,
 1381|   723k|                                                          edge HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|   723k|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1382|   723k|                    dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
 1383|   723k|                                           uv_t_dim->w * 4,
 1384|   723k|                                           uv_t_dim->h * 4,
 1385|   723k|                                           ac, b->cfl_alpha[pl]
 1386|   723k|                                           HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|   723k|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1387|   723k|                }
 1388|   378k|                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   34|   378k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 378k]
  |  |  ------------------
  |  |   35|   378k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   378k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                              if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1389|      0|                    ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
 1390|      0|                    hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
 1391|      0|                    hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
 1392|      0|                }
 1393|   378k|            } else if (b->pal_sz[1]) {
  ------------------
  |  Branch (1393:24): [True: 3.42k, False: 353k]
  ------------------
 1394|  3.42k|                const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
 1395|  3.42k|                                              (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
 1396|  3.42k|                const pixel (*pal)[8];
 1397|  3.42k|                const uint8_t *pal_idx;
 1398|  3.42k|                if (t->frame_thread.pass) {
  ------------------
  |  Branch (1398:21): [True: 0, False: 3.42k]
  ------------------
 1399|      0|                    const int p = t->frame_thread.pass & 1;
 1400|      0|                    assert(ts->frame_thread[p].pal_idx);
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1401|      0|                    pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
 1402|      0|                                              ((t->bx >> 1) + (t->by & 1))];
 1403|      0|                    pal_idx = ts->frame_thread[p].pal_idx;
 1404|      0|                    ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
 1405|  3.42k|                } else {
 1406|  3.42k|                    pal = bytefn(t->scratch.pal);
  ------------------
  |  |   87|  3.42k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  3.42k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 1407|  3.42k|                    pal_idx = t->scratch.pal_idx_uv;
 1408|  3.42k|                }
 1409|       |
 1410|  3.42k|                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
 1411|  3.42k|                                       f->cur.stride[1], pal[1],
 1412|  3.42k|                                       pal_idx, cbw4 * 4, cbh4 * 4);
 1413|  3.42k|                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff,
 1414|  3.42k|                                       f->cur.stride[1], pal[2],
 1415|  3.42k|                                       pal_idx, cbw4 * 4, cbh4 * 4);
 1416|  3.42k|                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   34|  3.42k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 3.42k]
  |  |  ------------------
  |  |   35|  3.42k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  3.42k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                              if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1417|      0|                    hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff,
 1418|      0|                             PXSTRIDE(f->cur.stride[1]),
 1419|      0|                             cbw4 * 4, cbh4 * 4, "u-pal-pred");
 1420|      0|                    hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff,
 1421|      0|                             PXSTRIDE(f->cur.stride[1]),
 1422|      0|                             cbw4 * 4, cbh4 * 4, "v-pal-pred");
 1423|      0|                }
 1424|  3.42k|            }
 1425|       |
 1426|   736k|            const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
 1427|   736k|                                 sm_uv_flag(&t->l, cby4);
 1428|   736k|            const int uv_sb_has_tr =
 1429|   736k|                ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
  ------------------
  |  Branch (1429:17): [True: 7.06k, False: 728k]
  |  Branch (1429:55): [True: 4.91k, False: 724k]
  ------------------
 1430|   728k|                intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1));
 1431|   736k|            const int uv_sb_has_bl =
 1432|   736k|                init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
  ------------------
  |  Branch (1432:17): [True: 7.06k, False: 728k]
  |  Branch (1432:30): [True: 4.91k, False: 724k]
  ------------------
 1433|   728k|                intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1));
 1434|   736k|            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
 1435|  2.20M|            for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1435:30): [True: 1.46M, False: 733k]
  ------------------
 1436|  3.06M|                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
  ------------------
  |  Branch (1436:61): [True: 1.59M, False: 1.46M]
  ------------------
 1437|  1.59M|                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
 1438|  1.59M|                {
 1439|  1.59M|                    pixel *dst = ((pixel *) f->cur.data[1 + pl]) +
 1440|  1.59M|                                   4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
 1441|  1.59M|                                        ((t->bx + init_x) >> ss_hor));
 1442|  3.68M|                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
  ------------------
  |  Branch (1442:65): [True: 2.08M, False: 1.59M]
  ------------------
 1443|  2.08M|                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
 1444|  2.08M|                    {
 1445|  2.08M|                        if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
  ------------------
  |  Branch (1445:30): [True: 756k, False: 1.33M]
  |  Branch (1445:56): [True: 721k, False: 34.7k]
  ------------------
 1446|  1.37M|                            b->pal_sz[1])
  ------------------
  |  Branch (1446:29): [True: 10.9k, False: 1.36M]
  ------------------
 1447|   732k|                        {
 1448|   732k|                            goto skip_uv_pred;
 1449|   732k|                        }
 1450|       |
 1451|  1.35M|                        int angle = b->uv_angle;
 1452|       |                        // this probably looks weird because we're using
 1453|       |                        // luma flags in a chroma loop, but that's because
 1454|       |                        // prepare_intra_edges() expects luma flags as input
 1455|  1.35M|                        const enum EdgeFlags edge_flags =
 1456|  1.35M|                            (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
  ------------------
  |  Branch (1456:32): [True: 411k, False: 946k]
  |  Branch (1456:58): [True: 161k, False: 784k]
  ------------------
 1457|   579k|                              (x + uv_t_dim->w >= sub_cw4)) ?
  ------------------
  |  Branch (1457:31): [True: 276k, False: 303k]
  ------------------
 1458|  1.08M|                                 0 : EDGE_I444_TOP_HAS_RIGHT) |
 1459|  1.35M|                            ((x > (init_x >> ss_hor) ||
  ------------------
  |  Branch (1459:31): [True: 488k, False: 868k]
  ------------------
 1460|   868k|                              (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
  ------------------
  |  Branch (1460:32): [True: 382k, False: 486k]
  |  Branch (1460:49): [True: 296k, False: 86.1k]
  ------------------
 1461|   790k|                                 0 : EDGE_I444_LEFT_HAS_BOTTOM);
 1462|  1.35M|                        const pixel *top_sb_edge = NULL;
 1463|  1.35M|                        if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
  ------------------
  |  Branch (1463:29): [True: 559k, False: 798k]
  ------------------
 1464|   559k|                            top_sb_edge = f->ipred_edge[1 + pl];
 1465|   559k|                            const int sby = t->by >> f->sb_shift;
 1466|   559k|                            top_sb_edge += f->sb128w * 128 * (sby - 1);
 1467|   559k|                        }
 1468|  1.35M|                        const enum IntraPredMode uv_mode =
 1469|  1.35M|                             b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
  ------------------
  |  Branch (1469:30): [True: 34.5k, False: 1.32M]
  ------------------
 1470|  1.35M|                        const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
 1471|  1.35M|                        const int xstart = ts->tiling.col_start >> ss_hor;
 1472|  1.35M|                        const int ystart = ts->tiling.row_start >> ss_ver;
 1473|  1.35M|                        const enum IntraPredMode m =
 1474|  1.35M|                            bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
  ------------------
  |  |   87|  1.35M|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  1.35M|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 1475|  1.35M|                                                              ypos, ypos > ystart,
 1476|  1.35M|                                                              ts->tiling.col_end >> ss_hor,
 1477|  1.35M|                                                              ts->tiling.row_end >> ss_ver,
 1478|  1.35M|                                                              edge_flags, dst, stride,
 1479|  1.35M|                                                              top_sb_edge, uv_mode,
 1480|  1.35M|                                                              &angle, uv_t_dim->w,
 1481|  1.35M|                                                              uv_t_dim->h,
 1482|  1.35M|                                                              f->seq_hdr->intra_edge_filter,
 1483|  1.35M|                                                              edge HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|  1.35M|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1484|  1.35M|                        angle |= intra_edge_filter_flag;
 1485|  1.35M|                        dsp->ipred.intra_pred[m](dst, stride, edge,
 1486|  1.35M|                                                 uv_t_dim->w * 4,
 1487|  1.35M|                                                 uv_t_dim->h * 4,
 1488|  1.35M|                                                 angle | sm_uv_fl,
 1489|  1.35M|                                                 (4 * f->bw + ss_hor -
 1490|  1.35M|                                                  4 * (t->bx & ~ss_hor)) >> ss_hor,
 1491|  1.35M|                                                 (4 * f->bh + ss_ver -
 1492|  1.35M|                                                  4 * (t->by & ~ss_ver)) >> ss_ver
 1493|  1.35M|                                                 HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|  1.35M|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1494|  1.35M|                        if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   34|  1.35M|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 1.35M]
  |  |  ------------------
  |  |   35|  1.35M|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  1.35M|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                      if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1495|      0|                            hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
 1496|      0|                                     uv_t_dim->h * 4, 2, "l");
 1497|      0|                            hex_dump(edge, 0, 1, 1, "tl");
 1498|      0|                            hex_dump(edge + 1, uv_t_dim->w * 4,
 1499|      0|                                     uv_t_dim->w * 4, 2, "t");
 1500|      0|                            hex_dump(dst, stride, uv_t_dim->w * 4,
 1501|      0|                                     uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
  ------------------
  |  Branch (1501:55): [True: 0, False: 0]
  ------------------
 1502|      0|                        }
 1503|       |
 1504|  2.08M|                    skip_uv_pred: {}
 1505|  2.08M|                        if (!b->skip) {
  ------------------
  |  Branch (1505:29): [True: 989k, False: 1.09M]
  ------------------
 1506|   989k|                            enum TxfmType txtp;
 1507|   989k|                            int eob;
 1508|   989k|                            coef *cf;
 1509|   989k|                            if (t->frame_thread.pass) {
  ------------------
  |  Branch (1509:33): [True: 0, False: 989k]
  ------------------
 1510|      0|                                const int p = t->frame_thread.pass & 1;
 1511|      0|                                const int cbi = *ts->frame_thread[p].cbi++;
 1512|      0|                                cf = ts->frame_thread[p].cf;
 1513|      0|                                ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
 1514|      0|                                eob  = cbi >> 5;
 1515|      0|                                txtp = cbi & 0x1f;
 1516|   989k|                            } else {
 1517|   989k|                                uint8_t cf_ctx;
 1518|   989k|                                cf = bitfn(t->cf);
  ------------------
  |  |   77|   989k|#define bitfn(x) x##_16bpc
  ------------------
 1519|   989k|                                eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
 1520|   989k|                                                   &t->l.ccoef[pl][cby4 + y],
 1521|   989k|                                                   b->uvtx, bs, b, 1, 1 + pl, cf,
 1522|   989k|                                                   &txtp, &cf_ctx);
 1523|   989k|                                if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|   989k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 989k]
  |  |  ------------------
  |  |   35|   989k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   989k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1524|      0|                                    printf("Post-uv-cf-blk[pl=%d,tx=%d,"
 1525|      0|                                           "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
 1526|      0|                                           pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
 1527|   989k|                                int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
 1528|   989k|                                int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
 1529|   989k|                                dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
 1530|   989k|                                dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
 1531|   989k|                            }
 1532|   989k|                            if (eob >= 0) {
  ------------------
  |  Branch (1532:33): [True: 402k, False: 586k]
  ------------------
 1533|   402k|                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   34|   402k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 402k]
  |  |  ------------------
  |  |   35|   402k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   402k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                              if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1534|      0|                                    coef_dump(cf, uv_t_dim->h * 4,
 1535|      0|                                              uv_t_dim->w * 4, 3, "dq");
 1536|   402k|                                dsp->itx.itxfm_add[b->uvtx]
 1537|   402k|                                                  [txtp](dst, stride,
 1538|   402k|                                                         cf, eob HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|   402k|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1539|   402k|                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   34|   402k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 402k]
  |  |  ------------------
  |  |   35|   402k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   402k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                              if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1540|      0|                                    hex_dump(dst, stride, uv_t_dim->w * 4,
 1541|      0|                                             uv_t_dim->h * 4, "recon");
 1542|   402k|                            }
 1543|  1.10M|                        } else if (!t->frame_thread.pass) {
  ------------------
  |  Branch (1543:36): [True: 1.10M, False: 18.4E]
  ------------------
 1544|  1.10M|                            dav1d_memset_pow2[uv_t_dim->lw](&t->a->ccoef[pl][cbx4 + x], 0x40);
 1545|  1.10M|                            dav1d_memset_pow2[uv_t_dim->lh](&t->l.ccoef[pl][cby4 + y], 0x40);
 1546|  1.10M|                        }
 1547|  2.08M|                        dst += uv_t_dim->w * 4;
 1548|  2.08M|                    }
 1549|  1.59M|                    t->bx -= x << ss_hor;
 1550|  1.59M|                }
 1551|  1.46M|                t->by -= y << ss_ver;
 1552|  1.46M|            }
 1553|   736k|        }
 1554|  1.08M|    }
 1555|  1.07M|}
dav1d_recon_b_inter_16bpc:
 1559|   439k|{
 1560|   439k|    Dav1dTileState *const ts = t->ts;
 1561|   439k|    const Dav1dFrameContext *const f = t->f;
 1562|   439k|    const Dav1dDSPContext *const dsp = f->dsp;
 1563|   439k|    const int bx4 = t->bx & 31, by4 = t->by & 31;
 1564|   439k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 1565|   439k|    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
 1566|   439k|    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
 1567|   439k|    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
 1568|   439k|    const int bw4 = b_dim[0], bh4 = b_dim[1];
 1569|   439k|    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
 1570|   439k|    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
  ------------------
  |  Branch (1570:28): [True: 418k, False: 21.1k]
  ------------------
 1571|   418k|                           (bw4 > ss_hor || t->bx & 1) &&
  ------------------
  |  Branch (1571:29): [True: 365k, False: 52.6k]
  |  Branch (1571:45): [True: 26.2k, False: 26.3k]
  ------------------
 1572|   392k|                           (bh4 > ss_ver || t->by & 1);
  ------------------
  |  Branch (1572:29): [True: 364k, False: 27.9k]
  |  Branch (1572:45): [True: 13.9k, False: 14.0k]
  ------------------
 1573|   439k|    const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
  ------------------
  |  Branch (1573:32): [True: 19.0k, False: 420k]
  ------------------
 1574|   439k|                               DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout;
 1575|   439k|    int res;
 1576|       |
 1577|       |    // prediction
 1578|   439k|    const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
 1579|   439k|    pixel *dst = ((pixel *) f->cur.data[0]) +
 1580|   439k|        4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
 1581|   439k|    const ptrdiff_t uvdstoff =
 1582|   439k|        4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
 1583|   439k|    if (IS_KEY_OR_INTRA(f->frame_hdr)) {
  ------------------
  |  |   43|   439k|    (!IS_INTER_OR_SWITCH(frame_header))
  |  |  ------------------
  |  |  |  |   36|   439k|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  |  |  |  Branch (43:5): [True: 415k, False: 24.4k]
  |  |  ------------------
  ------------------
 1584|       |        // intrabc
 1585|   415k|        assert(!f->frame_hdr->super_res.enabled);
  ------------------
  |  |  140|   415k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 415k]
  |  |  |  Branch (140:68): [Folded, False: 415k]
  |  |  ------------------
  ------------------
 1586|   415k|        res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
 1587|   415k|                 b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
 1588|   415k|        if (res) return res;
  ------------------
  |  Branch (1588:13): [True: 0, False: 415k]
  ------------------
 1589|  1.07M|        if (has_chroma) for (int pl = 1; pl < 3; pl++) {
  ------------------
  |  Branch (1589:13): [True: 360k, False: 54.1k]
  |  Branch (1589:42): [True: 718k, False: 360k]
  ------------------
 1590|   718k|            res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1],
 1591|   718k|                     bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
 1592|   718k|                     t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0],
 1593|   718k|                     &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
 1594|   718k|            if (res) return res;
  ------------------
  |  Branch (1594:17): [True: 0, False: 718k]
  ------------------
 1595|   718k|        }
 1596|   415k|    } else if (b->comp_type == COMP_INTER_NONE) {
  ------------------
  |  Branch (1596:16): [True: 17.4k, False: 7.00k]
  ------------------
 1597|  17.4k|        const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
 1598|  17.4k|        const enum Filter2d filter_2d = b->filter2d;
 1599|       |
 1600|  17.4k|        if (imin(bw4, bh4) > 1 &&
  ------------------
  |  Branch (1600:13): [True: 11.1k, False: 6.33k]
  ------------------
 1601|  11.1k|            ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
  ------------------
  |  Branch (1601:15): [True: 371, False: 10.7k]
  |  Branch (1601:44): [True: 96, False: 275]
  ------------------
 1602|  11.0k|             (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
  ------------------
  |  Branch (1602:15): [True: 2.42k, False: 8.63k]
  |  Branch (1602:44): [True: 2.22k, False: 202]
  ------------------
 1603|  2.31k|        {
 1604|  2.31k|            res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp,
 1605|  2.31k|                              b->motion_mode == MM_WARP ? &t->warpmv :
  ------------------
  |  Branch (1605:31): [True: 2.22k, False: 96]
  ------------------
 1606|  2.31k|                                  &f->frame_hdr->gmv[b->ref[0]]);
 1607|  2.31k|            if (res) return res;
  ------------------
  |  Branch (1607:17): [True: 0, False: 2.31k]
  ------------------
 1608|  15.1k|        } else {
 1609|  15.1k|            res = mc(t, dst, NULL, f->cur.stride[0],
 1610|  15.1k|                     bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d);
 1611|  15.1k|            if (res) return res;
  ------------------
  |  Branch (1611:17): [True: 0, False: 15.1k]
  ------------------
 1612|  15.1k|            if (b->motion_mode == MM_OBMC) {
  ------------------
  |  Branch (1612:17): [True: 3.16k, False: 12.0k]
  ------------------
 1613|  3.16k|                res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4);
 1614|  3.16k|                if (res) return res;
  ------------------
  |  Branch (1614:21): [True: 0, False: 3.16k]
  ------------------
 1615|  3.16k|            }
 1616|  15.1k|        }
 1617|  17.4k|        if (b->interintra_type) {
  ------------------
  |  Branch (1617:13): [True: 270, False: 17.2k]
  ------------------
 1618|    270|            pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
  ------------------
  |  |   77|    270|#define bitfn(x) x##_16bpc
  ------------------
 1619|    270|            enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
  ------------------
  |  Branch (1619:36): [True: 44, False: 226]
  ------------------
 1620|    226|                                   SMOOTH_PRED : b->interintra_mode;
 1621|    270|            pixel *const tmp = bitfn(t->scratch.interintra);
  ------------------
  |  |   77|    270|#define bitfn(x) x##_16bpc
  ------------------
 1622|    270|            int angle = 0;
 1623|    270|            const pixel *top_sb_edge = NULL;
 1624|    270|            if (!(t->by & (f->sb_step - 1))) {
  ------------------
  |  Branch (1624:17): [True: 68, False: 202]
  ------------------
 1625|     68|                top_sb_edge = f->ipred_edge[0];
 1626|     68|                const int sby = t->by >> f->sb_shift;
 1627|     68|                top_sb_edge += f->sb128w * 128 * (sby - 1);
 1628|     68|            }
 1629|    270|            m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
  ------------------
  |  |   87|    270|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|    270|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 1630|    270|                                                  t->by, t->by > ts->tiling.row_start,
 1631|    270|                                                  ts->tiling.col_end, ts->tiling.row_end,
 1632|    270|                                                  0, dst, f->cur.stride[0], top_sb_edge,
 1633|    270|                                                  m, &angle, bw4, bh4, 0, tl_edge
 1634|    270|                                                  HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|    270|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1635|    270|            dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
 1636|    270|                                     tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
 1637|    270|                                     HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|    270|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1638|    270|            dsp->mc.blend(dst, f->cur.stride[0], tmp,
 1639|    270|                          bw4 * 4, bh4 * 4, II_MASK(0, bs, b));
  ------------------
  |  |   83|    270|    ((const uint8_t*)((uintptr_t)&dav1d_masks + \
  |  |   84|    270|    (size_t)((b)->interintra_type == INTER_INTRA_BLEND ? \
  |  |  ------------------
  |  |  |  Branch (84:14): [True: 183, False: 87]
  |  |  ------------------
  |  |   85|    270|    dav1d_masks.offsets[c][(bs)-BS_32x32].ii[(b)->interintra_mode] : \
  |  |   86|    270|    dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[0][(b)->wedge_idx]) * 8))
  ------------------
 1640|    270|        }
 1641|       |
 1642|  17.4k|        if (!has_chroma) goto skip_inter_chroma_pred;
  ------------------
  |  Branch (1642:13): [True: 3.43k, False: 14.0k]
  ------------------
 1643|       |
 1644|       |        // sub8x8 derivation
 1645|  14.0k|        int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
  ------------------
  |  Branch (1645:25): [True: 1.18k, False: 12.8k]
  |  Branch (1645:42): [True: 1.19k, False: 11.6k]
  ------------------
 1646|  14.0k|        refmvs_block *const *r;
 1647|  14.0k|        if (is_sub8x8) {
  ------------------
  |  Branch (1647:13): [True: 2.40k, False: 11.6k]
  ------------------
 1648|  2.40k|            assert(ss_hor == 1);
  ------------------
  |  |  140|  2.40k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 2.40k]
  |  |  |  Branch (140:68): [Folded, False: 2.40k]
  |  |  ------------------
  ------------------
 1649|  2.40k|            r = &t->rt.r[(t->by & 31) + 5];
 1650|  2.40k|            if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
  ------------------
  |  Branch (1650:17): [True: 1.19k, False: 1.20k]
  ------------------
 1651|  2.40k|            if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
  ------------------
  |  Branch (1651:17): [True: 1.53k, False: 865]
  ------------------
 1652|  2.40k|            if (bw4 == 1 && bh4 == ss_ver)
  ------------------
  |  Branch (1652:17): [True: 1.19k, False: 1.20k]
  |  Branch (1652:29): [True: 334, False: 865]
  ------------------
 1653|    334|                is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
 1654|  2.40k|        }
 1655|       |
 1656|       |        // chroma prediction
 1657|  14.0k|        if (is_sub8x8) {
  ------------------
  |  Branch (1657:13): [True: 2.31k, False: 11.7k]
  ------------------
 1658|  2.31k|            assert(ss_hor == 1);
  ------------------
  |  |  140|  2.31k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 2.31k]
  |  |  |  Branch (140:68): [Folded, False: 2.31k]
  |  |  ------------------
  ------------------
 1659|  2.31k|            ptrdiff_t h_off = 0, v_off = 0;
 1660|  2.31k|            if (bw4 == 1 && bh4 == ss_ver) {
  ------------------
  |  Branch (1660:17): [True: 1.15k, False: 1.16k]
  |  Branch (1660:29): [True: 304, False: 849]
  ------------------
 1661|    912|                for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1661:34): [True: 608, False: 304]
  ------------------
 1662|    608|                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
 1663|    608|                             NULL, f->cur.stride[1],
 1664|    608|                             bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
 1665|    608|                             r[-1][t->bx - 1].mv.mv[0],
 1666|    608|                             &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1],
 1667|    608|                             r[-1][t->bx - 1].ref.ref[0] - 1,
 1668|    608|                             t->frame_thread.pass != 2 ? t->tl_4x4_filter :
  ------------------
  |  Branch (1668:30): [True: 608, False: 0]
  ------------------
 1669|    608|                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
 1670|    608|                    if (res) return res;
  ------------------
  |  Branch (1670:25): [True: 0, False: 608]
  ------------------
 1671|    608|                }
 1672|    304|                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
 1673|    304|                h_off = 2;
 1674|    304|            }
 1675|  2.31k|            if (bw4 == 1) {
  ------------------
  |  Branch (1675:17): [True: 1.15k, False: 1.16k]
  ------------------
 1676|  1.15k|                const enum Filter2d left_filter_2d =
 1677|  1.15k|                    dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
 1678|  3.45k|                for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1678:34): [True: 2.30k, False: 1.15k]
  ------------------
 1679|  2.30k|                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL,
 1680|  2.30k|                             f->cur.stride[1], bw4, bh4, t->bx - 1,
 1681|  2.30k|                             t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0],
 1682|  2.30k|                             &f->refp[r[0][t->bx - 1].ref.ref[0] - 1],
 1683|  2.30k|                             r[0][t->bx - 1].ref.ref[0] - 1,
 1684|  2.30k|                             t->frame_thread.pass != 2 ? left_filter_2d :
  ------------------
  |  Branch (1684:30): [True: 2.30k, False: 0]
  ------------------
 1685|  2.30k|                                 f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
 1686|  2.30k|                    if (res) return res;
  ------------------
  |  Branch (1686:25): [True: 0, False: 2.30k]
  ------------------
 1687|  2.30k|                }
 1688|  1.15k|                h_off = 2;
 1689|  1.15k|            }
 1690|  2.31k|            if (bh4 == ss_ver) {
  ------------------
  |  Branch (1690:17): [True: 1.46k, False: 849]
  ------------------
 1691|  1.46k|                const enum Filter2d top_filter_2d =
 1692|  1.46k|                    dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
 1693|  4.40k|                for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1693:34): [True: 2.93k, False: 1.46k]
  ------------------
 1694|  2.93k|                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL,
 1695|  2.93k|                             f->cur.stride[1], bw4, bh4, t->bx, t->by - 1,
 1696|  2.93k|                             1 + pl, r[-1][t->bx].mv.mv[0],
 1697|  2.93k|                             &f->refp[r[-1][t->bx].ref.ref[0] - 1],
 1698|  2.93k|                             r[-1][t->bx].ref.ref[0] - 1,
 1699|  2.93k|                             t->frame_thread.pass != 2 ? top_filter_2d :
  ------------------
  |  Branch (1699:30): [True: 2.93k, False: 0]
  ------------------
 1700|  2.93k|                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
 1701|  2.93k|                    if (res) return res;
  ------------------
  |  Branch (1701:25): [True: 0, False: 2.93k]
  ------------------
 1702|  2.93k|                }
 1703|  1.46k|                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
 1704|  1.46k|            }
 1705|  6.95k|            for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1705:30): [True: 4.63k, False: 2.31k]
  ------------------
 1706|  4.63k|                res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1],
 1707|  4.63k|                         bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0],
 1708|  4.63k|                         refp, b->ref[0], filter_2d);
 1709|  4.63k|                if (res) return res;
  ------------------
  |  Branch (1709:21): [True: 0, False: 4.63k]
  ------------------
 1710|  4.63k|            }
 1711|  11.7k|        } else {
 1712|  11.7k|            if (imin(cbw4, cbh4) > 1 &&
  ------------------
  |  Branch (1712:17): [True: 4.53k, False: 7.21k]
  ------------------
 1713|  4.53k|                ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
  ------------------
  |  Branch (1713:19): [True: 222, False: 4.30k]
  |  Branch (1713:48): [True: 62, False: 160]
  ------------------
 1714|  4.46k|                 (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
  ------------------
  |  Branch (1714:19): [True: 643, False: 3.82k]
  |  Branch (1714:48): [True: 598, False: 45]
  ------------------
 1715|    660|            {
 1716|  1.98k|                for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1716:34): [True: 1.32k, False: 660]
  ------------------
 1717|  1.32k|                    res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL,
 1718|  1.32k|                                      f->cur.stride[1], b_dim, 1 + pl, refp,
 1719|  1.32k|                                      b->motion_mode == MM_WARP ? &t->warpmv :
  ------------------
  |  Branch (1719:39): [True: 1.19k, False: 124]
  ------------------
 1720|  1.32k|                                          &f->frame_hdr->gmv[b->ref[0]]);
 1721|  1.32k|                    if (res) return res;
  ------------------
  |  Branch (1721:25): [True: 0, False: 1.32k]
  ------------------
 1722|  1.32k|                }
 1723|  11.0k|            } else {
 1724|  33.2k|                for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1724:34): [True: 22.1k, False: 11.0k]
  ------------------
 1725|  22.1k|                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
 1726|  22.1k|                             NULL, f->cur.stride[1],
 1727|  22.1k|                             bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
 1728|  22.1k|                             t->bx & ~ss_hor, t->by & ~ss_ver,
 1729|  22.1k|                             1 + pl, b->mv[0], refp, b->ref[0], filter_2d);
 1730|  22.1k|                    if (res) return res;
  ------------------
  |  Branch (1730:25): [True: 0, False: 22.1k]
  ------------------
 1731|  22.1k|                    if (b->motion_mode == MM_OBMC) {
  ------------------
  |  Branch (1731:25): [True: 6.27k, False: 15.8k]
  ------------------
 1732|  6.27k|                        res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
 1733|  6.27k|                                   f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
 1734|  6.27k|                        if (res) return res;
  ------------------
  |  Branch (1734:29): [True: 0, False: 6.27k]
  ------------------
 1735|  6.27k|                    }
 1736|  22.1k|                }
 1737|  11.0k|            }
 1738|  11.7k|            if (b->interintra_type) {
  ------------------
  |  Branch (1738:17): [True: 270, False: 11.4k]
  ------------------
 1739|       |                // FIXME for 8x32 with 4:2:2 subsampling, this probably does
 1740|       |                // the wrong thing since it will select 4x16, not 4x32, as a
 1741|       |                // transform size...
 1742|    270|                const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b);
  ------------------
  |  |   83|    270|    ((const uint8_t*)((uintptr_t)&dav1d_masks + \
  |  |   84|    270|    (size_t)((b)->interintra_type == INTER_INTRA_BLEND ? \
  |  |  ------------------
  |  |  |  Branch (84:14): [True: 183, False: 87]
  |  |  ------------------
  |  |   85|    270|    dav1d_masks.offsets[c][(bs)-BS_32x32].ii[(b)->interintra_mode] : \
  |  |   86|    270|    dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[0][(b)->wedge_idx]) * 8))
  ------------------
 1743|       |
 1744|    810|                for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1744:34): [True: 540, False: 270]
  ------------------
 1745|    540|                    pixel *const tmp = bitfn(t->scratch.interintra);
  ------------------
  |  |   77|    540|#define bitfn(x) x##_16bpc
  ------------------
 1746|    540|                    pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
  ------------------
  |  |   77|    540|#define bitfn(x) x##_16bpc
  ------------------
 1747|    540|                    enum IntraPredMode m =
 1748|    540|                        b->interintra_mode == II_SMOOTH_PRED ?
  ------------------
  |  Branch (1748:25): [True: 88, False: 452]
  ------------------
 1749|    452|                        SMOOTH_PRED : b->interintra_mode;
 1750|    540|                    int angle = 0;
 1751|    540|                    pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
 1752|    540|                    const pixel *top_sb_edge = NULL;
 1753|    540|                    if (!(t->by & (f->sb_step - 1))) {
  ------------------
  |  Branch (1753:25): [True: 136, False: 404]
  ------------------
 1754|    136|                        top_sb_edge = f->ipred_edge[pl + 1];
 1755|    136|                        const int sby = t->by >> f->sb_shift;
 1756|    136|                        top_sb_edge += f->sb128w * 128 * (sby - 1);
 1757|    136|                    }
 1758|    540|                    m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
  ------------------
  |  |   87|    540|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|    540|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 1759|    540|                                                          (t->bx >> ss_hor) >
 1760|    540|                                                              (ts->tiling.col_start >> ss_hor),
 1761|    540|                                                          t->by >> ss_ver,
 1762|    540|                                                          (t->by >> ss_ver) >
 1763|    540|                                                              (ts->tiling.row_start >> ss_ver),
 1764|    540|                                                          ts->tiling.col_end >> ss_hor,
 1765|    540|                                                          ts->tiling.row_end >> ss_ver,
 1766|    540|                                                          0, uvdst, f->cur.stride[1],
 1767|    540|                                                          top_sb_edge, m,
 1768|    540|                                                          &angle, cbw4, cbh4, 0, tl_edge
 1769|    540|                                                          HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|    540|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1770|    540|                    dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
 1771|    540|                                             tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
 1772|    540|                                             HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|    540|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1773|    540|                    dsp->mc.blend(uvdst, f->cur.stride[1], tmp,
 1774|    540|                                  cbw4 * 4, cbh4 * 4, ii_mask);
 1775|    540|                }
 1776|    270|            }
 1777|  11.7k|        }
 1778|       |
 1779|  17.4k|    skip_inter_chroma_pred: {}
 1780|  17.4k|        t->tl_4x4_filter = filter_2d;
 1781|  17.4k|    } else {
 1782|  7.00k|        const enum Filter2d filter_2d = b->filter2d;
 1783|       |        // Maximum super block size is 128x128
 1784|  7.00k|        int16_t (*tmp)[128 * 128] = t->scratch.compinter;
 1785|  7.00k|        int jnt_weight;
 1786|  7.00k|        uint8_t *const seg_mask = t->scratch.seg_mask;
 1787|  7.00k|        const uint8_t *mask;
 1788|       |
 1789|  13.8k|        for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (1789:25): [True: 6.85k, False: 7.00k]
  ------------------
 1790|  6.85k|            const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
 1791|       |
 1792|  6.85k|            if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
  ------------------
  |  Branch (1792:17): [True: 750, False: 6.10k]
  |  Branch (1792:55): [True: 99, False: 651]
  ------------------
 1793|     99|                res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
 1794|     99|                                  &f->frame_hdr->gmv[b->ref[i]]);
 1795|     99|                if (res) return res;
  ------------------
  |  Branch (1795:21): [True: 0, False: 99]
  ------------------
 1796|  6.75k|            } else {
 1797|  6.75k|                res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
 1798|  6.75k|                         b->mv[i], refp, b->ref[i], filter_2d);
 1799|  6.75k|                if (res) return res;
  ------------------
  |  Branch (1799:21): [True: 0, False: 6.75k]
  ------------------
 1800|  6.75k|            }
 1801|  6.85k|        }
 1802|  7.00k|        switch (b->comp_type) {
  ------------------
  |  Branch (1802:17): [True: 3.42k, False: 3.57k]
  ------------------
 1803|  2.44k|        case COMP_INTER_AVG:
  ------------------
  |  Branch (1803:9): [True: 2.44k, False: 4.55k]
  ------------------
 1804|  2.44k|            dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1],
 1805|  2.44k|                        bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|  2.44k|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1806|  2.44k|            break;
 1807|    245|        case COMP_INTER_WEIGHTED_AVG:
  ------------------
  |  Branch (1807:9): [True: 245, False: 6.75k]
  ------------------
 1808|    245|            jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
 1809|    245|            dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1],
 1810|    245|                          bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|    245|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1811|    245|            break;
 1812|    508|        case COMP_INTER_SEG:
  ------------------
  |  Branch (1812:9): [True: 508, False: 6.49k]
  ------------------
 1813|    508|            dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0],
 1814|    508|                                           tmp[b->mask_sign], tmp[!b->mask_sign],
 1815|    508|                                           bw4 * 4, bh4 * 4, seg_mask,
 1816|    508|                                           b->mask_sign HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|    508|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1817|    508|            mask = seg_mask;
 1818|    508|            break;
 1819|    228|        case COMP_INTER_WEDGE:
  ------------------
  |  Branch (1819:9): [True: 228, False: 6.77k]
  ------------------
 1820|    228|            mask = WEDGE_MASK(0, bs, 0, b->wedge_idx);
  ------------------
  |  |   89|    228|    ((const uint8_t*)((uintptr_t)&dav1d_masks + \
  |  |   90|    228|    (size_t)dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[sign][idx] * 8))
  ------------------
 1821|    228|            dsp->mc.mask(dst, f->cur.stride[0],
 1822|    228|                         tmp[b->mask_sign], tmp[!b->mask_sign],
 1823|    228|                         bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|    228|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1824|    228|            if (has_chroma)
  ------------------
  |  Branch (1824:17): [True: 226, False: 2]
  ------------------
 1825|    226|                mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx);
  ------------------
  |  |   89|    226|    ((const uint8_t*)((uintptr_t)&dav1d_masks + \
  |  |   90|    226|    (size_t)dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[sign][idx] * 8))
  ------------------
 1826|    228|            break;
 1827|  7.00k|        }
 1828|       |
 1829|       |        // chroma
 1830|  10.1k|        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1830:13): [True: 3.37k, False: 44]
  |  Branch (1830:42): [True: 6.75k, False: 3.37k]
  ------------------
 1831|  20.2k|            for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (1831:29): [True: 13.5k, False: 6.75k]
  ------------------
 1832|  13.5k|                const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
 1833|  13.5k|                if (b->inter_mode == GLOBALMV_GLOBALMV &&
  ------------------
  |  Branch (1833:21): [True: 1.42k, False: 12.0k]
  ------------------
 1834|  1.42k|                    imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
  ------------------
  |  Branch (1834:21): [True: 548, False: 880]
  |  Branch (1834:45): [True: 94, False: 454]
  ------------------
 1835|     94|                {
 1836|     94|                    res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor,
 1837|     94|                                      b_dim, 1 + pl,
 1838|     94|                                      refp, &f->frame_hdr->gmv[b->ref[i]]);
 1839|     94|                    if (res) return res;
  ------------------
  |  Branch (1839:25): [True: 0, False: 94]
  ------------------
 1840|  13.4k|                } else {
 1841|  13.4k|                    res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
 1842|  13.4k|                             1 + pl, b->mv[i], refp, b->ref[i], filter_2d);
 1843|  13.4k|                    if (res) return res;
  ------------------
  |  Branch (1843:25): [True: 0, False: 13.4k]
  ------------------
 1844|  13.4k|                }
 1845|  13.5k|            }
 1846|  6.75k|            pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
 1847|  6.75k|            switch (b->comp_type) {
  ------------------
  |  Branch (1847:21): [True: 6.76k, False: 18.4E]
  ------------------
 1848|  4.82k|            case COMP_INTER_AVG:
  ------------------
  |  Branch (1848:13): [True: 4.82k, False: 1.93k]
  ------------------
 1849|  4.82k|                dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
 1850|  4.82k|                            bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver
 1851|  4.82k|                            HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|  4.82k|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1852|  4.82k|                break;
 1853|    490|            case COMP_INTER_WEIGHTED_AVG:
  ------------------
  |  Branch (1853:13): [True: 490, False: 6.26k]
  ------------------
 1854|    490|                dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
 1855|    490|                              bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight
 1856|    490|                              HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|    490|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1857|    490|                break;
 1858|    452|            case COMP_INTER_WEDGE:
  ------------------
  |  Branch (1858:13): [True: 452, False: 6.30k]
  ------------------
 1859|  1.45k|            case COMP_INTER_SEG:
  ------------------
  |  Branch (1859:13): [True: 1.00k, False: 5.75k]
  ------------------
 1860|  1.45k|                dsp->mc.mask(uvdst, f->cur.stride[1],
 1861|  1.45k|                             tmp[b->mask_sign], tmp[!b->mask_sign],
 1862|  1.45k|                             bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask
 1863|  1.45k|                             HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|  1.45k|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1864|  1.45k|                break;
 1865|  6.75k|            }
 1866|  6.75k|        }
 1867|  3.42k|    }
 1868|       |
 1869|   435k|    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   34|   435k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 435k]
  |  |  ------------------
  |  |   35|   435k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|   435k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                  if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1870|      0|        hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
 1871|      0|        if (has_chroma) {
  ------------------
  |  Branch (1871:13): [True: 0, False: 0]
  ------------------
 1872|      0|            hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1],
 1873|      0|                     cbw4 * 4, cbh4 * 4, "u-pred");
 1874|      0|            hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1],
 1875|      0|                     cbw4 * 4, cbh4 * 4, "v-pred");
 1876|      0|        }
 1877|      0|    }
 1878|       |
 1879|   435k|    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
 1880|       |
 1881|   435k|    if (b->skip) {
  ------------------
  |  Branch (1881:9): [True: 407k, False: 28.7k]
  ------------------
 1882|       |        // reset coef contexts
 1883|   407k|        BlockContext *const a = t->a;
 1884|   407k|        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
 1885|   407k|        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
 1886|   407k|        if (has_chroma) {
  ------------------
  |  Branch (1886:13): [True: 354k, False: 53.0k]
  ------------------
 1887|   354k|            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
 1888|   354k|            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
 1889|   354k|            memset_cw(&a->ccoef[0][cbx4], 0x40);
 1890|   354k|            memset_cw(&a->ccoef[1][cbx4], 0x40);
 1891|   354k|            memset_ch(&t->l.ccoef[0][cby4], 0x40);
 1892|   354k|            memset_ch(&t->l.ccoef[1][cby4], 0x40);
 1893|   354k|        }
 1894|   407k|        return 0;
 1895|   407k|    }
 1896|       |
 1897|  28.7k|    const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
 1898|  28.7k|    const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
 1899|  28.7k|    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
 1900|       |
 1901|  61.3k|    for (int init_y = 0; init_y < bh4; init_y += 16) {
  ------------------
  |  Branch (1901:26): [True: 32.6k, False: 28.7k]
  ------------------
 1902|  65.6k|        for (int init_x = 0; init_x < bw4; init_x += 16) {
  ------------------
  |  Branch (1902:30): [True: 32.9k, False: 32.6k]
  ------------------
 1903|       |            // coefficient coding & inverse transforms
 1904|  32.9k|            int y_off = !!init_y, y;
 1905|  32.9k|            dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y;
 1906|  76.3k|            for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
  ------------------
  |  Branch (1906:47): [True: 43.4k, False: 32.9k]
  ------------------
 1907|  43.4k|                 y += ytx->h, y_off++)
 1908|  43.4k|            {
 1909|  43.4k|                int x, x_off = !!init_x;
 1910|   148k|                for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
  ------------------
  |  Branch (1910:51): [True: 105k, False: 43.4k]
  ------------------
 1911|   105k|                     x += ytx->w, x_off++)
 1912|   105k|                {
 1913|   105k|                    read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
 1914|   105k|                                   x_off, y_off, &dst[x * 4]);
 1915|   105k|                    t->bx += ytx->w;
 1916|   105k|                }
 1917|  43.4k|                dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h;
 1918|  43.4k|                t->bx -= x;
 1919|  43.4k|                t->by += ytx->h;
 1920|  43.4k|            }
 1921|  32.9k|            dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y;
 1922|  32.9k|            t->by -= y;
 1923|       |
 1924|       |            // chroma coefs and inverse transform
 1925|  75.6k|            if (has_chroma) for (int pl = 0; pl < 2; pl++) {
  ------------------
  |  Branch (1925:17): [True: 25.2k, False: 7.74k]
  |  Branch (1925:46): [True: 50.4k, False: 25.2k]
  ------------------
 1926|  50.4k|                pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff +
 1927|  50.4k|                    (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver);
 1928|  50.4k|                for (y = init_y >> ss_ver, t->by += init_y;
 1929|   109k|                     y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
  ------------------
  |  Branch (1929:22): [True: 59.2k, False: 50.4k]
  ------------------
 1930|  59.2k|                {
 1931|  59.2k|                    int x;
 1932|  59.2k|                    for (x = init_x >> ss_hor, t->bx += init_x;
 1933|   155k|                         x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
  ------------------
  |  Branch (1933:26): [True: 96.5k, False: 59.2k]
  ------------------
 1934|  96.5k|                    {
 1935|  96.5k|                        coef *cf;
 1936|  96.5k|                        int eob;
 1937|  96.5k|                        enum TxfmType txtp;
 1938|  96.5k|                        if (t->frame_thread.pass) {
  ------------------
  |  Branch (1938:29): [True: 0, False: 96.5k]
  ------------------
 1939|      0|                            const int p = t->frame_thread.pass & 1;
 1940|      0|                            const int cbi = *ts->frame_thread[p].cbi++;
 1941|      0|                            cf = ts->frame_thread[p].cf;
 1942|      0|                            ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
 1943|      0|                            eob  = cbi >> 5;
 1944|      0|                            txtp = cbi & 0x1f;
 1945|  96.5k|                        } else {
 1946|  96.5k|                            uint8_t cf_ctx;
 1947|  96.5k|                            cf = bitfn(t->cf);
  ------------------
  |  |   77|  96.5k|#define bitfn(x) x##_16bpc
  ------------------
 1948|  96.5k|                            txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
 1949|  96.5k|                                                        bx4 + (x << ss_hor)];
 1950|  96.5k|                            eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
 1951|  96.5k|                                               &t->l.ccoef[pl][cby4 + y],
 1952|  96.5k|                                               b->uvtx, bs, b, 0, 1 + pl,
 1953|  96.5k|                                               cf, &txtp, &cf_ctx);
 1954|  96.5k|                            if (DEBUG_BLOCK_INFO)
  ------------------
  |  |   34|  96.5k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 96.5k]
  |  |  ------------------
  |  |   35|  96.5k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  96.5k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1955|      0|                                printf("Post-uv-cf-blk[pl=%d,tx=%d,"
 1956|      0|                                       "txtp=%d,eob=%d]: r=%d\n",
 1957|      0|                                       pl, b->uvtx, txtp, eob, ts->msac.rng);
 1958|  96.5k|                            int ctw = imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor);
 1959|  96.5k|                            int cth = imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver);
 1960|  96.5k|                            dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
 1961|  96.5k|                            dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
 1962|  96.5k|                        }
 1963|  96.5k|                        if (eob >= 0) {
  ------------------
  |  Branch (1963:29): [True: 50.7k, False: 45.8k]
  ------------------
 1964|  50.7k|                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   34|  50.7k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 50.7k]
  |  |  ------------------
  |  |   35|  50.7k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  50.7k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                          if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1965|      0|                                coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
 1966|  50.7k|                            dsp->itx.itxfm_add[b->uvtx]
 1967|  50.7k|                                              [txtp](&uvdst[4 * x],
 1968|  50.7k|                                                     f->cur.stride[1],
 1969|  50.7k|                                                     cf, eob HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|  50.7k|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 1970|  50.7k|                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   34|  50.7k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 50.7k]
  |  |  ------------------
  |  |   35|  50.7k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  50.7k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                          if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
  ------------------
  |  |   37|      0|#define DEBUG_B_PIXELS 0
  |  |  ------------------
  |  |  |  Branch (37:24): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1971|      0|                                hex_dump(&uvdst[4 * x], f->cur.stride[1],
 1972|      0|                                         uvtx->w * 4, uvtx->h * 4, "recon");
 1973|  50.7k|                        }
 1974|  96.5k|                        t->bx += uvtx->w << ss_hor;
 1975|  96.5k|                    }
 1976|  59.2k|                    uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h;
 1977|  59.2k|                    t->bx -= x << ss_hor;
 1978|  59.2k|                    t->by += uvtx->h << ss_ver;
 1979|  59.2k|                }
 1980|  50.4k|                t->by -= y << ss_ver;
 1981|  50.4k|            }
 1982|  32.9k|        }
 1983|  32.6k|    }
 1984|  28.7k|    return 0;
 1985|   435k|}
dav1d_filter_sbrow_deblock_cols_16bpc:
 1987|  67.7k|void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
 1988|  67.7k|    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) ||
  ------------------
  |  Branch (1988:9): [True: 0, False: 67.7k]
  ------------------
 1989|  67.7k|        (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1]))
  ------------------
  |  Branch (1989:10): [True: 2.55k, False: 65.2k]
  |  Branch (1989:50): [True: 1.83k, False: 721]
  ------------------
 1990|  1.83k|    {
 1991|  1.83k|        return;
 1992|  1.83k|    }
 1993|  65.9k|    const int y = sby * f->sb_step * 4;
 1994|  65.9k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 1995|  65.9k|    pixel *const p[3] = {
 1996|  65.9k|        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
 1997|  65.9k|        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
 1998|  65.9k|        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
 1999|  65.9k|    };
 2000|  65.9k|    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
 2001|  65.9k|    bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
  ------------------
  |  |   87|  65.9k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  65.9k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2002|  65.9k|                                        f->lf.start_of_tile_row[sby]);
 2003|  65.9k|}
dav1d_filter_sbrow_deblock_rows_16bpc:
 2005|  76.7k|void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
 2006|  76.7k|    const int y = sby * f->sb_step * 4;
 2007|  76.7k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 2008|  76.7k|    pixel *const p[3] = {
 2009|  76.7k|        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
 2010|  76.7k|        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
 2011|  76.7k|        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
 2012|  76.7k|    };
 2013|  76.7k|    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
 2014|  76.7k|    if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK &&
  ------------------
  |  Branch (2014:9): [True: 76.7k, False: 0]
  ------------------
 2015|  76.7k|        (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
  ------------------
  |  Branch (2015:10): [True: 65.2k, False: 11.5k]
  |  Branch (2015:49): [True: 721, False: 10.8k]
  ------------------
 2016|  65.9k|    {
 2017|  65.9k|        bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
  ------------------
  |  |   87|  65.9k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  65.9k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2018|  65.9k|    }
 2019|  76.7k|    if (f->seq_hdr->cdef || f->lf.restore_planes) {
  ------------------
  |  Branch (2019:9): [True: 66.9k, False: 9.88k]
  |  Branch (2019:29): [True: 7.14k, False: 2.73k]
  ------------------
 2020|       |        // Store loop filtered pixels required by CDEF / LR
 2021|  74.0k|        bytefn(dav1d_copy_lpf)(f, p, sby);
  ------------------
  |  |   87|  74.0k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  74.0k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2022|  74.0k|    }
 2023|  76.7k|}
dav1d_filter_sbrow_cdef_16bpc:
 2025|  66.8k|void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
 2026|  66.8k|    const Dav1dFrameContext *const f = tc->f;
 2027|  66.8k|    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return;
  ------------------
  |  Branch (2027:9): [True: 0, False: 66.8k]
  ------------------
 2028|  66.8k|    const int sbsz = f->sb_step;
 2029|  66.8k|    const int y = sby * sbsz * 4;
 2030|  66.8k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 2031|  66.8k|    pixel *const p[3] = {
 2032|  66.8k|        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
 2033|  66.8k|        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
 2034|  66.8k|        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
 2035|  66.8k|    };
 2036|  66.8k|    Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
 2037|  66.8k|    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
 2038|  66.8k|    const int start = sby * sbsz;
 2039|  66.8k|    if (sby) {
  ------------------
  |  Branch (2039:9): [True: 62.5k, False: 4.29k]
  ------------------
 2040|  62.5k|        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 2041|  62.5k|        pixel *p_up[3] = {
 2042|  62.5k|            p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
 2043|  62.5k|            p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
 2044|  62.5k|            p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
 2045|  62.5k|        };
 2046|  62.5k|        bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby);
  ------------------
  |  |   87|  62.5k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  62.5k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2047|  62.5k|    }
 2048|  66.8k|    const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
 2049|  66.8k|    const int end = imin(start + n_blks, f->bh);
 2050|  66.8k|    bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby);
  ------------------
  |  |   87|  66.8k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  66.8k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2051|  66.8k|}
dav1d_filter_sbrow_resize_16bpc:
 2053|  2.11k|void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
 2054|  2.11k|    const int sbsz = f->sb_step;
 2055|  2.11k|    const int y = sby * sbsz * 4;
 2056|  2.11k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 2057|  2.11k|    const pixel *const p[3] = {
 2058|  2.11k|        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
 2059|  2.11k|        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
 2060|  2.11k|        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
 2061|  2.11k|    };
 2062|  2.11k|    pixel *const sr_p[3] = {
 2063|  2.11k|        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
 2064|  2.11k|        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
 2065|  2.11k|        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
 2066|  2.11k|    };
 2067|  2.11k|    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
 2068|  8.07k|    for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
  ------------------
  |  Branch (2068:22): [True: 5.95k, False: 2.11k]
  ------------------
 2069|  5.95k|        const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
  ------------------
  |  Branch (2069:28): [True: 3.84k, False: 2.11k]
  |  Branch (2069:34): [True: 100, False: 3.74k]
  ------------------
 2070|  5.95k|        const int h_start = 8 * !!sby >> ss_ver;
 2071|  5.95k|        const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
 2072|  5.95k|        pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
 2073|  5.95k|        const ptrdiff_t src_stride = f->cur.stride[!!pl];
 2074|  5.95k|        const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);
 2075|  5.95k|        const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver;
 2076|  5.95k|        const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
  ------------------
  |  Branch (2076:28): [True: 3.84k, False: 2.11k]
  |  Branch (2076:34): [True: 100, False: 3.74k]
  ------------------
 2077|  5.95k|        const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
 2078|  5.95k|        const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
 2079|  5.95k|        const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
 2080|       |
 2081|  5.95k|        f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
 2082|  5.95k|                          imin(img_h, h_end) + h_start, src_w,
 2083|  5.95k|                          f->resize_step[!!pl], f->resize_start[!!pl]
 2084|  5.95k|                          HIGHBD_CALL_SUFFIX);
  ------------------
  |  |   73|  5.95k|#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
  ------------------
 2085|  5.95k|    }
 2086|  2.11k|}
dav1d_filter_sbrow_lr_16bpc:
 2088|  55.5k|void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
 2089|  55.5k|    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return;
  ------------------
  |  Branch (2089:9): [True: 0, False: 55.5k]
  ------------------
 2090|  55.5k|    const int y = sby * f->sb_step * 4;
 2091|  55.5k|    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 2092|  55.5k|    pixel *const sr_p[3] = {
 2093|  55.5k|        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
 2094|  55.5k|        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
 2095|  55.5k|        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
 2096|  55.5k|    };
 2097|  55.5k|    bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
  ------------------
  |  |   87|  55.5k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  55.5k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2098|  55.5k|}
dav1d_filter_sbrow_16bpc:
 2100|  4.28k|void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
 2101|  4.28k|    bytefn(dav1d_filter_sbrow_deblock_cols)(f, sby);
  ------------------
  |  |   87|  4.28k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  4.28k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2102|  4.28k|    bytefn(dav1d_filter_sbrow_deblock_rows)(f, sby);
  ------------------
  |  |   87|  4.28k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  4.28k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2103|  4.28k|    if (f->seq_hdr->cdef)
  ------------------
  |  Branch (2103:9): [True: 3.40k, False: 879]
  ------------------
 2104|  3.40k|        bytefn(dav1d_filter_sbrow_cdef)(f->c->tc, sby);
  ------------------
  |  |   87|  3.40k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  3.40k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2105|  4.28k|    if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
  ------------------
  |  Branch (2105:9): [True: 197, False: 4.08k]
  ------------------
 2106|    197|        bytefn(dav1d_filter_sbrow_resize)(f, sby);
  ------------------
  |  |   87|    197|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|    197|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2107|  4.28k|    if (f->lf.restore_planes)
  ------------------
  |  Branch (2107:9): [True: 1.90k, False: 2.37k]
  ------------------
 2108|  1.90k|        bytefn(dav1d_filter_sbrow_lr)(f, sby);
  ------------------
  |  |   87|  1.90k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  1.90k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2109|  4.28k|}
dav1d_backup_ipred_edge_16bpc:
 2111|   102k|void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
 2112|   102k|    const Dav1dFrameContext *const f = t->f;
 2113|   102k|    Dav1dTileState *const ts = t->ts;
 2114|   102k|    const int sby = t->by >> f->sb_shift;
 2115|   102k|    const int sby_off = f->sb128w * 128 * sby;
 2116|   102k|    const int x_off = ts->tiling.col_start;
 2117|       |
 2118|   102k|    const pixel *const y =
 2119|   102k|        ((const pixel *) f->cur.data[0]) + x_off * 4 +
 2120|   102k|                    ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]);
 2121|   102k|    pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
  ------------------
  |  |   65|   102k|#define pixel_copy(a, b, c) memcpy(a, b, (c) << 1)
  ------------------
 2122|   102k|               4 * (ts->tiling.col_end - x_off));
 2123|       |
 2124|   102k|    if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
  ------------------
  |  Branch (2124:9): [True: 37.0k, False: 65.7k]
  ------------------
 2125|  37.0k|        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
 2126|  37.0k|        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
 2127|       |
 2128|  37.0k|        const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
 2129|  37.0k|            (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]);
 2130|   111k|        for (int pl = 1; pl <= 2; pl++)
  ------------------
  |  Branch (2130:26): [True: 74.1k, False: 37.0k]
  ------------------
 2131|  74.1k|            pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
  ------------------
  |  |   65|  74.1k|#define pixel_copy(a, b, c) memcpy(a, b, (c) << 1)
  ------------------
 2132|  37.0k|                       &((const pixel *) f->cur.data[pl])[uv_off],
 2133|  37.0k|                       4 * (ts->tiling.col_end - x_off) >> ss_hor);
 2134|  37.0k|    }
 2135|   102k|}
dav1d_copy_pal_block_y_16bpc:
 2141|  13.0k|{
 2142|  13.0k|    const Dav1dFrameContext *const f = t->f;
 2143|  13.0k|    pixel *const pal = t->frame_thread.pass ?
  ------------------
  |  Branch (2143:24): [True: 0, False: 13.0k]
  ------------------
 2144|      0|        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
 2145|      0|                            ((t->bx >> 1) + (t->by & 1))][0] :
 2146|  13.0k|        bytefn(t->scratch.pal)[0];
  ------------------
  |  |   87|  13.0k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  13.0k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2147|  61.2k|    for (int x = 0; x < bw4; x++)
  ------------------
  |  Branch (2147:21): [True: 48.2k, False: 13.0k]
  ------------------
 2148|  48.2k|        memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel));
  ------------------
  |  |   87|  48.2k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  48.2k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2149|  54.3k|    for (int y = 0; y < bh4; y++)
  ------------------
  |  Branch (2149:21): [True: 41.3k, False: 13.0k]
  ------------------
 2150|  41.3k|        memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel));
  ------------------
  |  |   87|  41.3k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  41.3k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2151|  13.0k|}
dav1d_copy_pal_block_uv_16bpc:
 2157|  3.42k|{
 2158|  3.42k|    const Dav1dFrameContext *const f = t->f;
 2159|  3.42k|    const pixel (*const pal)[8] = t->frame_thread.pass ?
  ------------------
  |  Branch (2159:35): [True: 0, False: 3.42k]
  ------------------
 2160|      0|        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
 2161|      0|                            ((t->bx >> 1) + (t->by & 1))] :
 2162|  3.42k|        bytefn(t->scratch.pal);
  ------------------
  |  |   87|  3.42k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  3.42k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2163|       |    // see aomedia bug 2183 for why we use luma coordinates here
 2164|  10.2k|    for (int pl = 1; pl <= 2; pl++) {
  ------------------
  |  Branch (2164:22): [True: 6.85k, False: 3.42k]
  ------------------
 2165|  35.3k|        for (int x = 0; x < bw4; x++)
  ------------------
  |  Branch (2165:25): [True: 28.4k, False: 6.85k]
  ------------------
 2166|  28.4k|            memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel));
  ------------------
  |  |   87|  28.4k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  28.4k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2167|  31.6k|        for (int y = 0; y < bh4; y++)
  ------------------
  |  Branch (2167:25): [True: 24.8k, False: 6.85k]
  ------------------
 2168|  24.8k|            memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel));
  ------------------
  |  |   87|  24.8k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  24.8k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2169|  6.85k|    }
 2170|  3.42k|}
dav1d_read_pal_plane_16bpc:
 2175|  16.4k|{
 2176|  16.4k|    Dav1dTileState *const ts = t->ts;
 2177|  16.4k|    const Dav1dFrameContext *const f = t->f;
 2178|  16.4k|    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
  ------------------
  |  |   48|  16.4k|#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
  ------------------
 2179|  16.4k|                                           ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
 2180|  16.4k|    pixel cache[16], used_cache[8];
 2181|  16.4k|    int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
  ------------------
  |  Branch (2181:19): [True: 3.42k, False: 13.0k]
  ------------------
 2182|  16.4k|    int n_cache = 0;
 2183|       |    // don't reuse above palette outside SB64 boundaries
 2184|  16.4k|    int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
  ------------------
  |  Branch (2184:19): [True: 11.9k, False: 4.48k]
  |  Branch (2184:30): [True: 2.35k, False: 9.61k]
  ------------------
 2185|  16.4k|    const pixel *l = bytefn(t->al_pal)[1][by4][pl];
  ------------------
  |  |   87|  16.4k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  16.4k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2186|  16.4k|    const pixel *a = bytefn(t->al_pal)[0][bx4][pl];
  ------------------
  |  |   87|  16.4k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  16.4k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2187|       |
 2188|       |    // fill/sort cache
 2189|  30.7k|    while (l_cache && a_cache) {
  ------------------
  |  Branch (2189:12): [True: 19.2k, False: 11.5k]
  |  Branch (2189:23): [True: 14.3k, False: 4.93k]
  ------------------
 2190|  14.3k|        if (*l < *a) {
  ------------------
  |  Branch (2190:13): [True: 5.36k, False: 8.97k]
  ------------------
 2191|  5.36k|            if (!n_cache || cache[n_cache - 1] != *l)
  ------------------
  |  Branch (2191:17): [True: 1.08k, False: 4.27k]
  |  Branch (2191:29): [True: 4.25k, False: 21]
  ------------------
 2192|  5.33k|                cache[n_cache++] = *l;
 2193|  5.36k|            l++;
 2194|  5.36k|            l_cache--;
 2195|  8.97k|        } else {
 2196|  8.97k|            if (*a == *l) {
  ------------------
  |  Branch (2196:17): [True: 3.58k, False: 5.39k]
  ------------------
 2197|  3.58k|                l++;
 2198|  3.58k|                l_cache--;
 2199|  3.58k|            }
 2200|  8.97k|            if (!n_cache || cache[n_cache - 1] != *a)
  ------------------
  |  Branch (2200:17): [True: 1.59k, False: 7.38k]
  |  Branch (2200:29): [True: 7.11k, False: 269]
  ------------------
 2201|  8.71k|                cache[n_cache++] = *a;
 2202|  8.97k|            a++;
 2203|  8.97k|            a_cache--;
 2204|  8.97k|        }
 2205|  14.3k|    }
 2206|  16.4k|    if (l_cache) {
  ------------------
  |  Branch (2206:9): [True: 4.93k, False: 11.5k]
  ------------------
 2207|  20.6k|        do {
 2208|  20.6k|            if (!n_cache || cache[n_cache - 1] != *l)
  ------------------
  |  Branch (2208:17): [True: 3.79k, False: 16.8k]
  |  Branch (2208:29): [True: 13.3k, False: 3.52k]
  ------------------
 2209|  17.1k|                cache[n_cache++] = *l;
 2210|  20.6k|            l++;
 2211|  20.6k|        } while (--l_cache > 0);
  ------------------
  |  Branch (2211:18): [True: 15.7k, False: 4.93k]
  ------------------
 2212|  11.5k|    } else if (a_cache) {
  ------------------
  |  Branch (2212:16): [True: 4.10k, False: 7.41k]
  ------------------
 2213|  16.8k|        do {
 2214|  16.8k|            if (!n_cache || cache[n_cache - 1] != *a)
  ------------------
  |  Branch (2214:17): [True: 2.96k, False: 13.8k]
  |  Branch (2214:29): [True: 10.6k, False: 3.18k]
  ------------------
 2215|  13.6k|                cache[n_cache++] = *a;
 2216|  16.8k|            a++;
 2217|  16.8k|        } while (--a_cache > 0);
  ------------------
  |  Branch (2217:18): [True: 12.7k, False: 4.10k]
  ------------------
 2218|  4.10k|    }
 2219|       |
 2220|       |    // find reused cache entries
 2221|  16.4k|    int i = 0;
 2222|  55.7k|    for (int n = 0; n < n_cache && i < pal_sz; n++)
  ------------------
  |  Branch (2222:21): [True: 41.1k, False: 14.5k]
  |  Branch (2222:36): [True: 39.3k, False: 1.85k]
  ------------------
 2223|  39.3k|        if (dav1d_msac_decode_bool_equi(&ts->msac))
  ------------------
  |  |   53|  39.3k|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
  |  Branch (2223:13): [True: 19.8k, False: 19.4k]
  ------------------
 2224|  19.8k|            used_cache[i++] = cache[n];
 2225|  16.4k|    const int n_used_cache = i;
 2226|       |
 2227|       |    // parse new entries
 2228|  16.4k|    pixel *const pal = t->frame_thread.pass ?
  ------------------
  |  Branch (2228:24): [True: 0, False: 16.4k]
  ------------------
 2229|      0|        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
 2230|      0|                            ((t->bx >> 1) + (t->by & 1))][pl] :
 2231|  16.4k|        bytefn(t->scratch.pal)[pl];
  ------------------
  |  |   87|  16.4k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  16.4k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2232|  16.4k|    if (i < pal_sz) {
  ------------------
  |  Branch (2232:9): [True: 13.8k, False: 2.57k]
  ------------------
 2233|  13.8k|        const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
  ------------------
  |  Branch (2233:25): [Folded, False: 13.8k]
  ------------------
 2234|  13.8k|        int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc);
 2235|       |
 2236|  13.8k|        if (i < pal_sz) {
  ------------------
  |  Branch (2236:13): [True: 12.4k, False: 1.46k]
  ------------------
 2237|  12.4k|            int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
 2238|  12.4k|            const int max = (1 << bpc) - 1;
 2239|       |
 2240|  28.7k|            do {
 2241|  28.7k|                const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
 2242|  28.7k|                prev = pal[i++] = imin(prev + delta + !pl, max);
 2243|  28.7k|                if (prev + !pl >= max) {
  ------------------
  |  Branch (2243:21): [True: 5.47k, False: 23.2k]
  ------------------
 2244|  15.2k|                    for (; i < pal_sz; i++)
  ------------------
  |  Branch (2244:28): [True: 9.81k, False: 5.47k]
  ------------------
 2245|  9.81k|                        pal[i] = max;
 2246|  5.47k|                    break;
 2247|  5.47k|                }
 2248|  23.2k|                bits = imin(bits, 1 + ulog2(max - prev - !pl));
 2249|  23.2k|            } while (i < pal_sz);
  ------------------
  |  Branch (2249:22): [True: 16.3k, False: 6.95k]
  ------------------
 2250|  12.4k|        }
 2251|       |
 2252|       |        // merge cache+new entries
 2253|  13.8k|        int n = 0, m = n_used_cache;
 2254|  79.0k|        for (i = 0; i < pal_sz; i++) {
  ------------------
  |  Branch (2254:21): [True: 65.1k, False: 13.8k]
  ------------------
 2255|  65.1k|            if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
  ------------------
  |  Branch (2255:17): [True: 21.8k, False: 43.3k]
  |  Branch (2255:38): [True: 4.79k, False: 17.0k]
  |  Branch (2255:53): [True: 7.96k, False: 9.06k]
  ------------------
 2256|  12.7k|                pal[i] = used_cache[n++];
 2257|  52.4k|            } else {
 2258|  52.4k|                assert(m < pal_sz);
  ------------------
  |  |  140|  52.4k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 52.4k]
  |  |  |  Branch (140:68): [Folded, False: 52.4k]
  |  |  ------------------
  ------------------
 2259|  52.4k|                pal[i] = pal[m++];
 2260|  52.4k|            }
 2261|  65.1k|        }
 2262|  13.8k|    } else {
 2263|  2.57k|        memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
 2264|  2.57k|    }
 2265|       |
 2266|  16.4k|    if (DEBUG_BLOCK_INFO) {
  ------------------
  |  |   34|  16.4k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 16.4k]
  |  |  ------------------
  |  |   35|  16.4k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  16.4k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 2267|      0|        printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
 2268|      0|               pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
 2269|      0|        for (int n = 0; n < n_cache; n++)
  ------------------
  |  Branch (2269:25): [True: 0, False: 0]
  ------------------
 2270|      0|            printf("%c%02x", n ? ' ' : '[', cache[n]);
  ------------------
  |  Branch (2270:30): [True: 0, False: 0]
  ------------------
 2271|      0|        printf("%s, pal=", n_cache ? "]" : "[]");
  ------------------
  |  Branch (2271:28): [True: 0, False: 0]
  ------------------
 2272|      0|        for (int n = 0; n < pal_sz; n++)
  ------------------
  |  Branch (2272:25): [True: 0, False: 0]
  ------------------
 2273|      0|            printf("%c%02x", n ? ' ' : '[', pal[n]);
  ------------------
  |  Branch (2273:30): [True: 0, False: 0]
  ------------------
 2274|      0|        printf("]\n");
 2275|      0|    }
 2276|  16.4k|}
dav1d_read_pal_uv_16bpc:
 2280|  3.42k|{
 2281|  3.42k|    bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4);
  ------------------
  |  |   87|  3.42k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  3.42k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2282|       |
 2283|       |    // V pal coding
 2284|  3.42k|    Dav1dTileState *const ts = t->ts;
 2285|  3.42k|    const Dav1dFrameContext *const f = t->f;
 2286|  3.42k|    pixel *const pal = t->frame_thread.pass ?
  ------------------
  |  Branch (2286:24): [True: 0, False: 3.42k]
  ------------------
 2287|      0|        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
 2288|      0|                            ((t->bx >> 1) + (t->by & 1))][2] :
 2289|  3.42k|        bytefn(t->scratch.pal)[2];
  ------------------
  |  |   87|  3.42k|#define bytefn(x) bitfn(x)
  |  |  ------------------
  |  |  |  |   77|  3.42k|#define bitfn(x) x##_16bpc
  |  |  ------------------
  ------------------
 2290|  3.42k|    const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
  ------------------
  |  Branch (2290:21): [Folded, False: 3.42k]
  ------------------
 2291|  3.42k|    if (dav1d_msac_decode_bool_equi(&ts->msac)) {
  ------------------
  |  |   53|  3.42k|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
  |  Branch (2291:9): [True: 1.88k, False: 1.54k]
  ------------------
 2292|  1.88k|        const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2);
 2293|  1.88k|        int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc);
 2294|  1.88k|        const int max = (1 << bpc) - 1;
 2295|  7.39k|        for (int i = 1; i < b->pal_sz[1]; i++) {
  ------------------
  |  Branch (2295:25): [True: 5.50k, False: 1.88k]
  ------------------
 2296|  5.50k|            int delta = dav1d_msac_decode_bools(&ts->msac, bits);
 2297|  5.50k|            if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
  ------------------
  |  |   53|  5.43k|#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_sse2
  ------------------
  |  Branch (2297:17): [True: 5.43k, False: 73]
  |  Branch (2297:26): [True: 2.64k, False: 2.79k]
  ------------------
 2298|  5.50k|            prev = pal[i] = (prev + delta) & max;
 2299|  5.50k|        }
 2300|  1.88k|    } else {
 2301|  7.84k|        for (int i = 0; i < b->pal_sz[1]; i++)
  ------------------
  |  Branch (2301:25): [True: 6.29k, False: 1.54k]
  ------------------
 2302|  6.29k|            pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc);
 2303|  1.54k|    }
 2304|  3.42k|    if (DEBUG_BLOCK_INFO) {
  ------------------
  |  |   34|  3.42k|#define DEBUG_BLOCK_INFO 0 && \
  |  |  ------------------
  |  |  |  Branch (34:26): [Folded, False: 3.42k]
  |  |  ------------------
  |  |   35|  3.42k|        f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
  |  |  ------------------
  |  |  |  Branch (35:9): [True: 0, False: 0]
  |  |  |  Branch (35:44): [True: 0, False: 0]
  |  |  |  Branch (35:58): [True: 0, False: 0]
  |  |  ------------------
  |  |   36|  3.42k|        t->bx >= 8 && t->bx < 12
  |  |  ------------------
  |  |  |  Branch (36:9): [True: 0, False: 0]
  |  |  |  Branch (36:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 2305|      0|        printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
 2306|      0|        for (int n = 0; n < b->pal_sz[1]; n++)
  ------------------
  |  Branch (2306:25): [True: 0, False: 0]
  ------------------
 2307|      0|            printf("%c%02x", n ? ' ' : '[', pal[n]);
  ------------------
  |  Branch (2307:30): [True: 0, False: 0]
  ------------------
 2308|      0|        printf("]\n");
 2309|      0|    }
 2310|  3.42k|}

dav1d_ref_create:
   37|     27|Dav1dRef *dav1d_ref_create(const enum AllocationType type, size_t size) {
   38|     27|    size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
   39|       |
   40|     27|    uint8_t *const data = dav1d_alloc_aligned(type, size + sizeof(Dav1dRef), 64);
  ------------------
  |  |  134|     27|#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
  ------------------
   41|     27|    if (!data) return NULL;
  ------------------
  |  Branch (41:9): [True: 0, False: 27]
  ------------------
   42|       |
   43|     27|    Dav1dRef *const res = (Dav1dRef*)(data + size);
   44|     27|    res->const_data = res->user_data = res->data = data;
   45|     27|    atomic_init(&res->ref_cnt, 1);
   46|     27|    res->free_ref = 0;
   47|     27|    res->free_callback = default_free_callback;
   48|       |
   49|     27|    return res;
   50|     27|}
dav1d_ref_create_using_pool:
   56|  61.9k|Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *const pool, size_t size) {
   57|  61.9k|    size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
   58|       |
   59|  61.9k|    Dav1dMemPoolBuffer *const buf =
   60|  61.9k|        dav1d_mem_pool_pop(pool, size + sizeof(Dav1dRef));
   61|  61.9k|    if (!buf) return NULL;
  ------------------
  |  Branch (61:9): [True: 0, False: 61.9k]
  ------------------
   62|       |
   63|  61.9k|    Dav1dRef *const res = &((Dav1dRef*)buf)[-1];
   64|  61.9k|    res->data = buf->data;
   65|  61.9k|    res->const_data = pool;
   66|  61.9k|    atomic_init(&res->ref_cnt, 1);
   67|  61.9k|    res->free_ref = 0;
   68|  61.9k|    res->free_callback = pool_free_callback;
   69|  61.9k|    res->user_data = buf;
   70|       |
   71|  61.9k|    return res;
   72|  61.9k|}
dav1d_ref_dec:
   74|  4.78M|void dav1d_ref_dec(Dav1dRef **const pref) {
   75|  4.78M|    assert(pref != NULL);
  ------------------
  |  |  140|  4.78M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 4.78M]
  |  |  |  Branch (140:68): [Folded, False: 4.78M]
  |  |  ------------------
  ------------------
   76|       |
   77|  4.78M|    Dav1dRef *const ref = *pref;
   78|  4.78M|    if (!ref) return;
  ------------------
  |  Branch (78:9): [True: 3.80M, False: 974k]
  ------------------
   79|       |
   80|   974k|    *pref = NULL;
   81|   974k|    if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) {
  ------------------
  |  Branch (81:9): [True: 105k, False: 868k]
  ------------------
   82|   105k|        const int free_ref = ref->free_ref;
   83|   105k|        ref->free_callback(ref->const_data, ref->user_data);
   84|   105k|        if (free_ref) dav1d_free(ref);
  ------------------
  |  |  135|  20.8k|#define dav1d_free(ptr) free(ptr)
  ------------------
  |  Branch (84:13): [True: 20.8k, False: 85.0k]
  ------------------
   85|   105k|    }
   86|   974k|}
ref.c:default_free_callback:
   32|     27|static void default_free_callback(const uint8_t *const data, void *const user_data) {
   33|     27|    assert(data == user_data);
  ------------------
  |  |  140|     27|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 27]
  |  |  |  Branch (140:68): [Folded, False: 27]
  |  |  ------------------
  ------------------
   34|     27|    dav1d_free_aligned(user_data);
  ------------------
  |  |  136|     27|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
   35|     27|}
ref.c:pool_free_callback:
   52|  61.9k|static void pool_free_callback(const uint8_t *const data, void *const user_data) {
   53|  61.9k|    dav1d_mem_pool_push((Dav1dMemPool*)data, user_data);
   54|  61.9k|}

obu.c:dav1d_ref_init:
   59|      5|{
   60|      5|    ref->data = NULL;
   61|      5|    ref->const_data = ptr;
   62|       |    atomic_init(&ref->ref_cnt, 1);
   63|      5|    ref->free_ref = free_ref;
   64|      5|    ref->free_callback = free_callback;
   65|      5|    ref->user_data = user_data;
   66|      5|    return ref;
   67|      5|}
obu.c:dav1d_ref_inc:
   69|    112|static inline void dav1d_ref_inc(Dav1dRef *const ref) {
   70|       |    atomic_fetch_add_explicit(&ref->ref_cnt, 1, memory_order_relaxed);
   71|    112|}
picture.c:dav1d_ref_inc:
   69|   699k|static inline void dav1d_ref_inc(Dav1dRef *const ref) {
   70|       |    atomic_fetch_add_explicit(&ref->ref_cnt, 1, memory_order_relaxed);
   71|   699k|}
picture.c:dav1d_ref_init:
   59|  22.9k|{
   60|  22.9k|    ref->data = NULL;
   61|  22.9k|    ref->const_data = ptr;
   62|       |    atomic_init(&ref->ref_cnt, 1);
   63|  22.9k|    ref->free_ref = free_ref;
   64|  22.9k|    ref->free_callback = free_callback;
   65|  22.9k|    ref->user_data = user_data;
   66|  22.9k|    return ref;
   67|  22.9k|}
cdf.c:dav1d_ref_inc:
   69|  53.9k|static inline void dav1d_ref_inc(Dav1dRef *const ref) {
   70|       |    atomic_fetch_add_explicit(&ref->ref_cnt, 1, memory_order_relaxed);
   71|  53.9k|}
data.c:dav1d_ref_init:
   59|  20.8k|{
   60|  20.8k|    ref->data = NULL;
   61|  20.8k|    ref->const_data = ptr;
   62|       |    atomic_init(&ref->ref_cnt, 1);
   63|  20.8k|    ref->free_ref = free_ref;
   64|  20.8k|    ref->free_callback = free_callback;
   65|  20.8k|    ref->user_data = user_data;
   66|  20.8k|    return ref;
   67|  20.8k|}
data.c:dav1d_ref_inc:
   69|  42.4k|static inline void dav1d_ref_inc(Dav1dRef *const ref) {
   70|       |    atomic_fetch_add_explicit(&ref->ref_cnt, 1, memory_order_relaxed);
   71|  42.4k|}
decode.c:dav1d_ref_inc:
   69|  73.1k|static inline void dav1d_ref_inc(Dav1dRef *const ref) {
   70|       |    atomic_fetch_add_explicit(&ref->ref_cnt, 1, memory_order_relaxed);
   71|  73.1k|}

dav1d_refmvs_find:
  354|  1.04M|{
  355|  1.04M|    const refmvs_frame *const rf = rt->rf;
  356|  1.04M|    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
  357|  1.04M|    const int bw4 = b_dim[0], w4 = imin(imin(bw4, 16), rt->tile_col.end - bx4);
  358|  1.04M|    const int bh4 = b_dim[1], h4 = imin(imin(bh4, 16), rt->tile_row.end - by4);
  359|  1.04M|    mv gmv[2], tgmv[2];
  360|       |
  361|  1.04M|    *cnt = 0;
  362|  1.04M|    assert(ref.ref[0] >=  0 && ref.ref[0] <= 8 &&
  ------------------
  |  |  140|  6.26M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 1.04M, False: 2.75k]
  |  |  |  Branch (140:30): [True: 1.04M, False: 2.30k]
  |  |  |  Branch (140:30): [True: 1.04M, False: 18.4E]
  |  |  |  Branch (140:30): [True: 1.04M, False: 18.4E]
  |  |  |  Branch (140:68): [Folded, False: 1.04M]
  |  |  ------------------
  ------------------
  363|  1.04M|           ref.ref[1] >= -1 && ref.ref[1] <= 8);
  364|  1.04M|    if (ref.ref[0] > 0) {
  ------------------
  |  Branch (364:9): [True: 85.8k, False: 960k]
  ------------------
  365|  85.8k|        tgmv[0] = get_gmv_2d(&rf->frm_hdr->gmv[ref.ref[0] - 1],
  366|  85.8k|                             bx4, by4, bw4, bh4, rf->frm_hdr);
  367|  85.8k|        gmv[0] = rf->frm_hdr->gmv[ref.ref[0] - 1].type > DAV1D_WM_TYPE_TRANSLATION ?
  ------------------
  |  Branch (367:18): [True: 11.0k, False: 74.7k]
  ------------------
  368|  74.7k|                 tgmv[0] : (mv) { .n = INVALID_MV };
  ------------------
  |  |   40|  74.7k|#define INVALID_MV 0x80008000
  ------------------
  369|   960k|    } else {
  370|   960k|        tgmv[0] = (mv) { .n = 0 };
  371|   960k|        gmv[0] = (mv) { .n = INVALID_MV };
  ------------------
  |  |   40|   960k|#define INVALID_MV 0x80008000
  ------------------
  372|   960k|    }
  373|  1.04M|    if (ref.ref[1] > 0) {
  ------------------
  |  Branch (373:9): [True: 15.1k, False: 1.03M]
  ------------------
  374|  15.1k|        tgmv[1] = get_gmv_2d(&rf->frm_hdr->gmv[ref.ref[1] - 1],
  375|  15.1k|                             bx4, by4, bw4, bh4, rf->frm_hdr);
  376|  15.1k|        gmv[1] = rf->frm_hdr->gmv[ref.ref[1] - 1].type > DAV1D_WM_TYPE_TRANSLATION ?
  ------------------
  |  Branch (376:18): [True: 2.26k, False: 12.8k]
  ------------------
  377|  12.8k|                 tgmv[1] : (mv) { .n = INVALID_MV };
  ------------------
  |  |   40|  12.8k|#define INVALID_MV 0x80008000
  ------------------
  378|  15.1k|    }
  379|       |
  380|       |    // top
  381|  1.04M|    int have_newmv = 0, have_col_mvs = 0, have_row_mvs = 0;
  382|  1.04M|    unsigned max_rows = 0, n_rows = ~0;
  383|  1.04M|    const refmvs_block *b_top;
  384|  1.04M|    if (by4 > rt->tile_row.start) {
  ------------------
  |  Branch (384:9): [True: 550k, False: 496k]
  ------------------
  385|   550k|        max_rows = imin((by4 - rt->tile_row.start + 1) >> 1, 2 + (bh4 > 1));
  386|   550k|        b_top = &rt->r[(by4 & 31) - 1 + 5][bx4];
  387|   550k|        n_rows = scan_row(mvstack, cnt, ref, gmv, b_top,
  388|   550k|                          bw4, w4, max_rows, bw4 >= 16 ? 4 : 1,
  ------------------
  |  Branch (388:46): [True: 2.01k, False: 548k]
  ------------------
  389|   550k|                          &have_newmv, &have_row_mvs);
  390|   550k|    }
  391|       |
  392|       |    // left
  393|  1.04M|    unsigned max_cols = 0, n_cols = ~0U;
  394|  1.04M|    refmvs_block *const *b_left;
  395|  1.04M|    if (bx4 > rt->tile_col.start) {
  ------------------
  |  Branch (395:9): [True: 1.01M, False: 34.3k]
  ------------------
  396|  1.01M|        max_cols = imin((bx4 - rt->tile_col.start + 1) >> 1, 2 + (bw4 > 1));
  397|  1.01M|        b_left = &rt->r[(by4 & 31) + 5];
  398|  1.01M|        n_cols = scan_col(mvstack, cnt, ref, gmv, b_left,
  399|  1.01M|                          bh4, h4, bx4 - 1, max_cols, bh4 >= 16 ? 4 : 1,
  ------------------
  |  Branch (399:55): [True: 6.67k, False: 1.00M]
  ------------------
  400|  1.01M|                          &have_newmv, &have_col_mvs);
  401|  1.01M|    }
  402|       |
  403|       |    // top/right
  404|  1.04M|    if (n_rows != ~0U && edge_flags & EDGE_I444_TOP_HAS_RIGHT &&
  ------------------
  |  Branch (404:9): [True: 539k, False: 506k]
  |  Branch (404:26): [True: 286k, False: 253k]
  ------------------
  405|   286k|        imax(bw4, bh4) <= 16 && bw4 + bx4 < rt->tile_col.end)
  ------------------
  |  Branch (405:9): [True: 286k, False: 436]
  |  Branch (405:33): [True: 267k, False: 19.0k]
  ------------------
  406|   267k|    {
  407|   267k|        add_spatial_candidate(mvstack, cnt, 4, &b_top[bw4], ref, gmv,
  408|   267k|                              &have_newmv, &have_row_mvs);
  409|   267k|    }
  410|       |
  411|  1.04M|    const int nearest_match = have_col_mvs + have_row_mvs;
  412|  1.04M|    const int nearest_cnt = *cnt;
  413|  2.69M|    for (int n = 0; n < nearest_cnt; n++)
  ------------------
  |  Branch (413:21): [True: 1.65M, False: 1.04M]
  ------------------
  414|  1.65M|        mvstack[n].weight += 640;
  415|       |
  416|       |    // temporal
  417|  1.04M|    int globalmv_ctx = rf->frm_hdr->use_ref_frame_mvs;
  418|  1.04M|    if (rf->use_ref_frame_mvs) {
  ------------------
  |  Branch (418:9): [True: 10.8k, False: 1.03M]
  ------------------
  419|  10.8k|        const ptrdiff_t stride = rf->rp_stride;
  420|  10.8k|        const int by8 = by4 >> 1, bx8 = bx4 >> 1;
  421|  10.8k|        const refmvs_temporal_block *const rbi = &rt->rp_proj[(by8 & 15) * stride + bx8];
  422|  10.8k|        const refmvs_temporal_block *rb = rbi;
  423|  10.8k|        const int step_h = bw4 >= 16 ? 2 : 1, step_v = bh4 >= 16 ? 2 : 1;
  ------------------
  |  Branch (423:28): [True: 952, False: 9.92k]
  |  Branch (423:56): [True: 1.09k, False: 9.78k]
  ------------------
  424|  10.8k|        const int w8 = imin((w4 + 1) >> 1, 8), h8 = imin((h4 + 1) >> 1, 8);
  425|  33.0k|        for (int y = 0; y < h8; y += step_v) {
  ------------------
  |  Branch (425:25): [True: 22.1k, False: 10.8k]
  ------------------
  426|  68.1k|            for (int x = 0; x < w8; x+= step_h) {
  ------------------
  |  Branch (426:29): [True: 46.0k, False: 22.1k]
  ------------------
  427|  46.0k|                add_temporal_candidate(rf, mvstack, cnt, &rb[x], ref,
  428|  46.0k|                                       !(x | y) ? &globalmv_ctx : NULL, tgmv);
  ------------------
  |  Branch (428:40): [True: 10.8k, False: 35.1k]
  ------------------
  429|  46.0k|            }
  430|  22.1k|            rb += stride * step_v;
  431|  22.1k|        }
  432|  10.8k|        if (imin(bw4, bh4) >= 2 && imax(bw4, bh4) < 16) {
  ------------------
  |  Branch (432:13): [True: 7.79k, False: 3.07k]
  |  Branch (432:36): [True: 6.47k, False: 1.32k]
  ------------------
  433|  6.47k|            const int bh8 = bh4 >> 1, bw8 = bw4 >> 1;
  434|  6.47k|            rb = &rbi[bh8 * stride];
  435|  6.47k|            const int has_bottom = by8 + bh8 < imin(rt->tile_row.end >> 1,
  436|  6.47k|                                                    (by8 & ~7) + 8);
  437|  6.47k|            if (has_bottom && bx8 - 1 >= imax(rt->tile_col.start >> 1, bx8 & ~7)) {
  ------------------
  |  Branch (437:17): [True: 4.56k, False: 1.90k]
  |  Branch (437:31): [True: 3.19k, False: 1.36k]
  ------------------
  438|  3.19k|                add_temporal_candidate(rf, mvstack, cnt, &rb[-1], ref,
  439|  3.19k|                                       NULL, NULL);
  440|  3.19k|            }
  441|  6.47k|            if (bx8 + bw8 < imin(rt->tile_col.end >> 1, (bx8 & ~7) + 8)) {
  ------------------
  |  Branch (441:17): [True: 4.44k, False: 2.02k]
  ------------------
  442|  4.44k|                if (has_bottom) {
  ------------------
  |  Branch (442:21): [True: 3.18k, False: 1.26k]
  ------------------
  443|  3.18k|                    add_temporal_candidate(rf, mvstack, cnt, &rb[bw8], ref,
  444|  3.18k|                                           NULL, NULL);
  445|  3.18k|                }
  446|  4.44k|                if (by8 + bh8 - 1 < imin(rt->tile_row.end >> 1, (by8 & ~7) + 8)) {
  ------------------
  |  Branch (446:21): [True: 4.41k, False: 32]
  ------------------
  447|  4.41k|                    add_temporal_candidate(rf, mvstack, cnt, &rb[bw8 - stride],
  448|  4.41k|                                           ref, NULL, NULL);
  449|  4.41k|                }
  450|  4.44k|            }
  451|  6.47k|        }
  452|  10.8k|    }
  453|  1.04M|    assert(*cnt <= 8);
  ------------------
  |  |  140|  1.04M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1.04M]
  |  |  |  Branch (140:68): [Folded, False: 1.04M]
  |  |  ------------------
  ------------------
  454|       |
  455|       |    // top/left (which, confusingly, is part of "secondary" references)
  456|  1.04M|    int have_dummy_newmv_match;
  457|  1.04M|    if ((n_rows | n_cols) != ~0U) {
  ------------------
  |  Branch (457:9): [True: 520k, False: 525k]
  ------------------
  458|   520k|        add_spatial_candidate(mvstack, cnt, 4, &b_top[-1], ref, gmv,
  459|   520k|                              &have_dummy_newmv_match, &have_row_mvs);
  460|   520k|    }
  461|       |
  462|       |    // "secondary" (non-direct neighbour) top & left edges
  463|       |    // what is different about secondary is that everything is now in 8x8 resolution
  464|  3.04M|    for (int n = 2; n <= 3; n++) {
  ------------------
  |  Branch (464:21): [True: 1.99M, False: 1.04M]
  ------------------
  465|  1.99M|        if ((unsigned) n > n_rows && (unsigned) n <= max_rows) {
  ------------------
  |  Branch (465:13): [True: 1.01M, False: 979k]
  |  Branch (465:38): [True: 116k, False: 898k]
  ------------------
  466|   116k|            n_rows += scan_row(mvstack, cnt, ref, gmv,
  467|   116k|                               &rt->r[(((by4 & 31) - 2 * n + 1) | 1) + 5][bx4 | 1],
  468|   116k|                               bw4, w4, 1 + max_rows - n, bw4 >= 16 ? 4 : 2,
  ------------------
  |  Branch (468:59): [True: 1.68k, False: 114k]
  ------------------
  469|   116k|                               &have_dummy_newmv_match, &have_row_mvs);
  470|   116k|        }
  471|       |
  472|  1.99M|        if ((unsigned) n > n_cols && (unsigned) n <= max_cols) {
  ------------------
  |  Branch (472:13): [True: 1.87M, False: 120k]
  |  Branch (472:38): [True: 1.04M, False: 832k]
  ------------------
  473|  1.04M|            n_cols += scan_col(mvstack, cnt, ref, gmv, &rt->r[((by4 & 31) | 1) + 5],
  474|  1.04M|                               bh4, h4, (bx4 - n * 2 + 1) | 1,
  475|  1.04M|                               1 + max_cols - n, bh4 >= 16 ? 4 : 2,
  ------------------
  |  Branch (475:50): [True: 6.15k, False: 1.03M]
  ------------------
  476|  1.04M|                               &have_dummy_newmv_match, &have_col_mvs);
  477|  1.04M|        }
  478|  1.99M|    }
  479|  1.04M|    assert(*cnt <= 8);
  ------------------
  |  |  140|  1.04M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1.04M]
  |  |  |  Branch (140:68): [Folded, False: 1.04M]
  |  |  ------------------
  ------------------
  480|       |
  481|  1.04M|    const int ref_match_count = have_col_mvs + have_row_mvs;
  482|       |
  483|       |    // context build-up
  484|  1.04M|    int refmv_ctx, newmv_ctx;
  485|  1.04M|    switch (nearest_match) {
  ------------------
  |  Branch (485:13): [True: 1.03M, False: 10.6k]
  ------------------
  486|  46.3k|    case 0:
  ------------------
  |  Branch (486:5): [True: 46.3k, False: 999k]
  ------------------
  487|  46.3k|        refmv_ctx = imin(2, ref_match_count);
  488|  46.3k|        newmv_ctx = ref_match_count > 0;
  489|  46.3k|        break;
  490|   514k|    case 1:
  ------------------
  |  Branch (490:5): [True: 514k, False: 531k]
  ------------------
  491|   514k|        refmv_ctx = imin(ref_match_count * 3, 4);
  492|   514k|        newmv_ctx = 3 - have_newmv;
  493|   514k|        break;
  494|   474k|    case 2:
  ------------------
  |  Branch (494:5): [True: 474k, False: 571k]
  ------------------
  495|   474k|        refmv_ctx = 5;
  496|   474k|        newmv_ctx = 5 - have_newmv;
  497|   474k|        break;
  498|  1.04M|    }
  499|       |
  500|       |    // sorting (nearest, then "secondary")
  501|  1.01M|    int len = nearest_cnt;
  502|  2.41M|    while (len) {
  ------------------
  |  Branch (502:12): [True: 1.40M, False: 1.01M]
  ------------------
  503|  1.40M|        int last = 0;
  504|  2.29M|        for (int n = 1; n < len; n++) {
  ------------------
  |  Branch (504:25): [True: 894k, False: 1.40M]
  ------------------
  505|   894k|            if (mvstack[n - 1].weight < mvstack[n].weight) {
  ------------------
  |  Branch (505:17): [True: 447k, False: 446k]
  ------------------
  506|   447k|#define EXCHANGE(a, b) do { refmvs_candidate tmp = a; a = b; b = tmp; } while (0)
  507|   447k|                EXCHANGE(mvstack[n - 1], mvstack[n]);
  ------------------
  |  |  506|   447k|#define EXCHANGE(a, b) do { refmvs_candidate tmp = a; a = b; b = tmp; } while (0)
  |  |  ------------------
  |  |  |  Branch (506:80): [Folded, False: 447k]
  |  |  ------------------
  ------------------
  508|   447k|                last = n;
  509|   447k|            }
  510|   894k|        }
  511|  1.40M|        len = last;
  512|  1.40M|    }
  513|  1.01M|    len = *cnt;
  514|  1.94M|    while (len > nearest_cnt) {
  ------------------
  |  Branch (514:12): [True: 929k, False: 1.01M]
  ------------------
  515|   929k|        int last = nearest_cnt;
  516|  1.41M|        for (int n = nearest_cnt + 1; n < len; n++) {
  ------------------
  |  Branch (516:39): [True: 484k, False: 929k]
  ------------------
  517|   484k|            if (mvstack[n - 1].weight < mvstack[n].weight) {
  ------------------
  |  Branch (517:17): [True: 8.97k, False: 475k]
  ------------------
  518|  8.97k|                EXCHANGE(mvstack[n - 1], mvstack[n]);
  ------------------
  |  |  506|  8.97k|#define EXCHANGE(a, b) do { refmvs_candidate tmp = a; a = b; b = tmp; } while (0)
  |  |  ------------------
  |  |  |  Branch (506:80): [Folded, False: 8.97k]
  |  |  ------------------
  ------------------
  519|  8.97k|#undef EXCHANGE
  520|  8.97k|                last = n;
  521|  8.97k|            }
  522|   484k|        }
  523|   929k|        len = last;
  524|   929k|    }
  525|       |
  526|  1.01M|    if (ref.ref[1] > 0) {
  ------------------
  |  Branch (526:9): [True: 15.1k, False: 1.00M]
  ------------------
  527|  15.1k|        if (*cnt < 2) {
  ------------------
  |  Branch (527:13): [True: 11.4k, False: 3.72k]
  ------------------
  528|  11.4k|            const int sign0 = rf->sign_bias[ref.ref[0] - 1];
  529|  11.4k|            const int sign1 = rf->sign_bias[ref.ref[1] - 1];
  530|  11.4k|            const int sz4 = imin(w4, h4);
  531|  11.4k|            refmvs_candidate *const same = &mvstack[*cnt];
  532|  11.4k|            int same_count[4] = { 0 };
  533|       |
  534|       |            // non-self references in top
  535|  16.2k|            if (n_rows != ~0U) for (int x = 0; x < sz4;) {
  ------------------
  |  Branch (535:17): [True: 7.80k, False: 3.60k]
  |  Branch (535:48): [True: 8.41k, False: 7.80k]
  ------------------
  536|  8.41k|                const refmvs_block *const cand_b = &b_top[x];
  537|  8.41k|                add_compound_extended_candidate(same, same_count, cand_b,
  538|  8.41k|                                                sign0, sign1, ref, rf->sign_bias);
  539|  8.41k|                x += dav1d_block_dimensions[cand_b->bs][0];
  540|  8.41k|            }
  541|       |
  542|       |            // non-self references in left
  543|  21.1k|            if (n_cols != ~0U) for (int y = 0; y < sz4;) {
  ------------------
  |  Branch (543:17): [True: 9.96k, False: 1.44k]
  |  Branch (543:48): [True: 11.2k, False: 9.96k]
  ------------------
  544|  11.2k|                const refmvs_block *const cand_b = &b_left[y][bx4 - 1];
  545|  11.2k|                add_compound_extended_candidate(same, same_count, cand_b,
  546|  11.2k|                                                sign0, sign1, ref, rf->sign_bias);
  547|  11.2k|                y += dav1d_block_dimensions[cand_b->bs][1];
  548|  11.2k|            }
  549|       |
  550|  11.4k|            refmvs_candidate *const diff = &same[2];
  551|  11.4k|            const int *const diff_count = &same_count[2];
  552|       |
  553|       |            // merge together
  554|  34.2k|            for (int n = 0; n < 2; n++) {
  ------------------
  |  Branch (554:29): [True: 22.8k, False: 11.4k]
  ------------------
  555|  22.8k|                int m = same_count[n];
  556|       |
  557|  22.8k|                if (m >= 2) continue;
  ------------------
  |  Branch (557:21): [True: 5.17k, False: 17.6k]
  ------------------
  558|       |
  559|  17.6k|                const int l = diff_count[n];
  560|  17.6k|                if (l) {
  ------------------
  |  Branch (560:21): [True: 15.0k, False: 2.55k]
  ------------------
  561|  15.0k|                    same[m].mv.mv[n] = diff[0].mv.mv[n];
  562|  15.0k|                    if (++m == 2) continue;
  ------------------
  |  Branch (562:25): [True: 9.48k, False: 5.60k]
  ------------------
  563|  5.60k|                    if (l == 2) {
  ------------------
  |  Branch (563:25): [True: 4.05k, False: 1.55k]
  ------------------
  564|  4.05k|                        same[1].mv.mv[n] = diff[1].mv.mv[n];
  565|  4.05k|                        continue;
  566|  4.05k|                    }
  567|  5.60k|                }
  568|  5.66k|                do {
  569|  5.66k|                    same[m].mv.mv[n] = tgmv[n];
  570|  5.66k|                } while (++m < 2);
  ------------------
  |  Branch (570:26): [True: 1.55k, False: 4.10k]
  ------------------
  571|  4.10k|            }
  572|       |
  573|       |            // if the first extended was the same as the non-extended one,
  574|       |            // then replace it with the second extended one
  575|  11.4k|            int n = *cnt;
  576|  11.4k|            if (n == 1 && mvstack[0].mv.n == same[0].mv.n)
  ------------------
  |  Branch (576:17): [True: 5.54k, False: 5.86k]
  |  Branch (576:27): [True: 4.24k, False: 1.30k]
  ------------------
  577|  4.24k|                mvstack[1].mv = mvstack[2].mv;
  578|  17.2k|            do {
  579|  17.2k|                mvstack[n].weight = 2;
  580|  17.2k|            } while (++n < 2);
  ------------------
  |  Branch (580:22): [True: 5.86k, False: 11.4k]
  ------------------
  581|  11.4k|            *cnt = 2;
  582|  11.4k|        }
  583|       |
  584|       |        // clamping
  585|  15.1k|        const int left = -(bx4 + bw4 + 4) * 4 * 8;
  586|  15.1k|        const int right = (rf->iw4 - bx4 + 4) * 4 * 8;
  587|  15.1k|        const int top = -(by4 + bh4 + 4) * 4 * 8;
  588|  15.1k|        const int bottom = (rf->ih4 - by4 + 4) * 4 * 8;
  589|       |
  590|  15.1k|        const int n_refmvs = *cnt;
  591|  15.1k|        int n = 0;
  592|  32.7k|        do {
  593|  32.7k|            mvstack[n].mv.mv[0].x = iclip(mvstack[n].mv.mv[0].x, left, right);
  594|  32.7k|            mvstack[n].mv.mv[0].y = iclip(mvstack[n].mv.mv[0].y, top, bottom);
  595|  32.7k|            mvstack[n].mv.mv[1].x = iclip(mvstack[n].mv.mv[1].x, left, right);
  596|  32.7k|            mvstack[n].mv.mv[1].y = iclip(mvstack[n].mv.mv[1].y, top, bottom);
  597|  32.7k|        } while (++n < n_refmvs);
  ------------------
  |  Branch (597:18): [True: 17.6k, False: 15.1k]
  ------------------
  598|       |
  599|  15.1k|        switch (refmv_ctx >> 1) {
  ------------------
  |  Branch (599:17): [True: 15.1k, False: 18.4E]
  ------------------
  600|  6.53k|        case 0:
  ------------------
  |  Branch (600:9): [True: 6.53k, False: 8.60k]
  ------------------
  601|  6.53k|            *ctx = imin(newmv_ctx, 1);
  602|  6.53k|            break;
  603|  5.68k|        case 1:
  ------------------
  |  Branch (603:9): [True: 5.68k, False: 9.45k]
  ------------------
  604|  5.68k|            *ctx = 1 + imin(newmv_ctx, 3);
  605|  5.68k|            break;
  606|  2.92k|        case 2:
  ------------------
  |  Branch (606:9): [True: 2.92k, False: 12.2k]
  ------------------
  607|  2.92k|            *ctx = iclip(3 + newmv_ctx, 4, 7);
  608|  2.92k|            break;
  609|  15.1k|        }
  610|       |
  611|  15.1k|        return;
  612|  1.00M|    } else if (*cnt < 2 && ref.ref[0] > 0) {
  ------------------
  |  Branch (612:16): [True: 83.6k, False: 918k]
  |  Branch (612:28): [True: 42.9k, False: 40.6k]
  ------------------
  613|  42.9k|        const int sign = rf->sign_bias[ref.ref[0] - 1];
  614|  42.9k|        const int sz4 = imin(w4, h4);
  615|       |
  616|       |        // non-self references in top
  617|  60.1k|        if (n_rows != ~0U) for (int x = 0; x < sz4 && *cnt < 2;) {
  ------------------
  |  Branch (617:13): [True: 29.3k, False: 13.6k]
  |  Branch (617:44): [True: 30.8k, False: 29.2k]
  |  Branch (617:55): [True: 30.7k, False: 148]
  ------------------
  618|  30.7k|            const refmvs_block *const cand_b = &b_top[x];
  619|  30.7k|            add_single_extended_candidate(mvstack, cnt, cand_b, sign, rf->sign_bias);
  620|  30.7k|            x += dav1d_block_dimensions[cand_b->bs][0];
  621|  30.7k|        }
  622|       |
  623|       |        // non-self references in left
  624|  69.3k|        if (n_cols != ~0U) for (int y = 0; y < sz4 && *cnt < 2;) {
  ------------------
  |  Branch (624:13): [True: 35.8k, False: 7.15k]
  |  Branch (624:44): [True: 38.0k, False: 31.3k]
  |  Branch (624:55): [True: 33.5k, False: 4.49k]
  ------------------
  625|  33.5k|            const refmvs_block *const cand_b = &b_left[y][bx4 - 1];
  626|  33.5k|            add_single_extended_candidate(mvstack, cnt, cand_b, sign, rf->sign_bias);
  627|  33.5k|            y += dav1d_block_dimensions[cand_b->bs][1];
  628|  33.5k|        }
  629|  42.9k|    }
  630|  1.00M|    assert(*cnt <= 8);
  ------------------
  |  |  140|  1.00M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 1.00M]
  |  |  |  Branch (140:68): [Folded, False: 1.00M]
  |  |  ------------------
  ------------------
  631|       |
  632|       |    // clamping
  633|  1.00M|    int n_refmvs = *cnt;
  634|  1.00M|    if (n_refmvs) {
  ------------------
  |  Branch (634:9): [True: 982k, False: 20.0k]
  ------------------
  635|   982k|        const int left = -(bx4 + bw4 + 4) * 4 * 8;
  636|   982k|        const int right = (rf->iw4 - bx4 + 4) * 4 * 8;
  637|   982k|        const int top = -(by4 + bh4 + 4) * 4 * 8;
  638|   982k|        const int bottom = (rf->ih4 - by4 + 4) * 4 * 8;
  639|       |
  640|   982k|        int n = 0;
  641|  3.04M|        do {
  642|  3.04M|            mvstack[n].mv.mv[0].x = iclip(mvstack[n].mv.mv[0].x, left, right);
  643|  3.04M|            mvstack[n].mv.mv[0].y = iclip(mvstack[n].mv.mv[0].y, top, bottom);
  644|  3.04M|        } while (++n < n_refmvs);
  ------------------
  |  Branch (644:18): [True: 2.06M, False: 982k]
  ------------------
  645|   982k|    }
  646|       |
  647|  1.10M|    for (int n = *cnt; n < 2; n++)
  ------------------
  |  Branch (647:24): [True: 100k, False: 1.00M]
  ------------------
  648|   100k|        mvstack[n].mv.mv[0] = tgmv[0];
  649|       |
  650|  1.00M|    *ctx = (refmv_ctx << 4) | (globalmv_ctx << 3) | newmv_ctx;
  651|  1.00M|}
dav1d_refmvs_tile_sbrow_init:
  657|  48.2k|{
  658|  48.2k|    if (rf->n_tile_threads == 1) tile_row_idx = 0;
  ------------------
  |  Branch (658:9): [True: 2.74k, False: 45.4k]
  ------------------
  659|  48.2k|    rt->rp_proj = &rf->rp_proj[16 * rf->rp_stride * tile_row_idx];
  660|  48.2k|    const ptrdiff_t r_stride = rf->rp_stride * 2;
  661|  48.2k|    const ptrdiff_t pass_off = (rf->n_frame_threads > 1 && pass == 2) ?
  ------------------
  |  Branch (661:33): [True: 0, False: 48.2k]
  |  Branch (661:60): [True: 0, False: 0]
  ------------------
  662|  48.2k|        35 * 2 * rf->n_blocks : 0;
  663|  48.2k|    refmvs_block *r = &rf->r[35 * r_stride * tile_row_idx + pass_off];
  664|  48.2k|    const int sbsz = rf->sbsz;
  665|  48.2k|    const int off = (sbsz * sby) & 16;
  666|  1.22M|    for (int i = 0; i < sbsz; i++, r += r_stride)
  ------------------
  |  Branch (666:21): [True: 1.17M, False: 48.2k]
  ------------------
  667|  1.17M|        rt->r[off + 5 + i] = r;
  668|  48.2k|    rt->r[off + 0] = r;
  669|  48.2k|    r += r_stride;
  670|  48.2k|    rt->r[off + 1] = NULL;
  671|  48.2k|    rt->r[off + 2] = r;
  672|  48.2k|    r += r_stride;
  673|  48.2k|    rt->r[off + 3] = NULL;
  674|  48.2k|    rt->r[off + 4] = r;
  675|  48.2k|    if (sby & 1) {
  ------------------
  |  Branch (675:9): [True: 19.4k, False: 28.7k]
  ------------------
  676|  19.4k|#define EXCHANGE(a, b) do { void *const tmp = a; a = b; b = tmp; } while (0)
  677|  19.4k|        EXCHANGE(rt->r[off + 0], rt->r[off + sbsz + 0]);
  ------------------
  |  |  676|  19.4k|#define EXCHANGE(a, b) do { void *const tmp = a; a = b; b = tmp; } while (0)
  |  |  ------------------
  |  |  |  Branch (676:75): [Folded, False: 19.4k]
  |  |  ------------------
  ------------------
  678|  19.4k|        EXCHANGE(rt->r[off + 2], rt->r[off + sbsz + 2]);
  ------------------
  |  |  676|  19.4k|#define EXCHANGE(a, b) do { void *const tmp = a; a = b; b = tmp; } while (0)
  |  |  ------------------
  |  |  |  Branch (676:75): [Folded, False: 19.4k]
  |  |  ------------------
  ------------------
  679|  19.4k|        EXCHANGE(rt->r[off + 4], rt->r[off + sbsz + 4]);
  ------------------
  |  |  676|  19.4k|#define EXCHANGE(a, b) do { void *const tmp = a; a = b; b = tmp; } while (0)
  |  |  ------------------
  |  |  |  Branch (676:75): [Folded, False: 19.4k]
  |  |  ------------------
  ------------------
  680|  19.4k|#undef EXCHANGE
  681|  19.4k|    }
  682|       |
  683|  48.2k|    rt->rf = rf;
  684|  48.2k|    rt->tile_row.start = tile_row_start4;
  685|  48.2k|    rt->tile_row.end = imin(tile_row_end4, rf->ih4);
  686|  48.2k|    rt->tile_col.start = tile_col_start4;
  687|  48.2k|    rt->tile_col.end = imin(tile_col_end4, rf->iw4);
  688|  48.2k|}
dav1d_refmvs_init_frame:
  807|  8.37k|{
  808|  8.37k|    const int rp_stride = ((frm_hdr->width[0] + 127) & ~127) >> 3;
  809|  8.37k|    const int n_tile_rows = n_tile_threads > 1 ? frm_hdr->tiling.rows : 1;
  ------------------
  |  Branch (809:29): [True: 7.85k, False: 520]
  ------------------
  810|  8.37k|    const int n_blocks = rp_stride * n_tile_rows;
  811|       |
  812|  8.37k|    rf->sbsz = 16 << seq_hdr->sb128;
  813|  8.37k|    rf->frm_hdr = frm_hdr;
  814|  8.37k|    rf->iw8 = (frm_hdr->width[0] + 7) >> 3;
  815|  8.37k|    rf->ih8 = (frm_hdr->height + 7) >> 3;
  816|  8.37k|    rf->iw4 = rf->iw8 << 1;
  817|  8.37k|    rf->ih4 = rf->ih8 << 1;
  818|  8.37k|    rf->rp = rp;
  819|  8.37k|    rf->rp_stride = rp_stride;
  820|  8.37k|    rf->n_tile_threads = n_tile_threads;
  821|  8.37k|    rf->n_frame_threads = n_frame_threads;
  822|       |
  823|  8.37k|    if (n_blocks != rf->n_blocks) {
  ------------------
  |  Branch (823:9): [True: 6.45k, False: 1.92k]
  ------------------
  824|  6.45k|        const size_t r_sz = sizeof(*rf->r) * 35 * 2 * n_blocks * (1 + (n_frame_threads > 1));
  825|  6.45k|        const size_t rp_proj_sz = sizeof(*rf->rp_proj) * 16 * n_blocks;
  826|       |        /* Note that sizeof(*rf->r) == 12, but it's accessed using 16-byte unaligned
  827|       |         * loads in save_tmvs() asm which can overread 4 bytes into rp_proj. */
  828|  6.45k|        dav1d_free_aligned(rf->r);
  ------------------
  |  |  136|  6.45k|#define dav1d_free_aligned(ptr) dav1d_free_aligned_internal(ptr)
  ------------------
  829|  6.45k|        rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, r_sz + rp_proj_sz, 64);
  ------------------
  |  |  134|  6.45k|#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned_internal(sz, align)
  ------------------
  830|  6.45k|        if (!rf->r) {
  ------------------
  |  Branch (830:13): [True: 0, False: 6.45k]
  ------------------
  831|      0|            rf->n_blocks = 0;
  832|      0|            return DAV1D_ERR(ENOMEM);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  833|      0|        }
  834|       |
  835|  6.45k|        rf->rp_proj = (refmvs_temporal_block*)((uintptr_t)rf->r + r_sz);
  836|  6.45k|        rf->n_blocks = n_blocks;
  837|  6.45k|    }
  838|       |
  839|  8.37k|    const int poc = frm_hdr->frame_offset;
  840|  66.9k|    for (int i = 0; i < 7; i++) {
  ------------------
  |  Branch (840:21): [True: 58.6k, False: 8.37k]
  ------------------
  841|  58.6k|        const int poc_diff = get_poc_diff(seq_hdr->order_hint_n_bits,
  842|  58.6k|                                          ref_poc[i], poc);
  843|  58.6k|        rf->sign_bias[i] = poc_diff > 0;
  844|  58.6k|        rf->mfmv_sign[i] = poc_diff < 0;
  845|  58.6k|        rf->pocdiff[i] = iclip(get_poc_diff(seq_hdr->order_hint_n_bits,
  846|  58.6k|                                            poc, ref_poc[i]), -31, 31);
  847|  58.6k|    }
  848|       |
  849|       |    // temporal MV setup
  850|  8.37k|    rf->n_mfmvs = 0;
  851|  8.37k|    rf->rp_ref = rp_ref;
  852|  8.37k|    if (frm_hdr->use_ref_frame_mvs && seq_hdr->order_hint_n_bits) {
  ------------------
  |  Branch (852:9): [True: 3.28k, False: 5.09k]
  |  Branch (852:39): [True: 3.28k, False: 0]
  ------------------
  853|  3.28k|        int total = 2;
  854|  3.28k|        if (rp_ref[0] && ref_ref_poc[0][6] != ref_poc[3] /* alt-of-last != gold */) {
  ------------------
  |  Branch (854:13): [True: 337, False: 2.94k]
  |  Branch (854:26): [True: 298, False: 39]
  ------------------
  855|    298|            rf->mfmv_ref[rf->n_mfmvs++] = 0; // last
  856|    298|            total = 3;
  857|    298|        }
  858|  3.28k|        if (rp_ref[4] && get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[4],
  ------------------
  |  Branch (858:13): [True: 405, False: 2.87k]
  |  Branch (858:26): [True: 308, False: 97]
  ------------------
  859|    405|                                      frm_hdr->frame_offset) > 0)
  860|    308|        {
  861|    308|            rf->mfmv_ref[rf->n_mfmvs++] = 4; // bwd
  862|    308|        }
  863|  3.28k|        if (rp_ref[5] && get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[5],
  ------------------
  |  Branch (863:13): [True: 95, False: 3.18k]
  |  Branch (863:26): [True: 20, False: 75]
  ------------------
  864|     95|                                      frm_hdr->frame_offset) > 0)
  865|     20|        {
  866|     20|            rf->mfmv_ref[rf->n_mfmvs++] = 5; // altref2
  867|     20|        }
  868|  3.28k|        if (rf->n_mfmvs < total && rp_ref[6] &&
  ------------------
  |  Branch (868:13): [True: 3.26k, False: 12]
  |  Branch (868:36): [True: 850, False: 2.41k]
  ------------------
  869|    850|            get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[6],
  ------------------
  |  Branch (869:13): [True: 690, False: 160]
  ------------------
  870|    850|                         frm_hdr->frame_offset) > 0)
  871|    690|        {
  872|    690|            rf->mfmv_ref[rf->n_mfmvs++] = 6; // altref
  873|    690|        }
  874|  3.28k|        if (rf->n_mfmvs < total && rp_ref[1])
  ------------------
  |  Branch (874:13): [True: 3.02k, False: 252]
  |  Branch (874:36): [True: 282, False: 2.74k]
  ------------------
  875|    282|            rf->mfmv_ref[rf->n_mfmvs++] = 1; // last2
  876|       |
  877|  4.87k|        for (int n = 0; n < rf->n_mfmvs; n++) {
  ------------------
  |  Branch (877:25): [True: 1.59k, False: 3.28k]
  ------------------
  878|  1.59k|            const int rpoc = ref_poc[rf->mfmv_ref[n]];
  879|  1.59k|            const int diff1 = get_poc_diff(seq_hdr->order_hint_n_bits,
  880|  1.59k|                                           rpoc, frm_hdr->frame_offset);
  881|  1.59k|            if (abs(diff1) > 31) {
  ------------------
  |  Branch (881:17): [True: 14, False: 1.58k]
  ------------------
  882|     14|                rf->mfmv_ref2cur[n] = INVALID_REF2CUR;
  ------------------
  |  |   41|     14|#define INVALID_REF2CUR (-32)
  ------------------
  883|  1.58k|            } else {
  884|  1.58k|                rf->mfmv_ref2cur[n] = rf->mfmv_ref[n] < 4 ? -diff1 : diff1;
  ------------------
  |  Branch (884:39): [True: 576, False: 1.00k]
  ------------------
  885|  12.6k|                for (int m = 0; m < 7; m++) {
  ------------------
  |  Branch (885:33): [True: 11.0k, False: 1.58k]
  ------------------
  886|  11.0k|                    const int rrpoc = ref_ref_poc[rf->mfmv_ref[n]][m];
  887|  11.0k|                    const int diff2 = get_poc_diff(seq_hdr->order_hint_n_bits,
  888|  11.0k|                                                   rpoc, rrpoc);
  889|       |                    // unsigned comparison also catches the < 0 case
  890|  11.0k|                    rf->mfmv_ref2ref[n][m] = (unsigned) diff2 > 31U ? 0 : diff2;
  ------------------
  |  Branch (890:46): [True: 992, False: 10.0k]
  ------------------
  891|  11.0k|                }
  892|  1.58k|            }
  893|  1.59k|        }
  894|  3.28k|    }
  895|  8.37k|    rf->use_ref_frame_mvs = rf->n_mfmvs > 0;
  896|       |
  897|  8.37k|    return 0;
  898|  8.37k|}
dav1d_refmvs_dsp_init:
  921|  17.2k|{
  922|  17.2k|    c->load_tmvs = load_tmvs_c;
  923|  17.2k|    c->save_tmvs = save_tmvs_c;
  924|  17.2k|    c->splat_mv = splat_mv_c;
  925|       |
  926|  17.2k|#if HAVE_ASM
  927|       |#if ARCH_AARCH64 || ARCH_ARM
  928|       |    refmvs_dsp_init_arm(c);
  929|       |#elif ARCH_LOONGARCH64
  930|       |    refmvs_dsp_init_loongarch(c);
  931|       |#elif ARCH_X86
  932|       |    refmvs_dsp_init_x86(c);
  933|  17.2k|#endif
  934|  17.2k|#endif
  935|  17.2k|}
refmvs.c:scan_row:
  102|   665k|{
  103|   665k|    const refmvs_block *cand_b = b;
  104|   665k|    const enum BlockSize first_cand_bs = cand_b->bs;
  105|   665k|    const uint8_t *const first_cand_b_dim = dav1d_block_dimensions[first_cand_bs];
  106|   665k|    int cand_bw4 = first_cand_b_dim[0];
  107|   665k|    int len = imax(step, imin(bw4, cand_bw4));
  108|       |
  109|   665k|    if (bw4 <= cand_bw4) {
  ------------------
  |  Branch (109:9): [True: 623k, False: 42.2k]
  ------------------
  110|       |        // FIXME weight can be higher for odd blocks (bx4 & 1), but then the
  111|       |        // position of the first block has to be odd already, i.e. not just
  112|       |        // for row_offset=-3/-5
  113|       |        // FIXME why can this not be cand_bw4?
  114|   623k|        const int weight = bw4 == 1 ? 2 :
  ------------------
  |  Branch (114:28): [True: 485k, False: 138k]
  ------------------
  115|   623k|                           imax(2, imin(2 * max_rows, first_cand_b_dim[1]));
  116|   623k|        add_spatial_candidate(mvstack, cnt, len * weight, cand_b, ref, gmv,
  117|   623k|                              have_newmv_match, have_refmv_match);
  118|   623k|        return weight >> 1;
  119|   623k|    }
  120|       |
  121|  58.5k|    for (int x = 0;;) {
  122|       |        // FIXME if we overhang above, we could fill a bitmask so we don't have
  123|       |        // to repeat the add_spatial_candidate() for the next row, but just increase
  124|       |        // the weight here
  125|  58.5k|        add_spatial_candidate(mvstack, cnt, len * 2, cand_b, ref, gmv,
  126|  58.5k|                              have_newmv_match, have_refmv_match);
  127|  58.5k|        x += len;
  128|  58.5k|        if (x >= w4) return 1;
  ------------------
  |  Branch (128:13): [True: 33.0k, False: 25.4k]
  ------------------
  129|  25.4k|        cand_b = &b[x];
  130|  25.4k|        cand_bw4 = dav1d_block_dimensions[cand_b->bs][0];
  131|  25.4k|        assert(cand_bw4 < bw4);
  ------------------
  |  |  140|  25.4k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 25.4k]
  |  |  |  Branch (140:68): [Folded, False: 25.4k]
  |  |  ------------------
  ------------------
  132|  25.4k|        len = imax(step, cand_bw4);
  133|  25.4k|    }
  134|  42.2k|}
refmvs.c:scan_col:
  141|  2.02M|{
  142|  2.02M|    const refmvs_block *cand_b = &b[0][bx4];
  143|  2.02M|    const enum BlockSize first_cand_bs = cand_b->bs;
  144|  2.02M|    const uint8_t *const first_cand_b_dim = dav1d_block_dimensions[first_cand_bs];
  145|  2.02M|    int cand_bh4 = first_cand_b_dim[1];
  146|  2.02M|    int len = imax(step, imin(bh4, cand_bh4));
  147|       |
  148|  2.02M|    if (bh4 <= cand_bh4) {
  ------------------
  |  Branch (148:9): [True: 1.95M, False: 76.5k]
  ------------------
  149|       |        // FIXME weight can be higher for odd blocks (by4 & 1), but then the
  150|       |        // position of the first block has to be odd already, i.e. not just
  151|       |        // for col_offset=-3/-5
  152|       |        // FIXME why can this not be cand_bh4?
  153|  1.95M|        const int weight = bh4 == 1 ? 2 :
  ------------------
  |  Branch (153:28): [True: 1.72M, False: 229k]
  ------------------
  154|  1.95M|                           imax(2, imin(2 * max_cols, first_cand_b_dim[0]));
  155|  1.95M|        add_spatial_candidate(mvstack, cnt, len * weight, cand_b, ref, gmv,
  156|  1.95M|                            have_newmv_match, have_refmv_match);
  157|  1.95M|        return weight >> 1;
  158|  1.95M|    }
  159|       |
  160|  82.6k|    for (int y = 0;;) {
  161|       |        // FIXME if we overhang above, we could fill a bitmask so we don't have
  162|       |        // to repeat the add_spatial_candidate() for the next row, but just increase
  163|       |        // the weight here
  164|  82.6k|        add_spatial_candidate(mvstack, cnt, len * 2, cand_b, ref, gmv,
  165|  82.6k|                              have_newmv_match, have_refmv_match);
  166|  82.6k|        y += len;
  167|  82.6k|        if (y >= h4) return 1;
  ------------------
  |  Branch (167:13): [True: 42.6k, False: 40.0k]
  ------------------
  168|  40.0k|        cand_b = &b[y][bx4];
  169|  40.0k|        cand_bh4 = dav1d_block_dimensions[cand_b->bs][1];
  170|  40.0k|        assert(cand_bh4 < bh4);
  ------------------
  |  |  140|  40.0k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 40.0k]
  |  |  |  Branch (140:68): [Folded, False: 40.0k]
  |  |  ------------------
  ------------------
  171|  40.0k|        len = imax(step, cand_bh4);
  172|  40.0k|    }
  173|  76.5k|}
refmvs.c:add_spatial_candidate:
   46|  3.38M|{
   47|  3.38M|    if (b->mv.mv[0].n == INVALID_MV) return; // intra block, no intrabc
  ------------------
  |  |   40|  3.38M|#define INVALID_MV 0x80008000
  ------------------
  |  Branch (47:9): [True: 153k, False: 3.23M]
  ------------------
   48|       |
   49|  3.23M|    if (ref.ref[1] == -1) {
  ------------------
  |  Branch (49:9): [True: 3.16M, False: 66.2k]
  ------------------
   50|  3.36M|        for (int n = 0; n < 2; n++) {
  ------------------
  |  Branch (50:25): [True: 3.27M, False: 91.6k]
  ------------------
   51|  3.27M|            if (b->ref.ref[n] == ref.ref[0]) {
  ------------------
  |  Branch (51:17): [True: 3.07M, False: 197k]
  ------------------
   52|  3.07M|                const mv cand_mv = ((b->mf & 1) && gmv[0].n != INVALID_MV) ?
  ------------------
  |  |   40|  49.4k|#define INVALID_MV 0x80008000
  ------------------
  |  Branch (52:37): [True: 49.4k, False: 3.02M]
  |  Branch (52:52): [True: 9.18k, False: 40.2k]
  ------------------
   53|  3.06M|                                   gmv[0] : b->mv.mv[n];
   54|       |
   55|  3.07M|                *have_refmv_match = 1;
   56|  3.07M|                *have_newmv_match |= b->mf >> 1;
   57|       |
   58|  3.07M|                const int last = *cnt;
   59|  7.02M|                for (int m = 0; m < last; m++)
  ------------------
  |  Branch (59:33): [True: 4.13M, False: 2.89M]
  ------------------
   60|  4.13M|                    if (mvstack[m].mv.mv[0].n == cand_mv.n) {
  ------------------
  |  Branch (60:25): [True: 180k, False: 3.95M]
  ------------------
   61|   180k|                        mvstack[m].weight += weight;
   62|   180k|                        return;
   63|   180k|                    }
   64|       |
   65|  2.89M|                if (last < 8) {
  ------------------
  |  Branch (65:21): [True: 2.87M, False: 22.8k]
  ------------------
   66|  2.87M|                    mvstack[last].mv.mv[0] = cand_mv;
   67|  2.87M|                    mvstack[last].weight = weight;
   68|  2.87M|                    *cnt = last + 1;
   69|  2.87M|                }
   70|  2.89M|                return;
   71|  3.07M|            }
   72|  3.27M|        }
   73|  3.16M|    } else if (b->ref.pair == ref.pair) {
  ------------------
  |  Branch (73:16): [True: 23.0k, False: 43.2k]
  ------------------
   74|  23.0k|        const refmvs_mvpair cand_mv = { .mv = {
   75|  23.0k|            [0] = ((b->mf & 1) && gmv[0].n != INVALID_MV) ? gmv[0] : b->mv.mv[0],
  ------------------
  |  |   40|  1.89k|#define INVALID_MV 0x80008000
  ------------------
  |  Branch (75:20): [True: 1.89k, False: 21.1k]
  |  Branch (75:35): [True: 464, False: 1.43k]
  ------------------
   76|  23.0k|            [1] = ((b->mf & 1) && gmv[1].n != INVALID_MV) ? gmv[1] : b->mv.mv[1],
  ------------------
  |  |   40|  1.89k|#define INVALID_MV 0x80008000
  ------------------
  |  Branch (76:20): [True: 1.89k, False: 21.1k]
  |  Branch (76:35): [True: 435, False: 1.46k]
  ------------------
   77|  23.0k|        }};
   78|       |
   79|  23.0k|        *have_refmv_match = 1;
   80|  23.0k|        *have_newmv_match |= b->mf >> 1;
   81|       |
   82|  23.0k|        const int last = *cnt;
   83|  34.6k|        for (int n = 0; n < last; n++)
  ------------------
  |  Branch (83:25): [True: 19.4k, False: 15.2k]
  ------------------
   84|  19.4k|            if (mvstack[n].mv.n == cand_mv.n) {
  ------------------
  |  Branch (84:17): [True: 7.79k, False: 11.6k]
  ------------------
   85|  7.79k|                mvstack[n].weight += weight;
   86|  7.79k|                return;
   87|  7.79k|            }
   88|       |
   89|  15.2k|        if (last < 8) {
  ------------------
  |  Branch (89:13): [True: 15.2k, False: 18.4E]
  ------------------
   90|  15.2k|            mvstack[last].mv = cand_mv;
   91|  15.2k|            mvstack[last].weight = weight;
   92|  15.2k|            *cnt = last + 1;
   93|  15.2k|        }
   94|  15.2k|    }
   95|  3.23M|}
refmvs.c:add_temporal_candidate:
  198|  56.8k|{
  199|  56.8k|    if (rb->mv.n == INVALID_MV) return;
  ------------------
  |  |   40|  56.8k|#define INVALID_MV 0x80008000
  ------------------
  |  Branch (199:9): [True: 46.8k, False: 9.96k]
  ------------------
  200|       |
  201|  9.96k|    union mv mv = mv_projection(rb->mv, rf->pocdiff[ref.ref[0] - 1], rb->ref);
  202|  9.96k|    fix_mv_precision(rf->frm_hdr, &mv);
  203|       |
  204|  9.96k|    const int last = *cnt;
  205|  9.96k|    if (ref.ref[1] == -1) {
  ------------------
  |  Branch (205:9): [True: 7.92k, False: 2.03k]
  ------------------
  206|  7.92k|        if (globalmv_ctx)
  ------------------
  |  Branch (206:13): [True: 1.60k, False: 6.32k]
  ------------------
  207|  1.60k|            *globalmv_ctx = (abs(mv.x - gmv[0].x) | abs(mv.y - gmv[0].y)) >= 16;
  208|       |
  209|  12.0k|        for (int n = 0; n < last; n++)
  ------------------
  |  Branch (209:25): [True: 10.8k, False: 1.12k]
  ------------------
  210|  10.8k|            if (mvstack[n].mv.mv[0].n == mv.n) {
  ------------------
  |  Branch (210:17): [True: 6.79k, False: 4.09k]
  ------------------
  211|  6.79k|                mvstack[n].weight += 2;
  212|  6.79k|                return;
  213|  6.79k|            }
  214|  1.12k|        if (last < 8) {
  ------------------
  |  Branch (214:13): [True: 1.12k, False: 0]
  ------------------
  215|  1.12k|            mvstack[last].mv.mv[0] = mv;
  216|  1.12k|            mvstack[last].weight = 2;
  217|  1.12k|            *cnt = last + 1;
  218|  1.12k|        }
  219|  2.03k|    } else {
  220|  2.03k|        refmvs_mvpair mvp = { .mv = {
  221|  2.03k|            [0] = mv,
  222|  2.03k|            [1] = mv_projection(rb->mv, rf->pocdiff[ref.ref[1] - 1], rb->ref),
  223|  2.03k|        }};
  224|  2.03k|        fix_mv_precision(rf->frm_hdr, &mvp.mv[1]);
  225|       |
  226|  2.86k|        for (int n = 0; n < last; n++)
  ------------------
  |  Branch (226:25): [True: 2.57k, False: 292]
  ------------------
  227|  2.57k|            if (mvstack[n].mv.n == mvp.n) {
  ------------------
  |  Branch (227:17): [True: 1.74k, False: 823]
  ------------------
  228|  1.74k|                mvstack[n].weight += 2;
  229|  1.74k|                return;
  230|  1.74k|            }
  231|    292|        if (last < 8) {
  ------------------
  |  Branch (231:13): [True: 292, False: 0]
  ------------------
  232|    292|            mvstack[last].mv = mvp;
  233|    292|            mvstack[last].weight = 2;
  234|    292|            *cnt = last + 1;
  235|    292|        }
  236|    292|    }
  237|  9.96k|}
refmvs.c:mv_projection:
  175|  12.0k|static inline union mv mv_projection(const union mv mv, const int num, const int den) {
  176|  12.0k|    static const uint16_t div_mult[32] = {
  177|  12.0k|           0, 16384, 8192, 5461, 4096, 3276, 2730, 2340,
  178|  12.0k|        2048,  1820, 1638, 1489, 1365, 1260, 1170, 1092,
  179|  12.0k|        1024,   963,  910,  862,  819,  780,  744,  712,
  180|  12.0k|         682,   655,  630,  606,  585,  564,  546,  528
  181|  12.0k|    };
  182|  12.0k|    assert(den > 0 && den < 32);
  ------------------
  |  |  140|  24.0k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 12.0k, False: 0]
  |  |  |  Branch (140:30): [True: 12.0k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 12.0k]
  |  |  ------------------
  ------------------
  183|  12.0k|    assert(num > -32 && num < 32);
  ------------------
  |  |  140|  24.0k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 12.0k, False: 0]
  |  |  |  Branch (140:30): [True: 12.0k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 12.0k]
  |  |  ------------------
  ------------------
  184|  12.0k|    const int frac = num * div_mult[den];
  185|  12.0k|    const int y = mv.y * frac, x = mv.x * frac;
  186|       |    // Round and clip according to AV1 spec section 7.9.3
  187|  12.0k|    return (union mv) { // 0x3fff == (1 << 14) - 1
  188|  12.0k|        .y = iclip((y + 8192 + (y >> 31)) >> 14, -0x3fff, 0x3fff),
  189|  12.0k|        .x = iclip((x + 8192 + (x >> 31)) >> 14, -0x3fff, 0x3fff)
  190|  12.0k|    };
  191|  12.0k|}
refmvs.c:add_compound_extended_candidate:
  245|  19.6k|{
  246|  19.6k|    refmvs_candidate *const diff = &same[2];
  247|  19.6k|    int *const diff_count = &same_count[2];
  248|       |
  249|  50.0k|    for (int n = 0; n < 2; n++) {
  ------------------
  |  Branch (249:21): [True: 38.7k, False: 11.2k]
  ------------------
  250|  38.7k|        const int cand_ref = cand_b->ref.ref[n];
  251|       |
  252|  38.7k|        if (cand_ref <= 0) break;
  ------------------
  |  Branch (252:13): [True: 8.32k, False: 30.3k]
  ------------------
  253|       |
  254|  30.3k|        mv cand_mv = cand_b->mv.mv[n];
  255|  30.3k|        if (cand_ref == ref.ref[0]) {
  ------------------
  |  Branch (255:13): [True: 10.6k, False: 19.7k]
  ------------------
  256|  10.6k|            if (same_count[0] < 2)
  ------------------
  |  Branch (256:17): [True: 10.3k, False: 288]
  ------------------
  257|  10.3k|                same[same_count[0]++].mv.mv[0] = cand_mv;
  258|  10.6k|            if (diff_count[1] < 2) {
  ------------------
  |  Branch (258:17): [True: 9.33k, False: 1.30k]
  ------------------
  259|  9.33k|                if (sign1 ^ sign_bias[cand_ref - 1]) {
  ------------------
  |  Branch (259:21): [True: 700, False: 8.63k]
  ------------------
  260|    700|                    cand_mv.y = -cand_mv.y;
  261|    700|                    cand_mv.x = -cand_mv.x;
  262|    700|                }
  263|  9.33k|                diff[diff_count[1]++].mv.mv[1] = cand_mv;
  264|  9.33k|            }
  265|  19.7k|        } else if (cand_ref == ref.ref[1]) {
  ------------------
  |  Branch (265:20): [True: 10.7k, False: 8.97k]
  ------------------
  266|  10.7k|            if (same_count[1] < 2)
  ------------------
  |  Branch (266:17): [True: 10.4k, False: 298]
  ------------------
  267|  10.4k|                same[same_count[1]++].mv.mv[1] = cand_mv;
  268|  10.7k|            if (diff_count[0] < 2) {
  ------------------
  |  Branch (268:17): [True: 8.95k, False: 1.81k]
  ------------------
  269|  8.95k|                if (sign0 ^ sign_bias[cand_ref - 1]) {
  ------------------
  |  Branch (269:21): [True: 800, False: 8.15k]
  ------------------
  270|    800|                    cand_mv.y = -cand_mv.y;
  271|    800|                    cand_mv.x = -cand_mv.x;
  272|    800|                }
  273|  8.95k|                diff[diff_count[0]++].mv.mv[0] = cand_mv;
  274|  8.95k|            }
  275|  10.7k|        } else {
  276|  8.97k|            mv i_cand_mv = (union mv) {
  277|  8.97k|                .x = -cand_mv.x,
  278|  8.97k|                .y = -cand_mv.y
  279|  8.97k|            };
  280|       |
  281|  8.97k|            if (diff_count[0] < 2) {
  ------------------
  |  Branch (281:17): [True: 7.23k, False: 1.74k]
  ------------------
  282|  7.23k|                diff[diff_count[0]++].mv.mv[0] =
  283|  7.23k|                    sign0 ^ sign_bias[cand_ref - 1] ?
  ------------------
  |  Branch (283:21): [True: 356, False: 6.87k]
  ------------------
  284|  6.87k|                    i_cand_mv : cand_mv;
  285|  7.23k|            }
  286|       |
  287|  8.97k|            if (diff_count[1] < 2) {
  ------------------
  |  Branch (287:17): [True: 6.73k, False: 2.24k]
  ------------------
  288|  6.73k|                diff[diff_count[1]++].mv.mv[1] =
  289|  6.73k|                    sign1 ^ sign_bias[cand_ref - 1] ?
  ------------------
  |  Branch (289:21): [True: 566, False: 6.16k]
  ------------------
  290|  6.16k|                    i_cand_mv : cand_mv;
  291|  6.73k|            }
  292|  8.97k|        }
  293|  30.3k|    }
  294|  19.6k|}
refmvs.c:add_single_extended_candidate:
  299|  64.2k|{
  300|   128k|    for (int n = 0; n < 2; n++) {
  ------------------
  |  Branch (300:21): [True: 125k, False: 3.41k]
  ------------------
  301|   125k|        const int cand_ref = cand_b->ref.ref[n];
  302|       |
  303|   125k|        if (cand_ref <= 0) break;
  ------------------
  |  Branch (303:13): [True: 60.8k, False: 64.4k]
  ------------------
  304|       |        // we need to continue even if cand_ref == ref.ref[0], since
  305|       |        // the candidate could have been added as a globalmv variant,
  306|       |        // which changes the value
  307|       |        // FIXME if scan_{row,col}() returned a mask for the nearest
  308|       |        // edge, we could skip the appropriate ones here
  309|       |
  310|  64.4k|        mv cand_mv = cand_b->mv.mv[n];
  311|  64.4k|        if (sign ^ sign_bias[cand_ref - 1]) {
  ------------------
  |  Branch (311:13): [True: 1.13k, False: 63.3k]
  ------------------
  312|  1.13k|            cand_mv.y = -cand_mv.y;
  313|  1.13k|            cand_mv.x = -cand_mv.x;
  314|  1.13k|        }
  315|       |
  316|  64.4k|        int m;
  317|  64.4k|        const int last = *cnt;
  318|  75.0k|        for (m = 0; m < last; m++)
  ------------------
  |  Branch (318:21): [True: 58.7k, False: 16.2k]
  ------------------
  319|  58.7k|            if (cand_mv.n == mvstack[m].mv.mv[0].n)
  ------------------
  |  Branch (319:17): [True: 48.2k, False: 10.5k]
  ------------------
  320|  48.2k|                break;
  321|  64.4k|        if (m == last) {
  ------------------
  |  Branch (321:13): [True: 16.2k, False: 48.2k]
  ------------------
  322|  16.2k|            mvstack[m].mv.mv[0] = cand_mv;
  323|  16.2k|            mvstack[m].weight = 2; // "minimal"
  324|  16.2k|            *cnt = last + 1;
  325|  16.2k|        }
  326|  64.4k|    }
  327|  64.2k|}

decode.c:dav1d_refmvs_save_tmvs:
  145|  7.74k|{
  146|  7.74k|    const refmvs_frame *const rf = rt->rf;
  147|       |
  148|  7.74k|    assert(row_start8 >= 0);
  ------------------
  |  |  140|  7.74k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 7.74k]
  |  |  |  Branch (140:68): [Folded, False: 7.74k]
  |  |  ------------------
  ------------------
  149|  7.74k|    assert((unsigned) (row_end8 - row_start8) <= 16U);
  ------------------
  |  |  140|  7.74k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 7.74k]
  |  |  |  Branch (140:68): [Folded, False: 7.74k]
  |  |  ------------------
  ------------------
  150|  7.74k|    row_end8 = imin(row_end8, rf->ih8);
  151|  7.74k|    col_end8 = imin(col_end8, rf->iw8);
  152|       |
  153|  7.74k|    const ptrdiff_t stride = rf->rp_stride;
  154|  7.74k|    const uint8_t *const ref_sign = rf->mfmv_sign;
  155|  7.74k|    refmvs_temporal_block *rp = &rf->rp[row_start8 * stride];
  156|       |
  157|  7.74k|    dsp->save_tmvs(rp, stride, rt->r + 6, ref_sign,
  158|  7.74k|                   col_end8, row_end8, col_start8, row_start8);
  159|  7.74k|}

dav1d_init_last_nonzero_col_from_eob_tables:
  350|  2.94k|COLD void dav1d_init_last_nonzero_col_from_eob_tables(void) {
  351|       |    static pthread_once_t initted = PTHREAD_ONCE_INIT;
  352|  2.94k|    pthread_once(&initted, init_internal);
  353|  2.94k|}
scan.c:init_internal:
  333|      1|static COLD void init_internal(void) {
  334|      1|    init_tbl(last_nonzero_col_from_eob_4x4,   scan_4x4,    4,  4);
  335|      1|    init_tbl(last_nonzero_col_from_eob_8x8,   scan_8x8,    8,  8);
  336|      1|    init_tbl(last_nonzero_col_from_eob_16x16, scan_16x16, 16, 16);
  337|      1|    init_tbl(last_nonzero_col_from_eob_32x32, scan_32x32, 32, 32);
  338|      1|    init_tbl(last_nonzero_col_from_eob_4x8,   scan_4x8,    4,  8);
  339|      1|    init_tbl(last_nonzero_col_from_eob_8x4,   scan_8x4,    8,  4);
  340|      1|    init_tbl(last_nonzero_col_from_eob_8x16,  scan_8x16,   8, 16);
  341|      1|    init_tbl(last_nonzero_col_from_eob_16x8,  scan_16x8,  16,  8);
  342|      1|    init_tbl(last_nonzero_col_from_eob_16x32, scan_16x32, 16, 32);
  343|      1|    init_tbl(last_nonzero_col_from_eob_32x16, scan_32x16, 32, 16);
  344|      1|    init_tbl(last_nonzero_col_from_eob_4x16,  scan_4x16,   4, 16);
  345|      1|    init_tbl(last_nonzero_col_from_eob_16x4,  scan_16x4,  16,  4);
  346|      1|    init_tbl(last_nonzero_col_from_eob_8x32,  scan_8x32,   8, 32);
  347|      1|    init_tbl(last_nonzero_col_from_eob_32x8,  scan_32x8,  32,  8);
  348|      1|}
scan.c:init_tbl:
  321|     14|{
  322|     14|    int max_col = 0;
  323|    218|    for (int y = 0, n = 0; y < h; y++) {
  ------------------
  |  Branch (323:28): [True: 204, False: 14]
  ------------------
  324|  3.54k|        for (int x = 0; x < w; x++, n++) {
  ------------------
  |  Branch (324:25): [True: 3.34k, False: 204]
  ------------------
  325|  3.34k|            const int rc = scan[n];
  326|  3.34k|            const int rcx = rc & (h - 1);
  327|  3.34k|            max_col = imax(max_col, rcx);
  328|  3.34k|            last_nonzero_col_from_eob[n] = max_col;
  329|  3.34k|        }
  330|    204|    }
  331|     14|}

thread_task.c:dav1d_set_thread_name:
  152|   534k|static inline void dav1d_set_thread_name(const char *const name) {
  153|       |    prctl(PR_SET_NAME, name);
  154|   534k|}

dav1d_task_create_tile_sbrow:
  270|  20.0k|{
  271|  20.0k|    Dav1dTask *tasks = f->task_thread.tile_tasks[0];
  272|  20.0k|    const int uses_2pass = f->c->n_fc > 1;
  273|  20.0k|    const int num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
  274|  20.0k|    if (pass < 2) {
  ------------------
  |  Branch (274:9): [True: 20.0k, False: 0]
  ------------------
  275|  20.0k|        int alloc_num_tasks = num_tasks * (1 + uses_2pass);
  276|  20.0k|        if (alloc_num_tasks > f->task_thread.num_tile_tasks) {
  ------------------
  |  Branch (276:13): [True: 15.1k, False: 4.90k]
  ------------------
  277|  15.1k|            const size_t size = sizeof(Dav1dTask) * alloc_num_tasks;
  278|  15.1k|            tasks = dav1d_realloc(ALLOC_COMMON_CTX, f->task_thread.tile_tasks[0], size);
  ------------------
  |  |  133|  15.1k|#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz)
  ------------------
  279|  15.1k|            if (!tasks) return -1;
  ------------------
  |  Branch (279:17): [True: 0, False: 15.1k]
  ------------------
  280|  15.1k|            memset(tasks, 0, size);
  281|  15.1k|            f->task_thread.tile_tasks[0] = tasks;
  282|  15.1k|            f->task_thread.num_tile_tasks = alloc_num_tasks;
  283|  15.1k|        }
  284|  20.0k|        f->task_thread.tile_tasks[1] = tasks + num_tasks;
  285|  20.0k|    }
  286|  20.0k|    tasks += num_tasks * (pass & 1);
  287|       |
  288|  20.0k|    Dav1dTask *pf_t;
  289|  20.0k|    if (create_filter_sbrow(f, pass, &pf_t))
  ------------------
  |  Branch (289:9): [True: 0, False: 20.0k]
  ------------------
  290|      0|        return -1;
  291|       |
  292|  20.0k|    Dav1dTask *prev_t = NULL;
  293|  50.2k|    for (int tile_idx = 0; tile_idx < num_tasks; tile_idx++) {
  ------------------
  |  Branch (293:28): [True: 30.2k, False: 20.0k]
  ------------------
  294|  30.2k|        Dav1dTileState *const ts = &f->ts[tile_idx];
  295|  30.2k|        Dav1dTask *t = &tasks[tile_idx];
  296|  30.2k|        t->sby = ts->tiling.row_start >> f->sb_shift;
  297|  30.2k|        if (pf_t && t->sby) {
  ------------------
  |  Branch (297:13): [True: 26.5k, False: 3.67k]
  |  Branch (297:21): [True: 2.39k, False: 24.1k]
  ------------------
  298|  2.39k|            prev_t->next = pf_t;
  299|  2.39k|            prev_t = pf_t;
  300|  2.39k|            pf_t = NULL;
  301|  2.39k|        }
  302|  30.2k|        t->recon_progress = 0;
  303|  30.2k|        t->deblock_progress = 0;
  304|  30.2k|        t->deps_skip = 0;
  305|  30.2k|        t->type = pass != 1 ? DAV1D_TASK_TYPE_TILE_RECONSTRUCTION :
  ------------------
  |  Branch (305:19): [True: 30.2k, False: 0]
  ------------------
  306|  30.2k|                              DAV1D_TASK_TYPE_TILE_ENTROPY;
  307|  30.2k|        t->frame_idx = (int)(f - f->c->fc);
  308|  30.2k|        if (prev_t) prev_t->next = t;
  ------------------
  |  Branch (308:13): [True: 10.1k, False: 20.0k]
  ------------------
  309|  30.2k|        prev_t = t;
  310|  30.2k|    }
  311|  20.0k|    if (pf_t) {
  ------------------
  |  Branch (311:9): [True: 17.6k, False: 2.39k]
  ------------------
  312|  17.6k|        prev_t->next = pf_t;
  313|  17.6k|        prev_t = pf_t;
  314|  17.6k|    }
  315|  20.0k|    prev_t->next = NULL;
  316|       |
  317|  20.0k|    atomic_store(&f->task_thread.done[pass & 1], 0);
  318|       |
  319|       |    // XXX in theory this could be done locklessly, at this point they are no
  320|       |    // tasks in the frameQ, so no other runner should be using this lock, but
  321|       |    // we must add both passes at once
  322|  20.0k|    pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
  323|  20.0k|    assert(f->task_thread.pending_tasks.head == NULL || pass == 2);
  ------------------
  |  |  140|  20.0k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 20.0k, False: 0]
  |  |  |  Branch (140:30): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 20.0k]
  |  |  ------------------
  ------------------
  324|  20.0k|    if (!f->task_thread.pending_tasks.head)
  ------------------
  |  Branch (324:9): [True: 20.0k, False: 0]
  ------------------
  325|  20.0k|        f->task_thread.pending_tasks.head = &tasks[0];
  326|      0|    else
  327|      0|        f->task_thread.pending_tasks.tail->next = &tasks[0];
  328|  20.0k|    f->task_thread.pending_tasks.tail = prev_t;
  329|  20.0k|    atomic_store(&f->task_thread.pending_tasks.merge, 1);
  330|  20.0k|    atomic_store(&f->task_thread.init_done, 1);
  331|  20.0k|    pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
  332|       |
  333|  20.0k|    return 0;
  334|  20.0k|}
dav1d_task_delayed_fg:
  351|    242|{
  352|    242|    struct TaskThreadData *const ttd = &c->task_thread;
  353|    242|    ttd->delayed_fg.in = in;
  354|    242|    ttd->delayed_fg.out = out;
  355|    242|    ttd->delayed_fg.type = DAV1D_TASK_TYPE_FG_PREP;
  356|    242|    atomic_init(&ttd->delayed_fg.progress[0], 0);
  357|    242|    atomic_init(&ttd->delayed_fg.progress[1], 0);
  358|    242|    pthread_mutex_lock(&ttd->lock);
  359|    242|    ttd->delayed_fg.exec = 1;
  360|    242|    ttd->delayed_fg.finished = 0;
  361|    242|    pthread_cond_signal(&ttd->cond);
  362|    242|    do {
  363|    242|        pthread_cond_wait(&ttd->delayed_fg.cond, &ttd->lock);
  364|    242|    } while (!ttd->delayed_fg.finished);
  ------------------
  |  Branch (364:14): [True: 0, False: 242]
  ------------------
  365|    242|    pthread_mutex_unlock(&ttd->lock);
  366|    242|}
dav1d_worker_task:
  550|   534k|void *dav1d_worker_task(void *data) {
  551|   534k|    Dav1dTaskContext *const tc = data;
  552|   534k|    const Dav1dContext *const c = tc->c;
  553|   534k|    struct TaskThreadData *const ttd = tc->task_thread.ttd;
  554|       |
  555|   534k|    dav1d_set_thread_name("dav1d-worker");
  556|       |
  557|   534k|    pthread_mutex_lock(&ttd->lock);
  558|  1.98M|    for (;;) {
  559|  1.98M|        if (tc->task_thread.die) break;
  ------------------
  |  Branch (559:13): [True: 535k, False: 1.44M]
  ------------------
  560|  1.44M|        if (atomic_load(c->flush)) goto park;
  ------------------
  |  Branch (560:13): [True: 1.34k, False: 1.44M]
  ------------------
  561|       |
  562|  1.44M|        merge_pending(c);
  563|  1.44M|        if (ttd->delayed_fg.exec) { // run delayed film grain first
  ------------------
  |  Branch (563:13): [True: 2.06k, False: 1.44M]
  ------------------
  564|  2.06k|            delayed_fg_task(c, ttd);
  565|  2.06k|            continue;
  566|  2.06k|        }
  567|  1.44M|        Dav1dFrameContext *f;
  568|  1.44M|        Dav1dTask *t, *prev_t = NULL;
  569|  1.44M|        if (c->n_fc > 1) { // run init tasks second
  ------------------
  |  Branch (569:13): [True: 0, False: 1.44M]
  ------------------
  570|      0|            for (unsigned i = 0; i < c->n_fc; i++) {
  ------------------
  |  Branch (570:34): [True: 0, False: 0]
  ------------------
  571|      0|                const unsigned first = atomic_load(&ttd->first);
  572|      0|                f = &c->fc[(first + i) % c->n_fc];
  573|      0|                if (atomic_load(&f->task_thread.init_done)) continue;
  ------------------
  |  Branch (573:21): [True: 0, False: 0]
  ------------------
  574|      0|                t = f->task_thread.task_head;
  575|      0|                if (!t) continue;
  ------------------
  |  Branch (575:21): [True: 0, False: 0]
  ------------------
  576|      0|                if (t->type == DAV1D_TASK_TYPE_INIT) goto found;
  ------------------
  |  Branch (576:21): [True: 0, False: 0]
  ------------------
  577|      0|                if (t->type == DAV1D_TASK_TYPE_INIT_CDF) {
  ------------------
  |  Branch (577:21): [True: 0, False: 0]
  ------------------
  578|       |                    // XXX This can be a simple else, if adding tasks of both
  579|       |                    // passes at once (in dav1d_task_create_tile_sbrow).
  580|       |                    // Adding the tasks to the pending Q can result in a
  581|       |                    // thread merging them before setting init_done.
  582|       |                    // We will need to set init_done before adding to the
  583|       |                    // pending Q, so maybe return the tasks, set init_done,
  584|       |                    // and add to pending Q only then.
  585|      0|                    const int p1 = f->in_cdf.progress ?
  ------------------
  |  Branch (585:36): [True: 0, False: 0]
  ------------------
  586|      0|                        atomic_load(f->in_cdf.progress) : 1;
  587|      0|                    if (p1) {
  ------------------
  |  Branch (587:25): [True: 0, False: 0]
  ------------------
  588|      0|                        atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
  589|      0|                        goto found;
  590|      0|                    }
  591|      0|                }
  592|      0|            }
  593|      0|        }
  594|  1.58M|        while (ttd->cur < c->n_fc) { // run decoding tasks last
  ------------------
  |  Branch (594:16): [True: 687k, False: 902k]
  ------------------
  595|   687k|            const unsigned first = atomic_load(&ttd->first);
  596|   687k|            f = &c->fc[(first + ttd->cur) % c->n_fc];
  597|   687k|            merge_pending_frame(f);
  598|   687k|            prev_t = f->task_thread.task_cur_prev;
  599|   687k|            t = prev_t ? prev_t->next : f->task_thread.task_head;
  ------------------
  |  Branch (599:17): [True: 9.21k, False: 677k]
  ------------------
  600|   910k|            while (t) {
  ------------------
  |  Branch (600:20): [True: 767k, False: 143k]
  ------------------
  601|   767k|                if (t->type == DAV1D_TASK_TYPE_INIT_CDF) goto next;
  ------------------
  |  Branch (601:21): [True: 0, False: 767k]
  ------------------
  602|   767k|                else if (t->type == DAV1D_TASK_TYPE_TILE_ENTROPY ||
  ------------------
  |  Branch (602:26): [True: 0, False: 767k]
  ------------------
  603|   767k|                         t->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION)
  ------------------
  |  Branch (603:26): [True: 155k, False: 612k]
  ------------------
  604|   155k|                {
  605|       |                    // if not bottom sbrow of tile, this task will be re-added
  606|       |                    // after it's finished
  607|   155k|                    if (!check_tile(t, f, c->n_fc > 1))
  ------------------
  |  Branch (607:25): [True: 155k, False: 0]
  ------------------
  608|   155k|                        goto found;
  609|   612k|                } else if (t->recon_progress) {
  ------------------
  |  Branch (609:28): [True: 525k, False: 86.3k]
  ------------------
  610|   525k|                    const int p = t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS;
  611|   525k|                    int error = atomic_load(&f->task_thread.error);
  612|   525k|                    assert(!atomic_load(&f->task_thread.done[p]) || error);
  ------------------
  |  |  140|   525k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 525k, False: 0]
  |  |  |  Branch (140:30): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 525k]
  |  |  ------------------
  ------------------
  613|   525k|                    const int tile_row_base = f->frame_hdr->tiling.cols *
  614|   525k|                                              f->frame_thread.next_tile_row[p];
  615|   525k|                    if (p) {
  ------------------
  |  Branch (615:25): [True: 0, False: 525k]
  ------------------
  616|      0|                        atomic_int *const prog = &f->frame_thread.entropy_progress;
  617|      0|                        const int p1 = atomic_load(prog);
  618|      0|                        if (p1 < t->sby) goto next;
  ------------------
  |  Branch (618:29): [True: 0, False: 0]
  ------------------
  619|      0|                        atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
  620|      0|                    }
  621|   916k|                    for (int tc = 0; tc < f->frame_hdr->tiling.cols; tc++) {
  ------------------
  |  Branch (621:38): [True: 538k, False: 377k]
  ------------------
  622|   538k|                        Dav1dTileState *const ts = &f->ts[tile_row_base + tc];
  623|   538k|                        const int p2 = atomic_load(&ts->progress[p]);
  624|   538k|                        if (p2 < t->recon_progress) goto next;
  ------------------
  |  Branch (624:29): [True: 148k, False: 390k]
  ------------------
  625|   538k|                        atomic_fetch_or(&f->task_thread.error, p2 == TILE_ERROR);
  626|   390k|                    }
  627|   377k|                    if (t->sby + 1 < f->sbh) {
  ------------------
  |  Branch (627:25): [True: 357k, False: 20.0k]
  ------------------
  628|       |                        // add sby+1 to list to replace this one
  629|   357k|                        Dav1dTask *next_t = &t[1];
  630|   357k|                        *next_t = *t;
  631|   357k|                        next_t->sby++;
  632|   357k|                        const int ntr = f->frame_thread.next_tile_row[p] + 1;
  633|   357k|                        const int start = f->frame_hdr->tiling.row_start_sb[ntr];
  634|   357k|                        if (next_t->sby == start)
  ------------------
  |  Branch (634:29): [True: 5.63k, False: 352k]
  ------------------
  635|  5.63k|                            f->frame_thread.next_tile_row[p] = ntr;
  636|   357k|                        next_t->recon_progress = next_t->sby + 1;
  637|   357k|                        insert_task(f, next_t, 0);
  638|   357k|                    }
  639|   377k|                    goto found;
  640|   525k|                } else if (t->type == DAV1D_TASK_TYPE_CDEF) {
  ------------------
  |  Branch (640:28): [True: 79.0k, False: 7.34k]
  ------------------
  641|  79.0k|                    atomic_uint *prog = f->frame_thread.copy_lpf_progress;
  642|  79.0k|                    const int p1 = atomic_load(&prog[(t->sby - 1) >> 5]);
  643|  79.0k|                    if (p1 & (1U << ((t->sby - 1) & 31)))
  ------------------
  |  Branch (643:25): [True: 9.69k, False: 69.3k]
  ------------------
  644|  9.69k|                        goto found;
  645|  79.0k|                } else {
  646|  7.34k|                    assert(t->deblock_progress);
  ------------------
  |  |  140|  7.34k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 7.34k]
  |  |  |  Branch (140:68): [Folded, False: 7.34k]
  |  |  ------------------
  ------------------
  647|  7.34k|                    const int p1 = atomic_load(&f->frame_thread.deblock_progress);
  648|  7.34k|                    if (p1 >= t->deblock_progress) {
  ------------------
  |  Branch (648:25): [True: 966, False: 6.37k]
  ------------------
  649|    966|                        atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
  650|    966|                        goto found;
  651|    966|                    }
  652|  7.34k|                }
  653|   223k|            next:
  654|   223k|                prev_t = t;
  655|   223k|                t = t->next;
  656|   223k|                f->task_thread.task_cur_prev = prev_t;
  657|   223k|            }
  658|   143k|            ttd->cur++;
  659|   143k|        }
  660|   902k|        if (reset_task_cur(c, ttd, UINT_MAX)) continue;
  ------------------
  |  Branch (660:13): [True: 3.18k, False: 899k]
  ------------------
  661|   899k|        if (merge_pending(c)) continue;
  ------------------
  |  Branch (661:13): [True: 940, False: 898k]
  ------------------
  662|   899k|    park:
  663|   899k|        tc->task_thread.flushed = 1;
  664|   899k|        pthread_cond_signal(&tc->task_thread.td.cond);
  665|       |        // we want to be woken up next time progress is signaled
  666|   899k|        atomic_store(&ttd->cond_signaled, 0);
  667|   899k|        pthread_cond_wait(&ttd->cond, &ttd->lock);
  668|   899k|        tc->task_thread.flushed = 0;
  669|   899k|        reset_task_cur(c, ttd, UINT_MAX);
  670|   899k|        continue;
  671|       |
  672|   543k|    found:
  673|       |        // remove t from list
  674|   543k|        if (prev_t) prev_t->next = t->next;
  ------------------
  |  Branch (674:13): [True: 67.6k, False: 475k]
  ------------------
  675|   475k|        else f->task_thread.task_head = t->next;
  676|   543k|        if (!t->next) f->task_thread.task_tail = prev_t;
  ------------------
  |  Branch (676:13): [True: 33.4k, False: 510k]
  ------------------
  677|   543k|        if (t->type > DAV1D_TASK_TYPE_INIT_CDF && !f->task_thread.task_head)
  ------------------
  |  Branch (677:13): [True: 543k, False: 0]
  |  Branch (677:51): [True: 20.1k, False: 523k]
  ------------------
  678|  20.1k|            ttd->cur++;
  679|   543k|        t->next = NULL;
  680|       |        // we don't need to check cond_signaled here, since we found a task
  681|       |        // after the last signal so we want to re-signal the next waiting thread
  682|       |        // and again won't need to signal after that
  683|   543k|        atomic_store(&ttd->cond_signaled, 1);
  684|   543k|        pthread_cond_signal(&ttd->cond);
  685|   543k|        pthread_mutex_unlock(&ttd->lock);
  686|   772k|    found_unlocked:;
  687|   772k|        const int flush = atomic_load(c->flush);
  688|   772k|        int error = atomic_fetch_or(&f->task_thread.error, flush) | flush;
  689|       |
  690|       |        // run it
  691|   772k|        tc->f = f;
  692|   772k|        int sby = t->sby;
  693|   772k|        switch (t->type) {
  694|      0|        case DAV1D_TASK_TYPE_INIT: {
  ------------------
  |  Branch (694:9): [True: 0, False: 772k]
  ------------------
  695|      0|            assert(c->n_fc > 1);
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
  696|      0|            int res = dav1d_decode_frame_init(f);
  697|      0|            int p1 = f->in_cdf.progress ? atomic_load(f->in_cdf.progress) : 1;
  ------------------
  |  Branch (697:22): [True: 0, False: 0]
  ------------------
  698|      0|            if (res || p1 == TILE_ERROR) {
  ------------------
  |  |   36|      0|#define TILE_ERROR (INT_MAX - 1)
  ------------------
  |  Branch (698:17): [True: 0, False: 0]
  |  Branch (698:24): [True: 0, False: 0]
  ------------------
  699|      0|                pthread_mutex_lock(&ttd->lock);
  700|      0|                abort_frame(f, res ? res : DAV1D_ERR(EINVAL));
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  |  Branch (700:32): [True: 0, False: 0]
  ------------------
  701|      0|                reset_task_cur(c, ttd, t->frame_idx);
  702|      0|            } else {
  703|      0|                t->type = DAV1D_TASK_TYPE_INIT_CDF;
  704|      0|                if (p1) goto found_unlocked;
  ------------------
  |  Branch (704:21): [True: 0, False: 0]
  ------------------
  705|      0|                add_pending(f, t);
  706|      0|                pthread_mutex_lock(&ttd->lock);
  707|      0|            }
  708|      0|            continue;
  709|      0|        }
  710|      0|        case DAV1D_TASK_TYPE_INIT_CDF: {
  ------------------
  |  Branch (710:9): [True: 0, False: 772k]
  ------------------
  711|      0|            assert(c->n_fc > 1);
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
  712|      0|            int res = DAV1D_ERR(EINVAL);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  713|      0|            if (!atomic_load(&f->task_thread.error))
  ------------------
  |  Branch (713:17): [True: 0, False: 0]
  ------------------
  714|      0|                res = dav1d_decode_frame_init_cdf(f);
  715|      0|            if (f->frame_hdr->refresh_context && !f->task_thread.update_set) {
  ------------------
  |  Branch (715:17): [True: 0, False: 0]
  |  Branch (715:50): [True: 0, False: 0]
  ------------------
  716|      0|                atomic_store(f->out_cdf.progress, res < 0 ? TILE_ERROR : 1);
  ------------------
  |  Branch (716:17): [True: 0, False: 0]
  ------------------
  717|      0|            }
  718|      0|            if (!res) {
  ------------------
  |  Branch (718:17): [True: 0, False: 0]
  ------------------
  719|      0|                assert(c->n_fc > 1);
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
  720|      0|                for (int p = 1; p <= 2; p++) {
  ------------------
  |  Branch (720:33): [True: 0, False: 0]
  ------------------
  721|      0|                    const int res = dav1d_task_create_tile_sbrow(f, p, 0);
  722|      0|                    if (res) {
  ------------------
  |  Branch (722:25): [True: 0, False: 0]
  ------------------
  723|      0|                        pthread_mutex_lock(&ttd->lock);
  724|       |                        // memory allocation failed
  725|      0|                        atomic_store(&f->task_thread.done[2 - p], 1);
  726|      0|                        atomic_store(&f->task_thread.error, -1);
  727|      0|                        atomic_fetch_sub(&f->task_thread.task_counter,
  728|      0|                                         f->frame_hdr->tiling.cols *
  729|      0|                                         f->frame_hdr->tiling.rows + f->sbh);
  730|      0|                        atomic_store(&f->sr_cur.progress[p - 1], FRAME_ERROR);
  731|      0|                        if (p == 2 && atomic_load(&f->task_thread.done[1])) {
  ------------------
  |  Branch (731:29): [True: 0, False: 0]
  |  Branch (731:39): [True: 0, False: 0]
  ------------------
  732|      0|                            assert(!atomic_load(&f->task_thread.task_counter));
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
  733|      0|                            dav1d_decode_frame_exit(f, DAV1D_ERR(ENOMEM));
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  734|      0|                            f->n_tile_data = 0;
  735|      0|                            pthread_cond_signal(&f->task_thread.cond);
  736|      0|                        } else {
  737|      0|                            pthread_mutex_unlock(&ttd->lock);
  738|      0|                        }
  739|      0|                    }
  740|      0|                }
  741|      0|                pthread_mutex_lock(&ttd->lock);
  742|      0|            } else {
  743|      0|                pthread_mutex_lock(&ttd->lock);
  744|      0|                abort_frame(f, res);
  745|      0|                reset_task_cur(c, ttd, t->frame_idx);
  746|      0|                atomic_store(&f->task_thread.init_done, 1);
  747|      0|            }
  748|      0|            continue;
  749|      0|        }
  750|      0|        case DAV1D_TASK_TYPE_TILE_ENTROPY:
  ------------------
  |  Branch (750:9): [True: 0, False: 772k]
  ------------------
  751|   384k|        case DAV1D_TASK_TYPE_TILE_RECONSTRUCTION: {
  ------------------
  |  Branch (751:9): [True: 384k, False: 388k]
  ------------------
  752|   384k|            const int p = t->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
  753|   384k|            const int tile_idx = (int)(t - f->task_thread.tile_tasks[p]);
  754|   384k|            Dav1dTileState *const ts = &f->ts[tile_idx];
  755|       |
  756|   384k|            tc->ts = ts;
  757|   384k|            tc->by = sby << f->sb_shift;
  758|   384k|            const int uses_2pass = c->n_fc > 1;
  759|   384k|            tc->frame_thread.pass = !uses_2pass ? 0 :
  ------------------
  |  Branch (759:37): [True: 384k, False: 3]
  ------------------
  760|   384k|                1 + (t->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION);
  761|   384k|            if (!error) error = dav1d_decode_tile_sbrow(tc);
  ------------------
  |  Branch (761:17): [True: 143k, False: 240k]
  ------------------
  762|   384k|            const int progress = error ? TILE_ERROR : 1 + sby;
  ------------------
  |  |   36|   254k|#define TILE_ERROR (INT_MAX - 1)
  ------------------
  |  Branch (762:34): [True: 254k, False: 130k]
  ------------------
  763|       |
  764|       |            // signal progress
  765|   384k|            atomic_fetch_or(&f->task_thread.error, error);
  766|   384k|            if (((sby + 1) << f->sb_shift) < ts->tiling.row_end) {
  ------------------
  |  Branch (766:17): [True: 354k, False: 30.1k]
  ------------------
  767|   354k|                t->sby++;
  768|   354k|                t->deps_skip = 0;
  769|   354k|                if (!check_tile(t, f, uses_2pass)) {
  ------------------
  |  Branch (769:21): [True: 229k, False: 124k]
  ------------------
  770|   229k|                    atomic_store(&ts->progress[p], progress);
  771|   229k|                    reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
  772|   229k|                    if (!atomic_fetch_or(&ttd->cond_signaled, 1))
  ------------------
  |  Branch (772:25): [True: 489, False: 229k]
  ------------------
  773|    489|                        pthread_cond_signal(&ttd->cond);
  774|   229k|                    goto found_unlocked;
  775|   229k|                }
  776|   354k|                atomic_store(&ts->progress[p], progress);
  777|   124k|                add_pending(f, t);
  778|   124k|                pthread_mutex_lock(&ttd->lock);
  779|   124k|            } else {
  780|  30.1k|                pthread_mutex_lock(&ttd->lock);
  781|  30.1k|                atomic_store(&ts->progress[p], progress);
  782|  30.1k|                reset_task_cur(c, ttd, t->frame_idx);
  783|  30.1k|                error = atomic_load(&f->task_thread.error);
  784|  30.1k|                if (f->frame_hdr->refresh_context &&
  ------------------
  |  Branch (784:21): [True: 8.50k, False: 21.6k]
  ------------------
  785|  8.50k|                    tc->frame_thread.pass <= 1 && f->task_thread.update_set &&
  ------------------
  |  Branch (785:21): [True: 8.50k, False: 0]
  |  Branch (785:51): [True: 8.50k, False: 0]
  ------------------
  786|  8.50k|                    f->frame_hdr->tiling.update == tile_idx)
  ------------------
  |  Branch (786:21): [True: 8.03k, False: 465]
  ------------------
  787|  8.03k|                {
  788|  8.03k|                    if (!error)
  ------------------
  |  Branch (788:25): [True: 6.67k, False: 1.36k]
  ------------------
  789|  6.67k|                        dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf,
  790|  6.67k|                                                &f->ts[f->frame_hdr->tiling.update].cdf);
  791|  8.03k|                    if (c->n_fc > 1)
  ------------------
  |  Branch (791:25): [True: 0, False: 8.03k]
  ------------------
  792|  8.03k|                        atomic_store(f->out_cdf.progress, error ? TILE_ERROR : 1);
  ------------------
  |  Branch (792:25): [True: 0, False: 0]
  ------------------
  793|  8.03k|                }
  794|  30.1k|                if (atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1 == 0 &&
  ------------------
  |  Branch (794:21): [True: 169, False: 30.0k]
  ------------------
  795|  30.1k|                    atomic_load(&f->task_thread.done[0]) &&
  ------------------
  |  Branch (795:21): [True: 169, False: 0]
  ------------------
  796|    169|                    (!uses_2pass || atomic_load(&f->task_thread.done[1])))
  ------------------
  |  Branch (796:22): [True: 169, False: 0]
  |  Branch (796:37): [True: 0, False: 0]
  ------------------
  797|    169|                {
  798|    169|                    error = atomic_load(&f->task_thread.error);
  799|    169|                    dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
  ------------------
  |  |   56|    169|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  |  Branch (799:48): [True: 169, False: 0]
  ------------------
  800|    169|                                            error ? DAV1D_ERR(ENOMEM) : 0);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  |  Branch (800:45): [True: 0, False: 0]
  ------------------
  801|    169|                    f->n_tile_data = 0;
  802|    169|                    pthread_cond_signal(&f->task_thread.cond);
  803|    169|                }
  804|  30.1k|                assert(atomic_load(&f->task_thread.task_counter) >= 0);
  ------------------
  |  |  140|  30.1k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 30.1k]
  |  |  |  Branch (140:68): [Folded, False: 30.1k]
  |  |  ------------------
  ------------------
  805|  30.1k|                if (!atomic_fetch_or(&ttd->cond_signaled, 1))
  ------------------
  |  Branch (805:21): [True: 18.0k, False: 12.1k]
  ------------------
  806|  18.0k|                    pthread_cond_signal(&ttd->cond);
  807|  30.1k|            }
  808|   155k|            continue;
  809|   384k|        }
  810|   193k|        case DAV1D_TASK_TYPE_DEBLOCK_COLS:
  ------------------
  |  Branch (810:9): [True: 193k, False: 579k]
  ------------------
  811|   193k|            if (!atomic_load(&f->task_thread.error))
  ------------------
  |  Branch (811:17): [True: 78.6k, False: 114k]
  ------------------
  812|  78.6k|                f->bd_fn.filter_sbrow_deblock_cols(f, sby);
  813|   193k|            if (ensure_progress(ttd, f, t, DAV1D_TASK_TYPE_DEBLOCK_ROWS,
  ------------------
  |  Branch (813:17): [True: 966, False: 192k]
  ------------------
  814|   193k|                                &f->frame_thread.deblock_progress,
  815|   193k|                                &t->deblock_progress)) continue;
  816|       |            // fall-through
  817|   301k|        case DAV1D_TASK_TYPE_DEBLOCK_ROWS:
  ------------------
  |  Branch (817:9): [True: 109k, False: 663k]
  ------------------
  818|   301k|            if (!atomic_load(&f->task_thread.error))
  ------------------
  |  Branch (818:17): [True: 105k, False: 196k]
  ------------------
  819|   105k|                f->bd_fn.filter_sbrow_deblock_rows(f, sby);
  820|       |            // signal deblock progress
  821|   301k|            if (f->frame_hdr->loopfilter.level_y[0] ||
  ------------------
  |  Branch (821:17): [True: 176k, False: 124k]
  ------------------
  822|   124k|                f->frame_hdr->loopfilter.level_y[1])
  ------------------
  |  Branch (822:17): [True: 16.3k, False: 108k]
  ------------------
  823|   193k|            {
  824|   193k|                error = atomic_load(&f->task_thread.error);
  825|   193k|                atomic_store(&f->frame_thread.deblock_progress,
  ------------------
  |  Branch (825:17): [True: 114k, False: 78.6k]
  ------------------
  826|   193k|                             error ? TILE_ERROR : sby + 1);
  827|   193k|                reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
  828|   193k|                if (!atomic_fetch_or(&ttd->cond_signaled, 1))
  ------------------
  |  Branch (828:21): [True: 4.93k, False: 188k]
  ------------------
  829|  4.93k|                    pthread_cond_signal(&ttd->cond);
  830|   193k|            } else if (f->seq_hdr->cdef || f->lf.restore_planes) {
  ------------------
  |  Branch (830:24): [True: 107k, False: 1.40k]
  |  Branch (830:44): [True: 1.40k, False: 0]
  ------------------
  831|   108k|                atomic_fetch_or(&f->frame_thread.copy_lpf_progress[sby >> 5],
  832|   108k|                                1U << (sby & 31));
  833|       |                // CDEF needs the top buffer to be saved by lr_copy_lpf of the
  834|       |                // previous sbrow
  835|   108k|                if (sby) {
  ------------------
  |  Branch (835:21): [True: 100k, False: 7.76k]
  ------------------
  836|   100k|                    int prog = atomic_load(&f->frame_thread.copy_lpf_progress[(sby - 1) >> 5]);
  837|   100k|                    if (~prog & (1U << ((sby - 1) & 31))) {
  ------------------
  |  Branch (837:25): [True: 9.68k, False: 91.0k]
  ------------------
  838|  9.68k|                        t->type = DAV1D_TASK_TYPE_CDEF;
  839|  9.68k|                        t->recon_progress = t->deblock_progress = 0;
  840|  9.68k|                        add_pending(f, t);
  841|  9.68k|                        pthread_mutex_lock(&ttd->lock);
  842|  9.68k|                        continue;
  843|  9.68k|                    }
  844|   100k|                }
  845|   108k|            }
  846|       |            // fall-through
  847|   301k|        case DAV1D_TASK_TYPE_CDEF:
  ------------------
  |  Branch (847:9): [True: 9.68k, False: 763k]
  ------------------
  848|   301k|            if (f->seq_hdr->cdef) {
  ------------------
  |  Branch (848:17): [True: 236k, False: 65.2k]
  ------------------
  849|   236k|                if (!atomic_load(&f->task_thread.error))
  ------------------
  |  Branch (849:21): [True: 89.7k, False: 146k]
  ------------------
  850|  89.7k|                    f->bd_fn.filter_sbrow_cdef(tc, sby);
  851|   236k|                reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
  852|   236k|                if (!atomic_fetch_or(&ttd->cond_signaled, 1))
  ------------------
  |  Branch (852:21): [True: 8.24k, False: 228k]
  ------------------
  853|  8.24k|                    pthread_cond_signal(&ttd->cond);
  854|   236k|            }
  855|       |            // fall-through
  856|   301k|        case DAV1D_TASK_TYPE_SUPER_RESOLUTION:
  ------------------
  |  Branch (856:9): [True: 111, False: 772k]
  ------------------
  857|   301k|            if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
  ------------------
  |  Branch (857:17): [True: 6.77k, False: 295k]
  ------------------
  858|  6.77k|                if (!atomic_load(&f->task_thread.error))
  ------------------
  |  Branch (858:21): [True: 2.77k, False: 3.99k]
  ------------------
  859|  2.77k|                    f->bd_fn.filter_sbrow_resize(f, sby);
  860|       |            // fall-through
  861|   301k|        case DAV1D_TASK_TYPE_LOOP_RESTORATION:
  ------------------
  |  Branch (861:9): [True: 0, False: 772k]
  ------------------
  862|   301k|            if (!atomic_load(&f->task_thread.error) && f->lf.restore_planes)
  ------------------
  |  Branch (862:17): [True: 105k, False: 196k]
  |  Branch (862:56): [True: 61.0k, False: 43.9k]
  ------------------
  863|  61.0k|                f->bd_fn.filter_sbrow_lr(f, sby);
  864|       |            // fall-through
  865|   377k|        case DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS:
  ------------------
  |  Branch (865:9): [True: 75.4k, False: 697k]
  ------------------
  866|       |            // dummy to cover for no post-filters
  867|   377k|        case DAV1D_TASK_TYPE_ENTROPY_PROGRESS:
  ------------------
  |  Branch (867:9): [True: 0, False: 772k]
  ------------------
  868|       |            // dummy to convert tile progress to frame
  869|   377k|            break;
  870|      0|        default: abort();
  ------------------
  |  Branch (870:9): [True: 0, False: 772k]
  ------------------
  871|   772k|        }
  872|       |        // if task completed [typically LR], signal picture progress as per below
  873|   377k|        const int uses_2pass = c->n_fc > 1;
  874|   377k|        const int sbh = f->sbh;
  875|   377k|        const int sbsz = f->sb_step * 4;
  876|   377k|        if (t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS) {
  ------------------
  |  Branch (876:13): [True: 0, False: 377k]
  ------------------
  877|      0|            error = atomic_load(&f->task_thread.error);
  878|      0|            const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz;
  ------------------
  |  Branch (878:32): [True: 0, False: 0]
  ------------------
  879|      0|            assert(c->n_fc > 1);
  ------------------
  |  |  140|      0|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 0]
  |  |  ------------------
  ------------------
  880|      0|            if (f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */)
  ------------------
  |  Branch (880:17): [True: 0, False: 0]
  ------------------
  881|      0|                atomic_store(&f->sr_cur.progress[0], error ? FRAME_ERROR : y);
  ------------------
  |  Branch (881:17): [True: 0, False: 0]
  ------------------
  882|      0|            atomic_store(&f->frame_thread.entropy_progress,
  ------------------
  |  Branch (882:13): [True: 0, False: 0]
  ------------------
  883|      0|                         error ? TILE_ERROR : sby + 1);
  884|      0|            if (sby + 1 == sbh)
  ------------------
  |  Branch (884:17): [True: 0, False: 0]
  ------------------
  885|      0|                atomic_store(&f->task_thread.done[1], 1);
  886|      0|            pthread_mutex_lock(&ttd->lock);
  887|      0|            const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1;
  888|      0|            if (sby + 1 < sbh && num_tasks) {
  ------------------
  |  Branch (888:17): [True: 0, False: 0]
  |  Branch (888:34): [True: 0, False: 0]
  ------------------
  889|      0|                reset_task_cur(c, ttd, t->frame_idx);
  890|      0|                continue;
  891|      0|            }
  892|      0|            if (!num_tasks && atomic_load(&f->task_thread.done[0]) &&
  ------------------
  |  Branch (892:17): [True: 0, False: 0]
  |  Branch (892:31): [True: 0, False: 0]
  ------------------
  893|      0|                atomic_load(&f->task_thread.done[1]))
  ------------------
  |  Branch (893:17): [True: 0, False: 0]
  ------------------
  894|      0|            {
  895|      0|                error = atomic_load(&f->task_thread.error);
  896|      0|                dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  |  Branch (896:44): [True: 0, False: 0]
  ------------------
  897|      0|                                        error ? DAV1D_ERR(ENOMEM) : 0);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  |  Branch (897:41): [True: 0, False: 0]
  ------------------
  898|      0|                f->n_tile_data = 0;
  899|      0|                pthread_cond_signal(&f->task_thread.cond);
  900|      0|            }
  901|      0|            reset_task_cur(c, ttd, t->frame_idx);
  902|      0|            continue;
  903|      0|        }
  904|       |    // t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS
  905|   377k|        atomic_fetch_or(&f->frame_thread.frame_progress[sby >> 5],
  906|   377k|                        1U << (sby & 31));
  907|   377k|        pthread_mutex_lock(&f->task_thread.lock);
  908|   377k|        sby = get_frame_progress(c, f);
  909|   377k|        error = atomic_load(&f->task_thread.error);
  910|   377k|        const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz;
  ------------------
  |  Branch (910:28): [True: 20.4k, False: 356k]
  ------------------
  911|   377k|        if (c->n_fc > 1 && f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */)
  ------------------
  |  Branch (911:13): [True: 0, False: 377k]
  |  Branch (911:28): [True: 0, False: 0]
  ------------------
  912|   377k|            atomic_store(&f->sr_cur.progress[1], error ? FRAME_ERROR : y);
  ------------------
  |  Branch (912:13): [True: 0, False: 0]
  ------------------
  913|   377k|        pthread_mutex_unlock(&f->task_thread.lock);
  914|   377k|        if (sby + 1 == sbh)
  ------------------
  |  Branch (914:13): [True: 20.4k, False: 356k]
  ------------------
  915|   377k|            atomic_store(&f->task_thread.done[0], 1);
  916|   377k|        pthread_mutex_lock(&ttd->lock);
  917|   377k|        const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1;
  918|   377k|        if (sby + 1 < sbh && num_tasks) {
  ------------------
  |  Branch (918:13): [True: 357k, False: 20.1k]
  |  Branch (918:30): [True: 354k, False: 2.53k]
  ------------------
  919|   354k|            reset_task_cur(c, ttd, t->frame_idx);
  920|   354k|            continue;
  921|   354k|        }
  922|  22.6k|        if (!num_tasks && atomic_load(&f->task_thread.done[0]) &&
  ------------------
  |  Branch (922:13): [True: 19.8k, False: 2.82k]
  |  Branch (922:27): [True: 19.8k, False: 0]
  ------------------
  923|  19.8k|            (!uses_2pass || atomic_load(&f->task_thread.done[1])))
  ------------------
  |  Branch (923:14): [True: 19.8k, False: 0]
  |  Branch (923:29): [True: 0, False: 0]
  ------------------
  924|  19.8k|        {
  925|  19.8k|            error = atomic_load(&f->task_thread.error);
  926|  19.8k|            dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
  ------------------
  |  |   56|  9.14k|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  |  Branch (926:40): [True: 9.14k, False: 10.7k]
  ------------------
  927|  19.8k|                                    error ? DAV1D_ERR(ENOMEM) : 0);
  ------------------
  |  |   56|      0|#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
  ------------------
  |  Branch (927:37): [True: 0, False: 10.7k]
  ------------------
  928|  19.8k|            f->n_tile_data = 0;
  929|  19.8k|            pthread_cond_signal(&f->task_thread.cond);
  930|  19.8k|        }
  931|  22.6k|        reset_task_cur(c, ttd, t->frame_idx);
  932|  22.6k|    }
  933|   534k|    pthread_mutex_unlock(&ttd->lock);
  934|       |
  935|       |    return NULL;
  936|   534k|}
thread_task.c:create_filter_sbrow:
  215|  20.0k|{
  216|  20.0k|    const int has_deblock = f->frame_hdr->loopfilter.level_y[0] ||
  ------------------
  |  Branch (216:29): [True: 8.79k, False: 11.2k]
  ------------------
  217|  11.2k|                            f->frame_hdr->loopfilter.level_y[1];
  ------------------
  |  Branch (217:29): [True: 926, False: 10.3k]
  ------------------
  218|  20.0k|    const int has_cdef = f->seq_hdr->cdef;
  219|  20.0k|    const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
  220|  20.0k|    const int has_lr = f->lf.restore_planes;
  221|       |
  222|  20.0k|    Dav1dTask *tasks = f->task_thread.tasks;
  223|  20.0k|    const int uses_2pass = f->c->n_fc > 1;
  224|  20.0k|    int num_tasks = f->sbh * (1 + uses_2pass);
  225|  20.0k|    if (num_tasks > f->task_thread.num_tasks) {
  ------------------
  |  Branch (225:9): [True: 14.4k, False: 5.59k]
  ------------------
  226|  14.4k|        const size_t size = sizeof(Dav1dTask) * num_tasks;
  227|  14.4k|        tasks = dav1d_realloc(ALLOC_COMMON_CTX, f->task_thread.tasks, size);
  ------------------
  |  |  133|  14.4k|#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz)
  ------------------
  228|  14.4k|        if (!tasks) return -1;
  ------------------
  |  Branch (228:13): [True: 0, False: 14.4k]
  ------------------
  229|  14.4k|        memset(tasks, 0, size);
  230|  14.4k|        f->task_thread.tasks = tasks;
  231|  14.4k|        f->task_thread.num_tasks = num_tasks;
  232|  14.4k|    }
  233|  20.0k|    tasks += f->sbh * (pass & 1);
  234|       |
  235|  20.0k|    if (pass & 1) {
  ------------------
  |  Branch (235:9): [True: 0, False: 20.0k]
  ------------------
  236|      0|        f->frame_thread.entropy_progress = 0;
  237|  20.0k|    } else {
  238|  20.0k|        const int prog_sz = ((f->sbh + 31) & ~31) >> 5;
  239|  20.0k|        if (prog_sz > f->frame_thread.prog_sz) {
  ------------------
  |  Branch (239:13): [True: 14.4k, False: 5.60k]
  ------------------
  240|  14.4k|            atomic_uint *const prog = dav1d_realloc(ALLOC_COMMON_CTX, f->frame_thread.frame_progress,
  ------------------
  |  |  133|  14.4k|#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz)
  ------------------
  241|  14.4k|                                                    2 * prog_sz * sizeof(*prog));
  242|  14.4k|            if (!prog) return -1;
  ------------------
  |  Branch (242:17): [True: 0, False: 14.4k]
  ------------------
  243|  14.4k|            f->frame_thread.frame_progress = prog;
  244|  14.4k|            f->frame_thread.copy_lpf_progress = prog + prog_sz;
  245|  14.4k|        }
  246|  20.0k|        f->frame_thread.prog_sz = prog_sz;
  247|  20.0k|        memset(f->frame_thread.frame_progress, 0, prog_sz * sizeof(atomic_uint));
  248|  20.0k|        memset(f->frame_thread.copy_lpf_progress, 0, prog_sz * sizeof(atomic_uint));
  249|  20.0k|        atomic_store(&f->frame_thread.deblock_progress, 0);
  250|  20.0k|    }
  251|  20.0k|    f->frame_thread.next_tile_row[pass & 1] = 0;
  252|       |
  253|  20.0k|    Dav1dTask *t = &tasks[0];
  254|  20.0k|    t->sby = 0;
  255|  20.0k|    t->recon_progress = 1;
  256|  20.0k|    t->deblock_progress = 0;
  257|  20.0k|    t->type = pass == 1 ? DAV1D_TASK_TYPE_ENTROPY_PROGRESS :
  ------------------
  |  Branch (257:15): [True: 0, False: 20.0k]
  ------------------
  258|  20.0k|              has_deblock ? DAV1D_TASK_TYPE_DEBLOCK_COLS :
  ------------------
  |  Branch (258:15): [True: 9.72k, False: 10.3k]
  ------------------
  259|  20.0k|              has_cdef || has_lr /* i.e. LR backup */ ? DAV1D_TASK_TYPE_DEBLOCK_ROWS :
  ------------------
  |  Branch (259:15): [True: 7.73k, False: 2.57k]
  |  Branch (259:27): [True: 55, False: 2.51k]
  ------------------
  260|  10.3k|              has_resize ? DAV1D_TASK_TYPE_SUPER_RESOLUTION :
  ------------------
  |  Branch (260:15): [True: 53, False: 2.46k]
  ------------------
  261|  2.51k|              DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS;
  262|  20.0k|    t->frame_idx = (int)(f - f->c->fc);
  263|       |
  264|  20.0k|    *res_t = t;
  265|  20.0k|    return 0;
  266|  20.0k|}
thread_task.c:insert_task:
  172|   543k|{
  173|   543k|    insert_tasks(f, t, t, cond_signal);
  174|   543k|}
thread_task.c:insert_tasks:
  118|   543k|{
  119|       |    // insert task back into task queue
  120|   543k|    Dav1dTask *t_ptr, *prev_t = NULL;
  121|   543k|    for (t_ptr = f->task_thread.task_head;
  122|  1.27M|         t_ptr; prev_t = t_ptr, t_ptr = t_ptr->next)
  ------------------
  |  Branch (122:10): [True: 807k, False: 468k]
  ------------------
  123|   807k|    {
  124|       |        // entropy coding precedes other steps
  125|   807k|        if (t_ptr->type == DAV1D_TASK_TYPE_TILE_ENTROPY) {
  ------------------
  |  Branch (125:13): [True: 0, False: 807k]
  ------------------
  126|      0|            if (first->type > DAV1D_TASK_TYPE_TILE_ENTROPY) continue;
  ------------------
  |  Branch (126:17): [True: 0, False: 0]
  ------------------
  127|       |            // both are entropy
  128|      0|            if (first->sby > t_ptr->sby) continue;
  ------------------
  |  Branch (128:17): [True: 0, False: 0]
  ------------------
  129|      0|            if (first->sby < t_ptr->sby) {
  ------------------
  |  Branch (129:17): [True: 0, False: 0]
  ------------------
  130|      0|                insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
  131|      0|                return;
  132|      0|            }
  133|       |            // same sby
  134|   807k|        } else {
  135|   807k|            if (first->type == DAV1D_TASK_TYPE_TILE_ENTROPY) {
  ------------------
  |  Branch (135:17): [True: 0, False: 807k]
  ------------------
  136|      0|                insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
  137|      0|                return;
  138|      0|            }
  139|   807k|            if (first->sby > t_ptr->sby) continue;
  ------------------
  |  Branch (139:17): [True: 587k, False: 220k]
  ------------------
  140|   220k|            if (first->sby < t_ptr->sby) {
  ------------------
  |  Branch (140:17): [True: 73.2k, False: 147k]
  ------------------
  141|  73.2k|                insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
  142|  73.2k|                return;
  143|  73.2k|            }
  144|       |            // same sby
  145|   147k|            if (first->type > t_ptr->type) continue;
  ------------------
  |  Branch (145:17): [True: 132k, False: 15.0k]
  ------------------
  146|  15.0k|            if (first->type < t_ptr->type) {
  ------------------
  |  Branch (146:17): [True: 1.74k, False: 13.2k]
  ------------------
  147|  1.74k|                insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
  148|  1.74k|                return;
  149|  1.74k|            }
  150|       |            // same task type
  151|  15.0k|        }
  152|       |
  153|       |        // sort by tile-id
  154|  13.2k|        assert(first->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION ||
  ------------------
  |  |  140|  13.2k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 13.2k, False: 0]
  |  |  |  Branch (140:30): [True: 0, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 13.2k]
  |  |  ------------------
  ------------------
  155|  13.2k|               first->type == DAV1D_TASK_TYPE_TILE_ENTROPY);
  156|  13.2k|        assert(first->type == t_ptr->type);
  ------------------
  |  |  140|  13.2k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 13.2k]
  |  |  |  Branch (140:68): [Folded, False: 13.2k]
  |  |  ------------------
  ------------------
  157|  13.2k|        assert(t_ptr->sby == first->sby);
  ------------------
  |  |  140|  13.2k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 13.2k]
  |  |  |  Branch (140:68): [Folded, False: 13.2k]
  |  |  ------------------
  ------------------
  158|  13.2k|        const int p = first->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
  159|  13.2k|        const int t_tile_idx = (int) (first - f->task_thread.tile_tasks[p]);
  160|  13.2k|        const int p_tile_idx = (int) (t_ptr - f->task_thread.tile_tasks[p]);
  161|  13.2k|        assert(t_tile_idx != p_tile_idx);
  ------------------
  |  |  140|  13.2k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 13.2k]
  |  |  |  Branch (140:68): [Folded, False: 13.2k]
  |  |  ------------------
  ------------------
  162|  13.2k|        if (t_tile_idx > p_tile_idx) continue;
  ------------------
  |  Branch (162:13): [True: 13.2k, False: 67]
  ------------------
  163|     67|        insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
  164|     67|        return;
  165|  13.2k|    }
  166|       |    // append at the end
  167|   468k|    insert_tasks_between(f, first, last, prev_t, NULL, cond_signal);
  168|   468k|}
thread_task.c:insert_tasks_between:
  102|   543k|{
  103|   543k|    struct TaskThreadData *const ttd = f->task_thread.ttd;
  104|   543k|    if (atomic_load(f->c->flush)) return;
  ------------------
  |  Branch (104:9): [True: 0, False: 543k]
  ------------------
  105|   543k|    assert(!a || a->next == b);
  ------------------
  |  |  140|  1.06M|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:30): [True: 26.9k, False: 516k]
  |  |  |  Branch (140:30): [True: 516k, False: 0]
  |  |  |  Branch (140:68): [Folded, False: 543k]
  |  |  ------------------
  ------------------
  106|   543k|    if (!a) f->task_thread.task_head = first;
  ------------------
  |  Branch (106:9): [True: 26.9k, False: 516k]
  ------------------
  107|   516k|    else a->next = first;
  108|   543k|    if (!b) f->task_thread.task_tail = last;
  ------------------
  |  Branch (108:9): [True: 468k, False: 75.0k]
  ------------------
  109|   543k|    last->next = b;
  110|   543k|    reset_task_cur(f->c, ttd, first->frame_idx);
  111|   543k|    if (cond_signal && !atomic_fetch_or(&ttd->cond_signaled, 1))
  ------------------
  |  Branch (111:9): [True: 0, False: 543k]
  |  Branch (111:24): [True: 0, False: 0]
  ------------------
  112|      0|        pthread_cond_signal(&ttd->cond);
  113|   543k|}
thread_task.c:merge_pending:
  206|  2.34M|static inline int merge_pending(const Dav1dContext *const c) {
  207|  2.34M|    int res = 0;
  208|  4.69M|    for (unsigned i = 0; i < c->n_fc; i++)
  ------------------
  |  Branch (208:26): [True: 2.34M, False: 2.34M]
  ------------------
  209|  2.34M|        res |= merge_pending_frame(&c->fc[i]);
  210|  2.34M|    return res;
  211|  2.34M|}
thread_task.c:delayed_fg_task:
  467|  2.06k|{
  468|  2.06k|    const Dav1dPicture *const in = ttd->delayed_fg.in;
  469|  2.06k|    Dav1dPicture *const out = ttd->delayed_fg.out;
  470|  2.06k|#if CONFIG_16BPC
  471|  2.06k|    int off;
  472|  2.06k|    if (out->p.bpc != 8)
  ------------------
  |  Branch (472:9): [True: 1.33k, False: 730]
  ------------------
  473|  1.33k|        off = (out->p.bpc >> 1) - 4;
  474|  2.06k|#endif
  475|  2.06k|    switch (ttd->delayed_fg.type) {
  476|    242|    case DAV1D_TASK_TYPE_FG_PREP:
  ------------------
  |  Branch (476:5): [True: 242, False: 1.81k]
  ------------------
  477|    242|        ttd->delayed_fg.exec = 0;
  478|    242|        if (atomic_load(&ttd->cond_signaled))
  ------------------
  |  Branch (478:13): [True: 0, False: 242]
  ------------------
  479|      0|            pthread_cond_signal(&ttd->cond);
  480|    242|        pthread_mutex_unlock(&ttd->lock);
  481|    242|        switch (out->p.bpc) {
  482|      0|#if CONFIG_8BPC
  483|    101|        case 8:
  ------------------
  |  Branch (483:9): [True: 101, False: 141]
  ------------------
  484|    101|            dav1d_prep_grain_8bpc(&c->dsp[0].fg, out, in,
  485|    101|                                  ttd->delayed_fg.scaling_8bpc,
  486|    101|                                  ttd->delayed_fg.grain_lut_8bpc);
  487|    101|            break;
  488|      0|#endif
  489|      0|#if CONFIG_16BPC
  490|     75|        case 10:
  ------------------
  |  Branch (490:9): [True: 75, False: 167]
  ------------------
  491|    141|        case 12:
  ------------------
  |  Branch (491:9): [True: 66, False: 176]
  ------------------
  492|    141|            dav1d_prep_grain_16bpc(&c->dsp[off].fg, out, in,
  493|    141|                                   ttd->delayed_fg.scaling_16bpc,
  494|    141|                                   ttd->delayed_fg.grain_lut_16bpc);
  495|    141|            break;
  496|      0|#endif
  497|      0|        default: abort();
  ------------------
  |  Branch (497:9): [True: 0, False: 242]
  ------------------
  498|    242|        }
  499|    242|        ttd->delayed_fg.type = DAV1D_TASK_TYPE_FG_APPLY;
  500|    242|        pthread_mutex_lock(&ttd->lock);
  501|    242|        ttd->delayed_fg.exec = 1;
  502|       |        // fall-through
  503|  2.06k|    case DAV1D_TASK_TYPE_FG_APPLY:;
  ------------------
  |  Branch (503:5): [True: 1.81k, False: 242]
  ------------------
  504|  2.06k|        int row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
  505|  2.06k|        pthread_mutex_unlock(&ttd->lock);
  506|  2.06k|        int progmax = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE;
  ------------------
  |  |   37|  2.06k|#define FG_BLOCK_SIZE 32
  ------------------
                      int progmax = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE;
  ------------------
  |  |   37|  2.06k|#define FG_BLOCK_SIZE 32
  ------------------
  507|  23.0k|        while (row < progmax) {
  ------------------
  |  Branch (507:16): [True: 21.5k, False: 1.55k]
  ------------------
  508|  21.5k|            if (row + 1 < progmax)
  ------------------
  |  Branch (508:17): [True: 21.2k, False: 298]
  ------------------
  509|  21.2k|                pthread_cond_signal(&ttd->cond);
  510|    298|            else {
  511|    298|                pthread_mutex_lock(&ttd->lock);
  512|    298|                ttd->delayed_fg.exec = 0;
  513|    298|                pthread_mutex_unlock(&ttd->lock);
  514|    298|            }
  515|  21.5k|            switch (out->p.bpc) {
  516|      0|#if CONFIG_8BPC
  517|  5.89k|            case 8:
  ------------------
  |  Branch (517:13): [True: 5.89k, False: 15.6k]
  ------------------
  518|  5.89k|                dav1d_apply_grain_row_8bpc(&c->dsp[0].fg, out, in,
  519|  5.89k|                                           ttd->delayed_fg.scaling_8bpc,
  520|  5.89k|                                           ttd->delayed_fg.grain_lut_8bpc, row);
  521|  5.89k|                break;
  522|      0|#endif
  523|      0|#if CONFIG_16BPC
  524|  1.75k|            case 10:
  ------------------
  |  Branch (524:13): [True: 1.75k, False: 19.7k]
  ------------------
  525|  15.6k|            case 12:
  ------------------
  |  Branch (525:13): [True: 13.9k, False: 7.59k]
  ------------------
  526|  15.6k|                dav1d_apply_grain_row_16bpc(&c->dsp[off].fg, out, in,
  527|  15.6k|                                            ttd->delayed_fg.scaling_16bpc,
  528|  15.6k|                                            ttd->delayed_fg.grain_lut_16bpc, row);
  529|  15.6k|                break;
  530|      0|#endif
  531|      0|            default: abort();
  ------------------
  |  Branch (531:13): [True: 0, False: 21.5k]
  ------------------
  532|  21.5k|            }
  533|  21.0k|            row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
  534|  21.0k|            atomic_fetch_add(&ttd->delayed_fg.progress[1], 1);
  535|  21.0k|        }
  536|  1.55k|        pthread_mutex_lock(&ttd->lock);
  537|  1.55k|        ttd->delayed_fg.exec = 0;
  538|  1.55k|        int done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1;
  539|  1.55k|        progmax = atomic_load(&ttd->delayed_fg.progress[0]);
  540|       |        // signal for completion only once the last runner reaches this
  541|  1.55k|        if (done >= progmax) {
  ------------------
  |  Branch (541:13): [True: 242, False: 1.31k]
  ------------------
  542|    242|            ttd->delayed_fg.finished = 1;
  543|    242|            pthread_cond_signal(&ttd->delayed_fg.cond);
  544|    242|        }
  545|  1.55k|        break;
  546|      0|    default: abort();
  ------------------
  |  Branch (546:5): [True: 0, False: 2.06k]
  ------------------
  547|  2.06k|    }
  548|  2.06k|}
thread_task.c:merge_pending_frame:
  188|  3.03M|static inline int merge_pending_frame(Dav1dFrameContext *const f) {
  189|  3.03M|    int const merge = atomic_load(&f->task_thread.pending_tasks.merge);
  190|  3.03M|    if (merge) {
  ------------------
  |  Branch (190:9): [True: 152k, False: 2.88M]
  ------------------
  191|   152k|        pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
  192|   152k|        Dav1dTask *t = f->task_thread.pending_tasks.head;
  193|   152k|        f->task_thread.pending_tasks.head = NULL;
  194|   152k|        f->task_thread.pending_tasks.tail = NULL;
  195|   152k|        atomic_store(&f->task_thread.pending_tasks.merge, 0);
  196|   152k|        pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
  197|   338k|        while (t) {
  ------------------
  |  Branch (197:16): [True: 185k, False: 152k]
  ------------------
  198|   185k|            Dav1dTask *const tmp = t->next;
  199|   185k|            insert_task(f, t, 0);
  200|   185k|            t = tmp;
  201|   185k|        }
  202|   152k|    }
  203|  3.03M|    return merge;
  204|  3.03M|}
thread_task.c:check_tile:
  389|   509k|{
  390|   509k|    const int tp = t->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
  391|   509k|    const int tile_idx = (int)(t - f->task_thread.tile_tasks[tp]);
  392|   509k|    Dav1dTileState *const ts = &f->ts[tile_idx];
  393|   509k|    const int p1 = atomic_load(&ts->progress[tp]);
  394|   509k|    if (p1 < t->sby) return 1;
  ------------------
  |  Branch (394:9): [True: 124k, False: 384k]
  ------------------
  395|   384k|    int error = p1 == TILE_ERROR;
  ------------------
  |  |   36|   384k|#define TILE_ERROR (INT_MAX - 1)
  ------------------
  396|   384k|    error |= atomic_fetch_or(&f->task_thread.error, error);
  397|   384k|    if (!error && frame_mt && !tp) {
  ------------------
  |  Branch (397:9): [True: 144k, False: 240k]
  |  Branch (397:19): [True: 0, False: 144k]
  |  Branch (397:31): [True: 0, False: 0]
  ------------------
  398|      0|        const int p2 = atomic_load(&ts->progress[1]);
  399|      0|        if (p2 <= t->sby) return 1;
  ------------------
  |  Branch (399:13): [True: 0, False: 0]
  ------------------
  400|      0|        error = p2 == TILE_ERROR;
  ------------------
  |  |   36|      0|#define TILE_ERROR (INT_MAX - 1)
  ------------------
  401|      0|        error |= atomic_fetch_or(&f->task_thread.error, error);
  402|      0|    }
  403|   384k|    if (!error && frame_mt && !IS_KEY_OR_INTRA(f->frame_hdr)) {
  ------------------
  |  |   43|      0|    (!IS_INTER_OR_SWITCH(frame_header))
  |  |  ------------------
  |  |  |  |   36|      0|    ((frame_header)->frame_type & 1)
  |  |  ------------------
  ------------------
  |  Branch (403:9): [True: 144k, False: 240k]
  |  Branch (403:19): [True: 0, False: 144k]
  |  Branch (403:31): [True: 0, False: 0]
  ------------------
  404|       |        // check reference state
  405|      0|        const Dav1dThreadPicture *p = &f->sr_cur;
  406|      0|        const int ss_ver = p->p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
  407|      0|        const unsigned p_b = (t->sby + 1) << (f->sb_shift + 2);
  408|      0|        const int tile_sby = t->sby - (ts->tiling.row_start >> f->sb_shift);
  409|      0|        const int (*const lowest_px)[2] = ts->lowest_pixel[tile_sby];
  410|      0|        for (int n = t->deps_skip; n < 7; n++, t->deps_skip++) {
  ------------------
  |  Branch (410:36): [True: 0, False: 0]
  ------------------
  411|      0|            unsigned lowest;
  412|      0|            if (tp) {
  ------------------
  |  Branch (412:17): [True: 0, False: 0]
  ------------------
  413|       |                // if temporal mv refs are disabled, we only need this
  414|       |                // for the primary ref; if segmentation is disabled, we
  415|       |                // don't even need that
  416|      0|                lowest = p_b;
  417|      0|            } else {
  418|       |                // +8 is postfilter-induced delay
  419|      0|                const int y = lowest_px[n][0] == INT_MIN ? INT_MIN :
  ------------------
  |  Branch (419:31): [True: 0, False: 0]
  ------------------
  420|      0|                              lowest_px[n][0] + 8;
  421|      0|                const int uv = lowest_px[n][1] == INT_MIN ? INT_MIN :
  ------------------
  |  Branch (421:32): [True: 0, False: 0]
  ------------------
  422|      0|                               lowest_px[n][1] * (1 << ss_ver) + 8;
  423|      0|                const int max = imax(y, uv);
  424|      0|                if (max == INT_MIN) continue;
  ------------------
  |  Branch (424:21): [True: 0, False: 0]
  ------------------
  425|      0|                lowest = iclip(max, 1, f->refp[n].p.p.h);
  426|      0|            }
  427|      0|            const unsigned p3 = atomic_load(&f->refp[n].progress[!tp]);
  428|      0|            if (p3 < lowest) return 1;
  ------------------
  |  Branch (428:17): [True: 0, False: 0]
  ------------------
  429|      0|            atomic_fetch_or(&f->task_thread.error, p3 == FRAME_ERROR);
  430|      0|        }
  431|      0|    }
  432|   384k|    return 0;
  433|   384k|}
thread_task.c:reset_task_cur:
   50|  2.75M|{
   51|  2.75M|    const unsigned first = atomic_load(&ttd->first);
   52|  2.75M|    unsigned reset_frame_idx = atomic_exchange(&ttd->reset_task_cur, UINT_MAX);
   53|  2.75M|    if (reset_frame_idx < first) {
  ------------------
  |  Branch (53:9): [True: 0, False: 2.75M]
  ------------------
   54|      0|        if (frame_idx == UINT_MAX) return 0;
  ------------------
  |  Branch (54:13): [True: 0, False: 0]
  ------------------
   55|      0|        reset_frame_idx = UINT_MAX;
   56|      0|    }
   57|  2.75M|    if (!ttd->cur && c->fc[first].task_thread.task_cur_prev == NULL)
  ------------------
  |  Branch (57:9): [True: 915k, False: 1.83M]
  |  Branch (57:22): [True: 847k, False: 68.2k]
  ------------------
   58|   847k|        return 0;
   59|  1.90M|    if (reset_frame_idx != UINT_MAX) {
  ------------------
  |  Branch (59:9): [True: 38.5k, False: 1.86M]
  ------------------
   60|  38.5k|        if (frame_idx == UINT_MAX) {
  ------------------
  |  Branch (60:13): [True: 12.5k, False: 25.9k]
  ------------------
   61|  12.5k|            if (reset_frame_idx > first + ttd->cur)
  ------------------
  |  Branch (61:17): [True: 0, False: 12.5k]
  ------------------
   62|      0|                return 0;
   63|  12.5k|            ttd->cur = reset_frame_idx - first;
   64|  12.5k|            goto cur_found;
   65|  12.5k|        }
   66|  1.86M|    } else if (frame_idx == UINT_MAX)
  ------------------
  |  Branch (66:16): [True: 1.62M, False: 240k]
  ------------------
   67|  1.62M|        return 0;
   68|   266k|    if (frame_idx < first) frame_idx += c->n_fc;
  ------------------
  |  Branch (68:9): [True: 0, False: 266k]
  ------------------
   69|   266k|    const unsigned min_frame_idx = umin(reset_frame_idx, frame_idx);
   70|   266k|    const unsigned cur_frame_idx = first + ttd->cur;
   71|   266k|    if (ttd->cur < c->n_fc && cur_frame_idx < min_frame_idx)
  ------------------
  |  Branch (71:9): [True: 57.4k, False: 208k]
  |  Branch (71:31): [True: 0, False: 57.4k]
  ------------------
   72|      0|        return 0;
   73|   322k|    for (ttd->cur = min_frame_idx - first; ttd->cur < c->n_fc; ttd->cur++)
  ------------------
  |  Branch (73:44): [True: 266k, False: 56.4k]
  ------------------
   74|   266k|        if (c->fc[(first + ttd->cur) % c->n_fc].task_thread.task_head)
  ------------------
  |  Branch (74:13): [True: 209k, False: 56.4k]
  ------------------
   75|   209k|            break;
   76|   278k|cur_found:
   77|   500k|    for (unsigned i = ttd->cur; i < c->n_fc; i++)
  ------------------
  |  Branch (77:33): [True: 222k, False: 278k]
  ------------------
   78|   222k|        c->fc[(first + i) % c->n_fc].task_thread.task_cur_prev = NULL;
   79|   278k|    return 1;
   80|   266k|}
thread_task.c:add_pending:
  176|   135k|static inline void add_pending(Dav1dFrameContext *const f, Dav1dTask *const t) {
  177|   135k|    pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
  178|   135k|    t->next = NULL;
  179|   135k|    if (!f->task_thread.pending_tasks.head)
  ------------------
  |  Branch (179:9): [True: 132k, False: 2.67k]
  ------------------
  180|   132k|        f->task_thread.pending_tasks.head = t;
  181|  2.67k|    else
  182|  2.67k|        f->task_thread.pending_tasks.tail->next = t;
  183|   135k|    f->task_thread.pending_tasks.tail = t;
  184|       |    atomic_store(&f->task_thread.pending_tasks.merge, 1);
  185|   135k|    pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
  186|   135k|}
thread_task.c:reset_task_cur_async:
   84|   659k|{
   85|   659k|    const unsigned first = atomic_load(&ttd->first);
   86|   659k|    if (frame_idx < first) frame_idx += n_frames;
  ------------------
  |  Branch (86:9): [True: 0, False: 659k]
  ------------------
   87|   659k|    unsigned last_idx = frame_idx;
   88|   659k|    do {
   89|   659k|        frame_idx = last_idx;
   90|   659k|        last_idx = atomic_exchange(&ttd->reset_task_cur, frame_idx);
   91|   659k|    } while (last_idx < frame_idx);
  ------------------
  |  Branch (91:14): [True: 0, False: 659k]
  ------------------
   92|   659k|    if (frame_idx == first && atomic_load(&ttd->first) != first) {
  ------------------
  |  Branch (92:9): [True: 659k, False: 18.4E]
  |  Branch (92:31): [True: 0, False: 659k]
  ------------------
   93|      0|        unsigned expected = frame_idx;
   94|       |        atomic_compare_exchange_strong(&ttd->reset_task_cur, &expected, UINT_MAX);
   95|      0|    }
   96|   659k|}
thread_task.c:ensure_progress:
  372|   193k|{
  373|       |    // deblock_rows (non-LR portion) depends on deblock of previous sbrow,
  374|       |    // so ensure that completed. if not, re-add to task-queue; else, fall-through
  375|   193k|    int p1 = atomic_load(state);
  376|   193k|    if (p1 < t->sby) {
  ------------------
  |  Branch (376:9): [True: 964, False: 192k]
  ------------------
  377|    964|        t->type = type;
  378|    964|        t->recon_progress = t->deblock_progress = 0;
  379|    964|        *target = t->sby;
  380|    964|        add_pending(f, t);
  381|    964|        pthread_mutex_lock(&ttd->lock);
  382|    964|        return 1;
  383|    964|    }
  384|   192k|    return 0;
  385|   193k|}
thread_task.c:get_frame_progress:
  437|   377k|{
  438|   377k|    unsigned frame_prog = c->n_fc > 1 ? atomic_load(&f->sr_cur.progress[1]) : 0;
  ------------------
  |  Branch (438:27): [True: 0, False: 377k]
  ------------------
  439|   377k|    if (frame_prog >= FRAME_ERROR)
  ------------------
  |  |   35|   377k|#define FRAME_ERROR (UINT_MAX - 1)
  ------------------
  |  Branch (439:9): [True: 0, False: 377k]
  ------------------
  440|      0|        return f->sbh - 1;
  441|   377k|    int idx = frame_prog >> (f->sb_shift + 7);
  442|   377k|    int prog;
  443|  1.57M|    do {
  444|  1.57M|        atomic_uint *state = &f->frame_thread.frame_progress[idx];
  445|  1.57M|        const unsigned val = ~atomic_load(state);
  446|  1.57M|        prog = val ? ctz(val) : 32;
  ------------------
  |  Branch (446:16): [True: 376k, False: 1.19M]
  ------------------
  447|  1.57M|        if (prog != 32) break;
  ------------------
  |  Branch (447:13): [True: 376k, False: 1.19M]
  ------------------
  448|  1.19M|        prog = 0;
  449|  1.19M|    } while (++idx < f->frame_thread.prog_sz);
  ------------------
  |  Branch (449:14): [True: 1.19M, False: 981]
  ------------------
  450|   377k|    return ((idx << 5) | prog) - 1;
  451|   377k|}

dav1d_get_shear_params:
   80|  8.18k|int dav1d_get_shear_params(Dav1dWarpedMotionParams *const wm) {
   81|  8.18k|    const int32_t *const mat = wm->matrix;
   82|       |
   83|  8.18k|    if (mat[2] <= 0) return 1;
  ------------------
  |  Branch (83:9): [True: 0, False: 8.18k]
  ------------------
   84|       |
   85|  8.18k|    wm->u.p.alpha = iclip_wmp(mat[2] - 0x10000);
   86|  8.18k|    wm->u.p.beta = iclip_wmp(mat[3]);
   87|       |
   88|  8.18k|    int shift;
   89|  8.18k|    const int y = apply_sign(resolve_divisor_32(abs(mat[2]), &shift), mat[2]);
   90|  8.18k|    const int64_t v1 = ((int64_t) mat[4] * 0x10000) * y;
   91|  8.18k|    const int rnd = (1 << shift) >> 1;
   92|  8.18k|    wm->u.p.gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1));
   93|  8.18k|    const int64_t v2 = ((int64_t) mat[3] * mat[4]) * y;
   94|  8.18k|    wm->u.p.delta = iclip_wmp(mat[5] -
   95|  8.18k|                          apply_sign64((int) ((llabs(v2) + rnd) >> shift), v2) -
   96|  8.18k|                          0x10000);
   97|       |
   98|  8.18k|    return (4 * abs(wm->u.p.alpha) + 7 * abs(wm->u.p.beta) >= 0x10000) ||
  ------------------
  |  Branch (98:12): [True: 257, False: 7.92k]
  ------------------
   99|  7.92k|           (4 * abs(wm->u.p.gamma) + 4 * abs(wm->u.p.delta) >= 0x10000);
  ------------------
  |  Branch (99:12): [True: 75, False: 7.84k]
  ------------------
  100|  8.18k|}
dav1d_find_affine_int:
  153|  5.48k|{
  154|  5.48k|    int32_t *const mat = wm->matrix;
  155|  5.48k|    int a[2][2] = { { 0, 0 }, { 0, 0 } };
  156|  5.48k|    int bx[2] = { 0, 0 };
  157|  5.48k|    int by[2] = { 0, 0 };
  158|  5.48k|    const int rsuy = 2 * bh4 - 1;
  159|  5.48k|    const int rsux = 2 * bw4 - 1;
  160|  5.48k|    const int suy = rsuy * 8;
  161|  5.48k|    const int sux = rsux * 8;
  162|  5.48k|    const int duy = suy + mv.y;
  163|  5.48k|    const int dux = sux + mv.x;
  164|  5.48k|    const int isuy = by4 * 4 + rsuy;
  165|  5.48k|    const int isux = bx4 * 4 + rsux;
  166|       |
  167|  16.0k|    for (int i = 0; i < np; i++) {
  ------------------
  |  Branch (167:21): [True: 10.5k, False: 5.48k]
  ------------------
  168|  10.5k|        const int dx = pts[i][1][0] - dux;
  169|  10.5k|        const int dy = pts[i][1][1] - duy;
  170|  10.5k|        const int sx = pts[i][0][0] - sux;
  171|  10.5k|        const int sy = pts[i][0][1] - suy;
  172|  10.5k|        if (abs(sx - dx) < 256 && abs(sy - dy) < 256) {
  ------------------
  |  Branch (172:13): [True: 10.3k, False: 175]
  |  Branch (172:35): [True: 10.3k, False: 48]
  ------------------
  173|  10.3k|            a[0][0] += ((sx * sx) >> 2) + sx * 2 + 8;
  174|  10.3k|            a[0][1] += ((sx * sy) >> 2) + sx + sy + 4;
  175|  10.3k|            a[1][1] += ((sy * sy) >> 2) + sy * 2 + 8;
  176|  10.3k|            bx[0] += ((sx * dx) >> 2) + sx + dx + 8;
  177|  10.3k|            bx[1] += ((sy * dx) >> 2) + sy + dx + 4;
  178|  10.3k|            by[0] += ((sx * dy) >> 2) + sx + dy + 4;
  179|  10.3k|            by[1] += ((sy * dy) >> 2) + sy + dy + 8;
  180|  10.3k|        }
  181|  10.5k|    }
  182|       |
  183|       |    // compute determinant of a
  184|  5.48k|    const int64_t det = (int64_t) a[0][0] * a[1][1] - (int64_t) a[0][1] * a[0][1];
  185|  5.48k|    if (det == 0) return 1;
  ------------------
  |  Branch (185:9): [True: 222, False: 5.26k]
  ------------------
  186|  5.26k|    int shift, idet = apply_sign64(resolve_divisor_64(llabs(det), &shift), det);
  187|  5.26k|    shift -= 16;
  188|  5.26k|    if (shift < 0) {
  ------------------
  |  Branch (188:9): [True: 0, False: 5.26k]
  ------------------
  189|      0|        idet <<= -shift;
  190|      0|        shift = 0;
  191|      0|    }
  192|       |
  193|       |    // solve the least-squares
  194|  5.26k|    mat[2] = get_mult_shift_diag((int64_t) a[1][1] * bx[0] -
  195|  5.26k|                                 (int64_t) a[0][1] * bx[1], idet, shift);
  196|  5.26k|    mat[3] = get_mult_shift_ndiag((int64_t) a[0][0] * bx[1] -
  197|  5.26k|                                  (int64_t) a[0][1] * bx[0], idet, shift);
  198|  5.26k|    mat[4] = get_mult_shift_ndiag((int64_t) a[1][1] * by[0] -
  199|  5.26k|                                  (int64_t) a[0][1] * by[1], idet, shift);
  200|  5.26k|    mat[5] = get_mult_shift_diag((int64_t) a[0][0] * by[1] -
  201|  5.26k|                                 (int64_t) a[0][1] * by[0], idet, shift);
  202|       |
  203|  5.26k|    mat[0] = iclip(mv.x * 0x2000 - (isux * (mat[2] - 0x10000) + isuy * mat[3]),
  204|  5.26k|                   -0x800000, 0x7fffff);
  205|  5.26k|    mat[1] = iclip(mv.y * 0x2000 - (isux * mat[4] + isuy * (mat[5] - 0x10000)),
  206|  5.26k|                   -0x800000, 0x7fffff);
  207|       |
  208|  5.26k|    return 0;
  209|  5.48k|}
warpmv.c:iclip_wmp:
   63|  32.7k|static inline int iclip_wmp(const int v) {
   64|  32.7k|    const int cv = iclip(v, INT16_MIN, INT16_MAX);
   65|       |
   66|  32.7k|    return apply_sign((abs(cv) + 32) >> 6, cv) * (1 << 6);
   67|  32.7k|}
warpmv.c:resolve_divisor_32:
   69|  8.17k|static inline int resolve_divisor_32(const unsigned d, int *const shift) {
   70|  8.17k|    *shift = ulog2(d);
   71|  8.17k|    const int e = d - (1 << *shift);
   72|  8.18k|    const int f = *shift > 8 ? (e + (1 << (*shift - 9))) >> (*shift - 8) :
  ------------------
  |  Branch (72:19): [True: 8.18k, False: 18.4E]
  ------------------
   73|  18.4E|                               e << (8 - *shift);
   74|  8.17k|    assert(f <= 256);
  ------------------
  |  |  140|  8.17k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 8.17k]
  |  |  |  Branch (140:68): [Folded, False: 8.17k]
  |  |  ------------------
  ------------------
   75|  8.17k|    *shift += 14;
   76|       |    // Use f as lookup into the precomputed table of multipliers
   77|  8.17k|    return div_lut[f];
   78|  8.17k|}
warpmv.c:resolve_divisor_64:
  102|  5.26k|static int resolve_divisor_64(const uint64_t d, int *const shift) {
  103|  5.26k|    *shift = u64log2(d);
  104|  5.26k|    const int64_t e = d - (1LL << *shift);
  105|  5.26k|    const int64_t f = *shift > 8 ? (e + (1LL << (*shift - 9))) >> (*shift - 8) :
  ------------------
  |  Branch (105:23): [True: 5.26k, False: 1]
  ------------------
  106|  5.26k|                                   e << (8 - *shift);
  107|  5.26k|    assert(f <= 256);
  ------------------
  |  |  140|  5.26k|#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
  |  |  ------------------
  |  |  |  Branch (140:28): [True: 0, False: 5.26k]
  |  |  |  Branch (140:68): [Folded, False: 5.26k]
  |  |  ------------------
  ------------------
  108|  5.26k|    *shift += 14;
  109|       |    // Use f as lookup into the precomputed table of multipliers
  110|  5.26k|    return div_lut[f];
  111|  5.26k|}
warpmv.c:get_mult_shift_diag:
  125|  10.5k|{
  126|  10.5k|    const int64_t v1 = px * idet;
  127|  10.5k|    const int v2 = apply_sign64((int) ((llabs(v1) +
  128|  10.5k|                                        ((1LL << shift) >> 1)) >> shift),
  129|  10.5k|                                v1);
  130|  10.5k|    return iclip(v2, 0xe001, 0x11fff);
  131|  10.5k|}
warpmv.c:get_mult_shift_ndiag:
  115|  10.5k|{
  116|  10.5k|    const int64_t v1 = px * idet;
  117|  10.5k|    const int v2 = apply_sign64((int) ((llabs(v1) +
  118|  10.5k|                                        ((1LL << shift) >> 1)) >> shift),
  119|  10.5k|                                v1);
  120|  10.5k|    return iclip(v2, -0x1fff, 0x1fff);
  121|  10.5k|}

dav1d_init_ii_wedge_masks:
  207|      1|COLD void dav1d_init_ii_wedge_masks(void) {
  208|       |    // This function is guaranteed to be called only once
  209|       |
  210|      1|    enum WedgeMasterLineType {
  211|      1|        WEDGE_MASTER_LINE_ODD,
  212|      1|        WEDGE_MASTER_LINE_EVEN,
  213|      1|        WEDGE_MASTER_LINE_VERT,
  214|      1|        N_WEDGE_MASTER_LINES,
  215|      1|    };
  216|      1|    static const uint8_t wedge_master_border[N_WEDGE_MASTER_LINES][8] = {
  217|      1|        [WEDGE_MASTER_LINE_ODD]  = {  1,  2,  6, 18, 37, 53, 60, 63 },
  218|      1|        [WEDGE_MASTER_LINE_EVEN] = {  1,  4, 11, 27, 46, 58, 62, 63 },
  219|      1|        [WEDGE_MASTER_LINE_VERT] = {  0,  2,  7, 21, 43, 57, 62, 64 },
  220|      1|    };
  221|      1|    uint8_t master[6][64 * 64];
  222|       |
  223|       |    // create master templates
  224|     65|    for (int y = 0, off = 0; y < 64; y++, off += 64)
  ------------------
  |  Branch (224:30): [True: 64, False: 1]
  ------------------
  225|     64|        insert_border(&master[WEDGE_VERTICAL][off],
  226|     64|                      wedge_master_border[WEDGE_MASTER_LINE_VERT], 32);
  227|     33|    for (int y = 0, off = 0, ctr = 48; y < 64; y += 2, off += 128, ctr--)
  ------------------
  |  Branch (227:40): [True: 32, False: 1]
  ------------------
  228|     32|    {
  229|     32|        insert_border(&master[WEDGE_OBLIQUE63][off],
  230|     32|                      wedge_master_border[WEDGE_MASTER_LINE_EVEN], ctr);
  231|     32|        insert_border(&master[WEDGE_OBLIQUE63][off + 64],
  232|     32|                      wedge_master_border[WEDGE_MASTER_LINE_ODD], ctr - 1);
  233|     32|    }
  234|       |
  235|      1|    transpose(master[WEDGE_OBLIQUE27], master[WEDGE_OBLIQUE63]);
  236|      1|    transpose(master[WEDGE_HORIZONTAL], master[WEDGE_VERTICAL]);
  237|      1|    hflip(master[WEDGE_OBLIQUE117], master[WEDGE_OBLIQUE63]);
  238|      1|    hflip(master[WEDGE_OBLIQUE153], master[WEDGE_OBLIQUE27]);
  239|       |
  240|      1|#define fill(w, h, sz_422, sz_420, hvsw, signs) \
  241|      1|    fill2d_16x2(w, h, BS_##w##x##h - BS_32x32, \
  242|      1|                master, wedge_codebook_16_##hvsw, \
  243|      1|                dav1d_masks.wedge_444_##w##x##h, \
  244|      1|                dav1d_masks.wedge_422_##sz_422, \
  245|      1|                dav1d_masks.wedge_420_##sz_420, signs)
  246|       |
  247|      1|    fill(32, 32, 16x32, 16x16, heqw, 0x7bfb);
  ------------------
  |  |  241|      1|    fill2d_16x2(w, h, BS_##w##x##h - BS_32x32, \
  |  |  242|      1|                master, wedge_codebook_16_##hvsw, \
  |  |  243|      1|                dav1d_masks.wedge_444_##w##x##h, \
  |  |  244|      1|                dav1d_masks.wedge_422_##sz_422, \
  |  |  245|      1|                dav1d_masks.wedge_420_##sz_420, signs)
  ------------------
  248|      1|    fill(32, 16, 16x16, 16x8,  hltw, 0x7beb);
  ------------------
  |  |  241|      1|    fill2d_16x2(w, h, BS_##w##x##h - BS_32x32, \
  |  |  242|      1|                master, wedge_codebook_16_##hvsw, \
  |  |  243|      1|                dav1d_masks.wedge_444_##w##x##h, \
  |  |  244|      1|                dav1d_masks.wedge_422_##sz_422, \
  |  |  245|      1|                dav1d_masks.wedge_420_##sz_420, signs)
  ------------------
  249|      1|    fill(32,  8, 16x8,  16x4,  hltw, 0x6beb);
  ------------------
  |  |  241|      1|    fill2d_16x2(w, h, BS_##w##x##h - BS_32x32, \
  |  |  242|      1|                master, wedge_codebook_16_##hvsw, \
  |  |  243|      1|                dav1d_masks.wedge_444_##w##x##h, \
  |  |  244|      1|                dav1d_masks.wedge_422_##sz_422, \
  |  |  245|      1|                dav1d_masks.wedge_420_##sz_420, signs)
  ------------------
  250|      1|    fill(16, 32,  8x32,  8x16, hgtw, 0x7beb);
  ------------------
  |  |  241|      1|    fill2d_16x2(w, h, BS_##w##x##h - BS_32x32, \
  |  |  242|      1|                master, wedge_codebook_16_##hvsw, \
  |  |  243|      1|                dav1d_masks.wedge_444_##w##x##h, \
  |  |  244|      1|                dav1d_masks.wedge_422_##sz_422, \
  |  |  245|      1|                dav1d_masks.wedge_420_##sz_420, signs)
  ------------------
  251|      1|    fill(16, 16,  8x16,  8x8,  heqw, 0x7bfb);
  ------------------
  |  |  241|      1|    fill2d_16x2(w, h, BS_##w##x##h - BS_32x32, \
  |  |  242|      1|                master, wedge_codebook_16_##hvsw, \
  |  |  243|      1|                dav1d_masks.wedge_444_##w##x##h, \
  |  |  244|      1|                dav1d_masks.wedge_422_##sz_422, \
  |  |  245|      1|                dav1d_masks.wedge_420_##sz_420, signs)
  ------------------
  252|      1|    fill(16,  8,  8x8,   8x4,  hltw, 0x7beb);
  ------------------
  |  |  241|      1|    fill2d_16x2(w, h, BS_##w##x##h - BS_32x32, \
  |  |  242|      1|                master, wedge_codebook_16_##hvsw, \
  |  |  243|      1|                dav1d_masks.wedge_444_##w##x##h, \
  |  |  244|      1|                dav1d_masks.wedge_422_##sz_422, \
  |  |  245|      1|                dav1d_masks.wedge_420_##sz_420, signs)
  ------------------
  253|      1|    fill( 8, 32,  4x32,  4x16, hgtw, 0x7aeb);
  ------------------
  |  |  241|      1|    fill2d_16x2(w, h, BS_##w##x##h - BS_32x32, \
  |  |  242|      1|                master, wedge_codebook_16_##hvsw, \
  |  |  243|      1|                dav1d_masks.wedge_444_##w##x##h, \
  |  |  244|      1|                dav1d_masks.wedge_422_##sz_422, \
  |  |  245|      1|                dav1d_masks.wedge_420_##sz_420, signs)
  ------------------
  254|      1|    fill( 8, 16,  4x16,  4x8,  hgtw, 0x7beb);
  ------------------
  |  |  241|      1|    fill2d_16x2(w, h, BS_##w##x##h - BS_32x32, \
  |  |  242|      1|                master, wedge_codebook_16_##hvsw, \
  |  |  243|      1|                dav1d_masks.wedge_444_##w##x##h, \
  |  |  244|      1|                dav1d_masks.wedge_422_##sz_422, \
  |  |  245|      1|                dav1d_masks.wedge_420_##sz_420, signs)
  ------------------
  255|      1|    fill( 8,  8,  4x8,   4x4,  heqw, 0x7bfb);
  ------------------
  |  |  241|      1|    fill2d_16x2(w, h, BS_##w##x##h - BS_32x32, \
  |  |  242|      1|                master, wedge_codebook_16_##hvsw, \
  |  |  243|      1|                dav1d_masks.wedge_444_##w##x##h, \
  |  |  244|      1|                dav1d_masks.wedge_422_##sz_422, \
  |  |  245|      1|                dav1d_masks.wedge_420_##sz_420, signs)
  ------------------
  256|      1|#undef fill
  257|       |
  258|      1|    memset(dav1d_masks.ii_dc, 32, 32 * 32);
  259|      4|    for (int c = 0; c < 3; c++) {
  ------------------
  |  Branch (259:21): [True: 3, False: 1]
  ------------------
  260|      3|        dav1d_masks.offsets[c][BS_32x32-BS_32x32].ii[II_DC_PRED] =
  261|      3|        dav1d_masks.offsets[c][BS_32x16-BS_32x32].ii[II_DC_PRED] =
  262|      3|        dav1d_masks.offsets[c][BS_16x32-BS_32x32].ii[II_DC_PRED] =
  263|      3|        dav1d_masks.offsets[c][BS_16x16-BS_32x32].ii[II_DC_PRED] =
  264|      3|        dav1d_masks.offsets[c][BS_16x8 -BS_32x32].ii[II_DC_PRED] =
  265|      3|        dav1d_masks.offsets[c][BS_8x16 -BS_32x32].ii[II_DC_PRED] =
  266|      3|        dav1d_masks.offsets[c][BS_8x8  -BS_32x32].ii[II_DC_PRED] =
  267|      3|            MASK_OFFSET(dav1d_masks.ii_dc);
  ------------------
  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  ------------------
  268|      3|    }
  269|       |
  270|      1|#define BUILD_NONDC_II_MASKS(w, h, step) \
  271|      1|    build_nondc_ii_masks(dav1d_masks.ii_nondc_##w##x##h, w, h, step)
  272|       |
  273|      1|#define ASSIGN_NONDC_II_OFFSET(bs, w444, h444, w422, h422, w420, h420) \
  274|      1|    dav1d_masks.offsets[0][bs-BS_32x32].ii[p + 1] = \
  275|      1|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w444##x##h444[p*w444*h444]); \
  276|      1|    dav1d_masks.offsets[1][bs-BS_32x32].ii[p + 1] = \
  277|      1|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w422##x##h422[p*w422*h422]); \
  278|      1|    dav1d_masks.offsets[2][bs-BS_32x32].ii[p + 1] = \
  279|      1|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w420##x##h420[p*w420*h420])
  280|       |
  281|      1|    BUILD_NONDC_II_MASKS(32, 32, 1);
  ------------------
  |  |  271|      1|    build_nondc_ii_masks(dav1d_masks.ii_nondc_##w##x##h, w, h, step)
  ------------------
  282|      1|    BUILD_NONDC_II_MASKS(16, 32, 1);
  ------------------
  |  |  271|      1|    build_nondc_ii_masks(dav1d_masks.ii_nondc_##w##x##h, w, h, step)
  ------------------
  283|      1|    BUILD_NONDC_II_MASKS(16, 16, 2);
  ------------------
  |  |  271|      1|    build_nondc_ii_masks(dav1d_masks.ii_nondc_##w##x##h, w, h, step)
  ------------------
  284|      1|    BUILD_NONDC_II_MASKS( 8, 32, 1);
  ------------------
  |  |  271|      1|    build_nondc_ii_masks(dav1d_masks.ii_nondc_##w##x##h, w, h, step)
  ------------------
  285|      1|    BUILD_NONDC_II_MASKS( 8, 16, 2);
  ------------------
  |  |  271|      1|    build_nondc_ii_masks(dav1d_masks.ii_nondc_##w##x##h, w, h, step)
  ------------------
  286|      1|    BUILD_NONDC_II_MASKS( 8,  8, 4);
  ------------------
  |  |  271|      1|    build_nondc_ii_masks(dav1d_masks.ii_nondc_##w##x##h, w, h, step)
  ------------------
  287|      1|    BUILD_NONDC_II_MASKS( 4, 16, 2);
  ------------------
  |  |  271|      1|    build_nondc_ii_masks(dav1d_masks.ii_nondc_##w##x##h, w, h, step)
  ------------------
  288|      1|    BUILD_NONDC_II_MASKS( 4,  8, 4);
  ------------------
  |  |  271|      1|    build_nondc_ii_masks(dav1d_masks.ii_nondc_##w##x##h, w, h, step)
  ------------------
  289|      1|    BUILD_NONDC_II_MASKS( 4,  4, 8);
  ------------------
  |  |  271|      1|    build_nondc_ii_masks(dav1d_masks.ii_nondc_##w##x##h, w, h, step)
  ------------------
  290|      4|    for (int p = 0; p < 3; p++) {
  ------------------
  |  Branch (290:21): [True: 3, False: 1]
  ------------------
  291|      3|        ASSIGN_NONDC_II_OFFSET(BS_32x32, 32, 32, 16, 32, 16, 16);
  ------------------
  |  |  274|      3|    dav1d_masks.offsets[0][bs-BS_32x32].ii[p + 1] = \
  |  |  275|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w444##x##h444[p*w444*h444]); \
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  |  |  276|      3|    dav1d_masks.offsets[1][bs-BS_32x32].ii[p + 1] = \
  |  |  277|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w422##x##h422[p*w422*h422]); \
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  |  |  278|      3|    dav1d_masks.offsets[2][bs-BS_32x32].ii[p + 1] = \
  |  |  279|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w420##x##h420[p*w420*h420])
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  ------------------
  292|      3|        ASSIGN_NONDC_II_OFFSET(BS_32x16, 32, 32, 16, 16, 16, 16);
  ------------------
  |  |  274|      3|    dav1d_masks.offsets[0][bs-BS_32x32].ii[p + 1] = \
  |  |  275|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w444##x##h444[p*w444*h444]); \
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  |  |  276|      3|    dav1d_masks.offsets[1][bs-BS_32x32].ii[p + 1] = \
  |  |  277|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w422##x##h422[p*w422*h422]); \
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  |  |  278|      3|    dav1d_masks.offsets[2][bs-BS_32x32].ii[p + 1] = \
  |  |  279|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w420##x##h420[p*w420*h420])
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  ------------------
  293|      3|        ASSIGN_NONDC_II_OFFSET(BS_16x32, 16, 32,  8, 32,  8, 16);
  ------------------
  |  |  274|      3|    dav1d_masks.offsets[0][bs-BS_32x32].ii[p + 1] = \
  |  |  275|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w444##x##h444[p*w444*h444]); \
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  |  |  276|      3|    dav1d_masks.offsets[1][bs-BS_32x32].ii[p + 1] = \
  |  |  277|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w422##x##h422[p*w422*h422]); \
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  |  |  278|      3|    dav1d_masks.offsets[2][bs-BS_32x32].ii[p + 1] = \
  |  |  279|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w420##x##h420[p*w420*h420])
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  ------------------
  294|      3|        ASSIGN_NONDC_II_OFFSET(BS_16x16, 16, 16,  8, 16,  8,  8);
  ------------------
  |  |  274|      3|    dav1d_masks.offsets[0][bs-BS_32x32].ii[p + 1] = \
  |  |  275|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w444##x##h444[p*w444*h444]); \
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  |  |  276|      3|    dav1d_masks.offsets[1][bs-BS_32x32].ii[p + 1] = \
  |  |  277|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w422##x##h422[p*w422*h422]); \
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  |  |  278|      3|    dav1d_masks.offsets[2][bs-BS_32x32].ii[p + 1] = \
  |  |  279|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w420##x##h420[p*w420*h420])
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  ------------------
  295|      3|        ASSIGN_NONDC_II_OFFSET(BS_16x8,  16, 16,  8,  8,  8,  8);
  ------------------
  |  |  274|      3|    dav1d_masks.offsets[0][bs-BS_32x32].ii[p + 1] = \
  |  |  275|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w444##x##h444[p*w444*h444]); \
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  |  |  276|      3|    dav1d_masks.offsets[1][bs-BS_32x32].ii[p + 1] = \
  |  |  277|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w422##x##h422[p*w422*h422]); \
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  |  |  278|      3|    dav1d_masks.offsets[2][bs-BS_32x32].ii[p + 1] = \
  |  |  279|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w420##x##h420[p*w420*h420])
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  ------------------
  296|      3|        ASSIGN_NONDC_II_OFFSET(BS_8x16,   8, 16,  4, 16,  4,  8);
  ------------------
  |  |  274|      3|    dav1d_masks.offsets[0][bs-BS_32x32].ii[p + 1] = \
  |  |  275|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w444##x##h444[p*w444*h444]); \
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  |  |  276|      3|    dav1d_masks.offsets[1][bs-BS_32x32].ii[p + 1] = \
  |  |  277|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w422##x##h422[p*w422*h422]); \
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  |  |  278|      3|    dav1d_masks.offsets[2][bs-BS_32x32].ii[p + 1] = \
  |  |  279|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w420##x##h420[p*w420*h420])
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  ------------------
  297|      3|        ASSIGN_NONDC_II_OFFSET(BS_8x8,    8,  8,  4,  8,  4,  4);
  ------------------
  |  |  274|      3|    dav1d_masks.offsets[0][bs-BS_32x32].ii[p + 1] = \
  |  |  275|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w444##x##h444[p*w444*h444]); \
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  |  |  276|      3|    dav1d_masks.offsets[1][bs-BS_32x32].ii[p + 1] = \
  |  |  277|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w422##x##h422[p*w422*h422]); \
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  |  |  278|      3|    dav1d_masks.offsets[2][bs-BS_32x32].ii[p + 1] = \
  |  |  279|      3|        MASK_OFFSET(&dav1d_masks.ii_nondc_##w420##x##h420[p*w420*h420])
  |  |  ------------------
  |  |  |  |  129|      3|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  |  |  ------------------
  ------------------
  298|      3|    }
  299|      1|}
wedge.c:insert_border:
   90|    128|{
   91|    128|    if (ctr > 4) memset(dst, 0, ctr - 4);
  ------------------
  |  Branch (91:9): [True: 128, False: 0]
  ------------------
   92|    128|    memcpy(dst + imax(ctr, 4) - 4, src + imax(4 - ctr, 0), imin(64 - ctr, 8));
   93|    128|    if (ctr < 64 - 4)
  ------------------
  |  Branch (93:9): [True: 128, False: 0]
  ------------------
   94|    128|        memset(dst + ctr + 4, 64, 64 - 4 - ctr);
   95|    128|}
wedge.c:transpose:
   97|      2|static void transpose(uint8_t *const dst, const uint8_t *const src) {
   98|    130|    for (int y = 0, y_off = 0; y < 64; y++, y_off += 64)
  ------------------
  |  Branch (98:32): [True: 128, False: 2]
  ------------------
   99|  8.32k|        for (int x = 0, x_off = 0; x < 64; x++, x_off += 64)
  ------------------
  |  Branch (99:36): [True: 8.19k, False: 128]
  ------------------
  100|  8.19k|            dst[x_off + y] = src[y_off + x];
  101|      2|}
wedge.c:hflip:
  103|      2|static void hflip(uint8_t *const dst, const uint8_t *const src) {
  104|    130|    for (int y = 0, y_off = 0; y < 64; y++, y_off += 64)
  ------------------
  |  Branch (104:32): [True: 128, False: 2]
  ------------------
  105|  8.32k|        for (int x = 0; x < 64; x++)
  ------------------
  |  Branch (105:25): [True: 8.19k, False: 128]
  ------------------
  106|  8.19k|            dst[y_off + 64 - 1 - x] = src[y_off + x];
  107|      2|}
wedge.c:fill2d_16x2:
  153|      9|{
  154|      9|    const int n_stride_444 = (w * h);
  155|      9|    const int n_stride_422 = n_stride_444 >> 1;
  156|      9|    const int n_stride_420 = n_stride_444 >> 2;
  157|      9|    const int sign_stride_422 = 16 * n_stride_422;
  158|      9|    const int sign_stride_420 = 16 * n_stride_420;
  159|       |
  160|       |    // assign pointer offsets in lookup table
  161|    153|    for (int n = 0; n < 16; n++) {
  ------------------
  |  Branch (161:21): [True: 144, False: 9]
  ------------------
  162|    144|        const int sign = signs & 1;
  163|       |
  164|    144|        copy2d(masks_444, master[cb[n].direction], sign, w, h,
  165|    144|               32 - (w * cb[n].x_offset >> 3), 32 - (h * cb[n].y_offset >> 3));
  166|       |
  167|       |        // not using !sign is intentional here, since 444 does not require
  168|       |        // any rounding since no chroma subsampling is applied.
  169|    144|        dav1d_masks.offsets[0][bs].wedge[0][n] =
  170|    144|        dav1d_masks.offsets[0][bs].wedge[1][n] = MASK_OFFSET(masks_444);
  ------------------
  |  |  129|    144|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  ------------------
  171|       |
  172|    144|        dav1d_masks.offsets[1][bs].wedge[0][n] =
  173|    144|            init_chroma(&masks_422[ sign * sign_stride_422], masks_444, 0, w, h, 0);
  174|    144|        dav1d_masks.offsets[1][bs].wedge[1][n] =
  175|    144|            init_chroma(&masks_422[!sign * sign_stride_422], masks_444, 1, w, h, 0);
  176|    144|        dav1d_masks.offsets[2][bs].wedge[0][n] =
  177|    144|            init_chroma(&masks_420[ sign * sign_stride_420], masks_444, 0, w, h, 1);
  178|    144|        dav1d_masks.offsets[2][bs].wedge[1][n] =
  179|    144|            init_chroma(&masks_420[!sign * sign_stride_420], masks_444, 1, w, h, 1);
  180|       |
  181|    144|        signs >>= 1;
  182|    144|        masks_444 += n_stride_444;
  183|    144|        masks_422 += n_stride_422;
  184|    144|        masks_420 += n_stride_420;
  185|    144|    }
  186|      9|}
wedge.c:copy2d:
  111|    144|{
  112|    144|    src += y_off * 64 + x_off;
  113|    144|    if (sign) {
  ------------------
  |  Branch (113:9): [True: 109, False: 35]
  ------------------
  114|  2.14k|        for (int y = 0; y < h; y++) {
  ------------------
  |  Branch (114:25): [True: 2.03k, False: 109]
  ------------------
  115|  40.4k|            for (int x = 0; x < w; x++)
  ------------------
  |  Branch (115:29): [True: 38.4k, False: 2.03k]
  ------------------
  116|  38.4k|                dst[x] = 64 - src[x];
  117|  2.03k|            src += 64;
  118|  2.03k|            dst += w;
  119|  2.03k|        }
  120|    109|    } else {
  121|    691|        for (int y = 0; y < h; y++) {
  ------------------
  |  Branch (121:25): [True: 656, False: 35]
  ------------------
  122|    656|            memcpy(dst, src, w);
  123|    656|            src += 64;
  124|    656|            dst += w;
  125|    656|        }
  126|     35|    }
  127|    144|}
wedge.c:init_chroma:
  134|    576|{
  135|    576|    const uint16_t offset = MASK_OFFSET(chroma);
  ------------------
  |  |  129|    576|#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
  ------------------
  136|  8.64k|    for (int y = 0; y < h; y += 1 + ss_ver) {
  ------------------
  |  Branch (136:21): [True: 8.06k, False: 576]
  ------------------
  137|  83.3k|        for (int x = 0; x < w; x += 2) {
  ------------------
  |  Branch (137:25): [True: 75.2k, False: 8.06k]
  ------------------
  138|  75.2k|            int sum = luma[x] + luma[x + 1] + 1;
  139|  75.2k|            if (ss_ver) sum += luma[w + x] + luma[w + x + 1] + 1;
  ------------------
  |  Branch (139:17): [True: 25.0k, False: 50.1k]
  ------------------
  140|  75.2k|            chroma[x >> 1] = (sum - sign) >> (1 + ss_ver);
  141|  75.2k|        }
  142|  8.06k|        luma += w << ss_ver;
  143|  8.06k|        chroma += w >> 1;
  144|  8.06k|    }
  145|    576|    return offset;
  146|    576|}
wedge.c:build_nondc_ii_masks:
  190|      9|{
  191|      9|    static const uint8_t ii_weights_1d[32] = {
  192|      9|        60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,  8,  7,
  193|      9|         6,  6,  5,  4,  4,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,
  194|      9|    };
  195|       |
  196|      9|    uint8_t *const mask_h  = &mask_v[w * h];
  197|      9|    uint8_t *const mask_sm = &mask_h[w * h];
  198|    173|    for (int y = 0, off = 0; y < h; y++, off += w) {
  ------------------
  |  Branch (198:30): [True: 164, False: 9]
  ------------------
  199|    164|        memset(&mask_v[off], ii_weights_1d[y * step], w);
  200|  2.51k|        for (int x = 0; x < w; x++) {
  ------------------
  |  Branch (200:25): [True: 2.35k, False: 164]
  ------------------
  201|  2.35k|            mask_sm[off + x] = ii_weights_1d[imin(x, y) * step];
  202|  2.35k|            mask_h[off + x] = ii_weights_1d[x * step];
  203|  2.35k|        }
  204|    164|    }
  205|      9|}

cdef_tmpl.c:cdef_dsp_init_x86:
   46|  7.82k|static ALWAYS_INLINE void cdef_dsp_init_x86(Dav1dCdefDSPContext *const c) {
   47|  7.82k|    const unsigned flags = dav1d_get_cpu_flags();
   48|       |
   49|  7.82k|#if BITDEPTH == 8
   50|  7.82k|    if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
  ------------------
  |  Branch (50:9): [True: 0, False: 7.82k]
  ------------------
   51|       |
   52|  7.82k|    c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2);
  ------------------
  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   53|  7.82k|    c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2);
  ------------------
  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   54|  7.82k|    c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2);
  ------------------
  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   55|  7.82k|#endif
   56|       |
   57|  7.82k|    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
  ------------------
  |  Branch (57:9): [True: 0, False: 7.82k]
  ------------------
   58|       |
   59|  7.82k|    c->dir = BF(dav1d_cdef_dir, ssse3);
  ------------------
  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   60|  7.82k|    c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3);
  ------------------
  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   61|  7.82k|    c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3);
  ------------------
  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   62|  7.82k|    c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3);
  ------------------
  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   63|       |
   64|  7.82k|    if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
  ------------------
  |  Branch (64:9): [True: 0, False: 7.82k]
  ------------------
   65|       |
   66|  7.82k|    c->dir = BF(dav1d_cdef_dir, sse4);
  ------------------
  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   67|  7.82k|#if BITDEPTH == 8
   68|  7.82k|    c->fb[0] = BF(dav1d_cdef_filter_8x8, sse4);
  ------------------
  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   69|  7.82k|    c->fb[1] = BF(dav1d_cdef_filter_4x8, sse4);
  ------------------
  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   70|  7.82k|    c->fb[2] = BF(dav1d_cdef_filter_4x4, sse4);
  ------------------
  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   71|  7.82k|#endif
   72|       |
   73|  7.82k|#if ARCH_X86_64
   74|  7.82k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
  ------------------
  |  Branch (74:9): [True: 0, False: 7.82k]
  ------------------
   75|       |
   76|  7.82k|    c->dir = BF(dav1d_cdef_dir, avx2);
  ------------------
  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   77|  7.82k|    c->fb[0] = BF(dav1d_cdef_filter_8x8, avx2);
  ------------------
  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   78|  7.82k|    c->fb[1] = BF(dav1d_cdef_filter_4x8, avx2);
  ------------------
  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   79|  7.82k|    c->fb[2] = BF(dav1d_cdef_filter_4x4, avx2);
  ------------------
  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   80|       |
   81|  7.82k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
  ------------------
  |  Branch (81:9): [True: 7.82k, False: 0]
  ------------------
   82|       |
   83|      0|    c->fb[0] = BF(dav1d_cdef_filter_8x8, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   84|      0|    c->fb[1] = BF(dav1d_cdef_filter_4x8, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   85|      0|    c->fb[2] = BF(dav1d_cdef_filter_4x4, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   86|      0|#endif
   87|      0|}

dav1d_get_cpu_flags_x86:
   47|      1|COLD unsigned dav1d_get_cpu_flags_x86(void) {
   48|      1|    union {
   49|      1|        CpuidRegisters r;
   50|      1|        struct {
   51|      1|            uint32_t max_leaf;
   52|      1|            char vendor[12];
   53|      1|        };
   54|      1|    } cpu;
   55|      1|    dav1d_cpu_cpuid(&cpu.r, 0, 0);
   56|      1|    unsigned flags = dav1d_get_default_cpu_flags();
   57|       |
   58|      1|    if (cpu.max_leaf >= 1) {
  ------------------
  |  Branch (58:9): [True: 1, False: 0]
  ------------------
   59|      1|        CpuidRegisters r;
   60|      1|        dav1d_cpu_cpuid(&r, 1, 0);
   61|      1|        const unsigned family = ((r.eax >> 8) & 0x0f) + ((r.eax >> 20) & 0xff);
   62|       |
   63|      1|        if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ {
  ------------------
  |  |   45|      1|#define X(reg, mask) (((reg) & (mask)) == (mask))
  |  |  ------------------
  |  |  |  Branch (45:22): [True: 1, False: 0]
  |  |  ------------------
  ------------------
   64|      1|            flags |= DAV1D_X86_CPU_FLAG_SSE2;
   65|      1|            if (X(r.ecx, 0x00000201)) /* SSE3/SSSE3 */ {
  ------------------
  |  |   45|      1|#define X(reg, mask) (((reg) & (mask)) == (mask))
  |  |  ------------------
  |  |  |  Branch (45:22): [True: 1, False: 0]
  |  |  ------------------
  ------------------
   66|      1|                flags |= DAV1D_X86_CPU_FLAG_SSSE3;
   67|      1|                if (X(r.ecx, 0x00080000)) /* SSE4.1 */
  ------------------
  |  |   45|      1|#define X(reg, mask) (((reg) & (mask)) == (mask))
  |  |  ------------------
  |  |  |  Branch (45:22): [True: 1, False: 0]
  |  |  ------------------
  ------------------
   68|      1|                    flags |= DAV1D_X86_CPU_FLAG_SSE41;
   69|      1|            }
   70|      1|        }
   71|      1|#if ARCH_X86_64
   72|       |        /* We only support >128-bit SIMD on x86-64. */
   73|      1|        if (X(r.ecx, 0x18000000)) /* OSXSAVE/AVX */ {
  ------------------
  |  |   45|      1|#define X(reg, mask) (((reg) & (mask)) == (mask))
  |  |  ------------------
  |  |  |  Branch (45:22): [True: 1, False: 0]
  |  |  ------------------
  ------------------
   74|      1|            const uint64_t xcr0 = dav1d_cpu_xgetbv(0);
   75|      1|            if (X(xcr0, 0x00000006)) /* XMM/YMM */ {
  ------------------
  |  |   45|      1|#define X(reg, mask) (((reg) & (mask)) == (mask))
  |  |  ------------------
  |  |  |  Branch (45:22): [True: 1, False: 0]
  |  |  ------------------
  ------------------
   76|      1|                if (cpu.max_leaf >= 7) {
  ------------------
  |  Branch (76:21): [True: 1, False: 0]
  ------------------
   77|      1|                    dav1d_cpu_cpuid(&r, 7, 0);
   78|      1|                    if (X(r.ebx, 0x00000128)) /* BMI1/BMI2/AVX2 */ {
  ------------------
  |  |   45|      1|#define X(reg, mask) (((reg) & (mask)) == (mask))
  |  |  ------------------
  |  |  |  Branch (45:22): [True: 1, False: 0]
  |  |  ------------------
  ------------------
   79|      1|                        flags |= DAV1D_X86_CPU_FLAG_AVX2;
   80|      1|                        if (X(xcr0, 0x000000e0)) /* ZMM/OPMASK */ {
  ------------------
  |  |   45|      1|#define X(reg, mask) (((reg) & (mask)) == (mask))
  |  |  ------------------
  |  |  |  Branch (45:22): [True: 0, False: 1]
  |  |  ------------------
  ------------------
   81|      0|                            if (X(r.ebx, 0xd0230000) && X(r.ecx, 0x00005f42))
  ------------------
  |  |   45|      0|#define X(reg, mask) (((reg) & (mask)) == (mask))
  |  |  ------------------
  |  |  |  Branch (45:22): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                                          if (X(r.ebx, 0xd0230000) && X(r.ecx, 0x00005f42))
  ------------------
  |  |   45|      0|#define X(reg, mask) (((reg) & (mask)) == (mask))
  |  |  ------------------
  |  |  |  Branch (45:22): [True: 0, False: 0]
  |  |  ------------------
  ------------------
   82|      0|                                flags |= DAV1D_X86_CPU_FLAG_AVX512ICL;
   83|      0|                        }
   84|      1|                    }
   85|      1|                }
   86|      1|            }
   87|      1|        }
   88|      1|#endif
   89|      1|        if (!memcmp(cpu.vendor, "AuthenticAMD", sizeof(cpu.vendor))) {
  ------------------
  |  Branch (89:13): [True: 1, False: 0]
  ------------------
   90|      1|            if ((flags & DAV1D_X86_CPU_FLAG_AVX2) && family <= 0x19) {
  ------------------
  |  Branch (90:17): [True: 1, False: 0]
  |  Branch (90:54): [True: 1, False: 0]
  ------------------
   91|       |                /* Excavator, Zen, Zen+, Zen 2, Zen 3, Zen 3+, Zen 4 */
   92|      1|                flags |= DAV1D_X86_CPU_FLAG_SLOW_GATHER;
   93|      1|            }
   94|      1|        }
   95|      1|    }
   96|       |
   97|      1|    return flags;
   98|      1|}

filmgrain_tmpl.c:film_grain_dsp_init_x86:
   45|  15.4k|static ALWAYS_INLINE void film_grain_dsp_init_x86(Dav1dFilmGrainDSPContext *const c) {
   46|  15.4k|    const unsigned flags = dav1d_get_cpu_flags();
   47|       |
   48|  15.4k|    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
  ------------------
  |  Branch (48:9): [True: 0, False: 15.4k]
  ------------------
   49|       |
   50|  15.4k|    c->generate_grain_y = BF(dav1d_generate_grain_y, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   51|  15.4k|    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   52|  15.4k|    c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   53|  15.4k|    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   54|  15.4k|    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   55|  15.4k|    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   56|  15.4k|    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   57|  15.4k|    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   58|       |
   59|  15.4k|#if ARCH_X86_64
   60|  15.4k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
  ------------------
  |  Branch (60:9): [True: 0, False: 15.4k]
  ------------------
   61|       |
   62|  15.4k|    c->generate_grain_y = BF(dav1d_generate_grain_y, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   63|  15.4k|    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   64|  15.4k|    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   65|  15.4k|    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   66|       |
   67|  15.4k|    if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
  ------------------
  |  Branch (67:9): [True: 0, False: 15.4k]
  ------------------
   68|      0|        c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   69|      0|        c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   70|      0|        c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   71|      0|        c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   72|      0|    }
   73|       |
   74|  15.4k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
  ------------------
  |  Branch (74:9): [True: 15.4k, False: 0]
  ------------------
   75|       |
   76|      0|    if (BITDEPTH == 8 || !(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
  ------------------
  |  Branch (76:9): [True: 0, Folded]
  |  Branch (76:26): [True: 0, False: 0]
  ------------------
   77|      0|        c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   78|      0|        c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   79|      0|        c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   80|      0|        c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   81|      0|    }
   82|      0|#endif
   83|      0|}

ipred_tmpl.c:intra_pred_dsp_init_x86:
   71|  15.4k|static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *const c) {
   72|  15.4k|    const unsigned flags = dav1d_get_cpu_flags();
   73|       |
   74|  15.4k|    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
  ------------------
  |  Branch (74:9): [True: 0, False: 15.4k]
  ------------------
   75|       |
   76|  15.4k|    init_angular_ipred_fn(DC_PRED,       ipred_dc,       ssse3);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   77|  15.4k|    init_angular_ipred_fn(DC_128_PRED,   ipred_dc_128,   ssse3);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   78|  15.4k|    init_angular_ipred_fn(TOP_DC_PRED,   ipred_dc_top,   ssse3);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   79|  15.4k|    init_angular_ipred_fn(LEFT_DC_PRED,  ipred_dc_left,  ssse3);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   80|  15.4k|    init_angular_ipred_fn(HOR_PRED,      ipred_h,        ssse3);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   81|  15.4k|    init_angular_ipred_fn(VERT_PRED,     ipred_v,        ssse3);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   82|  15.4k|    init_angular_ipred_fn(PAETH_PRED,    ipred_paeth,    ssse3);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   83|  15.4k|    init_angular_ipred_fn(SMOOTH_PRED,   ipred_smooth,   ssse3);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   84|  15.4k|    init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, ssse3);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   85|  15.4k|    init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   86|  15.4k|    init_angular_ipred_fn(Z1_PRED,       ipred_z1,       ssse3);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   87|  15.4k|    init_angular_ipred_fn(Z2_PRED,       ipred_z2,       ssse3);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   88|  15.4k|    init_angular_ipred_fn(Z3_PRED,       ipred_z3,       ssse3);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   89|  15.4k|    init_angular_ipred_fn(FILTER_PRED,   ipred_filter,   ssse3);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   90|       |
   91|  15.4k|    init_cfl_pred_fn(DC_PRED,      ipred_cfl,      ssse3);
  ------------------
  |  |   41|  15.4k|    init_fn(cfl_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   92|  15.4k|    init_cfl_pred_fn(DC_128_PRED,  ipred_cfl_128,  ssse3);
  ------------------
  |  |   41|  15.4k|    init_fn(cfl_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   93|  15.4k|    init_cfl_pred_fn(TOP_DC_PRED,  ipred_cfl_top,  ssse3);
  ------------------
  |  |   41|  15.4k|    init_fn(cfl_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   94|  15.4k|    init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, ssse3);
  ------------------
  |  |   41|  15.4k|    init_fn(cfl_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   95|       |
   96|  15.4k|    init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, ssse3);
  ------------------
  |  |   43|  15.4k|    init_fn(cfl_ac, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   97|  15.4k|    init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, ssse3);
  ------------------
  |  |   43|  15.4k|    init_fn(cfl_ac, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   98|  15.4k|    init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, ssse3);
  ------------------
  |  |   43|  15.4k|    init_fn(cfl_ac, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   99|       |
  100|  15.4k|    c->pal_pred = BF(dav1d_pal_pred, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  101|       |
  102|  15.4k|#if ARCH_X86_64
  103|  15.4k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
  ------------------
  |  Branch (103:9): [True: 0, False: 15.4k]
  ------------------
  104|       |
  105|  15.4k|    init_angular_ipred_fn(DC_PRED,       ipred_dc,       avx2);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  106|  15.4k|    init_angular_ipred_fn(DC_128_PRED,   ipred_dc_128,   avx2);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  107|  15.4k|    init_angular_ipred_fn(TOP_DC_PRED,   ipred_dc_top,   avx2);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  108|  15.4k|    init_angular_ipred_fn(LEFT_DC_PRED,  ipred_dc_left,  avx2);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  109|  15.4k|    init_angular_ipred_fn(HOR_PRED,      ipred_h,        avx2);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  110|  15.4k|    init_angular_ipred_fn(VERT_PRED,     ipred_v,        avx2);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  111|  15.4k|    init_angular_ipred_fn(PAETH_PRED,    ipred_paeth,    avx2);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  112|  15.4k|    init_angular_ipred_fn(SMOOTH_PRED,   ipred_smooth,   avx2);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  113|  15.4k|    init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx2);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  114|  15.4k|    init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx2);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  115|  15.4k|    init_angular_ipred_fn(Z1_PRED,       ipred_z1,       avx2);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  116|  15.4k|    init_angular_ipred_fn(Z2_PRED,       ipred_z2,       avx2);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  117|  15.4k|    init_angular_ipred_fn(Z3_PRED,       ipred_z3,       avx2);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  118|  15.4k|    init_angular_ipred_fn(FILTER_PRED,   ipred_filter,   avx2);
  ------------------
  |  |   39|  15.4k|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  119|       |
  120|  15.4k|    init_cfl_pred_fn(DC_PRED,      ipred_cfl,      avx2);
  ------------------
  |  |   41|  15.4k|    init_fn(cfl_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  121|  15.4k|    init_cfl_pred_fn(DC_128_PRED,  ipred_cfl_128,  avx2);
  ------------------
  |  |   41|  15.4k|    init_fn(cfl_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  122|  15.4k|    init_cfl_pred_fn(TOP_DC_PRED,  ipred_cfl_top,  avx2);
  ------------------
  |  |   41|  15.4k|    init_fn(cfl_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  123|  15.4k|    init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, avx2);
  ------------------
  |  |   41|  15.4k|    init_fn(cfl_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  124|       |
  125|  15.4k|    init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, avx2);
  ------------------
  |  |   43|  15.4k|    init_fn(cfl_ac, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  126|  15.4k|    init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, avx2);
  ------------------
  |  |   43|  15.4k|    init_fn(cfl_ac, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  127|  15.4k|    init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2);
  ------------------
  |  |   43|  15.4k|    init_fn(cfl_ac, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  128|       |
  129|  15.4k|    c->pal_pred = BF(dav1d_pal_pred, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  130|       |
  131|  15.4k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
  ------------------
  |  Branch (131:9): [True: 15.4k, False: 0]
  ------------------
  132|       |
  133|      0|#if BITDEPTH == 8
  134|      0|    init_angular_ipred_fn(DC_PRED,       ipred_dc,       avx512icl);
  ------------------
  |  |   39|      0|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|  15.4k|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  135|      0|    init_angular_ipred_fn(DC_128_PRED,   ipred_dc_128,   avx512icl);
  ------------------
  |  |   39|      0|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|      0|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  136|      0|    init_angular_ipred_fn(TOP_DC_PRED,   ipred_dc_top,   avx512icl);
  ------------------
  |  |   39|      0|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|      0|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  137|      0|    init_angular_ipred_fn(LEFT_DC_PRED,  ipred_dc_left,  avx512icl);
  ------------------
  |  |   39|      0|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|      0|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  138|      0|    init_angular_ipred_fn(HOR_PRED,      ipred_h,        avx512icl);
  ------------------
  |  |   39|      0|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|      0|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  139|      0|    init_angular_ipred_fn(VERT_PRED,     ipred_v,        avx512icl);
  ------------------
  |  |   39|      0|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|      0|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  140|      0|    init_angular_ipred_fn(Z2_PRED,       ipred_z2,       avx512icl);
  ------------------
  |  |   39|      0|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|      0|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  141|      0|#endif
  142|      0|    init_angular_ipred_fn(PAETH_PRED,    ipred_paeth,    avx512icl);
  ------------------
  |  |   39|      0|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|      0|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  143|      0|    init_angular_ipred_fn(SMOOTH_PRED,   ipred_smooth,   avx512icl);
  ------------------
  |  |   39|      0|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|      0|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  144|      0|    init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl);
  ------------------
  |  |   39|      0|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|      0|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  145|      0|    init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl);
  ------------------
  |  |   39|      0|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|      0|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  146|      0|    init_angular_ipred_fn(Z1_PRED,       ipred_z1,       avx512icl);
  ------------------
  |  |   39|      0|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|      0|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  147|      0|    init_angular_ipred_fn(Z2_PRED,       ipred_z2,       avx512icl);
  ------------------
  |  |   39|      0|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|      0|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  148|      0|    init_angular_ipred_fn(Z3_PRED,       ipred_z3,       avx512icl);
  ------------------
  |  |   39|      0|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|      0|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  149|      0|    init_angular_ipred_fn(FILTER_PRED,   ipred_filter,   avx512icl);
  ------------------
  |  |   39|      0|    init_fn(intra_pred, type, name, suffix)
  |  |  ------------------
  |  |  |  |   36|      0|    c->type0[type1] = BF(dav1d_##name, suffix)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  150|       |
  151|      0|    c->pal_pred = BF(dav1d_pal_pred, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  152|      0|#endif
  153|      0|}

itx_tmpl.c:itx_dsp_init_x86:
  112|  7.82k|{
  113|  7.82k|#define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \
  114|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  115|  7.82k|        BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext)
  116|       |
  117|  7.82k|#define assign_itx1_bpc_fn(pfx, w, h, bpc, ext) \
  118|  7.82k|    assign_itx_bpc_fn(pfx, w, h, dct_dct,           DCT_DCT,           bpc, ext)
  119|       |
  120|  7.82k|#define assign_itx2_bpc_fn(pfx, w, h, bpc, ext) \
  121|  7.82k|    assign_itx1_bpc_fn(pfx, w, h, bpc, ext); \
  122|  7.82k|    assign_itx_bpc_fn(pfx, w, h, identity_identity, IDTX,              bpc, ext)
  123|       |
  124|  7.82k|#define assign_itx12_bpc_fn(pfx, w, h, bpc, ext) \
  125|  7.82k|    assign_itx2_bpc_fn(pfx, w, h, bpc, ext); \
  126|  7.82k|    assign_itx_bpc_fn(pfx, w, h, dct_adst,          ADST_DCT,          bpc, ext); \
  127|  7.82k|    assign_itx_bpc_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      bpc, ext); \
  128|  7.82k|    assign_itx_bpc_fn(pfx, w, h, dct_identity,      H_DCT,             bpc, ext); \
  129|  7.82k|    assign_itx_bpc_fn(pfx, w, h, adst_dct,          DCT_ADST,          bpc, ext); \
  130|  7.82k|    assign_itx_bpc_fn(pfx, w, h, adst_adst,         ADST_ADST,         bpc, ext); \
  131|  7.82k|    assign_itx_bpc_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     bpc, ext); \
  132|  7.82k|    assign_itx_bpc_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      bpc, ext); \
  133|  7.82k|    assign_itx_bpc_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     bpc, ext); \
  134|  7.82k|    assign_itx_bpc_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, bpc, ext); \
  135|  7.82k|    assign_itx_bpc_fn(pfx, w, h, identity_dct,      V_DCT,             bpc, ext)
  136|       |
  137|  7.82k|#define assign_itx16_bpc_fn(pfx, w, h, bpc, ext) \
  138|  7.82k|    assign_itx12_bpc_fn(pfx, w, h, bpc, ext); \
  139|  7.82k|    assign_itx_bpc_fn(pfx, w, h, adst_identity,     H_ADST,            bpc, ext); \
  140|  7.82k|    assign_itx_bpc_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        bpc, ext); \
  141|  7.82k|    assign_itx_bpc_fn(pfx, w, h, identity_adst,     V_ADST,            bpc, ext); \
  142|  7.82k|    assign_itx_bpc_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        bpc, ext)
  143|       |
  144|  7.82k|    const unsigned flags = dav1d_get_cpu_flags();
  145|       |
  146|  7.82k|    if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
  ------------------
  |  Branch (146:9): [True: 0, False: 7.82k]
  ------------------
  147|       |
  148|  7.82k|    assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, sse2);
  ------------------
  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  ------------------
  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  149|       |
  150|  7.82k|    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
  ------------------
  |  Branch (150:9): [True: 0, False: 7.82k]
  ------------------
  151|       |
  152|  7.82k|#if BITDEPTH == 8
  153|  7.82k|    assign_itx16_fn(,   4,  4, ssse3);
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|  7.82k|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|  7.82k|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  154|  7.82k|    assign_itx16_fn(R,  4,  8, ssse3);
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|  7.82k|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|  7.82k|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  155|  7.82k|    assign_itx16_fn(R,  8,  4, ssse3);
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|  7.82k|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|  7.82k|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  156|  7.82k|    assign_itx16_fn(,   8,  8, ssse3);
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|  7.82k|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|  7.82k|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  157|  7.82k|    assign_itx16_fn(R,  4, 16, ssse3);
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|  7.82k|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|  7.82k|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  158|  7.82k|    assign_itx16_fn(R, 16,  4, ssse3);
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|  7.82k|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|  7.82k|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  159|  7.82k|    assign_itx16_fn(R,  8, 16, ssse3);
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|  7.82k|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|  7.82k|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  160|  7.82k|    assign_itx16_fn(R, 16,  8, ssse3);
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|  7.82k|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|  7.82k|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  161|  7.82k|    assign_itx12_fn(,  16, 16, ssse3);
  ------------------
  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  162|  7.82k|    assign_itx2_fn (R,  8, 32, ssse3);
  ------------------
  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  163|  7.82k|    assign_itx2_fn (R, 32,  8, ssse3);
  ------------------
  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  164|  7.82k|    assign_itx2_fn (R, 16, 32, ssse3);
  ------------------
  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  165|  7.82k|    assign_itx2_fn (R, 32, 16, ssse3);
  ------------------
  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  166|  7.82k|    assign_itx2_fn (,  32, 32, ssse3);
  ------------------
  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  167|  7.82k|    assign_itx1_fn (R, 16, 64, ssse3);
  ------------------
  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  168|  7.82k|    assign_itx1_fn (R, 32, 64, ssse3);
  ------------------
  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  169|  7.82k|    assign_itx1_fn (R, 64, 16, ssse3);
  ------------------
  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  170|  7.82k|    assign_itx1_fn (R, 64, 32, ssse3);
  ------------------
  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  171|  7.82k|    assign_itx1_fn ( , 64, 64, ssse3);
  ------------------
  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  172|  7.82k|    *all_simd = 1;
  173|  7.82k|#endif
  174|       |
  175|  7.82k|    if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
  ------------------
  |  Branch (175:9): [True: 0, False: 7.82k]
  ------------------
  176|       |
  177|       |#if BITDEPTH == 16
  178|       |    if (bpc == 10) {
  179|       |        assign_itx16_fn(,   4,  4, sse4);
  180|       |        assign_itx16_fn(R,  4,  8, sse4);
  181|       |        assign_itx16_fn(R,  4, 16, sse4);
  182|       |        assign_itx16_fn(R,  8,  4, sse4);
  183|       |        assign_itx16_fn(,   8,  8, sse4);
  184|       |        assign_itx16_fn(R,  8, 16, sse4);
  185|       |        assign_itx16_fn(R, 16,  4, sse4);
  186|       |        assign_itx16_fn(R, 16,  8, sse4);
  187|       |        assign_itx12_fn(,  16, 16, sse4);
  188|       |        assign_itx2_fn (R,  8, 32, sse4);
  189|       |        assign_itx2_fn (R, 32,  8, sse4);
  190|       |        assign_itx2_fn (R, 16, 32, sse4);
  191|       |        assign_itx2_fn (R, 32, 16, sse4);
  192|       |        assign_itx2_fn (,  32, 32, sse4);
  193|       |        assign_itx1_fn (R, 16, 64, sse4);
  194|       |        assign_itx1_fn (R, 32, 64, sse4);
  195|       |        assign_itx1_fn (R, 64, 16, sse4);
  196|       |        assign_itx1_fn (R, 64, 32, sse4);
  197|       |        assign_itx1_fn (,  64, 64, sse4);
  198|       |        *all_simd = 1;
  199|       |    }
  200|       |#endif
  201|       |
  202|  7.82k|#if ARCH_X86_64
  203|  7.82k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
  ------------------
  |  Branch (203:9): [True: 0, False: 7.82k]
  ------------------
  204|       |
  205|  7.82k|    assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2);
  ------------------
  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  ------------------
  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  206|       |
  207|  7.82k|#if BITDEPTH == 8
  208|  7.82k|    assign_itx16_fn( ,  4,  4, avx2);
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|  7.82k|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|  7.82k|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  209|  7.82k|    assign_itx16_fn(R,  4,  8, avx2);
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|  7.82k|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|  7.82k|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  210|  7.82k|    assign_itx16_fn(R,  4, 16, avx2);
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|  7.82k|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|  7.82k|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  211|  7.82k|    assign_itx16_fn(R,  8,  4, avx2);
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|  7.82k|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|  7.82k|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  212|  7.82k|    assign_itx16_fn( ,  8,  8, avx2);
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|  7.82k|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|  7.82k|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  213|  7.82k|    assign_itx16_fn(R,  8, 16, avx2);
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|  7.82k|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|  7.82k|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  214|  7.82k|    assign_itx2_fn (R,  8, 32, avx2);
  ------------------
  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  215|  7.82k|    assign_itx16_fn(R, 16,  4, avx2);
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|  7.82k|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|  7.82k|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  216|  7.82k|    assign_itx16_fn(R, 16,  8, avx2);
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|  7.82k|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|  7.82k|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  217|  7.82k|    assign_itx12_fn( , 16, 16, avx2);
  ------------------
  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   89|  7.82k|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   90|  7.82k|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   91|  7.82k|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   92|  7.82k|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   93|  7.82k|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   94|  7.82k|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   95|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   96|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   97|  7.82k|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  218|  7.82k|    assign_itx2_fn (R, 16, 32, avx2);
  ------------------
  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  219|  7.82k|    assign_itx1_fn (R, 16, 64, avx2);
  ------------------
  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  220|  7.82k|    assign_itx2_fn (R, 32,  8, avx2);
  ------------------
  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  221|  7.82k|    assign_itx2_fn (R, 32, 16, avx2);
  ------------------
  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  222|  7.82k|    assign_itx2_fn ( , 32, 32, avx2);
  ------------------
  |  |   84|  7.82k|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  223|  7.82k|    assign_itx1_fn (R, 32, 64, avx2);
  ------------------
  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  224|  7.82k|    assign_itx1_fn (R, 64, 16, avx2);
  ------------------
  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  225|  7.82k|    assign_itx1_fn (R, 64, 32, avx2);
  ------------------
  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  226|  7.82k|    assign_itx1_fn ( , 64, 64, avx2);
  ------------------
  |  |   81|  7.82k|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  227|       |#else
  228|       |    if (bpc == 10) {
  229|       |        assign_itx16_bpc_fn( ,  4,  4, 10, avx2);
  230|       |        assign_itx16_bpc_fn(R,  4,  8, 10, avx2);
  231|       |        assign_itx16_bpc_fn(R,  4, 16, 10, avx2);
  232|       |        assign_itx16_bpc_fn(R,  8,  4, 10, avx2);
  233|       |        assign_itx16_bpc_fn( ,  8,  8, 10, avx2);
  234|       |        assign_itx16_bpc_fn(R,  8, 16, 10, avx2);
  235|       |        assign_itx2_bpc_fn (R,  8, 32, 10, avx2);
  236|       |        assign_itx16_bpc_fn(R, 16,  4, 10, avx2);
  237|       |        assign_itx16_bpc_fn(R, 16,  8, 10, avx2);
  238|       |        assign_itx12_bpc_fn( , 16, 16, 10, avx2);
  239|       |        assign_itx2_bpc_fn (R, 16, 32, 10, avx2);
  240|       |        assign_itx1_bpc_fn (R, 16, 64, 10, avx2);
  241|       |        assign_itx2_bpc_fn (R, 32,  8, 10, avx2);
  242|       |        assign_itx2_bpc_fn (R, 32, 16, 10, avx2);
  243|       |        assign_itx2_bpc_fn ( , 32, 32, 10, avx2);
  244|       |        assign_itx1_bpc_fn (R, 32, 64, 10, avx2);
  245|       |        assign_itx1_bpc_fn (R, 64, 16, 10, avx2);
  246|       |        assign_itx1_bpc_fn (R, 64, 32, 10, avx2);
  247|       |        assign_itx1_bpc_fn ( , 64, 64, 10, avx2);
  248|       |    } else {
  249|       |        assign_itx16_bpc_fn( ,  4,  4, 12, avx2);
  250|       |        assign_itx16_bpc_fn(R,  4,  8, 12, avx2);
  251|       |        assign_itx16_bpc_fn(R,  4, 16, 12, avx2);
  252|       |        assign_itx16_bpc_fn(R,  8,  4, 12, avx2);
  253|       |        assign_itx16_bpc_fn( ,  8,  8, 12, avx2);
  254|       |        assign_itx16_bpc_fn(R,  8, 16, 12, avx2);
  255|       |        assign_itx2_bpc_fn (R,  8, 32, 12, avx2);
  256|       |        assign_itx16_bpc_fn(R, 16,  4, 12, avx2);
  257|       |        assign_itx16_bpc_fn(R, 16,  8, 12, avx2);
  258|       |        assign_itx12_bpc_fn( , 16, 16, 12, avx2);
  259|       |        assign_itx2_bpc_fn (R, 32,  8, 12, avx2);
  260|       |        assign_itx_bpc_fn(R, 16, 32, identity_identity, IDTX, 12, avx2);
  261|       |        assign_itx_bpc_fn(R, 32, 16, identity_identity, IDTX, 12, avx2);
  262|       |        assign_itx_bpc_fn( , 32, 32, identity_identity, IDTX, 12, avx2);
  263|       |    }
  264|       |#endif
  265|       |
  266|  7.82k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
  ------------------
  |  Branch (266:9): [True: 7.82k, False: 0]
  ------------------
  267|       |
  268|      0|#if BITDEPTH == 8
  269|  7.82k|    assign_itx16_fn( ,  4,  4, avx512icl); // no wht
  ------------------
  |  |  101|  7.82k|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|  7.82k|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|      0|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|  7.82k|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|      0|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|      0|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|      0|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|      0|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|      0|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|      0|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|      0|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|      0|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|      0|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|  7.82k|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|      0|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|      0|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|      0|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|  7.82k|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|  7.82k|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|  7.82k|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|  7.82k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  270|      0|    assign_itx16_fn(R,  4,  8, avx512icl);
  ------------------
  |  |  101|      0|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|      0|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|      0|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|      0|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|      0|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|      0|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|      0|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|      0|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|      0|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|      0|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|      0|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|      0|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|      0|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|      0|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|      0|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|      0|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|      0|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|      0|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  271|      0|    assign_itx16_fn(R,  4, 16, avx512icl);
  ------------------
  |  |  101|      0|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|      0|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|      0|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|      0|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|      0|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|      0|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|      0|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|      0|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|      0|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|      0|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|      0|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|      0|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|      0|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|      0|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|      0|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|      0|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|      0|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|      0|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  272|      0|    assign_itx16_fn(R,  8,  4, avx512icl);
  ------------------
  |  |  101|      0|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|      0|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|      0|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|      0|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|      0|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|      0|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|      0|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|      0|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|      0|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|      0|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|      0|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|      0|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|      0|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|      0|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|      0|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|      0|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|      0|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|      0|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  273|      0|    assign_itx16_fn( ,  8,  8, avx512icl);
  ------------------
  |  |  101|      0|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|      0|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|      0|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|      0|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|      0|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|      0|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|      0|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|      0|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|      0|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|      0|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|      0|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|      0|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|      0|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|      0|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|      0|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|      0|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|      0|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|      0|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  274|      0|    assign_itx16_fn(R,  8, 16, avx512icl);
  ------------------
  |  |  101|      0|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|      0|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|      0|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|      0|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|      0|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|      0|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|      0|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|      0|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|      0|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|      0|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|      0|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|      0|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|      0|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|      0|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|      0|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|      0|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|      0|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|      0|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  275|      0|    assign_itx2_fn (R,  8, 32, avx512icl);
  ------------------
  |  |   84|      0|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   85|      0|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  276|      0|    assign_itx16_fn(R, 16,  4, avx512icl);
  ------------------
  |  |  101|      0|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|      0|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|      0|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|      0|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|      0|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|      0|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|      0|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|      0|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|      0|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|      0|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|      0|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|      0|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|      0|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|      0|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|      0|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|      0|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|      0|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|      0|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  277|      0|    assign_itx16_fn(R, 16,  8, avx512icl);
  ------------------
  |  |  101|      0|    assign_itx12_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   88|      0|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   84|      0|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |   85|      0|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   89|      0|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   90|      0|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   91|      0|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   92|      0|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   93|      0|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   94|      0|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   95|      0|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   96|      0|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   97|      0|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   98|      0|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  102|      0|    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  103|      0|    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  104|      0|    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  105|      0|    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  278|      0|    assign_itx12_fn( , 16, 16, avx512icl);
  ------------------
  |  |   88|      0|    assign_itx2_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   84|      0|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   85|      0|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   89|      0|    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   90|      0|    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   91|      0|    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   92|      0|    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   93|      0|    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   94|      0|    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   95|      0|    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   96|      0|    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   97|      0|    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   98|      0|    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  279|      0|    assign_itx2_fn (R, 16, 32, avx512icl);
  ------------------
  |  |   84|      0|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   85|      0|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  280|      0|    assign_itx1_fn (R, 16, 64, avx512icl);
  ------------------
  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  281|      0|    assign_itx2_fn (R, 32,  8, avx512icl);
  ------------------
  |  |   84|      0|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   85|      0|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  282|      0|    assign_itx2_fn (R, 32, 16, avx512icl);
  ------------------
  |  |   84|      0|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   85|      0|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  283|      0|    assign_itx2_fn ( , 32, 32, avx512icl);
  ------------------
  |  |   84|      0|    assign_itx1_fn(pfx, w, h, ext); \
  |  |  ------------------
  |  |  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |   85|      0|    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  284|      0|    assign_itx1_fn (R, 32, 64, avx512icl);
  ------------------
  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  285|      0|    assign_itx1_fn (R, 64, 16, avx512icl);
  ------------------
  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  286|      0|    assign_itx1_fn (R, 64, 32, avx512icl);
  ------------------
  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  287|      0|    assign_itx1_fn ( , 64, 64, avx512icl);
  ------------------
  |  |   81|      0|    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
  |  |  ------------------
  |  |  |  |   77|      0|    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
  |  |  |  |   78|      0|        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
  |  |  |  |  ------------------
  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  288|       |#else
  289|       |    if (bpc == 10) {
  290|       |        assign_itx16_bpc_fn( ,  8,  8, 10, avx512icl);
  291|       |        assign_itx16_bpc_fn(R,  8, 16, 10, avx512icl);
  292|       |        assign_itx2_bpc_fn (R,  8, 32, 10, avx512icl);
  293|       |        assign_itx16_bpc_fn(R, 16,  8, 10, avx512icl);
  294|       |        assign_itx12_bpc_fn( , 16, 16, 10, avx512icl);
  295|       |        assign_itx2_bpc_fn (R, 16, 32, 10, avx512icl);
  296|       |        assign_itx2_bpc_fn (R, 32,  8, 10, avx512icl);
  297|       |        assign_itx2_bpc_fn (R, 32, 16, 10, avx512icl);
  298|       |        assign_itx2_bpc_fn ( , 32, 32, 10, avx512icl);
  299|       |        assign_itx1_bpc_fn (R, 16, 64, 10, avx512icl);
  300|       |        assign_itx1_bpc_fn (R, 32, 64, 10, avx512icl);
  301|       |        assign_itx1_bpc_fn (R, 64, 16, 10, avx512icl);
  302|       |        assign_itx1_bpc_fn (R, 64, 32, 10, avx512icl);
  303|       |        assign_itx1_bpc_fn ( , 64, 64, 10, avx512icl);
  304|       |    }
  305|       |#endif
  306|      0|#endif
  307|      0|}

loopfilter_tmpl.c:loop_filter_dsp_init_x86:
   41|  15.4k|static ALWAYS_INLINE void loop_filter_dsp_init_x86(Dav1dLoopFilterDSPContext *const c) {
   42|  15.4k|    const unsigned flags = dav1d_get_cpu_flags();
   43|       |
   44|  15.4k|    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
  ------------------
  |  Branch (44:9): [True: 0, False: 15.4k]
  ------------------
   45|       |
   46|  15.4k|    c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   47|  15.4k|    c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   48|  15.4k|    c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   49|  15.4k|    c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   50|       |
   51|  15.4k|#if ARCH_X86_64
   52|  15.4k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
  ------------------
  |  Branch (52:9): [True: 0, False: 15.4k]
  ------------------
   53|       |
   54|  15.4k|    c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   55|  15.4k|    c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   56|  15.4k|    c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   57|  15.4k|    c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   58|       |
   59|  15.4k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
  ------------------
  |  Branch (59:9): [True: 15.4k, False: 0]
  ------------------
   60|       |
   61|      0|    c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   62|      0|    c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   63|       |
   64|      0|    if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
  ------------------
  |  Branch (64:9): [True: 0, False: 0]
  ------------------
   65|      0|        c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   66|      0|        c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   67|      0|    }
   68|      0|#endif
   69|      0|}

looprestoration_tmpl.c:loop_restoration_dsp_init_x86:
   50|  15.4k|static ALWAYS_INLINE void loop_restoration_dsp_init_x86(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
   51|  15.4k|    const unsigned flags = dav1d_get_cpu_flags();
   52|       |
   53|  15.4k|    if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
  ------------------
  |  Branch (53:9): [True: 0, False: 15.4k]
  ------------------
   54|  15.4k|#if BITDEPTH == 8
   55|  15.4k|    c->wiener[0] = BF(dav1d_wiener_filter7, sse2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   56|  15.4k|    c->wiener[1] = BF(dav1d_wiener_filter5, sse2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   57|  15.4k|#endif
   58|       |
   59|  15.4k|    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
  ------------------
  |  Branch (59:9): [True: 0, False: 15.4k]
  ------------------
   60|  15.4k|    c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   61|  15.4k|    c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   62|  15.4k|    if (BITDEPTH == 8 || bpc == 10) {
  ------------------
  |  Branch (62:9): [True: 7.82k, Folded]
  |  Branch (62:26): [True: 4.68k, False: 2.94k]
  ------------------
   63|  12.5k|        c->sgr[0] = BF(dav1d_sgr_filter_5x5, ssse3);
  ------------------
  |  |   52|  12.5k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   64|  12.5k|        c->sgr[1] = BF(dav1d_sgr_filter_3x3, ssse3);
  ------------------
  |  |   52|  12.5k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   65|  12.5k|        c->sgr[2] = BF(dav1d_sgr_filter_mix, ssse3);
  ------------------
  |  |   52|  12.5k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   66|  12.5k|    }
   67|       |
   68|  15.4k|#if ARCH_X86_64
   69|  15.4k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
  ------------------
  |  Branch (69:9): [True: 0, False: 15.4k]
  ------------------
   70|       |
   71|  15.4k|    c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   72|  15.4k|    c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   73|  15.4k|    if (BITDEPTH == 8 || bpc == 10) {
  ------------------
  |  Branch (73:9): [True: 7.82k, Folded]
  |  Branch (73:26): [True: 4.68k, False: 2.94k]
  ------------------
   74|  12.5k|        c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
  ------------------
  |  |   52|  12.5k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   75|  12.5k|        c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
  ------------------
  |  |   52|  12.5k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   76|  12.5k|        c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
  ------------------
  |  |   52|  12.5k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   77|  12.5k|    }
   78|       |
   79|  15.4k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
  ------------------
  |  Branch (79:9): [True: 15.4k, False: 0]
  ------------------
   80|       |
   81|      0|    c->wiener[0] = BF(dav1d_wiener_filter7, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   82|      0|#if BITDEPTH == 8
   83|       |    /* With VNNI we don't need a 5-tap version. */
   84|      0|    c->wiener[1] = c->wiener[0];
   85|       |#else
   86|       |    c->wiener[1] = BF(dav1d_wiener_filter5, avx512icl);
   87|       |#endif
   88|      0|    if (BITDEPTH == 8 || bpc == 10) {
  ------------------
  |  Branch (88:9): [True: 0, Folded]
  |  Branch (88:26): [True: 0, False: 0]
  ------------------
   89|      0|        c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   90|      0|        c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   91|      0|        c->sgr[2] = BF(dav1d_sgr_filter_mix, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
   92|      0|    }
   93|      0|#endif
   94|      0|}

mc_tmpl.c:mc_dsp_init_x86:
   92|  15.4k|static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) {
   93|  15.4k|    const unsigned flags = dav1d_get_cpu_flags();
   94|       |
   95|  15.4k|    if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
  ------------------
  |  Branch (95:8): [True: 0, False: 15.4k]
  ------------------
   96|      0|        return;
   97|       |
   98|  15.4k|    init_8tap_fns(ssse3);
  ------------------
  |  |  143|  15.4k|    init_8tap_gen(mc,  opt); \
  |  |  ------------------
  |  |  |  |  132|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  133|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  134|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  135|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  136|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  137|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  138|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  139|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  140|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          opt)
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  144|  15.4k|    init_8tap_gen(mct, opt)
  |  |  ------------------
  |  |  |  |  132|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  133|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  134|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  135|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  136|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  137|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  138|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  139|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  140|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          opt)
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   99|       |
  100|  15.4k|    init_mc_fn(FILTER_2D_BILINEAR,             bilin,               ssse3);
  ------------------
  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  101|  15.4k|    init_mct_fn(FILTER_2D_BILINEAR,            bilin,               ssse3);
  ------------------
  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  102|       |
  103|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        ssse3);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  104|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  105|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  ssse3);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  106|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  107|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         ssse3);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  108|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   ssse3);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  109|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  ssse3);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  110|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   ssse3);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  111|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          ssse3);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  112|  15.4k|    init_mc_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               ssse3);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  113|       |
  114|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        ssse3);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  115|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  116|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  ssse3);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  117|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  118|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         ssse3);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  119|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   ssse3);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  120|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  ssse3);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  121|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   ssse3);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  122|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          ssse3);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  123|  15.4k|    init_mct_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               ssse3);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  124|       |
  125|  15.4k|    c->avg = BF(dav1d_avg, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  126|  15.4k|    c->w_avg = BF(dav1d_w_avg, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  127|  15.4k|    c->mask = BF(dav1d_mask, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  128|  15.4k|    c->w_mask[0] = BF(dav1d_w_mask_444, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  129|  15.4k|    c->w_mask[1] = BF(dav1d_w_mask_422, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  130|  15.4k|    c->w_mask[2] = BF(dav1d_w_mask_420, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  131|  15.4k|    c->blend = BF(dav1d_blend, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  132|  15.4k|    c->blend_v = BF(dav1d_blend_v, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  133|  15.4k|    c->blend_h = BF(dav1d_blend_h, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  134|  15.4k|    c->warp8x8  = BF(dav1d_warp_affine_8x8, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  135|  15.4k|    c->warp8x8t = BF(dav1d_warp_affine_8x8t, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  136|  15.4k|    c->emu_edge = BF(dav1d_emu_edge, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  137|  15.4k|    c->resize = BF(dav1d_resize, ssse3);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  138|       |
  139|  15.4k|    if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))
  ------------------
  |  Branch (139:8): [True: 0, False: 15.4k]
  ------------------
  140|      0|        return;
  141|       |
  142|  15.4k|#if BITDEPTH == 8
  143|  15.4k|    c->warp8x8  = BF(dav1d_warp_affine_8x8, sse4);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  144|  15.4k|    c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse4);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  145|  15.4k|#endif
  146|       |
  147|  15.4k|#if ARCH_X86_64
  148|  15.4k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
  ------------------
  |  Branch (148:9): [True: 0, False: 15.4k]
  ------------------
  149|      0|        return;
  150|       |
  151|  15.4k|    init_8tap_fns(avx2);
  ------------------
  |  |  143|  15.4k|    init_8tap_gen(mc,  opt); \
  |  |  ------------------
  |  |  |  |  132|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  133|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  134|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  135|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  136|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  137|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  138|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  139|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  140|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          opt)
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  144|  15.4k|    init_8tap_gen(mct, opt)
  |  |  ------------------
  |  |  |  |  132|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  133|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  134|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  135|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  136|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  137|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  138|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  139|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  140|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          opt)
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  152|       |
  153|  15.4k|    init_mc_fn(FILTER_2D_BILINEAR,            bilin,               avx2);
  ------------------
  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  154|  15.4k|    init_mct_fn(FILTER_2D_BILINEAR,           bilin,               avx2);
  ------------------
  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  155|       |
  156|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        avx2);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  157|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  158|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  avx2);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  159|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  160|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         avx2);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  161|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   avx2);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  162|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  avx2);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  163|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   avx2);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  164|  15.4k|    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          avx2);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  165|  15.4k|    init_mc_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               avx2);
  ------------------
  |  |   40|  15.4k|    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  166|       |
  167|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        avx2);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  168|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  169|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  avx2);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  170|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  171|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         avx2);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  172|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   avx2);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  173|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  avx2);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  174|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   avx2);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  175|  15.4k|    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          avx2);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  176|  15.4k|    init_mct_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               avx2);
  ------------------
  |  |   42|  15.4k|    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  177|       |
  178|  15.4k|    c->avg = BF(dav1d_avg, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  179|  15.4k|    c->w_avg = BF(dav1d_w_avg, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  180|  15.4k|    c->mask = BF(dav1d_mask, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  181|  15.4k|    c->w_mask[0] = BF(dav1d_w_mask_444, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  182|  15.4k|    c->w_mask[1] = BF(dav1d_w_mask_422, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  183|  15.4k|    c->w_mask[2] = BF(dav1d_w_mask_420, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  184|  15.4k|    c->blend = BF(dav1d_blend, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  185|  15.4k|    c->blend_v = BF(dav1d_blend_v, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  186|  15.4k|    c->blend_h = BF(dav1d_blend_h, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  187|  15.4k|    c->warp8x8  = BF(dav1d_warp_affine_8x8, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  188|  15.4k|    c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  189|  15.4k|    c->emu_edge = BF(dav1d_emu_edge, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  190|  15.4k|    c->resize = BF(dav1d_resize, avx2);
  ------------------
  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  191|       |
  192|  15.4k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
  ------------------
  |  Branch (192:9): [True: 15.4k, False: 0]
  ------------------
  193|  15.4k|        return;
  194|       |
  195|  15.4k|    init_8tap_fns(avx512icl);
  ------------------
  |  |  143|      0|    init_8tap_gen(mc,  opt); \
  |  |  ------------------
  |  |  |  |  132|      0|    init_##name##_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  133|      0|    init_##name##_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|      0|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  134|      0|    init_##name##_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|      0|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  135|      0|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|      0|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  136|      0|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|      0|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  137|      0|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|      0|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  138|      0|    init_##name##_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|      0|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  139|      0|    init_##name##_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|      0|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  140|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          opt)
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  15.4k|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  144|      0|    init_8tap_gen(mct, opt)
  |  |  ------------------
  |  |  |  |  132|      0|    init_##name##_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|      0|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  133|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|      0|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  134|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|      0|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  135|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|      0|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  136|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|      0|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  137|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|      0|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  138|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|      0|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  139|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   opt); \
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|      0|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  140|  15.4k|    init_##name##_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          opt)
  |  |  |  |  ------------------
  |  |  |  |  |  |   38|  15.4k|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   52|  15.4k|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  196|       |
  197|      0|    init_mc_fn (FILTER_2D_BILINEAR,            bilin,               avx512icl);
  ------------------
  |  |   36|      0|    c->mc[type] = BF(dav1d_put_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  198|      0|    init_mct_fn(FILTER_2D_BILINEAR,            bilin,               avx512icl);
  ------------------
  |  |   38|      0|    c->mct[type] = BF(dav1d_prep_##name, suffix)
  |  |  ------------------
  |  |  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  |  |  ------------------
  ------------------
  199|       |
  200|      0|    c->avg = BF(dav1d_avg, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  201|      0|    c->w_avg = BF(dav1d_w_avg, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  202|      0|    c->mask = BF(dav1d_mask, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  203|      0|    c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  204|      0|    c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  205|      0|    c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  206|      0|    c->blend = BF(dav1d_blend, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  207|      0|    c->blend_v = BF(dav1d_blend_v, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  208|      0|    c->blend_h = BF(dav1d_blend_h, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  209|       |
  210|      0|    if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
  ------------------
  |  Branch (210:9): [True: 0, False: 0]
  ------------------
  211|      0|        c->resize = BF(dav1d_resize, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  212|      0|        c->warp8x8  = BF(dav1d_warp_affine_8x8, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  213|      0|        c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl);
  ------------------
  |  |   52|      0|#define BF(x, suffix) x##_8bpc_##suffix
  ------------------
  214|      0|    }
  215|      0|#endif
  216|      0|}

msac.c:msac_init_x86:
   59|  32.2k|static ALWAYS_INLINE void msac_init_x86(MsacContext *const s) {
   60|  32.2k|    const unsigned flags = dav1d_get_cpu_flags();
   61|       |
   62|  32.2k|    if (flags & DAV1D_X86_CPU_FLAG_SSE2) {
  ------------------
  |  Branch (62:9): [True: 32.2k, False: 0]
  ------------------
   63|  32.2k|        s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
   64|  32.2k|    }
   65|       |
   66|  32.2k|    if (flags & DAV1D_X86_CPU_FLAG_AVX2) {
  ------------------
  |  Branch (66:9): [True: 32.2k, False: 0]
  ------------------
   67|  32.2k|        s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
   68|  32.2k|    }
   69|  32.2k|}

pal.c:pal_dsp_init_x86:
   34|  17.2k|static ALWAYS_INLINE void pal_dsp_init_x86(Dav1dPalDSPContext *const c) {
   35|  17.2k|    const unsigned flags = dav1d_get_cpu_flags();
   36|       |
   37|  17.2k|    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
  ------------------
  |  Branch (37:9): [True: 0, False: 17.2k]
  ------------------
   38|       |
   39|  17.2k|    c->pal_idx_finish = dav1d_pal_idx_finish_ssse3;
   40|       |
   41|  17.2k|#if ARCH_X86_64
   42|  17.2k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
  ------------------
  |  Branch (42:9): [True: 0, False: 17.2k]
  ------------------
   43|       |
   44|  17.2k|    c->pal_idx_finish = dav1d_pal_idx_finish_avx2;
   45|       |
   46|  17.2k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
  ------------------
  |  Branch (46:9): [True: 17.2k, False: 0]
  ------------------
   47|       |
   48|      0|    c->pal_idx_finish = dav1d_pal_idx_finish_avx512icl;
   49|      0|#endif
   50|      0|}

refmvs.c:refmvs_dsp_init_x86:
   41|  17.2k|static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {
   42|  17.2k|    const unsigned flags = dav1d_get_cpu_flags();
   43|       |
   44|  17.2k|    if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
  ------------------
  |  Branch (44:9): [True: 0, False: 17.2k]
  ------------------
   45|       |
   46|  17.2k|    c->splat_mv = dav1d_splat_mv_sse2;
   47|       |
   48|  17.2k|    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
  ------------------
  |  Branch (48:9): [True: 0, False: 17.2k]
  ------------------
   49|       |
   50|  17.2k|    c->save_tmvs = dav1d_save_tmvs_ssse3;
   51|       |
   52|  17.2k|    if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
  ------------------
  |  Branch (52:9): [True: 0, False: 17.2k]
  ------------------
   53|  17.2k|#if ARCH_X86_64
   54|  17.2k|    c->load_tmvs = dav1d_load_tmvs_sse4;
   55|       |
   56|  17.2k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
  ------------------
  |  Branch (56:9): [True: 0, False: 17.2k]
  ------------------
   57|       |
   58|  17.2k|    c->save_tmvs = dav1d_save_tmvs_avx2;
   59|  17.2k|    c->splat_mv = dav1d_splat_mv_avx2;
   60|       |
   61|  17.2k|    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
  ------------------
  |  Branch (61:9): [True: 17.2k, False: 0]
  ------------------
   62|       |
   63|      0|    c->save_tmvs = dav1d_save_tmvs_avx512icl;
   64|      0|    c->splat_mv = dav1d_splat_mv_avx512icl;
   65|      0|#endif
   66|      0|}

cpu_id.cc:_ZN6libyuvL11SetCpuFlagsEi:
  127|      1|static __inline void SetCpuFlags(int cpu_flags) {
  128|      1|  LIBYUV_API extern int cpu_info_;
  129|      1|#ifdef __ATOMIC_RELAXED
  130|      1|  __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED);
  131|       |#else
  132|       |  cpu_info_ = cpu_flags;
  133|       |#endif
  134|      1|}
planar_functions.cc:_ZN6libyuvL11TestCpuFlagEi:
   77|  1.56k|static __inline int TestCpuFlag(int test_flag) {
   78|  1.56k|  LIBYUV_API extern int cpu_info_;
   79|  1.56k|#ifdef __ATOMIC_RELAXED
   80|  1.56k|  int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED);
   81|       |#else
   82|       |  int cpu_info = cpu_info_;
   83|       |#endif
   84|  1.56k|  return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag;
  ------------------
  |  Branch (84:11): [True: 0, False: 1.56k]
  ------------------
   85|  1.56k|}
scale.cc:_ZN6libyuvL11TestCpuFlagEi:
   77|  13.1k|static __inline int TestCpuFlag(int test_flag) {
   78|  13.1k|  LIBYUV_API extern int cpu_info_;
   79|  13.1k|#ifdef __ATOMIC_RELAXED
   80|  13.1k|  int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED);
   81|       |#else
   82|       |  int cpu_info = cpu_info_;
   83|       |#endif
   84|  13.1k|  return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag;
  ------------------
  |  Branch (84:11): [True: 0, False: 13.1k]
  ------------------
   85|  13.1k|}
scale_common.cc:_ZN6libyuvL11TestCpuFlagEi:
   77|  2.50k|static __inline int TestCpuFlag(int test_flag) {
   78|  2.50k|  LIBYUV_API extern int cpu_info_;
   79|  2.50k|#ifdef __ATOMIC_RELAXED
   80|  2.50k|  int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED);
   81|       |#else
   82|       |  int cpu_info = cpu_info_;
   83|       |#endif
   84|  2.50k|  return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag;
  ------------------
  |  Branch (84:11): [True: 1, False: 2.49k]
  ------------------
   85|  2.50k|}

CpuId:
   64|      5|void CpuId(int info_eax, int info_ecx, int* cpu_info) {
   65|       |#if defined(_MSC_VER)
   66|       |// Visual C version uses intrinsic or inline x86 assembly.
   67|       |#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
   68|       |  __cpuidex(cpu_info, info_eax, info_ecx);
   69|       |#elif defined(_M_IX86)
   70|       |  __asm {
   71|       |    mov        eax, info_eax
   72|       |    mov        ecx, info_ecx
   73|       |    mov        edi, cpu_info
   74|       |    cpuid
   75|       |    mov        [edi], eax
   76|       |    mov        [edi + 4], ebx
   77|       |    mov        [edi + 8], ecx
   78|       |    mov        [edi + 12], edx
   79|       |  }
   80|       |#else  // Visual C but not x86
   81|       |  if (info_ecx == 0) {
   82|       |    __cpuid(cpu_info, info_eax);
   83|       |  } else {
   84|       |    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u;
   85|       |  }
   86|       |#endif
   87|       |// GCC version uses inline x86 assembly.
   88|       |#else  // defined(_MSC_VER)
   89|      5|  int info_ebx, info_edx;
   90|      5|  asm volatile(
   91|       |#if defined(__i386__) && defined(__PIC__)
   92|       |      // Preserve ebx for fpic 32 bit.
   93|       |      "mov         %%ebx, %%edi                  \n"
   94|       |      "cpuid                                     \n"
   95|       |      "xchg        %%edi, %%ebx                  \n"
   96|       |      : "=D"(info_ebx),
   97|       |#else
   98|      5|      "cpuid                                     \n"
   99|      5|      : "=b"(info_ebx),
  100|      5|#endif  //  defined( __i386__) && defined(__PIC__)
  101|      5|        "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx));
  102|      5|  cpu_info[0] = info_eax;
  103|      5|  cpu_info[1] = info_ebx;
  104|      5|  cpu_info[2] = info_ecx;
  105|      5|  cpu_info[3] = info_edx;
  106|      5|#endif  // defined(_MSC_VER)
  107|      5|}
MaskCpuFlags:
  482|      1|int MaskCpuFlags(int enable_flags) {
  483|      1|  int cpu_info = GetCpuFlags() & enable_flags;
  484|      1|  SetCpuFlags(cpu_info);
  485|      1|  return cpu_info;
  486|      1|}
InitCpuFlags:
  489|      1|int InitCpuFlags(void) {
  490|      1|  return MaskCpuFlags(-1);
  491|      1|}
cpu_id.cc:_ZN6libyuvL11GetCpuFlagsEv:
  385|      1|static SAFEBUFFERS int GetCpuFlags(void) {
  386|      1|  int cpu_info = 0;
  387|      1|#if !defined(__pnacl__) && !defined(__CLR_VER) &&                   \
  388|      1|    (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
  389|      1|     defined(_M_IX86))
  390|      1|  int cpu_info0[4] = {0, 0, 0, 0};
  391|      1|  int cpu_info1[4] = {0, 0, 0, 0};
  392|      1|  int cpu_info7[4] = {0, 0, 0, 0};
  393|      1|  int cpu_einfo7[4] = {0, 0, 0, 0};
  394|      1|  int cpu_info24[4] = {0, 0, 0, 0};
  395|      1|  int cpu_amdinfo21[4] = {0, 0, 0, 0};
  396|      1|  CpuId(0, 0, cpu_info0);
  397|      1|  CpuId(1, 0, cpu_info1);
  398|      1|  if (cpu_info0[0] >= 7) {
  ------------------
  |  Branch (398:7): [True: 1, False: 0]
  ------------------
  399|      1|    CpuId(7, 0, cpu_info7);
  400|      1|    CpuId(7, 1, cpu_einfo7);
  401|      1|    CpuId(0x80000021, 0, cpu_amdinfo21);
  402|      1|  }
  403|      1|  if (cpu_info0[0] >= 0x24) {
  ------------------
  |  Branch (403:7): [True: 0, False: 1]
  ------------------
  404|      0|    CpuId(0x24, 0, cpu_info24);
  405|      0|  }
  406|      1|  cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
  ------------------
  |  Branch (406:28): [True: 1, False: 0]
  ------------------
  407|      1|             ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
  ------------------
  |  Branch (407:15): [True: 1, False: 0]
  ------------------
  408|      1|             ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
  ------------------
  |  Branch (408:15): [True: 1, False: 0]
  ------------------
  409|      1|             ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
  ------------------
  |  Branch (409:15): [True: 1, False: 0]
  ------------------
  410|      1|             ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
  ------------------
  |  Branch (410:15): [True: 0, False: 1]
  ------------------
  411|      1|             ((cpu_info7[3] & 0x00000010) ? kCpuHasFSMR : 0);
  ------------------
  |  Branch (411:15): [True: 0, False: 1]
  ------------------
  412|       |
  413|       |  // AVX requires OS saves YMM registers.
  414|      1|  if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) &&  // AVX and OSXSave
  ------------------
  |  Branch (414:7): [True: 1, False: 0]
  ------------------
  415|      1|      ((GetXCR0() & 6) == 6)) {  // Test OS saves YMM registers
  ------------------
  |  Branch (415:7): [True: 1, False: 0]
  ------------------
  416|      1|    cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
  ------------------
  |  Branch (416:31): [True: 1, False: 0]
  ------------------
  417|      1|                ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
  ------------------
  |  Branch (417:18): [True: 1, False: 0]
  ------------------
  418|      1|                ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0) |
  ------------------
  |  Branch (418:18): [True: 1, False: 0]
  ------------------
  419|      1|                ((cpu_einfo7[0] & 0x00000010) ? kCpuHasAVXVNNI : 0) |
  ------------------
  |  Branch (419:18): [True: 0, False: 1]
  ------------------
  420|      1|                ((cpu_einfo7[3] & 0x00000010) ? kCpuHasAVXVNNIINT8 : 0);
  ------------------
  |  Branch (420:18): [True: 0, False: 1]
  ------------------
  421|       |
  422|      1|    cpu_info |= ((cpu_amdinfo21[0] & 0x00008000) ? kCpuHasERMS : 0);
  ------------------
  |  Branch (422:18): [True: 0, False: 1]
  ------------------
  423|       |
  424|       |    // Detect AVX512bw
  425|      1|    if ((GetXCR0() & 0xe0) == 0xe0) {
  ------------------
  |  Branch (425:9): [True: 0, False: 1]
  ------------------
  426|      0|      cpu_info |= ((cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0) |
  ------------------
  |  Branch (426:20): [True: 0, False: 0]
  ------------------
  427|      0|                  ((cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0) |
  ------------------
  |  Branch (427:20): [True: 0, False: 0]
  ------------------
  428|      0|                  ((cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0) |
  ------------------
  |  Branch (428:20): [True: 0, False: 0]
  ------------------
  429|      0|                  ((cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0) |
  ------------------
  |  Branch (429:20): [True: 0, False: 0]
  ------------------
  430|      0|                  ((cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0) |
  ------------------
  |  Branch (430:20): [True: 0, False: 0]
  ------------------
  431|      0|                  ((cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0) |
  ------------------
  |  Branch (431:20): [True: 0, False: 0]
  ------------------
  432|      0|                  ((cpu_einfo7[3] & 0x00080000) ? kCpuHasAVX10 : 0) |
  ------------------
  |  Branch (432:20): [True: 0, False: 0]
  ------------------
  433|      0|                  ((cpu_info7[3] & 0x02000000) ? kCpuHasAMXINT8 : 0);
  ------------------
  |  Branch (433:20): [True: 0, False: 0]
  ------------------
  434|      0|      if (cpu_info0[0] >= 0x24 && (cpu_einfo7[3] & 0x00080000)) {
  ------------------
  |  Branch (434:11): [True: 0, False: 0]
  |  Branch (434:35): [True: 0, False: 0]
  ------------------
  435|      0|        cpu_info |= ((cpu_info24[1] & 0xFF) >= 2) ? kCpuHasAVX10_2 : 0;
  ------------------
  |  Branch (435:21): [True: 0, False: 0]
  ------------------
  436|      0|      }
  437|      0|    }
  438|      1|  }
  439|      1|#endif
  440|       |#if defined(__loongarch__) && defined(__linux__)
  441|       |  cpu_info = LoongArchCpuCaps();
  442|       |  cpu_info |= kCpuHasLOONGARCH;
  443|       |#endif
  444|       |#if defined(__aarch64__)
  445|       |#if defined(__linux__)
  446|       |  // getauxval is supported since Android SDK version 18, minimum at time of
  447|       |  // writing is 21, so should be safe to always use this. If getauxval is
  448|       |  // somehow disabled then getauxval returns 0, which will leave Neon enabled
  449|       |  // since Neon is mandatory on AArch64.
  450|       |  unsigned long hwcap = getauxval(AT_HWCAP);
  451|       |  unsigned long hwcap2 = getauxval(AT_HWCAP2);
  452|       |  cpu_info = AArch64CpuCaps(hwcap, hwcap2);
  453|       |#else
  454|       |  cpu_info = AArch64CpuCaps();
  455|       |#endif
  456|       |  cpu_info |= kCpuHasARM;
  457|       |#endif  // __aarch64__
  458|       |#if defined(__arm__)
  459|       |  // gcc -mfpu=neon defines __ARM_NEON__
  460|       |  // __ARM_NEON__ generates code that requires Neon.  NaCL also requires Neon.
  461|       |  // For Linux, /proc/cpuinfo can be tested but without that assume Neon.
  462|       |  // Linux arm parse text file for neon detect.
  463|       |#if defined(__linux__)
  464|       |  cpu_info = ArmCpuCaps("/proc/cpuinfo");
  465|       |#elif defined(__ARM_NEON__)
  466|       |  cpu_info = kCpuHasNEON;
  467|       |#else
  468|       |  cpu_info = 0;
  469|       |#endif
  470|       |  cpu_info |= kCpuHasARM;
  471|       |#endif  // __arm__
  472|       |#if defined(__riscv) && defined(__linux__)
  473|       |  cpu_info = RiscvCpuCaps("/proc/cpuinfo");
  474|       |  cpu_info |= kCpuHasRISCV;
  475|       |#endif  // __riscv
  476|      1|  cpu_info |= kCpuInitialized;
  477|      1|  return cpu_info;
  478|      1|}
cpu_id.cc:_ZN6libyuvL7GetXCR0Ev:
  133|      2|static int GetXCR0() {
  134|      2|  int xcr0 = 0;
  135|       |#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
  136|       |  xcr0 = (int)_xgetbv(0);  // VS2010 SP1 required.  NOLINT
  137|       |#elif defined(__i386__) || defined(__x86_64__)
  138|       |  asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
  139|      2|#endif  // defined(__i386__) || defined(__x86_64__)
  140|      2|  return xcr0;
  141|      2|}

CopyPlane:
   32|    392|               int height) {
   33|    392|  int y;
   34|    392|  void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
   35|    392|  if (width <= 0 || height == 0) {
  ------------------
  |  Branch (35:7): [True: 0, False: 392]
  |  Branch (35:21): [True: 0, False: 392]
  ------------------
   36|      0|    return;
   37|      0|  }
   38|       |  // Negative height means invert the image.
   39|    392|  if (height < 0) {
  ------------------
  |  Branch (39:7): [True: 0, False: 392]
  ------------------
   40|      0|    height = -height;
   41|      0|    dst_y = dst_y + (height - 1) * dst_stride_y;
   42|      0|    dst_stride_y = -dst_stride_y;
   43|      0|  }
   44|       |  // Coalesce rows.
   45|    392|  if (src_stride_y == width && dst_stride_y == width) {
  ------------------
  |  Branch (45:7): [True: 0, False: 392]
  |  Branch (45:32): [True: 0, False: 0]
  ------------------
   46|      0|    width *= height;
   47|      0|    height = 1;
   48|      0|    src_stride_y = dst_stride_y = 0;
   49|      0|  }
   50|       |  // Nothing to do.
   51|    392|  if (src_y == dst_y && src_stride_y == dst_stride_y) {
  ------------------
  |  Branch (51:7): [True: 0, False: 392]
  |  Branch (51:25): [True: 0, False: 0]
  ------------------
   52|      0|    return;
   53|      0|  }
   54|       |
   55|    392|#if defined(HAS_COPYROW_SSE2)
   56|    392|  if (TestCpuFlag(kCpuHasSSE2)) {
  ------------------
  |  Branch (56:7): [True: 392, False: 0]
  ------------------
   57|    392|    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
  ------------------
  |  |  999|    392|#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
  |  |  ------------------
  |  |  |  Branch (999:26): [True: 12, False: 380]
  |  |  ------------------
  ------------------
   58|    392|  }
   59|    392|#endif
   60|    392|#if defined(HAS_COPYROW_AVX)
   61|    392|  if (TestCpuFlag(kCpuHasAVX)) {
  ------------------
  |  Branch (61:7): [True: 392, False: 0]
  ------------------
   62|    392|    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
  ------------------
  |  |  999|    392|#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
  |  |  ------------------
  |  |  |  Branch (999:26): [True: 12, False: 380]
  |  |  ------------------
  ------------------
   63|    392|  }
   64|    392|#endif
   65|    392|#if defined(HAS_COPYROW_AVX512BW)
   66|    392|  if (TestCpuFlag(kCpuHasAVX512BW)) {
  ------------------
  |  Branch (66:7): [True: 0, False: 392]
  ------------------
   67|      0|    CopyRow = IS_ALIGNED(width, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW;
  ------------------
  |  |  999|      0|#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
  |  |  ------------------
  |  |  |  Branch (999:26): [True: 0, False: 0]
  |  |  ------------------
  ------------------
   68|      0|  }
   69|    392|#endif
   70|    392|#if defined(HAS_COPYROW_ERMS)
   71|    392|  if (TestCpuFlag(kCpuHasERMS)) {
  ------------------
  |  Branch (71:7): [True: 0, False: 392]
  ------------------
   72|      0|    CopyRow = CopyRow_ERMS;
   73|      0|  }
   74|    392|#endif
   75|       |#if defined(HAS_COPYROW_NEON)
   76|       |  if (TestCpuFlag(kCpuHasNEON)) {
   77|       |    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   78|       |  }
   79|       |#endif
   80|       |#if defined(HAS_COPYROW_SME)
   81|       |  if (TestCpuFlag(kCpuHasSME)) {
   82|       |    CopyRow = CopyRow_SME;
   83|       |  }
   84|       |#endif
   85|       |#if defined(HAS_COPYROW_RVV)
   86|       |  if (TestCpuFlag(kCpuHasRVV)) {
   87|       |    CopyRow = CopyRow_RVV;
   88|       |  }
   89|       |#endif
   90|       |
   91|       |  // Copy plane
   92|  26.6k|  for (y = 0; y < height; ++y) {
  ------------------
  |  Branch (92:15): [True: 26.2k, False: 392]
  ------------------
   93|  26.2k|    CopyRow(src_y, dst_y, width);
   94|  26.2k|    src_y += src_stride_y;
   95|  26.2k|    dst_y += dst_stride_y;
   96|  26.2k|  }
   97|    392|}
CopyPlane_16:
  105|    224|                  int height) {
  106|    224|  CopyPlane((const uint8_t*)src_y, src_stride_y * 2, (uint8_t*)dst_y,
  107|    224|            dst_stride_y * 2, width * 2, height);
  108|    224|}

CopyRow_Any_AVX:
  919|  25.4k|  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {     \
  920|  25.4k|    SIMD_ALIGNED(uint8_t vin[128]);                                       \
  ------------------
  |  |  938|  25.4k|#define SIMD_ALIGNED(var) var __attribute__((aligned(32)))
  ------------------
  921|  25.4k|    SIMD_ALIGNED(uint8_t vout[128]);                                      \
  ------------------
  |  |  938|  25.4k|#define SIMD_ALIGNED(var) var __attribute__((aligned(32)))
  ------------------
  922|  25.4k|    memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */                  \
  923|  25.4k|    int r = width & MASK;                                                 \
  924|  25.4k|    int n = width & ~MASK;                                                \
  925|  25.4k|    if (n > 0) {                                                          \
  ------------------
  |  Branch (925:9): [True: 25.2k, False: 244]
  ------------------
  926|  25.2k|      ANY_SIMD(src_ptr, dst_ptr, n);                                      \
  927|  25.2k|    }                                                                     \
  928|  25.4k|    ptrdiff_t np = n;                                                     \
  929|  25.4k|    memcpy(vin, src_ptr + (np >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
  ------------------
  |  |   32|  25.4k|#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
  ------------------
  930|  25.4k|    ANY_SIMD(vin, vout, MASK + 1);                                        \
  931|  25.4k|    memcpy(dst_ptr + np * BPP, vout, r * BPP);                            \
  932|  25.4k|  }
InterpolateRow_Any_AVX2:
 1800|   220k|               int width, int source_y_fraction) {                   \
 1801|   220k|    SIMD_ALIGNED(TS vin[64 * 2]);                                    \
  ------------------
  |  |  938|   220k|#define SIMD_ALIGNED(var) var __attribute__((aligned(32)))
  ------------------
 1802|   220k|    SIMD_ALIGNED(TD vout[64]);                                       \
  ------------------
  |  |  938|   220k|#define SIMD_ALIGNED(var) var __attribute__((aligned(32)))
  ------------------
 1803|   220k|    memset(vin, 0, sizeof(vin)); /* for msan */                      \
 1804|   220k|    int r = width & MASK;                                            \
 1805|   220k|    int n = width & ~MASK;                                           \
 1806|   220k|    if (n > 0) {                                                     \
  ------------------
  |  Branch (1806:9): [True: 193k, False: 27.1k]
  ------------------
 1807|   193k|      ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction);  \
 1808|   193k|    }                                                                \
 1809|   220k|    ptrdiff_t np = n;                                                \
 1810|   220k|    memcpy(vin, src_ptr + np * SBPP, r * SBPP * sizeof(TS));         \
 1811|   220k|    if (source_y_fraction) {                                         \
  ------------------
  |  Branch (1811:9): [True: 205k, False: 14.8k]
  ------------------
 1812|   205k|      memcpy(vin + 64, src_ptr + src_stride + np * SBPP,             \
 1813|   205k|             r * SBPP * sizeof(TS));                                 \
 1814|   205k|    }                                                                \
 1815|   220k|    ANY_SIMD(vout, vin, 64, MASK + 1, source_y_fraction);            \
 1816|   220k|    memcpy(dst_ptr + np * BPP, vout, r * BPP * sizeof(TD));          \
 1817|   220k|  }

InterpolateRow_16_C:
 3543|   277k|                         int source_y_fraction) {
 3544|   277k|  int y1_fraction = source_y_fraction;
 3545|   277k|  int y0_fraction = 256 - y1_fraction;
 3546|   277k|  const uint16_t* src_ptr1 = src_ptr + src_stride;
 3547|   277k|  int x;
 3548|   277k|  assert(source_y_fraction >= 0);
 3549|   277k|  assert(source_y_fraction < 256);
 3550|       |
 3551|   277k|  if (y1_fraction == 0) {
  ------------------
  |  Branch (3551:7): [True: 16.3k, False: 260k]
  ------------------
 3552|  16.3k|    memcpy(dst_ptr, src_ptr, width * 2);
 3553|  16.3k|    return;
 3554|  16.3k|  }
 3555|   260k|  if (y1_fraction == 128) {
  ------------------
  |  Branch (3555:7): [True: 2.12k, False: 258k]
  ------------------
 3556|  2.12k|    HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
 3557|  2.12k|    return;
 3558|  2.12k|  }
 3559|  27.8M|  for (x = 0; x < width; ++x) {
  ------------------
  |  Branch (3559:15): [True: 27.5M, False: 258k]
  ------------------
 3560|  27.5M|    dst_ptr[0] = STATIC_CAST(
  ------------------
  |  |   25|  27.5M|#define STATIC_CAST(type, expr) static_cast<type>(expr)
  ------------------
 3561|  27.5M|        uint16_t,
 3562|  27.5M|        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8);
 3563|  27.5M|    ++src_ptr;
 3564|  27.5M|    ++src_ptr1;
 3565|  27.5M|    ++dst_ptr;
 3566|  27.5M|  }
 3567|   258k|}
row_common.cc:_ZN6libyuvL12HalfRow_16_CEPKtlPti:
 3487|  2.12k|                         int width) {
 3488|  2.12k|  int x;
 3489|   158k|  for (x = 0; x < width; ++x) {
  ------------------
  |  Branch (3489:15): [True: 156k, False: 2.12k]
  ------------------
 3490|   156k|    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
 3491|   156k|  }
 3492|  2.12k|}

CopyRow_AVX:
 6314|  51.4k|void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
 6315|  51.4k|  asm volatile(
 6316|  51.4k|      "1:          \n"
 6317|  51.4k|      "vmovdqu     (%0),%%ymm0                   \n"
 6318|  51.4k|      "vmovdqu     0x20(%0),%%ymm1               \n"
 6319|  51.4k|      "lea         0x40(%0),%0                   \n"
 6320|  51.4k|      "vmovdqu     %%ymm0,(%1)                   \n"
 6321|  51.4k|      "vmovdqu     %%ymm1,0x20(%1)               \n"
 6322|  51.4k|      "lea         0x40(%1),%1                   \n"
 6323|  51.4k|      "sub         $0x40,%2                      \n"
 6324|  51.4k|      "jg          1b                            \n"
 6325|  51.4k|      "vzeroupper  \n"
 6326|  51.4k|      : "+r"(src),   // %0
 6327|  51.4k|        "+r"(dst),   // %1
 6328|  51.4k|        "+r"(width)  // %2
 6329|  51.4k|      :
 6330|  51.4k|      : "memory", "cc", "xmm0", "xmm1");
 6331|  51.4k|}
InterpolateRow_AVX2:
 8554|   447k|                         int source_y_fraction) {
 8555|   447k|  asm volatile(
 8556|   447k|      "sub         %1,%0                         \n"
 8557|   447k|      "cmp         $0x0,%3                       \n"
 8558|   447k|      "je          100f                          \n"
 8559|   447k|      "cmp         $0x80,%3                      \n"
 8560|   447k|      "je          50f                           \n"
 8561|       |
 8562|   447k|      "vmovd       %3,%%xmm0                     \n"
 8563|   447k|      "neg         %3                            \n"
 8564|   447k|      "add         $0x100,%3                     \n"
 8565|   447k|      "vmovd       %3,%%xmm5                     \n"
 8566|   447k|      "vpunpcklbw  %%xmm0,%%xmm5,%%xmm5          \n"
 8567|   447k|      "vpunpcklwd  %%xmm5,%%xmm5,%%xmm5          \n"
 8568|   447k|      "vbroadcastss %%xmm5,%%ymm5                \n"
 8569|   447k|      "mov         $0x80808080,%%eax             \n"
 8570|   447k|      "vmovd       %%eax,%%xmm4                  \n"
 8571|   447k|      "vbroadcastss %%xmm4,%%ymm4                \n"
 8572|       |
 8573|       |      // General purpose row blend.
 8574|   447k|      LABELALIGN
 8575|   447k|      "1:          \n"
 8576|   447k|      "vmovdqu     (%1),%%ymm0                   \n"
 8577|   447k|      "vmovdqu     0x00(%1,%4,1),%%ymm2          \n"
 8578|   447k|      "vpunpckhbw  %%ymm2,%%ymm0,%%ymm1          \n"
 8579|   447k|      "vpunpcklbw  %%ymm2,%%ymm0,%%ymm0          \n"
 8580|   447k|      "vpsubb      %%ymm4,%%ymm1,%%ymm1          \n"
 8581|   447k|      "vpsubb      %%ymm4,%%ymm0,%%ymm0          \n"
 8582|   447k|      "vpmaddubsw  %%ymm1,%%ymm5,%%ymm1          \n"
 8583|   447k|      "vpmaddubsw  %%ymm0,%%ymm5,%%ymm0          \n"
 8584|   447k|      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"
 8585|   447k|      "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"
 8586|   447k|      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
 8587|   447k|      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
 8588|   447k|      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
 8589|   447k|      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
 8590|   447k|      "lea         0x20(%1),%1                   \n"
 8591|   447k|      "sub         $0x20,%2                      \n"
 8592|   447k|      "jg          1b                            \n"
 8593|   447k|      "jmp         99f                           \n"
 8594|       |
 8595|       |      // Blend 50 / 50.
 8596|   447k|      LABELALIGN
 8597|   447k|      "50:         \n"
 8598|   447k|      "vmovdqu     (%1),%%ymm0                   \n"
 8599|   447k|      "vpavgb      0x00(%1,%4,1),%%ymm0,%%ymm0   \n"
 8600|   447k|      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
 8601|   447k|      "lea         0x20(%1),%1                   \n"
 8602|   447k|      "sub         $0x20,%2                      \n"
 8603|   447k|      "jg          50b                           \n"
 8604|   447k|      "jmp         99f                           \n"
 8605|       |
 8606|       |      // Blend 100 / 0 - Copy row unchanged.
 8607|   447k|      LABELALIGN
 8608|   447k|      "100:        \n"
 8609|   447k|      "vmovdqu     (%1),%%ymm0                   \n"
 8610|   447k|      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
 8611|   447k|      "lea         0x20(%1),%1                   \n"
 8612|   447k|      "sub         $0x20,%2                      \n"
 8613|   447k|      "jg          100b                          \n"
 8614|       |
 8615|   447k|      "99:         \n"
 8616|   447k|      "vzeroupper  \n"
 8617|   447k|      : "+r"(dst_ptr),               // %0
 8618|   447k|        "+r"(src_ptr),               // %1
 8619|   447k|        "+r"(width),                 // %2
 8620|   447k|        "+r"(source_y_fraction)      // %3
 8621|   447k|      : "r"((intptr_t)(src_stride))  // %4
 8622|   447k|      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
 8623|   447k|}

ScalePlane:
 1983|  6.45k|               enum FilterMode filtering) {
 1984|       |  // Simplify filtering when possible.
 1985|  6.45k|  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
 1986|  6.45k|                                filtering);
 1987|       |
 1988|       |  // Negative height means invert the image.
 1989|  6.45k|  if (src_height < 0) {
  ------------------
  |  Branch (1989:7): [True: 0, False: 6.45k]
  ------------------
 1990|      0|    src_height = -src_height;
 1991|      0|    src = src + (src_height - 1) * (int64_t)src_stride;
 1992|      0|    src_stride = -src_stride;
 1993|      0|  }
 1994|       |  // Use specialized scales to improve performance for common resolutions.
 1995|       |  // For example, all the 1/2 scalings will use ScalePlaneDown2()
 1996|  6.45k|  if (dst_width == src_width && dst_height == src_height) {
  ------------------
  |  Branch (1996:7): [True: 1.41k, False: 5.04k]
  |  Branch (1996:33): [True: 168, False: 1.25k]
  ------------------
 1997|       |    // Straight copy.
 1998|    168|    CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
 1999|    168|    return 0;
 2000|    168|  }
 2001|  6.29k|  if (dst_width == src_width && filtering != kFilterBox) {
  ------------------
  |  Branch (2001:7): [True: 1.25k, False: 5.04k]
  |  Branch (2001:33): [True: 1.25k, False: 0]
  ------------------
 2002|  1.25k|    int dy = 0;
 2003|  1.25k|    int y = 0;
 2004|       |    // When scaling down, use the center 2 rows to filter.
 2005|       |    // When scaling up, last row of destination uses the last 2 source rows.
 2006|  1.25k|    if (dst_height <= src_height) {
  ------------------
  |  Branch (2006:9): [True: 439, False: 811]
  ------------------
 2007|    439|      dy = FixedDiv(src_height, dst_height);
  ------------------
  |  |  265|    439|#define FixedDiv FixedDiv_X86
  ------------------
 2008|    439|      y = CENTERSTART(dy, -32768);  // Subtract 0.5 (32768) to center filter.
  ------------------
  |  |   32|    439|#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
  |  |  ------------------
  |  |  |  Branch (32:28): [True: 0, False: 439]
  |  |  ------------------
  ------------------
 2009|    811|    } else if (src_height > 1 && dst_height > 1) {
  ------------------
  |  Branch (2009:16): [True: 753, False: 58]
  |  Branch (2009:34): [True: 753, False: 0]
  ------------------
 2010|    753|      dy = FixedDiv1(src_height, dst_height);
  ------------------
  |  |  266|    753|#define FixedDiv1 FixedDiv1_X86
  ------------------
 2011|    753|    }
 2012|       |    // Arbitrary scale vertically, but unscaled horizontally.
 2013|  1.25k|    ScalePlaneVertical(src_height, dst_width, dst_height, src_stride,
 2014|  1.25k|                       dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering);
 2015|  1.25k|    return 0;
 2016|  1.25k|  }
 2017|  5.04k|  if (dst_width <= Abs(src_width) && dst_height <= src_height) {
  ------------------
  |  Branch (2017:7): [True: 1.38k, False: 3.66k]
  |  Branch (2017:38): [True: 1.01k, False: 361]
  ------------------
 2018|       |    // Scale down.
 2019|  1.01k|    if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
  ------------------
  |  Branch (2019:9): [True: 8, False: 1.01k]
  |  Branch (2019:43): [True: 0, False: 8]
  ------------------
 2020|       |      // optimized, 3/4
 2021|      0|      ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride,
 2022|      0|                       dst_stride, src, dst, filtering);
 2023|      0|      return 0;
 2024|      0|    }
 2025|  1.01k|    if (2 * dst_width == src_width && 2 * dst_height == src_height) {
  ------------------
  |  Branch (2025:9): [True: 47, False: 972]
  |  Branch (2025:39): [True: 14, False: 33]
  ------------------
 2026|       |      // optimized, 1/2
 2027|     14|      ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride,
 2028|     14|                      dst_stride, src, dst, filtering);
 2029|     14|      return 0;
 2030|     14|    }
 2031|       |    // 3/8 rounded up for odd sized chroma height.
 2032|  1.00k|    if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
  ------------------
  |  Branch (2032:9): [True: 0, False: 1.00k]
  |  Branch (2032:43): [True: 0, False: 0]
  ------------------
 2033|       |      // optimized, 3/8
 2034|      0|      ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride,
 2035|      0|                       dst_stride, src, dst, filtering);
 2036|      0|      return 0;
 2037|      0|    }
 2038|  1.00k|    if (4 * dst_width == src_width && 4 * dst_height == src_height &&
  ------------------
  |  Branch (2038:9): [True: 23, False: 982]
  |  Branch (2038:39): [True: 0, False: 23]
  ------------------
 2039|      0|        (filtering == kFilterBox || filtering == kFilterNone)) {
  ------------------
  |  Branch (2039:10): [True: 0, False: 0]
  |  Branch (2039:37): [True: 0, False: 0]
  ------------------
 2040|       |      // optimized, 1/4
 2041|      0|      ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride,
 2042|      0|                      dst_stride, src, dst, filtering);
 2043|      0|      return 0;
 2044|      0|    }
 2045|  1.00k|  }
 2046|  5.02k|  if (filtering == kFilterBox && dst_height * 2 < src_height) {
  ------------------
  |  Branch (2046:7): [True: 170, False: 4.85k]
  |  Branch (2046:34): [True: 170, False: 0]
  ------------------
 2047|    170|    return ScalePlaneBox(src_width, src_height, dst_width, dst_height,
 2048|    170|                         src_stride, dst_stride, src, dst);
 2049|    170|  }
 2050|  4.85k|  if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
  ------------------
  |  Branch (2050:7): [True: 265, False: 4.59k]
  |  Branch (2050:43): [True: 54, False: 211]
  ------------------
 2051|     54|    ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height,
 2052|     54|                         src_stride, dst_stride, src, dst);
 2053|     54|    return 0;
 2054|     54|  }
 2055|  4.80k|  if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
  ------------------
  |  Branch (2055:7): [True: 160, False: 4.64k]
  |  Branch (2055:45): [True: 73, False: 87]
  ------------------
 2056|     73|      (filtering == kFilterBilinear || filtering == kFilterBox)) {
  ------------------
  |  Branch (2056:8): [True: 57, False: 16]
  |  Branch (2056:40): [True: 0, False: 16]
  ------------------
 2057|     57|    ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height,
 2058|     57|                           src_stride, dst_stride, src, dst);
 2059|     57|    return 0;
 2060|     57|  }
 2061|  4.74k|  if (filtering && dst_height > src_height) {
  ------------------
  |  Branch (2061:7): [True: 4.09k, False: 654]
  |  Branch (2061:20): [True: 1.80k, False: 2.28k]
  ------------------
 2062|  1.80k|    return ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
 2063|  1.80k|                                src_stride, dst_stride, src, dst, filtering);
 2064|  1.80k|  }
 2065|  2.94k|  if (filtering) {
  ------------------
  |  Branch (2065:7): [True: 2.28k, False: 654]
  ------------------
 2066|  2.28k|    return ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
 2067|  2.28k|                                  src_stride, dst_stride, src, dst, filtering);
 2068|  2.28k|  }
 2069|    654|  ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride,
 2070|    654|                   dst_stride, src, dst);
 2071|    654|  return 0;
 2072|  2.94k|}
ScalePlane_16:
 2083|  5.14k|                  enum FilterMode filtering) {
 2084|       |  // Simplify filtering when possible.
 2085|  5.14k|  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
 2086|  5.14k|                                filtering);
 2087|       |
 2088|       |  // Negative height means invert the image.
 2089|  5.14k|  if (src_height < 0) {
  ------------------
  |  Branch (2089:7): [True: 0, False: 5.14k]
  ------------------
 2090|      0|    src_height = -src_height;
 2091|      0|    src = src + (src_height - 1) * (int64_t)src_stride;
 2092|      0|    src_stride = -src_stride;
 2093|      0|  }
 2094|       |  // Use specialized scales to improve performance for common resolutions.
 2095|       |  // For example, all the 1/2 scalings will use ScalePlaneDown2()
 2096|  5.14k|  if (dst_width == src_width && dst_height == src_height) {
  ------------------
  |  Branch (2096:7): [True: 1.13k, False: 4.01k]
  |  Branch (2096:33): [True: 224, False: 907]
  ------------------
 2097|       |    // Straight copy.
 2098|    224|    CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
 2099|    224|    return 0;
 2100|    224|  }
 2101|  4.92k|  if (dst_width == src_width && filtering != kFilterBox) {
  ------------------
  |  Branch (2101:7): [True: 907, False: 4.01k]
  |  Branch (2101:33): [True: 907, False: 0]
  ------------------
 2102|    907|    int dy = 0;
 2103|    907|    int y = 0;
 2104|       |    // When scaling down, use the center 2 rows to filter.
 2105|       |    // When scaling up, last row of destination uses the last 2 source rows.
 2106|    907|    if (dst_height <= src_height) {
  ------------------
  |  Branch (2106:9): [True: 331, False: 576]
  ------------------
 2107|    331|      dy = FixedDiv(src_height, dst_height);
  ------------------
  |  |  265|    331|#define FixedDiv FixedDiv_X86
  ------------------
 2108|    331|      y = CENTERSTART(dy, -32768);  // Subtract 0.5 (32768) to center filter.
  ------------------
  |  |   32|    331|#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
  |  |  ------------------
  |  |  |  Branch (32:28): [True: 0, False: 331]
  |  |  ------------------
  ------------------
 2109|       |      // When scaling up, ensure the last row of destination uses the last
 2110|       |      // source. Avoid divide by zero for dst_height but will do no scaling
 2111|       |      // later.
 2112|    576|    } else if (src_height > 1 && dst_height > 1) {
  ------------------
  |  Branch (2112:16): [True: 540, False: 36]
  |  Branch (2112:34): [True: 540, False: 0]
  ------------------
 2113|    540|      dy = FixedDiv1(src_height, dst_height);
  ------------------
  |  |  266|    540|#define FixedDiv1 FixedDiv1_X86
  ------------------
 2114|    540|    }
 2115|       |    // Arbitrary scale vertically, but unscaled horizontally.
 2116|    907|    ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
 2117|    907|                          dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering);
 2118|    907|    return 0;
 2119|    907|  }
 2120|  4.01k|  if (dst_width <= Abs(src_width) && dst_height <= src_height) {
  ------------------
  |  Branch (2120:7): [True: 981, False: 3.03k]
  |  Branch (2120:38): [True: 769, False: 212]
  ------------------
 2121|       |    // Scale down.
 2122|    769|    if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
  ------------------
  |  Branch (2122:9): [True: 8, False: 761]
  |  Branch (2122:43): [True: 0, False: 8]
  ------------------
 2123|       |      // optimized, 3/4
 2124|      0|      ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
 2125|      0|                          src_stride, dst_stride, src, dst, filtering);
 2126|      0|      return 0;
 2127|      0|    }
 2128|    769|    if (2 * dst_width == src_width && 2 * dst_height == src_height) {
  ------------------
  |  Branch (2128:9): [True: 40, False: 729]
  |  Branch (2128:39): [True: 16, False: 24]
  ------------------
 2129|       |      // optimized, 1/2
 2130|     16|      ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
 2131|     16|                         src_stride, dst_stride, src, dst, filtering);
 2132|     16|      return 0;
 2133|     16|    }
 2134|       |    // 3/8 rounded up for odd sized chroma height.
 2135|    753|    if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
  ------------------
  |  Branch (2135:9): [True: 2, False: 751]
  |  Branch (2135:43): [True: 0, False: 2]
  ------------------
 2136|       |      // optimized, 3/8
 2137|      0|      ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
 2138|      0|                          src_stride, dst_stride, src, dst, filtering);
 2139|      0|      return 0;
 2140|      0|    }
 2141|    753|    if (4 * dst_width == src_width && 4 * dst_height == src_height &&
  ------------------
  |  Branch (2141:9): [True: 6, False: 747]
  |  Branch (2141:39): [True: 0, False: 6]
  ------------------
 2142|      0|        (filtering == kFilterBox || filtering == kFilterNone)) {
  ------------------
  |  Branch (2142:10): [True: 0, False: 0]
  |  Branch (2142:37): [True: 0, False: 0]
  ------------------
 2143|       |      // optimized, 1/4
 2144|      0|      ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
 2145|      0|                         src_stride, dst_stride, src, dst, filtering);
 2146|      0|      return 0;
 2147|      0|    }
 2148|    753|  }
 2149|  4.00k|  if (filtering == kFilterBox && dst_height * 2 < src_height) {
  ------------------
  |  Branch (2149:7): [True: 181, False: 3.82k]
  |  Branch (2149:34): [True: 181, False: 0]
  ------------------
 2150|    181|    return ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
 2151|    181|                            src_stride, dst_stride, src, dst);
 2152|    181|  }
 2153|  3.82k|  if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
  ------------------
  |  Branch (2153:7): [True: 263, False: 3.55k]
  |  Branch (2153:43): [True: 0, False: 263]
  ------------------
 2154|      0|    ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height,
 2155|      0|                            src_stride, dst_stride, src, dst);
 2156|      0|    return 0;
 2157|      0|  }
 2158|  3.82k|  if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
  ------------------
  |  Branch (2158:7): [True: 71, False: 3.74k]
  |  Branch (2158:45): [True: 12, False: 59]
  ------------------
 2159|     12|      (filtering == kFilterBilinear || filtering == kFilterBox)) {
  ------------------
  |  Branch (2159:8): [True: 0, False: 12]
  |  Branch (2159:40): [True: 0, False: 12]
  ------------------
 2160|      0|    ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height,
 2161|      0|                              src_stride, dst_stride, src, dst);
 2162|      0|    return 0;
 2163|      0|  }
 2164|  3.82k|  if (filtering && dst_height > src_height) {
  ------------------
  |  Branch (2164:7): [True: 3.35k, False: 464]
  |  Branch (2164:20): [True: 1.81k, False: 1.54k]
  ------------------
 2165|  1.81k|    return ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
 2166|  1.81k|                                   src_stride, dst_stride, src, dst, filtering);
 2167|  1.81k|  }
 2168|  2.00k|  if (filtering) {
  ------------------
  |  Branch (2168:7): [True: 1.54k, False: 464]
  ------------------
 2169|  1.54k|    return ScalePlaneBilinearDown_16(src_width, src_height, dst_width,
 2170|  1.54k|                                     dst_height, src_stride, dst_stride, src,
 2171|  1.54k|                                     dst, filtering);
 2172|  1.54k|  }
 2173|    464|  ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride,
 2174|    464|                      dst_stride, src, dst);
 2175|    464|  return 0;
 2176|  2.00k|}
ScalePlane_12:
 2187|  5.24k|                  enum FilterMode filtering) {
 2188|       |  // Simplify filtering when possible.
 2189|  5.24k|  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
 2190|  5.24k|                                filtering);
 2191|       |
 2192|       |  // Negative height means invert the image.
 2193|  5.24k|  if (src_height < 0) {
  ------------------
  |  Branch (2193:7): [True: 0, False: 5.24k]
  ------------------
 2194|      0|    src_height = -src_height;
 2195|      0|    src = src + (src_height - 1) * (int64_t)src_stride;
 2196|      0|    src_stride = -src_stride;
 2197|      0|  }
 2198|       |
 2199|  5.24k|  if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
  ------------------
  |  Branch (2199:7): [True: 407, False: 4.83k]
  |  Branch (2199:43): [True: 63, False: 344]
  ------------------
 2200|     63|    ScalePlaneUp2_12_Linear(src_width, src_height, dst_width, dst_height,
 2201|     63|                            src_stride, dst_stride, src, dst);
 2202|     63|    return 0;
 2203|     63|  }
 2204|  5.18k|  if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
  ------------------
  |  Branch (2204:7): [True: 128, False: 5.05k]
  |  Branch (2204:45): [True: 63, False: 65]
  ------------------
 2205|     63|      (filtering == kFilterBilinear || filtering == kFilterBox)) {
  ------------------
  |  Branch (2205:8): [True: 35, False: 28]
  |  Branch (2205:40): [True: 0, False: 28]
  ------------------
 2206|     35|    ScalePlaneUp2_12_Bilinear(src_width, src_height, dst_width, dst_height,
 2207|     35|                              src_stride, dst_stride, src, dst);
 2208|     35|    return 0;
 2209|     35|  }
 2210|       |
 2211|  5.14k|  return ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride,
 2212|  5.14k|                       dst_width, dst_height, filtering);
 2213|  5.18k|}
scale.cc:_ZN6libyuvL3AbsEi:
   27|  17.9k|static __inline int Abs(int v) {
   28|  17.9k|  return v >= 0 ? v : -v;
  ------------------
  |  Branch (28:10): [True: 17.9k, False: 0]
  ------------------
   29|  17.9k|}
scale.cc:_ZN6libyuvL15ScalePlaneDown2EiiiiiiPKhPhNS_10FilterModeE:
   46|     14|                            enum FilterMode filtering) {
   47|     14|  int y;
   48|     14|  void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
   49|     14|                        uint8_t* dst_ptr, int dst_width) =
   50|     14|      filtering == kFilterNone
  ------------------
  |  Branch (50:7): [True: 0, False: 14]
  ------------------
   51|     14|          ? ScaleRowDown2_C
   52|     14|          : (filtering == kFilterLinear ? ScaleRowDown2Linear_C
  ------------------
  |  Branch (52:14): [True: 0, False: 14]
  ------------------
   53|     14|                                        : ScaleRowDown2Box_C);
   54|     14|  int row_stride = src_stride * 2;
   55|     14|  (void)src_width;
   56|     14|  (void)src_height;
   57|     14|  if (!filtering) {
  ------------------
  |  Branch (57:7): [True: 0, False: 14]
  ------------------
   58|      0|    src_ptr += src_stride;  // Point to odd rows.
   59|      0|    src_stride = 0;
   60|      0|  }
   61|       |
   62|       |#if defined(HAS_SCALEROWDOWN2_NEON)
   63|       |  if (TestCpuFlag(kCpuHasNEON)) {
   64|       |    ScaleRowDown2 =
   65|       |        filtering == kFilterNone
   66|       |            ? ScaleRowDown2_Any_NEON
   67|       |            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON
   68|       |                                          : ScaleRowDown2Box_Any_NEON);
   69|       |    if (IS_ALIGNED(dst_width, 16)) {
   70|       |      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON
   71|       |                                               : (filtering == kFilterLinear
   72|       |                                                      ? ScaleRowDown2Linear_NEON
   73|       |                                                      : ScaleRowDown2Box_NEON);
   74|       |    }
   75|       |  }
   76|       |#endif
   77|       |#if defined(HAS_SCALEROWDOWN2_SME)
   78|       |  if (TestCpuFlag(kCpuHasSME)) {
   79|       |    ScaleRowDown2 = filtering == kFilterNone     ? ScaleRowDown2_SME
   80|       |                    : filtering == kFilterLinear ? ScaleRowDown2Linear_SME
   81|       |                                                 : ScaleRowDown2Box_SME;
   82|       |  }
   83|       |#endif
   84|     14|#if defined(HAS_SCALEROWDOWN2_SSSE3)
   85|     14|  if (TestCpuFlag(kCpuHasSSSE3)) {
  ------------------
  |  Branch (85:7): [True: 14, False: 0]
  ------------------
   86|     14|    ScaleRowDown2 =
   87|     14|        filtering == kFilterNone
  ------------------
  |  Branch (87:9): [True: 0, False: 14]
  ------------------
   88|     14|            ? ScaleRowDown2_Any_SSSE3
   89|     14|            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3
  ------------------
  |  Branch (89:16): [True: 0, False: 14]
  ------------------
   90|     14|                                          : ScaleRowDown2Box_Any_SSSE3);
   91|     14|    if (IS_ALIGNED(dst_width, 16)) {
  ------------------
  |  |  999|     14|#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
  |  |  ------------------
  |  |  |  Branch (999:26): [True: 0, False: 14]
  |  |  ------------------
  ------------------
   92|      0|      ScaleRowDown2 =
   93|      0|          filtering == kFilterNone
  ------------------
  |  Branch (93:11): [True: 0, False: 0]
  ------------------
   94|      0|              ? ScaleRowDown2_SSSE3
   95|      0|              : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3
  ------------------
  |  Branch (95:18): [True: 0, False: 0]
  ------------------
   96|      0|                                            : ScaleRowDown2Box_SSSE3);
   97|      0|    }
   98|     14|  }
   99|     14|#endif
  100|     14|#if defined(HAS_SCALEROWDOWN2_AVX2)
  101|     14|  if (TestCpuFlag(kCpuHasAVX2)) {
  ------------------
  |  Branch (101:7): [True: 14, False: 0]
  ------------------
  102|     14|    ScaleRowDown2 =
  103|     14|        filtering == kFilterNone
  ------------------
  |  Branch (103:9): [True: 0, False: 14]
  ------------------
  104|     14|            ? ScaleRowDown2_Any_AVX2
  105|     14|            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2
  ------------------
  |  Branch (105:16): [True: 0, False: 14]
  ------------------
  106|     14|                                          : ScaleRowDown2Box_Any_AVX2);
  107|     14|    if (IS_ALIGNED(dst_width, 32)) {
  ------------------
  |  |  999|     14|#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
  |  |  ------------------
  |  |  |  Branch (999:26): [True: 0, False: 14]
  |  |  ------------------
  ------------------
  108|      0|      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2
  ------------------
  |  Branch (108:23): [True: 0, False: 0]
  ------------------
  109|      0|                                               : (filtering == kFilterLinear
  ------------------
  |  Branch (109:51): [True: 0, False: 0]
  ------------------
  110|      0|                                                      ? ScaleRowDown2Linear_AVX2
  111|      0|                                                      : ScaleRowDown2Box_AVX2);
  112|      0|    }
  113|     14|  }
  114|     14|#endif
  115|       |#if defined(HAS_SCALEROWDOWN2_LSX)
  116|       |  if (TestCpuFlag(kCpuHasLSX)) {
  117|       |    ScaleRowDown2 =
  118|       |        filtering == kFilterNone
  119|       |            ? ScaleRowDown2_Any_LSX
  120|       |            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_LSX
  121|       |                                          : ScaleRowDown2Box_Any_LSX);
  122|       |    if (IS_ALIGNED(dst_width, 32)) {
  123|       |      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_LSX
  124|       |                                               : (filtering == kFilterLinear
  125|       |                                                      ? ScaleRowDown2Linear_LSX
  126|       |                                                      : ScaleRowDown2Box_LSX);
  127|       |    }
  128|       |  }
  129|       |#endif
  130|       |#if defined(HAS_SCALEROWDOWN2_RVV)
  131|       |  if (TestCpuFlag(kCpuHasRVV)) {
  132|       |    ScaleRowDown2 = filtering == kFilterNone
  133|       |                        ? ScaleRowDown2_RVV
  134|       |                        : (filtering == kFilterLinear ? ScaleRowDown2Linear_RVV
  135|       |                                                      : ScaleRowDown2Box_RVV);
  136|       |  }
  137|       |#endif
  138|       |
  139|     14|  if (filtering == kFilterLinear) {
  ------------------
  |  Branch (139:7): [True: 0, False: 14]
  ------------------
  140|      0|    src_stride = 0;
  141|      0|  }
  142|       |  // TODO(fbarchard): Loop through source height to allow odd height.
  143|    322|  for (y = 0; y < dst_height; ++y) {
  ------------------
  |  Branch (143:15): [True: 308, False: 14]
  ------------------
  144|    308|    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
  145|    308|    src_ptr += row_stride;
  146|    308|    dst_ptr += dst_stride;
  147|    308|  }
  148|     14|}
scale.cc:_ZN6libyuvL13ScalePlaneBoxEiiiiiiPKhPh:
  907|    170|                         uint8_t* dst_ptr) {
  908|    170|  int j, k;
  909|       |  // Initial source x/y coordinate and step values as 16.16 fixed point.
  910|    170|  int x = 0;
  911|    170|  int y = 0;
  912|    170|  int dx = 0;
  913|    170|  int dy = 0;
  914|    170|  const int max_y = (src_height << 16);
  915|    170|  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
  916|    170|             &dx, &dy);
  917|    170|  src_width = Abs(src_width);
  918|    170|  {
  919|       |    // Allocate a row buffer of uint16_t.
  920|    170|    align_buffer_64(row16, src_width * 2);
  ------------------
  |  | 1002|    170|  void* var##_mem = malloc((size) + 63);                      /* NOLINT */ \
  |  | 1003|    170|  uint8_t* var = (uint8_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */
  ------------------
  921|    170|    if (!row16)
  ------------------
  |  Branch (921:9): [True: 0, False: 170]
  ------------------
  922|      0|      return 1;
  923|    170|    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
  924|    170|                         const uint16_t* src_ptr, uint8_t* dst_ptr) =
  925|    170|        (dx & 0xffff) ? ScaleAddCols2_C
  ------------------
  |  Branch (925:9): [True: 119, False: 51]
  ------------------
  926|    170|                      : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
  ------------------
  |  Branch (926:26): [True: 51, False: 0]
  ------------------
  927|    170|    void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr,
  928|    170|                        int src_width) = ScaleAddRow_C;
  929|    170|#if defined(HAS_SCALEADDROW_SSE2)
  930|    170|    if (TestCpuFlag(kCpuHasSSE2)) {
  ------------------
  |  Branch (930:9): [True: 170, False: 0]
  ------------------
  931|    170|      ScaleAddRow = ScaleAddRow_Any_SSE2;
  932|    170|      if (IS_ALIGNED(src_width, 16)) {
  ------------------
  |  |  999|    170|#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
  |  |  ------------------
  |  |  |  Branch (999:26): [True: 32, False: 138]
  |  |  ------------------
  ------------------
  933|     32|        ScaleAddRow = ScaleAddRow_SSE2;
  934|     32|      }
  935|    170|    }
  936|    170|#endif
  937|    170|#if defined(HAS_SCALEADDROW_AVX2)
  938|    170|    if (TestCpuFlag(kCpuHasAVX2)) {
  ------------------
  |  Branch (938:9): [True: 170, False: 0]
  ------------------
  939|    170|      ScaleAddRow = ScaleAddRow_Any_AVX2;
  940|    170|      if (IS_ALIGNED(src_width, 32)) {
  ------------------
  |  |  999|    170|#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
  |  |  ------------------
  |  |  |  Branch (999:26): [True: 20, False: 150]
  |  |  ------------------
  ------------------
  941|     20|        ScaleAddRow = ScaleAddRow_AVX2;
  942|     20|      }
  943|    170|    }
  944|    170|#endif
  945|       |#if defined(HAS_SCALEADDROW_NEON)
  946|       |    if (TestCpuFlag(kCpuHasNEON)) {
  947|       |      ScaleAddRow = ScaleAddRow_Any_NEON;
  948|       |      if (IS_ALIGNED(src_width, 16)) {
  949|       |        ScaleAddRow = ScaleAddRow_NEON;
  950|       |      }
  951|       |    }
  952|       |#endif
  953|       |#if defined(HAS_SCALEADDROW_LSX)
  954|       |    if (TestCpuFlag(kCpuHasLSX)) {
  955|       |      ScaleAddRow = ScaleAddRow_Any_LSX;
  956|       |      if (IS_ALIGNED(src_width, 16)) {
  957|       |        ScaleAddRow = ScaleAddRow_LSX;
  958|       |      }
  959|       |    }
  960|       |#endif
  961|       |#if defined(HAS_SCALEADDROW_RVV)
  962|       |    if (TestCpuFlag(kCpuHasRVV)) {
  963|       |      ScaleAddRow = ScaleAddRow_RVV;
  964|       |    }
  965|       |#endif
  966|       |
  967|  7.56k|    for (j = 0; j < dst_height; ++j) {
  ------------------
  |  Branch (967:17): [True: 7.39k, False: 170]
  ------------------
  968|  7.39k|      int boxheight;
  969|  7.39k|      int iy = y >> 16;
  970|  7.39k|      const uint8_t* src = src_ptr + iy * (int64_t)src_stride;
  971|  7.39k|      y += dy;
  972|  7.39k|      if (y > max_y) {
  ------------------
  |  Branch (972:11): [True: 0, False: 7.39k]
  ------------------
  973|      0|        y = max_y;
  974|      0|      }
  975|  7.39k|      boxheight = MIN1((y >> 16) - iy);
  ------------------
  |  |  778|  7.39k|#define MIN1(x) ((x) < 1 ? 1 : (x))
  |  |  ------------------
  |  |  |  Branch (778:18): [True: 0, False: 7.39k]
  |  |  ------------------
  ------------------
  976|  7.39k|      memset(row16, 0, src_width * 2);
  977|  55.7k|      for (k = 0; k < boxheight; ++k) {
  ------------------
  |  Branch (977:19): [True: 48.3k, False: 7.39k]
  ------------------
  978|  48.3k|        ScaleAddRow(src, (uint16_t*)(row16), src_width);
  979|  48.3k|        src += src_stride;
  980|  48.3k|      }
  981|  7.39k|      ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t*)(row16), dst_ptr);
  982|  7.39k|      dst_ptr += dst_stride;
  983|  7.39k|    }
  984|    170|    free_aligned_buffer_64(row16);
  ------------------
  |  | 1006|    170|  free(var##_mem);                  \
  |  | 1007|    170|  var = NULL
  ------------------
  985|    170|  }
  986|      0|  return 0;
  987|    170|}
scale.cc:_ZN6libyuvL15ScaleAddCols2_CEiiiiPKtPh:
  805|  6.01k|                            uint8_t* dst_ptr) {
  806|  6.01k|  int i;
  807|  6.01k|  int scaletbl[2];
  808|  6.01k|  int minboxwidth = dx >> 16;
  809|  6.01k|  int boxwidth;
  810|  6.01k|  scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
  ------------------
  |  |  778|  6.01k|#define MIN1(x) ((x) < 1 ? 1 : (x))
  |  |  ------------------
  |  |  |  Branch (778:18): [True: 0, False: 6.01k]
  |  |  ------------------
  ------------------
  811|  6.01k|  scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
  ------------------
  |  |  778|  6.01k|#define MIN1(x) ((x) < 1 ? 1 : (x))
  |  |  ------------------
  |  |  |  Branch (778:18): [True: 0, False: 6.01k]
  |  |  ------------------
  ------------------
  812|   354k|  for (i = 0; i < dst_width; ++i) {
  ------------------
  |  Branch (812:15): [True: 348k, False: 6.01k]
  ------------------
  813|   348k|    int ix = x >> 16;
  814|   348k|    x += dx;
  815|   348k|    boxwidth = MIN1((x >> 16) - ix);
  ------------------
  |  |  778|   348k|#define MIN1(x) ((x) < 1 ? 1 : (x))
  |  |  ------------------
  |  |  |  Branch (778:18): [True: 0, False: 348k]
  |  |  ------------------
  ------------------
  816|   348k|    int scaletbl_index = boxwidth - minboxwidth;
  817|       |    assert((scaletbl_index == 0) || (scaletbl_index == 1));
  818|   348k|    *dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + ix) *
  819|   348k|                               scaletbl[scaletbl_index] >>
  820|   348k|                           16);
  821|   348k|  }
  822|  6.01k|}
scale.cc:_ZN6libyuvL9SumPixelsEiPKt:
  780|   471k|static __inline uint32_t SumPixels(int iboxwidth, const uint16_t* src_ptr) {
  781|   471k|  uint32_t sum = 0u;
  782|   471k|  int x;
  783|   471k|  assert(iboxwidth > 0);
  784|  3.04M|  for (x = 0; x < iboxwidth; ++x) {
  ------------------
  |  Branch (784:15): [True: 2.57M, False: 471k]
  ------------------
  785|  2.57M|    sum += src_ptr[x];
  786|  2.57M|  }
  787|   471k|  return sum;
  788|   471k|}
scale.cc:_ZN6libyuvL15ScaleAddCols1_CEiiiiPKtPh:
  867|  1.38k|                            uint8_t* dst_ptr) {
  868|  1.38k|  int boxwidth = MIN1(dx >> 16);
  ------------------
  |  |  778|  1.38k|#define MIN1(x) ((x) < 1 ? 1 : (x))
  |  |  ------------------
  |  |  |  Branch (778:18): [True: 0, False: 1.38k]
  |  |  ------------------
  ------------------
  869|  1.38k|  int scaleval = 65536 / (boxwidth * boxheight);
  870|  1.38k|  int i;
  871|  1.38k|  x >>= 16;
  872|   124k|  for (i = 0; i < dst_width; ++i) {
  ------------------
  |  Branch (872:15): [True: 122k, False: 1.38k]
  ------------------
  873|   122k|    *dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + x) * scaleval >> 16);
  874|   122k|    x += boxwidth;
  875|   122k|  }
  876|  1.38k|}
scale.cc:_ZN6libyuvL20ScalePlaneUp2_LinearEiiiiiiPKhPh:
 1431|     54|                                 uint8_t* dst_ptr) {
 1432|     54|  void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) =
 1433|     54|      ScaleRowUp2_Linear_Any_C;
 1434|     54|  int i;
 1435|     54|  int y;
 1436|     54|  int dy;
 1437|       |
 1438|     54|  (void)src_width;
 1439|       |  // This function can only scale up by 2 times horizontally.
 1440|     54|  assert(src_width == ((dst_width + 1) / 2));
 1441|       |
 1442|     54|#ifdef HAS_SCALEROWUP2_LINEAR_SSE2
 1443|     54|  if (TestCpuFlag(kCpuHasSSE2)) {
  ------------------
  |  Branch (1443:7): [True: 54, False: 0]
  ------------------
 1444|     54|    ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2;
 1445|     54|  }
 1446|     54|#endif
 1447|       |
 1448|     54|#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3
 1449|     54|  if (TestCpuFlag(kCpuHasSSSE3)) {
  ------------------
  |  Branch (1449:7): [True: 54, False: 0]
  ------------------
 1450|     54|    ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3;
 1451|     54|  }
 1452|     54|#endif
 1453|       |
 1454|     54|#ifdef HAS_SCALEROWUP2_LINEAR_AVX2
 1455|     54|  if (TestCpuFlag(kCpuHasAVX2)) {
  ------------------
  |  Branch (1455:7): [True: 54, False: 0]
  ------------------
 1456|     54|    ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2;
 1457|     54|  }
 1458|     54|#endif
 1459|       |
 1460|       |#ifdef HAS_SCALEROWUP2_LINEAR_NEON
 1461|       |  if (TestCpuFlag(kCpuHasNEON)) {
 1462|       |    ScaleRowUp = ScaleRowUp2_Linear_Any_NEON;
 1463|       |  }
 1464|       |#endif
 1465|       |#ifdef HAS_SCALEROWUP2_LINEAR_RVV
 1466|       |  if (TestCpuFlag(kCpuHasRVV)) {
 1467|       |    ScaleRowUp = ScaleRowUp2_Linear_RVV;
 1468|       |  }
 1469|       |#endif
 1470|       |
 1471|     54|  if (dst_height == 1) {
  ------------------
  |  Branch (1471:7): [True: 0, False: 54]
  ------------------
 1472|      0|    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
 1473|      0|               dst_width);
 1474|     54|  } else {
 1475|     54|    dy = FixedDiv(src_height - 1, dst_height - 1);
  ------------------
  |  |  265|     54|#define FixedDiv FixedDiv_X86
  ------------------
 1476|     54|    y = (1 << 15) - 1;
 1477|  2.25k|    for (i = 0; i < dst_height; ++i) {
  ------------------
  |  Branch (1477:17): [True: 2.20k, False: 54]
  ------------------
 1478|  2.20k|      ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
 1479|  2.20k|      dst_ptr += dst_stride;
 1480|  2.20k|      y += dy;
 1481|  2.20k|    }
 1482|     54|  }
 1483|     54|}
scale.cc:_ZN6libyuvL22ScalePlaneUp2_BilinearEiiiiiiPKhPh:
 1496|     57|                                   uint8_t* dst_ptr) {
 1497|     57|  void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
 1498|     57|                      uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
 1499|     57|      ScaleRowUp2_Bilinear_Any_C;
 1500|     57|  int x;
 1501|       |
 1502|     57|  (void)src_width;
 1503|       |  // This function can only scale up by 2 times.
 1504|     57|  assert(src_width == ((dst_width + 1) / 2));
 1505|     57|  assert(src_height == ((dst_height + 1) / 2));
 1506|       |
 1507|     57|#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2
 1508|     57|  if (TestCpuFlag(kCpuHasSSE2)) {
  ------------------
  |  Branch (1508:7): [True: 57, False: 0]
  ------------------
 1509|     57|    Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2;
 1510|     57|  }
 1511|     57|#endif
 1512|       |
 1513|     57|#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
 1514|     57|  if (TestCpuFlag(kCpuHasSSSE3)) {
  ------------------
  |  Branch (1514:7): [True: 57, False: 0]
  ------------------
 1515|     57|    Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3;
 1516|     57|  }
 1517|     57|#endif
 1518|       |
 1519|     57|#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
 1520|     57|  if (TestCpuFlag(kCpuHasAVX2)) {
  ------------------
  |  Branch (1520:7): [True: 57, False: 0]
  ------------------
 1521|     57|    Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2;
 1522|     57|  }
 1523|     57|#endif
 1524|       |
 1525|       |#ifdef HAS_SCALEROWUP2_BILINEAR_NEON
 1526|       |  if (TestCpuFlag(kCpuHasNEON)) {
 1527|       |    Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON;
 1528|       |  }
 1529|       |#endif
 1530|       |#ifdef HAS_SCALEROWUP2_BILINEAR_RVV
 1531|       |  if (TestCpuFlag(kCpuHasRVV)) {
 1532|       |    Scale2RowUp = ScaleRowUp2_Bilinear_RVV;
 1533|       |  }
 1534|       |#endif
 1535|       |
 1536|     57|  Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
 1537|     57|  dst_ptr += dst_stride;
 1538|  1.05k|  for (x = 0; x < src_height - 1; ++x) {
  ------------------
  |  Branch (1538:15): [True: 995, False: 57]
  ------------------
 1539|    995|    Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
 1540|    995|    src_ptr += src_stride;
 1541|       |    // TODO(fbarchard): Test performance of writing one row of destination at a
 1542|       |    // time.
 1543|    995|    dst_ptr += 2 * dst_stride;
 1544|    995|  }
 1545|     57|  if (!(dst_height & 1)) {
  ------------------
  |  Branch (1545:7): [True: 30, False: 27]
  ------------------
 1546|     30|    Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
 1547|     30|  }
 1548|     57|}
scale.cc:_ZN6libyuvL20ScalePlaneBilinearUpEiiiiiiPKhPhNS_10FilterModeE:
 1275|  1.80k|                                enum FilterMode filtering) {
 1276|  1.80k|  int j;
 1277|       |  // Initial source x/y coordinate and step values as 16.16 fixed point.
 1278|  1.80k|  int x = 0;
 1279|  1.80k|  int y = 0;
 1280|  1.80k|  int dx = 0;
 1281|  1.80k|  int dy = 0;
 1282|  1.80k|  const int max_y = (src_height - 1) << 16;
 1283|  1.80k|  void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
 1284|  1.80k|                         ptrdiff_t src_stride, int dst_width,
 1285|  1.80k|                         int source_y_fraction) = InterpolateRow_C;
 1286|  1.80k|  void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr,
 1287|  1.80k|                          int dst_width, int x, int dx) =
 1288|  1.80k|      filtering ? ScaleFilterCols_C : ScaleCols_C;
  ------------------
  |  Branch (1288:7): [True: 1.80k, False: 0]
  ------------------
 1289|  1.80k|  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
 1290|  1.80k|             &dx, &dy);
 1291|  1.80k|  src_width = Abs(src_width);
 1292|       |
 1293|  1.80k|#if defined(HAS_INTERPOLATEROW_SSSE3)
 1294|  1.80k|  if (TestCpuFlag(kCpuHasSSSE3)) {
  ------------------
  |  Branch (1294:7): [True: 1.80k, False: 0]
  ------------------
 1295|  1.80k|    InterpolateRow = InterpolateRow_Any_SSSE3;
 1296|  1.80k|    if (IS_ALIGNED(dst_width, 16)) {
  ------------------
  |  |  999|  1.80k|#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
  |  |  ------------------
  |  |  |  Branch (999:26): [True: 381, False: 1.42k]
  |  |  ------------------
  ------------------
 1297|    381|      InterpolateRow = InterpolateRow_SSSE3;
 1298|    381|    }
 1299|  1.80k|  }
 1300|  1.80k|#endif
 1301|  1.80k|#if defined(HAS_INTERPOLATEROW_AVX2)
 1302|  1.80k|  if (TestCpuFlag(kCpuHasAVX2)) {
  ------------------
  |  Branch (1302:7): [True: 1.80k, False: 0]
  ------------------
 1303|  1.80k|    InterpolateRow = InterpolateRow_Any_AVX2;
 1304|  1.80k|    if (IS_ALIGNED(dst_width, 32)) {
  ------------------
  |  |  999|  1.80k|#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
  |  |  ------------------
  |  |  |  Branch (999:26): [True: 183, False: 1.62k]
  |  |  ------------------
  ------------------
 1305|    183|      InterpolateRow = InterpolateRow_AVX2;
 1306|    183|    }
 1307|  1.80k|  }
 1308|  1.80k|#endif
 1309|       |#if defined(HAS_INTERPOLATEROW_NEON)
 1310|       |  if (TestCpuFlag(kCpuHasNEON)) {
 1311|       |    InterpolateRow = InterpolateRow_Any_NEON;
 1312|       |    if (IS_ALIGNED(dst_width, 16)) {
 1313|       |      InterpolateRow = InterpolateRow_NEON;
 1314|       |    }
 1315|       |  }
 1316|       |#endif
 1317|       |#if defined(HAS_INTERPOLATEROW_SME)
 1318|       |  if (TestCpuFlag(kCpuHasSME)) {
 1319|       |    InterpolateRow = InterpolateRow_SME;
 1320|       |  }
 1321|       |#endif
 1322|       |#if defined(HAS_INTERPOLATEROW_RVV)
 1323|       |  if (TestCpuFlag(kCpuHasRVV)) {
 1324|       |    InterpolateRow = InterpolateRow_RVV;
 1325|       |  }
 1326|       |#endif
 1327|       |
 1328|  1.80k|  if (filtering && src_width >= 32768) {
  ------------------
  |  Branch (1328:7): [True: 1.80k, False: 0]
  |  Branch (1328:20): [True: 0, False: 1.80k]
  ------------------
 1329|      0|    ScaleFilterCols = ScaleFilterCols64_C;
 1330|      0|  }
 1331|  1.80k|#if defined(HAS_SCALEFILTERCOLS_SSSE3)
 1332|  1.80k|  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
  ------------------
  |  Branch (1332:7): [True: 1.80k, False: 0]
  |  Branch (1332:20): [True: 1.80k, False: 0]
  |  Branch (1332:49): [True: 1.80k, False: 0]
  ------------------
 1333|  1.80k|    ScaleFilterCols = ScaleFilterCols_SSSE3;
 1334|  1.80k|  }
 1335|  1.80k|#endif
 1336|       |#if defined(HAS_SCALEFILTERCOLS_NEON)
 1337|       |  if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
 1338|       |    ScaleFilterCols = ScaleFilterCols_Any_NEON;
 1339|       |    if (IS_ALIGNED(dst_width, 8)) {
 1340|       |      ScaleFilterCols = ScaleFilterCols_NEON;
 1341|       |    }
 1342|       |  }
 1343|       |#endif
 1344|       |#if defined(HAS_SCALEFILTERCOLS_LSX)
 1345|       |  if (filtering && TestCpuFlag(kCpuHasLSX) && src_width < 32768) {
 1346|       |    ScaleFilterCols = ScaleFilterCols_Any_LSX;
 1347|       |    if (IS_ALIGNED(dst_width, 16)) {
 1348|       |      ScaleFilterCols = ScaleFilterCols_LSX;
 1349|       |    }
 1350|       |  }
 1351|       |#endif
 1352|  1.80k|  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
  ------------------
  |  Branch (1352:7): [True: 0, False: 1.80k]
  |  Branch (1352:21): [True: 0, False: 0]
  |  Branch (1352:51): [True: 0, False: 0]
  ------------------
 1353|      0|    ScaleFilterCols = ScaleColsUp2_C;
 1354|       |#if defined(HAS_SCALECOLS_SSE2)
 1355|       |    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
 1356|       |      ScaleFilterCols = ScaleColsUp2_SSE2;
 1357|       |    }
 1358|       |#endif
 1359|      0|  }
 1360|       |
 1361|  1.80k|  if (y > max_y) {
  ------------------
  |  Branch (1361:7): [True: 729, False: 1.07k]
  ------------------
 1362|    729|    y = max_y;
 1363|    729|  }
 1364|  1.80k|  {
 1365|  1.80k|    int yi = y >> 16;
 1366|  1.80k|    const uint8_t* src = src_ptr + yi * (int64_t)src_stride;
 1367|       |
 1368|       |    // Allocate 2 row buffers.
 1369|  1.80k|    const int row_size = (dst_width + 31) & ~31;
 1370|  1.80k|    align_buffer_64(row, row_size * 2);
  ------------------
  |  | 1002|  1.80k|  void* var##_mem = malloc((size) + 63);                      /* NOLINT */ \
  |  | 1003|  1.80k|  uint8_t* var = (uint8_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */
  ------------------
 1371|  1.80k|    if (!row)
  ------------------
  |  Branch (1371:9): [True: 0, False: 1.80k]
  ------------------
 1372|      0|      return 1;
 1373|       |
 1374|  1.80k|    uint8_t* rowptr = row;
 1375|  1.80k|    int rowstride = row_size;
 1376|  1.80k|    int lasty = yi;
 1377|       |
 1378|  1.80k|    ScaleFilterCols(rowptr, src, dst_width, x, dx);
 1379|  1.80k|    if (src_height > 1) {
  ------------------
  |  Branch (1379:9): [True: 1.07k, False: 729]
  ------------------
 1380|  1.07k|      src += src_stride;
 1381|  1.07k|    }
 1382|  1.80k|    ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
 1383|  1.80k|    if (src_height > 2) {
  ------------------
  |  Branch (1383:9): [True: 925, False: 878]
  ------------------
 1384|    925|      src += src_stride;
 1385|    925|    }
 1386|       |
 1387|   104k|    for (j = 0; j < dst_height; ++j) {
  ------------------
  |  Branch (1387:17): [True: 102k, False: 1.80k]
  ------------------
 1388|   102k|      yi = y >> 16;
 1389|   102k|      if (yi != lasty) {
  ------------------
  |  Branch (1389:11): [True: 33.8k, False: 68.7k]
  ------------------
 1390|  33.8k|        if (y > max_y) {
  ------------------
  |  Branch (1390:13): [True: 0, False: 33.8k]
  ------------------
 1391|      0|          y = max_y;
 1392|      0|          yi = y >> 16;
 1393|      0|          src = src_ptr + yi * (int64_t)src_stride;
 1394|      0|        }
 1395|  33.8k|        if (yi != lasty) {
  ------------------
  |  Branch (1395:13): [True: 33.8k, False: 0]
  ------------------
 1396|  33.8k|          ScaleFilterCols(rowptr, src, dst_width, x, dx);
 1397|  33.8k|          rowptr += rowstride;
 1398|  33.8k|          rowstride = -rowstride;
 1399|  33.8k|          lasty = yi;
 1400|  33.8k|          if ((y + 65536) < max_y) {
  ------------------
  |  Branch (1400:15): [True: 32.9k, False: 925]
  ------------------
 1401|  32.9k|            src += src_stride;
 1402|  32.9k|          }
 1403|  33.8k|        }
 1404|  33.8k|      }
 1405|   102k|      if (filtering == kFilterLinear) {
  ------------------
  |  Branch (1405:11): [True: 11.6k, False: 90.8k]
  ------------------
 1406|  11.6k|        InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
 1407|  90.8k|      } else {
 1408|  90.8k|        int yf = (y >> 8) & 255;
 1409|  90.8k|        InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
 1410|  90.8k|      }
 1411|   102k|      dst_ptr += dst_stride;
 1412|   102k|      y += dy;
 1413|   102k|    }
 1414|  1.80k|    free_aligned_buffer_64(row);
  ------------------
  |  | 1006|  1.80k|  free(var##_mem);                  \
  |  | 1007|  1.80k|  var = NULL
  ------------------
 1415|  1.80k|  }
 1416|      0|  return 0;
 1417|  1.80k|}
scale.cc:_ZN6libyuvL22ScalePlaneBilinearDownEiiiiiiPKhPhNS_10FilterModeE:
 1055|  2.28k|                                  enum FilterMode filtering) {
 1056|       |  // Initial source x/y coordinate and step values as 16.16 fixed point.
 1057|  2.28k|  int x = 0;
 1058|  2.28k|  int y = 0;
 1059|  2.28k|  int dx = 0;
 1060|  2.28k|  int dy = 0;
 1061|       |  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
 1062|       |  // Allocate a row buffer.
 1063|  2.28k|  align_buffer_64(row, src_width);
  ------------------
  |  | 1002|  2.28k|  void* var##_mem = malloc((size) + 63);                      /* NOLINT */ \
  |  | 1003|  2.28k|  uint8_t* var = (uint8_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */
  ------------------
 1064|  2.28k|  if (!row)
  ------------------
  |  Branch (1064:7): [True: 0, False: 2.28k]
  ------------------
 1065|      0|    return 1;
 1066|       |
 1067|  2.28k|  const int max_y = (src_height - 1) << 16;
 1068|  2.28k|  int j;
 1069|  2.28k|  void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr,
 1070|  2.28k|                          int dst_width, int x, int dx) =
 1071|  2.28k|      (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
  ------------------
  |  Branch (1071:7): [True: 0, False: 2.28k]
  ------------------
 1072|  2.28k|  void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
 1073|  2.28k|                         ptrdiff_t src_stride, int dst_width,
 1074|  2.28k|                         int source_y_fraction) = InterpolateRow_C;
 1075|  2.28k|  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
 1076|  2.28k|             &dx, &dy);
 1077|  2.28k|  src_width = Abs(src_width);
 1078|       |
 1079|  2.28k|#if defined(HAS_INTERPOLATEROW_SSSE3)
 1080|  2.28k|  if (TestCpuFlag(kCpuHasSSSE3)) {
  ------------------
  |  Branch (1080:7): [True: 2.28k, False: 0]
  ------------------
 1081|  2.28k|    InterpolateRow = InterpolateRow_Any_SSSE3;
 1082|  2.28k|    if (IS_ALIGNED(src_width, 16)) {
  ------------------
  |  |  999|  2.28k|#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
  |  |  ------------------
  |  |  |  Branch (999:26): [True: 296, False: 1.99k]
  |  |  ------------------
  ------------------
 1083|    296|      InterpolateRow = InterpolateRow_SSSE3;
 1084|    296|    }
 1085|  2.28k|  }
 1086|  2.28k|#endif
 1087|  2.28k|#if defined(HAS_INTERPOLATEROW_AVX2)
 1088|  2.28k|  if (TestCpuFlag(kCpuHasAVX2)) {
  ------------------
  |  Branch (1088:7): [True: 2.28k, False: 0]
  ------------------
 1089|  2.28k|    InterpolateRow = InterpolateRow_Any_AVX2;
 1090|  2.28k|    if (IS_ALIGNED(src_width, 32)) {
  ------------------
  |  |  999|  2.28k|#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
  |  |  ------------------
  |  |  |  Branch (999:26): [True: 126, False: 2.16k]
  |  |  ------------------
  ------------------
 1091|    126|      InterpolateRow = InterpolateRow_AVX2;
 1092|    126|    }
 1093|  2.28k|  }
 1094|  2.28k|#endif
 1095|       |#if defined(HAS_INTERPOLATEROW_NEON)
 1096|       |  if (TestCpuFlag(kCpuHasNEON)) {
 1097|       |    InterpolateRow = InterpolateRow_Any_NEON;
 1098|       |    if (IS_ALIGNED(src_width, 16)) {
 1099|       |      InterpolateRow = InterpolateRow_NEON;
 1100|       |    }
 1101|       |  }
 1102|       |#endif
 1103|       |#if defined(HAS_INTERPOLATEROW_SME)
 1104|       |  if (TestCpuFlag(kCpuHasSME)) {
 1105|       |    InterpolateRow = InterpolateRow_SME;
 1106|       |  }
 1107|       |#endif
 1108|       |#if defined(HAS_INTERPOLATEROW_LSX)
 1109|       |  if (TestCpuFlag(kCpuHasLSX)) {
 1110|       |    InterpolateRow = InterpolateRow_Any_LSX;
 1111|       |    if (IS_ALIGNED(src_width, 32)) {
 1112|       |      InterpolateRow = InterpolateRow_LSX;
 1113|       |    }
 1114|       |  }
 1115|       |#endif
 1116|       |#if defined(HAS_INTERPOLATEROW_RVV)
 1117|       |  if (TestCpuFlag(kCpuHasRVV)) {
 1118|       |    InterpolateRow = InterpolateRow_RVV;
 1119|       |  }
 1120|       |#endif
 1121|       |
 1122|  2.28k|#if defined(HAS_SCALEFILTERCOLS_SSSE3)
 1123|  2.28k|  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
  ------------------
  |  Branch (1123:7): [True: 2.28k, False: 0]
  |  Branch (1123:36): [True: 2.28k, False: 0]
  ------------------
 1124|  2.28k|    ScaleFilterCols = ScaleFilterCols_SSSE3;
 1125|  2.28k|  }
 1126|  2.28k|#endif
 1127|       |#if defined(HAS_SCALEFILTERCOLS_NEON)
 1128|       |  if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
 1129|       |    ScaleFilterCols = ScaleFilterCols_Any_NEON;
 1130|       |    if (IS_ALIGNED(dst_width, 8)) {
 1131|       |      ScaleFilterCols = ScaleFilterCols_NEON;
 1132|       |    }
 1133|       |  }
 1134|       |#endif
 1135|       |#if defined(HAS_SCALEFILTERCOLS_LSX)
 1136|       |  if (TestCpuFlag(kCpuHasLSX) && src_width < 32768) {
 1137|       |    ScaleFilterCols = ScaleFilterCols_Any_LSX;
 1138|       |    if (IS_ALIGNED(dst_width, 16)) {
 1139|       |      ScaleFilterCols = ScaleFilterCols_LSX;
 1140|       |    }
 1141|       |  }
 1142|       |#endif
 1143|  2.28k|  if (y > max_y) {
  ------------------
  |  Branch (1143:7): [True: 8, False: 2.28k]
  ------------------
 1144|      8|    y = max_y;
 1145|      8|  }
 1146|       |
 1147|   128k|  for (j = 0; j < dst_height; ++j) {
  ------------------
  |  Branch (1147:15): [True: 126k, False: 2.28k]
  ------------------
 1148|   126k|    int yi = y >> 16;
 1149|   126k|    const uint8_t* src = src_ptr + yi * (int64_t)src_stride;
 1150|   126k|    if (filtering == kFilterLinear) {
  ------------------
  |  Branch (1150:9): [True: 93.5k, False: 32.8k]
  ------------------
 1151|  93.5k|      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
 1152|  93.5k|    } else {
 1153|  32.8k|      int yf = (y >> 8) & 255;
 1154|  32.8k|      InterpolateRow(row, src, src_stride, src_width, yf);
 1155|  32.8k|      ScaleFilterCols(dst_ptr, row, dst_width, x, dx);
 1156|  32.8k|    }
 1157|   126k|    dst_ptr += dst_stride;
 1158|   126k|    y += dy;
 1159|   126k|    if (y > max_y) {
  ------------------
  |  Branch (1159:9): [True: 3.50k, False: 122k]
  ------------------
 1160|  3.50k|      y = max_y;
 1161|  3.50k|    }
 1162|   126k|  }
 1163|       |  free_aligned_buffer_64(row);
  ------------------
  |  | 1006|  2.28k|  free(var##_mem);                  \
  |  | 1007|  2.28k|  var = NULL
  ------------------
 1164|  2.28k|  return 0;
 1165|  2.28k|}
scale.cc:_ZN6libyuvL16ScalePlaneSimpleEiiiiiiPKhPh:
 1905|    654|                             uint8_t* dst_ptr) {
 1906|    654|  int i;
 1907|    654|  void (*ScaleCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width,
 1908|    654|                    int x, int dx) = ScaleCols_C;
 1909|       |  // Initial source x/y coordinate and step values as 16.16 fixed point.
 1910|    654|  int x = 0;
 1911|    654|  int y = 0;
 1912|    654|  int dx = 0;
 1913|    654|  int dy = 0;
 1914|    654|  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
 1915|    654|             &dx, &dy);
 1916|    654|  src_width = Abs(src_width);
 1917|       |
 1918|    654|  if (src_width * 2 == dst_width && x < 0x8000) {
  ------------------
  |  Branch (1918:7): [True: 32, False: 622]
  |  Branch (1918:37): [True: 32, False: 0]
  ------------------
 1919|     32|    ScaleCols = ScaleColsUp2_C;
 1920|       |#if defined(HAS_SCALECOLS_SSE2)
 1921|       |    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
 1922|       |      ScaleCols = ScaleColsUp2_SSE2;
 1923|       |    }
 1924|       |#endif
 1925|     32|  }
 1926|       |
 1927|  11.3k|  for (i = 0; i < dst_height; ++i) {
  ------------------
  |  Branch (1927:15): [True: 10.7k, False: 654]
  ------------------
 1928|  10.7k|    ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x,
 1929|  10.7k|              dx);
 1930|  10.7k|    dst_ptr += dst_stride;
 1931|  10.7k|    y += dy;
 1932|  10.7k|  }
 1933|    654|}
scale.cc:_ZN6libyuvL18ScalePlaneDown2_16EiiiiiiPKtPtNS_10FilterModeE:
  158|     16|                               enum FilterMode filtering) {
  159|     16|  int y;
  160|     16|  void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride,
  161|     16|                        uint16_t* dst_ptr, int dst_width) =
  162|     16|      filtering == kFilterNone
  ------------------
  |  Branch (162:7): [True: 0, False: 16]
  ------------------
  163|     16|          ? ScaleRowDown2_16_C
  164|     16|          : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C
  ------------------
  |  Branch (164:14): [True: 0, False: 16]
  ------------------
  165|     16|                                        : ScaleRowDown2Box_16_C);
  166|     16|  int row_stride = src_stride * 2;
  167|     16|  (void)src_width;
  168|     16|  (void)src_height;
  169|     16|  if (!filtering) {
  ------------------
  |  Branch (169:7): [True: 0, False: 16]
  ------------------
  170|      0|    src_ptr += src_stride;  // Point to odd rows.
  171|      0|    src_stride = 0;
  172|      0|  }
  173|       |
  174|       |#if defined(HAS_SCALEROWDOWN2_16_NEON)
  175|       |  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
  176|       |    ScaleRowDown2 = filtering == kFilterNone     ? ScaleRowDown2_16_NEON
  177|       |                    : filtering == kFilterLinear ? ScaleRowDown2Linear_16_NEON
  178|       |                                                 : ScaleRowDown2Box_16_NEON;
  179|       |  }
  180|       |#endif
  181|       |#if defined(HAS_SCALEROWDOWN2_16_SME)
  182|       |  if (TestCpuFlag(kCpuHasSME)) {
  183|       |    ScaleRowDown2 = filtering == kFilterNone     ? ScaleRowDown2_16_SME
  184|       |                    : filtering == kFilterLinear ? ScaleRowDown2Linear_16_SME
  185|       |                                                 : ScaleRowDown2Box_16_SME;
  186|       |  }
  187|       |#endif
  188|       |#if defined(HAS_SCALEROWDOWN2_16_SSE2)
  189|       |  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
  190|       |    ScaleRowDown2 =
  191|       |        filtering == kFilterNone
  192|       |            ? ScaleRowDown2_16_SSE2
  193|       |            : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2
  194|       |                                          : ScaleRowDown2Box_16_SSE2);
  195|       |  }
  196|       |#endif
  197|       |
  198|     16|  if (filtering == kFilterLinear) {
  ------------------
  |  Branch (198:7): [True: 0, False: 16]
  ------------------
  199|      0|    src_stride = 0;
  200|      0|  }
  201|       |  // TODO(fbarchard): Loop through source height to allow odd height.
  202|    424|  for (y = 0; y < dst_height; ++y) {
  ------------------
  |  Branch (202:15): [True: 408, False: 16]
  ------------------
  203|    408|    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
  204|    408|    src_ptr += row_stride;
  205|    408|    dst_ptr += dst_stride;
  206|    408|  }
  207|     16|}
scale.cc:_ZN6libyuvL16ScalePlaneBox_16EiiiiiiPKtPt:
  996|    181|                            uint16_t* dst_ptr) {
  997|    181|  int j, k;
  998|       |  // Initial source x/y coordinate and step values as 16.16 fixed point.
  999|    181|  int x = 0;
 1000|    181|  int y = 0;
 1001|    181|  int dx = 0;
 1002|    181|  int dy = 0;
 1003|    181|  const int max_y = (src_height << 16);
 1004|    181|  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
 1005|    181|             &dx, &dy);
 1006|    181|  src_width = Abs(src_width);
 1007|    181|  {
 1008|       |    // Allocate a row buffer of uint32_t.
 1009|    181|    align_buffer_64(row32, src_width * 4);
  ------------------
  |  | 1002|    181|  void* var##_mem = malloc((size) + 63);                      /* NOLINT */ \
  |  | 1003|    181|  uint8_t* var = (uint8_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */
  ------------------
 1010|    181|    if (!row32)
  ------------------
  |  Branch (1010:9): [True: 0, False: 181]
  ------------------
 1011|      0|      return 1;
 1012|    181|    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
 1013|    181|                         const uint32_t* src_ptr, uint16_t* dst_ptr) =
 1014|    181|        (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C;
  ------------------
  |  Branch (1014:9): [True: 133, False: 48]
  ------------------
 1015|    181|    void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr,
 1016|    181|                        int src_width) = ScaleAddRow_16_C;
 1017|       |
 1018|       |#if defined(HAS_SCALEADDROW_16_SSE2)
 1019|       |    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
 1020|       |      ScaleAddRow = ScaleAddRow_16_SSE2;
 1021|       |    }
 1022|       |#endif
 1023|       |
 1024|  8.79k|    for (j = 0; j < dst_height; ++j) {
  ------------------
  |  Branch (1024:17): [True: 8.61k, False: 181]
  ------------------
 1025|  8.61k|      int boxheight;
 1026|  8.61k|      int iy = y >> 16;
 1027|  8.61k|      const uint16_t* src = src_ptr + iy * (int64_t)src_stride;
 1028|  8.61k|      y += dy;
 1029|  8.61k|      if (y > max_y) {
  ------------------
  |  Branch (1029:11): [True: 0, False: 8.61k]
  ------------------
 1030|      0|        y = max_y;
 1031|      0|      }
 1032|  8.61k|      boxheight = MIN1((y >> 16) - iy);
  ------------------
  |  |  778|  8.61k|#define MIN1(x) ((x) < 1 ? 1 : (x))
  |  |  ------------------
  |  |  |  Branch (778:18): [True: 0, False: 8.61k]
  |  |  ------------------
  ------------------
 1033|  8.61k|      memset(row32, 0, src_width * 4);
 1034|   109k|      for (k = 0; k < boxheight; ++k) {
  ------------------
  |  Branch (1034:19): [True: 100k, False: 8.61k]
  ------------------
 1035|   100k|        ScaleAddRow(src, (uint32_t*)(row32), src_width);
 1036|   100k|        src += src_stride;
 1037|   100k|      }
 1038|  8.61k|      ScaleAddCols(dst_width, boxheight, x, dx, (uint32_t*)(row32), dst_ptr);
 1039|  8.61k|      dst_ptr += dst_stride;
 1040|  8.61k|    }
 1041|    181|    free_aligned_buffer_64(row32);
  ------------------
  |  | 1006|    181|  free(var##_mem);                  \
  |  | 1007|    181|  var = NULL
  ------------------
 1042|    181|  }
 1043|      0|  return 0;
 1044|    181|}
scale.cc:_ZN6libyuvL18ScaleAddCols2_16_CEiiiiPKjPt:
  829|  6.96k|                               uint16_t* dst_ptr) {
  830|  6.96k|  int i;
  831|  6.96k|  int scaletbl[2];
  832|  6.96k|  int minboxwidth = dx >> 16;
  833|  6.96k|  int boxwidth;
  834|  6.96k|  scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
  ------------------
  |  |  778|  6.96k|#define MIN1(x) ((x) < 1 ? 1 : (x))
  |  |  ------------------
  |  |  |  Branch (778:18): [True: 0, False: 6.96k]
  |  |  ------------------
  ------------------
  835|  6.96k|  scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
  ------------------
  |  |  778|  6.96k|#define MIN1(x) ((x) < 1 ? 1 : (x))
  |  |  ------------------
  |  |  |  Branch (778:18): [True: 0, False: 6.96k]
  |  |  ------------------
  ------------------
  836|   481k|  for (i = 0; i < dst_width; ++i) {
  ------------------
  |  Branch (836:15): [True: 474k, False: 6.96k]
  ------------------
  837|   474k|    int ix = x >> 16;
  838|   474k|    x += dx;
  839|   474k|    boxwidth = MIN1((x >> 16) - ix);
  ------------------
  |  |  778|   474k|#define MIN1(x) ((x) < 1 ? 1 : (x))
  |  |  ------------------
  |  |  |  Branch (778:18): [True: 0, False: 474k]
  |  |  ------------------
  ------------------
  840|   474k|    int scaletbl_index = boxwidth - minboxwidth;
  841|       |    assert((scaletbl_index == 0) || (scaletbl_index == 1));
  842|   474k|    *dst_ptr++ =
  843|   474k|        SumPixels_16(boxwidth, src_ptr + ix) * scaletbl[scaletbl_index] >> 16;
  844|   474k|  }
  845|  6.96k|}
scale.cc:_ZN6libyuvL12SumPixels_16EiPKj:
  790|   556k|static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t* src_ptr) {
  791|   556k|  uint32_t sum = 0u;
  792|   556k|  int x;
  793|   556k|  assert(iboxwidth > 0);
  794|  2.50M|  for (x = 0; x < iboxwidth; ++x) {
  ------------------
  |  Branch (794:15): [True: 1.94M, False: 556k]
  ------------------
  795|  1.94M|    sum += src_ptr[x];
  796|  1.94M|  }
  797|   556k|  return sum;
  798|   556k|}
scale.cc:_ZN6libyuvL18ScaleAddCols1_16_CEiiiiPKjPt:
  883|  1.64k|                               uint16_t* dst_ptr) {
  884|  1.64k|  int boxwidth = MIN1(dx >> 16);
  ------------------
  |  |  778|  1.64k|#define MIN1(x) ((x) < 1 ? 1 : (x))
  |  |  ------------------
  |  |  |  Branch (778:18): [True: 0, False: 1.64k]
  |  |  ------------------
  ------------------
  885|  1.64k|  int scaleval = 65536 / (boxwidth * boxheight);
  886|  1.64k|  int i;
  887|  83.5k|  for (i = 0; i < dst_width; ++i) {
  ------------------
  |  Branch (887:15): [True: 81.8k, False: 1.64k]
  ------------------
  888|  81.8k|    *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16;
  889|  81.8k|    x += boxwidth;
  890|  81.8k|  }
  891|  1.64k|}
scale.cc:_ZN6libyuvL23ScalePlaneBilinearUp_16EiiiiiiPKtPtNS_10FilterModeE:
 1764|  1.81k|                                   enum FilterMode filtering) {
 1765|  1.81k|  int j;
 1766|       |  // Initial source x/y coordinate and step values as 16.16 fixed point.
 1767|  1.81k|  int x = 0;
 1768|  1.81k|  int y = 0;
 1769|  1.81k|  int dx = 0;
 1770|  1.81k|  int dy = 0;
 1771|  1.81k|  const int max_y = (src_height - 1) << 16;
 1772|  1.81k|  void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr,
 1773|  1.81k|                         ptrdiff_t src_stride, int dst_width,
 1774|  1.81k|                         int source_y_fraction) = InterpolateRow_16_C;
 1775|  1.81k|  void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr,
 1776|  1.81k|                          int dst_width, int x, int dx) =
 1777|  1.81k|      filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
  ------------------
  |  Branch (1777:7): [True: 1.81k, False: 0]
  ------------------
 1778|  1.81k|  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
 1779|  1.81k|             &dx, &dy);
 1780|  1.81k|  src_width = Abs(src_width);
 1781|       |
 1782|       |#if defined(HAS_INTERPOLATEROW_16_SSE2)
 1783|       |  if (TestCpuFlag(kCpuHasSSE2)) {
 1784|       |    InterpolateRow = InterpolateRow_16_Any_SSE2;
 1785|       |    if (IS_ALIGNED(dst_width, 16)) {
 1786|       |      InterpolateRow = InterpolateRow_16_SSE2;
 1787|       |    }
 1788|       |  }
 1789|       |#endif
 1790|       |#if defined(HAS_INTERPOLATEROW_16_SSSE3)
 1791|       |  if (TestCpuFlag(kCpuHasSSSE3)) {
 1792|       |    InterpolateRow = InterpolateRow_16_Any_SSSE3;
 1793|       |    if (IS_ALIGNED(dst_width, 16)) {
 1794|       |      InterpolateRow = InterpolateRow_16_SSSE3;
 1795|       |    }
 1796|       |  }
 1797|       |#endif
 1798|       |#if defined(HAS_INTERPOLATEROW_16_AVX2)
 1799|       |  if (TestCpuFlag(kCpuHasAVX2)) {
 1800|       |    InterpolateRow = InterpolateRow_16_Any_AVX2;
 1801|       |    if (IS_ALIGNED(dst_width, 32)) {
 1802|       |      InterpolateRow = InterpolateRow_16_AVX2;
 1803|       |    }
 1804|       |  }
 1805|       |#endif
 1806|       |#if defined(HAS_INTERPOLATEROW_16_NEON)
 1807|       |  if (TestCpuFlag(kCpuHasNEON)) {
 1808|       |    InterpolateRow = InterpolateRow_16_Any_NEON;
 1809|       |    if (IS_ALIGNED(dst_width, 16)) {
 1810|       |      InterpolateRow = InterpolateRow_16_NEON;
 1811|       |    }
 1812|       |  }
 1813|       |#endif
 1814|       |#if defined(HAS_INTERPOLATEROW_16_SME)
 1815|       |  if (TestCpuFlag(kCpuHasSME)) {
 1816|       |    InterpolateRow = InterpolateRow_16_SME;
 1817|       |  }
 1818|       |#endif
 1819|       |
 1820|  1.81k|  if (filtering && src_width >= 32768) {
  ------------------
  |  Branch (1820:7): [True: 1.81k, False: 0]
  |  Branch (1820:20): [True: 0, False: 1.81k]
  ------------------
 1821|      0|    ScaleFilterCols = ScaleFilterCols64_16_C;
 1822|      0|  }
 1823|       |#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
 1824|       |  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
 1825|       |    ScaleFilterCols = ScaleFilterCols_16_SSSE3;
 1826|       |  }
 1827|       |#endif
 1828|  1.81k|  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
  ------------------
  |  Branch (1828:7): [True: 0, False: 1.81k]
  |  Branch (1828:21): [True: 0, False: 0]
  |  Branch (1828:51): [True: 0, False: 0]
  ------------------
 1829|      0|    ScaleFilterCols = ScaleColsUp2_16_C;
 1830|       |#if defined(HAS_SCALECOLS_16_SSE2)
 1831|       |    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
 1832|       |      ScaleFilterCols = ScaleColsUp2_16_SSE2;
 1833|       |    }
 1834|       |#endif
 1835|      0|  }
 1836|  1.81k|  if (y > max_y) {
  ------------------
  |  Branch (1836:7): [True: 519, False: 1.29k]
  ------------------
 1837|    519|    y = max_y;
 1838|    519|  }
 1839|  1.81k|  {
 1840|  1.81k|    int yi = y >> 16;
 1841|  1.81k|    const uint16_t* src = src_ptr + yi * (int64_t)src_stride;
 1842|       |
 1843|       |    // Allocate 2 row buffers.
 1844|  1.81k|    const int row_size = (dst_width + 31) & ~31;
 1845|  1.81k|    align_buffer_64(row, row_size * 4);
  ------------------
  |  | 1002|  1.81k|  void* var##_mem = malloc((size) + 63);                      /* NOLINT */ \
  |  | 1003|  1.81k|  uint8_t* var = (uint8_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */
  ------------------
 1846|  1.81k|    int rowstride = row_size;
 1847|  1.81k|    int lasty = yi;
 1848|  1.81k|    uint16_t* rowptr = (uint16_t*)row;
 1849|  1.81k|    if (!row)
  ------------------
  |  Branch (1849:9): [True: 0, False: 1.81k]
  ------------------
 1850|      0|      return 1;
 1851|       |
 1852|  1.81k|    ScaleFilterCols(rowptr, src, dst_width, x, dx);
 1853|  1.81k|    if (src_height > 1) {
  ------------------
  |  Branch (1853:9): [True: 1.29k, False: 519]
  ------------------
 1854|  1.29k|      src += src_stride;
 1855|  1.29k|    }
 1856|  1.81k|    ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
 1857|  1.81k|    if (src_height > 2) {
  ------------------
  |  Branch (1857:9): [True: 1.07k, False: 742]
  ------------------
 1858|  1.07k|      src += src_stride;
 1859|  1.07k|    }
 1860|       |
 1861|   117k|    for (j = 0; j < dst_height; ++j) {
  ------------------
  |  Branch (1861:17): [True: 115k, False: 1.81k]
  ------------------
 1862|   115k|      yi = y >> 16;
 1863|   115k|      if (yi != lasty) {
  ------------------
  |  Branch (1863:11): [True: 32.0k, False: 83.6k]
  ------------------
 1864|  32.0k|        if (y > max_y) {
  ------------------
  |  Branch (1864:13): [True: 0, False: 32.0k]
  ------------------
 1865|      0|          y = max_y;
 1866|      0|          yi = y >> 16;
 1867|      0|          src = src_ptr + yi * (int64_t)src_stride;
 1868|      0|        }
 1869|  32.0k|        if (yi != lasty) {
  ------------------
  |  Branch (1869:13): [True: 32.0k, False: 0]
  ------------------
 1870|  32.0k|          ScaleFilterCols(rowptr, src, dst_width, x, dx);
 1871|  32.0k|          rowptr += rowstride;
 1872|  32.0k|          rowstride = -rowstride;
 1873|  32.0k|          lasty = yi;
 1874|  32.0k|          if ((y + 65536) < max_y) {
  ------------------
  |  Branch (1874:15): [True: 31.0k, False: 1.07k]
  ------------------
 1875|  31.0k|            src += src_stride;
 1876|  31.0k|          }
 1877|  32.0k|        }
 1878|  32.0k|      }
 1879|   115k|      if (filtering == kFilterLinear) {
  ------------------
  |  Branch (1879:11): [True: 11.9k, False: 103k]
  ------------------
 1880|  11.9k|        InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
 1881|   103k|      } else {
 1882|   103k|        int yf = (y >> 8) & 255;
 1883|   103k|        InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
 1884|   103k|      }
 1885|   115k|      dst_ptr += dst_stride;
 1886|   115k|      y += dy;
 1887|   115k|    }
 1888|  1.81k|    free_aligned_buffer_64(row);
  ------------------
  |  | 1006|  1.81k|  free(var##_mem);                  \
  |  | 1007|  1.81k|  var = NULL
  ------------------
 1889|  1.81k|  }
 1890|      0|  return 0;
 1891|  1.81k|}
scale.cc:_ZN6libyuvL25ScalePlaneBilinearDown_16EiiiiiiPKtPtNS_10FilterModeE:
 1175|  1.54k|                                     enum FilterMode filtering) {
 1176|       |  // Initial source x/y coordinate and step values as 16.16 fixed point.
 1177|  1.54k|  int x = 0;
 1178|  1.54k|  int y = 0;
 1179|  1.54k|  int dx = 0;
 1180|  1.54k|  int dy = 0;
 1181|       |  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
 1182|       |  // Allocate a row buffer.
 1183|  1.54k|  align_buffer_64(row, src_width * 2);
  ------------------
  |  | 1002|  1.54k|  void* var##_mem = malloc((size) + 63);                      /* NOLINT */ \
  |  | 1003|  1.54k|  uint8_t* var = (uint8_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */
  ------------------
 1184|  1.54k|  if (!row)
  ------------------
  |  Branch (1184:7): [True: 0, False: 1.54k]
  ------------------
 1185|      0|    return 1;
 1186|       |
 1187|  1.54k|  const int max_y = (src_height - 1) << 16;
 1188|  1.54k|  int j;
 1189|  1.54k|  void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr,
 1190|  1.54k|                          int dst_width, int x, int dx) =
 1191|  1.54k|      (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
  ------------------
  |  Branch (1191:7): [True: 0, False: 1.54k]
  ------------------
 1192|  1.54k|  void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr,
 1193|  1.54k|                         ptrdiff_t src_stride, int dst_width,
 1194|  1.54k|                         int source_y_fraction) = InterpolateRow_16_C;
 1195|  1.54k|  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
 1196|  1.54k|             &dx, &dy);
 1197|  1.54k|  src_width = Abs(src_width);
 1198|       |
 1199|       |#if defined(HAS_INTERPOLATEROW_16_SSE2)
 1200|       |  if (TestCpuFlag(kCpuHasSSE2)) {
 1201|       |    InterpolateRow = InterpolateRow_16_Any_SSE2;
 1202|       |    if (IS_ALIGNED(src_width, 16)) {
 1203|       |      InterpolateRow = InterpolateRow_16_SSE2;
 1204|       |    }
 1205|       |  }
 1206|       |#endif
 1207|       |#if defined(HAS_INTERPOLATEROW_16_SSSE3)
 1208|       |  if (TestCpuFlag(kCpuHasSSSE3)) {
 1209|       |    InterpolateRow = InterpolateRow_16_Any_SSSE3;
 1210|       |    if (IS_ALIGNED(src_width, 16)) {
 1211|       |      InterpolateRow = InterpolateRow_16_SSSE3;
 1212|       |    }
 1213|       |  }
 1214|       |#endif
 1215|       |#if defined(HAS_INTERPOLATEROW_16_AVX2)
 1216|       |  if (TestCpuFlag(kCpuHasAVX2)) {
 1217|       |    InterpolateRow = InterpolateRow_16_Any_AVX2;
 1218|       |    if (IS_ALIGNED(src_width, 32)) {
 1219|       |      InterpolateRow = InterpolateRow_16_AVX2;
 1220|       |    }
 1221|       |  }
 1222|       |#endif
 1223|       |#if defined(HAS_INTERPOLATEROW_16_NEON)
 1224|       |  if (TestCpuFlag(kCpuHasNEON)) {
 1225|       |    InterpolateRow = InterpolateRow_16_Any_NEON;
 1226|       |    if (IS_ALIGNED(src_width, 16)) {
 1227|       |      InterpolateRow = InterpolateRow_16_NEON;
 1228|       |    }
 1229|       |  }
 1230|       |#endif
 1231|       |#if defined(HAS_INTERPOLATEROW_16_SME)
 1232|       |  if (TestCpuFlag(kCpuHasSME)) {
 1233|       |    InterpolateRow = InterpolateRow_16_SME;
 1234|       |  }
 1235|       |#endif
 1236|       |
 1237|       |#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
 1238|       |  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
 1239|       |    ScaleFilterCols = ScaleFilterCols_16_SSSE3;
 1240|       |  }
 1241|       |#endif
 1242|  1.54k|  if (y > max_y) {
  ------------------
  |  Branch (1242:7): [True: 10, False: 1.53k]
  ------------------
 1243|     10|    y = max_y;
 1244|     10|  }
 1245|       |
 1246|   143k|  for (j = 0; j < dst_height; ++j) {
  ------------------
  |  Branch (1246:15): [True: 141k, False: 1.54k]
  ------------------
 1247|   141k|    int yi = y >> 16;
 1248|   141k|    const uint16_t* src = src_ptr + yi * (int64_t)src_stride;
 1249|   141k|    if (filtering == kFilterLinear) {
  ------------------
  |  Branch (1249:9): [True: 83.4k, False: 58.4k]
  ------------------
 1250|  83.4k|      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
 1251|  83.4k|    } else {
 1252|  58.4k|      int yf = (y >> 8) & 255;
 1253|  58.4k|      InterpolateRow((uint16_t*)row, src, src_stride, src_width, yf);
 1254|  58.4k|      ScaleFilterCols(dst_ptr, (uint16_t*)row, dst_width, x, dx);
 1255|  58.4k|    }
 1256|   141k|    dst_ptr += dst_stride;
 1257|   141k|    y += dy;
 1258|   141k|    if (y > max_y) {
  ------------------
  |  Branch (1258:9): [True: 2.21k, False: 139k]
  ------------------
 1259|  2.21k|      y = max_y;
 1260|  2.21k|    }
 1261|   141k|  }
 1262|       |  free_aligned_buffer_64(row);
  ------------------
  |  | 1006|  1.54k|  free(var##_mem);                  \
  |  | 1007|  1.54k|  var = NULL
  ------------------
 1263|  1.54k|  return 0;
 1264|  1.54k|}
scale.cc:_ZN6libyuvL19ScalePlaneSimple_16EiiiiiiPKtPt:
 1942|    464|                                uint16_t* dst_ptr) {
 1943|    464|  int i;
 1944|    464|  void (*ScaleCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width,
 1945|    464|                    int x, int dx) = ScaleCols_16_C;
 1946|       |  // Initial source x/y coordinate and step values as 16.16 fixed point.
 1947|    464|  int x = 0;
 1948|    464|  int y = 0;
 1949|    464|  int dx = 0;
 1950|    464|  int dy = 0;
 1951|    464|  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
 1952|    464|             &dx, &dy);
 1953|    464|  src_width = Abs(src_width);
 1954|       |
 1955|    464|  if (src_width * 2 == dst_width && x < 0x8000) {
  ------------------
  |  Branch (1955:7): [True: 26, False: 438]
  |  Branch (1955:37): [True: 26, False: 0]
  ------------------
 1956|     26|    ScaleCols = ScaleColsUp2_16_C;
 1957|       |#if defined(HAS_SCALECOLS_16_SSE2)
 1958|       |    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
 1959|       |      ScaleCols = ScaleColsUp2_16_SSE2;
 1960|       |    }
 1961|       |#endif
 1962|     26|  }
 1963|       |
 1964|  11.2k|  for (i = 0; i < dst_height; ++i) {
  ------------------
  |  Branch (1964:15): [True: 10.7k, False: 464]
  ------------------
 1965|  10.7k|    ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x,
 1966|  10.7k|              dx);
 1967|  10.7k|    dst_ptr += dst_stride;
 1968|  10.7k|    y += dy;
 1969|  10.7k|  }
 1970|    464|}
scale.cc:_ZN6libyuvL23ScalePlaneUp2_12_LinearEiiiiiiPKtPt:
 1562|     63|                                    uint16_t* dst_ptr) {
 1563|     63|  void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
 1564|     63|                     int dst_width) = ScaleRowUp2_Linear_16_Any_C;
 1565|     63|  int i;
 1566|     63|  int y;
 1567|     63|  int dy;
 1568|       |
 1569|     63|  (void)src_width;
 1570|       |  // This function can only scale up by 2 times horizontally.
 1571|     63|  assert(src_width == ((dst_width + 1) / 2));
 1572|       |
 1573|     63|#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3
 1574|     63|  if (TestCpuFlag(kCpuHasSSSE3)) {
  ------------------
  |  Branch (1574:7): [True: 63, False: 0]
  ------------------
 1575|     63|    ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3;
 1576|     63|  }
 1577|     63|#endif
 1578|       |
 1579|     63|#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
 1580|     63|  if (TestCpuFlag(kCpuHasAVX2)) {
  ------------------
  |  Branch (1580:7): [True: 63, False: 0]
  ------------------
 1581|     63|    ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2;
 1582|     63|  }
 1583|     63|#endif
 1584|       |
 1585|       |#ifdef HAS_SCALEROWUP2_LINEAR_12_NEON
 1586|       |  if (TestCpuFlag(kCpuHasNEON)) {
 1587|       |    ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON;
 1588|       |  }
 1589|       |#endif
 1590|       |
 1591|     63|  if (dst_height == 1) {
  ------------------
  |  Branch (1591:7): [True: 6, False: 57]
  ------------------
 1592|      6|    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
 1593|      6|               dst_width);
 1594|     57|  } else {
 1595|     57|    dy = FixedDiv(src_height - 1, dst_height - 1);
  ------------------
  |  |  265|     57|#define FixedDiv FixedDiv_X86
  ------------------
 1596|     57|    y = (1 << 15) - 1;
 1597|  1.76k|    for (i = 0; i < dst_height; ++i) {
  ------------------
  |  Branch (1597:17): [True: 1.70k, False: 57]
  ------------------
 1598|  1.70k|      ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
 1599|  1.70k|      dst_ptr += dst_stride;
 1600|  1.70k|      y += dy;
 1601|  1.70k|    }
 1602|     57|  }
 1603|     63|}
scale.cc:_ZN6libyuvL25ScalePlaneUp2_12_BilinearEiiiiiiPKtPt:
 1617|     35|                                      uint16_t* dst_ptr) {
 1618|     35|  void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
 1619|     35|                      uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
 1620|     35|      ScaleRowUp2_Bilinear_16_Any_C;
 1621|     35|  int x;
 1622|       |
 1623|     35|  (void)src_width;
 1624|       |  // This function can only scale up by 2 times.
 1625|     35|  assert(src_width == ((dst_width + 1) / 2));
 1626|     35|  assert(src_height == ((dst_height + 1) / 2));
 1627|       |
 1628|     35|#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3
 1629|     35|  if (TestCpuFlag(kCpuHasSSSE3)) {
  ------------------
  |  Branch (1629:7): [True: 35, False: 0]
  ------------------
 1630|     35|    Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3;
 1631|     35|  }
 1632|     35|#endif
 1633|       |
 1634|     35|#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
 1635|     35|  if (TestCpuFlag(kCpuHasAVX2)) {
  ------------------
  |  Branch (1635:7): [True: 35, False: 0]
  ------------------
 1636|     35|    Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2;
 1637|     35|  }
 1638|     35|#endif
 1639|       |
 1640|       |#ifdef HAS_SCALEROWUP2_BILINEAR_12_NEON
 1641|       |  if (TestCpuFlag(kCpuHasNEON)) {
 1642|       |    Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON;
 1643|       |  }
 1644|       |#endif
 1645|       |
 1646|     35|  Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
 1647|     35|  dst_ptr += dst_stride;
 1648|    332|  for (x = 0; x < src_height - 1; ++x) {
  ------------------
  |  Branch (1648:15): [True: 297, False: 35]
  ------------------
 1649|    297|    Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
 1650|    297|    src_ptr += src_stride;
 1651|    297|    dst_ptr += 2 * dst_stride;
 1652|    297|  }
 1653|     35|  if (!(dst_height & 1)) {
  ------------------
  |  Branch (1653:7): [True: 12, False: 23]
  ------------------
 1654|     12|    Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
 1655|     12|  }
 1656|     35|}

ScaleRowDown2Box_Any_AVX2:
   27|    308|               int dst_width) {                                                \
   28|    308|    int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */          \
   29|    308|    int n = dst_width - r;                                                     \
   30|    308|    if (n > 0) {                                                               \
  ------------------
  |  Branch (30:9): [True: 300, False: 8]
  ------------------
   31|    300|      SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                      \
   32|    300|    }                                                                          \
   33|    308|    SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                   \
   34|    308|                   dst_ptr + n * BPP, r);                                      \
   35|    308|  }
ScaleAddRow_Any_AVX2:
  505|  41.8k|  void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \
  506|  41.8k|    int n = src_width & ~MASK;                                             \
  507|  41.8k|    if (n > 0) {                                                           \
  ------------------
  |  Branch (507:9): [True: 41.2k, False: 654]
  ------------------
  508|  41.2k|      SCALEADDROW_SIMD(src_ptr, dst_ptr, n);                               \
  509|  41.2k|    }                                                                      \
  510|  41.8k|    SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK);             \
  511|  41.8k|  }
ScaleRowUp2_Linear_Any_AVX2:
  571|  2.20k|  void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \
  572|  2.20k|    int work_width = (dst_width - 1) & ~1;                         \
  573|  2.20k|    int r = work_width & MASK;                                     \
  574|  2.20k|    int n = work_width & ~MASK;                                    \
  575|  2.20k|    dst_ptr[0] = src_ptr[0];                                       \
  576|  2.20k|    if (work_width > 0) {                                          \
  ------------------
  |  Branch (576:9): [True: 2.20k, False: 0]
  ------------------
  577|  2.20k|      if (n != 0) {                                                \
  ------------------
  |  Branch (577:11): [True: 1.76k, False: 444]
  ------------------
  578|  1.76k|        SIMD(src_ptr, dst_ptr + 1, n);                             \
  579|  1.76k|      }                                                            \
  580|  2.20k|      C(src_ptr + (n / 2), dst_ptr + n + 1, r);                    \
  581|  2.20k|    }                                                              \
  582|  2.20k|    dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2];         \
  583|  2.20k|  }
ScaleRowUp2_Linear_12_Any_AVX2:
  571|  1.71k|  void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \
  572|  1.71k|    int work_width = (dst_width - 1) & ~1;                         \
  573|  1.71k|    int r = work_width & MASK;                                     \
  574|  1.71k|    int n = work_width & ~MASK;                                    \
  575|  1.71k|    dst_ptr[0] = src_ptr[0];                                       \
  576|  1.71k|    if (work_width > 0) {                                          \
  ------------------
  |  Branch (576:9): [True: 1.71k, False: 0]
  ------------------
  577|  1.71k|      if (n != 0) {                                                \
  ------------------
  |  Branch (577:11): [True: 1.30k, False: 410]
  ------------------
  578|  1.30k|        SIMD(src_ptr, dst_ptr + 1, n);                             \
  579|  1.30k|      }                                                            \
  580|  1.71k|      C(src_ptr + (n / 2), dst_ptr + n + 1, r);                    \
  581|  1.71k|    }                                                              \
  582|  1.71k|    dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2];         \
  583|  1.71k|  }
ScaleRowUp2_Bilinear_Any_AVX2:
  694|  1.08k|            ptrdiff_t dst_stride, int dst_width) {                        \
  695|  1.08k|    int work_width = (dst_width - 1) & ~1;                                \
  696|  1.08k|    int r = work_width & MASK;                                            \
  697|  1.08k|    int n = work_width & ~MASK;                                           \
  698|  1.08k|    const PTYPE* sa = src_ptr;                                            \
  699|  1.08k|    const PTYPE* sb = src_ptr + src_stride;                               \
  700|  1.08k|    PTYPE* da = dst_ptr;                                                  \
  701|  1.08k|    PTYPE* db = dst_ptr + dst_stride;                                     \
  702|  1.08k|    da[0] = (3 * sa[0] + sb[0] + 2) >> 2;                                 \
  703|  1.08k|    db[0] = (sa[0] + 3 * sb[0] + 2) >> 2;                                 \
  704|  1.08k|    if (work_width > 0) {                                                 \
  ------------------
  |  Branch (704:9): [True: 1.08k, False: 0]
  ------------------
  705|  1.08k|      if (n != 0) {                                                       \
  ------------------
  |  Branch (705:11): [True: 988, False: 94]
  ------------------
  706|    988|        SIMD(sa, sb - sa, da + 1, db - da, n);                            \
  707|    988|      }                                                                   \
  708|  1.08k|      C(sa + (n / 2), sb - sa, da + n + 1, db - da, r);                   \
  709|  1.08k|    }                                                                     \
  710|  1.08k|    da[dst_width - 1] =                                                   \
  711|  1.08k|        (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2; \
  712|  1.08k|    db[dst_width - 1] =                                                   \
  713|  1.08k|        (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2; \
  714|  1.08k|  }
ScaleRowUp2_Bilinear_12_Any_AVX2:
  694|    344|            ptrdiff_t dst_stride, int dst_width) {                        \
  695|    344|    int work_width = (dst_width - 1) & ~1;                                \
  696|    344|    int r = work_width & MASK;                                            \
  697|    344|    int n = work_width & ~MASK;                                           \
  698|    344|    const PTYPE* sa = src_ptr;                                            \
  699|    344|    const PTYPE* sb = src_ptr + src_stride;                               \
  700|    344|    PTYPE* da = dst_ptr;                                                  \
  701|    344|    PTYPE* db = dst_ptr + dst_stride;                                     \
  702|    344|    da[0] = (3 * sa[0] + sb[0] + 2) >> 2;                                 \
  703|    344|    db[0] = (sa[0] + 3 * sb[0] + 2) >> 2;                                 \
  704|    344|    if (work_width > 0) {                                                 \
  ------------------
  |  Branch (704:9): [True: 344, False: 0]
  ------------------
  705|    344|      if (n != 0) {                                                       \
  ------------------
  |  Branch (705:11): [True: 326, False: 18]
  ------------------
  706|    326|        SIMD(sa, sb - sa, da + 1, db - da, n);                            \
  707|    326|      }                                                                   \
  708|    344|      C(sa + (n / 2), sb - sa, da + n + 1, db - da, r);                   \
  709|    344|    }                                                                     \
  710|    344|    da[dst_width - 1] =                                                   \
  711|    344|        (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2; \
  712|    344|    db[dst_width - 1] =                                                   \
  713|    344|        (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2; \
  714|    344|  }

ScaleRowDown2Box_C:
  213|    308|                        int dst_width) {
  214|    308|  const uint8_t* s = src_ptr;
  215|    308|  const uint8_t* t = src_ptr + src_stride;
  216|    308|  int x;
  217|  3.00k|  for (x = 0; x < dst_width - 1; x += 2) {
  ------------------
  |  Branch (217:15): [True: 2.70k, False: 308]
  ------------------
  218|  2.70k|    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
  219|  2.70k|    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
  220|  2.70k|    dst += 2;
  221|  2.70k|    s += 4;
  222|  2.70k|    t += 4;
  223|  2.70k|  }
  224|    308|  if (dst_width & 1) {
  ------------------
  |  Branch (224:7): [True: 8, False: 300]
  ------------------
  225|      8|    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
  226|      8|  }
  227|    308|}
ScaleRowDown2Box_16_C:
  256|    408|                           int dst_width) {
  257|    408|  const uint16_t* s = src_ptr;
  258|    408|  const uint16_t* t = src_ptr + src_stride;
  259|    408|  int x;
  260|  10.4k|  for (x = 0; x < dst_width - 1; x += 2) {
  ------------------
  |  Branch (260:15): [True: 10.0k, False: 408]
  ------------------
  261|  10.0k|    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
  262|  10.0k|    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
  263|  10.0k|    dst += 2;
  264|  10.0k|    s += 4;
  265|  10.0k|    t += 4;
  266|  10.0k|  }
  267|    408|  if (dst_width & 1) {
  ------------------
  |  Branch (267:7): [True: 8, False: 400]
  ------------------
  268|      8|    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
  269|      8|  }
  270|    408|}
ScaleRowUp2_Linear_C:
  574|  2.20k|                          int dst_width) {
  575|  2.20k|  int src_width = dst_width >> 1;
  576|  2.20k|  int x;
  577|  2.20k|  assert((dst_width % 2 == 0) && (dst_width >= 0));
  578|  28.2k|  for (x = 0; x < src_width; ++x) {
  ------------------
  |  Branch (578:15): [True: 26.0k, False: 2.20k]
  ------------------
  579|  26.0k|    dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2;
  580|  26.0k|    dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2;
  581|  26.0k|  }
  582|  2.20k|}
ScaleRowUp2_Bilinear_C:
  598|  1.08k|                            int dst_width) {
  599|  1.08k|  const uint8_t* s = src_ptr;
  600|  1.08k|  const uint8_t* t = src_ptr + src_stride;
  601|  1.08k|  uint8_t* d = dst_ptr;
  602|  1.08k|  uint8_t* e = dst_ptr + dst_stride;
  603|  1.08k|  int src_width = dst_width >> 1;
  604|  1.08k|  int x;
  605|  1.08k|  assert((dst_width % 2 == 0) && (dst_width >= 0));
  606|  10.0k|  for (x = 0; x < src_width; ++x) {
  ------------------
  |  Branch (606:15): [True: 8.92k, False: 1.08k]
  ------------------
  607|  8.92k|    d[2 * x + 0] =
  608|  8.92k|        (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4;
  609|  8.92k|    d[2 * x + 1] =
  610|  8.92k|        (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4;
  611|  8.92k|    e[2 * x + 0] =
  612|  8.92k|        (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4;
  613|  8.92k|    e[2 * x + 1] =
  614|  8.92k|        (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4;
  615|  8.92k|  }
  616|  1.08k|}
ScaleRowUp2_Linear_16_C:
  621|  1.71k|                             int dst_width) {
  622|  1.71k|  int src_width = dst_width >> 1;
  623|  1.71k|  int x;
  624|  1.71k|  assert((dst_width % 2 == 0) && (dst_width >= 0));
  625|  11.0k|  for (x = 0; x < src_width; ++x) {
  ------------------
  |  Branch (625:15): [True: 9.32k, False: 1.71k]
  ------------------
  626|  9.32k|    dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2;
  627|  9.32k|    dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2;
  628|  9.32k|  }
  629|  1.71k|}
ScaleRowUp2_Bilinear_16_C:
  636|    344|                               int dst_width) {
  637|    344|  const uint16_t* s = src_ptr;
  638|    344|  const uint16_t* t = src_ptr + src_stride;
  639|    344|  uint16_t* d = dst_ptr;
  640|    344|  uint16_t* e = dst_ptr + dst_stride;
  641|    344|  int src_width = dst_width >> 1;
  642|    344|  int x;
  643|    344|  assert((dst_width % 2 == 0) && (dst_width >= 0));
  644|  1.76k|  for (x = 0; x < src_width; ++x) {
  ------------------
  |  Branch (644:15): [True: 1.41k, False: 344]
  ------------------
  645|  1.41k|    d[2 * x + 0] =
  646|  1.41k|        (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4;
  647|  1.41k|    d[2 * x + 1] =
  648|  1.41k|        (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4;
  649|  1.41k|    e[2 * x + 0] =
  650|  1.41k|        (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4;
  651|  1.41k|    e[2 * x + 1] =
  652|  1.41k|        (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4;
  653|  1.41k|  }
  654|    344|}
ScaleCols_C:
  661|  10.3k|                 int dx) {
  662|  10.3k|  int j;
  663|   255k|  for (j = 0; j < dst_width - 1; j += 2) {
  ------------------
  |  Branch (663:15): [True: 244k, False: 10.3k]
  ------------------
  664|   244k|    dst_ptr[0] = src_ptr[x >> 16];
  665|   244k|    x += dx;
  666|   244k|    dst_ptr[1] = src_ptr[x >> 16];
  667|   244k|    x += dx;
  668|   244k|    dst_ptr += 2;
  669|   244k|  }
  670|  10.3k|  if (dst_width & 1) {
  ------------------
  |  Branch (670:7): [True: 6.79k, False: 3.58k]
  ------------------
  671|  6.79k|    dst_ptr[0] = src_ptr[x >> 16];
  672|  6.79k|  }
  673|  10.3k|}
ScaleCols_16_C:
  679|  10.6k|                    int dx) {
  680|  10.6k|  int j;
  681|   362k|  for (j = 0; j < dst_width - 1; j += 2) {
  ------------------
  |  Branch (681:15): [True: 351k, False: 10.6k]
  ------------------
  682|   351k|    dst_ptr[0] = src_ptr[x >> 16];
  683|   351k|    x += dx;
  684|   351k|    dst_ptr[1] = src_ptr[x >> 16];
  685|   351k|    x += dx;
  686|   351k|    dst_ptr += 2;
  687|   351k|  }
  688|  10.6k|  if (dst_width & 1) {
  ------------------
  |  Branch (688:7): [True: 2.97k, False: 7.68k]
  ------------------
  689|  2.97k|    dst_ptr[0] = src_ptr[x >> 16];
  690|  2.97k|  }
  691|  10.6k|}
ScaleColsUp2_C:
  698|    364|                    int dx) {
  699|    364|  int j;
  700|    364|  (void)x;
  701|    364|  (void)dx;
  702|    728|  for (j = 0; j < dst_width - 1; j += 2) {
  ------------------
  |  Branch (702:15): [True: 364, False: 364]
  ------------------
  703|    364|    dst_ptr[1] = dst_ptr[0] = src_ptr[0];
  704|    364|    src_ptr += 1;
  705|    364|    dst_ptr += 2;
  706|    364|  }
  707|    364|  if (dst_width & 1) {
  ------------------
  |  Branch (707:7): [True: 0, False: 364]
  ------------------
  708|      0|    dst_ptr[0] = src_ptr[0];
  709|      0|  }
  710|    364|}
ScaleColsUp2_16_C:
  716|     90|                       int dx) {
  717|     90|  int j;
  718|     90|  (void)x;
  719|     90|  (void)dx;
  720|    180|  for (j = 0; j < dst_width - 1; j += 2) {
  ------------------
  |  Branch (720:15): [True: 90, False: 90]
  ------------------
  721|     90|    dst_ptr[1] = dst_ptr[0] = src_ptr[0];
  722|     90|    src_ptr += 1;
  723|     90|    dst_ptr += 2;
  724|     90|  }
  725|     90|  if (dst_width & 1) {
  ------------------
  |  Branch (725:7): [True: 0, False: 90]
  ------------------
  726|      0|    dst_ptr[0] = src_ptr[0];
  727|      0|  }
  728|     90|}
ScaleFilterCols_16_C:
  806|   177k|                          int dx) {
  807|   177k|  int j;
  808|  11.1M|  for (j = 0; j < dst_width - 1; j += 2) {
  ------------------
  |  Branch (808:15): [True: 10.9M, False: 177k]
  ------------------
  809|  10.9M|    int xi = x >> 16;
  810|  10.9M|    int a = src_ptr[xi];
  811|  10.9M|    int b = src_ptr[xi + 1];
  812|  10.9M|    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
  ------------------
  |  |  798|  10.9M|  (uint16_t)(            \
  |  |  799|  10.9M|      (int)(a) +         \
  |  |  800|  10.9M|      (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16))
  ------------------
  813|  10.9M|    x += dx;
  814|  10.9M|    xi = x >> 16;
  815|  10.9M|    a = src_ptr[xi];
  816|  10.9M|    b = src_ptr[xi + 1];
  817|  10.9M|    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
  ------------------
  |  |  798|  10.9M|  (uint16_t)(            \
  |  |  799|  10.9M|      (int)(a) +         \
  |  |  800|  10.9M|      (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16))
  ------------------
  818|  10.9M|    x += dx;
  819|  10.9M|    dst_ptr += 2;
  820|  10.9M|  }
  821|   177k|  if (dst_width & 1) {
  ------------------
  |  Branch (821:7): [True: 32.3k, False: 145k]
  ------------------
  822|  32.3k|    int xi = x >> 16;
  823|  32.3k|    int a = src_ptr[xi];
  824|  32.3k|    int b = src_ptr[xi + 1];
  825|  32.3k|    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
  ------------------
  |  |  798|  32.3k|  (uint16_t)(            \
  |  |  799|  32.3k|      (int)(a) +         \
  |  |  800|  32.3k|      (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16))
  ------------------
  826|  32.3k|  }
  827|   177k|}
ScaleAddRow_C:
 1002|  41.8k|void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
 1003|  41.8k|  int x;
 1004|  41.8k|  assert(src_width > 0);
 1005|   366k|  for (x = 0; x < src_width - 1; x += 2) {
  ------------------
  |  Branch (1005:15): [True: 324k, False: 41.8k]
  ------------------
 1006|   324k|    dst_ptr[0] += src_ptr[0];
 1007|   324k|    dst_ptr[1] += src_ptr[1];
 1008|   324k|    src_ptr += 2;
 1009|   324k|    dst_ptr += 2;
 1010|   324k|  }
 1011|  41.8k|  if (src_width & 1) {
  ------------------
  |  Branch (1011:7): [True: 26.4k, False: 15.4k]
  ------------------
 1012|  26.4k|    dst_ptr[0] += src_ptr[0];
 1013|  26.4k|  }
 1014|  41.8k|}
ScaleAddRow_16_C:
 1018|   100k|                      int src_width) {
 1019|   100k|  int x;
 1020|   100k|  assert(src_width > 0);
 1021|  8.02M|  for (x = 0; x < src_width - 1; x += 2) {
  ------------------
  |  Branch (1021:15): [True: 7.92M, False: 100k]
  ------------------
 1022|  7.92M|    dst_ptr[0] += src_ptr[0];
 1023|  7.92M|    dst_ptr[1] += src_ptr[1];
 1024|  7.92M|    src_ptr += 2;
 1025|  7.92M|    dst_ptr += 2;
 1026|  7.92M|  }
 1027|   100k|  if (src_width & 1) {
  ------------------
  |  Branch (1027:7): [True: 19.4k, False: 81.1k]
  ------------------
 1028|  19.4k|    dst_ptr[0] += src_ptr[0];
 1029|  19.4k|  }
 1030|   100k|}
ScalePlaneVertical:
 1628|  1.25k|                        enum FilterMode filtering) {
 1629|       |  // TODO(fbarchard): Allow higher bpp.
 1630|  1.25k|  int dst_width_bytes = dst_width * bpp;
 1631|  1.25k|  void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
 1632|  1.25k|                         ptrdiff_t src_stride, int dst_width,
 1633|  1.25k|                         int source_y_fraction) = InterpolateRow_C;
 1634|  1.25k|  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
  ------------------
  |  Branch (1634:21): [True: 1.19k, False: 58]
  ------------------
 1635|  1.25k|  int j;
 1636|  1.25k|  assert(bpp >= 1 && bpp <= 4);
 1637|  1.25k|  assert(src_height != 0);
 1638|  1.25k|  assert(dst_width > 0);
 1639|  1.25k|  assert(dst_height > 0);
 1640|  1.25k|  src_argb += (x >> 16) * bpp;
 1641|  1.25k|#if defined(HAS_INTERPOLATEROW_SSSE3)
 1642|  1.25k|  if (TestCpuFlag(kCpuHasSSSE3)) {
  ------------------
  |  Branch (1642:7): [True: 1.25k, False: 0]
  ------------------
 1643|  1.25k|    InterpolateRow = InterpolateRow_Any_SSSE3;
 1644|  1.25k|    if (IS_ALIGNED(dst_width_bytes, 16)) {
  ------------------
  |  |  999|  1.25k|#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
  |  |  ------------------
  |  |  |  Branch (999:26): [True: 388, False: 862]
  |  |  ------------------
  ------------------
 1645|    388|      InterpolateRow = InterpolateRow_SSSE3;
 1646|    388|    }
 1647|  1.25k|  }
 1648|  1.25k|#endif
 1649|  1.25k|#if defined(HAS_INTERPOLATEROW_AVX2)
 1650|  1.25k|  if (TestCpuFlag(kCpuHasAVX2)) {
  ------------------
  |  Branch (1650:7): [True: 1.25k, False: 0]
  ------------------
 1651|  1.25k|    InterpolateRow = InterpolateRow_Any_AVX2;
 1652|  1.25k|    if (IS_ALIGNED(dst_width_bytes, 32)) {
  ------------------
  |  |  999|  1.25k|#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
  |  |  ------------------
  |  |  |  Branch (999:26): [True: 220, False: 1.03k]
  |  |  ------------------
  ------------------
 1653|    220|      InterpolateRow = InterpolateRow_AVX2;
 1654|    220|    }
 1655|  1.25k|  }
 1656|  1.25k|#endif
 1657|       |#if defined(HAS_INTERPOLATEROW_NEON)
 1658|       |  if (TestCpuFlag(kCpuHasNEON)) {
 1659|       |    InterpolateRow = InterpolateRow_Any_NEON;
 1660|       |    if (IS_ALIGNED(dst_width_bytes, 16)) {
 1661|       |      InterpolateRow = InterpolateRow_NEON;
 1662|       |    }
 1663|       |  }
 1664|       |#endif
 1665|       |#if defined(HAS_INTERPOLATEROW_SME)
 1666|       |  if (TestCpuFlag(kCpuHasSME)) {
 1667|       |    InterpolateRow = InterpolateRow_SME;
 1668|       |  }
 1669|       |#endif
 1670|       |#if defined(HAS_INTERPOLATEROW_LSX)
 1671|       |  if (TestCpuFlag(kCpuHasLSX)) {
 1672|       |    InterpolateRow = InterpolateRow_Any_LSX;
 1673|       |    if (IS_ALIGNED(dst_width_bytes, 32)) {
 1674|       |      InterpolateRow = InterpolateRow_LSX;
 1675|       |    }
 1676|       |  }
 1677|       |#endif
 1678|       |#if defined(HAS_INTERPOLATEROW_RVV)
 1679|       |  if (TestCpuFlag(kCpuHasRVV)) {
 1680|       |    InterpolateRow = InterpolateRow_RVV;
 1681|       |  }
 1682|       |#endif
 1683|       |
 1684|   119k|  for (j = 0; j < dst_height; ++j) {
  ------------------
  |  Branch (1684:15): [True: 118k, False: 1.25k]
  ------------------
 1685|   118k|    int yi;
 1686|   118k|    int yf;
 1687|   118k|    if (y > max_y) {
  ------------------
  |  Branch (1687:9): [True: 0, False: 118k]
  ------------------
 1688|      0|      y = max_y;
 1689|      0|    }
 1690|   118k|    yi = y >> 16;
 1691|   118k|    yf = filtering ? ((y >> 8) & 255) : 0;
  ------------------
  |  Branch (1691:10): [True: 116k, False: 2.07k]
  ------------------
 1692|   118k|    InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
 1693|   118k|                   dst_width_bytes, yf);
 1694|   118k|    dst_argb += dst_stride;
 1695|   118k|    y += dy;
 1696|   118k|  }
 1697|  1.25k|}
ScalePlaneVertical_16:
 1710|    907|                           enum FilterMode filtering) {
 1711|       |  // TODO(fbarchard): Allow higher wpp.
 1712|    907|  int dst_width_words = dst_width * wpp;
 1713|    907|  void (*InterpolateRow)(uint16_t* dst_argb, const uint16_t* src_argb,
 1714|    907|                         ptrdiff_t src_stride, int dst_width,
 1715|    907|                         int source_y_fraction) = InterpolateRow_16_C;
 1716|    907|  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
  ------------------
  |  Branch (1716:21): [True: 871, False: 36]
  ------------------
 1717|    907|  int j;
 1718|    907|  assert(wpp >= 1 && wpp <= 2);
 1719|    907|  assert(src_height != 0);
 1720|    907|  assert(dst_width > 0);
 1721|    907|  assert(dst_height > 0);
 1722|    907|  src_argb += (x >> 16) * wpp;
 1723|       |#if defined(HAS_INTERPOLATEROW_16_SSE2)
 1724|       |  if (TestCpuFlag(kCpuHasSSE2)) {
 1725|       |    InterpolateRow = InterpolateRow_16_Any_SSE2;
 1726|       |    if (IS_ALIGNED(dst_width_words, 16)) {
 1727|       |      InterpolateRow = InterpolateRow_16_SSE2;
 1728|       |    }
 1729|       |  }
 1730|       |#endif
 1731|       |#if defined(HAS_INTERPOLATEROW_16_SSSE3)
 1732|       |  if (TestCpuFlag(kCpuHasSSSE3)) {
 1733|       |    InterpolateRow = InterpolateRow_16_Any_SSSE3;
 1734|       |    if (IS_ALIGNED(dst_width_words, 16)) {
 1735|       |      InterpolateRow = InterpolateRow_16_SSSE3;
 1736|       |    }
 1737|       |  }
 1738|       |#endif
 1739|       |#if defined(HAS_INTERPOLATEROW_16_AVX2)
 1740|       |  if (TestCpuFlag(kCpuHasAVX2)) {
 1741|       |    InterpolateRow = InterpolateRow_16_Any_AVX2;
 1742|       |    if (IS_ALIGNED(dst_width_words, 32)) {
 1743|       |      InterpolateRow = InterpolateRow_16_AVX2;
 1744|       |    }
 1745|       |  }
 1746|       |#endif
 1747|       |#if defined(HAS_INTERPOLATEROW_16_NEON)
 1748|       |  if (TestCpuFlag(kCpuHasNEON)) {
 1749|       |    InterpolateRow = InterpolateRow_16_Any_NEON;
 1750|       |    if (IS_ALIGNED(dst_width_words, 8)) {
 1751|       |      InterpolateRow = InterpolateRow_16_NEON;
 1752|       |    }
 1753|       |  }
 1754|       |#endif
 1755|       |#if defined(HAS_INTERPOLATEROW_16_SME)
 1756|       |  if (TestCpuFlag(kCpuHasSME)) {
 1757|       |    InterpolateRow = InterpolateRow_16_SME;
 1758|       |  }
 1759|       |#endif
 1760|   103k|  for (j = 0; j < dst_height; ++j) {
  ------------------
  |  Branch (1760:15): [True: 103k, False: 907]
  ------------------
 1761|   103k|    int yi;
 1762|   103k|    int yf;
 1763|   103k|    if (y > max_y) {
  ------------------
  |  Branch (1763:9): [True: 0, False: 103k]
  ------------------
 1764|      0|      y = max_y;
 1765|      0|    }
 1766|   103k|    yi = y >> 16;
 1767|   103k|    yf = filtering ? ((y >> 8) & 255) : 0;
  ------------------
  |  Branch (1767:10): [True: 100k, False: 2.33k]
  ------------------
 1768|   103k|    InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
 1769|   103k|                   dst_width_words, yf);
 1770|   103k|    dst_argb += dst_stride;
 1771|   103k|    y += dy;
 1772|   103k|  }
 1773|    907|}
ScaleFilterReduce:
 1849|  16.8k|                                  enum FilterMode filtering) {
 1850|  16.8k|  if (src_width < 0) {
  ------------------
  |  Branch (1850:7): [True: 0, False: 16.8k]
  ------------------
 1851|      0|    src_width = -src_width;
 1852|      0|  }
 1853|  16.8k|  if (src_height < 0) {
  ------------------
  |  Branch (1853:7): [True: 0, False: 16.8k]
  ------------------
 1854|      0|    src_height = -src_height;
 1855|      0|  }
 1856|  16.8k|  if (filtering == kFilterBox) {
  ------------------
  |  Branch (1856:7): [True: 11.8k, False: 4.96k]
  ------------------
 1857|       |    // If scaling either axis to 0.5 or larger, switch from Box to Bilinear.
 1858|  11.8k|    if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) {
  ------------------
  |  Branch (1858:9): [True: 10.9k, False: 925]
  |  Branch (1858:39): [True: 393, False: 532]
  ------------------
 1859|  11.3k|      filtering = kFilterBilinear;
 1860|  11.3k|    }
 1861|  11.8k|  }
 1862|  16.8k|  if (filtering == kFilterBilinear) {
  ------------------
  |  Branch (1862:7): [True: 14.3k, False: 2.46k]
  ------------------
 1863|  14.3k|    if (src_height == 1) {
  ------------------
  |  Branch (1863:9): [True: 2.16k, False: 12.2k]
  ------------------
 1864|  2.16k|      filtering = kFilterLinear;
 1865|  2.16k|    }
 1866|       |    // TODO(fbarchard): Detect any odd scale factor and reduce to Linear.
 1867|  14.3k|    if (dst_height == src_height || dst_height * 3 == src_height) {
  ------------------
  |  Branch (1867:9): [True: 2.51k, False: 11.8k]
  |  Branch (1867:37): [True: 11, False: 11.8k]
  ------------------
 1868|  2.52k|      filtering = kFilterLinear;
 1869|  2.52k|    }
 1870|       |    // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to
 1871|       |    // avoid reading 2 pixels horizontally that causes memory exception.
 1872|  14.3k|    if (src_width == 1) {
  ------------------
  |  Branch (1872:9): [True: 1.23k, False: 13.1k]
  ------------------
 1873|  1.23k|      filtering = kFilterNone;
 1874|  1.23k|    }
 1875|  14.3k|  }
 1876|  16.8k|  if (filtering == kFilterLinear) {
  ------------------
  |  Branch (1876:7): [True: 4.88k, False: 11.9k]
  ------------------
 1877|  4.88k|    if (src_width == 1) {
  ------------------
  |  Branch (1877:9): [True: 0, False: 4.88k]
  ------------------
 1878|      0|      filtering = kFilterNone;
 1879|      0|    }
 1880|       |    // TODO(fbarchard): Detect any odd scale factor and reduce to None.
 1881|  4.88k|    if (dst_width == src_width || dst_width * 3 == src_width) {
  ------------------
  |  Branch (1881:9): [True: 378, False: 4.50k]
  |  Branch (1881:35): [True: 23, False: 4.47k]
  ------------------
 1882|    401|      filtering = kFilterNone;
 1883|    401|    }
 1884|  4.88k|  }
 1885|  16.8k|  return filtering;
 1886|  16.8k|}
ScaleSlope:
 1909|  8.91k|                int* dy) {
 1910|  8.91k|  assert(x != NULL);
 1911|  8.91k|  assert(y != NULL);
 1912|  8.91k|  assert(dx != NULL);
 1913|  8.91k|  assert(dy != NULL);
 1914|  8.91k|  assert(src_width != 0);
 1915|  8.91k|  assert(src_height != 0);
 1916|  8.91k|  assert(dst_width > 0);
 1917|  8.91k|  assert(dst_height > 0);
 1918|       |  // Check for 1 pixel and avoid FixedDiv overflow.
 1919|  8.91k|  if (dst_width == 1 && src_width >= 32768) {
  ------------------
  |  Branch (1919:7): [True: 155, False: 8.76k]
  |  Branch (1919:25): [True: 0, False: 155]
  ------------------
 1920|      0|    dst_width = src_width;
 1921|      0|  }
 1922|  8.91k|  if (dst_height == 1 && src_height >= 32768) {
  ------------------
  |  Branch (1922:7): [True: 164, False: 8.75k]
  |  Branch (1922:26): [True: 0, False: 164]
  ------------------
 1923|      0|    dst_height = src_height;
 1924|      0|  }
 1925|  8.91k|  if (filtering == kFilterBox) {
  ------------------
  |  Branch (1925:7): [True: 351, False: 8.56k]
  ------------------
 1926|       |    // Scale step for point sampling duplicates all pixels equally.
 1927|    351|    *dx = FixedDiv(Abs(src_width), dst_width);
  ------------------
  |  |  265|    351|#define FixedDiv FixedDiv_X86
  ------------------
 1928|    351|    *dy = FixedDiv(src_height, dst_height);
  ------------------
  |  |  265|    351|#define FixedDiv FixedDiv_X86
  ------------------
 1929|    351|    *x = 0;
 1930|    351|    *y = 0;
 1931|  8.56k|  } else if (filtering == kFilterBilinear) {
  ------------------
  |  Branch (1931:14): [True: 4.28k, False: 4.27k]
  ------------------
 1932|       |    // Scale step for bilinear sampling renders last pixel once for upsample.
 1933|  4.28k|    if (dst_width <= Abs(src_width)) {
  ------------------
  |  Branch (1933:9): [True: 700, False: 3.58k]
  ------------------
 1934|    700|      *dx = FixedDiv(Abs(src_width), dst_width);
  ------------------
  |  |  265|    700|#define FixedDiv FixedDiv_X86
  ------------------
 1935|    700|      *x = CENTERSTART(*dx, -32768);  // Subtract 0.5 (32768) to center filter.
  ------------------
  |  | 1898|    700|#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
  |  |  ------------------
  |  |  |  Branch (1898:28): [True: 0, False: 700]
  |  |  ------------------
  ------------------
 1936|  3.58k|    } else if (src_width > 1 && dst_width > 1) {
  ------------------
  |  Branch (1936:16): [True: 3.58k, False: 0]
  |  Branch (1936:33): [True: 3.58k, False: 0]
  ------------------
 1937|  3.58k|      *dx = FixedDiv1(Abs(src_width), dst_width);
  ------------------
  |  |  266|  3.58k|#define FixedDiv1 FixedDiv1_X86
  ------------------
 1938|  3.58k|      *x = 0;
 1939|  3.58k|    }
 1940|  4.28k|    if (dst_height <= src_height) {
  ------------------
  |  Branch (1940:9): [True: 1.92k, False: 2.36k]
  ------------------
 1941|  1.92k|      *dy = FixedDiv(src_height, dst_height);
  ------------------
  |  |  265|  1.92k|#define FixedDiv FixedDiv_X86
  ------------------
 1942|  1.92k|      *y = CENTERSTART(*dy, -32768);  // Subtract 0.5 (32768) to center filter.
  ------------------
  |  | 1898|  1.92k|#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
  |  |  ------------------
  |  |  |  Branch (1898:28): [True: 0, False: 1.92k]
  |  |  ------------------
  ------------------
 1943|  2.36k|    } else if (src_height > 1 && dst_height > 1) {
  ------------------
  |  Branch (1943:16): [True: 2.36k, False: 0]
  |  Branch (1943:34): [True: 2.36k, False: 0]
  ------------------
 1944|  2.36k|      *dy = FixedDiv1(src_height, dst_height);
  ------------------
  |  |  266|  2.36k|#define FixedDiv1 FixedDiv1_X86
  ------------------
 1945|  2.36k|      *y = 0;
 1946|  2.36k|    }
 1947|  4.28k|  } else if (filtering == kFilterLinear) {
  ------------------
  |  Branch (1947:14): [True: 3.16k, False: 1.11k]
  ------------------
 1948|       |    // Scale step for bilinear sampling renders last pixel once for upsample.
 1949|  3.16k|    if (dst_width <= Abs(src_width)) {
  ------------------
  |  Branch (1949:9): [True: 1.25k, False: 1.90k]
  ------------------
 1950|  1.25k|      *dx = FixedDiv(Abs(src_width), dst_width);
  ------------------
  |  |  265|  1.25k|#define FixedDiv FixedDiv_X86
  ------------------
 1951|  1.25k|      *x = CENTERSTART(*dx, -32768);  // Subtract 0.5 (32768) to center filter.
  ------------------
  |  | 1898|  1.25k|#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
  |  |  ------------------
  |  |  |  Branch (1898:28): [True: 0, False: 1.25k]
  |  |  ------------------
  ------------------
 1952|  1.90k|    } else if (src_width > 1 && dst_width > 1) {
  ------------------
  |  Branch (1952:16): [True: 1.90k, False: 0]
  |  Branch (1952:33): [True: 1.90k, False: 0]
  ------------------
 1953|  1.90k|      *dx = FixedDiv1(Abs(src_width), dst_width);
  ------------------
  |  |  266|  1.90k|#define FixedDiv1 FixedDiv1_X86
  ------------------
 1954|  1.90k|      *x = 0;
 1955|  1.90k|    }
 1956|  3.16k|    *dy = FixedDiv(src_height, dst_height);
  ------------------
  |  |  265|  3.16k|#define FixedDiv FixedDiv_X86
  ------------------
 1957|  3.16k|    *y = *dy >> 1;
 1958|  3.16k|  } else {
 1959|       |    // Scale step for point sampling duplicates all pixels equally.
 1960|  1.11k|    *dx = FixedDiv(Abs(src_width), dst_width);
  ------------------
  |  |  265|  1.11k|#define FixedDiv FixedDiv_X86
  ------------------
 1961|  1.11k|    *dy = FixedDiv(src_height, dst_height);
  ------------------
  |  |  265|  1.11k|#define FixedDiv FixedDiv_X86
  ------------------
 1962|  1.11k|    *x = CENTERSTART(*dx, 0);
  ------------------
  |  | 1898|  1.11k|#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
  |  |  ------------------
  |  |  |  Branch (1898:28): [True: 0, False: 1.11k]
  |  |  ------------------
  ------------------
 1963|  1.11k|    *y = CENTERSTART(*dy, 0);
  ------------------
  |  | 1898|  1.11k|#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
  |  |  ------------------
  |  |  |  Branch (1898:28): [True: 0, False: 1.11k]
  |  |  ------------------
  ------------------
 1964|  1.11k|  }
 1965|       |  // Negative src_width means horizontally mirror.
 1966|  8.91k|  if (src_width < 0) {
  ------------------
  |  Branch (1966:7): [True: 0, False: 8.91k]
  ------------------
 1967|      0|    *x += (dst_width - 1) * *dx;
 1968|      0|    *dx = -*dx;
 1969|       |    // src_width = -src_width;   // Caller must do this.
 1970|      0|  }
 1971|  8.91k|}
scale_common.cc:_ZN6libyuvL3AbsEi:
   45|  16.3k|static __inline int Abs(int v) {
   46|  16.3k|  return v >= 0 ? v : -v;
  ------------------
  |  Branch (46:10): [True: 16.3k, False: 0]
  ------------------
   47|  16.3k|}

ScaleRowDown2Box_AVX2:
  256|    300|                           int dst_width) {
  257|    300|  asm volatile(
  258|    300|      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
  259|    300|      "vpabsb      %%ymm4,%%ymm4                 \n"
  260|    300|      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
  261|       |
  262|    300|      LABELALIGN
  263|    300|      "1:          \n"
  264|    300|      "vmovdqu     (%0),%%ymm0                   \n"
  265|    300|      "vmovdqu     0x20(%0),%%ymm1               \n"
  266|    300|      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"
  267|    300|      "vmovdqu     0x20(%0,%3,1),%%ymm3          \n"
  268|    300|      "lea         0x40(%0),%0                   \n"
  269|    300|      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
  270|    300|      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
  271|    300|      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
  272|    300|      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
  273|    300|      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
  274|    300|      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
  275|    300|      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"
  276|    300|      "vpsrlw      $0x1,%%ymm1,%%ymm1            \n"
  277|    300|      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
  278|    300|      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
  279|    300|      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
  280|    300|      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
  281|    300|      "vmovdqu     %%ymm0,(%1)                   \n"
  282|    300|      "lea         0x20(%1),%1                   \n"
  283|    300|      "sub         $0x20,%2                      \n"
  284|    300|      "jg          1b                            \n"
  285|    300|      "vzeroupper  \n"
  286|    300|      : "+r"(src_ptr),               // %0
  287|    300|        "+r"(dst_ptr),               // %1
  288|    300|        "+r"(dst_width)              // %2
  289|    300|      : "r"((intptr_t)(src_stride))  // %3
  290|    300|      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  291|    300|}
ScaleRowUp2_Linear_AVX2:
 1388|  1.76k|                             int dst_width) {
 1389|  1.76k|  asm volatile(
 1390|  1.76k|      "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
 1391|  1.76k|      "vpsrlw      $15,%%ymm4,%%ymm4             \n"
 1392|  1.76k|      "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
 1393|  1.76k|      "vbroadcastf128 %3,%%ymm3                  \n"
 1394|       |
 1395|  1.76k|      LABELALIGN
 1396|  1.76k|      "1:          \n"
 1397|  1.76k|      "vmovdqu     (%0),%%xmm0                   \n"  // 0123456789ABCDEF
 1398|  1.76k|      "vmovdqu     1(%0),%%xmm1                  \n"  // 123456789ABCDEF0
 1399|  1.76k|      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
 1400|  1.76k|      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
 1401|  1.76k|      "vpunpcklwd  %%ymm0,%%ymm0,%%ymm0          \n"
 1402|  1.76k|      "vpunpcklwd  %%ymm1,%%ymm1,%%ymm1          \n"
 1403|  1.76k|      "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"
 1404|  1.76k|      "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"
 1405|  1.76k|      "vpmaddubsw  %%ymm3,%%ymm2,%%ymm1          \n"  // 3*near+far (hi)
 1406|  1.76k|      "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0          \n"  // 3*near+far (lo)
 1407|  1.76k|      "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"  // 3*near+far+2 (lo)
 1408|  1.76k|      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // 3*near+far+2 (hi)
 1409|  1.76k|      "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
 1410|  1.76k|      "vpsrlw      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
 1411|  1.76k|      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
 1412|  1.76k|      "vmovdqu     %%ymm0,(%1)                   \n"
 1413|       |
 1414|  1.76k|      "lea         0x10(%0),%0                   \n"
 1415|  1.76k|      "lea         0x20(%1),%1                   \n"  // 16 sample to 32 sample
 1416|  1.76k|      "sub         $0x20,%2                      \n"
 1417|  1.76k|      "jg          1b                            \n"
 1418|  1.76k|      "vzeroupper  \n"
 1419|  1.76k|      : "+r"(src_ptr),      // %0
 1420|  1.76k|        "+r"(dst_ptr),      // %1
 1421|  1.76k|        "+r"(dst_width)     // %2
 1422|  1.76k|      : "m"(kLinearMadd31)  // %3
 1423|  1.76k|      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
 1424|  1.76k|}
ScaleRowUp2_Bilinear_AVX2:
 1432|    988|                               int dst_width) {
 1433|    988|  asm volatile(
 1434|    988|      "vpcmpeqw    %%ymm6,%%ymm6,%%ymm6          \n"
 1435|    988|      "vpsrlw      $15,%%ymm6,%%ymm6             \n"
 1436|    988|      "vpsllw      $3,%%ymm6,%%ymm6              \n"  // all 8
 1437|    988|      "vbroadcastf128 %5,%%ymm7                  \n"
 1438|       |
 1439|    988|      LABELALIGN
 1440|    988|      "1:          \n"
 1441|    988|      "vmovdqu     (%0),%%xmm0                   \n"  // 0123456789ABCDEF
 1442|    988|      "vmovdqu     1(%0),%%xmm1                  \n"  // 123456789ABCDEF0
 1443|    988|      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
 1444|    988|      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
 1445|    988|      "vpunpcklwd  %%ymm0,%%ymm0,%%ymm0          \n"
 1446|    988|      "vpunpcklwd  %%ymm1,%%ymm1,%%ymm1          \n"
 1447|    988|      "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"
 1448|    988|      "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"
 1449|    988|      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm1          \n"  // 3*near+far (1, hi)
 1450|    988|      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm0          \n"  // 3*near+far (1, lo)
 1451|       |
 1452|    988|      "vmovdqu     (%0,%3),%%xmm2                \n"  // 0123456789ABCDEF
 1453|    988|      "vmovdqu     1(%0,%3),%%xmm3               \n"  // 123456789ABCDEF0
 1454|    988|      "vpermq      $0b11011000,%%ymm2,%%ymm2     \n"
 1455|    988|      "vpermq      $0b11011000,%%ymm3,%%ymm3     \n"
 1456|    988|      "vpunpcklwd  %%ymm2,%%ymm2,%%ymm2          \n"
 1457|    988|      "vpunpcklwd  %%ymm3,%%ymm3,%%ymm3          \n"
 1458|    988|      "vpunpckhdq  %%ymm3,%%ymm2,%%ymm4          \n"
 1459|    988|      "vpunpckldq  %%ymm3,%%ymm2,%%ymm2          \n"
 1460|    988|      "vpmaddubsw  %%ymm7,%%ymm4,%%ymm3          \n"  // 3*near+far (2, hi)
 1461|    988|      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm2          \n"  // 3*near+far (2, lo)
 1462|       |
 1463|       |      // ymm0 ymm1
 1464|       |      // ymm2 ymm3
 1465|       |
 1466|    988|      "vpaddw      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
 1467|    988|      "vpaddw      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
 1468|    988|      "vpaddw      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
 1469|    988|      "vpaddw      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
 1470|    988|      "vpsrlw      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
 1471|       |
 1472|    988|      "vpaddw      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
 1473|    988|      "vpaddw      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
 1474|    988|      "vpaddw      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
 1475|    988|      "vpaddw      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
 1476|    988|      "vpsrlw      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
 1477|       |
 1478|    988|      "vpaddw      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
 1479|    988|      "vpaddw      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
 1480|    988|      "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
 1481|    988|      "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
 1482|    988|      "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
 1483|       |
 1484|    988|      "vpaddw      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
 1485|    988|      "vpaddw      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
 1486|    988|      "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
 1487|    988|      "vpaddw      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
 1488|    988|      "vpsrlw      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
 1489|       |
 1490|    988|      "vpackuswb   %%ymm0,%%ymm4,%%ymm4          \n"
 1491|    988|      "vmovdqu     %%ymm4,(%1)                   \n"  // store above
 1492|    988|      "vpackuswb   %%ymm2,%%ymm5,%%ymm5          \n"
 1493|    988|      "vmovdqu     %%ymm5,(%1,%4)                \n"  // store below
 1494|       |
 1495|    988|      "lea         0x10(%0),%0                   \n"
 1496|    988|      "lea         0x20(%1),%1                   \n"  // 16 sample to 32 sample
 1497|    988|      "sub         $0x20,%2                      \n"
 1498|    988|      "jg          1b                            \n"
 1499|    988|      "vzeroupper  \n"
 1500|    988|      : "+r"(src_ptr),                // %0
 1501|    988|        "+r"(dst_ptr),                // %1
 1502|    988|        "+r"(dst_width)               // %2
 1503|    988|      : "r"((intptr_t)(src_stride)),  // %3
 1504|    988|        "r"((intptr_t)(dst_stride)),  // %4
 1505|    988|        "m"(kLinearMadd31)            // %5
 1506|    988|      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
 1507|    988|        "xmm7");
 1508|    988|}
ScaleRowUp2_Linear_12_AVX2:
 1514|  1.30k|                                int dst_width) {
 1515|  1.30k|  asm volatile(
 1516|  1.30k|      "vbroadcastf128 %3,%%ymm5                  \n"
 1517|  1.30k|      "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
 1518|  1.30k|      "vpsrlw      $15,%%ymm4,%%ymm4             \n"
 1519|  1.30k|      "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
 1520|       |
 1521|  1.30k|      LABELALIGN
 1522|  1.30k|      "1:          \n"
 1523|  1.30k|      "vmovdqu     (%0),%%ymm0                   \n"  // 0123456789ABCDEF (16b)
 1524|  1.30k|      "vmovdqu     2(%0),%%ymm1                  \n"  // 123456789ABCDEF0 (16b)
 1525|       |
 1526|  1.30k|      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 012389AB4567CDEF
 1527|  1.30k|      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 12349ABC5678DEF0
 1528|       |
 1529|  1.30k|      "vpunpckhwd  %%ymm1,%%ymm0,%%ymm2          \n"  // 899AABBCCDDEEFF0 (near)
 1530|  1.30k|      "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
 1531|  1.30k|      "vpshufb     %%ymm5,%%ymm2,%%ymm3          \n"  // 98A9BACBDCEDFE0F (far)
 1532|  1.30k|      "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
 1533|       |
 1534|  1.30k|      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // far+2
 1535|  1.30k|      "vpaddw      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2
 1536|  1.30k|      "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far+2
 1537|  1.30k|      "vpaddw      %%ymm2,%%ymm3,%%ymm3          \n"  // near+far+2
 1538|  1.30k|      "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
 1539|  1.30k|      "vpaddw      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near
 1540|  1.30k|      "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 3*near+far+2
 1541|  1.30k|      "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 3*near+far+2
 1542|       |
 1543|  1.30k|      "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far
 1544|  1.30k|      "vpsrlw      $2,%%ymm2,%%ymm2              \n"  // 3/4*near+1/4*far
 1545|  1.30k|      "vmovdqu     %%ymm0,(%1)                   \n"
 1546|  1.30k|      "vmovdqu     %%ymm2,32(%1)                 \n"
 1547|       |
 1548|  1.30k|      "lea         0x20(%0),%0                   \n"
 1549|  1.30k|      "lea         0x40(%1),%1                   \n"  // 16 sample to 32 sample
 1550|  1.30k|      "sub         $0x20,%2                      \n"
 1551|  1.30k|      "jg          1b                            \n"
 1552|  1.30k|      "vzeroupper  \n"
 1553|  1.30k|      : "+r"(src_ptr),          // %0
 1554|  1.30k|        "+r"(dst_ptr),          // %1
 1555|  1.30k|        "+r"(dst_width)         // %2
 1556|  1.30k|      : "m"(kLinearShuffleFar)  // %3
 1557|  1.30k|      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 1558|  1.30k|}
ScaleRowUp2_Bilinear_12_AVX2:
 1566|    326|                                  int dst_width) {
 1567|    326|  asm volatile(
 1568|    326|      "vbroadcastf128 %5,%%ymm5                  \n"
 1569|    326|      "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
 1570|    326|      "vpsrlw      $15,%%ymm4,%%ymm4             \n"
 1571|    326|      "vpsllw      $3,%%ymm4,%%ymm4              \n"  // all 8
 1572|       |
 1573|    326|      LABELALIGN
 1574|    326|      "1:          \n"
 1575|       |
 1576|    326|      "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b)
 1577|    326|      "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b)
 1578|    326|      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 0123000045670000
 1579|    326|      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 1234000056780000
 1580|    326|      "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
 1581|    326|      "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
 1582|    326|      "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far
 1583|    326|      "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
 1584|    326|      "vpaddw      %%ymm0,%%ymm1,%%ymm2          \n"  // 3*near+far (1)
 1585|       |
 1586|    326|      "vmovdqu     (%0,%3,2),%%xmm0              \n"  // 01234567 (16b)
 1587|    326|      "vmovdqu     2(%0,%3,2),%%xmm1             \n"  // 12345678 (16b)
 1588|    326|      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 0123000045670000
 1589|    326|      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 1234000056780000
 1590|    326|      "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
 1591|    326|      "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
 1592|    326|      "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far
 1593|    326|      "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
 1594|    326|      "vpaddw      %%ymm0,%%ymm1,%%ymm3          \n"  // 3*near+far (2)
 1595|       |
 1596|    326|      "vpaddw      %%ymm2,%%ymm2,%%ymm0          \n"  // 6*near+2*far (1)
 1597|    326|      "vpaddw      %%ymm4,%%ymm3,%%ymm1          \n"  // 3*near+far+8 (2)
 1598|    326|      "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9*near+3*far (1)
 1599|    326|      "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9 3 3 1 + 8 (1)
 1600|    326|      "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16
 1601|    326|      "vmovdqu     %%ymm0,(%1)                   \n"  // store above
 1602|       |
 1603|    326|      "vpaddw      %%ymm3,%%ymm3,%%ymm0          \n"  // 6*near+2*far (2)
 1604|    326|      "vpaddw      %%ymm4,%%ymm2,%%ymm1          \n"  // 3*near+far+8 (1)
 1605|    326|      "vpaddw      %%ymm0,%%ymm3,%%ymm0          \n"  // 9*near+3*far (2)
 1606|    326|      "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9 3 3 1 + 8 (2)
 1607|    326|      "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16
 1608|    326|      "vmovdqu     %%ymm0,(%1,%4,2)              \n"  // store below
 1609|       |
 1610|    326|      "lea         0x10(%0),%0                   \n"
 1611|    326|      "lea         0x20(%1),%1                   \n"  // 8 sample to 16 sample
 1612|    326|      "sub         $0x10,%2                      \n"
 1613|    326|      "jg          1b                            \n"
 1614|    326|      "vzeroupper  \n"
 1615|    326|      : "+r"(src_ptr),                // %0
 1616|    326|        "+r"(dst_ptr),                // %1
 1617|    326|        "+r"(dst_width)               // %2
 1618|    326|      : "r"((intptr_t)(src_stride)),  // %3
 1619|    326|        "r"((intptr_t)(dst_stride)),  // %4
 1620|    326|        "m"(kLinearShuffleFar)        // %5
 1621|    326|      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 1622|    326|}
ScaleAddRow_AVX2:
 1792|  47.7k|                      int src_width) {
 1793|  47.7k|      asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
 1794|       |
 1795|  47.7k|               LABELALIGN
 1796|  47.7k|      "1:          \n"
 1797|  47.7k|      "vmovdqu     (%0),%%ymm3                   \n"
 1798|  47.7k|      "lea         0x20(%0),%0                   \n"  // src_ptr += 32
 1799|  47.7k|      "vpermq      $0xd8,%%ymm3,%%ymm3           \n"
 1800|  47.7k|      "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
 1801|  47.7k|      "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
 1802|  47.7k|      "vpaddusw    (%1),%%ymm2,%%ymm0            \n"
 1803|  47.7k|      "vpaddusw    0x20(%1),%%ymm3,%%ymm1        \n"
 1804|  47.7k|      "vmovdqu     %%ymm0,(%1)                   \n"
 1805|  47.7k|      "vmovdqu     %%ymm1,0x20(%1)               \n"
 1806|  47.7k|      "lea         0x40(%1),%1                   \n"
 1807|  47.7k|      "sub         $0x20,%2                      \n"
 1808|  47.7k|      "jg          1b                            \n"
 1809|  47.7k|      "vzeroupper  \n"
 1810|  47.7k|               : "+r"(src_ptr),   // %0
 1811|  47.7k|                 "+r"(dst_ptr),   // %1
 1812|  47.7k|                 "+r"(src_width)  // %2
 1813|  47.7k|               :
 1814|  47.7k|               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 1815|  47.7k|}
ScaleFilterCols_SSSE3:
 1832|   163k|                           int dx) {
 1833|   163k|  intptr_t x0, x1, temp_pixel;
 1834|   163k|  asm volatile(
 1835|   163k|      "movd        %6,%%xmm2                     \n"
 1836|   163k|      "movd        %7,%%xmm3                     \n"
 1837|   163k|      "movl        $0x04040000,%k2               \n"
 1838|   163k|      "movd        %k2,%%xmm5                    \n"
 1839|   163k|      "pcmpeqb     %%xmm6,%%xmm6                 \n"
 1840|   163k|      "psrlw       $0x9,%%xmm6                   \n"  // 0x007f007f
 1841|   163k|      "pcmpeqb     %%xmm7,%%xmm7                 \n"
 1842|   163k|      "psrlw       $15,%%xmm7                    \n"  // 0x00010001
 1843|       |
 1844|   163k|      "pextrw      $0x1,%%xmm2,%k3               \n"
 1845|   163k|      "subl        $0x2,%5                       \n"
 1846|   163k|      "jl          29f                           \n"
 1847|   163k|      "movdqa      %%xmm2,%%xmm0                 \n"
 1848|   163k|      "paddd       %%xmm3,%%xmm0                 \n"
 1849|   163k|      "punpckldq   %%xmm0,%%xmm2                 \n"
 1850|   163k|      "punpckldq   %%xmm3,%%xmm3                 \n"
 1851|   163k|      "paddd       %%xmm3,%%xmm3                 \n"
 1852|   163k|      "pextrw      $0x3,%%xmm2,%k4               \n"
 1853|       |
 1854|   163k|      LABELALIGN
 1855|   163k|      "2:          \n"
 1856|   163k|      "movdqa      %%xmm2,%%xmm1                 \n"
 1857|   163k|      "paddd       %%xmm3,%%xmm2                 \n"
 1858|   163k|      "movzwl      0x00(%1,%3,1),%k2             \n"
 1859|   163k|      "movd        %k2,%%xmm0                    \n"
 1860|   163k|      "psrlw       $0x9,%%xmm1                   \n"
 1861|   163k|      "movzwl      0x00(%1,%4,1),%k2             \n"
 1862|   163k|      "movd        %k2,%%xmm4                    \n"
 1863|   163k|      "pshufb      %%xmm5,%%xmm1                 \n"
 1864|   163k|      "punpcklwd   %%xmm4,%%xmm0                 \n"
 1865|   163k|      "psubb       %8,%%xmm0                     \n"  // make pixels signed.
 1866|   163k|      "pxor        %%xmm6,%%xmm1                 \n"  // 128 - f = (f ^ 127 ) +
 1867|       |                                                      // 1
 1868|   163k|      "paddusb     %%xmm7,%%xmm1                 \n"
 1869|   163k|      "pmaddubsw   %%xmm0,%%xmm1                 \n"
 1870|   163k|      "pextrw      $0x1,%%xmm2,%k3               \n"
 1871|   163k|      "pextrw      $0x3,%%xmm2,%k4               \n"
 1872|   163k|      "paddw       %9,%%xmm1                     \n"  // make pixels unsigned.
 1873|   163k|      "psrlw       $0x7,%%xmm1                   \n"
 1874|   163k|      "packuswb    %%xmm1,%%xmm1                 \n"
 1875|   163k|      "movd        %%xmm1,%k2                    \n"
 1876|   163k|      "mov         %w2,(%0)                      \n"
 1877|   163k|      "lea         0x2(%0),%0                    \n"
 1878|   163k|      "subl        $0x2,%5                       \n"
 1879|   163k|      "jge         2b                            \n"
 1880|       |
 1881|   163k|      LABELALIGN
 1882|   163k|      "29:         \n"
 1883|   163k|      "addl        $0x1,%5                       \n"
 1884|   163k|      "jl          99f                           \n"
 1885|   163k|      "movzwl      0x00(%1,%3,1),%k2             \n"
 1886|   163k|      "movd        %k2,%%xmm0                    \n"
 1887|   163k|      "psrlw       $0x9,%%xmm2                   \n"
 1888|   163k|      "pshufb      %%xmm5,%%xmm2                 \n"
 1889|   163k|      "psubb       %8,%%xmm0                     \n"  // make pixels signed.
 1890|   163k|      "pxor        %%xmm6,%%xmm2                 \n"
 1891|   163k|      "paddusb     %%xmm7,%%xmm2                 \n"
 1892|   163k|      "pmaddubsw   %%xmm0,%%xmm2                 \n"
 1893|   163k|      "paddw       %9,%%xmm2                     \n"  // make pixels unsigned.
 1894|   163k|      "psrlw       $0x7,%%xmm2                   \n"
 1895|   163k|      "packuswb    %%xmm2,%%xmm2                 \n"
 1896|   163k|      "movd        %%xmm2,%k2                    \n"
 1897|   163k|      "mov         %b2,(%0)                      \n"
 1898|   163k|      "99:         \n"
 1899|   163k|      : "+r"(dst_ptr),      // %0
 1900|   163k|        "+r"(src_ptr),      // %1
 1901|   163k|        "=&a"(temp_pixel),  // %2
 1902|   163k|        "=&r"(x0),          // %3
 1903|   163k|        "=&r"(x1),          // %4
 1904|   163k|#if defined(__x86_64__)
 1905|   163k|        "+rm"(dst_width)  // %5
 1906|       |#else
 1907|       |        "+m"(dst_width)  // %5
 1908|       |#endif
 1909|   163k|      : "rm"(x),   // %6
 1910|   163k|        "rm"(dx),  // %7
 1911|   163k|#if defined(__x86_64__)
 1912|   163k|        "x"(kFsub80),  // %8
 1913|   163k|        "x"(kFadd40)   // %9
 1914|       |#else
 1915|       |        "m"(kFsub80),    // %8
 1916|       |        "m"(kFadd40)     // %9
 1917|       |#endif
 1918|   163k|      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
 1919|   163k|        "xmm7");
 1920|   163k|}
FixedDiv_X86:
 2295|  10.8k|int FixedDiv_X86(int num, int div) {
 2296|  10.8k|  asm volatile(
 2297|  10.8k|      "cdq         \n"
 2298|  10.8k|      "shld        $0x10,%%eax,%%edx             \n"
 2299|  10.8k|      "shl         $0x10,%%eax                   \n"
 2300|  10.8k|      "idiv        %1                            \n"
 2301|  10.8k|      "mov         %0, %%eax                     \n"
 2302|  10.8k|      : "+a"(num)  // %0
 2303|  10.8k|      : "c"(div)   // %1
 2304|  10.8k|      : "memory", "cc", "edx");
 2305|  10.8k|  return num;
 2306|  10.8k|}
FixedDiv1_X86:
 2309|  9.15k|int FixedDiv1_X86(int num, int div) {
 2310|  9.15k|  asm volatile(
 2311|  9.15k|      "cdq         \n"
 2312|  9.15k|      "shld        $0x10,%%eax,%%edx             \n"
 2313|  9.15k|      "shl         $0x10,%%eax                   \n"
 2314|  9.15k|      "sub         $0x10001,%%eax                \n"
 2315|  9.15k|      "sbb         $0x0,%%edx                    \n"
 2316|  9.15k|      "sub         $0x1,%1                       \n"
 2317|  9.15k|      "idiv        %1                            \n"
 2318|  9.15k|      "mov         %0, %%eax                     \n"
 2319|  9.15k|      : "+a"(num)  // %0
 2320|  9.15k|      : "c"(div)   // %1
 2321|  9.15k|      : "memory", "cc", "edx");
 2322|  9.15k|  return num;
 2323|  9.15k|}

_ZNK4avif16UniquePtrDeleterclEP11avifDecoder:
   21|  20.5k|    void operator()(avifDecoder * decoder) const { avifDecoderDestroy(decoder); }

avifGetPixelFormatInfo:
   40|  34.4k|{
   41|  34.4k|    memset(info, 0, sizeof(avifPixelFormatInfo));
   42|       |
   43|  34.4k|    switch (format) {
   44|  17.0k|        case AVIF_PIXEL_FORMAT_YUV444:
  ------------------
  |  Branch (44:9): [True: 17.0k, False: 17.3k]
  ------------------
   45|  17.0k|            info->chromaShiftX = 0;
   46|  17.0k|            info->chromaShiftY = 0;
   47|  17.0k|            break;
   48|       |
   49|  1.86k|        case AVIF_PIXEL_FORMAT_YUV422:
  ------------------
  |  Branch (49:9): [True: 1.86k, False: 32.5k]
  ------------------
   50|  1.86k|            info->chromaShiftX = 1;
   51|  1.86k|            info->chromaShiftY = 0;
   52|  1.86k|            break;
   53|       |
   54|  11.1k|        case AVIF_PIXEL_FORMAT_YUV420:
  ------------------
  |  Branch (54:9): [True: 11.1k, False: 23.3k]
  ------------------
   55|  11.1k|            info->chromaShiftX = 1;
   56|  11.1k|            info->chromaShiftY = 1;
   57|  11.1k|            break;
   58|       |
   59|  3.54k|        case AVIF_PIXEL_FORMAT_YUV400:
  ------------------
  |  Branch (59:9): [True: 3.54k, False: 30.9k]
  ------------------
   60|  3.54k|            info->monochrome = AVIF_TRUE;
  ------------------
  |  |   88|  3.54k|#define AVIF_TRUE 1
  ------------------
   61|       |            // The nonexistent chroma is considered as subsampled in each dimension
   62|       |            // according to the AV1 specification. See sections 5.5.2 and 6.4.2.
   63|  3.54k|            info->chromaShiftX = 1;
   64|  3.54k|            info->chromaShiftY = 1;
   65|  3.54k|            break;
   66|       |
   67|    868|        case AVIF_PIXEL_FORMAT_NONE:
  ------------------
  |  Branch (67:9): [True: 868, False: 33.5k]
  ------------------
   68|    868|        case AVIF_PIXEL_FORMAT_COUNT:
  ------------------
  |  Branch (68:9): [True: 0, False: 34.4k]
  ------------------
   69|    868|        default:
  ------------------
  |  Branch (69:9): [True: 0, False: 34.4k]
  ------------------
   70|    868|            break;
   71|  34.4k|    }
   72|  34.4k|}
avifImageSetDefaults:
  135|  84.7k|{
  136|  84.7k|    memset(image, 0, sizeof(avifImage));
  137|  84.7k|    image->yuvRange = AVIF_RANGE_FULL;
  138|  84.7k|    image->colorPrimaries = AVIF_COLOR_PRIMARIES_UNSPECIFIED;
  139|  84.7k|    image->transferCharacteristics = AVIF_TRANSFER_CHARACTERISTICS_UNSPECIFIED;
  140|  84.7k|    image->matrixCoefficients = AVIF_MATRIX_COEFFICIENTS_UNSPECIFIED;
  141|  84.7k|}
avifImageCreate:
  144|  81.9k|{
  145|       |    // width and height are checked when actually used, for example by avifImageAllocatePlanes().
  146|  81.9k|    AVIF_CHECKERR(depth <= 16, NULL); // avifImage only supports up to 16 bits per sample. See avifImageUsesU16().
  ------------------
  |  |   45|  81.9k|    do {                        \
  |  |   46|  81.9k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 81.9k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  81.9k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 81.9k]
  |  |  ------------------
  ------------------
  147|       |    // Cast to silence "comparison of unsigned expression is always true" warning.
  148|  81.9k|    AVIF_CHECKERR((int)yuvFormat >= AVIF_PIXEL_FORMAT_NONE && yuvFormat < AVIF_PIXEL_FORMAT_COUNT, NULL);
  ------------------
  |  |   45|  81.9k|    do {                        \
  |  |   46|   163k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:15): [True: 81.9k, False: 0]
  |  |  |  Branch (46:15): [True: 81.9k, False: 0]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  81.9k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 81.9k]
  |  |  ------------------
  ------------------
  149|       |
  150|  81.9k|    avifImage * image = (avifImage *)avifAlloc(sizeof(avifImage));
  151|  81.9k|    AVIF_CHECKERR(image, NULL);
  ------------------
  |  |   45|  81.9k|    do {                        \
  |  |   46|  81.9k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 81.9k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  81.9k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 81.9k]
  |  |  ------------------
  ------------------
  152|  81.9k|    avifImageSetDefaults(image);
  153|  81.9k|    image->width = width;
  154|  81.9k|    image->height = height;
  155|  81.9k|    image->depth = depth;
  156|  81.9k|    image->yuvFormat = yuvFormat;
  157|  81.9k|    return image;
  158|  81.9k|}
avifImageCreateEmpty:
  161|  81.9k|{
  162|  81.9k|    return avifImageCreate(0, 0, 0, AVIF_PIXEL_FORMAT_NONE);
  163|  81.9k|}
avifImageCopyNoAlloc:
  166|  2.78k|{
  167|  2.78k|    dstImage->width = srcImage->width;
  168|  2.78k|    dstImage->height = srcImage->height;
  169|  2.78k|    dstImage->depth = srcImage->depth;
  170|  2.78k|    dstImage->yuvFormat = srcImage->yuvFormat;
  171|  2.78k|    dstImage->yuvRange = srcImage->yuvRange;
  172|  2.78k|    dstImage->yuvChromaSamplePosition = srcImage->yuvChromaSamplePosition;
  173|  2.78k|    dstImage->alphaPremultiplied = srcImage->alphaPremultiplied;
  174|       |
  175|  2.78k|    dstImage->colorPrimaries = srcImage->colorPrimaries;
  176|  2.78k|    dstImage->transferCharacteristics = srcImage->transferCharacteristics;
  177|  2.78k|    dstImage->matrixCoefficients = srcImage->matrixCoefficients;
  178|  2.78k|    dstImage->clli = srcImage->clli;
  179|       |
  180|  2.78k|    dstImage->transformFlags = srcImage->transformFlags;
  181|  2.78k|    dstImage->pasp = srcImage->pasp;
  182|  2.78k|    dstImage->clap = srcImage->clap;
  183|  2.78k|    dstImage->irot = srcImage->irot;
  184|  2.78k|    dstImage->imir = srcImage->imir;
  185|  2.78k|}
avifImageCopySamples:
  188|    929|{
  189|    929|    assert(srcImage->depth == dstImage->depth);
  190|    929|    if (planes & AVIF_PLANES_YUV) {
  ------------------
  |  Branch (190:9): [True: 901, False: 28]
  ------------------
  191|    901|        assert(srcImage->yuvFormat == dstImage->yuvFormat);
  192|       |        // Note that there may be a mismatch between srcImage->yuvRange and dstImage->yuvRange
  193|       |        // because libavif allows for 'colr' and AV1 OBU video range values to differ.
  194|    901|    }
  195|    929|    const size_t bytesPerPixel = avifImageUsesU16(srcImage) ? 2 : 1;
  ------------------
  |  Branch (195:34): [True: 293, False: 636]
  ------------------
  196|       |
  197|    929|    const avifBool skipColor = !(planes & AVIF_PLANES_YUV);
  198|    929|    const avifBool skipAlpha = !(planes & AVIF_PLANES_A);
  199|  4.64k|    for (int c = AVIF_CHAN_Y; c <= AVIF_CHAN_A; ++c) {
  ------------------
  |  Branch (199:31): [True: 3.71k, False: 929]
  ------------------
  200|  3.71k|        const avifBool alpha = c == AVIF_CHAN_A;
  201|  3.71k|        if ((skipColor && !alpha) || (skipAlpha && alpha)) {
  ------------------
  |  Branch (201:14): [True: 112, False: 3.60k]
  |  Branch (201:27): [True: 84, False: 28]
  |  Branch (201:39): [True: 3.60k, False: 28]
  |  Branch (201:52): [True: 901, False: 2.70k]
  ------------------
  202|    985|            continue;
  203|    985|        }
  204|       |
  205|  2.73k|        const uint32_t planeWidth = avifImagePlaneWidth(srcImage, c);
  206|  2.73k|        const uint32_t planeHeight = avifImagePlaneHeight(srcImage, c);
  207|  2.73k|        const uint8_t * srcRow = avifImagePlane(srcImage, c);
  208|  2.73k|        uint8_t * dstRow = avifImagePlane(dstImage, c);
  209|  2.73k|        const uint32_t srcRowBytes = avifImagePlaneRowBytes(srcImage, c);
  210|  2.73k|        const uint32_t dstRowBytes = avifImagePlaneRowBytes(dstImage, c);
  211|  2.73k|        assert(!srcRow == !dstRow);
  212|  2.73k|        if (!srcRow) {
  ------------------
  |  Branch (212:13): [True: 356, False: 2.37k]
  ------------------
  213|    356|            continue;
  214|    356|        }
  215|  2.73k|        assert(planeWidth == avifImagePlaneWidth(dstImage, c));
  216|  2.37k|        assert(planeHeight == avifImagePlaneHeight(dstImage, c));
  217|       |
  218|  2.37k|        const size_t planeWidthBytes = planeWidth * bytesPerPixel;
  219|   133k|        for (uint32_t y = 0; y < planeHeight; ++y) {
  ------------------
  |  Branch (219:30): [True: 130k, False: 2.37k]
  ------------------
  220|   130k|            memcpy(dstRow, srcRow, planeWidthBytes);
  221|   130k|            srcRow += srcRowBytes;
  222|   130k|            dstRow += dstRowBytes;
  223|   130k|        }
  224|  2.37k|    }
  225|    929|}
avifImageSetViewRect:
  323|  2.78k|{
  324|  2.78k|    avifPixelFormatInfo formatInfo;
  325|  2.78k|    avifGetPixelFormatInfo(srcImage->yuvFormat, &formatInfo);
  326|  2.78k|    if ((rect->width > srcImage->width) || (rect->height > srcImage->height) || (rect->x > (srcImage->width - rect->width)) ||
  ------------------
  |  Branch (326:9): [True: 0, False: 2.78k]
  |  Branch (326:44): [True: 0, False: 2.78k]
  |  Branch (326:81): [True: 0, False: 2.78k]
  ------------------
  327|  2.78k|        (rect->y > (srcImage->height - rect->height))) {
  ------------------
  |  Branch (327:9): [True: 0, False: 2.78k]
  ------------------
  328|      0|        return AVIF_RESULT_INVALID_ARGUMENT;
  329|      0|    }
  330|  2.78k|    if (!formatInfo.monochrome && ((rect->x & formatInfo.chromaShiftX) || (rect->y & formatInfo.chromaShiftY))) {
  ------------------
  |  Branch (330:9): [True: 2.25k, False: 535]
  |  Branch (330:36): [True: 0, False: 2.25k]
  |  Branch (330:75): [True: 0, False: 2.25k]
  ------------------
  331|      0|        return AVIF_RESULT_INVALID_ARGUMENT;
  332|      0|    }
  333|  2.78k|    avifImageFreePlanes(dstImage, AVIF_PLANES_ALL); // dstImage->imageOwnsYUVPlanes and dstImage->imageOwnsAlphaPlane set to AVIF_FALSE.
  334|  2.78k|    avifImageCopyNoAlloc(dstImage, srcImage);
  335|  2.78k|    dstImage->width = rect->width;
  336|  2.78k|    dstImage->height = rect->height;
  337|  2.78k|    const uint32_t pixelBytes = (srcImage->depth > 8) ? 2 : 1;
  ------------------
  |  Branch (337:33): [True: 879, False: 1.90k]
  ------------------
  338|  2.78k|    if (srcImage->yuvPlanes[AVIF_CHAN_Y]) {
  ------------------
  |  Branch (338:9): [True: 2.73k, False: 56]
  ------------------
  339|  10.9k|        for (int yuvPlane = AVIF_CHAN_Y; yuvPlane <= AVIF_CHAN_V; ++yuvPlane) {
  ------------------
  |  Branch (339:42): [True: 8.19k, False: 2.73k]
  ------------------
  340|  8.19k|            if (srcImage->yuvRowBytes[yuvPlane]) {
  ------------------
  |  Branch (340:17): [True: 7.12k, False: 1.07k]
  ------------------
  341|  7.12k|                const size_t planeX = (yuvPlane == AVIF_CHAN_Y) ? rect->x : (rect->x >> formatInfo.chromaShiftX);
  ------------------
  |  Branch (341:39): [True: 2.73k, False: 4.39k]
  ------------------
  342|  7.12k|                const size_t planeY = (yuvPlane == AVIF_CHAN_Y) ? rect->y : (rect->y >> formatInfo.chromaShiftY);
  ------------------
  |  Branch (342:39): [True: 2.73k, False: 4.39k]
  ------------------
  343|  7.12k|                dstImage->yuvPlanes[yuvPlane] =
  344|  7.12k|                    srcImage->yuvPlanes[yuvPlane] + planeY * srcImage->yuvRowBytes[yuvPlane] + planeX * pixelBytes;
  345|  7.12k|                dstImage->yuvRowBytes[yuvPlane] = srcImage->yuvRowBytes[yuvPlane];
  346|  7.12k|            }
  347|  8.19k|        }
  348|  2.73k|    }
  349|  2.78k|    if (srcImage->alphaPlane) {
  ------------------
  |  Branch (349:9): [True: 84, False: 2.70k]
  ------------------
  350|     84|        dstImage->alphaPlane = srcImage->alphaPlane + (size_t)rect->y * srcImage->alphaRowBytes + (size_t)rect->x * pixelBytes;
  351|     84|        dstImage->alphaRowBytes = srcImage->alphaRowBytes;
  352|     84|    }
  353|  2.78k|    return AVIF_RESULT_OK;
  354|  2.78k|}
avifImageDestroy:
  357|  81.9k|{
  358|  81.9k|    if (image->gainMap) {
  ------------------
  |  Branch (358:9): [True: 398, False: 81.5k]
  ------------------
  359|    398|        avifGainMapDestroy(image->gainMap);
  360|    398|    }
  361|  81.9k|    avifImageFreePlanes(image, AVIF_PLANES_ALL);
  362|  81.9k|    avifRWDataFree(&image->icc);
  363|  81.9k|    avifRWDataFree(&image->exif);
  364|  81.9k|    avifRWDataFree(&image->xmp);
  365|   107k|    for (size_t i = 0; i < image->numProperties; ++i) {
  ------------------
  |  Branch (365:24): [True: 25.6k, False: 81.9k]
  ------------------
  366|  25.6k|        avifRWDataFree(&image->properties[i].boxPayload);
  367|  25.6k|    }
  368|  81.9k|    avifFree(image->properties);
  369|       |    image->properties = NULL;
  370|  81.9k|    image->numProperties = 0;
  371|  81.9k|    avifFree(image);
  372|  81.9k|}
avifImageSetMetadataXMP:
  380|    222|{
  381|    222|    return avifRWDataSet(&image->xmp, xmp, xmpSize);
  382|    222|}
avifImagePushProperty:
  385|  25.6k|{
  386|  25.6k|    AVIF_CHECKERR(image->numProperties < SIZE_MAX / sizeof(avifImageItemProperty), AVIF_RESULT_INVALID_ARGUMENT);
  ------------------
  |  |   45|  25.6k|    do {                        \
  |  |   46|  25.6k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 25.6k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  25.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 25.6k]
  |  |  ------------------
  ------------------
  387|       |    // Shallow copy the current properties.
  388|  25.6k|    const size_t numProperties = image->numProperties + 1;
  389|  25.6k|    avifImageItemProperty * const properties = (avifImageItemProperty *)avifAlloc(numProperties * sizeof(properties[0]));
  390|  25.6k|    AVIF_CHECKERR(properties != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  25.6k|    do {                        \
  |  |   46|  25.6k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 25.6k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  25.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 25.6k]
  |  |  ------------------
  ------------------
  391|  25.6k|    if (image->numProperties != 0) {
  ------------------
  |  Branch (391:9): [True: 10.0k, False: 15.6k]
  ------------------
  392|  10.0k|        memcpy(properties, image->properties, image->numProperties * sizeof(properties[0]));
  393|  10.0k|    }
  394|       |    // Free the old array and replace it by the new one.
  395|  25.6k|    avifFree(image->properties);
  396|  25.6k|    image->properties = properties;
  397|  25.6k|    image->numProperties = numProperties;
  398|       |    // Set the new property.
  399|  25.6k|    avifImageItemProperty * const property = &image->properties[image->numProperties - 1];
  400|  25.6k|    memset(property, 0, sizeof(*property));
  401|  25.6k|    memcpy(property->boxtype, boxtype, sizeof(property->boxtype));
  402|  25.6k|    memcpy(property->usertype, usertype, sizeof(property->usertype));
  403|  25.6k|    AVIF_CHECKRES(avifRWDataSet(&property->boxPayload, boxPayload, boxPayloadSize));
  ------------------
  |  |   54|  25.6k|    do {                                  \
  |  |   55|  25.6k|        const avifResult result__ = (A);  \
  |  |   56|  25.6k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 25.6k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  25.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 25.6k]
  |  |  ------------------
  ------------------
  404|  25.6k|    return AVIF_RESULT_OK;
  405|  25.6k|}
avifImageAllocatePlanes:
  428|  5.29k|{
  429|  5.29k|    if (image->width == 0 || image->height == 0 || image->depth == 0 || image->depth > 16) {
  ------------------
  |  Branch (429:9): [True: 0, False: 5.29k]
  |  Branch (429:30): [True: 0, False: 5.29k]
  |  Branch (429:52): [True: 0, False: 5.29k]
  |  Branch (429:73): [True: 0, False: 5.29k]
  ------------------
  430|      0|        return AVIF_RESULT_INVALID_ARGUMENT;
  431|      0|    }
  432|  5.29k|    const uint32_t channelSize = avifImageUsesU16(image) ? 2 : 1;
  ------------------
  |  Branch (432:34): [True: 2.41k, False: 2.88k]
  ------------------
  433|  5.29k|    if (image->width > UINT32_MAX / channelSize) {
  ------------------
  |  Branch (433:9): [True: 0, False: 5.29k]
  ------------------
  434|      0|        return AVIF_RESULT_INVALID_ARGUMENT;
  435|      0|    }
  436|  5.29k|    const uint32_t fullRowBytes = channelSize * image->width;
  437|  5.29k|    if (image->height > PTRDIFF_MAX / fullRowBytes) {
  ------------------
  |  Branch (437:9): [True: 0, False: 5.29k]
  ------------------
  438|      0|        return AVIF_RESULT_INVALID_ARGUMENT;
  439|      0|    }
  440|  5.29k|    const size_t fullSize = (size_t)fullRowBytes * image->height;
  441|       |
  442|  5.29k|    if ((planes & AVIF_PLANES_YUV) && (image->yuvFormat != AVIF_PIXEL_FORMAT_NONE)) {
  ------------------
  |  Branch (442:9): [True: 4.63k, False: 663]
  |  Branch (442:39): [True: 4.63k, False: 0]
  ------------------
  443|  4.63k|        avifPixelFormatInfo info;
  444|  4.63k|        avifGetPixelFormatInfo(image->yuvFormat, &info);
  445|       |
  446|  4.63k|        image->imageOwnsYUVPlanes = AVIF_TRUE;
  ------------------
  |  |   88|  4.63k|#define AVIF_TRUE 1
  ------------------
  447|  4.63k|        if (!image->yuvPlanes[AVIF_CHAN_Y]) {
  ------------------
  |  Branch (447:13): [True: 4.63k, False: 0]
  ------------------
  448|  4.63k|            image->yuvPlanes[AVIF_CHAN_Y] = (uint8_t *)avifAlloc(fullSize);
  449|  4.63k|            if (!image->yuvPlanes[AVIF_CHAN_Y]) {
  ------------------
  |  Branch (449:17): [True: 0, False: 4.63k]
  ------------------
  450|      0|                return AVIF_RESULT_OUT_OF_MEMORY;
  451|      0|            }
  452|  4.63k|            image->yuvRowBytes[AVIF_CHAN_Y] = fullRowBytes;
  453|  4.63k|        }
  454|       |
  455|  4.63k|        if (!info.monochrome) {
  ------------------
  |  Branch (455:13): [True: 3.83k, False: 798]
  ------------------
  456|       |            // Intermediary computation as 64 bits in case width or height is exactly UINT32_MAX.
  457|  3.83k|            const uint32_t shiftedW = (uint32_t)(((uint64_t)image->width + info.chromaShiftX) >> info.chromaShiftX);
  458|  3.83k|            const uint32_t shiftedH = (uint32_t)(((uint64_t)image->height + info.chromaShiftY) >> info.chromaShiftY);
  459|       |
  460|       |            // These are less than or equal to fullRowBytes/fullSize. No need to check overflows.
  461|  3.83k|            const uint32_t uvRowBytes = channelSize * shiftedW;
  462|  3.83k|            const size_t uvSize = (size_t)uvRowBytes * shiftedH;
  463|       |
  464|  11.5k|            for (int uvPlane = AVIF_CHAN_U; uvPlane <= AVIF_CHAN_V; ++uvPlane) {
  ------------------
  |  Branch (464:45): [True: 7.67k, False: 3.83k]
  ------------------
  465|  7.67k|                if (!image->yuvPlanes[uvPlane]) {
  ------------------
  |  Branch (465:21): [True: 7.67k, False: 0]
  ------------------
  466|  7.67k|                    image->yuvPlanes[uvPlane] = (uint8_t *)avifAlloc(uvSize);
  467|  7.67k|                    if (!image->yuvPlanes[uvPlane]) {
  ------------------
  |  Branch (467:25): [True: 0, False: 7.67k]
  ------------------
  468|      0|                        return AVIF_RESULT_OUT_OF_MEMORY;
  469|      0|                    }
  470|  7.67k|                    image->yuvRowBytes[uvPlane] = uvRowBytes;
  471|  7.67k|                }
  472|  7.67k|            }
  473|  3.83k|        }
  474|  4.63k|    }
  475|  5.29k|    if (planes & AVIF_PLANES_A) {
  ------------------
  |  Branch (475:9): [True: 663, False: 4.63k]
  ------------------
  476|    663|        image->imageOwnsAlphaPlane = AVIF_TRUE;
  ------------------
  |  |   88|    663|#define AVIF_TRUE 1
  ------------------
  477|    663|        if (!image->alphaPlane) {
  ------------------
  |  Branch (477:13): [True: 663, False: 0]
  ------------------
  478|    663|            image->alphaPlane = (uint8_t *)avifAlloc(fullSize);
  479|    663|            if (!image->alphaPlane) {
  ------------------
  |  Branch (479:17): [True: 0, False: 663]
  ------------------
  480|      0|                return AVIF_RESULT_OUT_OF_MEMORY;
  481|      0|            }
  482|    663|            image->alphaRowBytes = fullRowBytes;
  483|    663|        }
  484|    663|    }
  485|  5.29k|    return AVIF_RESULT_OK;
  486|  5.29k|}
avifImageFreePlanes:
  489|   160k|{
  490|   160k|    if ((planes & AVIF_PLANES_YUV) && (image->yuvFormat != AVIF_PIXEL_FORMAT_NONE)) {
  ------------------
  |  Branch (490:9): [True: 154k, False: 5.30k]
  |  Branch (490:39): [True: 70.0k, False: 84.7k]
  ------------------
  491|  70.0k|        if (image->imageOwnsYUVPlanes) {
  ------------------
  |  Branch (491:13): [True: 4.63k, False: 65.3k]
  ------------------
  492|  4.63k|            avifFree(image->yuvPlanes[AVIF_CHAN_Y]);
  493|  4.63k|            avifFree(image->yuvPlanes[AVIF_CHAN_U]);
  494|  4.63k|            avifFree(image->yuvPlanes[AVIF_CHAN_V]);
  495|  4.63k|        }
  496|  70.0k|        image->yuvPlanes[AVIF_CHAN_Y] = NULL;
  497|  70.0k|        image->yuvRowBytes[AVIF_CHAN_Y] = 0;
  498|  70.0k|        image->yuvPlanes[AVIF_CHAN_U] = NULL;
  499|  70.0k|        image->yuvRowBytes[AVIF_CHAN_U] = 0;
  500|  70.0k|        image->yuvPlanes[AVIF_CHAN_V] = NULL;
  501|  70.0k|        image->yuvRowBytes[AVIF_CHAN_V] = 0;
  502|  70.0k|        image->imageOwnsYUVPlanes = AVIF_FALSE;
  ------------------
  |  |   89|  70.0k|#define AVIF_FALSE 0
  ------------------
  503|  70.0k|    }
  504|   160k|    if (planes & AVIF_PLANES_A) {
  ------------------
  |  Branch (504:9): [True: 136k, False: 23.1k]
  ------------------
  505|   136k|        if (image->imageOwnsAlphaPlane) {
  ------------------
  |  Branch (505:13): [True: 573, False: 136k]
  ------------------
  506|    573|            avifFree(image->alphaPlane);
  507|    573|        }
  508|   136k|        image->alphaPlane = NULL;
  509|   136k|        image->alphaRowBytes = 0;
  510|   136k|        image->imageOwnsAlphaPlane = AVIF_FALSE;
  ------------------
  |  |   89|   136k|#define AVIF_FALSE 0
  ------------------
  511|   136k|    }
  512|   160k|}
avifImageStealPlanes:
  515|  13.1k|{
  516|  13.1k|    avifImageFreePlanes(dstImage, planes);
  517|       |
  518|  13.1k|    if (planes & AVIF_PLANES_YUV) {
  ------------------
  |  Branch (518:9): [True: 10.9k, False: 2.15k]
  ------------------
  519|  10.9k|        dstImage->yuvPlanes[AVIF_CHAN_Y] = srcImage->yuvPlanes[AVIF_CHAN_Y];
  520|  10.9k|        dstImage->yuvRowBytes[AVIF_CHAN_Y] = srcImage->yuvRowBytes[AVIF_CHAN_Y];
  521|  10.9k|        dstImage->yuvPlanes[AVIF_CHAN_U] = srcImage->yuvPlanes[AVIF_CHAN_U];
  522|  10.9k|        dstImage->yuvRowBytes[AVIF_CHAN_U] = srcImage->yuvRowBytes[AVIF_CHAN_U];
  523|  10.9k|        dstImage->yuvPlanes[AVIF_CHAN_V] = srcImage->yuvPlanes[AVIF_CHAN_V];
  524|  10.9k|        dstImage->yuvRowBytes[AVIF_CHAN_V] = srcImage->yuvRowBytes[AVIF_CHAN_V];
  525|       |
  526|  10.9k|        srcImage->yuvPlanes[AVIF_CHAN_Y] = NULL;
  527|  10.9k|        srcImage->yuvRowBytes[AVIF_CHAN_Y] = 0;
  528|  10.9k|        srcImage->yuvPlanes[AVIF_CHAN_U] = NULL;
  529|  10.9k|        srcImage->yuvRowBytes[AVIF_CHAN_U] = 0;
  530|  10.9k|        srcImage->yuvPlanes[AVIF_CHAN_V] = NULL;
  531|  10.9k|        srcImage->yuvRowBytes[AVIF_CHAN_V] = 0;
  532|       |
  533|  10.9k|        dstImage->yuvFormat = srcImage->yuvFormat;
  534|  10.9k|        dstImage->imageOwnsYUVPlanes = srcImage->imageOwnsYUVPlanes;
  535|  10.9k|        srcImage->imageOwnsYUVPlanes = AVIF_FALSE;
  ------------------
  |  |   89|  10.9k|#define AVIF_FALSE 0
  ------------------
  536|  10.9k|    }
  537|  13.1k|    if (planes & AVIF_PLANES_A) {
  ------------------
  |  Branch (537:9): [True: 2.15k, False: 10.9k]
  ------------------
  538|  2.15k|        dstImage->alphaPlane = srcImage->alphaPlane;
  539|  2.15k|        dstImage->alphaRowBytes = srcImage->alphaRowBytes;
  540|       |
  541|  2.15k|        srcImage->alphaPlane = NULL;
  542|  2.15k|        srcImage->alphaRowBytes = 0;
  543|       |
  544|  2.15k|        dstImage->imageOwnsAlphaPlane = srcImage->imageOwnsAlphaPlane;
  545|  2.15k|        srcImage->imageOwnsAlphaPlane = AVIF_FALSE;
  ------------------
  |  |   89|  2.15k|#define AVIF_FALSE 0
  ------------------
  546|  2.15k|    }
  547|  13.1k|}
avifImageUsesU16:
  550|  6.22k|{
  551|  6.22k|    return (image->depth > 8);
  552|  6.22k|}
avifImagePlane:
  583|  5.46k|{
  584|  5.46k|    if ((channel == AVIF_CHAN_Y) || (channel == AVIF_CHAN_U) || (channel == AVIF_CHAN_V)) {
  ------------------
  |  Branch (584:9): [True: 1.80k, False: 3.66k]
  |  Branch (584:37): [True: 1.80k, False: 1.85k]
  |  Branch (584:65): [True: 1.80k, False: 56]
  ------------------
  585|  5.40k|        return image->yuvPlanes[channel];
  586|  5.40k|    }
  587|     56|    if (channel == AVIF_CHAN_A) {
  ------------------
  |  Branch (587:9): [True: 56, False: 0]
  ------------------
  588|     56|        return image->alphaPlane;
  589|     56|    }
  590|      0|    return NULL;
  591|     56|}
avifImagePlaneRowBytes:
  594|  5.46k|{
  595|  5.46k|    if ((channel == AVIF_CHAN_Y) || (channel == AVIF_CHAN_U) || (channel == AVIF_CHAN_V)) {
  ------------------
  |  Branch (595:9): [True: 1.80k, False: 3.66k]
  |  Branch (595:37): [True: 1.80k, False: 1.85k]
  |  Branch (595:65): [True: 1.80k, False: 56]
  ------------------
  596|  5.40k|        return image->yuvRowBytes[channel];
  597|  5.40k|    }
  598|     56|    if (channel == AVIF_CHAN_A) {
  ------------------
  |  Branch (598:9): [True: 56, False: 0]
  ------------------
  599|     56|        return image->alphaRowBytes;
  600|     56|    }
  601|      0|    return 0;
  602|     56|}
avifImagePlaneWidth:
  605|  18.7k|{
  606|  18.7k|    if (channel == AVIF_CHAN_Y) {
  ------------------
  |  Branch (606:9): [True: 5.16k, False: 13.5k]
  ------------------
  607|  5.16k|        return image->width;
  608|  5.16k|    }
  609|  13.5k|    if ((channel == AVIF_CHAN_U) || (channel == AVIF_CHAN_V)) {
  ------------------
  |  Branch (609:9): [True: 9.09k, False: 4.44k]
  |  Branch (609:37): [True: 4.41k, False: 28]
  ------------------
  610|  13.5k|        avifPixelFormatInfo formatInfo;
  611|  13.5k|        avifGetPixelFormatInfo(image->yuvFormat, &formatInfo);
  612|  13.5k|        if (formatInfo.monochrome) {
  ------------------
  |  Branch (612:13): [True: 1.10k, False: 12.4k]
  ------------------
  613|  1.10k|            return 0;
  614|  1.10k|        }
  615|  12.4k|        return (image->width + formatInfo.chromaShiftX) >> formatInfo.chromaShiftX;
  616|  13.5k|    }
  617|     28|    if ((channel == AVIF_CHAN_A) && image->alphaPlane) {
  ------------------
  |  Branch (617:9): [True: 28, False: 0]
  |  Branch (617:37): [True: 28, False: 0]
  ------------------
  618|     28|        return image->width;
  619|     28|    }
  620|      0|    return 0;
  621|     28|}
avifImagePlaneHeight:
  624|  18.7k|{
  625|  18.7k|    if (channel == AVIF_CHAN_Y) {
  ------------------
  |  Branch (625:9): [True: 5.16k, False: 13.5k]
  ------------------
  626|  5.16k|        return image->height;
  627|  5.16k|    }
  628|  13.5k|    if ((channel == AVIF_CHAN_U) || (channel == AVIF_CHAN_V)) {
  ------------------
  |  Branch (628:9): [True: 9.09k, False: 4.44k]
  |  Branch (628:37): [True: 4.41k, False: 28]
  ------------------
  629|  13.5k|        avifPixelFormatInfo formatInfo;
  630|  13.5k|        avifGetPixelFormatInfo(image->yuvFormat, &formatInfo);
  631|  13.5k|        if (formatInfo.monochrome) {
  ------------------
  |  Branch (631:13): [True: 1.10k, False: 12.4k]
  ------------------
  632|  1.10k|            return 0;
  633|  1.10k|        }
  634|  12.4k|        return (image->height + formatInfo.chromaShiftY) >> formatInfo.chromaShiftY;
  635|  13.5k|    }
  636|     28|    if ((channel == AVIF_CHAN_A) && image->alphaPlane) {
  ------------------
  |  Branch (636:9): [True: 28, False: 0]
  |  Branch (636:37): [True: 28, False: 0]
  ------------------
  637|     28|        return image->height;
  638|     28|    }
  639|      0|    return 0;
  640|     28|}
avifDimensionsTooLarge:
  643|  63.8k|{
  644|  63.8k|    if (width > (imageSizeLimit / height)) {
  ------------------
  |  Branch (644:9): [True: 87, False: 63.7k]
  ------------------
  645|     87|        return AVIF_TRUE;
  ------------------
  |  |   88|     87|#define AVIF_TRUE 1
  ------------------
  646|     87|    }
  647|  63.7k|    if ((imageDimensionLimit != 0) && ((width > imageDimensionLimit) || (height > imageDimensionLimit))) {
  ------------------
  |  Branch (647:9): [True: 63.7k, False: 0]
  |  Branch (647:40): [True: 26, False: 63.7k]
  |  Branch (647:73): [True: 23, False: 63.7k]
  ------------------
  648|     49|        return AVIF_TRUE;
  ------------------
  |  |   88|     49|#define AVIF_TRUE 1
  ------------------
  649|     49|    }
  650|  63.7k|    return AVIF_FALSE;
  ------------------
  |  |   89|  63.7k|#define AVIF_FALSE 0
  ------------------
  651|  63.7k|}
avifCodecDestroy:
  656|  39.5k|{
  657|  39.5k|    if (codec && codec->destroyInternal) {
  ------------------
  |  Branch (657:9): [True: 39.5k, False: 0]
  |  Branch (657:18): [True: 39.5k, False: 0]
  ------------------
  658|  39.5k|        codec->destroyInternal(codec);
  659|  39.5k|    }
  660|  39.5k|    avifFree(codec);
  661|  39.5k|}
avifIsAlpha:
 1017|   181k|{
 1018|   181k|    if (itemCategory == AVIF_ITEM_ALPHA) {
  ------------------
  |  Branch (1018:9): [True: 21.1k, False: 160k]
  ------------------
 1019|  21.1k|        return AVIF_TRUE;
  ------------------
  |  |   88|  21.1k|#define AVIF_TRUE 1
  ------------------
 1020|  21.1k|    }
 1021|   160k|    if (itemCategory >= AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_ALPHA &&
  ------------------
  |  Branch (1021:9): [True: 0, False: 160k]
  ------------------
 1022|      0|        itemCategory < AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_ALPHA + AVIF_SAMPLE_TRANSFORM_MAX_NUM_EXTRA_INPUT_IMAGE_ITEMS) {
  ------------------
  |  |  424|      0|    (AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_ALPHA - AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_COLOR)
  ------------------
  |  Branch (1022:9): [True: 0, False: 0]
  ------------------
 1023|      0|        return AVIF_TRUE;
  ------------------
  |  |   88|      0|#define AVIF_TRUE 1
  ------------------
 1024|      0|    }
 1025|   160k|    return AVIF_FALSE;
  ------------------
  |  |   89|   160k|#define AVIF_FALSE 0
  ------------------
 1026|   160k|}
avifAreGridDimensionsValid:
 1031|    421|{
 1032|       |    // ISO/IEC 23000-22:2019, Section 7.3.11.4.2:
 1033|       |    //   - the tile_width shall be greater than or equal to 64, and should be a multiple of 64
 1034|       |    //   - the tile_height shall be greater than or equal to 64, and should be a multiple of 64
 1035|       |    // The "should" part is ignored here.
 1036|    421|    if ((tileW < 64) || (tileH < 64)) {
  ------------------
  |  Branch (1036:9): [True: 0, False: 421]
  |  Branch (1036:25): [True: 4, False: 417]
  ------------------
 1037|      4|        avifDiagnosticsPrintf(diag,
 1038|      4|                              "Grid image tile width (%u) or height (%u) cannot be smaller than 64. "
 1039|      4|                              "See MIAF (ISO/IEC 23000-22:2019), Section 7.3.11.4.2",
 1040|      4|                              tileW,
 1041|      4|                              tileH);
 1042|      4|        return AVIF_FALSE;
  ------------------
  |  |   89|      4|#define AVIF_FALSE 0
  ------------------
 1043|      4|    }
 1044|       |
 1045|       |    // ISO/IEC 23000-22:2019, Section 7.3.11.4.2:
 1046|       |    //   - when the images are in the 4:2:2 chroma sampling format the horizontal tile offsets and widths,
 1047|       |    //     and the output width, shall be even numbers;
 1048|       |    //   - when the images are in the 4:2:0 chroma sampling format both the horizontal and vertical tile
 1049|       |    //     offsets and widths, and the output width and height, shall be even numbers.
 1050|       |    // If the rules above were not respected, the following problematic situation may happen:
 1051|       |    //   Some 4:2:0 image is 650 pixels wide and has 10 cell columns, each being 65 pixels wide.
 1052|       |    //   The chroma plane of the whole image is 325 pixels wide. The chroma plane of each cell is 33 pixels wide.
 1053|       |    //   33*10 - 325 gives 5 extra pixels with no specified destination in the reconstructed image.
 1054|       |
 1055|       |    // Tile offsets are not enforced since they depend on tile size (ISO/IEC 23008-12:2017, Section 6.6.2.3.1):
 1056|       |    //   The reconstructed image is formed by tiling the input images into a grid [...] without gap or overlap
 1057|    417|    if ((((yuvFormat == AVIF_PIXEL_FORMAT_YUV420) || (yuvFormat == AVIF_PIXEL_FORMAT_YUV422)) &&
  ------------------
  |  Branch (1057:11): [True: 148, False: 269]
  |  Branch (1057:54): [True: 7, False: 262]
  ------------------
 1058|    155|         (((imageW % 2) != 0) || ((tileW % 2) != 0))) ||
  ------------------
  |  Branch (1058:11): [True: 3, False: 152]
  |  Branch (1058:34): [True: 2, False: 150]
  ------------------
 1059|    412|        ((yuvFormat == AVIF_PIXEL_FORMAT_YUV420) && (((imageH % 2) != 0) || ((tileH % 2) != 0)))) {
  ------------------
  |  Branch (1059:10): [True: 143, False: 269]
  |  Branch (1059:54): [True: 3, False: 140]
  |  Branch (1059:77): [True: 3, False: 137]
  ------------------
 1060|     11|        avifDiagnosticsPrintf(diag,
 1061|     11|                              "Grid image width (%u) or height (%u) or tile width (%u) or height (%u) "
 1062|     11|                              "shall be even if chroma is subsampled in that dimension. "
 1063|     11|                              "See MIAF (ISO/IEC 23000-22:2019), Section 7.3.11.4.2",
 1064|     11|                              imageW,
 1065|     11|                              imageH,
 1066|     11|                              tileW,
 1067|     11|                              tileH);
 1068|     11|        return AVIF_FALSE;
  ------------------
  |  |   89|     11|#define AVIF_FALSE 0
  ------------------
 1069|     11|    }
 1070|    406|    return AVIF_TRUE;
  ------------------
  |  |   88|    406|#define AVIF_TRUE 1
  ------------------
 1071|    417|}
avifCodecTypeFromChoice:
 1241|  39.5k|{
 1242|  39.5k|    struct AvailableCodec * availableCodec = findAvailableCodec(choice, requiredFlags);
 1243|  39.5k|    if (availableCodec) {
  ------------------
  |  Branch (1243:9): [True: 39.5k, False: 0]
  ------------------
 1244|  39.5k|        return availableCodec->type;
 1245|  39.5k|    }
 1246|      0|    return AVIF_CODEC_TYPE_UNKNOWN;
 1247|  39.5k|}
avifCodecCreate:
 1260|  39.5k|{
 1261|  39.5k|    *codec = NULL;
 1262|  39.5k|    struct AvailableCodec * availableCodec = findAvailableCodec(choice, requiredFlags);
 1263|  39.5k|    AVIF_CHECKERR(availableCodec != NULL, AVIF_RESULT_NO_CODEC_AVAILABLE);
  ------------------
  |  |   45|  39.5k|    do {                        \
  |  |   46|  39.5k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 39.5k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  39.5k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 39.5k]
  |  |  ------------------
  ------------------
 1264|  39.5k|    *codec = availableCodec->create();
 1265|  39.5k|    AVIF_CHECKERR(*codec != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  39.5k|    do {                        \
  |  |   46|  39.5k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 39.5k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  39.5k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 39.5k]
  |  |  ------------------
  ------------------
 1266|  39.5k|    return AVIF_RESULT_OK;
 1267|  39.5k|}
avifGainMapCreate:
 1307|    398|{
 1308|    398|    avifGainMap * gainMap = (avifGainMap *)avifAlloc(sizeof(avifGainMap));
 1309|    398|    if (!gainMap) {
  ------------------
  |  Branch (1309:9): [True: 0, False: 398]
  ------------------
 1310|      0|        return NULL;
 1311|      0|    }
 1312|    398|    avifGainMapSetDefaults(gainMap);
 1313|       |    // Note that some functions like avifDecoderFindGainMapItem() allocate avifGainMap directly on
 1314|       |    // the stack instead of calling avifGainMapCreate() to simplify error handling. This works under
 1315|       |    // the assumption that no complex initialization (such as dynamic allocation of fields) takes
 1316|       |    // place here. If this function becomes more complex than one alloc + setDefaults, such code
 1317|       |    // might need to be changed.
 1318|    398|    return gainMap;
 1319|    398|}
avifGainMapSetDefaults:
 1322|  1.09k|{
 1323|  1.09k|    memset(gainMap, 0, sizeof(avifGainMap));
 1324|  1.09k|    gainMap->altColorPrimaries = AVIF_COLOR_PRIMARIES_UNSPECIFIED;
 1325|  1.09k|    gainMap->altTransferCharacteristics = AVIF_TRANSFER_CHARACTERISTICS_UNSPECIFIED;
 1326|  1.09k|    gainMap->altMatrixCoefficients = AVIF_MATRIX_COEFFICIENTS_UNSPECIFIED;
 1327|  1.09k|    gainMap->altYUVRange = AVIF_RANGE_FULL;
 1328|  1.09k|    gainMap->useBaseColorSpace = AVIF_TRUE;
  ------------------
  |  |   88|  1.09k|#define AVIF_TRUE 1
  ------------------
 1329|       |    // Set all denominators to valid values (1).
 1330|  4.37k|    for (int i = 0; i < 3; ++i) {
  ------------------
  |  Branch (1330:21): [True: 3.27k, False: 1.09k]
  ------------------
 1331|  3.27k|        gainMap->gainMapMin[i].d = 1;
 1332|  3.27k|        gainMap->gainMapMax[i].d = 1;
 1333|  3.27k|        gainMap->gainMapGamma[i].n = 1;
 1334|  3.27k|        gainMap->gainMapGamma[i].d = 1;
 1335|  3.27k|        gainMap->baseOffset[i].d = 1;
 1336|  3.27k|        gainMap->alternateOffset[i].d = 1;
 1337|  3.27k|    }
 1338|  1.09k|    gainMap->baseHdrHeadroom.d = 1;
 1339|  1.09k|    gainMap->alternateHdrHeadroom.d = 1;
 1340|  1.09k|}
avifGainMapDestroy:
 1343|    398|{
 1344|    398|    if (gainMap->image) {
  ------------------
  |  Branch (1344:9): [True: 295, False: 103]
  ------------------
 1345|    295|        avifImageDestroy(gainMap->image);
 1346|    295|    }
 1347|    398|    avifRWDataFree(&gainMap->altICC);
 1348|    398|    avifFree(gainMap);
 1349|    398|}
avif.c:findAvailableCodec:
 1214|  79.0k|{
 1215|   121k|    for (int i = 0; i < availableCodecsCount; ++i) {
  ------------------
  |  Branch (1215:21): [True: 121k, False: 0]
  ------------------
 1216|   121k|        if ((choice != AVIF_CODEC_CHOICE_AUTO) && (availableCodecs[i].choice != choice)) {
  ------------------
  |  Branch (1216:13): [True: 103k, False: 18.4k]
  |  Branch (1216:51): [True: 42.4k, False: 60.6k]
  ------------------
 1217|  42.4k|            continue;
 1218|  42.4k|        }
 1219|  79.0k|        if (requiredFlags && ((availableCodecs[i].flags & requiredFlags) != requiredFlags)) {
  ------------------
  |  Branch (1219:13): [True: 79.0k, False: 0]
  |  Branch (1219:30): [True: 0, False: 79.0k]
  ------------------
 1220|      0|            continue;
 1221|      0|        }
 1222|  79.0k|        if ((choice == AVIF_CODEC_CHOICE_AUTO) && (availableCodecs[i].choice == AVIF_CODEC_CHOICE_AVM)) {
  ------------------
  |  Branch (1222:13): [True: 18.4k, False: 60.6k]
  |  Branch (1222:51): [True: 0, False: 18.4k]
  ------------------
 1223|       |            // AV2 is experimental and cannot be the default, it must be explicitly selected.
 1224|      0|            continue;
 1225|      0|        }
 1226|  79.0k|        return &availableCodecs[i];
 1227|  79.0k|    }
 1228|      0|    return NULL;
 1229|  79.0k|}

avifCodecCreateAOM:
 1386|  21.2k|{
 1387|  21.2k|    avifCodec * codec = (avifCodec *)avifAlloc(sizeof(avifCodec));
 1388|  21.2k|    if (codec == NULL) {
  ------------------
  |  Branch (1388:9): [True: 0, False: 21.2k]
  ------------------
 1389|      0|        return NULL;
 1390|      0|    }
 1391|  21.2k|    memset(codec, 0, sizeof(struct avifCodec));
 1392|       |
 1393|  21.2k|#if defined(AVIF_CODEC_AOM_DECODE)
 1394|  21.2k|    codec->getNextImage = aomCodecGetNextImage;
 1395|  21.2k|#endif
 1396|       |
 1397|  21.2k|#if defined(AVIF_CODEC_AOM_ENCODE)
 1398|  21.2k|    codec->encodeImage = aomCodecEncodeImage;
 1399|  21.2k|    codec->encodeFinish = aomCodecEncodeFinish;
 1400|  21.2k|#endif
 1401|       |
 1402|  21.2k|    codec->destroyInternal = aomCodecDestroyInternal;
 1403|  21.2k|    codec->internal = (struct avifCodecInternal *)avifAlloc(sizeof(struct avifCodecInternal));
 1404|  21.2k|    if (codec->internal == NULL) {
  ------------------
  |  Branch (1404:9): [True: 0, False: 21.2k]
  ------------------
 1405|      0|        avifFree(codec);
 1406|      0|        return NULL;
 1407|      0|    }
 1408|  21.2k|    memset(codec->internal, 0, sizeof(struct avifCodecInternal));
 1409|  21.2k|    return codec;
 1410|  21.2k|}
codec_aom.c:aomCodecGetNextImage:
  108|  23.4k|{
  109|  23.4k|    assert(sample);
  110|       |
  111|  23.4k|    aom_codec_iface_t * const decoderInterface = aom_codec_av1_dx();
  112|  23.4k|    struct aom_codec_stream_info streamInfo = { 0 };
  113|  23.4k|    aom_codec_err_t err = aom_codec_peek_stream_info(decoderInterface, sample->data.data, sample->data.size, &streamInfo);
  114|  23.4k|    if (err != AOM_CODEC_OK) {
  ------------------
  |  Branch (114:9): [True: 1.00k, False: 22.4k]
  ------------------
  115|  1.00k|        avifDiagnosticsPrintf(codec->diag, "aom_codec_peek_stream_info() failed: %s", aom_codec_err_to_string(err));
  116|  1.00k|        return AVIF_FALSE;
  ------------------
  |  |   89|  1.00k|#define AVIF_FALSE 0
  ------------------
  117|  1.00k|    }
  118|  22.4k|    if (streamInfo.w == 0 || streamInfo.h == 0) {
  ------------------
  |  Branch (118:9): [True: 3.74k, False: 18.7k]
  |  Branch (118:30): [True: 0, False: 18.7k]
  ------------------
  119|       |        // The sequence header was not found.
  120|  3.74k|        if (!codec->internal->decoderInitialized) {
  ------------------
  |  Branch (120:13): [True: 15, False: 3.73k]
  ------------------
  121|       |            // Treat it as an error if the first frame isn't preceded by a sequence header.
  122|     15|            return AVIF_FALSE;
  ------------------
  |  |   89|     15|#define AVIF_FALSE 0
  ------------------
  123|     15|        }
  124|  18.7k|    } else {
  125|  18.7k|        if (avifDimensionsTooLarge(streamInfo.w, streamInfo.h, codec->imageSizeLimit, codec->imageDimensionLimit)) {
  ------------------
  |  Branch (125:13): [True: 33, False: 18.7k]
  ------------------
  126|     33|            avifDiagnosticsPrintf(codec->diag, "Image dimensions too large: %dx%d", streamInfo.w, streamInfo.h);
  127|     33|            return AVIF_FALSE;
  ------------------
  |  |   89|     33|#define AVIF_FALSE 0
  ------------------
  128|     33|        }
  129|  18.7k|    }
  130|       |
  131|  22.4k|    if (!codec->internal->decoderInitialized) {
  ------------------
  |  Branch (131:9): [True: 17.9k, False: 4.50k]
  ------------------
  132|  17.9k|        aom_codec_dec_cfg_t cfg;
  133|  17.9k|        memset(&cfg, 0, sizeof(aom_codec_dec_cfg_t));
  134|  17.9k|        cfg.threads = codec->maxThreads;
  135|  17.9k|        cfg.allow_lowbitdepth = 1;
  136|       |
  137|  17.9k|        if (aom_codec_dec_init(&codec->internal->decoder, decoderInterface, &cfg, 0)) {
  ------------------
  |  Branch (137:13): [True: 0, False: 17.9k]
  ------------------
  138|      0|            aomDiagPrintf(codec->diag, "aom_codec_dec_init()", &codec->internal->decoder);
  139|      0|            return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
  140|      0|        }
  141|  17.9k|        codec->internal->decoderInitialized = AVIF_TRUE;
  ------------------
  |  |   88|  17.9k|#define AVIF_TRUE 1
  ------------------
  142|       |
  143|  17.9k|        if (aom_codec_control(&codec->internal->decoder, AV1D_SET_OUTPUT_ALL_LAYERS, codec->allLayers)) {
  ------------------
  |  Branch (143:13): [True: 0, False: 17.9k]
  ------------------
  144|      0|            aomDiagPrintf(codec->diag, "aom_codec_control(AV1D_SET_OUTPUT_ALL_LAYERS)", &codec->internal->decoder);
  145|      0|            return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
  146|      0|        }
  147|  17.9k|        if (aom_codec_control(&codec->internal->decoder, AV1D_SET_OPERATING_POINT, codec->operatingPoint)) {
  ------------------
  |  Branch (147:13): [True: 0, False: 17.9k]
  ------------------
  148|      0|            aomDiagPrintf(codec->diag, "aom_codec_control(AV1D_SET_OPERATING_POINT)", &codec->internal->decoder);
  149|      0|            return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
  150|      0|        }
  151|       |
  152|  17.9k|        codec->internal->iter = NULL;
  153|  17.9k|    }
  154|       |
  155|  22.4k|    aom_image_t * nextFrame = NULL;
  156|  22.4k|    uint8_t spatialID = AVIF_SPATIAL_ID_UNSET;
  ------------------
  |  |  461|  22.4k|#define AVIF_SPATIAL_ID_UNSET 0xff
  ------------------
  157|  29.6k|    for (;;) {
  158|  29.6k|        nextFrame = aom_codec_get_frame(&codec->internal->decoder, &codec->internal->iter);
  159|  29.6k|        if (nextFrame) {
  ------------------
  |  Branch (159:13): [True: 7.04k, False: 22.5k]
  ------------------
  160|  7.04k|            if (spatialID != AVIF_SPATIAL_ID_UNSET) {
  ------------------
  |  |  461|  7.04k|#define AVIF_SPATIAL_ID_UNSET 0xff
  ------------------
  |  Branch (160:17): [True: 0, False: 7.04k]
  ------------------
  161|       |                // This requires libaom v3.1.2 or later, which has the fix for
  162|       |                // https://crbug.com/aomedia/2993.
  163|      0|                if (spatialID == nextFrame->spatial_id) {
  ------------------
  |  Branch (163:21): [True: 0, False: 0]
  ------------------
  164|       |                    // Found the correct spatial_id.
  165|      0|                    break;
  166|      0|                }
  167|  7.04k|            } else {
  168|       |                // Got an image!
  169|  7.04k|                break;
  170|  7.04k|            }
  171|  22.5k|        } else if (sample) {
  ------------------
  |  Branch (171:20): [True: 22.4k, False: 128]
  ------------------
  172|  22.4k|            codec->internal->iter = NULL;
  173|  22.4k|            if (aom_codec_decode(&codec->internal->decoder, sample->data.data, sample->data.size, NULL)) {
  ------------------
  |  Branch (173:17): [True: 15.2k, False: 7.15k]
  ------------------
  174|  15.2k|                aomDiagPrintf(codec->diag, "aom_codec_decode()", &codec->internal->decoder);
  175|  15.2k|                return AVIF_FALSE;
  ------------------
  |  |   89|  15.2k|#define AVIF_FALSE 0
  ------------------
  176|  15.2k|            }
  177|  7.15k|            spatialID = sample->spatialID;
  178|  7.15k|            sample = NULL;
  179|  7.15k|        } else {
  180|    128|            break;
  181|    128|        }
  182|  29.6k|    }
  183|       |
  184|  7.16k|    if (nextFrame) {
  ------------------
  |  Branch (184:9): [True: 7.04k, False: 128]
  ------------------
  185|  7.04k|        codec->internal->image = nextFrame;
  186|  7.04k|    } else {
  187|    128|        if (alpha && codec->internal->image) {
  ------------------
  |  Branch (187:13): [True: 4, False: 124]
  |  Branch (187:22): [True: 4, False: 0]
  ------------------
  188|       |            // Special case: reuse last alpha frame
  189|    124|        } else {
  190|    124|            return AVIF_FALSE;
  ------------------
  |  |   89|    124|#define AVIF_FALSE 0
  ------------------
  191|    124|        }
  192|    128|    }
  193|       |
  194|  7.04k|    avifBool isColor = !alpha;
  195|  7.04k|    if (isColor) {
  ------------------
  |  Branch (195:9): [True: 5.71k, False: 1.33k]
  ------------------
  196|       |        // Color (YUV) planes - set image to correct size / format, fill color
  197|       |
  198|  5.71k|        avifPixelFormat yuvFormat = AVIF_PIXEL_FORMAT_NONE;
  199|  5.71k|        switch (codec->internal->image->fmt) {
  200|  2.60k|            case AOM_IMG_FMT_I420:
  ------------------
  |  Branch (200:13): [True: 2.60k, False: 3.10k]
  ------------------
  201|  3.19k|            case AOM_IMG_FMT_I42016:
  ------------------
  |  Branch (201:13): [True: 592, False: 5.12k]
  ------------------
  202|  3.19k|                yuvFormat = AVIF_PIXEL_FORMAT_YUV420;
  203|  3.19k|                break;
  204|     52|            case AOM_IMG_FMT_I422:
  ------------------
  |  Branch (204:13): [True: 52, False: 5.66k]
  ------------------
  205|    964|            case AOM_IMG_FMT_I42216:
  ------------------
  |  Branch (205:13): [True: 912, False: 4.80k]
  ------------------
  206|    964|                yuvFormat = AVIF_PIXEL_FORMAT_YUV422;
  207|    964|                break;
  208|  1.32k|            case AOM_IMG_FMT_I444:
  ------------------
  |  Branch (208:13): [True: 1.32k, False: 4.39k]
  ------------------
  209|  1.55k|            case AOM_IMG_FMT_I44416:
  ------------------
  |  Branch (209:13): [True: 229, False: 5.48k]
  ------------------
  210|  1.55k|                yuvFormat = AVIF_PIXEL_FORMAT_YUV444;
  211|  1.55k|                break;
  212|      0|            case AOM_IMG_FMT_NONE:
  ------------------
  |  Branch (212:13): [True: 0, False: 5.71k]
  ------------------
  213|      0|#if defined(AOM_HAVE_IMG_FMT_NV12)
  214|       |            // Although the libaom encoder supports the NV12 image format as an input format, the
  215|       |            // libaom decoder does not support NV12 as an output format.
  216|      0|            case AOM_IMG_FMT_NV12:
  ------------------
  |  Branch (216:13): [True: 0, False: 5.71k]
  ------------------
  217|      0|#endif
  218|      0|            case AOM_IMG_FMT_YV12:
  ------------------
  |  Branch (218:13): [True: 0, False: 5.71k]
  ------------------
  219|      0|            case AOM_IMG_FMT_YV1216:
  ------------------
  |  Branch (219:13): [True: 0, False: 5.71k]
  ------------------
  220|      0|            default:
  ------------------
  |  Branch (220:13): [True: 0, False: 5.71k]
  ------------------
  221|      0|                return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
  222|  5.71k|        }
  223|  5.71k|        if (codec->internal->image->monochrome) {
  ------------------
  |  Branch (223:13): [True: 235, False: 5.47k]
  ------------------
  224|    235|            yuvFormat = AVIF_PIXEL_FORMAT_YUV400;
  225|    235|        }
  226|       |
  227|  5.71k|        image->width = codec->internal->image->d_w;
  228|  5.71k|        image->height = codec->internal->image->d_h;
  229|  5.71k|        image->depth = codec->internal->image->bit_depth;
  230|       |
  231|  5.71k|        image->yuvFormat = yuvFormat;
  232|  5.71k|        image->yuvRange = (codec->internal->image->range == AOM_CR_STUDIO_RANGE) ? AVIF_RANGE_LIMITED : AVIF_RANGE_FULL;
  ------------------
  |  Branch (232:27): [True: 980, False: 4.73k]
  ------------------
  233|  5.71k|        image->yuvChromaSamplePosition = (avifChromaSamplePosition)codec->internal->image->csp;
  234|       |
  235|  5.71k|        image->colorPrimaries = (avifColorPrimaries)codec->internal->image->cp;
  236|  5.71k|        image->transferCharacteristics = (avifTransferCharacteristics)codec->internal->image->tc;
  237|  5.71k|        image->matrixCoefficients = (avifMatrixCoefficients)codec->internal->image->mc;
  238|       |
  239|       |        // Steal the pointers from the decoder's image directly
  240|  5.71k|        avifImageFreePlanes(image, AVIF_PLANES_YUV);
  241|  5.71k|        int yuvPlaneCount = (yuvFormat == AVIF_PIXEL_FORMAT_YUV400) ? 1 : 3;
  ------------------
  |  Branch (241:29): [True: 235, False: 5.47k]
  ------------------
  242|  22.3k|        for (int yuvPlane = 0; yuvPlane < yuvPlaneCount; ++yuvPlane) {
  ------------------
  |  Branch (242:32): [True: 16.6k, False: 5.71k]
  ------------------
  243|  16.6k|            image->yuvPlanes[yuvPlane] = codec->internal->image->planes[yuvPlane];
  244|  16.6k|            image->yuvRowBytes[yuvPlane] = codec->internal->image->stride[yuvPlane];
  245|  16.6k|        }
  246|  5.71k|        image->imageOwnsYUVPlanes = AVIF_FALSE;
  ------------------
  |  |   89|  5.71k|#define AVIF_FALSE 0
  ------------------
  247|  5.71k|    } else {
  248|       |        // Alpha plane - set image to correct size, fill alpha
  249|       |
  250|  1.33k|        image->width = codec->internal->image->d_w;
  251|  1.33k|        image->height = codec->internal->image->d_h;
  252|  1.33k|        image->depth = codec->internal->image->bit_depth;
  253|       |
  254|  1.33k|        avifImageFreePlanes(image, AVIF_PLANES_A);
  255|  1.33k|        image->alphaPlane = codec->internal->image->planes[0];
  256|  1.33k|        image->alphaRowBytes = codec->internal->image->stride[0];
  257|  1.33k|        *isLimitedRangeAlpha = (codec->internal->image->range == AOM_CR_STUDIO_RANGE);
  258|  1.33k|        image->imageOwnsAlphaPlane = AVIF_FALSE;
  ------------------
  |  |   89|  1.33k|#define AVIF_FALSE 0
  ------------------
  259|  1.33k|    }
  260|       |
  261|  7.04k|    return AVIF_TRUE;
  ------------------
  |  |   88|  7.04k|#define AVIF_TRUE 1
  ------------------
  262|  7.04k|}
codec_aom.c:aomDiagPrintf:
   95|  15.2k|{
   96|  15.2k|    const char * error = aom_codec_error(ctx);
   97|  15.2k|    const char * error_detail = aom_codec_error_detail(ctx);
   98|  15.2k|    avifDiagnosticsPrintf(diag, "%s failed: %s: %s", func, error, error_detail ? error_detail : "no error detail");
  ------------------
  |  Branch (98:67): [True: 13.9k, False: 1.30k]
  ------------------
   99|  15.2k|}
codec_aom.c:aomCodecDestroyInternal:
   77|  21.2k|{
   78|  21.2k|#if defined(AVIF_CODEC_AOM_DECODE)
   79|  21.2k|    if (codec->internal->decoderInitialized) {
  ------------------
  |  Branch (79:9): [True: 17.9k, False: 3.27k]
  ------------------
   80|  17.9k|        aom_codec_destroy(&codec->internal->decoder);
   81|  17.9k|    }
   82|  21.2k|#endif
   83|       |
   84|  21.2k|#if defined(AVIF_CODEC_AOM_ENCODE)
   85|  21.2k|    if (codec->internal->encoderInitialized) {
  ------------------
  |  Branch (85:9): [True: 0, False: 21.2k]
  ------------------
   86|      0|        aom_codec_destroy(&codec->internal->encoder);
   87|      0|    }
   88|  21.2k|#endif
   89|       |
   90|  21.2k|    avifFree(codec->internal);
   91|  21.2k|}

avifCodecCreateDav1d:
  234|  18.3k|{
  235|  18.3k|    avifCodec * codec = (avifCodec *)avifAlloc(sizeof(avifCodec));
  236|  18.3k|    if (codec == NULL) {
  ------------------
  |  Branch (236:9): [True: 0, False: 18.3k]
  ------------------
  237|      0|        return NULL;
  238|      0|    }
  239|  18.3k|    memset(codec, 0, sizeof(struct avifCodec));
  240|  18.3k|    codec->getNextImage = dav1dCodecGetNextImage;
  241|  18.3k|    codec->destroyInternal = dav1dCodecDestroyInternal;
  242|       |
  243|  18.3k|    codec->internal = (struct avifCodecInternal *)avifAlloc(sizeof(struct avifCodecInternal));
  244|  18.3k|    if (codec->internal == NULL) {
  ------------------
  |  Branch (244:9): [True: 0, False: 18.3k]
  ------------------
  245|      0|        avifFree(codec);
  246|      0|        return NULL;
  247|      0|    }
  248|  18.3k|    memset(codec->internal, 0, sizeof(struct avifCodecInternal));
  249|  18.3k|    return codec;
  250|  18.3k|}
codec_dav1d.c:dav1dCodecGetNextImage:
   63|  20.8k|{
   64|  20.8k|    if (codec->internal->dav1dContext == NULL) {
  ------------------
  |  Branch (64:9): [True: 17.2k, False: 3.64k]
  ------------------
   65|  17.2k|        Dav1dSettings dav1dSettings;
   66|  17.2k|        dav1d_default_settings(&dav1dSettings);
   67|       |        // Give all available threads to decode a single frame as fast as possible
   68|  17.2k|#if DAV1D_API_VERSION_MAJOR >= 6
   69|  17.2k|        dav1dSettings.max_frame_delay = 1;
   70|  17.2k|        dav1dSettings.n_threads = AVIF_CLAMP(codec->maxThreads, 1, DAV1D_MAX_THREADS);
  ------------------
  |  |   18|  17.2k|#define AVIF_CLAMP(x, low, high) (((x) < (low)) ? (low) : (((high) < (x)) ? (high) : (x)))
  |  |  ------------------
  |  |  |  Branch (18:35): [True: 453, False: 16.7k]
  |  |  |  Branch (18:60): [True: 0, False: 16.7k]
  |  |  ------------------
  ------------------
   71|       |#else
   72|       |        dav1dSettings.n_frame_threads = 1;
   73|       |        dav1dSettings.n_tile_threads = AVIF_CLAMP(codec->maxThreads, 1, DAV1D_MAX_TILE_THREADS);
   74|       |#endif // DAV1D_API_VERSION_MAJOR >= 6
   75|       |        // Set a maximum frame size limit to avoid OOM'ing fuzzers. In 32-bit builds, if
   76|       |        // frame_size_limit > 8192 * 8192, dav1d reduces frame_size_limit to 8192 * 8192 and logs
   77|       |        // a message, so we set frame_size_limit to at most 8192 * 8192 to avoid the dav1d_log
   78|       |        // message.
   79|  17.2k|        dav1dSettings.frame_size_limit = (sizeof(size_t) < 8) ? AVIF_MIN(codec->imageSizeLimit, 8192 * 8192) : codec->imageSizeLimit;
  ------------------
  |  |   19|      0|#define AVIF_MIN(a, b) (((a) < (b)) ? (a) : (b))
  |  |  ------------------
  |  |  |  Branch (19:25): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  |  Branch (79:42): [Folded, False: 17.2k]
  ------------------
   80|  17.2k|        dav1dSettings.logger.cookie = codec;
   81|  17.2k|        dav1dSettings.logger.callback = avifDav1dLogCallback;
   82|  17.2k|        dav1dSettings.operating_point = codec->operatingPoint;
   83|  17.2k|        dav1dSettings.all_layers = codec->allLayers;
   84|       |
   85|  17.2k|        if (dav1d_open(&codec->internal->dav1dContext, &dav1dSettings) != 0) {
  ------------------
  |  Branch (85:13): [True: 0, False: 17.2k]
  ------------------
   86|      0|            return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
   87|      0|        }
   88|  17.2k|    }
   89|       |
   90|  20.8k|    avifBool gotPicture = AVIF_FALSE;
  ------------------
  |  |   89|  20.8k|#define AVIF_FALSE 0
  ------------------
   91|  20.8k|    Dav1dPicture nextFrame;
   92|  20.8k|    memset(&nextFrame, 0, sizeof(Dav1dPicture));
   93|       |
   94|  20.8k|    Dav1dData dav1dData;
   95|  20.8k|    if (dav1d_data_wrap(&dav1dData, sample->data.data, sample->data.size, avifDav1dFreeCallback, NULL) != 0) {
  ------------------
  |  Branch (95:9): [True: 0, False: 20.8k]
  ------------------
   96|      0|        return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
   97|      0|    }
   98|       |
   99|  20.8k|    int res;
  100|  20.8k|    for (;;) {
  101|  20.8k|        if (dav1dData.data) {
  ------------------
  |  Branch (101:13): [True: 20.8k, False: 0]
  ------------------
  102|  20.8k|            res = dav1d_send_data(codec->internal->dav1dContext, &dav1dData);
  103|  20.8k|            if ((res < 0) && (res != DAV1D_ERR(EAGAIN))) {
  ------------------
  |  Branch (103:17): [True: 10.8k, False: 10.0k]
  |  Branch (103:30): [True: 10.8k, False: 0]
  ------------------
  104|  10.8k|                dav1d_data_unref(&dav1dData);
  105|  10.8k|                return AVIF_FALSE;
  ------------------
  |  |   89|  10.8k|#define AVIF_FALSE 0
  ------------------
  106|  10.8k|            }
  107|  20.8k|        }
  108|       |
  109|  10.0k|        res = dav1d_get_picture(codec->internal->dav1dContext, &nextFrame);
  110|  10.0k|        if (res == DAV1D_ERR(EAGAIN)) {
  ------------------
  |  Branch (110:13): [True: 1.28k, False: 8.73k]
  ------------------
  111|  1.28k|            if (dav1dData.data) {
  ------------------
  |  Branch (111:17): [True: 0, False: 1.28k]
  ------------------
  112|       |                // send more data
  113|      0|                continue;
  114|      0|            }
  115|  1.28k|            return AVIF_FALSE;
  ------------------
  |  |   89|  1.28k|#define AVIF_FALSE 0
  ------------------
  116|  8.73k|        } else if (res < 0) {
  ------------------
  |  Branch (116:20): [True: 0, False: 8.73k]
  ------------------
  117|       |            // No more frames
  118|      0|            if (dav1dData.data) {
  ------------------
  |  Branch (118:17): [True: 0, False: 0]
  ------------------
  119|      0|                dav1d_data_unref(&dav1dData);
  120|      0|            }
  121|      0|            return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
  122|  8.73k|        } else {
  123|       |            // Got a picture!
  124|  8.73k|            if ((sample->spatialID != AVIF_SPATIAL_ID_UNSET) && (sample->spatialID != nextFrame.frame_hdr->spatial_id)) {
  ------------------
  |  |  461|  8.73k|#define AVIF_SPATIAL_ID_UNSET 0xff
  ------------------
  |  Branch (124:17): [True: 0, False: 8.73k]
  |  Branch (124:65): [True: 0, False: 0]
  ------------------
  125|       |                // Layer selection: skip this unwanted layer
  126|      0|                dav1d_picture_unref(&nextFrame);
  127|  8.73k|            } else {
  128|  8.73k|                gotPicture = AVIF_TRUE;
  ------------------
  |  |   88|  8.73k|#define AVIF_TRUE 1
  ------------------
  129|  8.73k|                break;
  130|  8.73k|            }
  131|  8.73k|        }
  132|  10.0k|    }
  133|  8.73k|    if (dav1dData.data) {
  ------------------
  |  Branch (133:9): [True: 0, False: 8.73k]
  ------------------
  134|      0|        dav1d_data_unref(&dav1dData);
  135|      0|    }
  136|       |
  137|       |    // Drain all buffered frames in the decoder.
  138|       |    //
  139|       |    // The sample should have only one frame of the desired layer. If there are more frames after
  140|       |    // that frame, we need to discard them so that they won't be mistakenly output when the decoder
  141|       |    // is used to decode another sample.
  142|  8.73k|    Dav1dPicture bufferedFrame;
  143|  8.73k|    memset(&bufferedFrame, 0, sizeof(Dav1dPicture));
  144|  9.21k|    do {
  145|  9.21k|        res = dav1d_get_picture(codec->internal->dav1dContext, &bufferedFrame);
  146|  9.21k|        if (res < 0) {
  ------------------
  |  Branch (146:13): [True: 8.73k, False: 484]
  ------------------
  147|  8.73k|            if (res != DAV1D_ERR(EAGAIN)) {
  ------------------
  |  Branch (147:17): [True: 1.44k, False: 7.28k]
  ------------------
  148|  1.44k|                if (gotPicture) {
  ------------------
  |  Branch (148:21): [True: 1.44k, False: 0]
  ------------------
  149|  1.44k|                    dav1d_picture_unref(&nextFrame);
  150|  1.44k|                }
  151|  1.44k|                return AVIF_FALSE;
  ------------------
  |  |   89|  1.44k|#define AVIF_FALSE 0
  ------------------
  152|  1.44k|            }
  153|  8.73k|        } else {
  154|    484|            dav1d_picture_unref(&bufferedFrame);
  155|    484|        }
  156|  9.21k|    } while (res == 0);
  ------------------
  |  Branch (156:14): [True: 484, False: 7.28k]
  ------------------
  157|       |
  158|  7.28k|    if (gotPicture) {
  ------------------
  |  Branch (158:9): [True: 7.28k, False: 0]
  ------------------
  159|  7.28k|        dav1d_picture_unref(&codec->internal->dav1dPicture);
  160|  7.28k|        codec->internal->dav1dPicture = nextFrame;
  161|  7.28k|        codec->internal->colorRange = codec->internal->dav1dPicture.seq_hdr->color_range ? AVIF_RANGE_FULL : AVIF_RANGE_LIMITED;
  ------------------
  |  Branch (161:39): [True: 4.96k, False: 2.31k]
  ------------------
  162|  7.28k|        codec->internal->hasPicture = AVIF_TRUE;
  ------------------
  |  |   88|  7.28k|#define AVIF_TRUE 1
  ------------------
  163|  7.28k|    } else {
  164|      0|        if (alpha && codec->internal->hasPicture) {
  ------------------
  |  Branch (164:13): [True: 0, False: 0]
  |  Branch (164:22): [True: 0, False: 0]
  ------------------
  165|       |            // Special case: reuse last alpha frame
  166|      0|        } else {
  167|      0|            return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
  168|      0|        }
  169|      0|    }
  170|       |
  171|  7.28k|    Dav1dPicture * dav1dImage = &codec->internal->dav1dPicture;
  172|  7.28k|    avifBool isColor = !alpha;
  173|  7.28k|    if (isColor) {
  ------------------
  |  Branch (173:9): [True: 6.36k, False: 917]
  ------------------
  174|       |        // Color (YUV) planes - set image to correct size / format, fill color
  175|       |
  176|  6.36k|        avifPixelFormat yuvFormat = AVIF_PIXEL_FORMAT_NONE;
  177|  6.36k|        switch (dav1dImage->p.layout) {
  ------------------
  |  Branch (177:17): [True: 6.36k, False: 0]
  ------------------
  178|    867|            case DAV1D_PIXEL_LAYOUT_I400:
  ------------------
  |  Branch (178:13): [True: 867, False: 5.49k]
  ------------------
  179|    867|                yuvFormat = AVIF_PIXEL_FORMAT_YUV400;
  180|    867|                break;
  181|  2.03k|            case DAV1D_PIXEL_LAYOUT_I420:
  ------------------
  |  Branch (181:13): [True: 2.03k, False: 4.33k]
  ------------------
  182|  2.03k|                yuvFormat = AVIF_PIXEL_FORMAT_YUV420;
  183|  2.03k|                break;
  184|    453|            case DAV1D_PIXEL_LAYOUT_I422:
  ------------------
  |  Branch (184:13): [True: 453, False: 5.91k]
  ------------------
  185|    453|                yuvFormat = AVIF_PIXEL_FORMAT_YUV422;
  186|    453|                break;
  187|  3.01k|            case DAV1D_PIXEL_LAYOUT_I444:
  ------------------
  |  Branch (187:13): [True: 3.01k, False: 3.35k]
  ------------------
  188|  3.01k|                yuvFormat = AVIF_PIXEL_FORMAT_YUV444;
  189|  3.01k|                break;
  190|  6.36k|        }
  191|       |
  192|  6.36k|        image->width = dav1dImage->p.w;
  193|  6.36k|        image->height = dav1dImage->p.h;
  194|  6.36k|        image->depth = dav1dImage->p.bpc;
  195|       |
  196|  6.36k|        image->yuvFormat = yuvFormat;
  197|  6.36k|        image->yuvRange = codec->internal->colorRange;
  198|  6.36k|        image->yuvChromaSamplePosition = (avifChromaSamplePosition)dav1dImage->seq_hdr->chr;
  199|       |
  200|  6.36k|        image->colorPrimaries = (avifColorPrimaries)dav1dImage->seq_hdr->pri;
  201|  6.36k|        image->transferCharacteristics = (avifTransferCharacteristics)dav1dImage->seq_hdr->trc;
  202|  6.36k|        image->matrixCoefficients = (avifMatrixCoefficients)dav1dImage->seq_hdr->mtrx;
  203|       |
  204|       |        // Steal the pointers from the decoder's image directly
  205|  6.36k|        avifImageFreePlanes(image, AVIF_PLANES_YUV);
  206|  6.36k|        int yuvPlaneCount = (yuvFormat == AVIF_PIXEL_FORMAT_YUV400) ? 1 : 3;
  ------------------
  |  Branch (206:29): [True: 867, False: 5.49k]
  ------------------
  207|  23.7k|        for (int yuvPlane = 0; yuvPlane < yuvPlaneCount; ++yuvPlane) {
  ------------------
  |  Branch (207:32): [True: 17.3k, False: 6.36k]
  ------------------
  208|  17.3k|            image->yuvPlanes[yuvPlane] = dav1dImage->data[yuvPlane];
  209|  17.3k|            image->yuvRowBytes[yuvPlane] = (uint32_t)dav1dImage->stride[(yuvPlane == AVIF_CHAN_Y) ? 0 : 1];
  ------------------
  |  Branch (209:73): [True: 6.36k, False: 10.9k]
  ------------------
  210|  17.3k|        }
  211|  6.36k|        image->imageOwnsYUVPlanes = AVIF_FALSE;
  ------------------
  |  |   89|  6.36k|#define AVIF_FALSE 0
  ------------------
  212|  6.36k|    } else {
  213|       |        // Alpha plane - set image to correct size, fill alpha
  214|       |
  215|    917|        image->width = dav1dImage->p.w;
  216|    917|        image->height = dav1dImage->p.h;
  217|    917|        image->depth = dav1dImage->p.bpc;
  218|       |
  219|    917|        avifImageFreePlanes(image, AVIF_PLANES_A);
  220|    917|        image->alphaPlane = dav1dImage->data[0];
  221|    917|        image->alphaRowBytes = (uint32_t)dav1dImage->stride[0];
  222|    917|        *isLimitedRangeAlpha = (codec->internal->colorRange == AVIF_RANGE_LIMITED);
  223|    917|        image->imageOwnsAlphaPlane = AVIF_FALSE;
  ------------------
  |  |   89|    917|#define AVIF_FALSE 0
  ------------------
  224|    917|    }
  225|  7.28k|    return AVIF_TRUE;
  ------------------
  |  |   88|  7.28k|#define AVIF_TRUE 1
  ------------------
  226|  7.28k|}
codec_dav1d.c:avifDav1dLogCallback:
   42|  8.13k|{
   43|  8.13k|    avifCodec * codec = (avifCodec *)cookie;
   44|  8.13k|    vsnprintf(codec->diag->error, AVIF_DIAGNOSTICS_ERROR_BUFFER_SIZE, format, ap);
  ------------------
  |  |   91|  8.13k|#define AVIF_DIAGNOSTICS_ERROR_BUFFER_SIZE 256
  ------------------
   45|  8.13k|}
codec_dav1d.c:avifDav1dFreeCallback:
   35|  20.8k|{
   36|       |    // This data is owned by the decoder; nothing to free here
   37|  20.8k|    (void)buf;
   38|  20.8k|    (void)cookie;
   39|  20.8k|}
codec_dav1d.c:dav1dCodecDestroyInternal:
   48|  18.3k|{
   49|  18.3k|    if (codec->internal->hasPicture) {
  ------------------
  |  Branch (49:9): [True: 5.86k, False: 12.4k]
  ------------------
   50|  5.86k|        dav1d_picture_unref(&codec->internal->dav1dPicture);
   51|  5.86k|    }
   52|  18.3k|    if (codec->internal->dav1dContext) {
  ------------------
  |  Branch (52:9): [True: 17.2k, False: 1.07k]
  ------------------
   53|  17.2k|        dav1d_close(&codec->internal->dav1dContext);
   54|  17.2k|    }
   55|  18.3k|    avifFree(codec->internal);
   56|  18.3k|}

avifDiagnosticsClearError:
   11|   142k|{
   12|   142k|    *diag->error = '\0';
   13|   142k|}
avifDiagnosticsPrintf:
   19|  49.4k|{
   20|  49.4k|    if (!diag) {
  ------------------
  |  Branch (20:9): [True: 0, False: 49.4k]
  ------------------
   21|       |        // It is possible this is NULL (e.g. calls to avifPeekCompatibleFileType())
   22|      0|        return;
   23|      0|    }
   24|  49.4k|    if (*diag->error) {
  ------------------
  |  Branch (24:9): [True: 19.6k, False: 29.8k]
  ------------------
   25|       |        // There is already a detailed error set.
   26|  19.6k|        return;
   27|  19.6k|    }
   28|       |
   29|  29.8k|    va_list args;
   30|  29.8k|    va_start(args, format);
   31|  29.8k|    vsnprintf(diag->error, AVIF_DIAGNOSTICS_ERROR_BUFFER_SIZE, format, args);
  ------------------
  |  |   91|  29.8k|#define AVIF_DIAGNOSTICS_ERROR_BUFFER_SIZE 256
  ------------------
   32|  29.8k|    diag->error[AVIF_DIAGNOSTICS_ERROR_BUFFER_SIZE - 1] = '\0';
  ------------------
  |  |   91|  29.8k|#define AVIF_DIAGNOSTICS_ERROR_BUFFER_SIZE 256
  ------------------
   33|       |    va_end(args);
   34|  29.8k|}

avifGetExifTiffHeaderOffset:
   10|    531|{
   11|    531|    const uint8_t tiffHeaderBE[4] = { 'M', 'M', 0, 42 };
   12|    531|    const uint8_t tiffHeaderLE[4] = { 'I', 'I', 42, 0 };
   13|    531|    exifSize = AVIF_MIN(exifSize, UINT32_MAX);
  ------------------
  |  |   19|    531|#define AVIF_MIN(a, b) (((a) < (b)) ? (a) : (b))
  |  |  ------------------
  |  |  |  Branch (19:25): [True: 531, False: 0]
  |  |  ------------------
  ------------------
   14|  17.7k|    for (*offset = 0; *offset + 4 < exifSize; ++*offset) {
  ------------------
  |  Branch (14:23): [True: 17.7k, False: 3]
  ------------------
   15|  17.7k|        if (!memcmp(&exif[*offset], tiffHeaderBE, 4) || !memcmp(&exif[*offset], tiffHeaderLE, 4)) {
  ------------------
  |  Branch (15:13): [True: 0, False: 17.7k]
  |  Branch (15:57): [True: 528, False: 17.1k]
  ------------------
   16|    528|            return AVIF_RESULT_OK;
   17|    528|        }
   18|  17.7k|    }
   19|       |    // Couldn't find the TIFF header
   20|      3|    return AVIF_RESULT_INVALID_EXIF_PAYLOAD;
   21|    531|}

avifGainMapValidateMetadata:
  416|    497|{
  417|  1.91k|    for (int i = 0; i < 3; ++i) {
  ------------------
  |  Branch (417:21): [True: 1.45k, False: 463]
  ------------------
  418|  1.45k|        if (gainMap->gainMapMin[i].d == 0 || gainMap->gainMapMax[i].d == 0 || gainMap->gainMapGamma[i].d == 0 ||
  ------------------
  |  Branch (418:13): [True: 3, False: 1.44k]
  |  Branch (418:46): [True: 3, False: 1.44k]
  |  Branch (418:79): [True: 5, False: 1.43k]
  ------------------
  419|  1.43k|            gainMap->baseOffset[i].d == 0 || gainMap->alternateOffset[i].d == 0) {
  ------------------
  |  Branch (419:13): [True: 4, False: 1.43k]
  |  Branch (419:46): [True: 4, False: 1.43k]
  ------------------
  420|     19|            avifDiagnosticsPrintf(diag, "Per-channel denominator is 0 in gain map metadata");
  421|     19|            return AVIF_RESULT_INVALID_ARGUMENT;
  422|     19|        }
  423|  1.43k|        if ((int64_t)gainMap->gainMapMax[i].n * gainMap->gainMapMin[i].d <
  ------------------
  |  Branch (423:13): [True: 11, False: 1.42k]
  ------------------
  424|  1.43k|            (int64_t)gainMap->gainMapMin[i].n * gainMap->gainMapMax[i].d) {
  425|     11|            avifDiagnosticsPrintf(diag, "Per-channel max is less than per-channel min in gain map metadata");
  426|     11|            return AVIF_RESULT_INVALID_ARGUMENT;
  427|     11|        }
  428|  1.42k|        if (gainMap->gainMapGamma[i].n == 0) {
  ------------------
  |  Branch (428:13): [True: 4, False: 1.41k]
  ------------------
  429|      4|            avifDiagnosticsPrintf(diag, "Per-channel gamma is 0 in gain map metadata");
  430|      4|            return AVIF_RESULT_INVALID_ARGUMENT;
  431|      4|        }
  432|  1.42k|    }
  433|    463|    if (gainMap->baseHdrHeadroom.d == 0 || gainMap->alternateHdrHeadroom.d == 0) {
  ------------------
  |  Branch (433:9): [True: 2, False: 461]
  |  Branch (433:44): [True: 1, False: 460]
  ------------------
  434|      3|        avifDiagnosticsPrintf(diag, "Headroom denominator is 0 in gain map metadata");
  435|      3|        return AVIF_RESULT_INVALID_ARGUMENT;
  436|      3|    }
  437|    460|    if (gainMap->useBaseColorSpace != 0 && gainMap->useBaseColorSpace != 1) {
  ------------------
  |  Branch (437:9): [True: 165, False: 295]
  |  Branch (437:44): [True: 0, False: 165]
  ------------------
  438|      0|        avifDiagnosticsPrintf(diag, "useBaseColorSpace is %d in gain map metadata", gainMap->useBaseColorSpace);
  439|      0|        return AVIF_RESULT_INVALID_ARGUMENT;
  440|      0|    }
  441|    460|    return AVIF_RESULT_OK;
  442|    460|}

avifIODestroy:
   85|  41.0k|{
   86|  41.0k|    if (io && io->destroy) {
  ------------------
  |  Branch (86:9): [True: 20.5k, False: 20.5k]
  |  Branch (86:15): [True: 20.5k, False: 0]
  ------------------
   87|  20.5k|        io->destroy(io);
   88|  20.5k|    }
   89|  41.0k|}
avifIOCreateMemoryReader:
  134|  20.5k|{
  135|  20.5k|    avifIOMemoryReader * reader = (avifIOMemoryReader *)avifAlloc(sizeof(avifIOMemoryReader));
  136|  20.5k|    if (reader == NULL) {
  ------------------
  |  Branch (136:9): [True: 0, False: 20.5k]
  ------------------
  137|      0|        return NULL;
  138|      0|    }
  139|  20.5k|    memset(reader, 0, sizeof(avifIOMemoryReader));
  140|  20.5k|    reader->io.destroy = avifIOMemoryReaderDestroy;
  141|  20.5k|    reader->io.read = avifIOMemoryReaderRead;
  142|  20.5k|    reader->io.sizeHint = size;
  143|  20.5k|    reader->io.persistent = AVIF_TRUE;
  ------------------
  |  |   88|  20.5k|#define AVIF_TRUE 1
  ------------------
  144|  20.5k|    reader->rodata.data = data;
  145|  20.5k|    reader->rodata.size = size;
  146|  20.5k|    return (avifIO *)reader;
  147|  20.5k|}
io.c:avifIOMemoryReaderDestroy:
  129|  20.5k|{
  130|  20.5k|    avifFree(io);
  131|  20.5k|}
io.c:avifIOMemoryReaderRead:
  101|   138k|{
  102|       |    // printf("avifIOMemoryReaderRead offset %" PRIu64 " size %zu\n", offset, size);
  103|       |
  104|   138k|    if (readFlags != 0) {
  ------------------
  |  Branch (104:9): [True: 0, False: 138k]
  ------------------
  105|       |        // Unsupported readFlags
  106|      0|        return AVIF_RESULT_IO_ERROR;
  107|      0|    }
  108|       |
  109|   138k|    avifIOMemoryReader * reader = (avifIOMemoryReader *)io;
  110|       |
  111|       |    // Sanitize/clamp incoming request
  112|   138k|    if (offset > reader->rodata.size) {
  ------------------
  |  Branch (112:9): [True: 0, False: 138k]
  ------------------
  113|       |        // The offset is past the end of the buffer.
  114|      0|        return AVIF_RESULT_IO_ERROR;
  115|      0|    }
  116|   138k|    uint64_t availableSize = reader->rodata.size - offset;
  117|   138k|    if (size > availableSize) {
  ------------------
  |  Branch (117:9): [True: 1.10k, False: 137k]
  ------------------
  118|  1.10k|        size = (size_t)availableSize;
  119|  1.10k|    }
  120|       |
  121|       |    // Prevent the offset addition from triggering an undefined behavior
  122|       |    // sanitizer error if data is NULL (happens even with offset zero).
  123|   138k|    out->data = offset ? reader->rodata.data + offset : reader->rodata.data;
  ------------------
  |  Branch (123:17): [True: 117k, False: 20.6k]
  ------------------
  124|   138k|    out->size = size;
  125|   138k|    return AVIF_RESULT_OK;
  126|   138k|}

avifAlloc:
   10|   821k|{
   11|       |    assert(size != 0); // Implementation-defined. See https://en.cppreference.com/w/cpp/memory/c/malloc
   12|   821k|    return malloc(size);
   13|   821k|}
avifFree:
   16|  1.27M|{
   17|  1.27M|    free(p);
   18|  1.27M|}

avifSequenceHeaderParse:
  713|  34.7k|{
  714|  34.7k|    switch (codecType) {
  715|  34.7k|        case AVIF_CODEC_TYPE_AV1:
  ------------------
  |  Branch (715:9): [True: 34.7k, False: 0]
  ------------------
  716|  34.7k|            return av1SequenceHeaderParse(header, sample);
  717|       |#if defined(AVIF_CODEC_AVM)
  718|       |        case AVIF_CODEC_TYPE_AV2:
  719|       |            return av2SequenceHeaderParse(header, sample);
  720|       |#endif
  721|      0|        default:
  ------------------
  |  Branch (721:9): [True: 0, False: 34.7k]
  ------------------
  722|      0|            return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
  723|  34.7k|    }
  724|  34.7k|}
obu.c:av1SequenceHeaderParse:
  597|  34.7k|{
  598|  34.7k|    avifROData obus = *sample;
  599|       |
  600|       |    // Find the sequence header OBU
  601|  71.2k|    while (obus.size > 0) {
  ------------------
  |  Branch (601:12): [True: 70.6k, False: 656]
  ------------------
  602|  70.6k|        avifBits bits;
  603|  70.6k|        avifBitsInit(&bits, obus.data, obus.size);
  604|       |
  605|       |        // obu_header()
  606|  70.6k|        const uint32_t obu_forbidden_bit = avifBitsRead(&bits, 1);
  607|  70.6k|        if (obu_forbidden_bit != 0) {
  ------------------
  |  Branch (607:13): [True: 787, False: 69.8k]
  ------------------
  608|    787|            return AVIF_FALSE;
  ------------------
  |  |   89|    787|#define AVIF_FALSE 0
  ------------------
  609|    787|        }
  610|  69.8k|        const uint32_t obu_type = avifBitsRead(&bits, 4);
  611|  69.8k|        const uint32_t obu_extension_flag = avifBitsRead(&bits, 1);
  612|  69.8k|        const uint32_t obu_has_size_field = avifBitsRead(&bits, 1);
  613|  69.8k|        avifBitsRead(&bits, 1); // obu_reserved_1bit
  614|       |
  615|  69.8k|        if (obu_extension_flag) {   // obu_extension_header()
  ------------------
  |  Branch (615:13): [True: 10.5k, False: 59.2k]
  ------------------
  616|  10.5k|            avifBitsRead(&bits, 8); // temporal_id, spatial_id, extension_header_reserved_3bits
  617|  10.5k|        }
  618|       |
  619|  69.8k|        uint32_t obu_size = 0;
  620|  69.8k|        if (obu_has_size_field)
  ------------------
  |  Branch (620:13): [True: 68.3k, False: 1.45k]
  ------------------
  621|  68.3k|            obu_size = avifBitsReadUleb128(&bits);
  622|  1.45k|        else
  623|  1.45k|            obu_size = (int)obus.size - 1 - obu_extension_flag;
  624|       |
  625|  69.8k|        if (bits.error) {
  ------------------
  |  Branch (625:13): [True: 1.59k, False: 68.2k]
  ------------------
  626|  1.59k|            return AVIF_FALSE;
  ------------------
  |  |   89|  1.59k|#define AVIF_FALSE 0
  ------------------
  627|  1.59k|        }
  628|       |
  629|  68.2k|        const uint32_t init_bit_pos = avifBitsReadPos(&bits);
  630|  68.2k|        const uint32_t init_byte_pos = init_bit_pos >> 3;
  631|  68.2k|        if (obu_size > obus.size - init_byte_pos)
  ------------------
  |  Branch (631:13): [True: 2.11k, False: 66.1k]
  ------------------
  632|  2.11k|            return AVIF_FALSE;
  ------------------
  |  |   89|  2.11k|#define AVIF_FALSE 0
  ------------------
  633|       |
  634|  66.1k|        if (obu_type == 1) { // Sequence Header
  ------------------
  |  Branch (634:13): [True: 29.5k, False: 36.5k]
  ------------------
  635|  29.5k|            avifBits seqHdrBits;
  636|  29.5k|            avifBitsInit(&seqHdrBits, obus.data + init_byte_pos, obu_size);
  637|  29.5k|            return parseAV1SequenceHeader(&seqHdrBits, header);
  638|  29.5k|        }
  639|       |
  640|       |        // Skip this OBU
  641|  36.5k|        obus.data += (size_t)obu_size + init_byte_pos;
  642|  36.5k|        obus.size -= (size_t)obu_size + init_byte_pos;
  643|  36.5k|    }
  644|    656|    return AVIF_FALSE;
  ------------------
  |  |   89|    656|#define AVIF_FALSE 0
  ------------------
  645|  34.7k|}
obu.c:avifBitsInit:
   63|   100k|{
   64|   100k|    bits->ptr = bits->start = data;
   65|   100k|    bits->end = &bits->start[size];
   66|   100k|    bits->bitsLeft = 0;
   67|   100k|    bits->state = 0;
   68|   100k|    bits->error = 0;
   69|   100k|    bits->eof = (size == 0);
   70|   100k|}
obu.c:avifBitsRead:
   89|  1.48M|{
   90|  1.48M|    if (n > bits->bitsLeft)
  ------------------
  |  Branch (90:9): [True: 636k, False: 848k]
  ------------------
   91|   636k|        avifBitsRefill(bits, n);
   92|       |
   93|  1.48M|    const uint64_t state = bits->state;
   94|  1.48M|    bits->bitsLeft -= n;
   95|  1.48M|    bits->state <<= n;
   96|       |
   97|  1.48M|    return (uint32_t)(state >> (64 - n));
   98|  1.48M|}
obu.c:avifBitsRefill:
   73|   636k|{
   74|   636k|    uint64_t state = 0;
   75|   786k|    do {
   76|   786k|        state <<= 8;
   77|   786k|        bits->bitsLeft += 8;
   78|   786k|        if (!bits->eof)
  ------------------
  |  Branch (78:13): [True: 669k, False: 117k]
  ------------------
   79|   669k|            state |= *bits->ptr++;
   80|   786k|        if (bits->ptr >= bits->end) {
  ------------------
  |  Branch (80:13): [True: 141k, False: 644k]
  ------------------
   81|   141k|            bits->error = bits->eof;
   82|   141k|            bits->eof = 1;
   83|   141k|        }
   84|   786k|    } while (n > bits->bitsLeft);
  ------------------
  |  Branch (84:14): [True: 150k, False: 636k]
  ------------------
   85|   636k|    bits->state |= state << (64 - bits->bitsLeft);
   86|   636k|}
obu.c:avifBitsReadUleb128:
  101|  68.3k|{
  102|  68.3k|    uint64_t val = 0;
  103|  68.3k|    uint32_t more;
  104|  68.3k|    uint32_t i = 0;
  105|       |
  106|  80.5k|    do {
  107|  80.5k|        const uint32_t v = avifBitsRead(bits, 8);
  108|  80.5k|        more = v & 0x80;
  109|  80.5k|        val |= ((uint64_t)(v & 0x7F)) << i;
  110|  80.5k|        i += 7;
  111|  80.5k|    } while (more && i < 56);
  ------------------
  |  Branch (111:14): [True: 12.4k, False: 68.1k]
  |  Branch (111:22): [True: 12.1k, False: 251]
  ------------------
  112|       |
  113|  68.3k|    if (val > UINT32_MAX || more) {
  ------------------
  |  Branch (113:9): [True: 1.16k, False: 67.2k]
  |  Branch (113:29): [True: 0, False: 67.2k]
  ------------------
  114|  1.16k|        bits->error = 1;
  115|  1.16k|        return 0;
  116|  1.16k|    }
  117|       |
  118|  67.2k|    return (uint32_t)val;
  119|  68.3k|}
obu.c:avifBitsReadPos:
   58|  68.2k|{
   59|  68.2k|    return (uint32_t)(bits->ptr - bits->start) * 8 - bits->bitsLeft;
   60|  68.2k|}
obu.c:parseAV1SequenceHeader:
  446|  29.5k|{
  447|  29.5k|    AVIF_CHECK(parseAV1SequenceHeaderProfile(bits, header));
  ------------------
  |  |   36|  29.5k|    do {                        \
  |  |   37|  29.5k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 633, False: 28.9k]
  |  |  ------------------
  |  |   38|    633|            avifBreakOnError(); \
  |  |   39|    633|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|    633|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|    633|        }                       \
  |  |   41|  29.5k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 28.9k]
  |  |  ------------------
  ------------------
  448|  28.9k|    AVIF_CHECK(parseSequenceHeaderLevelIdxAndTier(bits, header));
  ------------------
  |  |   36|  28.9k|    do {                        \
  |  |   37|  28.9k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 6.25k, False: 22.6k]
  |  |  ------------------
  |  |   38|  6.25k|            avifBreakOnError(); \
  |  |   39|  6.25k|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|  6.25k|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|  6.25k|        }                       \
  |  |   41|  28.9k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 22.6k]
  |  |  ------------------
  ------------------
  449|       |
  450|  22.6k|    AVIF_CHECK(parseSequenceHeaderFrameMaxDimensions(bits, header));
  ------------------
  |  |   36|  22.6k|    do {                        \
  |  |   37|  22.6k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1.50k, False: 21.1k]
  |  |  ------------------
  |  |   38|  1.50k|            avifBreakOnError(); \
  |  |   39|  1.50k|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|  1.50k|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|  1.50k|        }                       \
  |  |   41|  22.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 21.1k]
  |  |  ------------------
  ------------------
  451|  21.1k|    avifBitsRead(bits, 1); // use_128x128_superblock
  452|  21.1k|    AVIF_CHECK(parseSequenceHeaderEnabledFeatures(bits, header));
  ------------------
  |  |   36|  21.1k|    do {                        \
  |  |   37|  21.1k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1.97k, False: 19.2k]
  |  |  ------------------
  |  |   38|  1.97k|            avifBreakOnError(); \
  |  |   39|  1.97k|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|  1.97k|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|  1.97k|        }                       \
  |  |   41|  21.1k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 19.2k]
  |  |  ------------------
  ------------------
  453|       |
  454|  19.2k|    avifBitsRead(bits, 3); // enable_superres, enable_cdef, enable_restoration
  455|       |
  456|  19.2k|    AVIF_CHECK(parseAV1SequenceHeaderColorConfig(bits, header));
  ------------------
  |  |   36|  19.2k|    do {                        \
  |  |   37|  19.2k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 5.34k, False: 13.8k]
  |  |  ------------------
  |  |   38|  5.34k|            avifBreakOnError(); \
  |  |   39|  5.34k|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|  5.34k|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|  5.34k|        }                       \
  |  |   41|  19.2k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 13.8k]
  |  |  ------------------
  ------------------
  457|  13.8k|    if (!header->av1C.monochrome) {
  ------------------
  |  Branch (457:9): [True: 8.74k, False: 5.11k]
  ------------------
  458|  8.74k|        avifBitsRead(bits, 1); // separate_uv_delta_q
  459|  8.74k|    }
  460|       |
  461|  13.8k|    avifBitsRead(bits, 1); // film_grain_params_present
  462|  13.8k|    return !bits->error;
  463|  19.2k|}
obu.c:parseAV1SequenceHeaderProfile:
  153|  29.5k|{
  154|  29.5k|    uint32_t seq_profile = avifBitsRead(bits, 3);
  155|  29.5k|    if (seq_profile > 2) {
  ------------------
  |  Branch (155:9): [True: 419, False: 29.1k]
  ------------------
  156|    419|        return AVIF_FALSE;
  ------------------
  |  |   89|    419|#define AVIF_FALSE 0
  ------------------
  157|    419|    }
  158|  29.1k|    header->av1C.seqProfile = (uint8_t)seq_profile;
  159|  29.1k|    return !bits->error;
  160|  29.5k|}
obu.c:parseSequenceHeaderLevelIdxAndTier:
  175|  28.9k|{
  176|  28.9k|    uint32_t still_picture = avifBitsRead(bits, 1);
  177|  28.9k|    header->reduced_still_picture_header = (uint8_t)avifBitsRead(bits, 1);
  178|  28.9k|    if (header->reduced_still_picture_header && !still_picture) {
  ------------------
  |  Branch (178:9): [True: 14.5k, False: 14.3k]
  |  Branch (178:49): [True: 283, False: 14.2k]
  ------------------
  179|    283|        return AVIF_FALSE;
  ------------------
  |  |   89|    283|#define AVIF_FALSE 0
  ------------------
  180|    283|    }
  181|       |
  182|  28.6k|    if (header->reduced_still_picture_header) {
  ------------------
  |  Branch (182:9): [True: 14.2k, False: 14.3k]
  ------------------
  183|  14.2k|        header->av1C.seqLevelIdx0 = (uint8_t)avifBitsRead(bits, 5);
  184|  14.2k|        header->av1C.seqTier0 = 0;
  185|  14.3k|    } else {
  186|  14.3k|        uint32_t timing_info_present_flag = avifBitsRead(bits, 1);
  187|  14.3k|        uint32_t decoder_model_info_present_flag = 0;
  188|  14.3k|        uint32_t buffer_delay_length = 0;
  189|  14.3k|        if (timing_info_present_flag) { // timing_info()
  ------------------
  |  Branch (189:13): [True: 7.84k, False: 6.52k]
  ------------------
  190|  7.84k|            avifBitsRead(bits, 32);     // num_units_in_display_tick
  191|  7.84k|            avifBitsRead(bits, 32);     // time_scale
  192|  7.84k|            uint32_t equal_picture_interval = avifBitsRead(bits, 1);
  193|  7.84k|            if (equal_picture_interval) {
  ------------------
  |  Branch (193:17): [True: 4.05k, False: 3.79k]
  ------------------
  194|  4.05k|                uint32_t num_ticks_per_picture_minus_1 = avifBitsReadVLC(bits);
  195|  4.05k|                if (num_ticks_per_picture_minus_1 == 0xFFFFFFFFU)
  ------------------
  |  Branch (195:21): [True: 259, False: 3.79k]
  ------------------
  196|    259|                    return AVIF_FALSE;
  ------------------
  |  |   89|    259|#define AVIF_FALSE 0
  ------------------
  197|  4.05k|            }
  198|       |
  199|  7.58k|            decoder_model_info_present_flag = avifBitsRead(bits, 1);
  200|  7.58k|            if (decoder_model_info_present_flag) { // decoder_model_info()
  ------------------
  |  Branch (200:17): [True: 2.83k, False: 4.74k]
  ------------------
  201|  2.83k|                buffer_delay_length = avifBitsRead(bits, 5) + 1;
  202|  2.83k|                avifBitsRead(bits, 32); // num_units_in_decoding_tick
  203|  2.83k|                avifBitsRead(bits, 10); // buffer_removal_time_length_minus_1, frame_presentation_time_length_minus_1
  204|  2.83k|            }
  205|  7.58k|        }
  206|       |
  207|  14.1k|        uint32_t initial_display_delay_present_flag = avifBitsRead(bits, 1);
  208|  14.1k|        uint32_t operating_points_cnt = avifBitsRead(bits, 5) + 1;
  209|   143k|        for (uint32_t i = 0; i < operating_points_cnt; i++) {
  ------------------
  |  Branch (209:30): [True: 129k, False: 14.1k]
  ------------------
  210|   129k|            avifBitsRead(bits, 12); // operating_point_idc
  211|   129k|            uint32_t seq_level_idx = avifBitsRead(bits, 5);
  212|   129k|            if (i == 0) {
  ------------------
  |  Branch (212:17): [True: 14.1k, False: 115k]
  ------------------
  213|  14.1k|                header->av1C.seqLevelIdx0 = (uint8_t)seq_level_idx;
  214|  14.1k|                header->av1C.seqTier0 = 0;
  215|  14.1k|            }
  216|   129k|            if (seq_level_idx > 7) {
  ------------------
  |  Branch (216:17): [True: 64.5k, False: 65.0k]
  ------------------
  217|  64.5k|                uint32_t seq_tier = avifBitsRead(bits, 1);
  218|  64.5k|                if (i == 0) {
  ------------------
  |  Branch (218:21): [True: 4.77k, False: 59.7k]
  ------------------
  219|  4.77k|                    header->av1C.seqTier0 = (uint8_t)seq_tier;
  220|  4.77k|                }
  221|  64.5k|            }
  222|   129k|            if (decoder_model_info_present_flag) {
  ------------------
  |  Branch (222:17): [True: 43.8k, False: 85.7k]
  ------------------
  223|  43.8k|                uint32_t decoder_model_present_for_this_op = avifBitsRead(bits, 1);
  224|  43.8k|                if (decoder_model_present_for_this_op) {     // operating_parameters_info()
  ------------------
  |  Branch (224:21): [True: 12.4k, False: 31.4k]
  ------------------
  225|  12.4k|                    avifBitsRead(bits, buffer_delay_length); // decoder_buffer_delay
  226|  12.4k|                    avifBitsRead(bits, buffer_delay_length); // encoder_buffer_delay
  227|  12.4k|                    avifBitsRead(bits, 1);                   // low_delay_mode_flag
  228|  12.4k|                }
  229|  43.8k|            }
  230|   129k|            if (initial_display_delay_present_flag) {
  ------------------
  |  Branch (230:17): [True: 72.7k, False: 56.8k]
  ------------------
  231|  72.7k|                uint32_t initial_display_delay_present_for_this_op = avifBitsRead(bits, 1);
  232|  72.7k|                if (initial_display_delay_present_for_this_op) {
  ------------------
  |  Branch (232:21): [True: 28.0k, False: 44.7k]
  ------------------
  233|  28.0k|                    avifBitsRead(bits, 4); // initial_display_delay_minus_1
  234|  28.0k|                }
  235|  72.7k|            }
  236|   129k|        }
  237|  14.1k|    }
  238|  28.4k|    return !bits->error;
  239|  28.6k|}
obu.c:avifBitsReadVLC:
  122|  4.05k|{
  123|  4.05k|    int numBits = 0;
  124|  17.5k|    while (!avifBitsRead(bits, 1))
  ------------------
  |  Branch (124:12): [True: 13.7k, False: 3.79k]
  ------------------
  125|  13.7k|        if (++numBits == 32)
  ------------------
  |  Branch (125:13): [True: 259, False: 13.5k]
  ------------------
  126|    259|            return 0xFFFFFFFFU;
  127|  3.79k|    return numBits ? ((1U << numBits) - 1) + avifBitsRead(bits, numBits) : 0;
  ------------------
  |  Branch (127:12): [True: 1.76k, False: 2.03k]
  ------------------
  128|  4.05k|}
obu.c:parseSequenceHeaderFrameMaxDimensions:
  242|  22.6k|{
  243|  22.6k|    uint32_t frame_width_bits = avifBitsRead(bits, 4) + 1;
  244|  22.6k|    uint32_t frame_height_bits = avifBitsRead(bits, 4) + 1;
  245|  22.6k|    header->maxWidth = avifBitsRead(bits, frame_width_bits) + 1;   // max_frame_width
  246|  22.6k|    header->maxHeight = avifBitsRead(bits, frame_height_bits) + 1; // max_frame_height
  247|  22.6k|    uint32_t frame_id_numbers_present_flag = 0;
  248|  22.6k|    if (!header->reduced_still_picture_header) {
  ------------------
  |  Branch (248:9): [True: 8.59k, False: 14.0k]
  ------------------
  249|  8.59k|        frame_id_numbers_present_flag = avifBitsRead(bits, 1);
  250|  8.59k|    }
  251|  22.6k|    if (frame_id_numbers_present_flag) {
  ------------------
  |  Branch (251:9): [True: 2.84k, False: 19.8k]
  ------------------
  252|  2.84k|        avifBitsRead(bits, 7); // delta_frame_id_length_minus_2, additional_frame_id_length_minus_1
  253|  2.84k|    }
  254|  22.6k|    return !bits->error;
  255|  22.6k|}
obu.c:parseSequenceHeaderEnabledFeatures:
  258|  21.1k|{
  259|  21.1k|    avifBitsRead(bits, 2); // enable_filter_intra, enable_intra_edge_filter
  260|       |
  261|  21.1k|    if (!header->reduced_still_picture_header) {
  ------------------
  |  Branch (261:9): [True: 7.54k, False: 13.6k]
  ------------------
  262|  7.54k|        avifBitsRead(bits, 4); // enable_interintra_compound, enable_masked_compound, enable_warped_motion, enable_dual_filter
  263|  7.54k|        uint32_t enable_order_hint = avifBitsRead(bits, 1);
  264|  7.54k|        if (enable_order_hint) {
  ------------------
  |  Branch (264:13): [True: 4.96k, False: 2.57k]
  ------------------
  265|  4.96k|            avifBitsRead(bits, 2); // enable_jnt_comp, enable_ref_frame_mvs
  266|  4.96k|        }
  267|       |
  268|  7.54k|        uint32_t seq_force_screen_content_tools = 0;
  269|  7.54k|        uint32_t seq_choose_screen_content_tools = avifBitsRead(bits, 1);
  270|  7.54k|        if (seq_choose_screen_content_tools) {
  ------------------
  |  Branch (270:13): [True: 3.93k, False: 3.60k]
  ------------------
  271|  3.93k|            seq_force_screen_content_tools = 2;
  272|  3.93k|        } else {
  273|  3.60k|            seq_force_screen_content_tools = avifBitsRead(bits, 1);
  274|  3.60k|        }
  275|  7.54k|        if (seq_force_screen_content_tools > 0) {
  ------------------
  |  Branch (275:13): [True: 5.59k, False: 1.95k]
  ------------------
  276|  5.59k|            uint32_t seq_choose_integer_mv = avifBitsRead(bits, 1);
  277|  5.59k|            if (!seq_choose_integer_mv) {
  ------------------
  |  Branch (277:17): [True: 2.20k, False: 3.38k]
  ------------------
  278|  2.20k|                avifBitsRead(bits, 1); // seq_force_integer_mv
  279|  2.20k|            }
  280|  5.59k|        }
  281|  7.54k|        if (enable_order_hint) {
  ------------------
  |  Branch (281:13): [True: 4.96k, False: 2.57k]
  ------------------
  282|  4.96k|            avifBitsRead(bits, 3); // order_hint_bits_minus_1
  283|  4.96k|        }
  284|  7.54k|    }
  285|       |
  286|  21.1k|    return !bits->error;
  287|  21.1k|}
obu.c:parseAV1SequenceHeaderColorConfig:
  291|  19.2k|{
  292|  19.2k|    header->bitDepth = 8;
  293|  19.2k|    header->chromaSamplePosition = AVIF_CHROMA_SAMPLE_POSITION_UNKNOWN;
  294|  19.2k|    header->av1C.chromaSamplePosition = (uint8_t)header->chromaSamplePosition;
  295|  19.2k|    uint32_t high_bitdepth = avifBitsRead(bits, 1);
  296|  19.2k|    header->av1C.highBitdepth = (uint8_t)high_bitdepth;
  297|  19.2k|    if ((header->av1C.seqProfile == 2) && high_bitdepth) {
  ------------------
  |  Branch (297:9): [True: 3.96k, False: 15.2k]
  |  Branch (297:43): [True: 3.06k, False: 901]
  ------------------
  298|  3.06k|        uint32_t twelve_bit = avifBitsRead(bits, 1);
  299|  3.06k|        header->bitDepth = twelve_bit ? 12 : 10;
  ------------------
  |  Branch (299:28): [True: 2.67k, False: 387]
  ------------------
  300|  3.06k|        header->av1C.twelveBit = (uint8_t)twelve_bit;
  301|  16.1k|    } else /* if (seq_profile <= 2) */ {
  302|  16.1k|        header->bitDepth = high_bitdepth ? 10 : 8;
  ------------------
  |  Branch (302:28): [True: 7.11k, False: 9.03k]
  ------------------
  303|  16.1k|        header->av1C.twelveBit = 0;
  304|  16.1k|    }
  305|  19.2k|    uint32_t mono_chrome = 0;
  306|  19.2k|    if (header->av1C.seqProfile != 1) {
  ------------------
  |  Branch (306:9): [True: 11.9k, False: 7.25k]
  ------------------
  307|  11.9k|        mono_chrome = avifBitsRead(bits, 1);
  308|  11.9k|    }
  309|  19.2k|    header->av1C.monochrome = (uint8_t)mono_chrome;
  310|  19.2k|    uint32_t color_description_present_flag = avifBitsRead(bits, 1);
  311|  19.2k|    if (color_description_present_flag) {
  ------------------
  |  Branch (311:9): [True: 9.04k, False: 10.1k]
  ------------------
  312|  9.04k|        header->colorPrimaries = (avifColorPrimaries)avifBitsRead(bits, 8);                   // color_primaries
  313|  9.04k|        header->transferCharacteristics = (avifTransferCharacteristics)avifBitsRead(bits, 8); // transfer_characteristics
  314|  9.04k|        header->matrixCoefficients = (avifMatrixCoefficients)avifBitsRead(bits, 8);           // matrix_coefficients
  315|  10.1k|    } else {
  316|  10.1k|        header->colorPrimaries = AVIF_COLOR_PRIMARIES_UNSPECIFIED;
  317|  10.1k|        header->transferCharacteristics = AVIF_TRANSFER_CHARACTERISTICS_UNSPECIFIED;
  318|  10.1k|        header->matrixCoefficients = AVIF_MATRIX_COEFFICIENTS_UNSPECIFIED;
  319|  10.1k|    }
  320|  19.2k|    if (mono_chrome) {
  ------------------
  |  Branch (320:9): [True: 6.90k, False: 12.2k]
  ------------------
  321|  6.90k|        header->range = avifBitsRead(bits, 1) ? AVIF_RANGE_FULL : AVIF_RANGE_LIMITED; // color_range
  ------------------
  |  Branch (321:25): [True: 3.80k, False: 3.10k]
  ------------------
  322|  6.90k|        header->av1C.chromaSubsamplingX = 1;
  323|  6.90k|        header->av1C.chromaSubsamplingY = 1;
  324|  6.90k|        header->yuvFormat = AVIF_PIXEL_FORMAT_YUV400;
  325|  12.2k|    } else if (header->colorPrimaries == AVIF_COLOR_PRIMARIES_BT709 &&
  ------------------
  |  Branch (325:16): [True: 1.84k, False: 10.4k]
  ------------------
  326|  1.84k|               header->transferCharacteristics == AVIF_TRANSFER_CHARACTERISTICS_SRGB &&
  ------------------
  |  Branch (326:16): [True: 964, False: 878]
  ------------------
  327|    964|               header->matrixCoefficients == AVIF_MATRIX_COEFFICIENTS_IDENTITY) {
  ------------------
  |  Branch (327:16): [True: 36, False: 928]
  ------------------
  328|     36|        header->range = AVIF_RANGE_FULL;
  329|     36|        header->av1C.chromaSubsamplingX = 0;
  330|     36|        header->av1C.chromaSubsamplingY = 0;
  331|     36|        header->yuvFormat = AVIF_PIXEL_FORMAT_YUV444;
  332|  12.2k|    } else {
  333|  12.2k|        uint32_t subsampling_x = 0;
  334|  12.2k|        uint32_t subsampling_y = 0;
  335|  12.2k|        header->range = avifBitsRead(bits, 1) ? AVIF_RANGE_FULL : AVIF_RANGE_LIMITED; // color_range
  ------------------
  |  Branch (335:25): [True: 5.59k, False: 6.67k]
  ------------------
  336|  12.2k|        switch (header->av1C.seqProfile) {
  337|  2.77k|            case 0:
  ------------------
  |  Branch (337:13): [True: 2.77k, False: 9.48k]
  ------------------
  338|  2.77k|                subsampling_x = 1;
  339|  2.77k|                subsampling_y = 1;
  340|  2.77k|                header->yuvFormat = AVIF_PIXEL_FORMAT_YUV420;
  341|  2.77k|                break;
  342|  7.21k|            case 1:
  ------------------
  |  Branch (342:13): [True: 7.21k, False: 5.04k]
  ------------------
  343|  7.21k|                subsampling_x = 0;
  344|  7.21k|                subsampling_y = 0;
  345|  7.21k|                header->yuvFormat = AVIF_PIXEL_FORMAT_YUV444;
  346|  7.21k|                break;
  347|  2.26k|            case 2:
  ------------------
  |  Branch (347:13): [True: 2.26k, False: 9.99k]
  ------------------
  348|  2.26k|                if (header->bitDepth == 12) {
  ------------------
  |  Branch (348:21): [True: 1.22k, False: 1.04k]
  ------------------
  349|  1.22k|                    subsampling_x = avifBitsRead(bits, 1);
  350|  1.22k|                    if (subsampling_x) {
  ------------------
  |  Branch (350:25): [True: 428, False: 800]
  ------------------
  351|    428|                        subsampling_y = avifBitsRead(bits, 1);
  352|    428|                    }
  353|  1.22k|                } else {
  354|  1.04k|                    subsampling_x = 1;
  355|  1.04k|                    subsampling_y = 0;
  356|  1.04k|                }
  357|  2.26k|                if (subsampling_x) {
  ------------------
  |  Branch (357:21): [True: 1.46k, False: 800]
  ------------------
  358|  1.46k|                    header->yuvFormat = subsampling_y ? AVIF_PIXEL_FORMAT_YUV420 : AVIF_PIXEL_FORMAT_YUV422;
  ------------------
  |  Branch (358:41): [True: 286, False: 1.18k]
  ------------------
  359|  1.46k|                } else {
  360|    800|                    header->yuvFormat = AVIF_PIXEL_FORMAT_YUV444;
  361|    800|                }
  362|  2.26k|                break;
  363|      0|            default:
  ------------------
  |  Branch (363:13): [True: 0, False: 12.2k]
  ------------------
  364|      0|                return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
  365|  12.2k|        }
  366|       |
  367|  12.2k|        if (subsampling_x && subsampling_y) {
  ------------------
  |  Branch (367:13): [True: 4.24k, False: 8.01k]
  |  Branch (367:30): [True: 3.06k, False: 1.18k]
  ------------------
  368|  3.06k|            header->chromaSamplePosition = (avifChromaSamplePosition)avifBitsRead(bits, 2); // chroma_sample_position
  369|  3.06k|            header->av1C.chromaSamplePosition = (uint8_t)header->chromaSamplePosition;
  370|  3.06k|        }
  371|  12.2k|        header->av1C.chromaSubsamplingX = (uint8_t)subsampling_x;
  372|  12.2k|        header->av1C.chromaSubsamplingY = (uint8_t)subsampling_y;
  373|  12.2k|    }
  374|       |
  375|  19.2k|    return !bits->error;
  376|  19.2k|}

avifRWDataRealloc:
    9|   103k|{
   10|   103k|    if (raw->size != newSize) {
  ------------------
  |  Branch (10:9): [True: 88.7k, False: 14.8k]
  ------------------
   11|  88.7k|        uint8_t * newData = (uint8_t *)avifAlloc(newSize);
   12|  88.7k|        AVIF_CHECKERR(newData, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  88.7k|    do {                        \
  |  |   46|  88.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 88.7k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  88.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 88.7k]
  |  |  ------------------
  ------------------
   13|  88.7k|        if (raw->size && newSize) {
  ------------------
  |  Branch (13:13): [True: 87, False: 88.6k]
  |  Branch (13:26): [True: 87, False: 0]
  ------------------
   14|     87|            memcpy(newData, raw->data, AVIF_MIN(raw->size, newSize));
  ------------------
  |  |   19|     87|#define AVIF_MIN(a, b) (((a) < (b)) ? (a) : (b))
  |  |  ------------------
  |  |  |  Branch (19:25): [True: 85, False: 2]
  |  |  ------------------
  ------------------
   15|     87|        }
   16|  88.7k|        avifFree(raw->data);
   17|  88.7k|        raw->data = newData;
   18|  88.7k|        raw->size = newSize;
   19|  88.7k|    }
   20|   103k|    return AVIF_RESULT_OK;
   21|   103k|}
avifRWDataSet:
   24|  81.8k|{
   25|  81.8k|    if (len) {
  ------------------
  |  Branch (25:9): [True: 73.6k, False: 8.22k]
  ------------------
   26|  73.6k|        AVIF_CHECKRES(avifRWDataRealloc(raw, len));
  ------------------
  |  |   54|  73.6k|    do {                                  \
  |  |   55|  73.6k|        const avifResult result__ = (A);  \
  |  |   56|  73.6k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 73.6k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  73.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 73.6k]
  |  |  ------------------
  ------------------
   27|  73.6k|        memcpy(raw->data, data, len);
   28|  73.6k|    } else {
   29|  8.22k|        avifRWDataFree(raw);
   30|  8.22k|    }
   31|  81.8k|    return AVIF_RESULT_OK;
   32|  81.8k|}
avifRWDataFree:
   35|   371k|{
   36|   371k|    avifFree(raw->data);
   37|       |    raw->data = NULL;
   38|   371k|    raw->size = 0;
   39|   371k|}

avifCodecDecodeInputCreate:
  482|  45.0k|{
  483|  45.0k|    avifCodecDecodeInput * decodeInput = (avifCodecDecodeInput *)avifAlloc(sizeof(avifCodecDecodeInput));
  484|  45.0k|    if (decodeInput == NULL) {
  ------------------
  |  Branch (484:9): [True: 0, False: 45.0k]
  ------------------
  485|      0|        return NULL;
  486|      0|    }
  487|  45.0k|    memset(decodeInput, 0, sizeof(avifCodecDecodeInput));
  488|  45.0k|    if (!avifArrayCreate(&decodeInput->samples, sizeof(avifDecodeSample), 1)) {
  ------------------
  |  Branch (488:9): [True: 0, False: 45.0k]
  ------------------
  489|      0|        avifFree(decodeInput);
  490|      0|        return NULL;
  491|      0|    }
  492|  45.0k|    return decodeInput;
  493|  45.0k|}
avifCodecDecodeInputDestroy:
  496|  45.0k|{
  497|   110k|    for (uint32_t sampleIndex = 0; sampleIndex < decodeInput->samples.count; ++sampleIndex) {
  ------------------
  |  Branch (497:36): [True: 65.7k, False: 45.0k]
  ------------------
  498|  65.7k|        avifDecodeSample * sample = &decodeInput->samples.sample[sampleIndex];
  499|  65.7k|        if (sample->ownsData) {
  ------------------
  |  Branch (499:13): [True: 5.07k, False: 60.7k]
  ------------------
  500|  5.07k|            avifRWDataFree((avifRWData *)&sample->data);
  501|  5.07k|        }
  502|  65.7k|    }
  503|  45.0k|    avifArrayDestroy(&decodeInput->samples);
  504|  45.0k|    avifFree(decodeInput);
  505|  45.0k|}
avifPeekCompatibleFileType:
 5039|    392|{
 5040|    392|    BEGIN_STREAM(s, input->data, input->size, NULL, NULL);
  ------------------
  |  |  738|    392|    avifROStream VARNAME;                               \
  |  |  739|    392|    avifROData VARNAME##_roData;                        \
  |  |  740|    392|    VARNAME##_roData.data = PTR;                        \
  |  |  741|    392|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|    392|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 5041|       |
 5042|    392|    avifBoxHeader header;
 5043|    392|    if (!avifROStreamReadBoxHeaderPartial(&s, &header, /*topLevel=*/AVIF_TRUE) || memcmp(header.type, "ftyp", 4)) {
  ------------------
  |  |   88|    392|#define AVIF_TRUE 1
  ------------------
  |  Branch (5043:9): [True: 0, False: 392]
  |  Branch (5043:83): [True: 136, False: 256]
  ------------------
 5044|    136|        return AVIF_FALSE;
  ------------------
  |  |   89|    136|#define AVIF_FALSE 0
  ------------------
 5045|    136|    }
 5046|    256|    if (header.isSizeZeroBox) {
  ------------------
  |  Branch (5046:9): [True: 0, False: 256]
  ------------------
 5047|       |        // The ftyp box goes on till the end of the file. Either there is no brand requiring anything in the file but a
 5048|       |        // FileTypebox (so not AVIF), or it is invalid.
 5049|      0|        return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
 5050|      0|    }
 5051|    256|    AVIF_CHECK(avifROStreamHasBytesLeft(&s, header.size));
  ------------------
  |  |   36|    256|    do {                        \
  |  |   37|    256|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 256]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|    256|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 256]
  |  |  ------------------
  ------------------
 5052|       |
 5053|    256|    avifFileType ftyp;
 5054|    256|    memset(&ftyp, 0, sizeof(avifFileType));
 5055|    256|    avifBool parsed = avifParseFileTypeBox(&ftyp, avifROStreamCurrent(&s), header.size, NULL);
 5056|    256|    if (!parsed) {
  ------------------
  |  Branch (5056:9): [True: 0, False: 256]
  ------------------
 5057|      0|        return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
 5058|      0|    }
 5059|    256|    return avifFileTypeIsCompatible(&ftyp);
 5060|    256|}
avifDecoderCreate:
 5075|  20.5k|{
 5076|  20.5k|    avifDecoder * decoder = (avifDecoder *)avifAlloc(sizeof(avifDecoder));
 5077|  20.5k|    if (decoder == NULL) {
  ------------------
  |  Branch (5077:9): [True: 0, False: 20.5k]
  ------------------
 5078|      0|        return NULL;
 5079|      0|    }
 5080|  20.5k|    memset(decoder, 0, sizeof(avifDecoder));
 5081|  20.5k|    decoder->maxThreads = 1;
 5082|  20.5k|    decoder->imageSizeLimit = AVIF_DEFAULT_IMAGE_SIZE_LIMIT;
  ------------------
  |  |   95|  20.5k|#define AVIF_DEFAULT_IMAGE_SIZE_LIMIT (16384 * 16384)
  ------------------
 5083|  20.5k|    decoder->imageDimensionLimit = AVIF_DEFAULT_IMAGE_DIMENSION_LIMIT;
  ------------------
  |  |   98|  20.5k|#define AVIF_DEFAULT_IMAGE_DIMENSION_LIMIT 32768
  ------------------
 5084|  20.5k|    decoder->imageCountLimit = AVIF_DEFAULT_IMAGE_COUNT_LIMIT;
  ------------------
  |  |  101|  20.5k|#define AVIF_DEFAULT_IMAGE_COUNT_LIMIT (12 * 3600 * 60)
  ------------------
 5085|  20.5k|    decoder->strictFlags = AVIF_STRICT_ENABLED;
 5086|  20.5k|    decoder->imageContentToDecode = AVIF_IMAGE_CONTENT_DECODE_DEFAULT;
 5087|  20.5k|    return decoder;
 5088|  20.5k|}
avifDecoderDestroy:
 5105|  20.5k|{
 5106|  20.5k|    avifDecoderCleanup(decoder);
 5107|  20.5k|    avifIODestroy(decoder->io);
 5108|  20.5k|    avifFree(decoder);
 5109|  20.5k|}
avifDecoderSetIO:
 5118|  20.5k|{
 5119|  20.5k|    avifIODestroy(decoder->io);
 5120|  20.5k|    decoder->io = io;
 5121|  20.5k|}
avifDecoderParse:
 5289|  20.5k|{
 5290|  20.5k|    avifDiagnosticsClearError(&decoder->diag);
 5291|       |
 5292|       |    // An imageSizeLimit greater than AVIF_DEFAULT_IMAGE_SIZE_LIMIT and the special value of 0 to
 5293|       |    // disable the limit are not yet implemented.
 5294|  20.5k|    if ((decoder->imageSizeLimit > AVIF_DEFAULT_IMAGE_SIZE_LIMIT) || (decoder->imageSizeLimit == 0)) {
  ------------------
  |  |   95|  20.5k|#define AVIF_DEFAULT_IMAGE_SIZE_LIMIT (16384 * 16384)
  ------------------
  |  Branch (5294:9): [True: 0, False: 20.5k]
  |  Branch (5294:70): [True: 0, False: 20.5k]
  ------------------
 5295|      0|        return AVIF_RESULT_NOT_IMPLEMENTED;
 5296|      0|    }
 5297|       |    // Color only or alpha only is not currently supported.
 5298|  20.5k|    if ((decoder->imageContentToDecode & AVIF_IMAGE_CONTENT_COLOR_AND_ALPHA) != 0 &&
  ------------------
  |  Branch (5298:9): [True: 18.0k, False: 2.41k]
  ------------------
 5299|  18.0k|        (decoder->imageContentToDecode & AVIF_IMAGE_CONTENT_COLOR_AND_ALPHA) != AVIF_IMAGE_CONTENT_COLOR_AND_ALPHA) {
  ------------------
  |  Branch (5299:9): [True: 2, False: 18.0k]
  ------------------
 5300|      2|        avifDiagnosticsPrintf(&decoder->diag, "imageContentToDecode set to only color or only alpha is not supported");
 5301|      2|        return AVIF_RESULT_NOT_IMPLEMENTED;
 5302|      2|    }
 5303|  20.5k|    if (!decoder->io || !decoder->io->read) {
  ------------------
  |  Branch (5303:9): [True: 0, False: 20.5k]
  |  Branch (5303:25): [True: 0, False: 20.5k]
  ------------------
 5304|      0|        return AVIF_RESULT_IO_NOT_SET;
 5305|      0|    }
 5306|       |
 5307|       |    // Cleanup anything lingering in the decoder
 5308|  20.5k|    avifDecoderCleanup(decoder);
 5309|       |
 5310|       |    // -----------------------------------------------------------------------
 5311|       |    // Parse BMFF boxes
 5312|       |
 5313|  20.5k|    decoder->data = avifDecoderDataCreate();
 5314|  20.5k|    AVIF_CHECKERR(decoder->data != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  20.5k|    do {                        \
  |  |   46|  20.5k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 20.5k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  20.5k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 20.5k]
  |  |  ------------------
  ------------------
 5315|  20.5k|    decoder->data->diag = &decoder->diag;
 5316|       |
 5317|  20.5k|    AVIF_CHECKRES(avifParse(decoder));
  ------------------
  |  |   54|  20.5k|    do {                                  \
  |  |   55|  20.5k|        const avifResult result__ = (A);  \
  |  |   56|  20.5k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 1.56k, False: 18.9k]
  |  |  ------------------
  |  |   57|  1.56k|            avifBreakOnError();           \
  |  |   58|  1.56k|            return result__;              \
  |  |   59|  1.56k|        }                                 \
  |  |   60|  20.5k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 18.9k]
  |  |  ------------------
  ------------------
 5318|       |
 5319|       |    // Walk the decoded items (if any) and harvest ispe
 5320|  18.9k|    avifDecoderData * data = decoder->data;
 5321|  66.6k|    for (uint32_t itemIndex = 0; itemIndex < data->meta->items.count; ++itemIndex) {
  ------------------
  |  Branch (5321:34): [True: 47.8k, False: 18.7k]
  ------------------
 5322|  47.8k|        avifDecoderItem * item = data->meta->items.item[itemIndex];
 5323|  47.8k|        if (avifDecoderItemShouldBeSkipped(item)) {
  ------------------
  |  Branch (5323:13): [True: 12.7k, False: 35.1k]
  ------------------
 5324|  12.7k|            continue;
 5325|  12.7k|        }
 5326|       |
 5327|  35.1k|        const avifProperty * ispeProp = avifPropertyArrayFind(&item->properties, "ispe");
 5328|  35.1k|        if (ispeProp) {
  ------------------
  |  Branch (5328:13): [True: 34.2k, False: 877]
  ------------------
 5329|  34.2k|            item->width = ispeProp->u.ispe.width;
 5330|  34.2k|            item->height = ispeProp->u.ispe.height;
 5331|       |
 5332|  34.2k|            if ((item->width == 0) || (item->height == 0)) {
  ------------------
  |  Branch (5332:17): [True: 1, False: 34.2k]
  |  Branch (5332:39): [True: 1, False: 34.2k]
  ------------------
 5333|      2|                avifDiagnosticsPrintf(data->diag, "Item ID [%u] has an invalid size [%ux%u]", item->id, item->width, item->height);
 5334|      2|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 5335|      2|            }
 5336|  34.2k|            if (avifDimensionsTooLarge(item->width, item->height, decoder->imageSizeLimit, decoder->imageDimensionLimit)) {
  ------------------
  |  Branch (5336:17): [True: 90, False: 34.1k]
  ------------------
 5337|     90|                avifDiagnosticsPrintf(data->diag, "Item ID [%u] dimensions are too large [%ux%u]", item->id, item->width, item->height);
 5338|     90|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 5339|     90|            }
 5340|  34.2k|        } else {
 5341|    877|            const avifProperty * auxCProp = avifPropertyArrayFind(&item->properties, "auxC");
 5342|    877|            if (auxCProp && isAlphaURN(auxCProp->u.auxC.auxType)) {
  ------------------
  |  Branch (5342:17): [True: 865, False: 12]
  |  Branch (5342:29): [True: 805, False: 60]
  ------------------
 5343|    805|                if (decoder->strictFlags & AVIF_STRICT_ALPHA_ISPE_REQUIRED) {
  ------------------
  |  Branch (5343:21): [True: 1, False: 804]
  ------------------
 5344|      1|                    avifDiagnosticsPrintf(data->diag,
 5345|      1|                                          "[Strict] Alpha auxiliary image item ID [%u] is missing a mandatory ispe property",
 5346|      1|                                          item->id);
 5347|      1|                    return AVIF_RESULT_BMFF_PARSE_FAILED;
 5348|      1|                }
 5349|    805|            } else {
 5350|     72|                avifDiagnosticsPrintf(data->diag, "Item ID [%u] is missing a mandatory ispe property", item->id);
 5351|     72|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 5352|     72|            }
 5353|    877|        }
 5354|  35.1k|    }
 5355|  18.7k|    return avifDecoderReset(decoder);
 5356|  18.9k|}
avifDecoderReset:
 6078|  36.6k|{
 6079|  36.6k|    avifDiagnosticsClearError(&decoder->diag);
 6080|       |
 6081|  36.6k|    avifDecoderData * data = decoder->data;
 6082|  36.6k|    if (!data) {
  ------------------
  |  Branch (6082:9): [True: 0, False: 36.6k]
  ------------------
 6083|       |        // Nothing to reset.
 6084|      0|        return AVIF_RESULT_OK;
 6085|      0|    }
 6086|       |
 6087|   329k|    for (int c = 0; c < AVIF_ITEM_CATEGORY_COUNT; ++c) {
  ------------------
  |  Branch (6087:21): [True: 293k, False: 36.6k]
  ------------------
 6088|   293k|        memset(&data->tileInfos[c].grid, 0, sizeof(data->tileInfos[c].grid));
 6089|   293k|    }
 6090|  36.6k|    avifDecoderDataClearTiles(data);
 6091|       |
 6092|       |    // Prepare / cleanup decoded image state
 6093|  36.6k|    if (decoder->image) {
  ------------------
  |  Branch (6093:9): [True: 17.8k, False: 18.7k]
  ------------------
 6094|  17.8k|        avifImageDestroy(decoder->image);
 6095|  17.8k|    }
 6096|  36.6k|    decoder->image = avifImageCreateEmpty();
 6097|  36.6k|    AVIF_CHECKERR(decoder->image, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  36.6k|    do {                        \
  |  |   46|  36.6k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 36.6k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  36.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 36.6k]
  |  |  ------------------
  ------------------
 6098|  36.6k|    decoder->progressiveState = AVIF_PROGRESSIVE_STATE_UNAVAILABLE;
 6099|  36.6k|    data->cicpSet = AVIF_FALSE;
  ------------------
  |  |   89|  36.6k|#define AVIF_FALSE 0
  ------------------
 6100|       |
 6101|  36.6k|    memset(&decoder->ioStats, 0, sizeof(decoder->ioStats));
 6102|       |
 6103|       |    // Color only or alpha only is not currently supported.
 6104|  36.6k|    if ((decoder->imageContentToDecode & AVIF_IMAGE_CONTENT_COLOR_AND_ALPHA) != 0 &&
  ------------------
  |  Branch (6104:9): [True: 33.8k, False: 2.82k]
  ------------------
 6105|  33.8k|        (decoder->imageContentToDecode & AVIF_IMAGE_CONTENT_COLOR_AND_ALPHA) != AVIF_IMAGE_CONTENT_COLOR_AND_ALPHA) {
  ------------------
  |  Branch (6105:9): [True: 0, False: 33.8k]
  ------------------
 6106|      0|        avifDiagnosticsPrintf(&decoder->diag, "imageContentToDecode set to only color or only alpha is not supported");
 6107|      0|        return AVIF_RESULT_NOT_IMPLEMENTED;
 6108|      0|    }
 6109|       |
 6110|       |    // -----------------------------------------------------------------------
 6111|       |    // Build decode input
 6112|       |
 6113|  36.6k|    data->sourceSampleTable = NULL; // Reset
 6114|  36.6k|    if (decoder->requestedSource == AVIF_DECODER_SOURCE_AUTO) {
  ------------------
  |  Branch (6114:9): [True: 20.2k, False: 16.3k]
  ------------------
 6115|       |        // Honor the major brand (avif or avis) if present, otherwise prefer avis (tracks) if possible.
 6116|  20.2k|        if (!memcmp(data->majorBrand, "avis", 4)) {
  ------------------
  |  Branch (6116:13): [True: 3.63k, False: 16.6k]
  ------------------
 6117|  3.63k|            data->source = AVIF_DECODER_SOURCE_TRACKS;
 6118|  16.6k|        } else if (!memcmp(data->majorBrand, "avif", 4)) {
  ------------------
  |  Branch (6118:20): [True: 14.8k, False: 1.84k]
  ------------------
 6119|  14.8k|            data->source = AVIF_DECODER_SOURCE_PRIMARY_ITEM;
 6120|  14.8k|        } else if (data->tracks.count > 0) {
  ------------------
  |  Branch (6120:20): [True: 166, False: 1.68k]
  ------------------
 6121|    166|            data->source = AVIF_DECODER_SOURCE_TRACKS;
 6122|  1.68k|        } else {
 6123|  1.68k|            data->source = AVIF_DECODER_SOURCE_PRIMARY_ITEM;
 6124|  1.68k|        }
 6125|  20.2k|    } else {
 6126|  16.3k|        data->source = decoder->requestedSource;
 6127|  16.3k|    }
 6128|       |
 6129|  36.6k|    avifCodecType colorCodecType = AVIF_CODEC_TYPE_UNKNOWN;
 6130|  36.6k|    const avifPropertyArray * colorProperties = NULL;
 6131|  36.6k|    const avifPropertyArray * alphaProperties = NULL;
 6132|  36.6k|    const avifPropertyArray * gainMapProperties = NULL;
 6133|  36.6k|    if (data->source == AVIF_DECODER_SOURCE_TRACKS) {
  ------------------
  |  Branch (6133:9): [True: 3.79k, False: 32.8k]
  ------------------
 6134|  3.79k|        avifTrack * colorTrack = NULL;
 6135|  3.79k|        avifTrack * alphaTrack = NULL;
 6136|       |
 6137|       |        // Find primary track - this probably needs some better detection
 6138|  3.79k|        uint32_t colorTrackIndex = 0;
 6139|  3.91k|        for (; colorTrackIndex < data->tracks.count; ++colorTrackIndex) {
  ------------------
  |  Branch (6139:16): [True: 3.89k, False: 20]
  ------------------
 6140|  3.89k|            avifTrack * track = &data->tracks.track[colorTrackIndex];
 6141|  3.89k|            if (!track->sampleTable) {
  ------------------
  |  Branch (6141:17): [True: 47, False: 3.84k]
  ------------------
 6142|     47|                continue;
 6143|     47|            }
 6144|  3.84k|            if (!track->id) { // trak box might be missing a tkhd box inside, skip it
  ------------------
  |  Branch (6144:17): [True: 6, False: 3.83k]
  ------------------
 6145|      6|                continue;
 6146|      6|            }
 6147|  3.83k|            if (!track->sampleTable->chunks.count) {
  ------------------
  |  Branch (6147:17): [True: 37, False: 3.80k]
  ------------------
 6148|     37|                continue;
 6149|     37|            }
 6150|  3.80k|            colorCodecType = avifSampleTableGetCodecType(track->sampleTable);
 6151|  3.80k|            if (colorCodecType == AVIF_CODEC_TYPE_UNKNOWN) {
  ------------------
  |  Branch (6151:17): [True: 22, False: 3.78k]
  ------------------
 6152|     22|                continue;
 6153|     22|            }
 6154|  3.78k|            if (track->auxForID != 0) {
  ------------------
  |  Branch (6154:17): [True: 2, False: 3.77k]
  ------------------
 6155|      2|                continue;
 6156|      2|            }
 6157|       |            // HEIF (ISO/IEC 23008-12:2022), Section 7.1:
 6158|       |            //   In order to distinguish image sequences from video, the handler type in the
 6159|       |            //   HandlerBox of the track is 'pict' to indicate an image sequence track.
 6160|       |            // But we do not check the handler type because it may break some existing files.
 6161|       |
 6162|       |            // Found one!
 6163|  3.77k|            break;
 6164|  3.78k|        }
 6165|  3.79k|        if (colorTrackIndex == data->tracks.count) {
  ------------------
  |  Branch (6165:13): [True: 20, False: 3.77k]
  ------------------
 6166|     20|            avifDiagnosticsPrintf(&decoder->diag, "Failed to find AV1 color track");
 6167|     20|            return AVIF_RESULT_NO_CONTENT;
 6168|     20|        }
 6169|  3.77k|        colorTrack = &data->tracks.track[colorTrackIndex];
 6170|       |
 6171|  3.77k|        colorProperties = avifSampleTableGetProperties(colorTrack->sampleTable, colorCodecType);
 6172|  3.77k|        if (!colorProperties) {
  ------------------
  |  Branch (6172:13): [True: 0, False: 3.77k]
  ------------------
 6173|      0|            avifDiagnosticsPrintf(&decoder->diag, "Failed to find AV1 color track's color properties");
 6174|      0|            return AVIF_RESULT_BMFF_PARSE_FAILED;
 6175|      0|        }
 6176|       |
 6177|       |        // Find Exif and/or XMP metadata, if any
 6178|  3.77k|        if (colorTrack->meta) {
  ------------------
  |  Branch (6178:13): [True: 3.77k, False: 0]
  ------------------
 6179|       |            // See the comment above avifDecoderFindMetadata() for the explanation of using 0 here
 6180|  3.77k|            avifResult findResult = avifDecoderFindMetadata(decoder, colorTrack->meta, decoder->image, 0);
 6181|  3.77k|            if (findResult != AVIF_RESULT_OK) {
  ------------------
  |  Branch (6181:17): [True: 9, False: 3.76k]
  ------------------
 6182|      9|                return findResult;
 6183|      9|            }
 6184|  3.77k|        }
 6185|       |
 6186|  3.76k|        uint32_t alphaTrackIndex = 0;
 6187|  3.76k|        avifCodecType alphaCodecType = AVIF_CODEC_TYPE_UNKNOWN;
 6188|  8.54k|        for (; alphaTrackIndex < data->tracks.count; ++alphaTrackIndex) {
  ------------------
  |  Branch (6188:16): [True: 5.24k, False: 3.29k]
  ------------------
 6189|  5.24k|            avifTrack * track = &data->tracks.track[alphaTrackIndex];
 6190|  5.24k|            if (!track->sampleTable) {
  ------------------
  |  Branch (6190:17): [True: 181, False: 5.06k]
  ------------------
 6191|    181|                continue;
 6192|    181|            }
 6193|  5.06k|            if (!track->id) {
  ------------------
  |  Branch (6193:17): [True: 5, False: 5.05k]
  ------------------
 6194|      5|                continue;
 6195|      5|            }
 6196|  5.05k|            if (!track->sampleTable->chunks.count) {
  ------------------
  |  Branch (6196:17): [True: 63, False: 4.99k]
  ------------------
 6197|     63|                continue;
 6198|     63|            }
 6199|  4.99k|            alphaCodecType = avifSampleTableGetCodecType(track->sampleTable);
 6200|  4.99k|            if (alphaCodecType == AVIF_CODEC_TYPE_UNKNOWN) {
  ------------------
  |  Branch (6200:17): [True: 317, False: 4.67k]
  ------------------
 6201|    317|                continue;
 6202|    317|            }
 6203|  4.67k|            const avifPropertyArray * properties = avifSampleTableGetProperties(track->sampleTable, alphaCodecType);
 6204|  4.67k|            const avifProperty * auxiProp = properties ? avifPropertyArrayFind(properties, "auxi") : NULL;
  ------------------
  |  Branch (6204:45): [True: 4.67k, False: 0]
  ------------------
 6205|       |            // If auxi is present, check that it contains the alpha URN.
 6206|       |            // If auxi is not present, assume that the track is alpha. This is for backward compatibility with
 6207|       |            // old versions of libavif that did not write this property, see
 6208|       |            // https://github.com/AOMediaCodec/libavif/commit/98faa17
 6209|  4.67k|            if (auxiProp && !isAlphaURN(auxiProp->u.auxC.auxType)) {
  ------------------
  |  Branch (6209:17): [True: 990, False: 3.68k]
  |  Branch (6209:29): [True: 451, False: 539]
  ------------------
 6210|    451|                continue;
 6211|    451|            }
 6212|       |            // Do not check the track's handlerType. It should be "auxv" according to
 6213|       |            // HEIF (ISO/IEC 23008-12:2022), Section 7.5.3.1, but old versions of libavif used to write
 6214|       |            // "pict" instead. See https://github.com/AOMediaCodec/libavif/commit/65d0af9
 6215|       |
 6216|  4.22k|            if (track->auxForID == colorTrack->id) {
  ------------------
  |  Branch (6216:17): [True: 471, False: 3.75k]
  ------------------
 6217|       |                // Found it!
 6218|    471|                alphaProperties = properties;
 6219|    471|                break;
 6220|    471|            }
 6221|  4.22k|        }
 6222|  3.76k|        if (alphaTrackIndex != data->tracks.count) {
  ------------------
  |  Branch (6222:13): [True: 471, False: 3.29k]
  ------------------
 6223|    471|            alphaTrack = &data->tracks.track[alphaTrackIndex];
 6224|    471|        }
 6225|       |
 6226|  3.76k|        const uint8_t operatingPoint = 0; // No way to set operating point via tracks
 6227|  3.76k|        avifTile * colorTile = avifDecoderDataCreateTile(data, colorCodecType, colorTrack->width, colorTrack->height, operatingPoint);
 6228|  3.76k|        AVIF_CHECKERR(colorTile != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  3.76k|    do {                        \
  |  |   46|  3.76k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 3.76k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  3.76k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.76k]
  |  |  ------------------
  ------------------
 6229|  3.76k|        AVIF_CHECKRES(avifCodecDecodeInputFillFromSampleTable(colorTile->input,
  ------------------
  |  |   54|  3.76k|    do {                                  \
  |  |   55|  3.76k|        const avifResult result__ = (A);  \
  |  |   56|  3.76k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 159, False: 3.61k]
  |  |  ------------------
  |  |   57|    159|            avifBreakOnError();           \
  |  |   58|    159|            return result__;              \
  |  |   59|    159|        }                                 \
  |  |   60|  3.76k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 3.61k]
  |  |  ------------------
  ------------------
 6230|  3.76k|                                                              colorTrack->sampleTable,
 6231|  3.76k|                                                              decoder->imageCountLimit,
 6232|  3.76k|                                                              decoder->io->sizeHint,
 6233|  3.76k|                                                              data->diag));
 6234|  3.61k|        data->tileInfos[AVIF_ITEM_COLOR].tileCount = 1;
 6235|       |
 6236|  3.61k|        if (alphaTrack) {
  ------------------
  |  Branch (6236:13): [True: 469, False: 3.14k]
  ------------------
 6237|    469|            avifTile * alphaTile = avifDecoderDataCreateTile(data, alphaCodecType, alphaTrack->width, alphaTrack->height, operatingPoint);
 6238|    469|            AVIF_CHECKERR(alphaTile != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|    469|    do {                        \
  |  |   46|    469|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 469]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|    469|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 469]
  |  |  ------------------
  ------------------
 6239|    469|            AVIF_CHECKRES(avifCodecDecodeInputFillFromSampleTable(alphaTile->input,
  ------------------
  |  |   54|    469|    do {                                  \
  |  |   55|    469|        const avifResult result__ = (A);  \
  |  |   56|    469|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 1, False: 468]
  |  |  ------------------
  |  |   57|      1|            avifBreakOnError();           \
  |  |   58|      1|            return result__;              \
  |  |   59|      1|        }                                 \
  |  |   60|    469|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 468]
  |  |  ------------------
  ------------------
 6240|    469|                                                                  alphaTrack->sampleTable,
 6241|    469|                                                                  decoder->imageCountLimit,
 6242|    469|                                                                  decoder->io->sizeHint,
 6243|    469|                                                                  data->diag));
 6244|    468|            alphaTile->input->itemCategory = AVIF_ITEM_ALPHA;
 6245|    468|            data->tileInfos[AVIF_ITEM_ALPHA].tileCount = 1;
 6246|    468|        }
 6247|       |
 6248|       |        // Stash off sample table for future timing information
 6249|  3.60k|        data->sourceSampleTable = colorTrack->sampleTable;
 6250|       |
 6251|       |        // Image sequence timing
 6252|  3.60k|        decoder->imageIndex = -1;
 6253|  3.60k|        decoder->imageCount = (int)colorTile->input->samples.count;
 6254|  3.60k|        decoder->timescale = colorTrack->mediaTimescale;
 6255|  3.60k|        decoder->durationInTimescales = colorTrack->mediaDuration;
 6256|  3.60k|        if (colorTrack->mediaTimescale) {
  ------------------
  |  Branch (6256:13): [True: 3.13k, False: 478]
  ------------------
 6257|  3.13k|            decoder->duration = (double)decoder->durationInTimescales / (double)colorTrack->mediaTimescale;
 6258|  3.13k|        } else {
 6259|    478|            decoder->duration = 0;
 6260|    478|        }
 6261|       |        // If the alphaTrack->repetitionCount and colorTrack->repetitionCount are different, we will simply use the
 6262|       |        // colorTrack's repetitionCount.
 6263|  3.60k|        decoder->repetitionCount = colorTrack->repetitionCount;
 6264|       |
 6265|  3.60k|        memset(&decoder->imageTiming, 0, sizeof(decoder->imageTiming)); // to be set in avifDecoderNextImage()
 6266|       |
 6267|  3.60k|        decoder->image->width = colorTrack->width;
 6268|  3.60k|        decoder->image->height = colorTrack->height;
 6269|  3.60k|        decoder->alphaPresent = (alphaTrack != NULL);
 6270|  3.60k|        decoder->image->alphaPremultiplied = decoder->alphaPresent && (colorTrack->premByID == alphaTrack->id);
  ------------------
  |  Branch (6270:46): [True: 468, False: 3.14k]
  |  Branch (6270:71): [True: 0, False: 468]
  ------------------
 6271|  32.8k|    } else {
 6272|       |        // Create from items
 6273|       |
 6274|  32.8k|        if (data->meta->primaryItemID == 0) {
  ------------------
  |  Branch (6274:13): [True: 13, False: 32.8k]
  ------------------
 6275|       |            // A primary item is required
 6276|     13|            avifDiagnosticsPrintf(&decoder->diag, "Primary item not specified");
 6277|     13|            return AVIF_RESULT_MISSING_IMAGE_ITEM;
 6278|     13|        }
 6279|       |
 6280|       |        // Main item of each group category (top-level item such as grid or single tile), if any.
 6281|  32.8k|        avifDecoderItem * mainItems[AVIF_ITEM_CATEGORY_COUNT];
 6282|  32.8k|        avifCodecType codecType[AVIF_ITEM_CATEGORY_COUNT];
 6283|   295k|        for (int c = 0; c < AVIF_ITEM_CATEGORY_COUNT; ++c) {
  ------------------
  |  Branch (6283:25): [True: 262k, False: 32.8k]
  ------------------
 6284|   262k|            mainItems[c] = NULL;
 6285|   262k|            codecType[c] = AVIF_CODEC_TYPE_UNKNOWN;
 6286|   262k|        }
 6287|       |
 6288|       |        // Mandatory primary color item
 6289|  32.8k|        mainItems[AVIF_ITEM_COLOR] = avifMetaFindColorItem(data->meta);
 6290|  32.8k|        if (!mainItems[AVIF_ITEM_COLOR]) {
  ------------------
  |  Branch (6290:13): [True: 93, False: 32.7k]
  ------------------
 6291|     93|            avifDiagnosticsPrintf(&decoder->diag, "Primary item not found");
 6292|     93|            return AVIF_RESULT_MISSING_IMAGE_ITEM;
 6293|     93|        }
 6294|  32.7k|        AVIF_CHECKRES(avifDecoderItemReadAndParse(decoder,
  ------------------
  |  |   54|  32.7k|    do {                                  \
  |  |   55|  32.7k|        const avifResult result__ = (A);  \
  |  |   56|  32.7k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 25, False: 32.7k]
  |  |  ------------------
  |  |   57|     25|            avifBreakOnError();           \
  |  |   58|     25|            return result__;              \
  |  |   59|     25|        }                                 \
  |  |   60|  32.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 32.7k]
  |  |  ------------------
  ------------------
 6295|  32.7k|                                                  mainItems[AVIF_ITEM_COLOR],
 6296|  32.7k|                                                  /*isItemInInput=*/AVIF_TRUE,
 6297|  32.7k|                                                  &data->tileInfos[AVIF_ITEM_COLOR].grid,
 6298|  32.7k|                                                  &codecType[AVIF_ITEM_COLOR]));
 6299|  32.7k|        colorProperties = &mainItems[AVIF_ITEM_COLOR]->properties;
 6300|  32.7k|        colorCodecType = codecType[AVIF_ITEM_COLOR];
 6301|       |
 6302|       |        // Optional alpha auxiliary item
 6303|  32.7k|        avifBool isAlphaItemInInput;
 6304|  32.7k|        AVIF_CHECKRES(avifMetaFindAlphaItem(data->meta,
  ------------------
  |  |   54|  32.7k|    do {                                  \
  |  |   55|  32.7k|        const avifResult result__ = (A);  \
  |  |   56|  32.7k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 1.37k, False: 31.3k]
  |  |  ------------------
  |  |   57|  1.37k|            avifBreakOnError();           \
  |  |   58|  1.37k|            return result__;              \
  |  |   59|  1.37k|        }                                 \
  |  |   60|  32.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 31.3k]
  |  |  ------------------
  ------------------
 6305|  32.7k|                                            mainItems[AVIF_ITEM_COLOR],
 6306|  32.7k|                                            &data->tileInfos[AVIF_ITEM_COLOR],
 6307|  32.7k|                                            &mainItems[AVIF_ITEM_ALPHA],
 6308|  32.7k|                                            &data->tileInfos[AVIF_ITEM_ALPHA],
 6309|  32.7k|                                            &isAlphaItemInInput));
 6310|  31.3k|        if (mainItems[AVIF_ITEM_ALPHA]) {
  ------------------
  |  Branch (6310:13): [True: 6.38k, False: 24.9k]
  ------------------
 6311|  6.38k|            AVIF_CHECKRES(avifDecoderItemReadAndParse(decoder,
  ------------------
  |  |   54|  6.38k|    do {                                  \
  |  |   55|  6.38k|        const avifResult result__ = (A);  \
  |  |   56|  6.38k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 1, False: 6.37k]
  |  |  ------------------
  |  |   57|      1|            avifBreakOnError();           \
  |  |   58|      1|            return result__;              \
  |  |   59|      1|        }                                 \
  |  |   60|  6.38k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 6.37k]
  |  |  ------------------
  ------------------
 6312|  6.38k|                                                      mainItems[AVIF_ITEM_ALPHA],
 6313|  6.38k|                                                      isAlphaItemInInput,
 6314|  6.38k|                                                      &data->tileInfos[AVIF_ITEM_ALPHA].grid,
 6315|  6.38k|                                                      &codecType[AVIF_ITEM_ALPHA]));
 6316|  6.38k|        }
 6317|       |
 6318|       |        // Section 10.2.6 of 23008-12:2024/AMD 1:2024(E):
 6319|       |        //   'tmap' brand
 6320|       |        //   This brand enables file players to identify and decode HEIF files containing tone-map derived image
 6321|       |        //   items. When present, this brand shall be among the brands included in the compatible_brands
 6322|       |        //   array of the FileTypeBox.
 6323|       |        //
 6324|       |        // If the file contains a 'tmap' item but doesn't have the 'tmap' brand, it is technically invalid.
 6325|       |        // However, we don't report any error because in order to do detect this case consistently, we would
 6326|       |        // need to remove the early exit in avifParse() to check if a 'tmap' item might be present
 6327|       |        // further down the file. Instead, we simply ignore tmap items in files that lack the 'tmap' brand.
 6328|  31.3k|        if (avifBrandArrayHasBrand(&data->compatibleBrands, "tmap")) {
  ------------------
  |  Branch (6328:13): [True: 2.90k, False: 28.4k]
  ------------------
 6329|  2.90k|            avifDecoderItem * gainMapItem;
 6330|  2.90k|            avifCodecType gainMapCodecType;
 6331|  2.90k|            AVIF_CHECKRES(avifDecoderFindGainMapItem(decoder, mainItems[AVIF_ITEM_COLOR], &gainMapItem, &gainMapCodecType));
  ------------------
  |  |   54|  2.90k|    do {                                  \
  |  |   55|  2.90k|        const avifResult result__ = (A);  \
  |  |   56|  2.90k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 108, False: 2.79k]
  |  |  ------------------
  |  |   57|    108|            avifBreakOnError();           \
  |  |   58|    108|            return result__;              \
  |  |   59|    108|        }                                 \
  |  |   60|  2.90k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 2.79k]
  |  |  ------------------
  ------------------
 6332|  2.79k|            if (gainMapItem != NULL && decoder->imageContentToDecode & AVIF_IMAGE_CONTENT_GAIN_MAP) {
  ------------------
  |  Branch (6332:17): [True: 398, False: 2.40k]
  |  Branch (6332:40): [True: 295, False: 103]
  ------------------
 6333|    295|                mainItems[AVIF_ITEM_GAIN_MAP] = gainMapItem;
 6334|    295|                codecType[AVIF_ITEM_GAIN_MAP] = gainMapCodecType;
 6335|    295|            }
 6336|  2.79k|        }
 6337|       |
 6338|       |        // AVIF_ITEM_SAMPLE_TRANSFORM (not used through mainItems because not a coded item (well grids are not coded items either but it's different)).
 6339|  31.2k|        avifDecoderItem * const sampleTransformItem = avifDecoderDataFindSampleTransformImageItem(data);
 6340|  31.2k|        if ((decoder->imageContentToDecode & AVIF_IMAGE_CONTENT_COLOR_AND_ALPHA) &&
  ------------------
  |  Branch (6340:13): [True: 30.5k, False: 651]
  ------------------
 6341|  30.5k|            (decoder->imageContentToDecode & AVIF_IMAGE_CONTENT_SAMPLE_TRANSFORMS) && sampleTransformItem != NULL) {
  ------------------
  |  Branch (6341:13): [True: 0, False: 30.5k]
  |  Branch (6341:87): [True: 0, False: 0]
  ------------------
 6342|      0|            AVIF_ASSERT_OR_RETURN(data->sampleTransformNumInputImageItems == 0);
  ------------------
  |  |   64|      0|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|      0|    do {                        \
  |  |  |  |   46|      0|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|      0|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6343|       |
 6344|      0|            for (uint32_t i = 0; i < data->meta->items.count; ++i) {
  ------------------
  |  Branch (6344:34): [True: 0, False: 0]
  ------------------
 6345|      0|                avifDecoderItem * inputImageItem = data->meta->items.item[i];
 6346|      0|                if (inputImageItem->dimgForID == sampleTransformItem->id) {
  ------------------
  |  Branch (6346:21): [True: 0, False: 0]
  ------------------
 6347|      0|                    ++data->sampleTransformNumInputImageItems;
 6348|      0|                }
 6349|      0|            }
 6350|       |            // Check max number of input items allowed by the format.
 6351|      0|            if (data->sampleTransformNumInputImageItems > 32) {
  ------------------
  |  Branch (6351:17): [True: 0, False: 0]
  ------------------
 6352|      0|                avifDiagnosticsPrintf(data->diag,
 6353|      0|                                      "Box[sato] too many input items, format allows up to 32, got %d",
 6354|      0|                                      data->sampleTransformNumInputImageItems);
 6355|      0|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 6356|      0|            }
 6357|       |            // Check max number of input items supported by this implementation.
 6358|      0|            AVIF_CHECKERR(data->sampleTransformNumInputImageItems <= AVIF_SAMPLE_TRANSFORM_MAX_NUM_INPUT_IMAGE_ITEMS,
  ------------------
  |  |   45|      0|    do {                        \
  |  |   46|      0|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 6359|      0|                          AVIF_RESULT_NOT_IMPLEMENTED);
 6360|       |
 6361|      0|            uint32_t numExtraInputImageItems = 0;
 6362|      0|            for (uint32_t i = 0; i < data->meta->items.count; ++i) {
  ------------------
  |  Branch (6362:34): [True: 0, False: 0]
  ------------------
 6363|      0|                avifDecoderItem * inputImageItem = data->meta->items.item[i];
 6364|      0|                if (inputImageItem->dimgForID != sampleTransformItem->id) {
  ------------------
  |  Branch (6364:21): [True: 0, False: 0]
  ------------------
 6365|      0|                    continue;
 6366|      0|                }
 6367|      0|                if (avifDecoderItemShouldBeSkipped(inputImageItem)) {
  ------------------
  |  Branch (6367:21): [True: 0, False: 0]
  ------------------
 6368|      0|                    avifDiagnosticsPrintf(data->diag, "Box[sato] input item %u is not a supported image type", inputImageItem->id);
 6369|      0|                    return AVIF_RESULT_DECODE_SAMPLE_TRANSFORM_FAILED;
 6370|      0|                }
 6371|       |
 6372|      0|                AVIF_ASSERT_OR_RETURN(inputImageItem->dimgIdx < AVIF_SAMPLE_TRANSFORM_MAX_NUM_INPUT_IMAGE_ITEMS);
  ------------------
  |  |   64|      0|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|      0|    do {                        \
  |  |  |  |   46|      0|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|      0|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6373|      0|                avifItemCategory * category = &data->sampleTransformInputImageItems[inputImageItem->dimgIdx];
 6374|      0|                avifBool foundItem = AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
 6375|      0|                for (int c = AVIF_ITEM_COLOR; c < AVIF_ITEM_CATEGORY_COUNT; ++c) {
  ------------------
  |  Branch (6375:47): [True: 0, False: 0]
  ------------------
 6376|      0|                    if (mainItems[c] && inputImageItem->id == mainItems[c]->id) {
  ------------------
  |  Branch (6376:25): [True: 0, False: 0]
  |  Branch (6376:41): [True: 0, False: 0]
  ------------------
 6377|      0|                        *category = c;
 6378|      0|                        AVIF_CHECKERR(*category == AVIF_ITEM_COLOR, AVIF_RESULT_NOT_IMPLEMENTED);
  ------------------
  |  |   45|      0|    do {                        \
  |  |   46|      0|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 6379|      0|                        foundItem = AVIF_TRUE;
  ------------------
  |  |   88|      0|#define AVIF_TRUE 1
  ------------------
 6380|      0|                        break;
 6381|      0|                    }
 6382|      0|                }
 6383|      0|                if (!foundItem) {
  ------------------
  |  Branch (6383:21): [True: 0, False: 0]
  ------------------
 6384|      0|                    AVIF_CHECKERR(numExtraInputImageItems < AVIF_SAMPLE_TRANSFORM_MAX_NUM_EXTRA_INPUT_IMAGE_ITEMS,
  ------------------
  |  |   45|      0|    do {                        \
  |  |   46|      0|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 6385|      0|                                  AVIF_RESULT_NOT_IMPLEMENTED);
 6386|      0|                    *category = (avifItemCategory)(AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_COLOR + numExtraInputImageItems);
 6387|      0|                    const avifItemCategory alphaCategory =
 6388|      0|                        (avifItemCategory)(AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_ALPHA + numExtraInputImageItems);
 6389|      0|                    mainItems[*category] = inputImageItem;
 6390|      0|                    ++numExtraInputImageItems;
 6391|       |
 6392|      0|                    AVIF_CHECKRES(avifDecoderItemReadAndParse(decoder,
  ------------------
  |  |   54|      0|    do {                                  \
  |  |   55|      0|        const avifResult result__ = (A);  \
  |  |   56|      0|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 6393|      0|                                                              inputImageItem,
 6394|      0|                                                              /*isItemInInput=*/AVIF_TRUE,
 6395|      0|                                                              &data->tileInfos[*category].grid,
 6396|      0|                                                              &codecType[*category]));
 6397|       |
 6398|       |                    // Optional alpha auxiliary item
 6399|      0|                    avifBool isAlphaInputImageItemInInput = AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
 6400|      0|                    AVIF_CHECKRES(avifMetaFindAlphaItem(data->meta,
  ------------------
  |  |   54|      0|    do {                                  \
  |  |   55|      0|        const avifResult result__ = (A);  \
  |  |   56|      0|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 6401|      0|                                                        mainItems[*category],
 6402|      0|                                                        &data->tileInfos[*category],
 6403|      0|                                                        &mainItems[alphaCategory],
 6404|      0|                                                        &data->tileInfos[alphaCategory],
 6405|      0|                                                        &isAlphaInputImageItemInInput));
 6406|       |
 6407|      0|                    AVIF_CHECKERR(!mainItems[alphaCategory] == !mainItems[AVIF_ITEM_ALPHA], AVIF_RESULT_NOT_IMPLEMENTED);
  ------------------
  |  |   45|      0|    do {                        \
  |  |   46|      0|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 6408|      0|                    if (mainItems[alphaCategory] != NULL) {
  ------------------
  |  Branch (6408:25): [True: 0, False: 0]
  ------------------
 6409|      0|                        AVIF_CHECKERR(isAlphaInputImageItemInInput == isAlphaItemInInput, AVIF_RESULT_NOT_IMPLEMENTED);
  ------------------
  |  |   45|      0|    do {                        \
  |  |   46|      0|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 6410|      0|                        AVIF_CHECKERR((mainItems[*category]->premByID == mainItems[alphaCategory]->id) ==
  ------------------
  |  |   45|      0|    do {                        \
  |  |   46|      0|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 6411|      0|                                          (mainItems[AVIF_ITEM_COLOR]->premByID == mainItems[AVIF_ITEM_ALPHA]->id),
 6412|      0|                                      AVIF_RESULT_NOT_IMPLEMENTED);
 6413|      0|                        AVIF_CHECKRES(avifDecoderItemReadAndParse(decoder,
  ------------------
  |  |   54|      0|    do {                                  \
  |  |   55|      0|        const avifResult result__ = (A);  \
  |  |   56|      0|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 6414|      0|                                                                  mainItems[alphaCategory],
 6415|      0|                                                                  isAlphaInputImageItemInInput,
 6416|      0|                                                                  &data->tileInfos[alphaCategory].grid,
 6417|      0|                                                                  &codecType[alphaCategory]));
 6418|      0|                    }
 6419|      0|                }
 6420|      0|            }
 6421|       |
 6422|      0|            AVIF_ASSERT_OR_RETURN(data->meta->sampleTransformExpression.tokens == NULL);
  ------------------
  |  |   64|      0|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|      0|    do {                        \
  |  |  |  |   46|      0|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|      0|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6423|      0|            avifROData satoData;
 6424|      0|            AVIF_CHECKRES(avifDecoderItemRead(sampleTransformItem, decoder->io, &satoData, 0, 0, data->diag));
  ------------------
  |  |   54|      0|    do {                                  \
  |  |   55|      0|        const avifResult result__ = (A);  \
  |  |   56|      0|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 6425|      0|            AVIF_CHECKRES(avifParseSampleTransformImageBox(satoData.data,
  ------------------
  |  |   54|      0|    do {                                  \
  |  |   55|      0|        const avifResult result__ = (A);  \
  |  |   56|      0|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 6426|      0|                                                           satoData.size,
 6427|      0|                                                           data->sampleTransformNumInputImageItems,
 6428|      0|                                                           &data->meta->sampleTransformExpression,
 6429|      0|                                                           data->diag));
 6430|      0|            AVIF_CHECKRES(avifDecoderSampleTransformItemValidateProperties(sampleTransformItem, data->diag));
  ------------------
  |  |   54|      0|    do {                                  \
  |  |   55|      0|        const avifResult result__ = (A);  \
  |  |   56|      0|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 6431|      0|            const avifProperty * pixiProp = avifPropertyArrayFind(&sampleTransformItem->properties, "pixi");
 6432|      0|            AVIF_ASSERT_OR_RETURN(pixiProp != NULL);
  ------------------
  |  |   64|      0|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|      0|    do {                        \
  |  |  |  |   46|      0|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|      0|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6433|      0|            data->meta->sampleTransformDepth = pixiProp->u.pixi.planeDepths[0];
 6434|      0|        }
 6435|       |
 6436|       |        // Find Exif and/or XMP metadata, if any
 6437|  31.2k|        AVIF_CHECKRES(avifDecoderFindMetadata(decoder, data->meta, decoder->image, mainItems[AVIF_ITEM_COLOR]->id));
  ------------------
  |  |   54|  31.2k|    do {                                  \
  |  |   55|  31.2k|        const avifResult result__ = (A);  \
  |  |   56|  31.2k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 4, False: 31.2k]
  |  |  ------------------
  |  |   57|      4|            avifBreakOnError();           \
  |  |   58|      4|            return result__;              \
  |  |   59|      4|        }                                 \
  |  |   60|  31.2k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 31.2k]
  |  |  ------------------
  ------------------
 6438|       |
 6439|       |        // Set all counts and timing to safe-but-uninteresting values
 6440|  31.2k|        decoder->imageIndex = -1;
 6441|  31.2k|        decoder->imageCount = 1;
 6442|  31.2k|        decoder->imageTiming.timescale = 1;
 6443|  31.2k|        decoder->imageTiming.pts = 0;
 6444|  31.2k|        decoder->imageTiming.ptsInTimescales = 0;
 6445|  31.2k|        decoder->imageTiming.duration = 1;
 6446|  31.2k|        decoder->imageTiming.durationInTimescales = 1;
 6447|  31.2k|        decoder->timescale = 1;
 6448|  31.2k|        decoder->duration = 1;
 6449|  31.2k|        decoder->durationInTimescales = 1;
 6450|       |
 6451|   279k|        for (int c = AVIF_ITEM_COLOR; c < AVIF_ITEM_CATEGORY_COUNT; ++c) {
  ------------------
  |  Branch (6451:39): [True: 248k, False: 31.0k]
  ------------------
 6452|   248k|            if (!mainItems[c]) {
  ------------------
  |  Branch (6452:17): [True: 210k, False: 37.8k]
  ------------------
 6453|   210k|                continue;
 6454|   210k|            }
 6455|  37.8k|            AVIF_ASSERT_OR_RETURN(c != AVIF_ITEM_SAMPLE_TRANSFORM); // See sampleTransformItem.
  ------------------
  |  |   64|  37.8k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  37.8k|    do {                        \
  |  |  |  |   46|  37.8k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 37.8k]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  37.8k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 37.8k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6456|       |
 6457|  37.8k|            if (avifIsAlpha((avifItemCategory)c) && !mainItems[c]->width && !mainItems[c]->height) {
  ------------------
  |  Branch (6457:17): [True: 6.33k, False: 31.5k]
  |  Branch (6457:53): [True: 51, False: 6.28k]
  |  Branch (6457:77): [True: 51, False: 0]
  ------------------
 6458|       |                // NON-STANDARD: Alpha subimage does not have an ispe property; adopt width/height from color item
 6459|     51|                AVIF_ASSERT_OR_RETURN(!(decoder->strictFlags & AVIF_STRICT_ALPHA_ISPE_REQUIRED));
  ------------------
  |  |   64|     51|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|     51|    do {                        \
  |  |  |  |   46|     51|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 51]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|     51|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 51]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6460|     51|                mainItems[c]->width = mainItems[AVIF_ITEM_COLOR]->width;
 6461|     51|                mainItems[c]->height = mainItems[AVIF_ITEM_COLOR]->height;
 6462|     51|            }
 6463|       |
 6464|  37.8k|            AVIF_CHECKRES(avifDecoderAdoptGridTileCodecTypeIfNeeded(decoder, mainItems[c], &data->tileInfos[c]));
  ------------------
  |  |   54|  37.8k|    do {                                  \
  |  |   55|  37.8k|        const avifResult result__ = (A);  \
  |  |   56|  37.8k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 41, False: 37.8k]
  |  |  ------------------
  |  |   57|     41|            avifBreakOnError();           \
  |  |   58|     41|            return result__;              \
  |  |   59|     41|        }                                 \
  |  |   60|  37.8k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 37.8k]
  |  |  ------------------
  ------------------
 6465|       |
 6466|  37.8k|            if (c == AVIF_ITEM_COLOR || c == AVIF_ITEM_ALPHA) {
  ------------------
  |  Branch (6466:17): [True: 31.1k, False: 6.62k]
  |  Branch (6466:41): [True: 6.33k, False: 293]
  ------------------
 6467|  37.5k|                if (!(decoder->imageContentToDecode & AVIF_IMAGE_CONTENT_COLOR_AND_ALPHA)) {
  ------------------
  |  Branch (6467:21): [True: 729, False: 36.7k]
  ------------------
 6468|    729|                    continue;
 6469|    729|                }
 6470|  37.5k|            } else if (c == AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_COLOR || c == AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_1_COLOR ||
  ------------------
  |  Branch (6470:24): [True: 0, False: 293]
  |  Branch (6470:73): [True: 0, False: 293]
  ------------------
 6471|    293|                       c == AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_ALPHA || c == AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_1_ALPHA) {
  ------------------
  |  Branch (6471:24): [True: 0, False: 293]
  |  Branch (6471:73): [True: 0, False: 293]
  ------------------
 6472|      0|                AVIF_ASSERT_OR_RETURN((decoder->imageContentToDecode & AVIF_IMAGE_CONTENT_COLOR_AND_ALPHA) &&
  ------------------
  |  |   64|      0|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|      0|    do {                        \
  |  |  |  |   46|      0|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:15): [True: 0, False: 0]
  |  |  |  |  |  Branch (46:15): [True: 0, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|      0|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6473|      0|                                      (decoder->imageContentToDecode & AVIF_IMAGE_CONTENT_SAMPLE_TRANSFORMS));
 6474|    293|            } else {
 6475|    293|                AVIF_ASSERT_OR_RETURN(c == AVIF_ITEM_GAIN_MAP);
  ------------------
  |  |   64|    293|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|    293|    do {                        \
  |  |  |  |   46|    293|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 293]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|    293|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 293]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6476|    293|                if (!(decoder->imageContentToDecode & AVIF_IMAGE_CONTENT_GAIN_MAP)) {
  ------------------
  |  Branch (6476:21): [True: 0, False: 293]
  ------------------
 6477|      0|                    continue;
 6478|      0|                }
 6479|    293|            }
 6480|       |
 6481|  37.0k|            AVIF_CHECKRES(avifDecoderGenerateImageTiles(decoder, &data->tileInfos[c], mainItems[c], (avifItemCategory)c));
  ------------------
  |  |   54|  37.0k|    do {                                  \
  |  |   55|  37.0k|        const avifResult result__ = (A);  \
  |  |   56|  37.0k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 101, False: 36.9k]
  |  |  ------------------
  |  |   57|    101|            avifBreakOnError();           \
  |  |   58|    101|            return result__;              \
  |  |   59|    101|        }                                 \
  |  |   60|  37.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 36.9k]
  |  |  ------------------
  ------------------
 6482|       |
 6483|  36.9k|            avifStrictFlags strictFlags = decoder->strictFlags;
 6484|  36.9k|            if (avifIsAlpha((avifItemCategory)c) && !isAlphaItemInInput) {
  ------------------
  |  Branch (6484:17): [True: 6.22k, False: 30.7k]
  |  Branch (6484:53): [True: 1.36k, False: 4.86k]
  ------------------
 6485|       |                // In this case, the made up grid item will not have an associated pixi property. So validate everything else
 6486|       |                // but the pixi property.
 6487|  1.36k|                strictFlags &= ~(avifStrictFlags)AVIF_STRICT_PIXI_REQUIRED;
 6488|  1.36k|            }
 6489|  36.9k|            AVIF_CHECKRES(
  ------------------
  |  |   54|  36.9k|    do {                                  \
  |  |   55|  36.9k|        const avifResult result__ = (A);  \
  |  |   56|  36.9k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 41, False: 36.9k]
  |  |  ------------------
  |  |   57|     41|            avifBreakOnError();           \
  |  |   58|     41|            return result__;              \
  |  |   59|     41|        }                                 \
  |  |   60|  36.9k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 36.9k]
  |  |  ------------------
  ------------------
 6490|  36.9k|                avifDecoderItemValidateProperties(mainItems[c], avifGetConfigurationPropertyName(codecType[c]), &decoder->diag, strictFlags));
 6491|  36.9k|        }
 6492|       |
 6493|  31.0k|        if (mainItems[AVIF_ITEM_COLOR]->progressive) {
  ------------------
  |  Branch (6493:13): [True: 5.16k, False: 25.8k]
  ------------------
 6494|  5.16k|            decoder->progressiveState = AVIF_PROGRESSIVE_STATE_AVAILABLE;
 6495|       |            // data->tileInfos[AVIF_ITEM_COLOR].firstTileIndex is not yet defined but will be set to 0 a few lines below.
 6496|  5.16k|            const avifTile * colorTile = &data->tiles.tile[0];
 6497|  5.16k|            if (colorTile->input->samples.count > 1) {
  ------------------
  |  Branch (6497:17): [True: 3.55k, False: 1.60k]
  ------------------
 6498|  3.55k|                decoder->progressiveState = AVIF_PROGRESSIVE_STATE_ACTIVE;
 6499|  3.55k|                decoder->imageCount = (int)colorTile->input->samples.count;
 6500|  3.55k|            }
 6501|  5.16k|        }
 6502|       |
 6503|  31.0k|        decoder->image->width = mainItems[AVIF_ITEM_COLOR]->width;
 6504|  31.0k|        decoder->image->height = mainItems[AVIF_ITEM_COLOR]->height;
 6505|  31.0k|        decoder->alphaPresent = (mainItems[AVIF_ITEM_ALPHA] != NULL);
 6506|  31.0k|        decoder->image->alphaPremultiplied = decoder->alphaPresent &&
  ------------------
  |  Branch (6506:46): [True: 6.32k, False: 24.7k]
  ------------------
 6507|  6.32k|                                             (mainItems[AVIF_ITEM_COLOR]->premByID == mainItems[AVIF_ITEM_ALPHA]->id);
  ------------------
  |  Branch (6507:46): [True: 0, False: 6.32k]
  ------------------
 6508|       |
 6509|  31.0k|        if (mainItems[AVIF_ITEM_ALPHA]) {
  ------------------
  |  Branch (6509:13): [True: 6.32k, False: 24.7k]
  ------------------
 6510|  6.32k|            alphaProperties = &mainItems[AVIF_ITEM_ALPHA]->properties;
 6511|  6.32k|        }
 6512|  31.0k|        if (mainItems[AVIF_ITEM_GAIN_MAP]) {
  ------------------
  |  Branch (6512:13): [True: 285, False: 30.7k]
  ------------------
 6513|    285|            AVIF_ASSERT_OR_RETURN(decoder->image->gainMap && decoder->image->gainMap->image);
  ------------------
  |  |   64|    285|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|    285|    do {                        \
  |  |  |  |   46|    570|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:15): [True: 285, False: 0]
  |  |  |  |  |  Branch (46:15): [True: 285, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|    285|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 285]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6514|    285|            decoder->image->gainMap->image->width = mainItems[AVIF_ITEM_GAIN_MAP]->width;
 6515|    285|            decoder->image->gainMap->image->height = mainItems[AVIF_ITEM_GAIN_MAP]->height;
 6516|       |            // Must be called after avifDecoderAdoptGridTileCodecTypeIfNeeded() which among other things copies the
 6517|       |            // codec config property from the first tile of a grid to the grid item (when grids are used).
 6518|    285|            AVIF_CHECKRES(avifReadCodecConfigProperty(decoder->image->gainMap->image,
  ------------------
  |  |   54|    285|    do {                                  \
  |  |   55|    285|        const avifResult result__ = (A);  \
  |  |   56|    285|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 285]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|    285|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 285]
  |  |  ------------------
  ------------------
 6519|    285|                                                      &mainItems[AVIF_ITEM_GAIN_MAP]->properties,
 6520|    285|                                                      codecType[AVIF_ITEM_GAIN_MAP]));
 6521|    285|            gainMapProperties = &mainItems[AVIF_ITEM_GAIN_MAP]->properties;
 6522|    285|        }
 6523|  31.0k|    }
 6524|       |
 6525|  34.6k|    uint32_t firstTileIndex = 0;
 6526|   311k|    for (int c = 0; c < AVIF_ITEM_CATEGORY_COUNT; ++c) {
  ------------------
  |  Branch (6526:21): [True: 277k, False: 34.6k]
  ------------------
 6527|   277k|        data->tileInfos[c].firstTileIndex = firstTileIndex;
 6528|   277k|        firstTileIndex += data->tileInfos[c].tileCount;
 6529|   277k|    }
 6530|       |
 6531|       |    // Sanity check tiles
 6532|  79.3k|    for (uint32_t tileIndex = 0; tileIndex < data->tiles.count; ++tileIndex) {
  ------------------
  |  Branch (6532:34): [True: 44.6k, False: 34.6k]
  ------------------
 6533|  44.6k|        avifTile * tile = &data->tiles.tile[tileIndex];
 6534|   109k|        for (uint32_t sampleIndex = 0; sampleIndex < tile->input->samples.count; ++sampleIndex) {
  ------------------
  |  Branch (6534:40): [True: 65.2k, False: 44.6k]
  ------------------
 6535|  65.2k|            avifDecodeSample * sample = &tile->input->samples.sample[sampleIndex];
 6536|  65.2k|            if (!sample->size) {
  ------------------
  |  Branch (6536:17): [True: 6, False: 65.2k]
  ------------------
 6537|       |                // Every sample must have some data
 6538|      6|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 6539|      6|            }
 6540|       |
 6541|  65.2k|            if (tile->input->itemCategory == AVIF_ITEM_COLOR) {
  ------------------
  |  Branch (6541:17): [True: 53.9k, False: 11.3k]
  ------------------
 6542|  53.9k|                decoder->ioStats.colorOBUSize += sample->size;
 6543|  53.9k|            } else if (tile->input->itemCategory == AVIF_ITEM_ALPHA) {
  ------------------
  |  Branch (6543:24): [True: 10.6k, False: 638]
  ------------------
 6544|  10.6k|                decoder->ioStats.alphaOBUSize += sample->size;
 6545|  10.6k|            }
 6546|  65.2k|        }
 6547|  44.6k|    }
 6548|       |
 6549|  34.6k|    AVIF_CHECKRES(avifReadColorProperties(decoder->io,
  ------------------
  |  |   54|  34.6k|    do {                                  \
  |  |   55|  34.6k|        const avifResult result__ = (A);  \
  |  |   56|  34.6k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 2, False: 34.6k]
  |  |  ------------------
  |  |   57|      2|            avifBreakOnError();           \
  |  |   58|      2|            return result__;              \
  |  |   59|      2|        }                                 \
  |  |   60|  34.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 34.6k]
  |  |  ------------------
  ------------------
 6550|  34.6k|                                          colorProperties,
 6551|  34.6k|                                          &decoder->image->icc,
 6552|  34.6k|                                          &decoder->image->colorPrimaries,
 6553|  34.6k|                                          &decoder->image->transferCharacteristics,
 6554|  34.6k|                                          &decoder->image->matrixCoefficients,
 6555|  34.6k|                                          &decoder->image->yuvRange,
 6556|  34.6k|                                          &data->cicpSet));
 6557|       |
 6558|  34.6k|    const avifProperty * clliProp = avifPropertyArrayFind(colorProperties, "clli");
 6559|  34.6k|    if (clliProp) {
  ------------------
  |  Branch (6559:9): [True: 923, False: 33.7k]
  ------------------
 6560|    923|        decoder->image->clli = clliProp->u.clli;
 6561|    923|    }
 6562|       |
 6563|       |    // Transformations
 6564|  34.6k|    const avifProperty * paspProp = avifPropertyArrayFind(colorProperties, "pasp");
 6565|  34.6k|    if (paspProp) {
  ------------------
  |  Branch (6565:9): [True: 0, False: 34.6k]
  ------------------
 6566|      0|        decoder->image->transformFlags |= AVIF_TRANSFORM_PASP;
 6567|      0|        decoder->image->pasp = paspProp->u.pasp;
 6568|      0|    }
 6569|  34.6k|    const avifProperty * clapProp = avifPropertyArrayFind(colorProperties, "clap");
 6570|  34.6k|    if (clapProp) {
  ------------------
  |  Branch (6570:9): [True: 0, False: 34.6k]
  ------------------
 6571|      0|        decoder->image->transformFlags |= AVIF_TRANSFORM_CLAP;
 6572|      0|        decoder->image->clap = clapProp->u.clap;
 6573|      0|    }
 6574|  34.6k|    const avifProperty * irotProp = avifPropertyArrayFind(colorProperties, "irot");
 6575|  34.6k|    if (irotProp) {
  ------------------
  |  Branch (6575:9): [True: 2.71k, False: 31.9k]
  ------------------
 6576|  2.71k|        decoder->image->transformFlags |= AVIF_TRANSFORM_IROT;
 6577|  2.71k|        decoder->image->irot = irotProp->u.irot;
 6578|  2.71k|    }
 6579|  34.6k|    const avifProperty * imirProp = avifPropertyArrayFind(colorProperties, "imir");
 6580|  34.6k|    if (imirProp) {
  ------------------
  |  Branch (6580:9): [True: 0, False: 34.6k]
  ------------------
 6581|      0|        decoder->image->transformFlags |= AVIF_TRANSFORM_IMIR;
 6582|      0|        decoder->image->imir = imirProp->u.imir;
 6583|      0|    }
 6584|  34.6k|    if (alphaProperties) {
  ------------------
  |  Branch (6584:9): [True: 6.78k, False: 27.8k]
  ------------------
 6585|  6.78k|        AVIF_CHECKRES(avifDecoderCheckAlphaProperties(decoder, alphaProperties));
  ------------------
  |  |   54|  6.78k|    do {                                  \
  |  |   55|  6.78k|        const avifResult result__ = (A);  \
  |  |   56|  6.78k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 1, False: 6.78k]
  |  |  ------------------
  |  |   57|      1|            avifBreakOnError();           \
  |  |   58|      1|            return result__;              \
  |  |   59|      1|        }                                 \
  |  |   60|  6.78k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 6.78k]
  |  |  ------------------
  ------------------
 6586|  6.78k|    }
 6587|  34.6k|    if (gainMapProperties) {
  ------------------
  |  Branch (6587:9): [True: 284, False: 34.3k]
  ------------------
 6588|    284|        AVIF_CHECKRES(avifDecoderCheckGainMapProperties(decoder, gainMapProperties));
  ------------------
  |  |   54|    284|    do {                                  \
  |  |   55|    284|        const avifResult result__ = (A);  \
  |  |   56|    284|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 284]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|    284|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 284]
  |  |  ------------------
  ------------------
 6589|    284|    }
 6590|       |
 6591|  34.6k|    if (!data->cicpSet && (data->tiles.count > 0)) {
  ------------------
  |  Branch (6591:9): [True: 14.8k, False: 19.8k]
  |  Branch (6591:27): [True: 14.6k, False: 169]
  ------------------
 6592|  14.6k|        avifTile * firstTile = &data->tiles.tile[0];
 6593|  14.6k|        if (firstTile->input->samples.count > 0) {
  ------------------
  |  Branch (6593:13): [True: 14.6k, False: 0]
  ------------------
 6594|  14.6k|            avifDecodeSample * sample = &firstTile->input->samples.sample[0];
 6595|       |
 6596|       |            // Harvest CICP from the AV1's sequence header, which should be very close to the front
 6597|       |            // of the first sample. Read in successively larger chunks until we successfully parse the sequence.
 6598|  14.6k|            static const size_t searchSampleChunkIncrement = 64;
 6599|  14.6k|            static const size_t searchSampleSizeMax = 4096;
 6600|  14.6k|            size_t searchSampleSize = 0;
 6601|  34.9k|            do {
 6602|  34.9k|                searchSampleSize += searchSampleChunkIncrement;
 6603|  34.9k|                if (searchSampleSize > sample->size) {
  ------------------
  |  Branch (6603:21): [True: 5.83k, False: 29.1k]
  ------------------
 6604|  5.83k|                    searchSampleSize = sample->size;
 6605|  5.83k|                }
 6606|       |
 6607|  34.9k|                avifResult prepareResult = avifDecoderPrepareSample(decoder, sample, searchSampleSize);
 6608|  34.9k|                if (prepareResult != AVIF_RESULT_OK) {
  ------------------
  |  Branch (6608:21): [True: 243, False: 34.7k]
  ------------------
 6609|    243|                    return prepareResult;
 6610|    243|                }
 6611|       |
 6612|  34.7k|                avifSequenceHeader sequenceHeader;
 6613|  34.7k|                if (avifSequenceHeaderParse(&sequenceHeader, &sample->data, firstTile->codecType)) {
  ------------------
  |  Branch (6613:21): [True: 12.5k, False: 22.1k]
  ------------------
 6614|  12.5k|                    data->cicpSet = AVIF_TRUE;
  ------------------
  |  |   88|  12.5k|#define AVIF_TRUE 1
  ------------------
 6615|  12.5k|                    decoder->image->colorPrimaries = sequenceHeader.colorPrimaries;
 6616|  12.5k|                    decoder->image->transferCharacteristics = sequenceHeader.transferCharacteristics;
 6617|  12.5k|                    decoder->image->matrixCoefficients = sequenceHeader.matrixCoefficients;
 6618|  12.5k|                    decoder->image->yuvRange = sequenceHeader.range;
 6619|  12.5k|                    break;
 6620|  12.5k|                }
 6621|  34.7k|            } while (searchSampleSize != sample->size && searchSampleSize < searchSampleSizeMax);
  ------------------
  |  Branch (6621:22): [True: 20.4k, False: 1.70k]
  |  Branch (6621:58): [True: 20.3k, False: 171]
  ------------------
 6622|  14.6k|        }
 6623|  14.6k|    }
 6624|       |
 6625|  34.3k|    AVIF_CHECKRES(avifReadCodecConfigProperty(decoder->image, colorProperties, colorCodecType));
  ------------------
  |  |   54|  34.3k|    do {                                  \
  |  |   55|  34.3k|        const avifResult result__ = (A);  \
  |  |   56|  34.3k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 29, False: 34.3k]
  |  |  ------------------
  |  |   57|     29|            avifBreakOnError();           \
  |  |   58|     29|            return result__;              \
  |  |   59|     29|        }                                 \
  |  |   60|  34.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 34.3k]
  |  |  ------------------
  ------------------
 6626|  34.3k|    if (decoder->data->meta->sampleTransformExpression.count > 0) {
  ------------------
  |  Branch (6626:9): [True: 0, False: 34.3k]
  ------------------
 6627|      0|        AVIF_ASSERT_OR_RETURN(decoder->data->meta->sampleTransformDepth != 0);
  ------------------
  |  |   64|      0|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|      0|    do {                        \
  |  |  |  |   46|      0|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|      0|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6628|      0|        decoder->image->depth = decoder->data->meta->sampleTransformDepth;
 6629|      0|    }
 6630|       |
 6631|       |    // Expose as raw bytes all other properties that libavif does not care about.
 6632|   187k|    for (size_t i = 0; i < colorProperties->count; ++i) {
  ------------------
  |  Branch (6632:24): [True: 152k, False: 34.3k]
  ------------------
 6633|   152k|        const avifProperty * property = &colorProperties->prop[i];
 6634|   152k|        if (property->isOpaque) {
  ------------------
  |  Branch (6634:13): [True: 25.6k, False: 127k]
  ------------------
 6635|  25.6k|            AVIF_CHECKRES(avifImagePushProperty(decoder->image,
  ------------------
  |  |   54|  25.6k|    do {                                  \
  |  |   55|  25.6k|        const avifResult result__ = (A);  \
  |  |   56|  25.6k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 25.6k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  25.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 25.6k]
  |  |  ------------------
  ------------------
 6636|  25.6k|                                                property->type,
 6637|  25.6k|                                                property->u.opaque.usertype,
 6638|  25.6k|                                                property->u.opaque.boxPayload.data,
 6639|  25.6k|                                                property->u.opaque.boxPayload.size));
 6640|  25.6k|        }
 6641|   152k|    }
 6642|       |
 6643|  34.3k|    if (gainMapProperties) {
  ------------------
  |  Branch (6643:9): [True: 278, False: 34.0k]
  ------------------
 6644|  1.44k|        for (size_t i = 0; i < gainMapProperties->count; ++i) {
  ------------------
  |  Branch (6644:28): [True: 1.17k, False: 278]
  ------------------
 6645|  1.17k|            const avifProperty * property = &gainMapProperties->prop[i];
 6646|  1.17k|            if (property->isOpaque) {
  ------------------
  |  Branch (6646:17): [True: 32, False: 1.13k]
  ------------------
 6647|     32|                AVIF_CHECKRES(avifImagePushProperty(decoder->image->gainMap->image,
  ------------------
  |  |   54|     32|    do {                                  \
  |  |   55|     32|        const avifResult result__ = (A);  \
  |  |   56|     32|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 32]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|     32|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 32]
  |  |  ------------------
  ------------------
 6648|     32|                                                    property->type,
 6649|     32|                                                    property->u.opaque.usertype,
 6650|     32|                                                    property->u.opaque.boxPayload.data,
 6651|     32|                                                    property->u.opaque.boxPayload.size));
 6652|     32|            }
 6653|  1.17k|        }
 6654|    278|    }
 6655|  34.3k|    return AVIF_RESULT_OK;
 6656|  34.3k|}
avifDecoderNextImage:
 6973|  44.4k|{
 6974|  44.4k|    avifDiagnosticsClearError(&decoder->diag);
 6975|       |
 6976|  44.4k|    if (!decoder->data || decoder->data->tiles.count == 0) {
  ------------------
  |  Branch (6976:9): [True: 0, False: 44.4k]
  |  Branch (6976:27): [True: 487, False: 44.0k]
  ------------------
 6977|       |        // Nothing has been parsed yet
 6978|    487|        return AVIF_RESULT_NO_CONTENT;
 6979|    487|    }
 6980|       |
 6981|  44.0k|    if (!decoder->io || !decoder->io->read) {
  ------------------
  |  Branch (6981:9): [True: 0, False: 44.0k]
  |  Branch (6981:25): [True: 0, False: 44.0k]
  ------------------
 6982|      0|        return AVIF_RESULT_IO_NOT_SET;
 6983|      0|    }
 6984|       |
 6985|  44.0k|    if (avifDecoderDataFrameFullyDecoded(decoder->data)) {
  ------------------
  |  Branch (6985:9): [True: 10.1k, False: 33.8k]
  ------------------
 6986|       |        // A frame was decoded during the last avifDecoderNextImage() call.
 6987|  91.1k|        for (int c = 0; c < AVIF_ITEM_CATEGORY_COUNT; ++c) {
  ------------------
  |  Branch (6987:25): [True: 81.0k, False: 10.1k]
  ------------------
 6988|  81.0k|            decoder->data->tileInfos[c].decodedTileCount = 0;
 6989|  81.0k|        }
 6990|  10.1k|    }
 6991|       |
 6992|  44.0k|    AVIF_ASSERT_OR_RETURN(decoder->data->tiles.count == (decoder->data->tileInfos[AVIF_ITEM_CATEGORY_COUNT - 1].firstTileIndex +
  ------------------
  |  |   64|  44.0k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  44.0k|    do {                        \
  |  |  |  |   46|  44.0k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 44.0k]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  44.0k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 44.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6993|  44.0k|                                                         decoder->data->tileInfos[AVIF_ITEM_CATEGORY_COUNT - 1].tileCount));
 6994|       |
 6995|  44.0k|    const uint32_t nextImageIndex = (uint32_t)(decoder->imageIndex + 1);
 6996|       |
 6997|       |    // Ensure that we have created the codecs before proceeding with the decoding.
 6998|  44.0k|    if (!decoder->data->tiles.tile[0].codec) {
  ------------------
  |  Branch (6998:9): [True: 33.8k, False: 10.1k]
  ------------------
 6999|  33.8k|        AVIF_CHECKRES(avifDecoderCreateCodecs(decoder));
  ------------------
  |  |   54|  33.8k|    do {                                  \
  |  |   55|  33.8k|        const avifResult result__ = (A);  \
  |  |   56|  33.8k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 33.8k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  33.8k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 33.8k]
  |  |  ------------------
  ------------------
 7000|  33.8k|    }
 7001|       |
 7002|       |    // Acquire all sample data for the current image first, allowing for any read call to bail out
 7003|       |    // with AVIF_RESULT_WAITING_ON_IO harmlessly / idempotently, unless decoder->allowIncremental.
 7004|  44.0k|    avifResult prepareTileResult[AVIF_ITEM_CATEGORY_COUNT];
 7005|   368k|    for (int c = 0; c < AVIF_ITEM_CATEGORY_COUNT; ++c) {
  ------------------
  |  Branch (7005:21): [True: 327k, False: 40.4k]
  ------------------
 7006|   327k|        prepareTileResult[c] = avifDecoderPrepareTiles(decoder, nextImageIndex, &decoder->data->tileInfos[c]);
 7007|   327k|        if (!decoder->allowIncremental || (prepareTileResult[c] != AVIF_RESULT_WAITING_ON_IO)) {
  ------------------
  |  Branch (7007:13): [True: 169k, False: 158k]
  |  Branch (7007:43): [True: 158k, False: 0]
  ------------------
 7008|   327k|            AVIF_CHECKRES(prepareTileResult[c]);
  ------------------
  |  |   54|   327k|    do {                                  \
  |  |   55|   327k|        const avifResult result__ = (A);  \
  |  |   56|   327k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 3.56k, False: 324k]
  |  |  ------------------
  |  |   57|  3.56k|            avifBreakOnError();           \
  |  |   58|  3.56k|            return result__;              \
  |  |   59|  3.56k|        }                                 \
  |  |   60|   327k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 324k]
  |  |  ------------------
  ------------------
 7009|   327k|        }
 7010|   327k|    }
 7011|       |
 7012|       |    // Decode all available color tiles now, then all available alpha tiles, then all available bit
 7013|       |    // depth extension tiles. The order of appearance of the tiles in the bitstream is left to the
 7014|       |    // encoder's choice, and decoding as many as possible of each category in parallel is beneficial
 7015|       |    // for incremental decoding, as pixel rows need all channels to be decoded before being
 7016|       |    // accessible to the user.
 7017|   122k|    for (int c = 0; c < AVIF_ITEM_CATEGORY_COUNT; ++c) {
  ------------------
  |  Branch (7017:21): [True: 112k, False: 10.1k]
  ------------------
 7018|   112k|        AVIF_CHECKRES(avifDecoderDecodeTiles(decoder, nextImageIndex, &decoder->data->tileInfos[c]));
  ------------------
  |  |   54|   112k|    do {                                  \
  |  |   55|   112k|        const avifResult result__ = (A);  \
  |  |   56|   112k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 30.3k, False: 82.1k]
  |  |  ------------------
  |  |   57|  30.3k|            avifBreakOnError();           \
  |  |   58|  30.3k|            return result__;              \
  |  |   59|  30.3k|        }                                 \
  |  |   60|   112k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 82.1k]
  |  |  ------------------
  ------------------
 7019|   112k|    }
 7020|       |
 7021|  10.1k|    if (!avifDecoderDataFrameFullyDecoded(decoder->data)) {
  ------------------
  |  Branch (7021:9): [True: 0, False: 10.1k]
  ------------------
 7022|      0|        AVIF_ASSERT_OR_RETURN(decoder->allowIncremental);
  ------------------
  |  |   64|      0|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|      0|    do {                        \
  |  |  |  |   46|      0|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|      0|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 7023|       |        // The image is not completely decoded. There should be no error unrelated to missing bytes,
 7024|       |        // and at least some missing bytes.
 7025|      0|        avifResult firstNonOkResult = AVIF_RESULT_OK;
 7026|      0|        for (int c = 0; c < AVIF_ITEM_CATEGORY_COUNT; ++c) {
  ------------------
  |  Branch (7026:25): [True: 0, False: 0]
  ------------------
 7027|      0|            AVIF_ASSERT_OR_RETURN(prepareTileResult[c] == AVIF_RESULT_OK || prepareTileResult[c] == AVIF_RESULT_WAITING_ON_IO);
  ------------------
  |  |   64|      0|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|      0|    do {                        \
  |  |  |  |   46|      0|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:15): [True: 0, False: 0]
  |  |  |  |  |  Branch (46:15): [True: 0, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|      0|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 7028|      0|            if (firstNonOkResult == AVIF_RESULT_OK) {
  ------------------
  |  Branch (7028:17): [True: 0, False: 0]
  ------------------
 7029|      0|                firstNonOkResult = prepareTileResult[c];
 7030|      0|            }
 7031|      0|        }
 7032|      0|        AVIF_ASSERT_OR_RETURN(firstNonOkResult != AVIF_RESULT_OK);
  ------------------
  |  |   64|      0|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|      0|    do {                        \
  |  |  |  |   46|      0|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|      0|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 7033|       |        // Return the "not enough bytes" status now instead of moving on to the next frame.
 7034|      0|        return AVIF_RESULT_WAITING_ON_IO;
 7035|      0|    }
 7036|  91.1k|    for (int c = 0; c < AVIF_ITEM_CATEGORY_COUNT; ++c) {
  ------------------
  |  Branch (7036:21): [True: 81.0k, False: 10.1k]
  ------------------
 7037|  81.0k|        AVIF_ASSERT_OR_RETURN(prepareTileResult[c] == AVIF_RESULT_OK);
  ------------------
  |  |   64|  81.0k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  81.0k|    do {                        \
  |  |  |  |   46|  81.0k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 81.0k]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  81.0k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 81.0k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 7038|  81.0k|    }
 7039|       |
 7040|       |    // If decoder->data->tileInfos[AVIF_ITEM_COLOR].tileCount == 0, it means
 7041|       |    // decoder->imageContentToDecode & AVIF_IMAGE_CONTENT_COLOR_AND_ALPHA was equal to 0.
 7042|       |    // Only apply Sample Transforms if there is a color item to apply it onto.
 7043|  10.1k|    if (decoder->data->tileInfos[AVIF_ITEM_COLOR].tileCount != 0 && decoder->data->meta->sampleTransformExpression.count > 0) {
  ------------------
  |  Branch (7043:9): [True: 10.0k, False: 46]
  |  Branch (7043:69): [True: 0, False: 10.0k]
  ------------------
 7044|      0|        AVIF_CHECKRES(avifDecoderApplySampleTransform(decoder, decoder->image));
  ------------------
  |  |   54|      0|    do {                                  \
  |  |   55|      0|        const avifResult result__ = (A);  \
  |  |   56|      0|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 7045|      0|    }
 7046|       |
 7047|       |    // Only advance decoder->imageIndex once the image is completely decoded, so that
 7048|       |    // avifDecoderNthImage(decoder, decoder->imageIndex + 1) is equivalent to avifDecoderNextImage(decoder)
 7049|       |    // if the previous call to avifDecoderNextImage() returned AVIF_RESULT_WAITING_ON_IO.
 7050|  10.1k|    decoder->imageIndex = (int)nextImageIndex;
 7051|       |    // The decoded tile counts will be reset to 0 the next time avifDecoderNextImage() is called,
 7052|       |    // for avifDecoderDecodedRowCount() to work until then.
 7053|  10.1k|    if (decoder->data->sourceSampleTable) {
  ------------------
  |  Branch (7053:9): [True: 5.58k, False: 4.53k]
  ------------------
 7054|       |        // Decoding from a track! Provide timing information.
 7055|       |
 7056|  5.58k|        avifResult timingResult = avifDecoderNthImageTiming(decoder, decoder->imageIndex, &decoder->imageTiming);
 7057|  5.58k|        if (timingResult != AVIF_RESULT_OK) {
  ------------------
  |  Branch (7057:13): [True: 0, False: 5.58k]
  ------------------
 7058|      0|            return timingResult;
 7059|      0|        }
 7060|  5.58k|    }
 7061|  10.1k|    return AVIF_RESULT_OK;
 7062|  10.1k|}
avifDecoderNthImageTiming:
 7065|  5.58k|{
 7066|  5.58k|    if (!decoder->data) {
  ------------------
  |  Branch (7066:9): [True: 0, False: 5.58k]
  ------------------
 7067|       |        // Nothing has been parsed yet
 7068|      0|        return AVIF_RESULT_NO_CONTENT;
 7069|      0|    }
 7070|       |
 7071|  5.58k|    if ((frameIndex > INT_MAX) || ((int)frameIndex >= decoder->imageCount)) {
  ------------------
  |  Branch (7071:9): [True: 0, False: 5.58k]
  |  Branch (7071:35): [True: 0, False: 5.58k]
  ------------------
 7072|       |        // Impossible index
 7073|      0|        return AVIF_RESULT_NO_IMAGES_REMAINING;
 7074|      0|    }
 7075|       |
 7076|  5.58k|    if (!decoder->data->sourceSampleTable) {
  ------------------
  |  Branch (7076:9): [True: 0, False: 5.58k]
  ------------------
 7077|       |        // There isn't any real timing associated with this decode, so
 7078|       |        // just hand back the defaults chosen in avifDecoderReset().
 7079|      0|        *outTiming = decoder->imageTiming;
 7080|      0|        return AVIF_RESULT_OK;
 7081|      0|    }
 7082|       |
 7083|  5.58k|    outTiming->timescale = decoder->timescale;
 7084|  5.58k|    outTiming->ptsInTimescales = 0;
 7085|  9.52k|    for (uint32_t imageIndex = 0; imageIndex < frameIndex; ++imageIndex) {
  ------------------
  |  Branch (7085:35): [True: 3.93k, False: 5.58k]
  ------------------
 7086|  3.93k|        outTiming->ptsInTimescales += avifSampleTableGetImageDelta(decoder->data->sourceSampleTable, imageIndex);
 7087|  3.93k|    }
 7088|  5.58k|    outTiming->durationInTimescales = avifSampleTableGetImageDelta(decoder->data->sourceSampleTable, frameIndex);
 7089|       |
 7090|  5.58k|    if (outTiming->timescale > 0) {
  ------------------
  |  Branch (7090:9): [True: 5.09k, False: 492]
  ------------------
 7091|  5.09k|        outTiming->pts = (double)outTiming->ptsInTimescales / (double)outTiming->timescale;
 7092|  5.09k|        outTiming->duration = (double)outTiming->durationInTimescales / (double)outTiming->timescale;
 7093|  5.09k|    } else {
 7094|    492|        outTiming->pts = 0.0;
 7095|    492|        outTiming->duration = 0.0;
 7096|    492|    }
 7097|  5.58k|    return AVIF_RESULT_OK;
 7098|  5.58k|}
read.c:avifParseFileTypeBox:
 4780|  20.4k|{
 4781|  20.4k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[ftyp]");
  ------------------
  |  |  738|  20.4k|    avifROStream VARNAME;                               \
  |  |  739|  20.4k|    avifROData VARNAME##_roData;                        \
  |  |  740|  20.4k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  20.4k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  20.4k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 4782|       |
 4783|  20.4k|    AVIF_CHECK(avifROStreamRead(&s, ftyp->majorBrand, 4));
  ------------------
  |  |   36|  20.4k|    do {                        \
  |  |   37|  20.4k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 4, False: 20.4k]
  |  |  ------------------
  |  |   38|      4|            avifBreakOnError(); \
  |  |   39|      4|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      4|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      4|        }                       \
  |  |   41|  20.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 20.4k]
  |  |  ------------------
  ------------------
 4784|  20.4k|    AVIF_CHECK(avifROStreamRead(&s, ftyp->minorVersion, 4));
  ------------------
  |  |   36|  20.4k|    do {                        \
  |  |   37|  20.4k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 2, False: 20.4k]
  |  |  ------------------
  |  |   38|      2|            avifBreakOnError(); \
  |  |   39|      2|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      2|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      2|        }                       \
  |  |   41|  20.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 20.4k]
  |  |  ------------------
  ------------------
 4785|       |
 4786|  20.4k|    size_t compatibleBrandsBytes = avifROStreamRemainingBytes(&s);
 4787|  20.4k|    if ((compatibleBrandsBytes % 4) != 0) {
  ------------------
  |  Branch (4787:9): [True: 14, False: 20.3k]
  ------------------
 4788|     14|        avifDiagnosticsPrintf(diag, "Box[ftyp] contains a compatible brands section that isn't divisible by 4 [%zu]", compatibleBrandsBytes);
 4789|     14|        return AVIF_FALSE;
  ------------------
  |  |   89|     14|#define AVIF_FALSE 0
  ------------------
 4790|     14|    }
 4791|  20.3k|    ftyp->compatibleBrands = avifROStreamCurrent(&s);
 4792|  20.3k|    AVIF_CHECK(avifROStreamSkip(&s, compatibleBrandsBytes));
  ------------------
  |  |   36|  20.3k|    do {                        \
  |  |   37|  20.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 20.3k]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|  20.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 20.3k]
  |  |  ------------------
  ------------------
 4793|  20.3k|    ftyp->compatibleBrandsCount = (int)compatibleBrandsBytes / 4;
 4794|       |
 4795|  20.3k|    return AVIF_TRUE;
  ------------------
  |  |   88|  20.3k|#define AVIF_TRUE 1
  ------------------
 4796|  20.3k|}
read.c:avifFileTypeIsCompatible:
 5030|  20.3k|{
 5031|  20.3k|    return avifFileTypeHasBrand(ftyp, "avif") || avifFileTypeHasBrand(ftyp, "avis")
  ------------------
  |  Branch (5031:12): [True: 20.1k, False: 202]
  |  Branch (5031:50): [True: 191, False: 11]
  ------------------
 5032|       |#if defined(AVIF_ENABLE_EXPERIMENTAL_MINI)
 5033|       |           || avifFileTypeHasBrand(ftyp, "mif3")
 5034|       |#endif // AVIF_ENABLE_EXPERIMENTAL_MINI
 5035|  20.3k|        ;
 5036|  20.3k|}
read.c:avifFileTypeHasBrand:
 5015|  80.9k|{
 5016|  80.9k|    if (!memcmp(ftyp->majorBrand, brand, 4)) {
  ------------------
  |  Branch (5016:9): [True: 33.5k, False: 47.4k]
  ------------------
 5017|  33.5k|        return AVIF_TRUE;
  ------------------
  |  |   88|  33.5k|#define AVIF_TRUE 1
  ------------------
 5018|  33.5k|    }
 5019|       |
 5020|   207k|    for (int compatibleBrandIndex = 0; compatibleBrandIndex < ftyp->compatibleBrandsCount; ++compatibleBrandIndex) {
  ------------------
  |  Branch (5020:40): [True: 171k, False: 36.3k]
  ------------------
 5021|   171k|        const uint8_t * compatibleBrand = &ftyp->compatibleBrands[4 * compatibleBrandIndex];
 5022|   171k|        if (!memcmp(compatibleBrand, brand, 4)) {
  ------------------
  |  Branch (5022:13): [True: 11.0k, False: 160k]
  ------------------
 5023|  11.0k|            return AVIF_TRUE;
  ------------------
  |  |   88|  11.0k|#define AVIF_TRUE 1
  ------------------
 5024|  11.0k|        }
 5025|   171k|    }
 5026|  36.3k|    return AVIF_FALSE;
  ------------------
  |  |   89|  36.3k|#define AVIF_FALSE 0
  ------------------
 5027|  47.4k|}
read.c:avifDecoderCleanup:
 5091|  41.0k|{
 5092|  41.0k|    if (decoder->data) {
  ------------------
  |  Branch (5092:9): [True: 20.5k, False: 20.5k]
  ------------------
 5093|  20.5k|        avifDecoderDataDestroy(decoder->data);
 5094|  20.5k|        decoder->data = NULL;
 5095|  20.5k|    }
 5096|       |
 5097|  41.0k|    if (decoder->image) {
  ------------------
  |  Branch (5097:9): [True: 18.7k, False: 22.2k]
  ------------------
 5098|  18.7k|        avifImageDestroy(decoder->image);
 5099|       |        decoder->image = NULL;
 5100|  18.7k|    }
 5101|  41.0k|    avifDiagnosticsClearError(&decoder->diag);
 5102|  41.0k|}
read.c:avifDecoderDataDestroy:
 1120|  20.5k|{
 1121|  20.5k|    if (data->meta) {
  ------------------
  |  Branch (1121:9): [True: 20.5k, False: 0]
  ------------------
 1122|  20.5k|        avifMetaDestroy(data->meta);
 1123|  20.5k|    }
 1124|  24.1k|    for (uint32_t i = 0; i < data->tracks.count; ++i) {
  ------------------
  |  Branch (1124:26): [True: 3.60k, False: 20.5k]
  ------------------
 1125|  3.60k|        avifTrack * track = &data->tracks.track[i];
 1126|  3.60k|        if (track->sampleTable) {
  ------------------
  |  Branch (1126:13): [True: 3.14k, False: 460]
  ------------------
 1127|  3.14k|            avifSampleTableDestroy(track->sampleTable);
 1128|  3.14k|        }
 1129|  3.60k|        if (track->meta) {
  ------------------
  |  Branch (1129:13): [True: 3.60k, False: 0]
  ------------------
 1130|  3.60k|            avifMetaDestroy(track->meta);
 1131|  3.60k|        }
 1132|  3.60k|    }
 1133|  20.5k|    avifArrayDestroy(&data->tracks);
 1134|  20.5k|    avifDecoderDataClearTiles(data);
 1135|  20.5k|    avifArrayDestroy(&data->tiles);
 1136|  20.5k|    avifArrayDestroy(&data->compatibleBrands);
 1137|  20.5k|    avifFree(data);
 1138|  20.5k|}
read.c:avifMetaDestroy:
  868|  24.1k|{
  869|  76.6k|    for (uint32_t i = 0; i < meta->items.count; ++i) {
  ------------------
  |  Branch (869:26): [True: 52.5k, False: 24.1k]
  ------------------
  870|  52.5k|        avifDecoderItem * item = meta->items.item[i];
  871|  52.5k|        avifPropertyArrayDestroy(&item->properties);
  872|  52.5k|        avifArrayDestroy(&item->extents);
  873|  52.5k|        if (item->ownsMergedExtents) {
  ------------------
  |  Branch (873:13): [True: 15.1k, False: 37.4k]
  ------------------
  874|  15.1k|            avifRWDataFree(&item->mergedExtents);
  875|  15.1k|        }
  876|  52.5k|        avifFree(item);
  877|  52.5k|    }
  878|  24.1k|    avifArrayDestroy(&meta->items);
  879|  24.1k|    avifPropertyArrayDestroy(&meta->properties);
  880|  24.1k|    avifRWDataFree(&meta->idat);
  881|  24.1k|    avifArrayDestroy(&meta->sampleTransformExpression);
  882|  25.5k|    for (uint32_t i = 0; i < meta->entityToGroups.count; ++i) {
  ------------------
  |  Branch (882:26): [True: 1.41k, False: 24.1k]
  ------------------
  883|  1.41k|        avifArrayDestroy(&meta->entityToGroups.groups[i].entityIDs);
  884|  1.41k|    }
  885|  24.1k|    avifArrayDestroy(&meta->entityToGroups);
  886|  24.1k|    avifFree(meta);
  887|  24.1k|}
read.c:avifPropertyArrayDestroy:
  332|  79.7k|{
  333|   415k|    for (size_t i = 0; i < array->count; ++i) {
  ------------------
  |  Branch (333:24): [True: 335k, False: 79.7k]
  ------------------
  334|   335k|        if (array->prop[i].isOpaque) {
  ------------------
  |  Branch (334:13): [True: 47.1k, False: 288k]
  ------------------
  335|  47.1k|            avifRWDataFree(&array->prop[i].u.opaque.boxPayload);
  336|  47.1k|        }
  337|   335k|    }
  338|  79.7k|    avifArrayDestroy(array);
  339|  79.7k|}
read.c:avifSampleTableDestroy:
  342|  3.14k|{
  343|  3.14k|    avifArrayDestroy(&sampleTable->chunks);
  344|  6.22k|    for (uint32_t i = 0; i < sampleTable->sampleDescriptions.count; ++i) {
  ------------------
  |  Branch (344:26): [True: 3.08k, False: 3.14k]
  ------------------
  345|  3.08k|        avifSampleDescription * description = &sampleTable->sampleDescriptions.description[i];
  346|  3.08k|        avifPropertyArrayDestroy(&description->properties);
  347|  3.08k|    }
  348|  3.14k|    avifArrayDestroy(&sampleTable->sampleDescriptions);
  349|  3.14k|    avifArrayDestroy(&sampleTable->sampleToChunks);
  350|  3.14k|    avifArrayDestroy(&sampleTable->sampleSizes);
  351|  3.14k|    avifArrayDestroy(&sampleTable->timeToSamples);
  352|  3.14k|    avifArrayDestroy(&sampleTable->syncSamples);
  353|  3.14k|    avifFree(sampleTable);
  354|  3.14k|}
read.c:avifMetaFindOrCreateItem:
  907|   228k|{
  908|   228k|    *item = NULL;
  909|   228k|    AVIF_ASSERT_OR_RETURN(itemID != 0);
  ------------------
  |  |   64|   228k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|   228k|    do {                        \
  |  |  |  |   46|   228k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 228k]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|   228k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 228k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  910|       |
  911|   548k|    for (uint32_t i = 0; i < meta->items.count; ++i) {
  ------------------
  |  Branch (911:26): [True: 495k, False: 52.5k]
  ------------------
  912|   495k|        if (meta->items.item[i]->id == itemID) {
  ------------------
  |  Branch (912:13): [True: 175k, False: 320k]
  ------------------
  913|   175k|            *item = meta->items.item[i];
  914|   175k|            return AVIF_RESULT_OK;
  915|   175k|        }
  916|   495k|    }
  917|       |
  918|  52.5k|    avifDecoderItem ** itemPtr = (avifDecoderItem **)avifArrayPush(&meta->items);
  919|  52.5k|    AVIF_CHECKERR(itemPtr != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  52.5k|    do {                        \
  |  |   46|  52.5k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 52.5k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  52.5k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 52.5k]
  |  |  ------------------
  ------------------
  920|  52.5k|    *item = (avifDecoderItem *)avifAlloc(sizeof(avifDecoderItem));
  921|  52.5k|    if (*item == NULL) {
  ------------------
  |  Branch (921:9): [True: 0, False: 52.5k]
  ------------------
  922|      0|        avifArrayPop(&meta->items);
  923|      0|        return AVIF_RESULT_OUT_OF_MEMORY;
  924|      0|    }
  925|  52.5k|    memset(*item, 0, sizeof(avifDecoderItem));
  926|       |
  927|  52.5k|    *itemPtr = *item;
  928|  52.5k|    if (!avifArrayCreate(&(*item)->properties, sizeof(avifProperty), 16)) {
  ------------------
  |  Branch (928:9): [True: 0, False: 52.5k]
  ------------------
  929|      0|        avifFree(*item);
  930|      0|        *item = NULL;
  931|      0|        avifArrayPop(&meta->items);
  932|      0|        return AVIF_RESULT_OUT_OF_MEMORY;
  933|      0|    }
  934|  52.5k|    if (!avifArrayCreate(&(*item)->extents, sizeof(avifExtent), 1)) {
  ------------------
  |  Branch (934:9): [True: 0, False: 52.5k]
  ------------------
  935|      0|        avifPropertyArrayDestroy(&(*item)->properties);
  936|      0|        avifFree(*item);
  937|      0|        *item = NULL;
  938|      0|        avifArrayPop(&meta->items);
  939|      0|        return AVIF_RESULT_OUT_OF_MEMORY;
  940|      0|    }
  941|  52.5k|    (*item)->id = itemID;
  942|  52.5k|    (*item)->meta = meta;
  943|  52.5k|    return AVIF_RESULT_OK;
  944|  52.5k|}
read.c:avifDecoderDataCreate:
  996|  20.5k|{
  997|  20.5k|    avifDecoderData * data = (avifDecoderData *)avifAlloc(sizeof(avifDecoderData));
  998|  20.5k|    if (data == NULL) {
  ------------------
  |  Branch (998:9): [True: 0, False: 20.5k]
  ------------------
  999|      0|        return NULL;
 1000|      0|    }
 1001|  20.5k|    memset(data, 0, sizeof(avifDecoderData));
 1002|  20.5k|    data->meta = avifMetaCreate();
 1003|  20.5k|    if (data->meta == NULL || !avifArrayCreate(&data->tracks, sizeof(avifTrack), 2) ||
  ------------------
  |  Branch (1003:9): [True: 0, False: 20.5k]
  |  Branch (1003:31): [True: 0, False: 20.5k]
  ------------------
 1004|  20.5k|        !avifArrayCreate(&data->tiles, sizeof(avifTile), 8)) {
  ------------------
  |  Branch (1004:9): [True: 0, False: 20.5k]
  ------------------
 1005|      0|        avifDecoderDataDestroy(data);
 1006|      0|        return NULL;
 1007|      0|    }
 1008|  20.5k|    return data;
 1009|  20.5k|}
read.c:avifMetaCreate:
  853|  24.1k|{
  854|  24.1k|    avifMeta * meta = (avifMeta *)avifAlloc(sizeof(avifMeta));
  855|  24.1k|    if (meta == NULL) {
  ------------------
  |  Branch (855:9): [True: 0, False: 24.1k]
  ------------------
  856|      0|        return NULL;
  857|      0|    }
  858|  24.1k|    memset(meta, 0, sizeof(avifMeta));
  859|  24.1k|    if (!avifArrayCreate(&meta->items, sizeof(avifDecoderItem *), 8) || !avifArrayCreate(&meta->properties, sizeof(avifProperty), 16) ||
  ------------------
  |  Branch (859:9): [True: 0, False: 24.1k]
  |  Branch (859:73): [True: 0, False: 24.1k]
  ------------------
  860|  24.1k|        !avifArrayCreate(&meta->entityToGroups, sizeof(avifEntityToGroup), 1)) {
  ------------------
  |  Branch (860:9): [True: 0, False: 24.1k]
  ------------------
  861|      0|        avifMetaDestroy(meta);
  862|      0|        return NULL;
  863|      0|    }
  864|  24.1k|    return meta;
  865|  24.1k|}
read.c:avifParse:
 4802|  20.5k|{
 4803|       |    // Note: this top-level function is the only avifParse*() function that returns avifResult instead of avifBool.
 4804|       |    // Be sure to use AVIF_CHECKERR() in this function with an explicit error result instead of simply using AVIF_CHECK().
 4805|       |
 4806|  20.5k|    avifResult readResult;
 4807|  20.5k|    uint64_t parseOffset = 0;
 4808|  20.5k|    avifDecoderData * data = decoder->data;
 4809|  20.5k|    avifBool ftypSeen = AVIF_FALSE;
  ------------------
  |  |   89|  20.5k|#define AVIF_FALSE 0
  ------------------
 4810|  20.5k|    avifBool metaSeen = AVIF_FALSE;
  ------------------
  |  |   89|  20.5k|#define AVIF_FALSE 0
  ------------------
 4811|  20.5k|    avifBool metaIsSizeZero = AVIF_FALSE;
  ------------------
  |  |   89|  20.5k|#define AVIF_FALSE 0
  ------------------
 4812|  20.5k|    avifBool moovSeen = AVIF_FALSE;
  ------------------
  |  |   89|  20.5k|#define AVIF_FALSE 0
  ------------------
 4813|  20.5k|    avifBool needsMeta = AVIF_FALSE;
  ------------------
  |  |   89|  20.5k|#define AVIF_FALSE 0
  ------------------
 4814|  20.5k|    avifBool needsMoov = AVIF_FALSE;
  ------------------
  |  |   89|  20.5k|#define AVIF_FALSE 0
  ------------------
 4815|       |#if defined(AVIF_ENABLE_EXPERIMENTAL_MINI)
 4816|       |    avifBool miniSeen = AVIF_FALSE;
 4817|       |    avifBool needsMini = AVIF_FALSE;
 4818|       |#endif
 4819|  20.5k|    avifBool needsTmap = AVIF_FALSE;
  ------------------
  |  |   89|  20.5k|#define AVIF_FALSE 0
  ------------------
 4820|  20.5k|    avifBool tmapSeen = AVIF_FALSE;
  ------------------
  |  |   89|  20.5k|#define AVIF_FALSE 0
  ------------------
 4821|  20.5k|    avifFileType ftyp = { 0 };
 4822|       |
 4823|  43.7k|    for (;;) {
 4824|       |        // Read just enough to get the next box header (a max of 32 bytes)
 4825|  43.7k|        avifROData headerContents;
 4826|  43.7k|        if ((decoder->io->sizeHint > 0) && (parseOffset > decoder->io->sizeHint)) {
  ------------------
  |  Branch (4826:13): [True: 43.7k, False: 21]
  |  Branch (4826:44): [True: 181, False: 43.5k]
  ------------------
 4827|    181|            return AVIF_RESULT_BMFF_PARSE_FAILED;
 4828|    181|        }
 4829|  43.6k|        readResult = decoder->io->read(decoder->io, 0, parseOffset, 32, &headerContents);
 4830|  43.6k|        if (readResult != AVIF_RESULT_OK) {
  ------------------
  |  Branch (4830:13): [True: 0, False: 43.6k]
  ------------------
 4831|      0|            return readResult;
 4832|      0|        }
 4833|  43.6k|        if (!headerContents.size) {
  ------------------
  |  Branch (4833:13): [True: 53, False: 43.5k]
  ------------------
 4834|       |            // If we got AVIF_RESULT_OK from the reader but received 0 bytes,
 4835|       |            // we've reached the end of the file with no errors. Hooray!
 4836|     53|            break;
 4837|     53|        }
 4838|       |
 4839|       |        // Parse the header, and find out how many bytes it actually was
 4840|  43.5k|        BEGIN_STREAM(headerStream, headerContents.data, headerContents.size, &decoder->diag, "File-level box header");
  ------------------
  |  |  738|  43.5k|    avifROStream VARNAME;                               \
  |  |  739|  43.5k|    avifROData VARNAME##_roData;                        \
  |  |  740|  43.5k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  43.5k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  43.5k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 4841|  43.5k|        avifBoxHeader header;
 4842|  43.5k|        AVIF_CHECKERR(avifROStreamReadBoxHeaderPartial(&headerStream, &header, /*topLevel=*/AVIF_TRUE), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  43.5k|    do {                        \
  |  |   46|  43.5k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 21, False: 43.5k]
  |  |  ------------------
  |  |   47|     21|            avifBreakOnError(); \
  |  |   48|     21|            return ERR;         \
  |  |   49|     21|        }                       \
  |  |   50|  43.5k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 43.5k]
  |  |  ------------------
  ------------------
 4843|  43.5k|        parseOffset += avifROStreamOffset(&headerStream);
 4844|  43.5k|        AVIF_ASSERT_OR_RETURN(decoder->io->sizeHint == 0 || parseOffset <= decoder->io->sizeHint);
  ------------------
  |  |   64|  43.5k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  43.5k|    do {                        \
  |  |  |  |   46|  87.0k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:15): [True: 0, False: 43.5k]
  |  |  |  |  |  Branch (46:15): [True: 43.5k, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  43.5k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 43.5k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 4845|       |
 4846|       |        // Try to get the remainder of the box, if necessary
 4847|  43.5k|        uint64_t boxOffset = 0;
 4848|  43.5k|        avifROData boxContents = AVIF_DATA_EMPTY;
  ------------------
  |  |  254|  43.5k|#define AVIF_DATA_EMPTY { NULL, 0 }
  ------------------
 4849|       |
 4850|  43.5k|        avifBool isFtyp = AVIF_FALSE, isMeta = AVIF_FALSE, isMoov = AVIF_FALSE;
  ------------------
  |  |   89|  43.5k|#define AVIF_FALSE 0
  ------------------
                      avifBool isFtyp = AVIF_FALSE, isMeta = AVIF_FALSE, isMoov = AVIF_FALSE;
  ------------------
  |  |   89|  43.5k|#define AVIF_FALSE 0
  ------------------
                      avifBool isFtyp = AVIF_FALSE, isMeta = AVIF_FALSE, isMoov = AVIF_FALSE;
  ------------------
  |  |   89|  43.5k|#define AVIF_FALSE 0
  ------------------
 4851|  43.5k|        avifBool isNonSkippableVariableLengthBox = AVIF_FALSE;
  ------------------
  |  |   89|  43.5k|#define AVIF_FALSE 0
  ------------------
 4852|  43.5k|        if (!memcmp(header.type, "ftyp", 4)) {
  ------------------
  |  Branch (4852:13): [True: 20.2k, False: 23.2k]
  ------------------
 4853|  20.2k|            isFtyp = AVIF_TRUE;
  ------------------
  |  |   88|  20.2k|#define AVIF_TRUE 1
  ------------------
 4854|  20.2k|            isNonSkippableVariableLengthBox = AVIF_TRUE;
  ------------------
  |  |   88|  20.2k|#define AVIF_TRUE 1
  ------------------
 4855|  23.2k|        } else if (!memcmp(header.type, "meta", 4)) {
  ------------------
  |  Branch (4855:20): [True: 20.0k, False: 3.23k]
  ------------------
 4856|  20.0k|            isMeta = AVIF_TRUE;
  ------------------
  |  |   88|  20.0k|#define AVIF_TRUE 1
  ------------------
 4857|  20.0k|            isNonSkippableVariableLengthBox = AVIF_TRUE;
  ------------------
  |  |   88|  20.0k|#define AVIF_TRUE 1
  ------------------
 4858|  20.0k|            metaIsSizeZero = header.isSizeZeroBox;
 4859|  20.0k|        } else if (!memcmp(header.type, "moov", 4)) {
  ------------------
  |  Branch (4859:20): [True: 2.70k, False: 532]
  ------------------
 4860|  2.70k|            isMoov = AVIF_TRUE;
  ------------------
  |  |   88|  2.70k|#define AVIF_TRUE 1
  ------------------
 4861|  2.70k|            isNonSkippableVariableLengthBox = AVIF_TRUE;
  ------------------
  |  |   88|  2.70k|#define AVIF_TRUE 1
  ------------------
 4862|  2.70k|        }
 4863|       |#if defined(AVIF_ENABLE_EXPERIMENTAL_MINI)
 4864|       |        avifBool isMini = AVIF_FALSE;
 4865|       |        if (!isNonSkippableVariableLengthBox && !memcmp(header.type, "mini", 4)) {
 4866|       |            isMini = AVIF_TRUE;
 4867|       |            isNonSkippableVariableLengthBox = AVIF_TRUE;
 4868|       |        }
 4869|       |#endif
 4870|       |
 4871|  43.5k|        if (!isFtyp && (isNonSkippableVariableLengthBox || !memcmp(header.type, "free", 4) || !memcmp(header.type, "skip", 4) ||
  ------------------
  |  Branch (4871:13): [True: 23.2k, False: 20.2k]
  |  Branch (4871:25): [True: 22.7k, False: 532]
  |  Branch (4871:60): [True: 2, False: 530]
  |  Branch (4871:95): [True: 0, False: 530]
  ------------------
 4872|  22.7k|                        !memcmp(header.type, "mdat", 4))) {
  ------------------
  |  Branch (4872:25): [True: 4, False: 526]
  ------------------
 4873|       |            // Section 6.3.4 of ISO/IEC 14496-12:
 4874|       |            //   The FileTypeBox shall occur before any variable-length box (e.g. movie, free space, media data).
 4875|  22.7k|            AVIF_CHECKERR(ftypSeen, AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  22.7k|    do {                        \
  |  |   46|  22.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 3, False: 22.7k]
  |  |  ------------------
  |  |   47|      3|            avifBreakOnError(); \
  |  |   48|      3|            return ERR;         \
  |  |   49|      3|        }                       \
  |  |   50|  22.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 22.7k]
  |  |  ------------------
  ------------------
 4876|  22.7k|        }
 4877|       |
 4878|  43.5k|        if (isNonSkippableVariableLengthBox) {
  ------------------
  |  Branch (4878:13): [True: 43.0k, False: 531]
  ------------------
 4879|  43.0k|            boxOffset = parseOffset;
 4880|  43.0k|            size_t sizeToRead;
 4881|  43.0k|            if (header.isSizeZeroBox) {
  ------------------
  |  Branch (4881:17): [True: 808, False: 42.1k]
  ------------------
 4882|       |                // The box body goes till the end of the file.
 4883|    808|                if (decoder->io->sizeHint != 0 && decoder->io->sizeHint - parseOffset < SIZE_MAX) {
  ------------------
  |  Branch (4883:21): [True: 808, False: 0]
  |  Branch (4883:51): [True: 808, False: 0]
  ------------------
 4884|    808|                    sizeToRead = (size_t)(decoder->io->sizeHint - parseOffset);
 4885|    808|                } else {
 4886|      0|                    sizeToRead = SIZE_MAX; // This will get truncated. See the documentation of avifIOReadFunc.
 4887|      0|                }
 4888|  42.1k|            } else {
 4889|  42.1k|                sizeToRead = header.size;
 4890|  42.1k|            }
 4891|  43.0k|            readResult = decoder->io->read(decoder->io, 0, parseOffset, sizeToRead, &boxContents);
 4892|  43.0k|            if (readResult != AVIF_RESULT_OK) {
  ------------------
  |  Branch (4892:17): [True: 0, False: 43.0k]
  ------------------
 4893|      0|                return readResult;
 4894|      0|            }
 4895|  43.0k|            if (header.isSizeZeroBox) {
  ------------------
  |  Branch (4895:17): [True: 808, False: 42.1k]
  ------------------
 4896|    808|                header.size = boxContents.size;
 4897|  42.1k|            } else if (boxContents.size != header.size) {
  ------------------
  |  Branch (4897:24): [True: 138, False: 42.0k]
  ------------------
 4898|       |                // A truncated box, bail out
 4899|    138|                return AVIF_RESULT_TRUNCATED_DATA;
 4900|    138|            }
 4901|  43.0k|        } else if (header.isSizeZeroBox) {
  ------------------
  |  Branch (4901:20): [True: 2, False: 529]
  ------------------
 4902|       |            // An unknown top level box with size 0 was found. If we reach here it means we haven't completed parsing successfully
 4903|       |            // since there are no further boxes left.
 4904|      2|            return AVIF_RESULT_BMFF_PARSE_FAILED;
 4905|    529|        } else if (header.size > (UINT64_MAX - parseOffset)) {
  ------------------
  |  Branch (4905:20): [True: 1, False: 528]
  ------------------
 4906|      1|            return AVIF_RESULT_BMFF_PARSE_FAILED;
 4907|      1|        }
 4908|  43.3k|        parseOffset += header.size;
 4909|       |
 4910|  43.3k|        if (isFtyp) {
  ------------------
  |  Branch (4910:13): [True: 20.1k, False: 23.2k]
  ------------------
 4911|  20.1k|            AVIF_CHECKERR(!ftypSeen, AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  20.1k|    do {                        \
  |  |   46|  20.1k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 20.1k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  20.1k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 20.1k]
  |  |  ------------------
  ------------------
 4912|  20.1k|            AVIF_CHECKERR(avifParseFileTypeBox(&ftyp, boxContents.data, boxContents.size, data->diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  20.1k|    do {                        \
  |  |   46|  20.1k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 20, False: 20.1k]
  |  |  ------------------
  |  |   47|     20|            avifBreakOnError(); \
  |  |   48|     20|            return ERR;         \
  |  |   49|     20|        }                       \
  |  |   50|  20.1k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 20.1k]
  |  |  ------------------
  ------------------
 4913|  20.1k|            AVIF_CHECKERR(avifFileTypeIsCompatible(&ftyp), AVIF_RESULT_INVALID_FTYP);
  ------------------
  |  |   45|  20.1k|    do {                        \
  |  |   46|  20.1k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 11, False: 20.1k]
  |  |  ------------------
  |  |   47|     11|            avifBreakOnError(); \
  |  |   48|     11|            return ERR;         \
  |  |   49|     11|        }                       \
  |  |   50|  20.1k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 20.1k]
  |  |  ------------------
  ------------------
 4914|  20.1k|            ftypSeen = AVIF_TRUE;
  ------------------
  |  |   88|  20.1k|#define AVIF_TRUE 1
  ------------------
 4915|  20.1k|            memcpy(data->majorBrand, ftyp.majorBrand, 4); // Remember the major brand for future AVIF_DECODER_SOURCE_AUTO decisions
 4916|  20.1k|            if (ftyp.compatibleBrandsCount > 0) {
  ------------------
  |  Branch (4916:17): [True: 20.1k, False: 9]
  ------------------
 4917|  20.1k|                AVIF_CHECKERR(avifArrayCreate(&data->compatibleBrands, sizeof(avifBrand), ftyp.compatibleBrandsCount),
  ------------------
  |  |   45|  20.1k|    do {                        \
  |  |   46|  20.1k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 20.1k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  20.1k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 20.1k]
  |  |  ------------------
  ------------------
 4918|  20.1k|                              AVIF_RESULT_OUT_OF_MEMORY);
 4919|  20.1k|                memcpy(data->compatibleBrands.brand, ftyp.compatibleBrands, sizeof(avifBrand) * ftyp.compatibleBrandsCount);
 4920|  20.1k|                data->compatibleBrands.count = ftyp.compatibleBrandsCount;
 4921|  20.1k|            }
 4922|  20.1k|            needsMeta = avifFileTypeHasBrand(&ftyp, "avif");
 4923|  20.1k|            needsMoov = avifFileTypeHasBrand(&ftyp, "avis");
 4924|       |#if defined(AVIF_ENABLE_EXPERIMENTAL_MINI)
 4925|       |            needsMini = avifFileTypeHasBrand(&ftyp, "mif3");
 4926|       |            if (needsMini) {
 4927|       |                AVIF_CHECKERR(!needsMeta, AVIF_RESULT_INVALID_FTYP);
 4928|       |                // Section O.2.1.2 of ISO/IEC 23008-12:2014, CDAM 2:
 4929|       |                //   When the 'mif3' brand is present as the major_brand of the FileTypeBox,
 4930|       |                //   the minor_version of the FileTypeBox shall be 0 or a brand that is either
 4931|       |                //   structurally compatible with the 'mif3' brand, such as a codec brand
 4932|       |                //   complying with the 'mif3' structural brand, or a brand to which the file
 4933|       |                //   conforms after the equivalent MetaBox has been transformed from
 4934|       |                //   MinimizedImageBox as specified in Clause O.4.
 4935|       |                AVIF_CHECKERR(!memcmp(ftyp.minorVersion, "\0\0\0\0", 4) || !memcmp(ftyp.minorVersion, "avif", 4),
 4936|       |                              AVIF_RESULT_BMFF_PARSE_FAILED);
 4937|       |            }
 4938|       |#endif // AVIF_ENABLE_EXPERIMENTAL_MINI
 4939|  20.1k|            needsTmap = avifFileTypeHasBrand(&ftyp, "tmap");
 4940|  20.1k|            if (needsTmap) {
  ------------------
  |  Branch (4940:17): [True: 1.59k, False: 18.5k]
  ------------------
 4941|  1.59k|                needsMeta = AVIF_TRUE;
  ------------------
  |  |   88|  1.59k|#define AVIF_TRUE 1
  ------------------
 4942|  1.59k|            }
 4943|  23.2k|        } else if (isMeta) {
  ------------------
  |  Branch (4943:20): [True: 20.0k, False: 3.22k]
  ------------------
 4944|  20.0k|            AVIF_CHECKERR(!metaSeen, AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  20.0k|    do {                        \
  |  |   46|  20.0k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 20.0k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  20.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 20.0k]
  |  |  ------------------
  ------------------
 4945|       |#if defined(AVIF_ENABLE_EXPERIMENTAL_MINI)
 4946|       |            AVIF_CHECKERR(!miniSeen, AVIF_RESULT_BMFF_PARSE_FAILED);
 4947|       |#endif
 4948|  20.0k|            AVIF_CHECKRES(avifParseMetaBox(data->meta, boxOffset, boxContents.data, boxContents.size, data->diag));
  ------------------
  |  |   54|  20.0k|    do {                                  \
  |  |   55|  20.0k|        const avifResult result__ = (A);  \
  |  |   56|  20.0k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 819, False: 19.1k]
  |  |  ------------------
  |  |   57|    819|            avifBreakOnError();           \
  |  |   58|    819|            return result__;              \
  |  |   59|    819|        }                                 \
  |  |   60|  20.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 19.1k]
  |  |  ------------------
  ------------------
 4949|  19.1k|            metaSeen = AVIF_TRUE;
  ------------------
  |  |   88|  19.1k|#define AVIF_TRUE 1
  ------------------
 4950|       |
 4951|  60.3k|            for (uint32_t itemIndex = 0; itemIndex < data->meta->items.count; ++itemIndex) {
  ------------------
  |  Branch (4951:42): [True: 42.9k, False: 17.3k]
  ------------------
 4952|  42.9k|                if (!memcmp(data->meta->items.item[itemIndex]->type, "tmap", 4)) {
  ------------------
  |  Branch (4952:21): [True: 1.79k, False: 41.2k]
  ------------------
 4953|  1.79k|                    tmapSeen = AVIF_TRUE;
  ------------------
  |  |   88|  1.79k|#define AVIF_TRUE 1
  ------------------
 4954|  1.79k|                    break;
 4955|  1.79k|                }
 4956|  42.9k|            }
 4957|       |
 4958|       |#if defined(AVIF_ENABLE_EXPERIMENTAL_MINI)
 4959|       |        } else if (isMini) {
 4960|       |            AVIF_CHECKERR(!metaSeen, AVIF_RESULT_BMFF_PARSE_FAILED);
 4961|       |            AVIF_CHECKERR(!miniSeen, AVIF_RESULT_BMFF_PARSE_FAILED);
 4962|       |            const avifBool isAvifAccordingToMinorVersion = !memcmp(ftyp.minorVersion, "avif", 4);
 4963|       |            AVIF_CHECKRES(
 4964|       |                avifParseMinimizedImageBox(data, boxOffset, boxContents.data, boxContents.size, isAvifAccordingToMinorVersion, data->diag));
 4965|       |            miniSeen = AVIF_TRUE;
 4966|       |#endif
 4967|  19.1k|        } else if (isMoov) {
  ------------------
  |  Branch (4967:20): [True: 2.69k, False: 528]
  ------------------
 4968|  2.69k|            AVIF_CHECKERR(!moovSeen, AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  2.69k|    do {                        \
  |  |   46|  2.69k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 2.69k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  2.69k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 2.69k]
  |  |  ------------------
  ------------------
 4969|  2.69k|            AVIF_CHECKRES(
  ------------------
  |  |   54|  2.69k|    do {                                  \
  |  |   55|  2.69k|        const avifResult result__ = (A);  \
  |  |   56|  2.69k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 318, False: 2.38k]
  |  |  ------------------
  |  |   57|    318|            avifBreakOnError();           \
  |  |   58|    318|            return result__;              \
  |  |   59|    318|        }                                 \
  |  |   60|  2.69k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 2.38k]
  |  |  ------------------
  ------------------
 4970|  2.69k|                avifParseMovieBox(data, boxOffset, boxContents.data, boxContents.size, decoder->imageSizeLimit, decoder->imageDimensionLimit));
 4971|  2.38k|            moovSeen = AVIF_TRUE;
  ------------------
  |  |   88|  2.38k|#define AVIF_TRUE 1
  ------------------
 4972|  2.38k|            decoder->imageSequenceTrackPresent = AVIF_TRUE;
  ------------------
  |  |   88|  2.38k|#define AVIF_TRUE 1
  ------------------
 4973|  2.38k|        }
 4974|       |
 4975|       |#if defined(AVIF_ENABLE_EXPERIMENTAL_MINI)
 4976|       |        if (ftypSeen && !needsMini) {
 4977|       |            // When MinimizedImageBox is present in a file, the 'mif3' brand or a derived brand that implies the 'mif3'
 4978|       |            // brand shall be the major brand or present among the compatible brands in the FileTypeBox.
 4979|       |            AVIF_CHECKERR(!miniSeen, AVIF_RESULT_BMFF_PARSE_FAILED);
 4980|       |        }
 4981|       |#endif // AVIF_ENABLE_EXPERIMENTAL_MINI
 4982|       |
 4983|       |        // See if there is enough information to consider Parse() a success and early-out:
 4984|       |        // * If the brand 'avif' is present, require a meta box
 4985|       |        // * If the brand 'avis' is present, require a moov box
 4986|       |        // * If AVIF_ENABLE_EXPERIMENTAL_MINI is defined and the brand 'mif3' is present, require a mini box
 4987|  42.2k|        avifBool sawEverythingNeeded = ftypSeen && (!needsMeta || metaSeen) && (!needsMoov || moovSeen) && (!needsTmap || tmapSeen);
  ------------------
  |  Branch (4987:40): [True: 42.0k, False: 198]
  |  Branch (4987:53): [True: 568, False: 41.4k]
  |  Branch (4987:67): [True: 21.3k, False: 20.1k]
  |  Branch (4987:81): [True: 16.5k, False: 5.31k]
  |  Branch (4987:95): [True: 2.37k, False: 2.93k]
  |  Branch (4987:109): [True: 17.3k, False: 1.55k]
  |  Branch (4987:123): [True: 1.54k, False: 10]
  ------------------
 4988|       |#if defined(AVIF_ENABLE_EXPERIMENTAL_MINI)
 4989|       |        sawEverythingNeeded = sawEverythingNeeded && (!needsMini || miniSeen);
 4990|       |#endif
 4991|  42.2k|        if (sawEverythingNeeded) {
  ------------------
  |  Branch (4991:13): [True: 18.9k, False: 23.2k]
  ------------------
 4992|  18.9k|            return AVIF_RESULT_OK;
 4993|  18.9k|        }
 4994|  42.2k|    }
 4995|     53|    if (!ftypSeen) {
  ------------------
  |  Branch (4995:9): [True: 25, False: 28]
  ------------------
 4996|     25|        return AVIF_RESULT_INVALID_FTYP;
 4997|     25|    }
 4998|     28|    if ((needsMeta && !metaSeen) || (needsMoov && !moovSeen)) {
  ------------------
  |  Branch (4998:10): [True: 17, False: 11]
  |  Branch (4998:23): [True: 15, False: 2]
  |  Branch (4998:38): [True: 12, False: 1]
  |  Branch (4998:51): [True: 12, False: 0]
  ------------------
 4999|     27|        return AVIF_RESULT_TRUNCATED_DATA;
 5000|     27|    }
 5001|      1|    if (needsTmap && !tmapSeen) {
  ------------------
  |  Branch (5001:9): [True: 1, False: 0]
  |  Branch (5001:22): [True: 1, False: 0]
  ------------------
 5002|      1|        return metaIsSizeZero ? AVIF_RESULT_TRUNCATED_DATA : AVIF_RESULT_BMFF_PARSE_FAILED;
  ------------------
  |  Branch (5002:16): [True: 0, False: 1]
  ------------------
 5003|      1|    }
 5004|       |#if defined(AVIF_ENABLE_EXPERIMENTAL_MINI)
 5005|       |    if (needsMini && !miniSeen) {
 5006|       |        return AVIF_RESULT_TRUNCATED_DATA;
 5007|       |    }
 5008|       |#endif
 5009|      0|    return AVIF_RESULT_OK;
 5010|      1|}
read.c:avifParseMetaBox:
 3452|  20.3k|{
 3453|  20.3k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[meta]");
  ------------------
  |  |  738|  20.3k|    avifROStream VARNAME;                               \
  |  |  739|  20.3k|    avifROData VARNAME##_roData;                        \
  |  |  740|  20.3k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  20.3k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  20.3k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3454|       |
 3455|  20.3k|    uint32_t flags;
 3456|  20.3k|    AVIF_CHECKERR(avifROStreamReadAndEnforceVersion(&s, 0, &flags), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  20.3k|    do {                        \
  |  |   46|  20.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 12, False: 20.3k]
  |  |  ------------------
  |  |   47|     12|            avifBreakOnError(); \
  |  |   48|     12|            return ERR;         \
  |  |   49|     12|        }                       \
  |  |   50|  20.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 20.3k]
  |  |  ------------------
  ------------------
 3457|       |
 3458|  20.3k|    ++meta->idatID; // for tracking idat
 3459|       |
 3460|  20.3k|    avifBool firstBox = AVIF_TRUE;
  ------------------
  |  |   88|  20.3k|#define AVIF_TRUE 1
  ------------------
 3461|  20.3k|    uint32_t uniqueBoxFlags = 0;
 3462|   136k|    while (avifROStreamHasBytesLeft(&s, 1)) {
  ------------------
  |  Branch (3462:12): [True: 116k, False: 19.5k]
  ------------------
 3463|   116k|        avifBoxHeader header;
 3464|   116k|        AVIF_CHECKERR(avifROStreamReadBoxHeader(&s, &header), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|   116k|    do {                        \
  |  |   46|   116k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 100, False: 116k]
  |  |  ------------------
  |  |   47|    100|            avifBreakOnError(); \
  |  |   48|    100|            return ERR;         \
  |  |   49|    100|        }                       \
  |  |   50|   116k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 116k]
  |  |  ------------------
  ------------------
 3465|       |
 3466|   116k|        if (firstBox) {
  ------------------
  |  Branch (3466:13): [True: 20.3k, False: 96.5k]
  ------------------
 3467|  20.3k|            if (!memcmp(header.type, "hdlr", 4)) {
  ------------------
  |  Branch (3467:17): [True: 20.3k, False: 2]
  ------------------
 3468|  20.3k|                uint8_t handlerType[4];
 3469|  20.3k|                AVIF_CHECKERR(avifParseHandlerBox(avifROStreamCurrent(&s), header.size, handlerType, diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  20.3k|    do {                        \
  |  |   46|  20.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 57, False: 20.2k]
  |  |  ------------------
  |  |   47|     57|            avifBreakOnError(); \
  |  |   48|     57|            return ERR;         \
  |  |   49|     57|        }                       \
  |  |   50|  20.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 20.2k]
  |  |  ------------------
  ------------------
 3470|       |                // HEIF (ISO/IEC 23008-12:2022), Section 6.2:
 3471|       |                //   The handler type for the MetaBox shall be 'pict'.
 3472|  20.2k|                if (memcmp(handlerType, "pict", 4) != 0) {
  ------------------
  |  Branch (3472:21): [True: 8, False: 20.2k]
  ------------------
 3473|      8|                    avifDiagnosticsPrintf(diag, "Box[hdlr] handler_type is not 'pict'");
 3474|      8|                    return AVIF_RESULT_BMFF_PARSE_FAILED;
 3475|      8|                }
 3476|  20.2k|                firstBox = AVIF_FALSE;
  ------------------
  |  |   89|  20.2k|#define AVIF_FALSE 0
  ------------------
 3477|  20.2k|            } else {
 3478|       |                // hdlr must be the first box!
 3479|      2|                avifDiagnosticsPrintf(diag, "Box[meta] does not have a Box[hdlr] as its first child box");
 3480|      2|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 3481|      2|            }
 3482|  96.5k|        } else if (!memcmp(header.type, "hdlr", 4)) {
  ------------------
  |  Branch (3482:20): [True: 1, False: 96.5k]
  ------------------
 3483|      1|            avifDiagnosticsPrintf(diag, "Box[meta] contains a duplicate unique box of type 'hdlr'");
 3484|      1|            return AVIF_RESULT_BMFF_PARSE_FAILED;
 3485|  96.5k|        } else if (!memcmp(header.type, "iloc", 4)) {
  ------------------
  |  Branch (3485:20): [True: 20.0k, False: 76.5k]
  ------------------
 3486|  20.0k|            AVIF_CHECKERR(uniqueBoxSeen(&uniqueBoxFlags, AVIF_UNIQUE_ILOC, "meta", "iloc", diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  20.0k|    do {                        \
  |  |   46|  20.0k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 20.0k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  20.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 20.0k]
  |  |  ------------------
  ------------------
 3487|  20.0k|            AVIF_CHECKRES(avifParseItemLocationBox(meta, avifROStreamCurrent(&s), header.size, diag));
  ------------------
  |  |   54|  20.0k|    do {                                  \
  |  |   55|  20.0k|        const avifResult result__ = (A);  \
  |  |   56|  20.0k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 275, False: 19.7k]
  |  |  ------------------
  |  |   57|    275|            avifBreakOnError();           \
  |  |   58|    275|            return result__;              \
  |  |   59|    275|        }                                 \
  |  |   60|  20.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 19.7k]
  |  |  ------------------
  ------------------
 3488|  76.5k|        } else if (!memcmp(header.type, "pitm", 4)) {
  ------------------
  |  Branch (3488:20): [True: 19.4k, False: 57.0k]
  ------------------
 3489|  19.4k|            AVIF_CHECKERR(uniqueBoxSeen(&uniqueBoxFlags, AVIF_UNIQUE_PITM, "meta", "pitm", diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  19.4k|    do {                        \
  |  |   46|  19.4k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 19.4k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  19.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19.4k]
  |  |  ------------------
  ------------------
 3490|  19.4k|            AVIF_CHECKERR(avifParsePrimaryItemBox(meta, avifROStreamCurrent(&s), header.size, diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  19.4k|    do {                        \
  |  |   46|  19.4k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 3, False: 19.4k]
  |  |  ------------------
  |  |   47|      3|            avifBreakOnError(); \
  |  |   48|      3|            return ERR;         \
  |  |   49|      3|        }                       \
  |  |   50|  19.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19.4k]
  |  |  ------------------
  ------------------
 3491|  57.0k|        } else if (!memcmp(header.type, "idat", 4)) {
  ------------------
  |  Branch (3491:20): [True: 2.96k, False: 54.0k]
  ------------------
 3492|  2.96k|            AVIF_CHECKERR(uniqueBoxSeen(&uniqueBoxFlags, AVIF_UNIQUE_IDAT, "meta", "idat", diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  2.96k|    do {                        \
  |  |   46|  2.96k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 2.96k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  2.96k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 2.96k]
  |  |  ------------------
  ------------------
 3493|  2.96k|            AVIF_CHECKERR(avifParseItemDataBox(meta, avifROStreamCurrent(&s), header.size, diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  2.96k|    do {                        \
  |  |   46|  2.96k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 2.96k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  2.96k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 2.96k]
  |  |  ------------------
  ------------------
 3494|  54.0k|        } else if (!memcmp(header.type, "iprp", 4)) {
  ------------------
  |  Branch (3494:20): [True: 19.3k, False: 34.7k]
  ------------------
 3495|  19.3k|            AVIF_CHECKERR(uniqueBoxSeen(&uniqueBoxFlags, AVIF_UNIQUE_IPRP, "meta", "iprp", diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  19.3k|    do {                        \
  |  |   46|  19.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 19.3k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  19.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19.3k]
  |  |  ------------------
  ------------------
 3496|  19.3k|            AVIF_CHECKRES(avifParseItemPropertiesBox(meta, rawOffset + avifROStreamOffset(&s), avifROStreamCurrent(&s), header.size, diag));
  ------------------
  |  |   54|  19.3k|    do {                                  \
  |  |   55|  19.3k|        const avifResult result__ = (A);  \
  |  |   56|  19.3k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 209, False: 19.0k]
  |  |  ------------------
  |  |   57|    209|            avifBreakOnError();           \
  |  |   58|    209|            return result__;              \
  |  |   59|    209|        }                                 \
  |  |   60|  19.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 19.0k]
  |  |  ------------------
  ------------------
 3497|  34.7k|        } else if (!memcmp(header.type, "iinf", 4)) {
  ------------------
  |  Branch (3497:20): [True: 19.6k, False: 15.0k]
  ------------------
 3498|  19.6k|            AVIF_CHECKERR(uniqueBoxSeen(&uniqueBoxFlags, AVIF_UNIQUE_IINF, "meta", "iinf", diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  19.6k|    do {                        \
  |  |   46|  19.6k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 19.6k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  19.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19.6k]
  |  |  ------------------
  ------------------
 3499|  19.6k|            AVIF_CHECKRES(avifParseItemInfoBox(meta, avifROStreamCurrent(&s), header.size, diag));
  ------------------
  |  |   54|  19.6k|    do {                                  \
  |  |   55|  19.6k|        const avifResult result__ = (A);  \
  |  |   56|  19.6k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 54, False: 19.6k]
  |  |  ------------------
  |  |   57|     54|            avifBreakOnError();           \
  |  |   58|     54|            return result__;              \
  |  |   59|     54|        }                                 \
  |  |   60|  19.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 19.6k]
  |  |  ------------------
  ------------------
 3500|  19.6k|        } else if (!memcmp(header.type, "iref", 4)) {
  ------------------
  |  Branch (3500:20): [True: 11.8k, False: 3.20k]
  ------------------
 3501|  11.8k|            AVIF_CHECKERR(uniqueBoxSeen(&uniqueBoxFlags, AVIF_UNIQUE_IREF, "meta", "iref", diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  11.8k|    do {                        \
  |  |   46|  11.8k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 11.8k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  11.8k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 11.8k]
  |  |  ------------------
  ------------------
 3502|  11.8k|            AVIF_CHECKRES(avifParseItemReferenceBox(meta, avifROStreamCurrent(&s), header.size, diag));
  ------------------
  |  |   54|  11.8k|    do {                                  \
  |  |   55|  11.8k|        const avifResult result__ = (A);  \
  |  |   56|  11.8k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 75, False: 11.8k]
  |  |  ------------------
  |  |   57|     75|            avifBreakOnError();           \
  |  |   58|     75|            return result__;              \
  |  |   59|     75|        }                                 \
  |  |   60|  11.8k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 11.8k]
  |  |  ------------------
  ------------------
 3503|  11.8k|        } else if (!memcmp(header.type, "grpl", 4)) {
  ------------------
  |  Branch (3503:20): [True: 1.41k, False: 1.78k]
  ------------------
 3504|  1.41k|            AVIF_CHECKERR(uniqueBoxSeen(&uniqueBoxFlags, AVIF_UNIQUE_GRPL, "meta", "grpl", diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  1.41k|    do {                        \
  |  |   46|  1.41k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 1.41k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  1.41k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 1.41k]
  |  |  ------------------
  ------------------
 3505|  1.41k|            AVIF_CHECKRES(avifParseGroupsListBox(meta, avifROStreamCurrent(&s), header.size, diag));
  ------------------
  |  |   54|  1.41k|    do {                                  \
  |  |   55|  1.41k|        const avifResult result__ = (A);  \
  |  |   56|  1.41k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 22, False: 1.39k]
  |  |  ------------------
  |  |   57|     22|            avifBreakOnError();           \
  |  |   58|     22|            return result__;              \
  |  |   59|     22|        }                                 \
  |  |   60|  1.41k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 1.39k]
  |  |  ------------------
  ------------------
 3506|  1.41k|        }
 3507|       |
 3508|   116k|        AVIF_CHECKERR(avifROStreamSkip(&s, header.size), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|   116k|    do {                        \
  |  |   46|   116k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 116k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|   116k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 116k]
  |  |  ------------------
  ------------------
 3509|   116k|    }
 3510|  19.5k|    if (firstBox) {
  ------------------
  |  Branch (3510:9): [True: 1, False: 19.5k]
  ------------------
 3511|       |        // The meta box must not be empty (it must contain at least a hdlr box)
 3512|      1|        avifDiagnosticsPrintf(diag, "Box[meta] has no child boxes");
 3513|      1|        return AVIF_RESULT_BMFF_PARSE_FAILED;
 3514|      1|    }
 3515|  19.5k|    return AVIF_RESULT_OK;
 3516|  19.5k|}
read.c:avifParseHandlerBox:
 1956|  23.5k|{
 1957|  23.5k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[hdlr]");
  ------------------
  |  |  738|  23.5k|    avifROStream VARNAME;                               \
  |  |  739|  23.5k|    avifROData VARNAME##_roData;                        \
  |  |  740|  23.5k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  23.5k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  23.5k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 1958|       |
 1959|  23.5k|    AVIF_CHECK(avifROStreamReadAndEnforceVersion(&s, /*enforcedVersion=*/0, /*flags=*/NULL));
  ------------------
  |  |   36|  23.5k|    do {                        \
  |  |   37|  23.5k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 2, False: 23.5k]
  |  |  ------------------
  |  |   38|      2|            avifBreakOnError(); \
  |  |   39|      2|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      2|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      2|        }                       \
  |  |   41|  23.5k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 23.5k]
  |  |  ------------------
  ------------------
 1960|       |
 1961|  23.5k|    uint32_t predefined;
 1962|  23.5k|    AVIF_CHECK(avifROStreamReadU32(&s, &predefined)); // unsigned int(32) pre_defined = 0;
  ------------------
  |  |   36|  23.5k|    do {                        \
  |  |   37|  23.5k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 23.5k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  23.5k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 23.5k]
  |  |  ------------------
  ------------------
 1963|  23.5k|    if (predefined != 0) {
  ------------------
  |  Branch (1963:9): [True: 46, False: 23.5k]
  ------------------
 1964|     46|        avifDiagnosticsPrintf(diag, "Box[hdlr] contains a pre_defined value that is nonzero");
 1965|     46|        return AVIF_FALSE;
  ------------------
  |  |   89|     46|#define AVIF_FALSE 0
  ------------------
 1966|     46|    }
 1967|       |
 1968|  23.5k|    AVIF_CHECK(avifROStreamRead(&s, handlerType, 4)); // unsigned int(32) handler_type;
  ------------------
  |  |   36|  23.5k|    do {                        \
  |  |   37|  23.5k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 23.5k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  23.5k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 23.5k]
  |  |  ------------------
  ------------------
 1969|       |
 1970|  94.1k|    for (int i = 0; i < 3; ++i) {
  ------------------
  |  Branch (1970:21): [True: 70.6k, False: 23.5k]
  ------------------
 1971|  70.6k|        uint32_t reserved;
 1972|  70.6k|        AVIF_CHECK(avifROStreamReadU32(&s, &reserved)); // const unsigned int(32)[3] reserved = 0;
  ------------------
  |  |   36|  70.6k|    do {                        \
  |  |   37|  70.6k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 3, False: 70.6k]
  |  |  ------------------
  |  |   38|      3|            avifBreakOnError(); \
  |  |   39|      3|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      3|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      3|        }                       \
  |  |   41|  70.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 70.6k]
  |  |  ------------------
  ------------------
 1973|  70.6k|    }
 1974|       |
 1975|       |    // Verify that a valid string is here, but don't bother to store it
 1976|  23.5k|    AVIF_CHECK(avifROStreamReadString(&s, NULL, 0)); // string name;
  ------------------
  |  |   36|  23.5k|    do {                        \
  |  |   37|  23.5k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 6, False: 23.5k]
  |  |  ------------------
  |  |   38|      6|            avifBreakOnError(); \
  |  |   39|      6|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      6|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      6|        }                       \
  |  |   41|  23.5k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 23.5k]
  |  |  ------------------
  ------------------
 1977|  23.5k|    return AVIF_TRUE;
  ------------------
  |  |   88|  23.5k|#define AVIF_TRUE 1
  ------------------
 1978|  23.5k|}
read.c:uniqueBoxSeen:
  763|  94.7k|{
  764|  94.7k|    const uint32_t flag = 1 << whichFlag;
  765|  94.7k|    if (*uniqueBoxFlags & flag) {
  ------------------
  |  Branch (765:9): [True: 1, False: 94.7k]
  ------------------
  766|       |        // This box has already been seen. Error!
  767|      1|        avifDiagnosticsPrintf(diagnostics, "Box[%s] contains a duplicate unique box of type '%s'", parentBoxType, boxType);
  768|      1|        return AVIF_FALSE;
  ------------------
  |  |   89|      1|#define AVIF_FALSE 0
  ------------------
  769|      1|    }
  770|       |
  771|       |    // Mark this box as seen.
  772|  94.7k|    *uniqueBoxFlags |= flag;
  773|  94.7k|    return AVIF_TRUE;
  ------------------
  |  |   88|  94.7k|#define AVIF_TRUE 1
  ------------------
  774|  94.7k|}
read.c:avifParseItemLocationBox:
 1981|  20.0k|{
 1982|  20.0k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[iloc]");
  ------------------
  |  |  738|  20.0k|    avifROStream VARNAME;                               \
  |  |  739|  20.0k|    avifROData VARNAME##_roData;                        \
  |  |  740|  20.0k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  20.0k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  20.0k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 1983|       |
 1984|       |    // Section 8.11.3.2 of ISO/IEC 14496-12.
 1985|  20.0k|    uint8_t version;
 1986|  20.0k|    AVIF_CHECKERR(avifROStreamReadVersionAndFlags(&s, &version, NULL), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  20.0k|    do {                        \
  |  |   46|  20.0k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 20.0k]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|  20.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 20.0k]
  |  |  ------------------
  ------------------
 1987|  20.0k|    if (version > 2) {
  ------------------
  |  Branch (1987:9): [True: 2, False: 20.0k]
  ------------------
 1988|      2|        avifDiagnosticsPrintf(diag, "Box[iloc] has an unsupported version [%u]", version);
 1989|      2|        return AVIF_RESULT_BMFF_PARSE_FAILED;
 1990|      2|    }
 1991|       |
 1992|  20.0k|    uint8_t offsetSize, lengthSize, baseOffsetSize, indexSize = 0;
 1993|  20.0k|    uint32_t reserved;
 1994|  20.0k|    AVIF_CHECKERR(avifROStreamReadBitsU8(&s, &offsetSize, /*bitCount=*/4), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(4) offset_size;
  ------------------
  |  |   45|  20.0k|    do {                        \
  |  |   46|  20.0k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 20.0k]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|  20.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 20.0k]
  |  |  ------------------
  ------------------
 1995|  20.0k|    AVIF_CHECKERR(avifROStreamReadBitsU8(&s, &lengthSize, /*bitCount=*/4), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(4) length_size;
  ------------------
  |  |   45|  20.0k|    do {                        \
  |  |   46|  20.0k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 20.0k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  20.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 20.0k]
  |  |  ------------------
  ------------------
 1996|  20.0k|    AVIF_CHECKERR(avifROStreamReadBitsU8(&s, &baseOffsetSize, /*bitCount=*/4), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(4) base_offset_size;
  ------------------
  |  |   45|  20.0k|    do {                        \
  |  |   46|  20.0k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 20.0k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  20.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 20.0k]
  |  |  ------------------
  ------------------
 1997|  20.0k|    if (version == 1 || version == 2) {
  ------------------
  |  Branch (1997:9): [True: 3.17k, False: 16.8k]
  |  Branch (1997:25): [True: 92, False: 16.7k]
  ------------------
 1998|  3.26k|        AVIF_CHECKERR(avifROStreamReadBitsU8(&s, &indexSize, /*bitCount=*/4), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(4) index_size;
  ------------------
  |  |   45|  3.26k|    do {                        \
  |  |   46|  3.26k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 3.26k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  3.26k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.26k]
  |  |  ------------------
  ------------------
 1999|  16.7k|    } else {
 2000|  16.7k|        AVIF_CHECKERR(avifROStreamReadBitsU32(&s, &reserved, /*bitCount=*/4), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(4) reserved;
  ------------------
  |  |   45|  16.7k|    do {                        \
  |  |   46|  16.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 16.7k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  16.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 16.7k]
  |  |  ------------------
  ------------------
 2001|  16.7k|    }
 2002|       |
 2003|       |    // Section 8.11.3.3 of ISO/IEC 14496-12.
 2004|  20.0k|    if ((offsetSize != 0 && offsetSize != 4 && offsetSize != 8) || (lengthSize != 0 && lengthSize != 4 && lengthSize != 8) ||
  ------------------
  |  Branch (2004:10): [True: 19.8k, False: 199]
  |  Branch (2004:29): [True: 43, False: 19.7k]
  |  Branch (2004:48): [True: 3, False: 40]
  |  Branch (2004:69): [True: 19.8k, False: 162]
  |  Branch (2004:88): [True: 112, False: 19.7k]
  |  Branch (2004:107): [True: 4, False: 108]
  ------------------
 2005|  20.0k|        (baseOffsetSize != 0 && baseOffsetSize != 4 && baseOffsetSize != 8) || (indexSize != 0 && indexSize != 4 && indexSize != 8)) {
  ------------------
  |  Branch (2005:10): [True: 291, False: 19.7k]
  |  Branch (2005:33): [True: 33, False: 258]
  |  Branch (2005:56): [True: 4, False: 29]
  |  Branch (2005:81): [True: 21, False: 19.9k]
  |  Branch (2005:99): [True: 9, False: 12]
  |  Branch (2005:117): [True: 3, False: 6]
  ------------------
 2006|     14|        avifDiagnosticsPrintf(diag, "Box[iloc] has an invalid size");
 2007|     14|        return AVIF_RESULT_BMFF_PARSE_FAILED;
 2008|     14|    }
 2009|       |
 2010|  20.0k|    uint16_t tmp16;
 2011|  20.0k|    uint32_t itemCount;
 2012|  20.0k|    if (version < 2) {
  ------------------
  |  Branch (2012:9): [True: 19.9k, False: 86]
  ------------------
 2013|  19.9k|        AVIF_CHECKERR(avifROStreamReadU16(&s, &tmp16), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(16) item_count;
  ------------------
  |  |   45|  19.9k|    do {                        \
  |  |   46|  19.9k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 19.9k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  19.9k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19.9k]
  |  |  ------------------
  ------------------
 2014|  19.9k|        itemCount = tmp16;
 2015|  19.9k|    } else {
 2016|     86|        AVIF_CHECKERR(avifROStreamReadU32(&s, &itemCount), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) item_count;
  ------------------
  |  |   45|     86|    do {                        \
  |  |   46|     86|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 84]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|     86|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 84]
  |  |  ------------------
  ------------------
 2017|     86|    }
 2018|  66.5k|    for (uint32_t i = 0; i < itemCount; ++i) {
  ------------------
  |  Branch (2018:26): [True: 46.7k, False: 19.7k]
  ------------------
 2019|  46.7k|        uint32_t itemID;
 2020|  46.7k|        if (version < 2) {
  ------------------
  |  Branch (2020:13): [True: 46.6k, False: 149]
  ------------------
 2021|  46.6k|            AVIF_CHECKERR(avifROStreamReadU16(&s, &tmp16), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(16) item_ID;
  ------------------
  |  |   45|  46.6k|    do {                        \
  |  |   46|  46.6k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 10, False: 46.6k]
  |  |  ------------------
  |  |   47|     10|            avifBreakOnError(); \
  |  |   48|     10|            return ERR;         \
  |  |   49|     10|        }                       \
  |  |   50|  46.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 46.6k]
  |  |  ------------------
  ------------------
 2022|  46.6k|            itemID = tmp16;
 2023|  46.6k|        } else {
 2024|    149|            AVIF_CHECKERR(avifROStreamReadU32(&s, &itemID), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) item_ID;
  ------------------
  |  |   45|    149|    do {                        \
  |  |   46|    149|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 147]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|    149|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 147]
  |  |  ------------------
  ------------------
 2025|    149|        }
 2026|  46.7k|        AVIF_CHECKRES(avifCheckItemID("iloc", itemID, diag));
  ------------------
  |  |   54|  46.7k|    do {                                  \
  |  |   55|  46.7k|        const avifResult result__ = (A);  \
  |  |   56|  46.7k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 7, False: 46.7k]
  |  |  ------------------
  |  |   57|      7|            avifBreakOnError();           \
  |  |   58|      7|            return result__;              \
  |  |   59|      7|        }                                 \
  |  |   60|  46.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 46.7k]
  |  |  ------------------
  ------------------
 2027|       |
 2028|  46.7k|        avifDecoderItem * item;
 2029|  46.7k|        AVIF_CHECKRES(avifMetaFindOrCreateItem(meta, itemID, &item));
  ------------------
  |  |   54|  46.7k|    do {                                  \
  |  |   55|  46.7k|        const avifResult result__ = (A);  \
  |  |   56|  46.7k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 46.7k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  46.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 46.7k]
  |  |  ------------------
  ------------------
 2030|  46.7k|        if (item->extents.count > 0) {
  ------------------
  |  Branch (2030:13): [True: 16, False: 46.7k]
  ------------------
 2031|       |            // This item has already been given extents via this iloc box. This is invalid.
 2032|     16|            avifDiagnosticsPrintf(diag, "Item ID [%u] contains duplicate sets of extents", itemID);
 2033|     16|            return AVIF_RESULT_BMFF_PARSE_FAILED;
 2034|     16|        }
 2035|       |
 2036|  46.7k|        if (version == 1 || version == 2) {
  ------------------
  |  Branch (2036:13): [True: 6.26k, False: 40.5k]
  |  Branch (2036:29): [True: 147, False: 40.3k]
  ------------------
 2037|  6.40k|            AVIF_CHECKERR(avifROStreamReadBitsU32(&s, &reserved, /*bitCount=*/12), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(12) reserved = 0;
  ------------------
  |  |   45|  6.40k|    do {                        \
  |  |   46|  6.40k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 9, False: 6.40k]
  |  |  ------------------
  |  |   47|      9|            avifBreakOnError(); \
  |  |   48|      9|            return ERR;         \
  |  |   49|      9|        }                       \
  |  |   50|  6.40k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 6.40k]
  |  |  ------------------
  ------------------
 2038|  6.40k|            if (reserved) {
  ------------------
  |  Branch (2038:17): [True: 73, False: 6.32k]
  ------------------
 2039|     73|                avifDiagnosticsPrintf(diag, "Box[iloc] has a non null reserved field [%u]", reserved);
 2040|     73|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 2041|     73|            }
 2042|  6.32k|            uint8_t constructionMethod;
 2043|  6.32k|            AVIF_CHECKERR(avifROStreamReadBitsU8(&s, &constructionMethod, /*bitCount=*/4),
  ------------------
  |  |   45|  6.32k|    do {                        \
  |  |   46|  6.32k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 6.32k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  6.32k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 6.32k]
  |  |  ------------------
  ------------------
 2044|  6.32k|                          AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(4) construction_method;
 2045|  6.32k|            if (constructionMethod != 0 /* file offset */ && constructionMethod != 1 /* idat offset */) {
  ------------------
  |  Branch (2045:17): [True: 6.16k, False: 167]
  |  Branch (2045:62): [True: 3, False: 6.15k]
  ------------------
 2046|       |                // construction method 2 (item offset) unsupported
 2047|      3|                avifDiagnosticsPrintf(diag, "Box[iloc] has an unsupported construction method [%u]", constructionMethod);
 2048|      3|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 2049|      3|            }
 2050|  6.32k|            if (constructionMethod == 1) {
  ------------------
  |  Branch (2050:17): [True: 6.15k, False: 167]
  ------------------
 2051|  6.15k|                item->idatStored = AVIF_TRUE;
  ------------------
  |  |   88|  6.15k|#define AVIF_TRUE 1
  ------------------
 2052|  6.15k|            }
 2053|  6.32k|        }
 2054|       |
 2055|  46.6k|        uint16_t dataReferenceIndex;
 2056|  46.6k|        AVIF_CHECKERR(avifROStreamReadU16(&s, &dataReferenceIndex), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(16) data_reference_index;
  ------------------
  |  |   45|  46.6k|    do {                        \
  |  |   46|  46.6k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 7, False: 46.6k]
  |  |  ------------------
  |  |   47|      7|            avifBreakOnError(); \
  |  |   48|      7|            return ERR;         \
  |  |   49|      7|        }                       \
  |  |   50|  46.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 46.6k]
  |  |  ------------------
  ------------------
 2057|  46.6k|        uint64_t baseOffset;
 2058|  46.6k|        AVIF_CHECKERR(avifROStreamReadUX8(&s, &baseOffset, baseOffsetSize), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(base_offset_size*8) base_offset;
  ------------------
  |  |   45|  46.6k|    do {                        \
  |  |   46|  46.6k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 5, False: 46.6k]
  |  |  ------------------
  |  |   47|      5|            avifBreakOnError(); \
  |  |   48|      5|            return ERR;         \
  |  |   49|      5|        }                       \
  |  |   50|  46.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 46.6k]
  |  |  ------------------
  ------------------
 2059|  46.6k|        uint16_t extentCount;
 2060|  46.6k|        AVIF_CHECKERR(avifROStreamReadU16(&s, &extentCount), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(16) extent_count;
  ------------------
  |  |   45|  46.6k|    do {                        \
  |  |   46|  46.6k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 3, False: 46.6k]
  |  |  ------------------
  |  |   47|      3|            avifBreakOnError(); \
  |  |   48|      3|            return ERR;         \
  |  |   49|      3|        }                       \
  |  |   50|  46.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 46.6k]
  |  |  ------------------
  ------------------
 2061|  9.62M|        for (int extentIter = 0; extentIter < extentCount; ++extentIter) {
  ------------------
  |  Branch (2061:34): [True: 9.57M, False: 46.5k]
  ------------------
 2062|  9.57M|            if ((version == 1 || version == 2) && indexSize > 0) {
  ------------------
  |  Branch (2062:18): [True: 91.7k, False: 9.48M]
  |  Branch (2062:34): [True: 1.55M, False: 7.93M]
  |  Branch (2062:51): [True: 96, False: 1.64M]
  ------------------
 2063|       |                // Section 8.11.3.1 of ISO/IEC 14496-12:
 2064|       |                //   The item_reference_index is only used for the method item_offset; it indicates the 1-based index
 2065|       |                //   of the item reference with referenceType 'iloc' linked from this item. If index_size is 0, then
 2066|       |                //   the value 1 is implied; the value 0 is reserved.
 2067|     96|                uint64_t itemReferenceIndex; // Ignored unless construction_method=2 which is unsupported, but still read it.
 2068|     96|                AVIF_CHECKERR(avifROStreamReadUX8(&s, &itemReferenceIndex, indexSize),
  ------------------
  |  |   45|     96|    do {                        \
  |  |   46|     96|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 10, False: 86]
  |  |  ------------------
  |  |   47|     10|            avifBreakOnError(); \
  |  |   48|     10|            return ERR;         \
  |  |   49|     10|        }                       \
  |  |   50|     96|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 86]
  |  |  ------------------
  ------------------
 2069|     96|                              AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(index_size*8) item_reference_index;
 2070|     96|            }
 2071|       |
 2072|  9.57M|            uint64_t extentOffset;
 2073|  9.57M|            AVIF_CHECKERR(avifROStreamReadUX8(&s, &extentOffset, offsetSize), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(offset_size*8) extent_offset;
  ------------------
  |  |   45|  9.57M|    do {                        \
  |  |   46|  9.57M|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 40, False: 9.57M]
  |  |  ------------------
  |  |   47|     40|            avifBreakOnError(); \
  |  |   48|     40|            return ERR;         \
  |  |   49|     40|        }                       \
  |  |   50|  9.57M|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 9.57M]
  |  |  ------------------
  ------------------
 2074|  9.57M|            uint64_t extentLength;
 2075|  9.57M|            AVIF_CHECKERR(avifROStreamReadUX8(&s, &extentLength, lengthSize), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(length_size*8) extent_length;
  ------------------
  |  |   45|  9.57M|    do {                        \
  |  |   46|  9.57M|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 49, False: 9.57M]
  |  |  ------------------
  |  |   47|     49|            avifBreakOnError(); \
  |  |   48|     49|            return ERR;         \
  |  |   49|     49|        }                       \
  |  |   50|  9.57M|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 9.57M]
  |  |  ------------------
  ------------------
 2076|       |
 2077|  9.57M|            avifExtent * extent = (avifExtent *)avifArrayPush(&item->extents);
 2078|  9.57M|            AVIF_CHECKERR(extent != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  9.57M|    do {                        \
  |  |   46|  9.57M|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 9.57M]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  9.57M|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 9.57M]
  |  |  ------------------
  ------------------
 2079|  9.57M|            if (extentOffset > UINT64_MAX - baseOffset) {
  ------------------
  |  Branch (2079:17): [True: 1, False: 9.57M]
  ------------------
 2080|      1|                avifDiagnosticsPrintf(diag,
 2081|      1|                                      "Item ID [%u] contains an extent offset which overflows: [base: %" PRIu64 " offset:%" PRIu64 "]",
 2082|      1|                                      itemID,
 2083|      1|                                      baseOffset,
 2084|      1|                                      extentOffset);
 2085|      1|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 2086|      1|            }
 2087|  9.57M|            uint64_t offset = baseOffset + extentOffset;
 2088|  9.57M|            extent->offset = offset;
 2089|       |#if UINT64_MAX > SIZE_MAX
 2090|       |            if (extentLength > SIZE_MAX) {
 2091|       |                avifDiagnosticsPrintf(diag, "Item ID [%u] contains an extent length which overflows: [%" PRIu64 "]", itemID, extentLength);
 2092|       |                return AVIF_RESULT_BMFF_PARSE_FAILED;
 2093|       |            }
 2094|       |#endif
 2095|  9.57M|            extent->size = (size_t)extentLength;
 2096|  9.57M|            if (extent->size > SIZE_MAX - item->size) {
  ------------------
  |  Branch (2096:17): [True: 16, False: 9.57M]
  ------------------
 2097|     16|                avifDiagnosticsPrintf(diag,
 2098|     16|                                      "Item ID [%u] contains an extent length which overflows the item size: [%zu, %zu]",
 2099|     16|                                      itemID,
 2100|     16|                                      extent->size,
 2101|     16|                                      item->size);
 2102|     16|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 2103|     16|            }
 2104|  9.57M|            item->size += extent->size;
 2105|  9.57M|        }
 2106|  46.6k|    }
 2107|  19.7k|    return AVIF_RESULT_OK;
 2108|  20.0k|}
read.c:avifCheckItemID:
  890|   180k|{
  891|       |    // Section 8.11.1.1 of ISO/IEC 14496-12 about MetaBox definition:
  892|       |    //   The item_ID value of 0 should not be used
  893|       |    // Section 8.11.6 of ISO/IEC 14496-12 about ItemInfoEntry syntax and semantics:
  894|       |    //   item_ID contains either 0 for the primary resource (e.g. the XML contained in an XMLBox)
  895|       |    //   or the ID of the item for which the following information is defined.
  896|       |    // Assuming 'infe' is the only way to properly define an item in AVIF, a compliant item cannot have an ID of zero.
  897|       |    // One way to bypass that rule would be to have 'infe' with item_ID being 0, referring to "the primary resource",
  898|       |    // and 'pitm' defining "the primary resource" as the item with an item_ID of 0. libavif considers that as invalid.
  899|   180k|    if (itemID == 0) {
  ------------------
  |  Branch (899:9): [True: 35, False: 180k]
  ------------------
  900|     35|        avifDiagnosticsPrintf(diag, "Box[%.4s] has an invalid item ID [%u]", boxFourcc, itemID);
  901|     35|        return AVIF_RESULT_BMFF_PARSE_FAILED;
  902|     35|    }
  903|   180k|    return AVIF_RESULT_OK;
  904|   180k|}
read.c:avifParsePrimaryItemBox:
 3152|  19.4k|{
 3153|  19.4k|    if (meta->primaryItemID > 0) {
  ------------------
  |  Branch (3153:9): [True: 0, False: 19.4k]
  ------------------
 3154|       |        // Illegal to have multiple pitm boxes, bail out
 3155|      0|        avifDiagnosticsPrintf(diag, "Multiple boxes of unique Box[pitm] found");
 3156|      0|        return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
 3157|      0|    }
 3158|       |
 3159|  19.4k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[pitm]");
  ------------------
  |  |  738|  19.4k|    avifROStream VARNAME;                               \
  |  |  739|  19.4k|    avifROData VARNAME##_roData;                        \
  |  |  740|  19.4k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  19.4k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  19.4k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3160|       |
 3161|  19.4k|    uint8_t version;
 3162|  19.4k|    AVIF_CHECK(avifROStreamReadVersionAndFlags(&s, &version, NULL));
  ------------------
  |  |   36|  19.4k|    do {                        \
  |  |   37|  19.4k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 19.4k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  19.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 19.4k]
  |  |  ------------------
  ------------------
 3163|       |
 3164|  19.4k|    if (version == 0) {
  ------------------
  |  Branch (3164:9): [True: 19.4k, False: 9]
  ------------------
 3165|  19.4k|        uint16_t tmp16;
 3166|  19.4k|        AVIF_CHECK(avifROStreamReadU16(&s, &tmp16)); // unsigned int(16) item_ID;
  ------------------
  |  |   36|  19.4k|    do {                        \
  |  |   37|  19.4k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 19.4k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  19.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 19.4k]
  |  |  ------------------
  ------------------
 3167|  19.4k|        meta->primaryItemID = tmp16;
 3168|  19.4k|    } else {
 3169|      9|        AVIF_CHECK(avifROStreamReadU32(&s, &meta->primaryItemID)); // unsigned int(32) item_ID;
  ------------------
  |  |   36|      9|    do {                        \
  |  |   37|      9|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 8]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|      9|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 8]
  |  |  ------------------
  ------------------
 3170|      9|    }
 3171|  19.4k|    return AVIF_TRUE;
  ------------------
  |  |   88|  19.4k|#define AVIF_TRUE 1
  ------------------
 3172|  19.4k|}
read.c:avifParseItemDataBox:
 3175|  2.96k|{
 3176|       |    // Check to see if we've already seen an idat box for this meta box. If so, bail out
 3177|  2.96k|    if (meta->idat.size > 0) {
  ------------------
  |  Branch (3177:9): [True: 0, False: 2.96k]
  ------------------
 3178|      0|        avifDiagnosticsPrintf(diag, "Meta box contains multiple idat boxes");
 3179|      0|        return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
 3180|      0|    }
 3181|  2.96k|    if (rawLen == 0) {
  ------------------
  |  Branch (3181:9): [True: 1, False: 2.96k]
  ------------------
 3182|      1|        avifDiagnosticsPrintf(diag, "idat box has a length of 0");
 3183|      1|        return AVIF_FALSE;
  ------------------
  |  |   89|      1|#define AVIF_FALSE 0
  ------------------
 3184|      1|    }
 3185|       |
 3186|  2.96k|    if (avifRWDataSet(&meta->idat, raw, rawLen) != AVIF_RESULT_OK) {
  ------------------
  |  Branch (3186:9): [True: 0, False: 2.96k]
  ------------------
 3187|      0|        return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
 3188|      0|    }
 3189|  2.96k|    return AVIF_TRUE;
  ------------------
  |  |   88|  2.96k|#define AVIF_TRUE 1
  ------------------
 3190|  2.96k|}
read.c:avifParseItemPropertiesBox:
 3193|  19.3k|{
 3194|  19.3k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[iprp]");
  ------------------
  |  |  738|  19.3k|    avifROStream VARNAME;                               \
  |  |  739|  19.3k|    avifROData VARNAME##_roData;                        \
  |  |  740|  19.3k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  19.3k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  19.3k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3195|       |
 3196|  19.3k|    avifBoxHeader ipcoHeader;
 3197|  19.3k|    AVIF_CHECKERR(avifROStreamReadBoxHeader(&s, &ipcoHeader), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  19.3k|    do {                        \
  |  |   46|  19.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 19.3k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  19.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19.3k]
  |  |  ------------------
  ------------------
 3198|  19.3k|    if (memcmp(ipcoHeader.type, "ipco", 4)) {
  ------------------
  |  Branch (3198:9): [True: 1, False: 19.3k]
  ------------------
 3199|      1|        avifDiagnosticsPrintf(diag, "Failed to find Box[ipco] as the first box in Box[iprp]");
 3200|      1|        return AVIF_RESULT_BMFF_PARSE_FAILED;
 3201|      1|    }
 3202|       |
 3203|       |    // Read all item properties inside of ItemPropertyContainerBox
 3204|  19.3k|    AVIF_CHECKRES(avifParseItemPropertyContainerBox(&meta->properties,
  ------------------
  |  |   54|  19.3k|    do {                                  \
  |  |   55|  19.3k|        const avifResult result__ = (A);  \
  |  |   56|  19.3k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 67, False: 19.2k]
  |  |  ------------------
  |  |   57|     67|            avifBreakOnError();           \
  |  |   58|     67|            return result__;              \
  |  |   59|     67|        }                                 \
  |  |   60|  19.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 19.2k]
  |  |  ------------------
  ------------------
 3205|  19.3k|                                                    rawOffset + avifROStreamOffset(&s),
 3206|  19.3k|                                                    avifROStreamCurrent(&s),
 3207|  19.3k|                                                    ipcoHeader.size,
 3208|  19.3k|                                                    /*isTrack=*/AVIF_FALSE,
 3209|  19.3k|                                                    diag));
 3210|  19.2k|    AVIF_CHECKERR(avifROStreamSkip(&s, ipcoHeader.size), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  19.2k|    do {                        \
  |  |   46|  19.2k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 19.2k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  19.2k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19.2k]
  |  |  ------------------
  ------------------
 3211|       |
 3212|  19.2k|    uint32_t versionAndFlagsSeen[MAX_IPMA_VERSION_AND_FLAGS_SEEN];
 3213|  19.2k|    uint32_t versionAndFlagsSeenCount = 0;
 3214|       |
 3215|       |    // Now read all ItemPropertyAssociation until the end of the box, and make associations
 3216|  38.3k|    while (avifROStreamHasBytesLeft(&s, 1)) {
  ------------------
  |  Branch (3216:12): [True: 19.2k, False: 19.0k]
  ------------------
 3217|  19.2k|        avifBoxHeader ipmaHeader;
 3218|  19.2k|        AVIF_CHECKERR(avifROStreamReadBoxHeader(&s, &ipmaHeader), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  19.2k|    do {                        \
  |  |   46|  19.2k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 10, False: 19.2k]
  |  |  ------------------
  |  |   47|     10|            avifBreakOnError(); \
  |  |   48|     10|            return ERR;         \
  |  |   49|     10|        }                       \
  |  |   50|  19.2k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19.2k]
  |  |  ------------------
  ------------------
 3219|       |
 3220|  19.2k|        if (!memcmp(ipmaHeader.type, "ipma", 4)) {
  ------------------
  |  Branch (3220:13): [True: 19.2k, False: 1]
  ------------------
 3221|  19.2k|            uint32_t versionAndFlags;
 3222|  19.2k|            AVIF_CHECKRES(avifParseItemPropertyAssociation(meta, avifROStreamCurrent(&s), ipmaHeader.size, diag, &versionAndFlags));
  ------------------
  |  |   54|  19.2k|    do {                                  \
  |  |   55|  19.2k|        const avifResult result__ = (A);  \
  |  |   56|  19.2k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 129, False: 19.1k]
  |  |  ------------------
  |  |   57|    129|            avifBreakOnError();           \
  |  |   58|    129|            return result__;              \
  |  |   59|    129|        }                                 \
  |  |   60|  19.2k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 19.1k]
  |  |  ------------------
  ------------------
 3223|  19.1k|            for (uint32_t i = 0; i < versionAndFlagsSeenCount; ++i) {
  ------------------
  |  Branch (3223:34): [True: 0, False: 19.1k]
  ------------------
 3224|      0|                if (versionAndFlagsSeen[i] == versionAndFlags) {
  ------------------
  |  Branch (3224:21): [True: 0, False: 0]
  ------------------
 3225|       |                    // BMFF (ISO/IEC 14496-12:2022) 8.11.14.1 - There shall be at most one
 3226|       |                    // ItemPropertyAssociationBox with a given pair of values of version and
 3227|       |                    // flags.
 3228|      0|                    avifDiagnosticsPrintf(diag, "Multiple Box[ipma] with a given pair of values of version and flags. See BMFF (ISO/IEC 14496-12:2022) 8.11.14.1");
 3229|      0|                    return AVIF_RESULT_BMFF_PARSE_FAILED;
 3230|      0|                }
 3231|      0|            }
 3232|  19.1k|            if (versionAndFlagsSeenCount == MAX_IPMA_VERSION_AND_FLAGS_SEEN) {
  ------------------
  |  |   39|  19.1k|#define MAX_IPMA_VERSION_AND_FLAGS_SEEN 4
  ------------------
  |  Branch (3232:17): [True: 0, False: 19.1k]
  ------------------
 3233|      0|                avifDiagnosticsPrintf(diag, "Exceeded possible count of unique ipma version and flags tuples");
 3234|      0|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 3235|      0|            }
 3236|  19.1k|            versionAndFlagsSeen[versionAndFlagsSeenCount] = versionAndFlags;
 3237|  19.1k|            ++versionAndFlagsSeenCount;
 3238|  19.1k|        } else {
 3239|       |            // These must all be type ipma
 3240|      1|            avifDiagnosticsPrintf(diag, "Box[iprp] contains a box that isn't type 'ipma'");
 3241|      1|            return AVIF_RESULT_BMFF_PARSE_FAILED;
 3242|      1|        }
 3243|       |
 3244|  19.1k|        AVIF_CHECKERR(avifROStreamSkip(&s, ipmaHeader.size), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  19.1k|    do {                        \
  |  |   46|  19.1k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 19.1k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  19.1k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19.1k]
  |  |  ------------------
  ------------------
 3245|  19.1k|    }
 3246|  19.0k|    return AVIF_RESULT_OK;
 3247|  19.2k|}
read.c:avifParseItemPropertyContainerBox:
 2922|  22.1k|{
 2923|  22.1k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[ipco]");
  ------------------
  |  |  738|  22.1k|    avifROStream VARNAME;                               \
  |  |  739|  22.1k|    avifROData VARNAME##_roData;                        \
  |  |  740|  22.1k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  22.1k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  22.1k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 2924|       |
 2925|   168k|    while (avifROStreamHasBytesLeft(&s, 1)) {
  ------------------
  |  Branch (2925:12): [True: 146k, False: 22.1k]
  ------------------
 2926|   146k|        avifBoxHeader header;
 2927|   146k|        AVIF_CHECKERR(avifROStreamReadBoxHeader(&s, &header), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|   146k|    do {                        \
  |  |   46|   146k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 20, False: 146k]
  |  |  ------------------
  |  |   47|     20|            avifBreakOnError(); \
  |  |   48|     20|            return ERR;         \
  |  |   49|     20|        }                       \
  |  |   50|   146k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 146k]
  |  |  ------------------
  ------------------
 2928|       |
 2929|   146k|        avifProperty * prop = (avifProperty *)avifArrayPush(properties);
 2930|   146k|        AVIF_CHECKERR(prop != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|   146k|    do {                        \
  |  |   46|   146k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 146k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|   146k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 146k]
  |  |  ------------------
  ------------------
 2931|   146k|        memcpy(prop->type, header.type, 4);
 2932|   146k|        prop->isOpaque = AVIF_FALSE;
  ------------------
  |  |   89|   146k|#define AVIF_FALSE 0
  ------------------
 2933|   146k|        if (!memcmp(header.type, "ispe", 4)) {
  ------------------
  |  Branch (2933:13): [True: 23.2k, False: 123k]
  ------------------
 2934|  23.2k|            AVIF_CHECKERR(avifParseImageSpatialExtentsProperty(prop, avifROStreamCurrent(&s), header.size, diag),
  ------------------
  |  |   45|  23.2k|    do {                        \
  |  |   46|  23.2k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 3, False: 23.2k]
  |  |  ------------------
  |  |   47|      3|            avifBreakOnError(); \
  |  |   48|      3|            return ERR;         \
  |  |   49|      3|        }                       \
  |  |   50|  23.2k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 23.2k]
  |  |  ------------------
  ------------------
 2935|  23.2k|                          AVIF_RESULT_BMFF_PARSE_FAILED);
 2936|   123k|        } else if ((!memcmp(header.type, "auxC", 4) && !isTrack) || (!memcmp(header.type, "auxi", 4) && isTrack)) {
  ------------------
  |  Branch (2936:21): [True: 10.1k, False: 113k]
  |  Branch (2936:56): [True: 10.1k, False: 1]
  |  Branch (2936:70): [True: 605, False: 112k]
  |  Branch (2936:105): [True: 603, False: 2]
  ------------------
 2937|  10.7k|            AVIF_CHECKERR(avifParseAuxiliaryTypeProperty(prop, avifROStreamCurrent(&s), header.size, diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  10.7k|    do {                        \
  |  |   46|  10.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 10.7k]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|  10.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 10.7k]
  |  |  ------------------
  ------------------
 2938|   112k|        } else if (!memcmp(header.type, "colr", 4)) {
  ------------------
  |  Branch (2938:20): [True: 20.1k, False: 92.3k]
  ------------------
 2939|  20.1k|            AVIF_CHECKERR(avifParseColourInformationBox(prop, rawOffset + avifROStreamOffset(&s), avifROStreamCurrent(&s), header.size, diag),
  ------------------
  |  |   45|  20.1k|    do {                        \
  |  |   46|  20.1k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 6, False: 20.1k]
  |  |  ------------------
  |  |   47|      6|            avifBreakOnError(); \
  |  |   48|      6|            return ERR;         \
  |  |   49|      6|        }                       \
  |  |   50|  20.1k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 20.1k]
  |  |  ------------------
  ------------------
 2940|  20.1k|                          AVIF_RESULT_BMFF_PARSE_FAILED);
 2941|  92.3k|        } else if (!memcmp(header.type, "av1C", 4)) {
  ------------------
  |  Branch (2941:20): [True: 33.3k, False: 58.9k]
  ------------------
 2942|  33.3k|            AVIF_CHECKERR(avifParseCodecConfigurationBoxProperty(prop, avifROStreamCurrent(&s), header.size, "av1C", diag),
  ------------------
  |  |   45|  33.3k|    do {                        \
  |  |   46|  33.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 6, False: 33.3k]
  |  |  ------------------
  |  |   47|      6|            avifBreakOnError(); \
  |  |   48|      6|            return ERR;         \
  |  |   49|      6|        }                       \
  |  |   50|  33.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 33.3k]
  |  |  ------------------
  ------------------
 2943|  33.3k|                          AVIF_RESULT_BMFF_PARSE_FAILED);
 2944|       |#if defined(AVIF_CODEC_AVM)
 2945|       |        } else if (!memcmp(header.type, "av2C", 4)) {
 2946|       |            AVIF_CHECKERR(avifParseCodecConfigurationBoxProperty(prop, avifROStreamCurrent(&s), header.size, "av2C", diag),
 2947|       |                          AVIF_RESULT_BMFF_PARSE_FAILED);
 2948|       |#endif
 2949|  58.9k|        } else if (!memcmp(header.type, "pasp", 4)) {
  ------------------
  |  Branch (2949:20): [True: 0, False: 58.9k]
  ------------------
 2950|      0|            AVIF_CHECKERR(avifParsePixelAspectRatioBoxProperty(prop, avifROStreamCurrent(&s), header.size, diag),
  ------------------
  |  |   45|      0|    do {                        \
  |  |   46|      0|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 2951|      0|                          AVIF_RESULT_BMFF_PARSE_FAILED);
 2952|  58.9k|        } else if (!memcmp(header.type, "clap", 4)) {
  ------------------
  |  Branch (2952:20): [True: 12, False: 58.9k]
  ------------------
 2953|     12|            AVIF_CHECKERR(avifParseCleanApertureBoxProperty(prop, avifROStreamCurrent(&s), header.size, diag),
  ------------------
  |  |   45|     12|    do {                        \
  |  |   46|     12|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 6, False: 6]
  |  |  ------------------
  |  |   47|      6|            avifBreakOnError(); \
  |  |   48|      6|            return ERR;         \
  |  |   49|      6|        }                       \
  |  |   50|     12|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 6]
  |  |  ------------------
  ------------------
 2954|     12|                          AVIF_RESULT_BMFF_PARSE_FAILED);
 2955|  58.9k|        } else if (!memcmp(header.type, "irot", 4)) {
  ------------------
  |  Branch (2955:20): [True: 1.38k, False: 57.5k]
  ------------------
 2956|  1.38k|            AVIF_CHECKERR(avifParseImageRotationProperty(prop, avifROStreamCurrent(&s), header.size, diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  1.38k|    do {                        \
  |  |   46|  1.38k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 1.38k]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|  1.38k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 1.38k]
  |  |  ------------------
  ------------------
 2957|  57.5k|        } else if (!memcmp(header.type, "imir", 4)) {
  ------------------
  |  Branch (2957:20): [True: 6, False: 57.5k]
  ------------------
 2958|      6|            AVIF_CHECKERR(avifParseImageMirrorProperty(prop, avifROStreamCurrent(&s), header.size, diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|      6|    do {                        \
  |  |   46|      6|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 4]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|      6|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 4]
  |  |  ------------------
  ------------------
 2959|  57.5k|        } else if (!memcmp(header.type, "pixi", 4)) {
  ------------------
  |  Branch (2959:20): [True: 31.4k, False: 26.1k]
  ------------------
 2960|  31.4k|            AVIF_CHECKRES(avifParsePixelInformationProperty(prop, avifROStreamCurrent(&s), header.size, diag));
  ------------------
  |  |   54|  31.4k|    do {                                  \
  |  |   55|  31.4k|        const avifResult result__ = (A);  \
  |  |   56|  31.4k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 16, False: 31.4k]
  |  |  ------------------
  |  |   57|     16|            avifBreakOnError();           \
  |  |   58|     16|            return result__;              \
  |  |   59|     16|        }                                 \
  |  |   60|  31.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 31.4k]
  |  |  ------------------
  ------------------
 2961|  31.4k|        } else if (!memcmp(header.type, "a1op", 4)) {
  ------------------
  |  Branch (2961:20): [True: 2, False: 26.1k]
  ------------------
 2962|      2|            AVIF_CHECKERR(avifParseOperatingPointSelectorProperty(prop, avifROStreamCurrent(&s), header.size, diag),
  ------------------
  |  |   45|      2|    do {                        \
  |  |   46|      2|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 1]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|      2|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 1]
  |  |  ------------------
  ------------------
 2963|      2|                          AVIF_RESULT_BMFF_PARSE_FAILED);
 2964|  26.1k|        } else if (!memcmp(header.type, "lsel", 4)) {
  ------------------
  |  Branch (2964:20): [True: 0, False: 26.1k]
  ------------------
 2965|      0|            AVIF_CHECKERR(avifParseLayerSelectorProperty(prop, avifROStreamCurrent(&s), header.size, diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|      0|    do {                        \
  |  |   46|      0|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 2966|  26.1k|        } else if (!memcmp(header.type, "a1lx", 4)) {
  ------------------
  |  Branch (2966:20): [True: 4.23k, False: 21.8k]
  ------------------
 2967|  4.23k|            AVIF_CHECKERR(avifParseAV1LayeredImageIndexingProperty(prop, avifROStreamCurrent(&s), header.size, diag),
  ------------------
  |  |   45|  4.23k|    do {                        \
  |  |   46|  4.23k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 8, False: 4.22k]
  |  |  ------------------
  |  |   47|      8|            avifBreakOnError(); \
  |  |   48|      8|            return ERR;         \
  |  |   49|      8|        }                       \
  |  |   50|  4.23k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 4.22k]
  |  |  ------------------
  ------------------
 2968|  4.23k|                          AVIF_RESULT_BMFF_PARSE_FAILED);
 2969|  21.8k|        } else if (!memcmp(header.type, "clli", 4)) {
  ------------------
  |  Branch (2969:20): [True: 470, False: 21.4k]
  ------------------
 2970|    470|            AVIF_CHECKRES(avifParseContentLightLevelInformationBox(prop, avifROStreamCurrent(&s), header.size, diag));
  ------------------
  |  |   54|    470|    do {                                  \
  |  |   55|    470|        const avifResult result__ = (A);  \
  |  |   56|    470|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 3, False: 467]
  |  |  ------------------
  |  |   57|      3|            avifBreakOnError();           \
  |  |   58|      3|            return result__;              \
  |  |   59|      3|        }                                 \
  |  |   60|    470|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 467]
  |  |  ------------------
  ------------------
 2971|  21.4k|        } else {
 2972|  21.4k|            prop->isOpaque = AVIF_TRUE;
  ------------------
  |  |   88|  21.4k|#define AVIF_TRUE 1
  ------------------
 2973|  21.4k|            memset(&prop->u.opaque, 0, sizeof(prop->u.opaque));
 2974|  21.4k|            memcpy(prop->u.opaque.usertype, header.usertype, sizeof(prop->u.opaque.usertype));
 2975|  21.4k|            AVIF_CHECKRES(avifRWDataSet(&prop->u.opaque.boxPayload, avifROStreamCurrent(&s), header.size));
  ------------------
  |  |   54|  21.4k|    do {                                  \
  |  |   55|  21.4k|        const avifResult result__ = (A);  \
  |  |   56|  21.4k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 21.4k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  21.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 21.4k]
  |  |  ------------------
  ------------------
 2976|  21.4k|        }
 2977|       |
 2978|   146k|        AVIF_CHECKERR(avifROStreamSkip(&s, header.size), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|   146k|    do {                        \
  |  |   46|   146k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 146k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|   146k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 146k]
  |  |  ------------------
  ------------------
 2979|   146k|    }
 2980|  22.1k|    return AVIF_RESULT_OK;
 2981|  22.1k|}
read.c:avifParseImageSpatialExtentsProperty:
 2475|  23.2k|{
 2476|  23.2k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[ispe]");
  ------------------
  |  |  738|  23.2k|    avifROStream VARNAME;                               \
  |  |  739|  23.2k|    avifROData VARNAME##_roData;                        \
  |  |  740|  23.2k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  23.2k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  23.2k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 2477|  23.2k|    AVIF_CHECK(avifROStreamReadAndEnforceVersion(&s, /*enforcedVersion=*/0, /*flags=*/NULL));
  ------------------
  |  |   36|  23.2k|    do {                        \
  |  |   37|  23.2k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 23.2k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  23.2k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 23.2k]
  |  |  ------------------
  ------------------
 2478|       |
 2479|  23.2k|    avifImageSpatialExtents * ispe = &prop->u.ispe;
 2480|  23.2k|    AVIF_CHECK(avifROStreamReadU32(&s, &ispe->width));
  ------------------
  |  |   36|  23.2k|    do {                        \
  |  |   37|  23.2k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 23.2k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  23.2k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 23.2k]
  |  |  ------------------
  ------------------
 2481|  23.2k|    AVIF_CHECK(avifROStreamReadU32(&s, &ispe->height));
  ------------------
  |  |   36|  23.2k|    do {                        \
  |  |   37|  23.2k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 23.2k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  23.2k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 23.2k]
  |  |  ------------------
  ------------------
 2482|  23.2k|    return AVIF_TRUE;
  ------------------
  |  |   88|  23.2k|#define AVIF_TRUE 1
  ------------------
 2483|  23.2k|}
read.c:avifParseAuxiliaryTypeProperty:
 2486|  10.7k|{
 2487|  10.7k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[auxC]");
  ------------------
  |  |  738|  10.7k|    avifROStream VARNAME;                               \
  |  |  739|  10.7k|    avifROData VARNAME##_roData;                        \
  |  |  740|  10.7k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  10.7k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  10.7k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 2488|  10.7k|    AVIF_CHECK(avifROStreamReadAndEnforceVersion(&s, /*enforcedVersion=*/0, /*flags=*/NULL));
  ------------------
  |  |   36|  10.7k|    do {                        \
  |  |   37|  10.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 10.7k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  10.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 10.7k]
  |  |  ------------------
  ------------------
 2489|       |
 2490|  10.7k|    AVIF_CHECK(avifROStreamReadString(&s, prop->u.auxC.auxType, AUXTYPE_SIZE));
  ------------------
  |  |   36|  10.7k|    do {                        \
  |  |   37|  10.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 10.7k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  10.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 10.7k]
  |  |  ------------------
  ------------------
 2491|  10.7k|    return AVIF_TRUE;
  ------------------
  |  |   88|  10.7k|#define AVIF_TRUE 1
  ------------------
 2492|  10.7k|}
read.c:avifParseColourInformationBox:
 2495|  20.1k|{
 2496|  20.1k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[colr]");
  ------------------
  |  |  738|  20.1k|    avifROStream VARNAME;                               \
  |  |  739|  20.1k|    avifROData VARNAME##_roData;                        \
  |  |  740|  20.1k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  20.1k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  20.1k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 2497|       |
 2498|  20.1k|    avifColourInformationBox * colr = &prop->u.colr;
 2499|  20.1k|    colr->hasICC = AVIF_FALSE;
  ------------------
  |  |   89|  20.1k|#define AVIF_FALSE 0
  ------------------
 2500|  20.1k|    colr->hasNCLX = AVIF_FALSE;
  ------------------
  |  |   89|  20.1k|#define AVIF_FALSE 0
  ------------------
 2501|       |
 2502|  20.1k|    uint8_t colorType[4]; // unsigned int(32) colour_type;
 2503|  20.1k|    AVIF_CHECK(avifROStreamRead(&s, colorType, 4));
  ------------------
  |  |   36|  20.1k|    do {                        \
  |  |   37|  20.1k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 20.1k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  20.1k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 20.1k]
  |  |  ------------------
  ------------------
 2504|  20.1k|    if (!memcmp(colorType, "rICC", 4) || !memcmp(colorType, "prof", 4)) {
  ------------------
  |  Branch (2504:9): [True: 0, False: 20.1k]
  |  Branch (2504:42): [True: 61, False: 20.0k]
  ------------------
 2505|       |        // Remember the offset of the ICC payload relative to the beginning of the stream. A direct pointer cannot be stored
 2506|       |        // because decoder->io->persistent could have been AVIF_FALSE when obtaining raw through decoder->io->read().
 2507|       |        // The bytes could be copied now instead of remembering the offset, but it is as invasive as passing rawOffset everywhere.
 2508|     61|        colr->iccOffset = rawOffset + avifROStreamOffset(&s);
 2509|     61|        colr->iccSize = avifROStreamRemainingBytes(&s);
 2510|     61|        if (colr->iccSize == 0) {
  ------------------
  |  Branch (2510:13): [True: 0, False: 61]
  ------------------
 2511|      0|            avifDiagnosticsPrintf(diag, "Box[colr] contains empty ICC_profile");
 2512|      0|            return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
 2513|      0|        }
 2514|     61|        colr->hasICC = AVIF_TRUE;
  ------------------
  |  |   88|     61|#define AVIF_TRUE 1
  ------------------
 2515|  20.0k|    } else if (!memcmp(colorType, "nclx", 4)) {
  ------------------
  |  Branch (2515:16): [True: 17.4k, False: 2.57k]
  ------------------
 2516|  17.4k|        AVIF_CHECK(avifROStreamReadU16(&s, &colr->colorPrimaries));          // unsigned int(16) colour_primaries;
  ------------------
  |  |   36|  17.4k|    do {                        \
  |  |   37|  17.4k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 17.4k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  17.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 17.4k]
  |  |  ------------------
  ------------------
 2517|  17.4k|        AVIF_CHECK(avifROStreamReadU16(&s, &colr->transferCharacteristics)); // unsigned int(16) transfer_characteristics;
  ------------------
  |  |   36|  17.4k|    do {                        \
  |  |   37|  17.4k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 17.4k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  17.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 17.4k]
  |  |  ------------------
  ------------------
 2518|  17.4k|        AVIF_CHECK(avifROStreamReadU16(&s, &colr->matrixCoefficients));      // unsigned int(16) matrix_coefficients;
  ------------------
  |  |   36|  17.4k|    do {                        \
  |  |   37|  17.4k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 17.4k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  17.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 17.4k]
  |  |  ------------------
  ------------------
 2519|  17.4k|        uint8_t full_range_flag;
 2520|  17.4k|        AVIF_CHECK(avifROStreamReadBitsU8(&s, &full_range_flag, /*bitCount=*/1)); // unsigned int(1) full_range_flag;
  ------------------
  |  |   36|  17.4k|    do {                        \
  |  |   37|  17.4k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 17.4k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  17.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 17.4k]
  |  |  ------------------
  ------------------
 2521|  17.4k|        colr->range = full_range_flag ? AVIF_RANGE_FULL : AVIF_RANGE_LIMITED;
  ------------------
  |  Branch (2521:23): [True: 16.6k, False: 800]
  ------------------
 2522|  17.4k|        uint8_t reserved;
 2523|  17.4k|        AVIF_CHECK(avifROStreamReadBitsU8(&s, &reserved, /*bitCount=*/7)); // unsigned int(7) reserved = 0;
  ------------------
  |  |   36|  17.4k|    do {                        \
  |  |   37|  17.4k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 17.4k]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|  17.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 17.4k]
  |  |  ------------------
  ------------------
 2524|  17.4k|        if (reserved) {
  ------------------
  |  Branch (2524:13): [True: 1, False: 17.4k]
  ------------------
 2525|      1|            avifDiagnosticsPrintf(diag, "Box[colr] contains nonzero reserved bits [%u]", reserved);
 2526|      1|            return AVIF_FALSE;
  ------------------
  |  |   89|      1|#define AVIF_FALSE 0
  ------------------
 2527|      1|        }
 2528|  17.4k|        colr->hasNCLX = AVIF_TRUE;
  ------------------
  |  |   88|  17.4k|#define AVIF_TRUE 1
  ------------------
 2529|  17.4k|    }
 2530|  20.1k|    return AVIF_TRUE;
  ------------------
  |  |   88|  20.1k|#define AVIF_TRUE 1
  ------------------
 2531|  20.1k|}
read.c:avifParseCodecConfigurationBoxProperty:
 2706|  33.3k|{
 2707|  33.3k|    char diagContext[10];
 2708|  33.3k|    snprintf(diagContext, sizeof(diagContext), "Box[%.4s]", configPropName); // "Box[av1C]" or "Box[av2C]"
 2709|  33.3k|    BEGIN_STREAM(s, raw, rawLen, diag, diagContext);
  ------------------
  |  |  738|  33.3k|    avifROStream VARNAME;                               \
  |  |  739|  33.3k|    avifROData VARNAME##_roData;                        \
  |  |  740|  33.3k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  33.3k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  33.3k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 2710|  33.3k|    return avifParseCodecConfiguration(&s, &prop->u.av1C, configPropName, diag);
 2711|  33.3k|}
read.c:avifParseCodecConfiguration:
 2652|  33.3k|{
 2653|  33.3k|    const size_t av1COffset = avifROStreamOffset(s);
 2654|       |
 2655|  33.3k|    uint32_t marker, version;
 2656|  33.3k|    AVIF_CHECK(avifROStreamReadBitsU32(s, &marker, /*bitCount=*/1)); // unsigned int (1) marker = 1;
  ------------------
  |  |   36|  33.3k|    do {                        \
  |  |   37|  33.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 33.3k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  33.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 33.3k]
  |  |  ------------------
  ------------------
 2657|  33.3k|    if (!marker) {
  ------------------
  |  Branch (2657:9): [True: 1, False: 33.3k]
  ------------------
 2658|      1|        avifDiagnosticsPrintf(diag, "%.4s contains illegal marker: [%u]", configPropName, marker);
 2659|      1|        return AVIF_FALSE;
  ------------------
  |  |   89|      1|#define AVIF_FALSE 0
  ------------------
 2660|      1|    }
 2661|  33.3k|    AVIF_CHECK(avifROStreamReadBitsU32(s, &version, /*bitCount=*/7)); // unsigned int (7) version = 1;
  ------------------
  |  |   36|  33.3k|    do {                        \
  |  |   37|  33.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 33.3k]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|  33.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 33.3k]
  |  |  ------------------
  ------------------
 2662|  33.3k|    if (version != 1) {
  ------------------
  |  Branch (2662:9): [True: 1, False: 33.3k]
  ------------------
 2663|      1|        avifDiagnosticsPrintf(diag, "%.4s contains illegal version: [%u]", configPropName, version);
 2664|      1|        return AVIF_FALSE;
  ------------------
  |  |   89|      1|#define AVIF_FALSE 0
  ------------------
 2665|      1|    }
 2666|       |
 2667|  33.3k|    AVIF_CHECK(avifROStreamReadBitsU8(s, &config->seqProfile, /*bitCount=*/3));         // unsigned int (3) seq_profile;
  ------------------
  |  |   36|  33.3k|    do {                        \
  |  |   37|  33.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 33.3k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  33.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 33.3k]
  |  |  ------------------
  ------------------
 2668|  33.3k|    AVIF_CHECK(avifROStreamReadBitsU8(s, &config->seqLevelIdx0, /*bitCount=*/5));       // unsigned int (5) seq_level_idx_0;
  ------------------
  |  |   36|  33.3k|    do {                        \
  |  |   37|  33.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 33.3k]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|  33.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 33.3k]
  |  |  ------------------
  ------------------
 2669|  33.3k|    AVIF_CHECK(avifROStreamReadBitsU8(s, &config->seqTier0, /*bitCount=*/1));           // unsigned int (1) seq_tier_0;
  ------------------
  |  |   36|  33.3k|    do {                        \
  |  |   37|  33.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 33.3k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  33.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 33.3k]
  |  |  ------------------
  ------------------
 2670|  33.3k|    AVIF_CHECK(avifROStreamReadBitsU8(s, &config->highBitdepth, /*bitCount=*/1));       // unsigned int (1) high_bitdepth;
  ------------------
  |  |   36|  33.3k|    do {                        \
  |  |   37|  33.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 33.3k]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|  33.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 33.3k]
  |  |  ------------------
  ------------------
 2671|  33.3k|    AVIF_CHECK(avifROStreamReadBitsU8(s, &config->twelveBit, /*bitCount=*/1));          // unsigned int (1) twelve_bit;
  ------------------
  |  |   36|  33.3k|    do {                        \
  |  |   37|  33.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 33.3k]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|  33.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 33.3k]
  |  |  ------------------
  ------------------
 2672|  33.3k|    AVIF_CHECK(avifROStreamReadBitsU8(s, &config->monochrome, /*bitCount=*/1));         // unsigned int (1) monochrome;
  ------------------
  |  |   36|  33.3k|    do {                        \
  |  |   37|  33.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 33.3k]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|  33.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 33.3k]
  |  |  ------------------
  ------------------
 2673|  33.3k|    AVIF_CHECK(avifROStreamReadBitsU8(s, &config->chromaSubsamplingX, /*bitCount=*/1)); // unsigned int (1) chroma_subsampling_x;
  ------------------
  |  |   36|  33.3k|    do {                        \
  |  |   37|  33.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 33.3k]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|  33.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 33.3k]
  |  |  ------------------
  ------------------
 2674|  33.3k|    AVIF_CHECK(avifROStreamReadBitsU8(s, &config->chromaSubsamplingY, /*bitCount=*/1)); // unsigned int (1) chroma_subsampling_y;
  ------------------
  |  |   36|  33.3k|    do {                        \
  |  |   37|  33.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 33.3k]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|  33.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 33.3k]
  |  |  ------------------
  ------------------
 2675|  33.3k|    AVIF_CHECK(avifROStreamReadBitsU8(s, &config->chromaSamplePosition, /*bitCount=*/2)); // unsigned int (2) chroma_sample_position;
  ------------------
  |  |   36|  33.3k|    do {                        \
  |  |   37|  33.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 33.3k]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|  33.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 33.3k]
  |  |  ------------------
  ------------------
 2676|       |
 2677|       |    // unsigned int (3) reserved = 0;
 2678|       |    // unsigned int (1) initial_presentation_delay_present;
 2679|       |    // if (initial_presentation_delay_present) {
 2680|       |    //   unsigned int (4) initial_presentation_delay_minus_one;
 2681|       |    // } else {
 2682|       |    //   unsigned int (4) reserved = 0;
 2683|       |    // }
 2684|  33.3k|    AVIF_CHECK(avifROStreamSkip(s, /*byteCount=*/1));
  ------------------
  |  |   36|  33.3k|    do {                        \
  |  |   37|  33.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 33.3k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  33.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 33.3k]
  |  |  ------------------
  ------------------
 2685|       |
 2686|       |    // According to section 2.2.1 of AV1 Image File Format specification v1.1.0:
 2687|       |    //   - Sequence Header OBUs should not be present in the AV1CodecConfigurationBox.
 2688|       |    //   - If a Sequence Header OBU is present in the AV1CodecConfigurationBox,
 2689|       |    //     it shall match the Sequence Header OBU in the AV1 Image Item Data.
 2690|       |    //   - Metadata OBUs, if present, shall match the values given in other item properties,
 2691|       |    //     such as the PixelInformationProperty or ColourInformationBox.
 2692|       |    // See https://aomediacodec.github.io/av1-avif/v1.1.0.html#av1-configuration-item-property.
 2693|       |    // For simplicity, the constraints above are not enforced.
 2694|       |    // The following is skipped by avifParseItemPropertyContainerBox().
 2695|       |    // unsigned int (8) configOBUs[];
 2696|       |
 2697|  33.3k|    AVIF_CHECK(avifROStreamOffset(s) - av1COffset == 4); // Make sure avifParseCodecConfiguration() reads exactly 4 bytes.
  ------------------
  |  |   36|  33.3k|    do {                        \
  |  |   37|  33.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 33.3k]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|  33.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 33.3k]
  |  |  ------------------
  ------------------
 2698|  33.3k|    return AVIF_TRUE;
  ------------------
  |  |   88|  33.3k|#define AVIF_TRUE 1
  ------------------
 2699|  33.3k|}
read.c:avifParseCleanApertureBoxProperty:
 2724|     12|{
 2725|     12|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[clap]");
  ------------------
  |  |  738|     12|    avifROStream VARNAME;                               \
  |  |  739|     12|    avifROData VARNAME##_roData;                        \
  |  |  740|     12|    VARNAME##_roData.data = PTR;                        \
  |  |  741|     12|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|     12|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 2726|       |
 2727|     12|    avifCleanApertureBox * clap = &prop->u.clap;
 2728|     12|    AVIF_CHECK(avifROStreamReadU32(&s, &clap->widthN));    // unsigned int(32) cleanApertureWidthN;
  ------------------
  |  |   36|     12|    do {                        \
  |  |   37|     12|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 11]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|     12|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 11]
  |  |  ------------------
  ------------------
 2729|     11|    AVIF_CHECK(avifROStreamReadU32(&s, &clap->widthD));    // unsigned int(32) cleanApertureWidthD;
  ------------------
  |  |   36|     11|    do {                        \
  |  |   37|     11|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 11]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|     11|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 11]
  |  |  ------------------
  ------------------
 2730|     11|    AVIF_CHECK(avifROStreamReadU32(&s, &clap->heightN));   // unsigned int(32) cleanApertureHeightN;
  ------------------
  |  |   36|     11|    do {                        \
  |  |   37|     11|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 10]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|     11|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 10]
  |  |  ------------------
  ------------------
 2731|     10|    AVIF_CHECK(avifROStreamReadU32(&s, &clap->heightD));   // unsigned int(32) cleanApertureHeightD;
  ------------------
  |  |   36|     10|    do {                        \
  |  |   37|     10|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 10]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|     10|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 10]
  |  |  ------------------
  ------------------
 2732|     10|    AVIF_CHECK(avifROStreamReadU32(&s, &clap->horizOffN)); // unsigned int(32) horizOffN;
  ------------------
  |  |   36|     10|    do {                        \
  |  |   37|     10|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 9]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|     10|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 9]
  |  |  ------------------
  ------------------
 2733|      9|    AVIF_CHECK(avifROStreamReadU32(&s, &clap->horizOffD)); // unsigned int(32) horizOffD;
  ------------------
  |  |   36|      9|    do {                        \
  |  |   37|      9|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 8]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|      9|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 8]
  |  |  ------------------
  ------------------
 2734|      8|    AVIF_CHECK(avifROStreamReadU32(&s, &clap->vertOffN));  // unsigned int(32) vertOffN;
  ------------------
  |  |   36|      8|    do {                        \
  |  |   37|      8|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 7]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|      8|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 7]
  |  |  ------------------
  ------------------
 2735|      7|    AVIF_CHECK(avifROStreamReadU32(&s, &clap->vertOffD));  // unsigned int(32) vertOffD;
  ------------------
  |  |   36|      7|    do {                        \
  |  |   37|      7|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 6]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|      7|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 6]
  |  |  ------------------
  ------------------
 2736|      6|    return AVIF_TRUE;
  ------------------
  |  |   88|      6|#define AVIF_TRUE 1
  ------------------
 2737|      7|}
read.c:avifParseImageRotationProperty:
 2740|  1.38k|{
 2741|  1.38k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[irot]");
  ------------------
  |  |  738|  1.38k|    avifROStream VARNAME;                               \
  |  |  739|  1.38k|    avifROData VARNAME##_roData;                        \
  |  |  740|  1.38k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  1.38k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  1.38k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 2742|       |
 2743|  1.38k|    avifImageRotation * irot = &prop->u.irot;
 2744|  1.38k|    uint8_t reserved;
 2745|  1.38k|    AVIF_CHECK(avifROStreamReadBitsU8(&s, &reserved, /*bitCount=*/6)); // unsigned int (6) reserved = 0;
  ------------------
  |  |   36|  1.38k|    do {                        \
  |  |   37|  1.38k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 1.38k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  1.38k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 1.38k]
  |  |  ------------------
  ------------------
 2746|  1.38k|    if (reserved) {
  ------------------
  |  Branch (2746:9): [True: 1, False: 1.38k]
  ------------------
 2747|      1|        avifDiagnosticsPrintf(diag, "Box[irot] contains nonzero reserved bits [%u]", reserved);
 2748|      1|        return AVIF_FALSE;
  ------------------
  |  |   89|      1|#define AVIF_FALSE 0
  ------------------
 2749|      1|    }
 2750|  1.38k|    AVIF_CHECK(avifROStreamReadBitsU8(&s, &irot->angle, /*bitCount=*/2)); // unsigned int (2) angle;
  ------------------
  |  |   36|  1.38k|    do {                        \
  |  |   37|  1.38k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 1.38k]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|  1.38k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 1.38k]
  |  |  ------------------
  ------------------
 2751|  1.38k|    return AVIF_TRUE;
  ------------------
  |  |   88|  1.38k|#define AVIF_TRUE 1
  ------------------
 2752|  1.38k|}
read.c:avifParseImageMirrorProperty:
 2755|      6|{
 2756|      6|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[imir]");
  ------------------
  |  |  738|      6|    avifROStream VARNAME;                               \
  |  |  739|      6|    avifROData VARNAME##_roData;                        \
  |  |  740|      6|    VARNAME##_roData.data = PTR;                        \
  |  |  741|      6|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|      6|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 2757|       |
 2758|      6|    avifImageMirror * imir = &prop->u.imir;
 2759|      6|    uint8_t reserved;
 2760|      6|    AVIF_CHECK(avifROStreamReadBitsU8(&s, &reserved, /*bitCount=*/7)); // unsigned int(7) reserved = 0;
  ------------------
  |  |   36|      6|    do {                        \
  |  |   37|      6|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 5]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|      6|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 5]
  |  |  ------------------
  ------------------
 2761|      5|    if (reserved) {
  ------------------
  |  Branch (2761:9): [True: 1, False: 4]
  ------------------
 2762|      1|        avifDiagnosticsPrintf(diag, "Box[imir] contains nonzero reserved bits [%u]", reserved);
 2763|      1|        return AVIF_FALSE;
  ------------------
  |  |   89|      1|#define AVIF_FALSE 0
  ------------------
 2764|      1|    }
 2765|      4|    AVIF_CHECK(avifROStreamReadBitsU8(&s, &imir->axis, /*bitCount=*/1)); // unsigned int(1) axis;
  ------------------
  |  |   36|      4|    do {                        \
  |  |   37|      4|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 4]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|      4|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 4]
  |  |  ------------------
  ------------------
 2766|      4|    return AVIF_TRUE;
  ------------------
  |  |   88|      4|#define AVIF_TRUE 1
  ------------------
 2767|      4|}
read.c:avifParsePixelInformationProperty:
 2770|  31.4k|{
 2771|  31.4k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[pixi]");
  ------------------
  |  |  738|  31.4k|    avifROStream VARNAME;                               \
  |  |  739|  31.4k|    avifROData VARNAME##_roData;                        \
  |  |  740|  31.4k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  31.4k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  31.4k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 2772|  31.4k|    uint32_t flags = 0; // px_flags
 2773|  31.4k|    AVIF_CHECKERR(avifROStreamReadAndEnforceVersion(&s, /*enforcedVersion=*/0, &flags), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  31.4k|    do {                        \
  |  |   46|  31.4k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 31.4k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  31.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 31.4k]
  |  |  ------------------
  ------------------
 2774|       |
 2775|  31.4k|    avifPixelInformationProperty * pixi = &prop->u.pixi;
 2776|  31.4k|    AVIF_CHECKERR(avifROStreamRead(&s, &pixi->planeCount, 1), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int (8) num_channels;
  ------------------
  |  |   45|  31.4k|    do {                        \
  |  |   46|  31.4k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 31.4k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  31.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 31.4k]
  |  |  ------------------
  ------------------
 2777|  31.4k|    if (pixi->planeCount < 1 || pixi->planeCount > MAX_PIXI_PLANE_DEPTHS) {
  ------------------
  |  |  122|  31.4k|#define MAX_PIXI_PLANE_DEPTHS 4
  ------------------
  |  Branch (2777:9): [True: 1, False: 31.4k]
  |  Branch (2777:33): [True: 5, False: 31.4k]
  ------------------
 2778|      6|        avifDiagnosticsPrintf(diag, "Box[pixi] contains unsupported plane count [%u]", pixi->planeCount);
 2779|      6|        return AVIF_RESULT_NOT_IMPLEMENTED;
 2780|      6|    }
 2781|   102k|    for (uint8_t i = 0; i < pixi->planeCount; ++i) {
  ------------------
  |  Branch (2781:25): [True: 71.4k, False: 31.4k]
  ------------------
 2782|  71.4k|        AVIF_CHECKERR(avifROStreamRead(&s, &pixi->planeDepths[i], 1), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int (8) bits_per_channel;
  ------------------
  |  |   45|  71.4k|    do {                        \
  |  |   46|  71.4k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 71.4k]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|  71.4k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 71.4k]
  |  |  ------------------
  ------------------
 2783|  71.4k|        if (pixi->planeDepths[i] == 0) {
  ------------------
  |  Branch (2783:13): [True: 1, False: 71.4k]
  ------------------
 2784|      1|            avifDiagnosticsPrintf(diag, "Box[pixi] plane depth shall not be 0 for channel %u", i);
 2785|      1|            return AVIF_RESULT_BMFF_PARSE_FAILED;
 2786|      1|        }
 2787|  71.4k|        if (pixi->planeDepths[i] > 16) {
  ------------------
  |  Branch (2787:13): [True: 4, False: 71.4k]
  ------------------
 2788|      4|            avifDiagnosticsPrintf(diag, "Box[pixi] plane depth %d is not supported", (int)pixi->planeDepths[i]);
 2789|      4|            return AVIF_RESULT_NOT_IMPLEMENTED;
 2790|      4|        }
 2791|  71.4k|        if (pixi->planeDepths[i] != pixi->planeDepths[0]) {
  ------------------
  |  Branch (2791:13): [True: 1, False: 71.4k]
  ------------------
 2792|      1|            avifDiagnosticsPrintf(diag,
 2793|      1|                                  "Box[pixi] contains unsupported mismatched plane depths [%u != %u]",
 2794|      1|                                  pixi->planeDepths[i],
 2795|      1|                                  pixi->planeDepths[0]);
 2796|      1|            return AVIF_RESULT_NOT_IMPLEMENTED;
 2797|      1|        }
 2798|  71.4k|    }
 2799|       |#if defined(AVIF_ENABLE_EXPERIMENTAL_EXTENDED_PIXI)
 2800|       |    if (flags & 1) {
 2801|       |        for (uint8_t i = 0; i < pixi->planeCount; ++i) {
 2802|       |            uint8_t channelIdc, reserved, componentFormat, channelLabelFlag;
 2803|       |            AVIF_CHECKERR(avifROStreamReadBitsU8(&s, &channelIdc, /*bitCount=*/3), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(3) channel_idc;
 2804|       |            AVIF_CHECKERR(avifROStreamReadBitsU8(&s, &reserved, /*bitCount=*/1), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(1) reserved = 0;
 2805|       |            AVIF_CHECKERR(avifROStreamReadBitsU8(&s, &componentFormat, /*bitCount=*/2), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(2) component_format;
 2806|       |            AVIF_CHECKERR(avifROStreamReadBitsU8(&s, &pixi->subsamplingFlag[i], /*bitCount=*/1),
 2807|       |                          AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(1) subsampling_flag;
 2808|       |            AVIF_CHECKERR(avifROStreamReadBitsU8(&s, &channelLabelFlag, /*bitCount=*/1),
 2809|       |                          AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(1) channel_label_flag;
 2810|       |            if (pixi->subsamplingFlag[i]) {
 2811|       |                AVIF_CHECKERR(avifROStreamReadBitsU8(&s, &pixi->subsamplingType[i], /*bitCount=*/4),
 2812|       |                              AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(4) subsampling_type;
 2813|       |                AVIF_CHECKERR(avifROStreamReadBitsU8(&s, &pixi->subsamplingLocation[i], /*bitCount=*/4),
 2814|       |                              AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(4) subsampling_location;
 2815|       |            }
 2816|       |
 2817|       |            // ISO/IEC 23008-12:2024/CDAM 2:2025 section 6.5.6.3:
 2818|       |            //   This field indicates the contents of the channel. A value of 0 indicates colour/grayscale. A value of
 2819|       |            //   1 indicates alpha. A value of 2 indicates depth. Values 3-7 are reserved for future use. At most one
 2820|       |            //   channel shall have a channel_idc of 1.
 2821|       |            if (channelIdc != 0) {
 2822|       |                avifDiagnosticsPrintf(diag, "Box[pixi] contains unsupported channel_idc %u for channel %u", channelIdc, i);
 2823|       |                return AVIF_RESULT_NOT_IMPLEMENTED;
 2824|       |            }
 2825|       |            if (reserved != 0) {
 2826|       |                avifDiagnosticsPrintf(diag, "Box[pixi] contains non-zero reserved field %u for channel %u", reserved, i);
 2827|       |                return AVIF_RESULT_BMFF_PARSE_FAILED;
 2828|       |            }
 2829|       |            // ISO/IEC 23008-12:2024/CDAM 2:2025 section 6.5.6.3:
 2830|       |            //   component_format: This field indicates the data type of the channel as defined by the component_format
 2831|       |            //   values in ISO/IEC 23001-17 where component_bit_depth is considered to be equal to bits_per_channel.
 2832|       |            // ISO/IEC 23001-17 section 5.2.1.2:
 2833|       |            //   component_format: When equal to 0, component value is an unsigned integer coded on component_bit_depth bits.
 2834|       |            if (componentFormat != 0) {
 2835|       |                avifDiagnosticsPrintf(diag, "Box[pixi] contains unsupported component_format %u for channel %u", componentFormat, i);
 2836|       |                return AVIF_RESULT_NOT_IMPLEMENTED;
 2837|       |            }
 2838|       |            if (pixi->subsamplingFlag[i]) {
 2839|       |                if (pixi->subsamplingType[i] >= AVIF_PIXI_SUBSAMPLING_RESERVED) {
 2840|       |                    avifDiagnosticsPrintf(diag,
 2841|       |                                          "Box[pixi] contains reserved subsampling_type %u for channel %u",
 2842|       |                                          pixi->subsamplingType[i],
 2843|       |                                          i);
 2844|       |                    return AVIF_RESULT_BMFF_PARSE_FAILED;
 2845|       |                }
 2846|       |                if (pixi->subsamplingLocation[i] > 4) {
 2847|       |                    avifDiagnosticsPrintf(diag,
 2848|       |                                          "Box[pixi] contains reserved subsampling_location %u for channel %u",
 2849|       |                                          pixi->subsamplingLocation[i],
 2850|       |                                          i);
 2851|       |                    return AVIF_RESULT_BMFF_PARSE_FAILED;
 2852|       |                }
 2853|       |            }
 2854|       |            if (channelLabelFlag) {
 2855|       |                AVIF_CHECKERR(avifROStreamReadString(&s, NULL, 0), AVIF_RESULT_BMFF_PARSE_FAILED); // utf8string channel_label; (skipped)
 2856|       |            }
 2857|       |        }
 2858|       |    }
 2859|       |#endif // AVIF_ENABLE_EXPERIMENTAL_EXTENDED_PIXI
 2860|  31.4k|    return AVIF_RESULT_OK;
 2861|  31.4k|}
read.c:avifParseOperatingPointSelectorProperty:
 2864|      2|{
 2865|      2|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[a1op]");
  ------------------
  |  |  738|      2|    avifROStream VARNAME;                               \
  |  |  739|      2|    avifROData VARNAME##_roData;                        \
  |  |  740|      2|    VARNAME##_roData.data = PTR;                        \
  |  |  741|      2|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|      2|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 2866|       |
 2867|      2|    avifOperatingPointSelectorProperty * a1op = &prop->u.a1op;
 2868|      2|    AVIF_CHECK(avifROStreamRead(&s, &a1op->opIndex, 1));
  ------------------
  |  |   36|      2|    do {                        \
  |  |   37|      2|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 2]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|      2|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 2]
  |  |  ------------------
  ------------------
 2869|      2|    if (a1op->opIndex > 31) { // 31 is AV1's max operating point value
  ------------------
  |  Branch (2869:9): [True: 1, False: 1]
  ------------------
 2870|      1|        avifDiagnosticsPrintf(diag, "Box[a1op] contains an unsupported operating point [%u]", a1op->opIndex);
 2871|      1|        return AVIF_FALSE;
  ------------------
  |  |   89|      1|#define AVIF_FALSE 0
  ------------------
 2872|      1|    }
 2873|      1|    return AVIF_TRUE;
  ------------------
  |  |   88|      1|#define AVIF_TRUE 1
  ------------------
 2874|      2|}
read.c:avifParseAV1LayeredImageIndexingProperty:
 2890|  4.23k|{
 2891|  4.23k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[a1lx]");
  ------------------
  |  |  738|  4.23k|    avifROStream VARNAME;                               \
  |  |  739|  4.23k|    avifROData VARNAME##_roData;                        \
  |  |  740|  4.23k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  4.23k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  4.23k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 2892|       |
 2893|  4.23k|    avifAV1LayeredImageIndexingProperty * a1lx = &prop->u.a1lx;
 2894|       |
 2895|  4.23k|    uint8_t largeSize = 0;
 2896|  4.23k|    AVIF_CHECK(avifROStreamRead(&s, &largeSize, 1));
  ------------------
  |  |   36|  4.23k|    do {                        \
  |  |   37|  4.23k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 4.23k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  4.23k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 4.23k]
  |  |  ------------------
  ------------------
 2897|  4.23k|    if (largeSize & 0xFE) {
  ------------------
  |  Branch (2897:9): [True: 1, False: 4.22k]
  ------------------
 2898|      1|        avifDiagnosticsPrintf(diag, "Box[a1lx] has bits set in the reserved section [%u]", largeSize);
 2899|      1|        return AVIF_FALSE;
  ------------------
  |  |   89|      1|#define AVIF_FALSE 0
  ------------------
 2900|      1|    }
 2901|       |
 2902|  16.9k|    for (int i = 0; i < 3; ++i) {
  ------------------
  |  Branch (2902:21): [True: 12.6k, False: 4.22k]
  ------------------
 2903|  12.6k|        if (largeSize) {
  ------------------
  |  Branch (2903:13): [True: 6, False: 12.6k]
  ------------------
 2904|      6|            AVIF_CHECK(avifROStreamReadU32(&s, &a1lx->layerSize[i]));
  ------------------
  |  |   36|      6|    do {                        \
  |  |   37|      6|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 3, False: 3]
  |  |  ------------------
  |  |   38|      3|            avifBreakOnError(); \
  |  |   39|      3|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      3|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      3|        }                       \
  |  |   41|      6|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 3]
  |  |  ------------------
  ------------------
 2905|  12.6k|        } else {
 2906|  12.6k|            uint16_t layerSize16;
 2907|  12.6k|            AVIF_CHECK(avifROStreamReadU16(&s, &layerSize16));
  ------------------
  |  |   36|  12.6k|    do {                        \
  |  |   37|  12.6k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 3, False: 12.6k]
  |  |  ------------------
  |  |   38|      3|            avifBreakOnError(); \
  |  |   39|      3|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      3|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      3|        }                       \
  |  |   41|  12.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 12.6k]
  |  |  ------------------
  ------------------
 2908|  12.6k|            a1lx->layerSize[i] = (uint32_t)layerSize16;
 2909|  12.6k|        }
 2910|  12.6k|    }
 2911|       |
 2912|       |    // Layer sizes will be validated later (when the item's size is known)
 2913|  4.22k|    return AVIF_TRUE;
  ------------------
  |  |   88|  4.22k|#define AVIF_TRUE 1
  ------------------
 2914|  4.22k|}
read.c:avifParseContentLightLevelInformationBox:
 2540|    470|{
 2541|    470|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[clli]");
  ------------------
  |  |  738|    470|    avifROStream VARNAME;                               \
  |  |  739|    470|    avifROData VARNAME##_roData;                        \
  |  |  740|    470|    VARNAME##_roData.data = PTR;                        \
  |  |  741|    470|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|    470|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 2542|    470|    AVIF_CHECKRES(avifParseContentLightLevelInformation(&s, &prop->u.clli));
  ------------------
  |  |   54|    470|    do {                                  \
  |  |   55|    470|        const avifResult result__ = (A);  \
  |  |   56|    470|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 3, False: 467]
  |  |  ------------------
  |  |   57|      3|            avifBreakOnError();           \
  |  |   58|      3|            return result__;              \
  |  |   59|      3|        }                                 \
  |  |   60|    470|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 467]
  |  |  ------------------
  ------------------
 2543|    467|    return AVIF_RESULT_OK;
 2544|    470|}
read.c:avifParseContentLightLevelInformation:
 2534|    470|{
 2535|    470|    AVIF_CHECKERR(avifROStreamReadBitsU16(s, &clli->maxCLL, 16), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(16) max_content_light_level
  ------------------
  |  |   45|    470|    do {                        \
  |  |   46|    470|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 469]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|    470|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 469]
  |  |  ------------------
  ------------------
 2536|    469|    AVIF_CHECKERR(avifROStreamReadBitsU16(s, &clli->maxPALL, 16), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(16) max_pic_average_light_level
  ------------------
  |  |   45|    469|    do {                        \
  |  |   46|    469|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 467]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|    469|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 467]
  |  |  ------------------
  ------------------
 2537|    467|    return AVIF_RESULT_OK;
 2538|    469|}
read.c:avifParseItemPropertyAssociation:
 2984|  19.2k|{
 2985|       |    // NOTE: If this function ever adds support for versions other than [0,1] or flags other than
 2986|       |    //       [0,1], please increase the value of MAX_IPMA_VERSION_AND_FLAGS_SEEN accordingly.
 2987|       |
 2988|  19.2k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[ipma]");
  ------------------
  |  |  738|  19.2k|    avifROStream VARNAME;                               \
  |  |  739|  19.2k|    avifROData VARNAME##_roData;                        \
  |  |  740|  19.2k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  19.2k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  19.2k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 2989|       |
 2990|  19.2k|    uint8_t version;
 2991|  19.2k|    uint32_t flags;
 2992|  19.2k|    AVIF_CHECKERR(avifROStreamReadVersionAndFlags(&s, &version, &flags), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  19.2k|    do {                        \
  |  |   46|  19.2k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 19.2k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  19.2k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19.2k]
  |  |  ------------------
  ------------------
 2993|  19.2k|    avifBool propertyIndexIsU15 = ((flags & 0x1) != 0);
 2994|  19.2k|    *outVersionAndFlags = ((uint32_t)version << 24) | flags;
 2995|       |
 2996|  19.2k|    uint32_t entryCount;
 2997|  19.2k|    AVIF_CHECKERR(avifROStreamReadU32(&s, &entryCount), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  19.2k|    do {                        \
  |  |   46|  19.2k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 19.2k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  19.2k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19.2k]
  |  |  ------------------
  ------------------
 2998|  19.2k|    unsigned int prevItemID = 0;
 2999|  61.9k|    for (uint32_t entryIndex = 0; entryIndex < entryCount; ++entryIndex) {
  ------------------
  |  Branch (2999:35): [True: 42.8k, False: 19.1k]
  ------------------
 3000|       |        // ISO/IEC 14496-12, Seventh edition, 2022-01, Section 8.11.14.1:
 3001|       |        //   Each ItemPropertyAssociationBox shall be ordered by increasing item_ID, and there shall
 3002|       |        //   be at most one occurrence of a given item_ID, in the set of ItemPropertyAssociationBox
 3003|       |        //   boxes.
 3004|  42.8k|        unsigned int itemID;
 3005|  42.8k|        if (version < 1) {
  ------------------
  |  Branch (3005:13): [True: 42.8k, False: 68]
  ------------------
 3006|  42.8k|            uint16_t tmp;
 3007|  42.8k|            AVIF_CHECKERR(avifROStreamReadU16(&s, &tmp), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  42.8k|    do {                        \
  |  |   46|  42.8k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 6, False: 42.8k]
  |  |  ------------------
  |  |   47|      6|            avifBreakOnError(); \
  |  |   48|      6|            return ERR;         \
  |  |   49|      6|        }                       \
  |  |   50|  42.8k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 42.8k]
  |  |  ------------------
  ------------------
 3008|  42.8k|            itemID = tmp;
 3009|  42.8k|        } else {
 3010|     68|            AVIF_CHECKERR(avifROStreamReadU32(&s, &itemID), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|     68|    do {                        \
  |  |   46|     68|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 10, False: 58]
  |  |  ------------------
  |  |   47|     10|            avifBreakOnError(); \
  |  |   48|     10|            return ERR;         \
  |  |   49|     10|        }                       \
  |  |   50|     68|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 58]
  |  |  ------------------
  ------------------
 3011|     68|        }
 3012|  42.8k|        AVIF_CHECKRES(avifCheckItemID("ipma", itemID, diag));
  ------------------
  |  |   54|  42.8k|    do {                                  \
  |  |   55|  42.8k|        const avifResult result__ = (A);  \
  |  |   56|  42.8k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 1, False: 42.8k]
  |  |  ------------------
  |  |   57|      1|            avifBreakOnError();           \
  |  |   58|      1|            return result__;              \
  |  |   59|      1|        }                                 \
  |  |   60|  42.8k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 42.8k]
  |  |  ------------------
  ------------------
 3013|  42.8k|        if (itemID <= prevItemID) {
  ------------------
  |  Branch (3013:13): [True: 5, False: 42.8k]
  ------------------
 3014|      5|            avifDiagnosticsPrintf(diag, "Box[ipma] item IDs are not ordered by increasing ID");
 3015|      5|            return AVIF_RESULT_BMFF_PARSE_FAILED;
 3016|      5|        }
 3017|  42.8k|        prevItemID = itemID;
 3018|       |
 3019|  42.8k|        avifDecoderItem * item;
 3020|  42.8k|        AVIF_CHECKRES(avifMetaFindOrCreateItem(meta, itemID, &item));
  ------------------
  |  |   54|  42.8k|    do {                                  \
  |  |   55|  42.8k|        const avifResult result__ = (A);  \
  |  |   56|  42.8k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 42.8k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  42.8k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 42.8k]
  |  |  ------------------
  ------------------
 3021|  42.8k|        if (item->ipmaSeen) {
  ------------------
  |  Branch (3021:13): [True: 0, False: 42.8k]
  ------------------
 3022|      0|            avifDiagnosticsPrintf(diag, "Duplicate Box[ipma] for item ID [%u]", itemID);
 3023|      0|            return AVIF_RESULT_BMFF_PARSE_FAILED;
 3024|      0|        }
 3025|  42.8k|        item->ipmaSeen = AVIF_TRUE;
  ------------------
  |  |   88|  42.8k|#define AVIF_TRUE 1
  ------------------
 3026|       |
 3027|  42.8k|        uint8_t associationCount;
 3028|  42.8k|        AVIF_CHECKERR(avifROStreamRead(&s, &associationCount, 1), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  42.8k|    do {                        \
  |  |   46|  42.8k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 42.8k]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|  42.8k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 42.8k]
  |  |  ------------------
  ------------------
 3029|   229k|        for (uint8_t associationIndex = 0; associationIndex < associationCount; ++associationIndex) {
  ------------------
  |  Branch (3029:44): [True: 186k, False: 42.7k]
  ------------------
 3030|   186k|            uint8_t essential;
 3031|   186k|            AVIF_CHECKERR(avifROStreamReadBitsU8(&s, &essential, /*bitCount=*/1), AVIF_RESULT_BMFF_PARSE_FAILED); // bit(1) essential;
  ------------------
  |  |   45|   186k|    do {                        \
  |  |   46|   186k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 21, False: 186k]
  |  |  ------------------
  |  |   47|     21|            avifBreakOnError(); \
  |  |   48|     21|            return ERR;         \
  |  |   49|     21|        }                       \
  |  |   50|   186k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 186k]
  |  |  ------------------
  ------------------
 3032|   186k|            uint32_t propertyIndex;
 3033|   186k|            AVIF_CHECKERR(avifROStreamReadBitsU32(&s, &propertyIndex, /*bitCount=*/propertyIndexIsU15 ? 15 : 7),
  ------------------
  |  |   45|   186k|    do {                        \
  |  |   46|   373k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 3, False: 186k]
  |  |  |  Branch (46:15): [True: 32, False: 186k]
  |  |  ------------------
  |  |   47|      3|            avifBreakOnError(); \
  |  |   48|      3|            return ERR;         \
  |  |   49|      3|        }                       \
  |  |   50|   186k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 186k]
  |  |  ------------------
  ------------------
 3034|   186k|                          AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(7/15) property_index;
 3035|       |
 3036|       |            // ISO/IEC 14496-12 Section 8.11.14.3:
 3037|       |            //   0 indicating that no property is associated (the essential indicator shall also be 0)
 3038|   186k|            if (propertyIndex == 0) {
  ------------------
  |  Branch (3038:17): [True: 1.31k, False: 185k]
  ------------------
 3039|  1.31k|                if (essential) {
  ------------------
  |  Branch (3039:21): [True: 1, False: 1.31k]
  ------------------
 3040|      1|                    avifDiagnosticsPrintf(diag, "Box[ipma] for item ID [%u] contains an illegal essential property index 0", itemID);
 3041|      1|                    return AVIF_RESULT_BMFF_PARSE_FAILED;
 3042|      1|                }
 3043|  1.31k|                continue;
 3044|  1.31k|            }
 3045|   185k|            --propertyIndex; // 1-indexed
 3046|       |
 3047|   185k|            if (propertyIndex >= meta->properties.count) {
  ------------------
  |  Branch (3047:17): [True: 72, False: 185k]
  ------------------
 3048|     72|                avifDiagnosticsPrintf(diag,
 3049|     72|                                      "Box[ipma] for item ID [%u] contains an illegal property index [%u] (out of [%u] properties)",
 3050|     72|                                      itemID,
 3051|     72|                                      propertyIndex,
 3052|     72|                                      meta->properties.count);
 3053|     72|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 3054|     72|            }
 3055|       |
 3056|       |            // Copy property to item
 3057|   185k|            const avifProperty * srcProp = &meta->properties.prop[propertyIndex];
 3058|       |
 3059|       |            // Some properties are supported and parsed by libavif.
 3060|       |            // Other properties are forwarded to the user as opaque blobs.
 3061|   185k|            const avifBool supportedType = !srcProp->isOpaque;
 3062|   185k|            if (supportedType) {
  ------------------
  |  Branch (3062:17): [True: 159k, False: 25.7k]
  ------------------
 3063|   159k|                if (essential) {
  ------------------
  |  Branch (3063:21): [True: 37.9k, False: 121k]
  ------------------
 3064|       |                    // Verify that it is legal for this property to be flagged as essential. Any
 3065|       |                    // types in this list are *required* in the spec to not be flagged as essential
 3066|       |                    // when associated with an item.
 3067|  37.9k|                    static const char * const nonessentialTypes[] = {
 3068|       |
 3069|       |                        // AVIF: Section 2.3.2.3.2: "If associated, it shall not be marked as essential."
 3070|  37.9k|                        "a1lx"
 3071|       |
 3072|  37.9k|                    };
 3073|  37.9k|                    size_t nonessentialTypesCount = sizeof(nonessentialTypes) / sizeof(nonessentialTypes[0]);
 3074|  75.8k|                    for (size_t i = 0; i < nonessentialTypesCount; ++i) {
  ------------------
  |  Branch (3074:40): [True: 37.9k, False: 37.9k]
  ------------------
 3075|  37.9k|                        if (!memcmp(srcProp->type, nonessentialTypes[i], 4)) {
  ------------------
  |  Branch (3075:29): [True: 1, False: 37.9k]
  ------------------
 3076|      1|                            avifDiagnosticsPrintf(diag,
 3077|      1|                                                  "Item ID [%u] has a %s property association which must not be marked essential, but is",
 3078|      1|                                                  itemID,
 3079|      1|                                                  nonessentialTypes[i]);
 3080|      1|                            return AVIF_RESULT_BMFF_PARSE_FAILED;
 3081|      1|                        }
 3082|  37.9k|                    }
 3083|   121k|                } else {
 3084|       |                    // Verify that it is legal for this property to not be flagged as essential. Any
 3085|       |                    // types in this list are *required* in the spec to be flagged as essential when
 3086|       |                    // associated with an item.
 3087|   121k|                    static const char * const essentialTypes[] = {
 3088|       |
 3089|       |                        // AVIF: Section 2.3.2.1.1: "If associated, it shall be marked as essential."
 3090|   121k|                        "a1op",
 3091|       |
 3092|       |                        // HEIF: Section 6.5.11.1: "essential shall be equal to 1 for an 'lsel' item property."
 3093|   121k|                        "lsel",
 3094|       |
 3095|       |                        // MIAF 2019/Amd. 2:2021: Section 7.3.9:
 3096|       |                        //   All transformative properties associated with coded and derived images shall be
 3097|       |                        //   marked as essential
 3098|       |                        // It makes no sense to allow for non-essential crop/orientation associated with an item
 3099|       |                        // that is not a coded or derived image, so for simplicity 'item' is not checked here.
 3100|   121k|                        "clap",
 3101|   121k|                        "irot",
 3102|   121k|                        "imir"
 3103|       |
 3104|   121k|                    };
 3105|   121k|                    size_t essentialTypesCount = sizeof(essentialTypes) / sizeof(essentialTypes[0]);
 3106|   730k|                    for (size_t i = 0; i < essentialTypesCount; ++i) {
  ------------------
  |  Branch (3106:40): [True: 608k, False: 121k]
  ------------------
 3107|   608k|                        if (!memcmp(srcProp->type, essentialTypes[i], 4)) {
  ------------------
  |  Branch (3107:29): [True: 5, False: 608k]
  ------------------
 3108|      5|                            avifDiagnosticsPrintf(diag,
 3109|      5|                                                  "Item ID [%u] has a %s property association which must be marked essential, but is not",
 3110|      5|                                                  itemID,
 3111|      5|                                                  essentialTypes[i]);
 3112|      5|                            return AVIF_RESULT_BMFF_PARSE_FAILED;
 3113|      5|                        }
 3114|   608k|                    }
 3115|   121k|                }
 3116|       |
 3117|       |                // Supported and valid; associate it with this item.
 3118|   159k|                avifProperty * dstProp = (avifProperty *)avifArrayPush(&item->properties);
 3119|   159k|                AVIF_CHECKERR(dstProp != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|   159k|    do {                        \
  |  |   46|   159k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 159k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|   159k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 159k]
  |  |  ------------------
  ------------------
 3120|   159k|                *dstProp = *srcProp;
 3121|   159k|            } else {
 3122|  25.7k|                if (essential) {
  ------------------
  |  Branch (3122:21): [True: 3.32k, False: 22.4k]
  ------------------
 3123|       |                    // ISO/IEC 23008-12 Section 10.2.1:
 3124|       |                    //   Under any brand, the primary item (or an alternative if alternative support is required)
 3125|       |                    //   shall be processable by a reader implementing only the required features of that brand.
 3126|       |                    //   Specifically, given that each brand has a set of properties that a reader is required to
 3127|       |                    //   support: the item shall not have properties that are marked as essential and are outside
 3128|       |                    //   this set.
 3129|       |                    // It is assumed that this rule also applies to items the primary item depends on (such as
 3130|       |                    // the cells of a grid).
 3131|       |
 3132|       |                    // Discovered an essential item property that libavif doesn't support!
 3133|       |                    // Make a note to ignore this item later.
 3134|  3.32k|                    item->hasUnsupportedEssentialProperty = AVIF_TRUE;
  ------------------
  |  |   88|  3.32k|#define AVIF_TRUE 1
  ------------------
 3135|  3.32k|                }
 3136|       |
 3137|       |                // Will be forwarded to the user through avifImage::properties.
 3138|  25.7k|                avifProperty * dstProp = (avifProperty *)avifArrayPush(&item->properties);
 3139|  25.7k|                AVIF_CHECKERR(dstProp != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  25.7k|    do {                        \
  |  |   46|  25.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 25.7k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  25.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 25.7k]
  |  |  ------------------
  ------------------
 3140|  25.7k|                dstProp->isOpaque = AVIF_TRUE;
  ------------------
  |  |   88|  25.7k|#define AVIF_TRUE 1
  ------------------
 3141|  25.7k|                memcpy(dstProp->type, srcProp->type, sizeof(dstProp->type));
 3142|  25.7k|                memcpy(dstProp->u.opaque.usertype, srcProp->u.opaque.usertype, sizeof(dstProp->u.opaque.usertype));
 3143|  25.7k|                AVIF_CHECKRES(
  ------------------
  |  |   54|  25.7k|    do {                                  \
  |  |   55|  25.7k|        const avifResult result__ = (A);  \
  |  |   56|  25.7k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 25.7k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  25.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 25.7k]
  |  |  ------------------
  ------------------
 3144|  25.7k|                    avifRWDataSet(&dstProp->u.opaque.boxPayload, srcProp->u.opaque.boxPayload.data, srcProp->u.opaque.boxPayload.size));
 3145|  25.7k|            }
 3146|   185k|        }
 3147|  42.8k|    }
 3148|  19.1k|    return AVIF_RESULT_OK;
 3149|  19.2k|}
read.c:avifParseItemInfoBox:
 3301|  19.6k|{
 3302|  19.6k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[iinf]");
  ------------------
  |  |  738|  19.6k|    avifROStream VARNAME;                               \
  |  |  739|  19.6k|    avifROData VARNAME##_roData;                        \
  |  |  740|  19.6k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  19.6k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  19.6k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3303|       |
 3304|  19.6k|    uint8_t version;
 3305|  19.6k|    AVIF_CHECKERR(avifROStreamReadVersionAndFlags(&s, &version, NULL), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  19.6k|    do {                        \
  |  |   46|  19.6k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 19.6k]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|  19.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19.6k]
  |  |  ------------------
  ------------------
 3306|  19.6k|    uint32_t entryCount;
 3307|  19.6k|    if (version == 0) {
  ------------------
  |  Branch (3307:9): [True: 19.6k, False: 25]
  ------------------
 3308|  19.6k|        uint16_t tmp;
 3309|  19.6k|        AVIF_CHECKERR(avifROStreamReadU16(&s, &tmp), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(16) entry_count;
  ------------------
  |  |   45|  19.6k|    do {                        \
  |  |   46|  19.6k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 19.6k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  19.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19.6k]
  |  |  ------------------
  ------------------
 3310|  19.6k|        entryCount = tmp;
 3311|  19.6k|    } else if (version == 1) {
  ------------------
  |  Branch (3311:16): [True: 21, False: 4]
  ------------------
 3312|     21|        AVIF_CHECKERR(avifROStreamReadU32(&s, &entryCount), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) entry_count;
  ------------------
  |  |   45|     21|    do {                        \
  |  |   46|     21|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 20]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|     21|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 20]
  |  |  ------------------
  ------------------
 3313|     21|    } else {
 3314|      4|        avifDiagnosticsPrintf(diag, "Box[iinf] has an unsupported version %u", version);
 3315|      4|        return AVIF_RESULT_BMFF_PARSE_FAILED;
 3316|      4|    }
 3317|       |
 3318|  65.3k|    for (uint32_t entryIndex = 0; entryIndex < entryCount; ++entryIndex) {
  ------------------
  |  Branch (3318:35): [True: 45.7k, False: 19.6k]
  ------------------
 3319|  45.7k|        avifBoxHeader infeHeader;
 3320|  45.7k|        AVIF_CHECKERR(avifROStreamReadBoxHeader(&s, &infeHeader), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  45.7k|    do {                        \
  |  |   46|  45.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 30, False: 45.7k]
  |  |  ------------------
  |  |   47|     30|            avifBreakOnError(); \
  |  |   48|     30|            return ERR;         \
  |  |   49|     30|        }                       \
  |  |   50|  45.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 45.7k]
  |  |  ------------------
  ------------------
 3321|       |
 3322|  45.7k|        if (!memcmp(infeHeader.type, "infe", 4)) {
  ------------------
  |  Branch (3322:13): [True: 45.7k, False: 2]
  ------------------
 3323|  45.7k|            AVIF_CHECKRES(avifParseItemInfoEntry(meta, avifROStreamCurrent(&s), infeHeader.size, diag));
  ------------------
  |  |   54|  45.7k|    do {                                  \
  |  |   55|  45.7k|        const avifResult result__ = (A);  \
  |  |   56|  45.7k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 14, False: 45.7k]
  |  |  ------------------
  |  |   57|     14|            avifBreakOnError();           \
  |  |   58|     14|            return result__;              \
  |  |   59|     14|        }                                 \
  |  |   60|  45.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 45.7k]
  |  |  ------------------
  ------------------
 3324|  45.7k|        } else {
 3325|       |            // These must all be type infe
 3326|      2|            avifDiagnosticsPrintf(diag, "Box[iinf] contains a box that isn't type 'infe'");
 3327|      2|            return AVIF_RESULT_BMFF_PARSE_FAILED;
 3328|      2|        }
 3329|       |
 3330|  45.7k|        AVIF_CHECKERR(avifROStreamSkip(&s, infeHeader.size), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  45.7k|    do {                        \
  |  |   46|  45.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 45.7k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  45.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 45.7k]
  |  |  ------------------
  ------------------
 3331|  45.7k|    }
 3332|       |
 3333|  19.6k|    return AVIF_RESULT_OK;
 3334|  19.6k|}
read.c:avifParseItemInfoEntry:
 3250|  45.7k|{
 3251|       |    // Section 8.11.6.2 of ISO/IEC 14496-12.
 3252|  45.7k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[infe]");
  ------------------
  |  |  738|  45.7k|    avifROStream VARNAME;                               \
  |  |  739|  45.7k|    avifROData VARNAME##_roData;                        \
  |  |  740|  45.7k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  45.7k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  45.7k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3253|       |
 3254|  45.7k|    uint8_t version;
 3255|  45.7k|    uint32_t flags;
 3256|  45.7k|    AVIF_CHECKERR(avifROStreamReadVersionAndFlags(&s, &version, &flags), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  45.7k|    do {                        \
  |  |   46|  45.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 45.7k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  45.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 45.7k]
  |  |  ------------------
  ------------------
 3257|       |    // Version 2+ is required for item_type
 3258|  45.7k|    if (version != 2 && version != 3) {
  ------------------
  |  Branch (3258:9): [True: 22, False: 45.6k]
  |  Branch (3258:25): [True: 3, False: 19]
  ------------------
 3259|      3|        avifDiagnosticsPrintf(s.diag, "%s: Expecting box version 2 or 3, got version %u", s.diagContext, version);
 3260|      3|        return AVIF_RESULT_BMFF_PARSE_FAILED;
 3261|      3|    }
 3262|       |    // Ignore flags&1. A value of 1 corresponds to a hidden image item (not intended to be displayed).
 3263|       |    // There could be files wrongly setting that flag to 1 for items output as "to be displayed"
 3264|       |    // by libavif so far, so keep that lenient behavior for simplicity and backward compatibility.
 3265|       |
 3266|  45.7k|    uint32_t itemID;
 3267|  45.7k|    if (version == 2) {
  ------------------
  |  Branch (3267:9): [True: 45.6k, False: 19]
  ------------------
 3268|  45.6k|        uint16_t tmp;
 3269|  45.6k|        AVIF_CHECKERR(avifROStreamReadU16(&s, &tmp), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(16) item_ID;
  ------------------
  |  |   45|  45.6k|    do {                        \
  |  |   46|  45.6k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 45.6k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  45.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 45.6k]
  |  |  ------------------
  ------------------
 3270|  45.6k|        itemID = tmp;
 3271|  45.6k|    } else {
 3272|     19|        AVIF_ASSERT_OR_RETURN(version == 3);
  ------------------
  |  |   64|     19|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|     19|    do {                        \
  |  |  |  |   46|     19|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 19]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|     19|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 19]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3273|     19|        AVIF_CHECKERR(avifROStreamReadU32(&s, &itemID), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) item_ID;
  ------------------
  |  |   45|     19|    do {                        \
  |  |   46|     19|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 19]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|     19|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19]
  |  |  ------------------
  ------------------
 3274|     19|    }
 3275|  45.7k|    AVIF_CHECKRES(avifCheckItemID("infe", itemID, diag));
  ------------------
  |  |   54|  45.7k|    do {                                  \
  |  |   55|  45.7k|        const avifResult result__ = (A);  \
  |  |   56|  45.7k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 1, False: 45.7k]
  |  |  ------------------
  |  |   57|      1|            avifBreakOnError();           \
  |  |   58|      1|            return result__;              \
  |  |   59|      1|        }                                 \
  |  |   60|  45.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 45.7k]
  |  |  ------------------
  ------------------
 3276|  45.7k|    uint16_t itemProtectionIndex;
 3277|  45.7k|    AVIF_CHECKERR(avifROStreamReadU16(&s, &itemProtectionIndex), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(16) item_protection_index;
  ------------------
  |  |   45|  45.7k|    do {                        \
  |  |   46|  45.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 45.7k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  45.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 45.7k]
  |  |  ------------------
  ------------------
 3278|  45.7k|    uint8_t itemType[4];
 3279|  45.7k|    AVIF_CHECKERR(avifROStreamRead(&s, itemType, 4), AVIF_RESULT_BMFF_PARSE_FAILED);   // unsigned int(32) item_type;
  ------------------
  |  |   45|  45.7k|    do {                        \
  |  |   46|  45.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 45.7k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  45.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 45.7k]
  |  |  ------------------
  ------------------
 3280|  45.7k|    AVIF_CHECKERR(avifROStreamReadString(&s, NULL, 0), AVIF_RESULT_BMFF_PARSE_FAILED); // utf8string item_name; (skipped)
  ------------------
  |  |   45|  45.7k|    do {                        \
  |  |   46|  45.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 5, False: 45.7k]
  |  |  ------------------
  |  |   47|      5|            avifBreakOnError(); \
  |  |   48|      5|            return ERR;         \
  |  |   49|      5|        }                       \
  |  |   50|  45.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 45.7k]
  |  |  ------------------
  ------------------
 3281|  45.7k|    avifContentType contentType;
 3282|  45.7k|    if (!memcmp(itemType, "mime", 4)) {
  ------------------
  |  Branch (3282:9): [True: 1.28k, False: 44.4k]
  ------------------
 3283|  1.28k|        AVIF_CHECKERR(avifROStreamReadString(&s, contentType.contentType, CONTENTTYPE_SIZE), AVIF_RESULT_BMFF_PARSE_FAILED); // utf8string content_type;
  ------------------
  |  |   45|  1.28k|    do {                        \
  |  |   46|  1.28k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 1.28k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  1.28k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 1.28k]
  |  |  ------------------
  ------------------
 3284|       |        // utf8string content_encoding; //optional
 3285|  44.4k|    } else {
 3286|       |        // if (item_type == 'uri ') {
 3287|       |        //  utf8string item_uri_type;
 3288|       |        // }
 3289|  44.4k|        memset(&contentType, 0, sizeof(contentType));
 3290|  44.4k|    }
 3291|       |
 3292|  45.7k|    avifDecoderItem * item;
 3293|  45.7k|    AVIF_CHECKRES(avifMetaFindOrCreateItem(meta, itemID, &item));
  ------------------
  |  |   54|  45.7k|    do {                                  \
  |  |   55|  45.7k|        const avifResult result__ = (A);  \
  |  |   56|  45.7k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 45.7k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  45.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 45.7k]
  |  |  ------------------
  ------------------
 3294|       |
 3295|  45.7k|    memcpy(item->type, itemType, sizeof(itemType));
 3296|  45.7k|    item->contentType = contentType;
 3297|  45.7k|    return AVIF_RESULT_OK;
 3298|  45.7k|}
read.c:avifParseItemReferenceBox:
 3337|  11.8k|{
 3338|  11.8k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[iref]");
  ------------------
  |  |  738|  11.8k|    avifROStream VARNAME;                               \
  |  |  739|  11.8k|    avifROData VARNAME##_roData;                        \
  |  |  740|  11.8k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  11.8k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  11.8k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3339|       |
 3340|  11.8k|    uint8_t version;
 3341|  11.8k|    AVIF_CHECKERR(avifROStreamReadVersionAndFlags(&s, &version, NULL), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  11.8k|    do {                        \
  |  |   46|  11.8k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 11.8k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  11.8k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 11.8k]
  |  |  ------------------
  ------------------
 3342|  11.8k|    if (version > 1) {
  ------------------
  |  Branch (3342:9): [True: 41, False: 11.8k]
  ------------------
 3343|       |        // iref versions > 1 are not supported. Skip it.
 3344|     41|        return AVIF_RESULT_OK;
 3345|     41|    }
 3346|       |
 3347|  30.8k|    while (avifROStreamHasBytesLeft(&s, 1)) {
  ------------------
  |  Branch (3347:12): [True: 19.0k, False: 11.7k]
  ------------------
 3348|  19.0k|        avifBoxHeader irefHeader;
 3349|  19.0k|        AVIF_CHECKERR(avifROStreamReadBoxHeader(&s, &irefHeader), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  19.0k|    do {                        \
  |  |   46|  19.0k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 7, False: 19.0k]
  |  |  ------------------
  |  |   47|      7|            avifBreakOnError(); \
  |  |   48|      7|            return ERR;         \
  |  |   49|      7|        }                       \
  |  |   50|  19.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19.0k]
  |  |  ------------------
  ------------------
 3350|       |
 3351|  19.0k|        uint32_t fromID = 0;
 3352|  19.0k|        if (version == 0) {
  ------------------
  |  Branch (3352:13): [True: 19.0k, False: 29]
  ------------------
 3353|  19.0k|            uint16_t tmp;
 3354|  19.0k|            AVIF_CHECKERR(avifROStreamReadU16(&s, &tmp), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(16) from_item_ID;
  ------------------
  |  |   45|  19.0k|    do {                        \
  |  |   46|  19.0k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 19.0k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  19.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19.0k]
  |  |  ------------------
  ------------------
 3355|  19.0k|            fromID = tmp;
 3356|  19.0k|        } else {
 3357|       |            // version == 1
 3358|     29|            AVIF_CHECKERR(avifROStreamReadU32(&s, &fromID), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) from_item_ID;
  ------------------
  |  |   45|     29|    do {                        \
  |  |   46|     29|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 29]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|     29|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 29]
  |  |  ------------------
  ------------------
 3359|     29|        }
 3360|       |        // ISO 14496-12 section 8.11.12.1: "index values start at 1"
 3361|  19.0k|        AVIF_CHECKRES(avifCheckItemID("iref", fromID, diag));
  ------------------
  |  |   54|  19.0k|    do {                                  \
  |  |   55|  19.0k|        const avifResult result__ = (A);  \
  |  |   56|  19.0k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 1, False: 19.0k]
  |  |  ------------------
  |  |   57|      1|            avifBreakOnError();           \
  |  |   58|      1|            return result__;              \
  |  |   59|      1|        }                                 \
  |  |   60|  19.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 19.0k]
  |  |  ------------------
  ------------------
 3362|       |
 3363|  19.0k|        avifDecoderItem * item;
 3364|  19.0k|        AVIF_CHECKRES(avifMetaFindOrCreateItem(meta, fromID, &item));
  ------------------
  |  |   54|  19.0k|    do {                                  \
  |  |   55|  19.0k|        const avifResult result__ = (A);  \
  |  |   56|  19.0k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 19.0k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  19.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 19.0k]
  |  |  ------------------
  ------------------
 3365|  19.0k|        if (!memcmp(irefHeader.type, "dimg", 4)) {
  ------------------
  |  Branch (3365:13): [True: 4.15k, False: 14.9k]
  ------------------
 3366|  4.15k|            if (item->hasDimgFrom) {
  ------------------
  |  Branch (3366:17): [True: 1, False: 4.15k]
  ------------------
 3367|       |                // ISO/IEC 23008-12 (HEIF) 6.6.1: The number of SingleItemTypeReferenceBoxes with the box type 'dimg'
 3368|       |                // and with the same value of from_item_ID shall not be greater than 1.
 3369|      1|                avifDiagnosticsPrintf(diag, "Box[iinf] contains duplicate boxes of type 'dimg' with the same from_item_ID value %u", fromID);
 3370|      1|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 3371|      1|            }
 3372|  4.15k|            item->hasDimgFrom = AVIF_TRUE;
  ------------------
  |  |   88|  4.15k|#define AVIF_TRUE 1
  ------------------
 3373|  4.15k|        }
 3374|       |
 3375|  19.0k|        uint16_t referenceCount = 0;
 3376|  19.0k|        AVIF_CHECKERR(avifROStreamReadU16(&s, &referenceCount), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(16) reference_count;
  ------------------
  |  |   45|  19.0k|    do {                        \
  |  |   46|  19.0k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 19.0k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  19.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 19.0k]
  |  |  ------------------
  ------------------
 3377|       |
 3378|  44.6k|        for (uint16_t refIndex = 0; refIndex < referenceCount; ++refIndex) {
  ------------------
  |  Branch (3378:37): [True: 25.6k, False: 19.0k]
  ------------------
 3379|  25.6k|            uint32_t toID = 0;
 3380|  25.6k|            if (version == 0) {
  ------------------
  |  Branch (3380:17): [True: 25.3k, False: 282]
  ------------------
 3381|  25.3k|                uint16_t tmp;
 3382|  25.3k|                AVIF_CHECKERR(avifROStreamReadU16(&s, &tmp), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(16) to_item_ID;
  ------------------
  |  |   45|  25.3k|    do {                        \
  |  |   46|  25.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 18, False: 25.3k]
  |  |  ------------------
  |  |   47|     18|            avifBreakOnError(); \
  |  |   48|     18|            return ERR;         \
  |  |   49|     18|        }                       \
  |  |   50|  25.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 25.3k]
  |  |  ------------------
  ------------------
 3383|  25.3k|                toID = tmp;
 3384|  25.3k|            } else {
 3385|       |                // version == 1
 3386|    282|                AVIF_CHECKERR(avifROStreamReadU32(&s, &toID), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) to_item_ID;
  ------------------
  |  |   45|    282|    do {                        \
  |  |   46|    282|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 17, False: 265]
  |  |  ------------------
  |  |   47|     17|            avifBreakOnError(); \
  |  |   48|     17|            return ERR;         \
  |  |   49|     17|        }                       \
  |  |   50|    282|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 265]
  |  |  ------------------
  ------------------
 3387|    282|            }
 3388|  25.6k|            AVIF_CHECKRES(avifCheckItemID("iref", toID, diag));
  ------------------
  |  |   54|  25.6k|    do {                                  \
  |  |   55|  25.6k|        const avifResult result__ = (A);  \
  |  |   56|  25.6k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 25, False: 25.6k]
  |  |  ------------------
  |  |   57|     25|            avifBreakOnError();           \
  |  |   58|     25|            return result__;              \
  |  |   59|     25|        }                                 \
  |  |   60|  25.6k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 25.6k]
  |  |  ------------------
  ------------------
 3389|       |
 3390|       |            // Read this reference as "{fromID} is a {irefType} for {toID}"
 3391|  25.6k|            if (!memcmp(irefHeader.type, "thmb", 4)) {
  ------------------
  |  Branch (3391:17): [True: 0, False: 25.6k]
  ------------------
 3392|      0|                item->thumbnailForID = toID;
 3393|  25.6k|            } else if (!memcmp(irefHeader.type, "auxl", 4)) {
  ------------------
  |  Branch (3393:24): [True: 12.2k, False: 13.3k]
  ------------------
 3394|  12.2k|                item->auxForID = toID;
 3395|  13.3k|            } else if (!memcmp(irefHeader.type, "cdsc", 4)) {
  ------------------
  |  Branch (3395:24): [True: 1.44k, False: 11.8k]
  ------------------
 3396|  1.44k|                item->descForID = toID;
 3397|  11.8k|            } else if (!memcmp(irefHeader.type, "dimg", 4)) {
  ------------------
  |  Branch (3397:24): [True: 10.0k, False: 1.82k]
  ------------------
 3398|       |                // derived images refer in the opposite direction
 3399|  10.0k|                avifDecoderItem * dimg;
 3400|  10.0k|                AVIF_CHECKRES(avifMetaFindOrCreateItem(meta, toID, &dimg));
  ------------------
  |  |   54|  10.0k|    do {                                  \
  |  |   55|  10.0k|        const avifResult result__ = (A);  \
  |  |   56|  10.0k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 10.0k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  10.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 10.0k]
  |  |  ------------------
  ------------------
 3401|       |
 3402|       |                // Section 8.11.12.1 of ISO/IEC 14496-12:
 3403|       |                //   The items linked to are then represented by an array of to_item_IDs;
 3404|       |                //   within a given array, a given value shall occur at most once.
 3405|  10.0k|                AVIF_CHECKERR(dimg->dimgForID != fromID, AVIF_RESULT_INVALID_IMAGE_GRID);
  ------------------
  |  |   45|  10.0k|    do {                        \
  |  |   46|  10.0k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 10.0k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  10.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 10.0k]
  |  |  ------------------
  ------------------
 3406|       |                // A given value may occur within multiple arrays but this is not supported by libavif.
 3407|  10.0k|                AVIF_CHECKERR(dimg->dimgForID == 0, AVIF_RESULT_NOT_IMPLEMENTED);
  ------------------
  |  |   45|  10.0k|    do {                        \
  |  |   46|  10.0k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 10.0k]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|  10.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 10.0k]
  |  |  ------------------
  ------------------
 3408|  10.0k|                dimg->dimgForID = fromID;
 3409|  10.0k|                dimg->dimgIdx = refIndex;
 3410|  10.0k|            } else if (!memcmp(irefHeader.type, "prem", 4)) {
  ------------------
  |  Branch (3410:24): [True: 0, False: 1.82k]
  ------------------
 3411|      0|                item->premByID = toID;
 3412|      0|            }
 3413|  25.6k|        }
 3414|  19.0k|    }
 3415|       |
 3416|  11.7k|    return AVIF_RESULT_OK;
 3417|  11.8k|}
read.c:avifParseGroupsListBox:
 3420|  1.41k|{
 3421|  1.41k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[grpl]");
  ------------------
  |  |  738|  1.41k|    avifROStream VARNAME;                               \
  |  |  739|  1.41k|    avifROData VARNAME##_roData;                        \
  |  |  740|  1.41k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  1.41k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  1.41k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3422|       |
 3423|  2.81k|    while (avifROStreamHasBytesLeft(&s, 1)) {
  ------------------
  |  Branch (3423:12): [True: 1.42k, False: 1.39k]
  ------------------
 3424|  1.42k|        avifBoxHeader groupHeader;
 3425|  1.42k|        AVIF_CHECKERR(avifROStreamReadBoxHeader(&s, &groupHeader), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  1.42k|    do {                        \
  |  |   46|  1.42k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 1.41k]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|  1.42k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 1.41k]
  |  |  ------------------
  ------------------
 3426|       |        // We don't check the flag or version as they depend on the grouping type (and for simplicity).
 3427|       |        // ISO/IEC 14496-12:2024 Section 8.15.3.2
 3428|       |        //   version shall be 0 unless defined otherwise for the grouping_type. Any values of flags such that
 3429|       |        //   (flags & 0x000FFF) is not equal to 0 are reserved. The values of flags shall be such that (flags
 3430|       |        //   & 0xFFF000) is equal to 0 unless defined otherwise for the grouping_type.
 3431|  1.41k|        AVIF_CHECKERR(avifROStreamReadVersionAndFlags(&s, NULL, NULL), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  1.41k|    do {                        \
  |  |   46|  1.41k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 1.41k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  1.41k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 1.41k]
  |  |  ------------------
  ------------------
 3432|       |
 3433|  1.41k|        avifEntityToGroup * group = avifArrayPush(&meta->entityToGroups);
 3434|  1.41k|        AVIF_CHECKERR(group != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  1.41k|    do {                        \
  |  |   46|  1.41k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 1.41k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  1.41k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 1.41k]
  |  |  ------------------
  ------------------
 3435|  1.41k|        AVIF_CHECKERR(avifArrayCreate(&group->entityIDs, sizeof(uint32_t), 2), AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  1.41k|    do {                        \
  |  |   46|  1.41k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 1.41k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  1.41k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 1.41k]
  |  |  ------------------
  ------------------
 3436|       |
 3437|  1.41k|        memcpy(group->groupingType, groupHeader.type, 4);
 3438|  1.41k|        AVIF_CHECKERR(avifROStreamReadU32(&s, &group->groupID), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  1.41k|    do {                        \
  |  |   46|  1.41k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 1.41k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  1.41k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 1.41k]
  |  |  ------------------
  ------------------
 3439|  1.41k|        uint32_t numEntitiesInGroup;
 3440|  1.41k|        AVIF_CHECKERR(avifROStreamReadU32(&s, &numEntitiesInGroup), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  1.41k|    do {                        \
  |  |   46|  1.41k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 1.41k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  1.41k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 1.41k]
  |  |  ------------------
  ------------------
 3441|  4.24k|        for (uint32_t i = 0; i < numEntitiesInGroup; ++i) {
  ------------------
  |  Branch (3441:30): [True: 2.84k, False: 1.39k]
  ------------------
 3442|  2.84k|            uint32_t * entityId = avifArrayPush(&group->entityIDs);
 3443|  2.84k|            AVIF_CHECKERR(entityId != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  2.84k|    do {                        \
  |  |   46|  2.84k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 2.84k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  2.84k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 2.84k]
  |  |  ------------------
  ------------------
 3444|  2.84k|            AVIF_CHECKERR(avifROStreamReadU32(&s, entityId), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  2.84k|    do {                        \
  |  |   46|  2.84k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 19, False: 2.83k]
  |  |  ------------------
  |  |   47|     19|            avifBreakOnError(); \
  |  |   48|     19|            return ERR;         \
  |  |   49|     19|        }                       \
  |  |   50|  2.84k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 2.83k]
  |  |  ------------------
  ------------------
 3445|  2.84k|        }
 3446|  1.41k|    }
 3447|       |
 3448|  1.39k|    return AVIF_RESULT_OK;
 3449|  1.41k|}
read.c:avifParseMovieBox:
 4025|  2.69k|{
 4026|  2.69k|    BEGIN_STREAM(s, raw, rawLen, data->diag, "Box[moov]");
  ------------------
  |  |  738|  2.69k|    avifROStream VARNAME;                               \
  |  |  739|  2.69k|    avifROData VARNAME##_roData;                        \
  |  |  740|  2.69k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  2.69k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  2.69k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 4027|       |
 4028|  2.69k|    avifBool hasTrak = AVIF_FALSE;
  ------------------
  |  |   89|  2.69k|#define AVIF_FALSE 0
  ------------------
 4029|  9.15k|    while (avifROStreamHasBytesLeft(&s, 1)) {
  ------------------
  |  Branch (4029:12): [True: 6.77k, False: 2.38k]
  ------------------
 4030|  6.77k|        avifBoxHeader header;
 4031|  6.77k|        AVIF_CHECKERR(avifROStreamReadBoxHeader(&s, &header), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  6.77k|    do {                        \
  |  |   46|  6.77k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 37, False: 6.73k]
  |  |  ------------------
  |  |   47|     37|            avifBreakOnError(); \
  |  |   48|     37|            return ERR;         \
  |  |   49|     37|        }                       \
  |  |   50|  6.77k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 6.73k]
  |  |  ------------------
  ------------------
 4032|       |
 4033|  6.73k|        if (!memcmp(header.type, "trak", 4)) {
  ------------------
  |  Branch (4033:13): [True: 3.60k, False: 3.13k]
  ------------------
 4034|  3.60k|            AVIF_CHECKRES(avifParseTrackBox(data, rawOffset + avifROStreamOffset(&s), avifROStreamCurrent(&s), header.size));
  ------------------
  |  |   54|  3.60k|    do {                                  \
  |  |   55|  3.60k|        const avifResult result__ = (A);  \
  |  |   56|  3.60k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 273, False: 3.32k]
  |  |  ------------------
  |  |   57|    273|            avifBreakOnError();           \
  |  |   58|    273|            return result__;              \
  |  |   59|    273|        }                                 \
  |  |   60|  3.60k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 3.32k]
  |  |  ------------------
  ------------------
 4035|  3.32k|            hasTrak = AVIF_TRUE;
  ------------------
  |  |   88|  3.32k|#define AVIF_TRUE 1
  ------------------
 4036|       |
 4037|  3.32k|            const avifTrack * track = &data->tracks.track[data->tracks.count - 1];
 4038|  3.32k|            if (!memcmp(track->handlerType, "pict", 4) || !memcmp(track->handlerType, "vide", 4) ||
  ------------------
  |  Branch (4038:17): [True: 1.98k, False: 1.34k]
  |  Branch (4038:59): [True: 0, False: 1.34k]
  ------------------
 4039|  2.71k|                !memcmp(track->handlerType, "auxv", 4)) {
  ------------------
  |  Branch (4039:17): [True: 729, False: 612]
  ------------------
 4040|  2.71k|                if ((track->width == 0) || (track->height == 0)) {
  ------------------
  |  Branch (4040:21): [True: 3, False: 2.71k]
  |  Branch (4040:44): [True: 1, False: 2.71k]
  ------------------
 4041|      4|                    avifDiagnosticsPrintf(data->diag, "Track ID [%u] has an invalid size [%ux%u]", track->id, track->width, track->height);
 4042|      4|                    return AVIF_RESULT_BMFF_PARSE_FAILED;
 4043|      4|                }
 4044|  2.71k|                if (avifDimensionsTooLarge(track->width, track->height, imageSizeLimit, imageDimensionLimit)) {
  ------------------
  |  Branch (4044:21): [True: 3, False: 2.70k]
  ------------------
 4045|      3|                    avifDiagnosticsPrintf(data->diag,
 4046|      3|                                          "Track ID [%u] dimensions are too large [%ux%u]",
 4047|      3|                                          track->id,
 4048|      3|                                          track->width,
 4049|      3|                                          track->height);
 4050|      3|                    return AVIF_RESULT_BMFF_PARSE_FAILED;
 4051|      3|                }
 4052|  2.71k|            }
 4053|  3.32k|        }
 4054|       |
 4055|  6.45k|        AVIF_CHECKERR(avifROStreamSkip(&s, header.size), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  6.45k|    do {                        \
  |  |   46|  6.45k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 6.45k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  6.45k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 6.45k]
  |  |  ------------------
  ------------------
 4056|  6.45k|    }
 4057|  2.38k|    if (!hasTrak) {
  ------------------
  |  Branch (4057:9): [True: 1, False: 2.38k]
  ------------------
 4058|      1|        avifDiagnosticsPrintf(data->diag, "moov box does not contain any tracks");
 4059|      1|        return AVIF_RESULT_BMFF_PARSE_FAILED;
 4060|      1|    }
 4061|  2.38k|    return AVIF_RESULT_OK;
 4062|  2.38k|}
read.c:avifParseTrackBox:
 3940|  3.60k|{
 3941|  3.60k|    BEGIN_STREAM(s, raw, rawLen, data->diag, "Box[trak]");
  ------------------
  |  |  738|  3.60k|    avifROStream VARNAME;                               \
  |  |  739|  3.60k|    avifROData VARNAME##_roData;                        \
  |  |  740|  3.60k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  3.60k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  3.60k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3942|       |
 3943|  3.60k|    avifTrack * track = avifDecoderDataCreateTrack(data);
 3944|  3.60k|    AVIF_CHECKERR(track != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  3.60k|    do {                        \
  |  |   46|  3.60k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 3.60k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  3.60k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.60k]
  |  |  ------------------
  ------------------
 3945|       |
 3946|  3.60k|    avifBool edtsBoxSeen = AVIF_FALSE;
  ------------------
  |  |   89|  3.60k|#define AVIF_FALSE 0
  ------------------
 3947|  3.60k|    avifBool tkhdSeen = AVIF_FALSE;
  ------------------
  |  |   89|  3.60k|#define AVIF_FALSE 0
  ------------------
 3948|  15.1k|    while (avifROStreamHasBytesLeft(&s, 1)) {
  ------------------
  |  Branch (3948:12): [True: 11.7k, False: 3.33k]
  ------------------
 3949|  11.7k|        avifBoxHeader header;
 3950|  11.7k|        AVIF_CHECKERR(avifROStreamReadBoxHeader(&s, &header), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  11.7k|    do {                        \
  |  |   46|  11.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 42, False: 11.7k]
  |  |  ------------------
  |  |   47|     42|            avifBreakOnError(); \
  |  |   48|     42|            return ERR;         \
  |  |   49|     42|        }                       \
  |  |   50|  11.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 11.7k]
  |  |  ------------------
  ------------------
 3951|       |
 3952|  11.7k|        if (!memcmp(header.type, "tkhd", 4)) {
  ------------------
  |  Branch (3952:13): [True: 3.57k, False: 8.17k]
  ------------------
 3953|  3.57k|            if (tkhdSeen) {
  ------------------
  |  Branch (3953:17): [True: 1, False: 3.57k]
  ------------------
 3954|      1|                avifDiagnosticsPrintf(data->diag, "Box[trak] contains a duplicate unique box of type 'tkhd'");
 3955|      1|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 3956|      1|            }
 3957|  3.57k|            AVIF_CHECKERR(avifParseTrackHeaderBox(track, avifROStreamCurrent(&s), header.size, data->diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  3.57k|    do {                        \
  |  |   46|  3.57k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 14, False: 3.56k]
  |  |  ------------------
  |  |   47|     14|            avifBreakOnError(); \
  |  |   48|     14|            return ERR;         \
  |  |   49|     14|        }                       \
  |  |   50|  3.57k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.56k]
  |  |  ------------------
  ------------------
 3958|  3.56k|            tkhdSeen = AVIF_TRUE;
  ------------------
  |  |   88|  3.56k|#define AVIF_TRUE 1
  ------------------
 3959|  8.17k|        } else if (!memcmp(header.type, "meta", 4)) {
  ------------------
  |  Branch (3959:20): [True: 382, False: 7.79k]
  ------------------
 3960|    382|            AVIF_CHECKRES(
  ------------------
  |  |   54|    382|    do {                                  \
  |  |   55|    382|        const avifResult result__ = (A);  \
  |  |   56|    382|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 2, False: 380]
  |  |  ------------------
  |  |   57|      2|            avifBreakOnError();           \
  |  |   58|      2|            return result__;              \
  |  |   59|      2|        }                                 \
  |  |   60|    382|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 380]
  |  |  ------------------
  ------------------
 3961|    382|                avifParseMetaBox(track->meta, rawOffset + avifROStreamOffset(&s), avifROStreamCurrent(&s), header.size, data->diag));
 3962|  7.79k|        } else if (!memcmp(header.type, "mdia", 4)) {
  ------------------
  |  Branch (3962:20): [True: 3.38k, False: 4.41k]
  ------------------
 3963|  3.38k|            AVIF_CHECKRES(avifParseMediaBox(track, rawOffset + avifROStreamOffset(&s), avifROStreamCurrent(&s), header.size, data->diag));
  ------------------
  |  |   54|  3.38k|    do {                                  \
  |  |   55|  3.38k|        const avifResult result__ = (A);  \
  |  |   56|  3.38k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 183, False: 3.19k]
  |  |  ------------------
  |  |   57|    183|            avifBreakOnError();           \
  |  |   58|    183|            return result__;              \
  |  |   59|    183|        }                                 \
  |  |   60|  3.38k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 3.19k]
  |  |  ------------------
  ------------------
 3964|  4.41k|        } else if (!memcmp(header.type, "tref", 4)) {
  ------------------
  |  Branch (3964:20): [True: 767, False: 3.64k]
  ------------------
 3965|    767|            AVIF_CHECKERR(avifTrackReferenceBox(track, avifROStreamCurrent(&s), header.size, data->diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|    767|    do {                        \
  |  |   46|    767|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 3, False: 764]
  |  |  ------------------
  |  |   47|      3|            avifBreakOnError(); \
  |  |   48|      3|            return ERR;         \
  |  |   49|      3|        }                       \
  |  |   50|    767|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 764]
  |  |  ------------------
  ------------------
 3966|  3.64k|        } else if (!memcmp(header.type, "edts", 4)) {
  ------------------
  |  Branch (3966:20): [True: 3.29k, False: 352]
  ------------------
 3967|  3.29k|            if (edtsBoxSeen) {
  ------------------
  |  Branch (3967:17): [True: 1, False: 3.29k]
  ------------------
 3968|      1|                avifDiagnosticsPrintf(data->diag, "Box[trak] contains a duplicate unique box of type 'edts'");
 3969|      1|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 3970|      1|            }
 3971|  3.29k|            AVIF_CHECKERR(avifParseEditBox(track, avifROStreamCurrent(&s), header.size, data->diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  3.29k|    do {                        \
  |  |   46|  3.29k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 24, False: 3.27k]
  |  |  ------------------
  |  |   47|     24|            avifBreakOnError(); \
  |  |   48|     24|            return ERR;         \
  |  |   49|     24|        }                       \
  |  |   50|  3.29k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.27k]
  |  |  ------------------
  ------------------
 3972|  3.27k|            edtsBoxSeen = AVIF_TRUE;
  ------------------
  |  |   88|  3.27k|#define AVIF_TRUE 1
  ------------------
 3973|  3.27k|        }
 3974|       |
 3975|  11.5k|        AVIF_CHECKERR(avifROStreamSkip(&s, header.size), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  11.5k|    do {                        \
  |  |   46|  11.5k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 11.5k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  11.5k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 11.5k]
  |  |  ------------------
  ------------------
 3976|  11.5k|    }
 3977|  3.33k|    if (!tkhdSeen) {
  ------------------
  |  Branch (3977:9): [True: 2, False: 3.32k]
  ------------------
 3978|      2|        avifDiagnosticsPrintf(data->diag, "Box[trak] does not contain a mandatory [tkhd] box");
 3979|      2|        return AVIF_RESULT_BMFF_PARSE_FAILED;
 3980|      2|    }
 3981|  3.32k|    if (!edtsBoxSeen) {
  ------------------
  |  Branch (3981:9): [True: 228, False: 3.10k]
  ------------------
 3982|    228|        track->repetitionCount = AVIF_REPETITION_COUNT_UNKNOWN;
  ------------------
  |  |  122|    228|#define AVIF_REPETITION_COUNT_UNKNOWN -2
  ------------------
 3983|  3.10k|    } else if (track->isRepeating) {
  ------------------
  |  Branch (3983:16): [True: 1.59k, False: 1.50k]
  ------------------
 3984|  1.59k|        if (track->trackDuration == AVIF_INDEFINITE_DURATION64) {
  ------------------
  |  |  828|  1.59k|#define AVIF_INDEFINITE_DURATION64 UINT64_MAX
  ------------------
  |  Branch (3984:13): [True: 1.41k, False: 184]
  ------------------
 3985|       |            // If isRepeating is true and the track duration is unknown/indefinite, then set the repetition count to infinite
 3986|       |            // (Section 9.6.1 of ISO/IEC 23008-12 Part 12).
 3987|  1.41k|            track->repetitionCount = AVIF_REPETITION_COUNT_INFINITE;
  ------------------
  |  |  119|  1.41k|#define AVIF_REPETITION_COUNT_INFINITE -1
  ------------------
 3988|  1.41k|        } else {
 3989|       |            // Section 9.6.1. of ISO/IEC 23008-12 Part 12: 1, the entire edit list is repeated a sufficient number of times to
 3990|       |            // equal the track duration.
 3991|       |            //
 3992|       |            // Since libavif uses repetitionCount (which is 0-based), we subtract the value by 1 to derive the number of
 3993|       |            // repetitions.
 3994|    184|            AVIF_ASSERT_OR_RETURN(track->segmentDuration != 0);
  ------------------
  |  |   64|    184|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|    184|    do {                        \
  |  |  |  |   46|    184|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 184]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|    184|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 184]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3995|       |            // We specifically check for trackDuration == 0 here and not when it is actually read in order to accept files which
 3996|       |            // inadvertently has a trackDuration of 0 without any edit lists.
 3997|    184|            if (track->trackDuration == 0) {
  ------------------
  |  Branch (3997:17): [True: 1, False: 183]
  ------------------
 3998|      1|                avifDiagnosticsPrintf(data->diag, "Invalid track duration 0.");
 3999|      1|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 4000|      1|            }
 4001|    183|            const uint64_t repetitionCount =
 4002|    183|                (track->trackDuration / track->segmentDuration) + (track->trackDuration % track->segmentDuration != 0) - 1;
 4003|    183|            if (repetitionCount > INT_MAX) {
  ------------------
  |  Branch (4003:17): [True: 134, False: 49]
  ------------------
 4004|       |                // repetitionCount does not fit in an integer and hence it is
 4005|       |                // likely to be a very large value. So, we just set it to
 4006|       |                // infinite.
 4007|    134|                track->repetitionCount = AVIF_REPETITION_COUNT_INFINITE;
  ------------------
  |  |  119|    134|#define AVIF_REPETITION_COUNT_INFINITE -1
  ------------------
 4008|    134|            } else {
 4009|     49|                track->repetitionCount = (int)repetitionCount;
 4010|     49|            }
 4011|    183|        }
 4012|  1.59k|    } else {
 4013|  1.50k|        track->repetitionCount = 0;
 4014|  1.50k|    }
 4015|       |
 4016|  3.32k|    return AVIF_RESULT_OK;
 4017|  3.32k|}
read.c:avifDecoderDataCreateTrack:
 1071|  3.60k|{
 1072|  3.60k|    avifTrack * track = (avifTrack *)avifArrayPush(&data->tracks);
 1073|  3.60k|    if (track == NULL) {
  ------------------
  |  Branch (1073:9): [True: 0, False: 3.60k]
  ------------------
 1074|      0|        return NULL;
 1075|      0|    }
 1076|  3.60k|    track->meta = avifMetaCreate();
 1077|  3.60k|    if (track->meta == NULL) {
  ------------------
  |  Branch (1077:9): [True: 0, False: 3.60k]
  ------------------
 1078|      0|        avifArrayPop(&data->tracks);
 1079|      0|        return NULL;
 1080|      0|    }
 1081|  3.60k|    return track;
 1082|  3.60k|}
read.c:avifParseTrackHeaderBox:
 3519|  3.57k|{
 3520|  3.57k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[tkhd]");
  ------------------
  |  |  738|  3.57k|    avifROStream VARNAME;                               \
  |  |  739|  3.57k|    avifROData VARNAME##_roData;                        \
  |  |  740|  3.57k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  3.57k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  3.57k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3521|       |
 3522|  3.57k|    uint8_t version;
 3523|  3.57k|    AVIF_CHECK(avifROStreamReadVersionAndFlags(&s, &version, NULL));
  ------------------
  |  |   36|  3.57k|    do {                        \
  |  |   37|  3.57k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 3.57k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  3.57k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 3.57k]
  |  |  ------------------
  ------------------
 3524|       |
 3525|  3.57k|    uint32_t ignored32, trackID;
 3526|  3.57k|    uint64_t ignored64;
 3527|  3.57k|    if (version == 1) {
  ------------------
  |  Branch (3527:9): [True: 3.25k, False: 316]
  ------------------
 3528|  3.25k|        AVIF_CHECK(avifROStreamReadU64(&s, &ignored64));            // unsigned int(64) creation_time;
  ------------------
  |  |   36|  3.25k|    do {                        \
  |  |   37|  3.25k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 3.25k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  3.25k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 3.25k]
  |  |  ------------------
  ------------------
 3529|  3.25k|        AVIF_CHECK(avifROStreamReadU64(&s, &ignored64));            // unsigned int(64) modification_time;
  ------------------
  |  |   36|  3.25k|    do {                        \
  |  |   37|  3.25k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 3.25k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  3.25k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 3.25k]
  |  |  ------------------
  ------------------
 3530|  3.25k|        AVIF_CHECK(avifROStreamReadU32(&s, &trackID));              // unsigned int(32) track_ID;
  ------------------
  |  |   36|  3.25k|    do {                        \
  |  |   37|  3.25k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 3.25k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  3.25k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 3.25k]
  |  |  ------------------
  ------------------
 3531|  3.25k|        AVIF_CHECK(avifROStreamReadU32(&s, &ignored32));            // const unsigned int(32) reserved = 0;
  ------------------
  |  |   36|  3.25k|    do {                        \
  |  |   37|  3.25k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 3.25k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  3.25k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 3.25k]
  |  |  ------------------
  ------------------
 3532|  3.25k|        AVIF_CHECK(avifROStreamReadU64(&s, &track->trackDuration)); // unsigned int(64) duration;
  ------------------
  |  |   36|  3.25k|    do {                        \
  |  |   37|  3.25k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 3.25k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  3.25k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 3.25k]
  |  |  ------------------
  ------------------
 3533|  3.25k|    } else if (version == 0) {
  ------------------
  |  Branch (3533:16): [True: 315, False: 1]
  ------------------
 3534|    315|        uint32_t trackDuration;
 3535|    315|        AVIF_CHECK(avifROStreamReadU32(&s, &ignored32));     // unsigned int(32) creation_time;
  ------------------
  |  |   36|    315|    do {                        \
  |  |   37|    315|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 314]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    315|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 314]
  |  |  ------------------
  ------------------
 3536|    314|        AVIF_CHECK(avifROStreamReadU32(&s, &ignored32));     // unsigned int(32) modification_time;
  ------------------
  |  |   36|    314|    do {                        \
  |  |   37|    314|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 314]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|    314|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 314]
  |  |  ------------------
  ------------------
 3537|    314|        AVIF_CHECK(avifROStreamReadU32(&s, &trackID));       // unsigned int(32) track_ID;
  ------------------
  |  |   36|    314|    do {                        \
  |  |   37|    314|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 313]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    314|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 313]
  |  |  ------------------
  ------------------
 3538|    313|        AVIF_CHECK(avifROStreamReadU32(&s, &ignored32));     // const unsigned int(32) reserved = 0;
  ------------------
  |  |   36|    313|    do {                        \
  |  |   37|    313|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 312]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    313|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 312]
  |  |  ------------------
  ------------------
 3539|    312|        AVIF_CHECK(avifROStreamReadU32(&s, &trackDuration)); // unsigned int(32) duration;
  ------------------
  |  |   36|    312|    do {                        \
  |  |   37|    312|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 311]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    312|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 311]
  |  |  ------------------
  ------------------
 3540|    311|        track->trackDuration = (trackDuration == AVIF_INDEFINITE_DURATION32) ? AVIF_INDEFINITE_DURATION64 : trackDuration;
  ------------------
  |  |  829|    311|#define AVIF_INDEFINITE_DURATION32 UINT32_MAX
  ------------------
                      track->trackDuration = (trackDuration == AVIF_INDEFINITE_DURATION32) ? AVIF_INDEFINITE_DURATION64 : trackDuration;
  ------------------
  |  |  828|    311|#define AVIF_INDEFINITE_DURATION64 UINT64_MAX
  ------------------
  |  Branch (3540:32): [True: 1, False: 310]
  ------------------
 3541|    311|    } else {
 3542|       |        // Unsupported version
 3543|      1|        avifDiagnosticsPrintf(diag, "Box[tkhd] has an unsupported version [%u]", version);
 3544|      1|        return AVIF_FALSE;
  ------------------
  |  |   89|      1|#define AVIF_FALSE 0
  ------------------
 3545|      1|    }
 3546|  3.56k|    track->id = trackID;
 3547|       |
 3548|       |    // Skipping the following 52 bytes here:
 3549|       |    // ------------------------------------
 3550|       |    // const unsigned int(32)[2] reserved = 0;
 3551|       |    // template int(16) layer = 0;
 3552|       |    // template int(16) alternate_group = 0;
 3553|       |    // template int(16) volume = {if track_is_audio 0x0100 else 0};
 3554|       |    // const unsigned int(16) reserved = 0;
 3555|       |    // template int(32)[9] matrix= { 0x00010000,0,0,0,0x00010000,0,0,0,0x40000000 }; // unity matrix
 3556|  3.56k|    AVIF_CHECK(avifROStreamSkip(&s, 52));
  ------------------
  |  |   36|  3.56k|    do {                        \
  |  |   37|  3.56k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 3.56k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  3.56k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 3.56k]
  |  |  ------------------
  ------------------
 3557|       |
 3558|  3.56k|    uint32_t width, height;
 3559|  3.56k|    AVIF_CHECK(avifROStreamReadU32(&s, &width));  // unsigned int(32) width;
  ------------------
  |  |   36|  3.56k|    do {                        \
  |  |   37|  3.56k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 3.56k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  3.56k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 3.56k]
  |  |  ------------------
  ------------------
 3560|  3.56k|    AVIF_CHECK(avifROStreamReadU32(&s, &height)); // unsigned int(32) height;
  ------------------
  |  |   36|  3.56k|    do {                        \
  |  |   37|  3.56k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 3.56k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  3.56k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 3.56k]
  |  |  ------------------
  ------------------
 3561|  3.56k|    track->width = width >> 16;
 3562|  3.56k|    track->height = height >> 16;
 3563|       |
 3564|       |    // TODO: support scaling based on width/height track header info?
 3565|       |
 3566|  3.56k|    return AVIF_TRUE;
  ------------------
  |  |   88|  3.56k|#define AVIF_TRUE 1
  ------------------
 3567|  3.56k|}
read.c:avifParseMediaBox:
 3827|  3.38k|{
 3828|  3.38k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[mdia]");
  ------------------
  |  |  738|  3.38k|    avifROStream VARNAME;                               \
  |  |  739|  3.38k|    avifROData VARNAME##_roData;                        \
  |  |  740|  3.38k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  3.38k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  3.38k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3829|       |
 3830|  13.3k|    while (avifROStreamHasBytesLeft(&s, 1)) {
  ------------------
  |  Branch (3830:12): [True: 10.1k, False: 3.19k]
  ------------------
 3831|  10.1k|        avifBoxHeader header;
 3832|  10.1k|        AVIF_CHECKERR(avifROStreamReadBoxHeader(&s, &header), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  10.1k|    do {                        \
  |  |   46|  10.1k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 12, False: 10.1k]
  |  |  ------------------
  |  |   47|     12|            avifBreakOnError(); \
  |  |   48|     12|            return ERR;         \
  |  |   49|     12|        }                       \
  |  |   50|  10.1k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 10.1k]
  |  |  ------------------
  ------------------
 3833|       |
 3834|  10.1k|        if (!memcmp(header.type, "mdhd", 4)) {
  ------------------
  |  Branch (3834:13): [True: 3.07k, False: 7.04k]
  ------------------
 3835|  3.07k|            AVIF_CHECKERR(avifParseMediaHeaderBox(track, avifROStreamCurrent(&s), header.size, diag), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  3.07k|    do {                        \
  |  |   46|  3.07k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 10, False: 3.06k]
  |  |  ------------------
  |  |   47|     10|            avifBreakOnError(); \
  |  |   48|     10|            return ERR;         \
  |  |   49|     10|        }                       \
  |  |   50|  3.07k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.06k]
  |  |  ------------------
  ------------------
 3836|  7.04k|        } else if (!memcmp(header.type, "minf", 4)) {
  ------------------
  |  Branch (3836:20): [True: 3.21k, False: 3.83k]
  ------------------
 3837|  3.21k|            AVIF_CHECKRES(
  ------------------
  |  |   54|  3.21k|    do {                                  \
  |  |   55|  3.21k|        const avifResult result__ = (A);  \
  |  |   56|  3.21k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 159, False: 3.05k]
  |  |  ------------------
  |  |   57|    159|            avifBreakOnError();           \
  |  |   58|    159|            return result__;              \
  |  |   59|    159|        }                                 \
  |  |   60|  3.21k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 3.05k]
  |  |  ------------------
  ------------------
 3838|  3.21k|                avifParseMediaInformationBox(track, rawOffset + avifROStreamOffset(&s), avifROStreamCurrent(&s), header.size, diag));
 3839|  3.83k|        } else if (!memcmp(header.type, "hdlr", 4)) {
  ------------------
  |  Branch (3839:20): [True: 3.29k, False: 546]
  ------------------
 3840|  3.29k|            AVIF_CHECKERR(avifParseHandlerBox(avifROStreamCurrent(&s), header.size, track->handlerType, diag),
  ------------------
  |  |   45|  3.29k|    do {                        \
  |  |   46|  3.29k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 3.28k]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|  3.29k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.28k]
  |  |  ------------------
  ------------------
 3841|  3.29k|                          AVIF_RESULT_BMFF_PARSE_FAILED);
 3842|  3.29k|        }
 3843|       |
 3844|  9.95k|        AVIF_CHECKERR(avifROStreamSkip(&s, header.size), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  9.95k|    do {                        \
  |  |   46|  9.95k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 9.95k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  9.95k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 9.95k]
  |  |  ------------------
  ------------------
 3845|  9.95k|    }
 3846|  3.19k|    return AVIF_RESULT_OK;
 3847|  3.38k|}
read.c:avifParseMediaHeaderBox:
 3570|  3.07k|{
 3571|  3.07k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[mdhd]");
  ------------------
  |  |  738|  3.07k|    avifROStream VARNAME;                               \
  |  |  739|  3.07k|    avifROData VARNAME##_roData;                        \
  |  |  740|  3.07k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  3.07k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  3.07k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3572|       |
 3573|  3.07k|    uint8_t version;
 3574|  3.07k|    AVIF_CHECK(avifROStreamReadVersionAndFlags(&s, &version, NULL));
  ------------------
  |  |   36|  3.07k|    do {                        \
  |  |   37|  3.07k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 3.07k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  3.07k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 3.07k]
  |  |  ------------------
  ------------------
 3575|       |
 3576|  3.07k|    uint32_t ignored32, mediaTimescale, mediaDuration32;
 3577|  3.07k|    uint64_t ignored64, mediaDuration64;
 3578|  3.07k|    if (version == 1) {
  ------------------
  |  Branch (3578:9): [True: 2.80k, False: 277]
  ------------------
 3579|  2.80k|        AVIF_CHECK(avifROStreamReadU64(&s, &ignored64));       // unsigned int(64) creation_time;
  ------------------
  |  |   36|  2.80k|    do {                        \
  |  |   37|  2.80k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 2.80k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  2.80k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 2.80k]
  |  |  ------------------
  ------------------
 3580|  2.80k|        AVIF_CHECK(avifROStreamReadU64(&s, &ignored64));       // unsigned int(64) modification_time;
  ------------------
  |  |   36|  2.80k|    do {                        \
  |  |   37|  2.80k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 2.79k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  2.80k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 2.79k]
  |  |  ------------------
  ------------------
 3581|  2.79k|        AVIF_CHECK(avifROStreamReadU32(&s, &mediaTimescale));  // unsigned int(32) timescale;
  ------------------
  |  |   36|  2.79k|    do {                        \
  |  |   37|  2.79k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 2.79k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  2.79k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 2.79k]
  |  |  ------------------
  ------------------
 3582|  2.79k|        AVIF_CHECK(avifROStreamReadU64(&s, &mediaDuration64)); // unsigned int(64) duration;
  ------------------
  |  |   36|  2.79k|    do {                        \
  |  |   37|  2.79k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 2.79k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  2.79k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 2.79k]
  |  |  ------------------
  ------------------
 3583|  2.79k|        track->mediaDuration = mediaDuration64;
 3584|  2.79k|    } else if (version == 0) {
  ------------------
  |  Branch (3584:16): [True: 276, False: 1]
  ------------------
 3585|    276|        AVIF_CHECK(avifROStreamReadU32(&s, &ignored32));       // unsigned int(32) creation_time;
  ------------------
  |  |   36|    276|    do {                        \
  |  |   37|    276|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 275]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    276|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 275]
  |  |  ------------------
  ------------------
 3586|    275|        AVIF_CHECK(avifROStreamReadU32(&s, &ignored32));       // unsigned int(32) modification_time;
  ------------------
  |  |   36|    275|    do {                        \
  |  |   37|    275|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 274]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    275|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 274]
  |  |  ------------------
  ------------------
 3587|    274|        AVIF_CHECK(avifROStreamReadU32(&s, &mediaTimescale));  // unsigned int(32) timescale;
  ------------------
  |  |   36|    274|    do {                        \
  |  |   37|    274|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 273]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    274|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 273]
  |  |  ------------------
  ------------------
 3588|    273|        AVIF_CHECK(avifROStreamReadU32(&s, &mediaDuration32)); // unsigned int(32) duration;
  ------------------
  |  |   36|    273|    do {                        \
  |  |   37|    273|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 272]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    273|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 272]
  |  |  ------------------
  ------------------
 3589|    272|        track->mediaDuration = (uint64_t)mediaDuration32;
 3590|    272|    } else {
 3591|       |        // Unsupported version
 3592|      1|        avifDiagnosticsPrintf(diag, "Box[mdhd] has an unsupported version [%u]", version);
 3593|      1|        return AVIF_FALSE;
  ------------------
  |  |   89|      1|#define AVIF_FALSE 0
  ------------------
 3594|      1|    }
 3595|       |
 3596|  3.06k|    track->mediaTimescale = mediaTimescale;
 3597|  3.06k|    return AVIF_TRUE;
  ------------------
  |  |   88|  3.06k|#define AVIF_TRUE 1
  ------------------
 3598|  3.07k|}
read.c:avifParseMediaInformationBox:
 3810|  3.21k|{
 3811|  3.21k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[minf]");
  ------------------
  |  |  738|  3.21k|    avifROStream VARNAME;                               \
  |  |  739|  3.21k|    avifROData VARNAME##_roData;                        \
  |  |  740|  3.21k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  3.21k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  3.21k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3812|       |
 3813|  12.6k|    while (avifROStreamHasBytesLeft(&s, 1)) {
  ------------------
  |  Branch (3813:12): [True: 9.64k, False: 3.05k]
  ------------------
 3814|  9.64k|        avifBoxHeader header;
 3815|  9.64k|        AVIF_CHECKERR(avifROStreamReadBoxHeader(&s, &header), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  9.64k|    do {                        \
  |  |   46|  9.64k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 6, False: 9.63k]
  |  |  ------------------
  |  |   47|      6|            avifBreakOnError(); \
  |  |   48|      6|            return ERR;         \
  |  |   49|      6|        }                       \
  |  |   50|  9.64k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 9.63k]
  |  |  ------------------
  ------------------
 3816|       |
 3817|  9.63k|        if (!memcmp(header.type, "stbl", 4)) {
  ------------------
  |  Branch (3817:13): [True: 3.14k, False: 6.49k]
  ------------------
 3818|  3.14k|            AVIF_CHECKRES(avifParseSampleTableBox(track, rawOffset + avifROStreamOffset(&s), avifROStreamCurrent(&s), header.size, diag));
  ------------------
  |  |   54|  3.14k|    do {                                  \
  |  |   55|  3.14k|        const avifResult result__ = (A);  \
  |  |   56|  3.14k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 153, False: 2.98k]
  |  |  ------------------
  |  |   57|    153|            avifBreakOnError();           \
  |  |   58|    153|            return result__;              \
  |  |   59|    153|        }                                 \
  |  |   60|  3.14k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 2.98k]
  |  |  ------------------
  ------------------
 3819|  3.14k|        }
 3820|       |
 3821|  9.48k|        AVIF_CHECKERR(avifROStreamSkip(&s, header.size), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  9.48k|    do {                        \
  |  |   46|  9.48k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 9.48k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  9.48k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 9.48k]
  |  |  ------------------
  ------------------
 3822|  9.48k|    }
 3823|  3.05k|    return AVIF_RESULT_OK;
 3824|  3.21k|}
read.c:avifParseSampleTableBox:
 3769|  3.14k|{
 3770|  3.14k|    if (track->sampleTable) {
  ------------------
  |  Branch (3770:9): [True: 0, False: 3.14k]
  ------------------
 3771|       |        // A TrackBox may only have one SampleTable
 3772|      0|        avifDiagnosticsPrintf(diag, "Duplicate Box[stbl] for a single track detected");
 3773|      0|        return AVIF_RESULT_BMFF_PARSE_FAILED;
 3774|      0|    }
 3775|  3.14k|    track->sampleTable = avifSampleTableCreate();
 3776|  3.14k|    AVIF_CHECKERR(track->sampleTable != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  3.14k|    do {                        \
  |  |   46|  3.14k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 3.14k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  3.14k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.14k]
  |  |  ------------------
  ------------------
 3777|       |
 3778|  3.14k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[stbl]");
  ------------------
  |  |  738|  3.14k|    avifROStream VARNAME;                               \
  |  |  739|  3.14k|    avifROData VARNAME##_roData;                        \
  |  |  740|  3.14k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  3.14k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  3.14k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3779|       |
 3780|  21.3k|    while (avifROStreamHasBytesLeft(&s, 1)) {
  ------------------
  |  Branch (3780:12): [True: 18.3k, False: 2.98k]
  ------------------
 3781|  18.3k|        avifBoxHeader header;
 3782|  18.3k|        AVIF_CHECKERR(avifROStreamReadBoxHeader(&s, &header), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  18.3k|    do {                        \
  |  |   46|  18.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 25, False: 18.3k]
  |  |  ------------------
  |  |   47|     25|            avifBreakOnError(); \
  |  |   48|     25|            return ERR;         \
  |  |   49|     25|        }                       \
  |  |   50|  18.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 18.3k]
  |  |  ------------------
  ------------------
 3783|       |
 3784|  18.3k|        if (!memcmp(header.type, "stco", 4)) {
  ------------------
  |  Branch (3784:13): [True: 2.97k, False: 15.3k]
  ------------------
 3785|  2.97k|            AVIF_CHECKRES(avifParseChunkOffsetBox(track->sampleTable, AVIF_FALSE, avifROStreamCurrent(&s), header.size, diag));
  ------------------
  |  |   54|  2.97k|    do {                                  \
  |  |   55|  2.97k|        const avifResult result__ = (A);  \
  |  |   56|  2.97k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 21, False: 2.95k]
  |  |  ------------------
  |  |   57|     21|            avifBreakOnError();           \
  |  |   58|     21|            return result__;              \
  |  |   59|     21|        }                                 \
  |  |   60|  2.97k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 2.95k]
  |  |  ------------------
  ------------------
 3786|  15.3k|        } else if (!memcmp(header.type, "co64", 4)) {
  ------------------
  |  Branch (3786:20): [True: 0, False: 15.3k]
  ------------------
 3787|      0|            AVIF_CHECKRES(avifParseChunkOffsetBox(track->sampleTable, AVIF_TRUE, avifROStreamCurrent(&s), header.size, diag));
  ------------------
  |  |   54|      0|    do {                                  \
  |  |   55|      0|        const avifResult result__ = (A);  \
  |  |   56|      0|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 3788|  15.3k|        } else if (!memcmp(header.type, "stsc", 4)) {
  ------------------
  |  Branch (3788:20): [True: 3.06k, False: 12.3k]
  ------------------
 3789|  3.06k|            AVIF_CHECKRES(avifParseSampleToChunkBox(track->sampleTable, avifROStreamCurrent(&s), header.size, diag));
  ------------------
  |  |   54|  3.06k|    do {                                  \
  |  |   55|  3.06k|        const avifResult result__ = (A);  \
  |  |   56|  3.06k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 32, False: 3.03k]
  |  |  ------------------
  |  |   57|     32|            avifBreakOnError();           \
  |  |   58|     32|            return result__;              \
  |  |   59|     32|        }                                 \
  |  |   60|  3.06k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 3.03k]
  |  |  ------------------
  ------------------
 3790|  12.3k|        } else if (!memcmp(header.type, "stsz", 4)) {
  ------------------
  |  Branch (3790:20): [True: 3.02k, False: 9.27k]
  ------------------
 3791|  3.02k|            AVIF_CHECKRES(avifParseSampleSizeBox(track->sampleTable, avifROStreamCurrent(&s), header.size, diag));
  ------------------
  |  |   54|  3.02k|    do {                                  \
  |  |   55|  3.02k|        const avifResult result__ = (A);  \
  |  |   56|  3.02k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 11, False: 3.01k]
  |  |  ------------------
  |  |   57|     11|            avifBreakOnError();           \
  |  |   58|     11|            return result__;              \
  |  |   59|     11|        }                                 \
  |  |   60|  3.02k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 3.01k]
  |  |  ------------------
  ------------------
 3792|  9.27k|        } else if (!memcmp(header.type, "stss", 4)) {
  ------------------
  |  Branch (3792:20): [True: 2.79k, False: 6.48k]
  ------------------
 3793|  2.79k|            AVIF_CHECKRES(avifParseSyncSampleBox(track->sampleTable, avifROStreamCurrent(&s), header.size, diag));
  ------------------
  |  |   54|  2.79k|    do {                                  \
  |  |   55|  2.79k|        const avifResult result__ = (A);  \
  |  |   56|  2.79k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 16, False: 2.78k]
  |  |  ------------------
  |  |   57|     16|            avifBreakOnError();           \
  |  |   58|     16|            return result__;              \
  |  |   59|     16|        }                                 \
  |  |   60|  2.79k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 2.78k]
  |  |  ------------------
  ------------------
 3794|  6.48k|        } else if (!memcmp(header.type, "stts", 4)) {
  ------------------
  |  Branch (3794:20): [True: 3.00k, False: 3.47k]
  ------------------
 3795|  3.00k|            AVIF_CHECKRES(avifParseTimeToSampleBox(track->sampleTable, avifROStreamCurrent(&s), header.size, diag));
  ------------------
  |  |   54|  3.00k|    do {                                  \
  |  |   55|  3.00k|        const avifResult result__ = (A);  \
  |  |   56|  3.00k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 19, False: 2.98k]
  |  |  ------------------
  |  |   57|     19|            avifBreakOnError();           \
  |  |   58|     19|            return result__;              \
  |  |   59|     19|        }                                 \
  |  |   60|  3.00k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 2.98k]
  |  |  ------------------
  ------------------
 3796|  3.47k|        } else if (!memcmp(header.type, "stsd", 4)) {
  ------------------
  |  Branch (3796:20): [True: 3.09k, False: 385]
  ------------------
 3797|  3.09k|            AVIF_CHECKRES(avifParseSampleDescriptionBox(track->sampleTable,
  ------------------
  |  |   54|  3.09k|    do {                                  \
  |  |   55|  3.09k|        const avifResult result__ = (A);  \
  |  |   56|  3.09k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 29, False: 3.06k]
  |  |  ------------------
  |  |   57|     29|            avifBreakOnError();           \
  |  |   58|     29|            return result__;              \
  |  |   59|     29|        }                                 \
  |  |   60|  3.09k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 3.06k]
  |  |  ------------------
  ------------------
 3798|  3.09k|                                                        rawOffset + avifROStreamOffset(&s),
 3799|  3.09k|                                                        avifROStreamCurrent(&s),
 3800|  3.09k|                                                        header.size,
 3801|  3.09k|                                                        diag));
 3802|  3.09k|        }
 3803|       |
 3804|  18.2k|        AVIF_CHECKERR(avifROStreamSkip(&s, header.size), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  18.2k|    do {                        \
  |  |   46|  18.2k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 18.2k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  18.2k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 18.2k]
  |  |  ------------------
  ------------------
 3805|  18.2k|    }
 3806|  2.98k|    return AVIF_RESULT_OK;
 3807|  3.14k|}
read.c:avifSampleTableCreate:
  313|  3.14k|{
  314|  3.14k|    avifSampleTable * sampleTable = (avifSampleTable *)avifAlloc(sizeof(avifSampleTable));
  315|  3.14k|    if (sampleTable == NULL) {
  ------------------
  |  Branch (315:9): [True: 0, False: 3.14k]
  ------------------
  316|      0|        return NULL;
  317|      0|    }
  318|  3.14k|    memset(sampleTable, 0, sizeof(avifSampleTable));
  319|  3.14k|    if (!avifArrayCreate(&sampleTable->chunks, sizeof(avifSampleTableChunk), 16) ||
  ------------------
  |  Branch (319:9): [True: 0, False: 3.14k]
  ------------------
  320|  3.14k|        !avifArrayCreate(&sampleTable->sampleDescriptions, sizeof(avifSampleDescription), 2) ||
  ------------------
  |  Branch (320:9): [True: 0, False: 3.14k]
  ------------------
  321|  3.14k|        !avifArrayCreate(&sampleTable->sampleToChunks, sizeof(avifSampleTableSampleToChunk), 16) ||
  ------------------
  |  Branch (321:9): [True: 0, False: 3.14k]
  ------------------
  322|  3.14k|        !avifArrayCreate(&sampleTable->sampleSizes, sizeof(avifSampleTableSampleSize), 16) ||
  ------------------
  |  Branch (322:9): [True: 0, False: 3.14k]
  ------------------
  323|  3.14k|        !avifArrayCreate(&sampleTable->timeToSamples, sizeof(avifSampleTableTimeToSample), 16) ||
  ------------------
  |  Branch (323:9): [True: 0, False: 3.14k]
  ------------------
  324|  3.14k|        !avifArrayCreate(&sampleTable->syncSamples, sizeof(avifSyncSample), 16)) {
  ------------------
  |  Branch (324:9): [True: 0, False: 3.14k]
  ------------------
  325|      0|        avifSampleTableDestroy(sampleTable);
  326|      0|        return NULL;
  327|      0|    }
  328|  3.14k|    return sampleTable;
  329|  3.14k|}
read.c:avifParseChunkOffsetBox:
 3601|  2.97k|{
 3602|  2.97k|    BEGIN_STREAM(s, raw, rawLen, diag, largeOffsets ? "Box[co64]" : "Box[stco]");
  ------------------
  |  |  738|  2.97k|    avifROStream VARNAME;                               \
  |  |  739|  2.97k|    avifROData VARNAME##_roData;                        \
  |  |  740|  2.97k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  2.97k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  5.94k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  |  |  ------------------
  |  |  |  Branch (742:58): [True: 0, False: 2.97k]
  |  |  ------------------
  ------------------
 3603|       |
 3604|  2.97k|    AVIF_CHECKERR(avifROStreamReadAndEnforceVersion(&s, /*enforcedVersion=*/0, /*flags=*/NULL), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  2.97k|    do {                        \
  |  |   46|  2.97k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 2.97k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  2.97k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 2.97k]
  |  |  ------------------
  ------------------
 3605|       |
 3606|  2.97k|    uint32_t entryCount;
 3607|  2.97k|    AVIF_CHECKERR(avifROStreamReadU32(&s, &entryCount), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) entry_count;
  ------------------
  |  |   45|  2.97k|    do {                        \
  |  |   46|  2.97k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 2.97k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  2.97k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 2.97k]
  |  |  ------------------
  ------------------
 3608|  5.94k|    for (uint32_t i = 0; i < entryCount; ++i) {
  ------------------
  |  Branch (3608:26): [True: 2.99k, False: 2.95k]
  ------------------
 3609|  2.99k|        uint64_t offset;
 3610|  2.99k|        if (largeOffsets) {
  ------------------
  |  Branch (3610:13): [True: 0, False: 2.99k]
  ------------------
 3611|      0|            AVIF_CHECKERR(avifROStreamReadU64(&s, &offset), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(64) chunk_offset;
  ------------------
  |  |   45|      0|    do {                        \
  |  |   46|      0|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 3612|  2.99k|        } else {
 3613|  2.99k|            uint32_t offset32;
 3614|  2.99k|            AVIF_CHECKERR(avifROStreamReadU32(&s, &offset32), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) chunk_offset;
  ------------------
  |  |   45|  2.99k|    do {                        \
  |  |   46|  2.99k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 19, False: 2.97k]
  |  |  ------------------
  |  |   47|     19|            avifBreakOnError(); \
  |  |   48|     19|            return ERR;         \
  |  |   49|     19|        }                       \
  |  |   50|  2.99k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 2.97k]
  |  |  ------------------
  ------------------
 3615|  2.97k|            offset = (uint64_t)offset32;
 3616|  2.97k|        }
 3617|       |
 3618|  2.97k|        avifSampleTableChunk * chunk = (avifSampleTableChunk *)avifArrayPush(&sampleTable->chunks);
 3619|  2.97k|        AVIF_CHECKERR(chunk != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  2.97k|    do {                        \
  |  |   46|  2.97k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 2.97k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  2.97k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 2.97k]
  |  |  ------------------
  ------------------
 3620|  2.97k|        chunk->offset = offset;
 3621|  2.97k|    }
 3622|  2.95k|    return AVIF_RESULT_OK;
 3623|  2.97k|}
read.c:avifParseSampleToChunkBox:
 3626|  3.06k|{
 3627|  3.06k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[stsc]");
  ------------------
  |  |  738|  3.06k|    avifROStream VARNAME;                               \
  |  |  739|  3.06k|    avifROData VARNAME##_roData;                        \
  |  |  740|  3.06k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  3.06k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  3.06k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3628|       |
 3629|  3.06k|    AVIF_CHECKERR(avifROStreamReadAndEnforceVersion(&s, /*enforcedVersion=*/0, /*flags=*/NULL), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  3.06k|    do {                        \
  |  |   46|  3.06k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 3.06k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  3.06k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.06k]
  |  |  ------------------
  ------------------
 3630|       |
 3631|  3.06k|    uint32_t entryCount;
 3632|  3.06k|    AVIF_CHECKERR(avifROStreamReadU32(&s, &entryCount), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) entry_count;
  ------------------
  |  |   45|  3.06k|    do {                        \
  |  |   46|  3.06k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 3.06k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  3.06k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.06k]
  |  |  ------------------
  ------------------
 3633|  3.06k|    uint32_t prevFirstChunk = 0;
 3634|  6.11k|    for (uint32_t i = 0; i < entryCount; ++i) {
  ------------------
  |  Branch (3634:26): [True: 3.07k, False: 3.03k]
  ------------------
 3635|  3.07k|        avifSampleTableSampleToChunk * sampleToChunk = (avifSampleTableSampleToChunk *)avifArrayPush(&sampleTable->sampleToChunks);
 3636|  3.07k|        AVIF_CHECKERR(sampleToChunk != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  3.07k|    do {                        \
  |  |   46|  3.07k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 3.07k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  3.07k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.07k]
  |  |  ------------------
  ------------------
 3637|  3.07k|        AVIF_CHECKERR(avifROStreamReadU32(&s, &sampleToChunk->firstChunk), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) first_chunk;
  ------------------
  |  |   45|  3.07k|    do {                        \
  |  |   46|  3.07k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 11, False: 3.06k]
  |  |  ------------------
  |  |   47|     11|            avifBreakOnError(); \
  |  |   48|     11|            return ERR;         \
  |  |   49|     11|        }                       \
  |  |   50|  3.07k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.06k]
  |  |  ------------------
  ------------------
 3638|  3.06k|        AVIF_CHECKERR(avifROStreamReadU32(&s, &sampleToChunk->samplesPerChunk), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) samples_per_chunk;
  ------------------
  |  |   45|  3.06k|    do {                        \
  |  |   46|  3.06k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 3.06k]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|  3.06k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.06k]
  |  |  ------------------
  ------------------
 3639|  3.06k|        AVIF_CHECKERR(avifROStreamReadU32(&s, &sampleToChunk->sampleDescriptionIndex),
  ------------------
  |  |   45|  3.06k|    do {                        \
  |  |   46|  3.06k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 3.06k]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|  3.06k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.06k]
  |  |  ------------------
  ------------------
 3640|  3.06k|                      AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) sample_description_index;
 3641|       |        // The first_chunk fields should start with 1 and be strictly increasing.
 3642|  3.06k|        if (i == 0) {
  ------------------
  |  Branch (3642:13): [True: 3.05k, False: 4]
  ------------------
 3643|  3.05k|            if (sampleToChunk->firstChunk != 1) {
  ------------------
  |  Branch (3643:17): [True: 14, False: 3.04k]
  ------------------
 3644|     14|                avifDiagnosticsPrintf(diag, "Box[stsc] does not begin with chunk 1 [%u]", sampleToChunk->firstChunk);
 3645|     14|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 3646|     14|            }
 3647|  3.05k|        } else {
 3648|      4|            if (sampleToChunk->firstChunk <= prevFirstChunk) {
  ------------------
  |  Branch (3648:17): [True: 1, False: 3]
  ------------------
 3649|      1|                avifDiagnosticsPrintf(diag, "Box[stsc] chunks are not strictly increasing");
 3650|      1|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 3651|      1|            }
 3652|      4|        }
 3653|  3.04k|        prevFirstChunk = sampleToChunk->firstChunk;
 3654|  3.04k|    }
 3655|  3.03k|    return AVIF_RESULT_OK;
 3656|  3.06k|}
read.c:avifParseSampleSizeBox:
 3659|  3.02k|{
 3660|  3.02k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[stsz]");
  ------------------
  |  |  738|  3.02k|    avifROStream VARNAME;                               \
  |  |  739|  3.02k|    avifROData VARNAME##_roData;                        \
  |  |  740|  3.02k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  3.02k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  3.02k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3661|       |
 3662|  3.02k|    AVIF_CHECKERR(avifROStreamReadAndEnforceVersion(&s, /*enforcedVersion=*/0, /*flags=*/NULL), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  3.02k|    do {                        \
  |  |   46|  3.02k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 3.02k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  3.02k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.02k]
  |  |  ------------------
  ------------------
 3663|       |
 3664|  3.02k|    uint32_t allSamplesSize, sampleCount;
 3665|  3.02k|    AVIF_CHECKERR(avifROStreamReadU32(&s, &allSamplesSize), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) sample_size;
  ------------------
  |  |   45|  3.02k|    do {                        \
  |  |   46|  3.02k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 3.02k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  3.02k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.02k]
  |  |  ------------------
  ------------------
 3666|  3.02k|    AVIF_CHECKERR(avifROStreamReadU32(&s, &sampleCount), AVIF_RESULT_BMFF_PARSE_FAILED);    // unsigned int(32) sample_count;
  ------------------
  |  |   45|  3.02k|    do {                        \
  |  |   46|  3.02k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 3.02k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  3.02k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.02k]
  |  |  ------------------
  ------------------
 3667|       |
 3668|  3.02k|    if (allSamplesSize > 0) {
  ------------------
  |  Branch (3668:9): [True: 108, False: 2.91k]
  ------------------
 3669|    108|        sampleTable->allSamplesSize = allSamplesSize;
 3670|  2.91k|    } else {
 3671|  17.6k|        for (uint32_t i = 0; i < sampleCount; ++i) {
  ------------------
  |  Branch (3671:30): [True: 14.7k, False: 2.90k]
  ------------------
 3672|  14.7k|            avifSampleTableSampleSize * sampleSize = (avifSampleTableSampleSize *)avifArrayPush(&sampleTable->sampleSizes);
 3673|  14.7k|            AVIF_CHECKERR(sampleSize != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  14.7k|    do {                        \
  |  |   46|  14.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 14.7k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  14.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 14.7k]
  |  |  ------------------
  ------------------
 3674|  14.7k|            AVIF_CHECKERR(avifROStreamReadU32(&s, &sampleSize->size), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) entry_size;
  ------------------
  |  |   45|  14.7k|    do {                        \
  |  |   46|  14.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 8, False: 14.7k]
  |  |  ------------------
  |  |   47|      8|            avifBreakOnError(); \
  |  |   48|      8|            return ERR;         \
  |  |   49|      8|        }                       \
  |  |   50|  14.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 14.7k]
  |  |  ------------------
  ------------------
 3675|  14.7k|        }
 3676|  2.91k|    }
 3677|  3.01k|    return AVIF_RESULT_OK;
 3678|  3.02k|}
read.c:avifParseSyncSampleBox:
 3681|  2.79k|{
 3682|  2.79k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[stss]");
  ------------------
  |  |  738|  2.79k|    avifROStream VARNAME;                               \
  |  |  739|  2.79k|    avifROData VARNAME##_roData;                        \
  |  |  740|  2.79k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  2.79k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  2.79k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3683|       |
 3684|  2.79k|    AVIF_CHECKERR(avifROStreamReadAndEnforceVersion(&s, /*enforcedVersion=*/0, /*flags=*/NULL), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  2.79k|    do {                        \
  |  |   46|  2.79k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 2.79k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  2.79k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 2.79k]
  |  |  ------------------
  ------------------
 3685|       |
 3686|  2.79k|    uint32_t entryCount;
 3687|  2.79k|    AVIF_CHECKERR(avifROStreamReadU32(&s, &entryCount), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) entry_count;
  ------------------
  |  |   45|  2.79k|    do {                        \
  |  |   46|  2.79k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 2.79k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  2.79k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 2.79k]
  |  |  ------------------
  ------------------
 3688|       |
 3689|  7.41k|    for (uint32_t i = 0; i < entryCount; ++i) {
  ------------------
  |  Branch (3689:26): [True: 4.63k, False: 2.78k]
  ------------------
 3690|  4.63k|        uint32_t sampleNumber = 0;
 3691|  4.63k|        AVIF_CHECKERR(avifROStreamReadU32(&s, &sampleNumber), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) sample_number;
  ------------------
  |  |   45|  4.63k|    do {                        \
  |  |   46|  4.63k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 14, False: 4.62k]
  |  |  ------------------
  |  |   47|     14|            avifBreakOnError(); \
  |  |   48|     14|            return ERR;         \
  |  |   49|     14|        }                       \
  |  |   50|  4.63k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 4.62k]
  |  |  ------------------
  ------------------
 3692|  4.62k|        avifSyncSample * syncSample = (avifSyncSample *)avifArrayPush(&sampleTable->syncSamples);
 3693|  4.62k|        AVIF_CHECKERR(syncSample != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  4.62k|    do {                        \
  |  |   46|  4.62k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 4.62k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  4.62k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 4.62k]
  |  |  ------------------
  ------------------
 3694|  4.62k|        syncSample->sampleNumber = sampleNumber;
 3695|  4.62k|    }
 3696|  2.78k|    return AVIF_RESULT_OK;
 3697|  2.79k|}
read.c:avifParseTimeToSampleBox:
 3700|  3.00k|{
 3701|  3.00k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[stts]");
  ------------------
  |  |  738|  3.00k|    avifROStream VARNAME;                               \
  |  |  739|  3.00k|    avifROData VARNAME##_roData;                        \
  |  |  740|  3.00k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  3.00k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  3.00k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3702|       |
 3703|  3.00k|    AVIF_CHECKERR(avifROStreamReadAndEnforceVersion(&s, /*enforcedVersion=*/0, /*flags=*/NULL), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  3.00k|    do {                        \
  |  |   46|  3.00k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 3.00k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  3.00k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.00k]
  |  |  ------------------
  ------------------
 3704|       |
 3705|  3.00k|    uint32_t entryCount;
 3706|  3.00k|    AVIF_CHECKERR(avifROStreamReadU32(&s, &entryCount), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) entry_count;
  ------------------
  |  |   45|  3.00k|    do {                        \
  |  |   46|  3.00k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 3.00k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  3.00k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.00k]
  |  |  ------------------
  ------------------
 3707|       |
 3708|  6.01k|    for (uint32_t i = 0; i < entryCount; ++i) {
  ------------------
  |  Branch (3708:26): [True: 3.02k, False: 2.98k]
  ------------------
 3709|  3.02k|        avifSampleTableTimeToSample * timeToSample = (avifSampleTableTimeToSample *)avifArrayPush(&sampleTable->timeToSamples);
 3710|  3.02k|        AVIF_CHECKERR(timeToSample != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  3.02k|    do {                        \
  |  |   46|  3.02k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 3.02k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  3.02k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.02k]
  |  |  ------------------
  ------------------
 3711|  3.02k|        AVIF_CHECKERR(avifROStreamReadU32(&s, &timeToSample->sampleCount), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) sample_count;
  ------------------
  |  |   45|  3.02k|    do {                        \
  |  |   46|  3.02k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 16, False: 3.01k]
  |  |  ------------------
  |  |   47|     16|            avifBreakOnError(); \
  |  |   48|     16|            return ERR;         \
  |  |   49|     16|        }                       \
  |  |   50|  3.02k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.01k]
  |  |  ------------------
  ------------------
 3712|  3.01k|        AVIF_CHECKERR(avifROStreamReadU32(&s, &timeToSample->sampleDelta), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) sample_delta;
  ------------------
  |  |   45|  3.01k|    do {                        \
  |  |   46|  3.01k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 3.01k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  3.01k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.01k]
  |  |  ------------------
  ------------------
 3713|  3.01k|    }
 3714|  2.98k|    return AVIF_RESULT_OK;
 3715|  3.00k|}
read.c:avifParseSampleDescriptionBox:
 3722|  3.09k|{
 3723|  3.09k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[stsd]");
  ------------------
  |  |  738|  3.09k|    avifROStream VARNAME;                               \
  |  |  739|  3.09k|    avifROData VARNAME##_roData;                        \
  |  |  740|  3.09k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  3.09k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  3.09k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3724|       |
 3725|  3.09k|    uint8_t version;
 3726|  3.09k|    AVIF_CHECKERR(avifROStreamReadVersionAndFlags(&s, &version, NULL), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  3.09k|    do {                        \
  |  |   46|  3.09k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 3.09k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  3.09k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.09k]
  |  |  ------------------
  ------------------
 3727|       |
 3728|       |    // Section 8.5.2.3 of ISO/IEC 14496-12:
 3729|       |    //   version is set to zero. A version number of 1 shall be treated as a version of 0.
 3730|  3.09k|    if (version != 0 && version != 1) {
  ------------------
  |  Branch (3730:9): [True: 2, False: 3.08k]
  |  Branch (3730:25): [True: 1, False: 1]
  ------------------
 3731|      1|        avifDiagnosticsPrintf(diag, "Box[stsd]: Expecting box version 0 or 1, got version %u", version);
 3732|      1|        return AVIF_RESULT_BMFF_PARSE_FAILED;
 3733|      1|    }
 3734|       |
 3735|  3.09k|    uint32_t entryCount;
 3736|  3.09k|    AVIF_CHECKERR(avifROStreamReadU32(&s, &entryCount), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(32) entry_count;
  ------------------
  |  |   45|  3.09k|    do {                        \
  |  |   46|  3.09k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 3.08k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  3.09k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.08k]
  |  |  ------------------
  ------------------
 3737|       |
 3738|  6.16k|    for (uint32_t i = 0; i < entryCount; ++i) {
  ------------------
  |  Branch (3738:26): [True: 3.10k, False: 3.06k]
  ------------------
 3739|  3.10k|        avifBoxHeader sampleEntryHeader;
 3740|  3.10k|        AVIF_CHECKERR(avifROStreamReadBoxHeader(&s, &sampleEntryHeader), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  3.10k|    do {                        \
  |  |   46|  3.10k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 17, False: 3.08k]
  |  |  ------------------
  |  |   47|     17|            avifBreakOnError(); \
  |  |   48|     17|            return ERR;         \
  |  |   49|     17|        }                       \
  |  |   50|  3.10k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.08k]
  |  |  ------------------
  ------------------
 3741|       |
 3742|  3.08k|        avifSampleDescription * description = (avifSampleDescription *)avifArrayPush(&sampleTable->sampleDescriptions);
 3743|  3.08k|        AVIF_CHECKERR(description != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  3.08k|    do {                        \
  |  |   46|  3.08k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 3.08k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  3.08k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.08k]
  |  |  ------------------
  ------------------
 3744|  3.08k|        if (!avifArrayCreate(&description->properties, sizeof(avifProperty), 16)) {
  ------------------
  |  Branch (3744:13): [True: 0, False: 3.08k]
  ------------------
 3745|      0|            avifArrayPop(&sampleTable->sampleDescriptions);
 3746|      0|            return AVIF_RESULT_OUT_OF_MEMORY;
 3747|      0|        }
 3748|  3.08k|        memcpy(description->format, sampleEntryHeader.type, sizeof(description->format));
 3749|  3.08k|        const size_t sampleEntryBytes = sampleEntryHeader.size;
 3750|  3.08k|        if (avifGetCodecType(description->format) != AVIF_CODEC_TYPE_UNKNOWN) {
  ------------------
  |  Branch (3750:13): [True: 2.89k, False: 192]
  ------------------
 3751|  2.89k|            if (sampleEntryBytes < VISUALSAMPLEENTRY_SIZE) {
  ------------------
  |  Branch (3751:17): [True: 1, False: 2.89k]
  ------------------
 3752|      1|                avifDiagnosticsPrintf(diag, "Not enough bytes to parse VisualSampleEntry");
 3753|      1|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 3754|      1|            }
 3755|  2.89k|            AVIF_CHECKRES(avifParseItemPropertyContainerBox(&description->properties,
  ------------------
  |  |   54|  2.89k|    do {                                  \
  |  |   55|  2.89k|        const avifResult result__ = (A);  \
  |  |   56|  2.89k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 8, False: 2.88k]
  |  |  ------------------
  |  |   57|      8|            avifBreakOnError();           \
  |  |   58|      8|            return result__;              \
  |  |   59|      8|        }                                 \
  |  |   60|  2.89k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 2.88k]
  |  |  ------------------
  ------------------
 3756|  2.89k|                                                            rawOffset + avifROStreamOffset(&s) + VISUALSAMPLEENTRY_SIZE,
 3757|  2.89k|                                                            avifROStreamCurrent(&s) + VISUALSAMPLEENTRY_SIZE,
 3758|  2.89k|                                                            sampleEntryBytes - VISUALSAMPLEENTRY_SIZE,
 3759|  2.89k|                                                            /*isTrack=*/AVIF_TRUE,
 3760|  2.89k|                                                            diag));
 3761|  2.89k|        }
 3762|       |
 3763|  3.07k|        AVIF_CHECKERR(avifROStreamSkip(&s, sampleEntryBytes), AVIF_RESULT_BMFF_PARSE_FAILED);
  ------------------
  |  |   45|  3.07k|    do {                        \
  |  |   46|  3.07k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 3.07k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  3.07k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.07k]
  |  |  ------------------
  ------------------
 3764|  3.07k|    }
 3765|  3.06k|    return AVIF_RESULT_OK;
 3766|  3.08k|}
read.c:avifGetCodecType:
   45|   269k|{
   46|   269k|    if (!memcmp(fourcc, "av01", 4)) {
  ------------------
  |  Branch (46:9): [True: 238k, False: 31.3k]
  ------------------
   47|   238k|        return AVIF_CODEC_TYPE_AV1;
   48|   238k|    }
   49|       |#if defined(AVIF_CODEC_AVM)
   50|       |    if (!memcmp(fourcc, "av02", 4)) {
   51|       |        return AVIF_CODEC_TYPE_AV2;
   52|       |    }
   53|       |#endif
   54|  31.3k|    return AVIF_CODEC_TYPE_UNKNOWN;
   55|   269k|}
read.c:avifTrackReferenceBox:
 3850|    767|{
 3851|    767|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[tref]");
  ------------------
  |  |  738|    767|    avifROStream VARNAME;                               \
  |  |  739|    767|    avifROData VARNAME##_roData;                        \
  |  |  740|    767|    VARNAME##_roData.data = PTR;                        \
  |  |  741|    767|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|    767|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3852|       |
 3853|  1.53k|    while (avifROStreamHasBytesLeft(&s, 1)) {
  ------------------
  |  Branch (3853:12): [True: 769, False: 764]
  ------------------
 3854|    769|        avifBoxHeader header;
 3855|    769|        AVIF_CHECK(avifROStreamReadBoxHeader(&s, &header));
  ------------------
  |  |   36|    769|    do {                        \
  |  |   37|    769|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 2, False: 767]
  |  |  ------------------
  |  |   38|      2|            avifBreakOnError(); \
  |  |   39|      2|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      2|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      2|        }                       \
  |  |   41|    769|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 767]
  |  |  ------------------
  ------------------
 3856|       |
 3857|    767|        if (!memcmp(header.type, "auxl", 4)) {
  ------------------
  |  Branch (3857:13): [True: 710, False: 57]
  ------------------
 3858|    710|            uint32_t toID;
 3859|    710|            AVIF_CHECK(avifROStreamReadU32(&s, &toID));                       // unsigned int(32) track_IDs[];
  ------------------
  |  |   36|    710|    do {                        \
  |  |   37|    710|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 710]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|    710|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 710]
  |  |  ------------------
  ------------------
 3860|    710|            AVIF_CHECK(avifROStreamSkip(&s, header.size - sizeof(uint32_t))); // just take the first one
  ------------------
  |  |   36|    710|    do {                        \
  |  |   37|    710|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 709]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    710|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 709]
  |  |  ------------------
  ------------------
 3861|    709|            track->auxForID = toID;
 3862|    709|        } else if (!memcmp(header.type, "prem", 4)) {
  ------------------
  |  Branch (3862:20): [True: 0, False: 57]
  ------------------
 3863|      0|            uint32_t byID;
 3864|      0|            AVIF_CHECK(avifROStreamReadU32(&s, &byID));                       // unsigned int(32) track_IDs[];
  ------------------
  |  |   36|      0|    do {                        \
  |  |   37|      0|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 3865|      0|            AVIF_CHECK(avifROStreamSkip(&s, header.size - sizeof(uint32_t))); // just take the first one
  ------------------
  |  |   36|      0|    do {                        \
  |  |   37|      0|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 3866|      0|            track->premByID = byID;
 3867|     57|        } else {
 3868|     57|            AVIF_CHECK(avifROStreamSkip(&s, header.size));
  ------------------
  |  |   36|     57|    do {                        \
  |  |   37|     57|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 57]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|     57|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 57]
  |  |  ------------------
  ------------------
 3869|     57|        }
 3870|    767|    }
 3871|    764|    return AVIF_TRUE;
  ------------------
  |  |   88|    764|#define AVIF_TRUE 1
  ------------------
 3872|    767|}
read.c:avifParseEditBox:
 3914|  3.29k|{
 3915|  3.29k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[edts]");
  ------------------
  |  |  738|  3.29k|    avifROStream VARNAME;                               \
  |  |  739|  3.29k|    avifROData VARNAME##_roData;                        \
  |  |  740|  3.29k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  3.29k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  3.29k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3916|       |
 3917|  3.29k|    avifBool elstBoxSeen = AVIF_FALSE;
  ------------------
  |  |   89|  3.29k|#define AVIF_FALSE 0
  ------------------
 3918|  6.58k|    while (avifROStreamHasBytesLeft(&s, 1)) {
  ------------------
  |  Branch (3918:12): [True: 3.30k, False: 3.27k]
  ------------------
 3919|  3.30k|        avifBoxHeader header;
 3920|  3.30k|        AVIF_CHECK(avifROStreamReadBoxHeader(&s, &header));
  ------------------
  |  |   36|  3.30k|    do {                        \
  |  |   37|  3.30k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 5, False: 3.30k]
  |  |  ------------------
  |  |   38|      5|            avifBreakOnError(); \
  |  |   39|      5|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      5|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      5|        }                       \
  |  |   41|  3.30k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 3.30k]
  |  |  ------------------
  ------------------
 3921|       |
 3922|  3.30k|        if (!memcmp(header.type, "elst", 4)) {
  ------------------
  |  Branch (3922:13): [True: 3.29k, False: 12]
  ------------------
 3923|  3.29k|            if (elstBoxSeen) {
  ------------------
  |  Branch (3923:17): [True: 0, False: 3.29k]
  ------------------
 3924|      0|                avifDiagnosticsPrintf(diag, "More than one [elst] Box was found.");
 3925|      0|                return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
 3926|      0|            }
 3927|  3.29k|            AVIF_CHECK(avifParseEditListBox(track, avifROStreamCurrent(&s), header.size, diag));
  ------------------
  |  |   36|  3.29k|    do {                        \
  |  |   37|  3.29k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 17, False: 3.27k]
  |  |  ------------------
  |  |   38|     17|            avifBreakOnError(); \
  |  |   39|     17|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|     17|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|     17|        }                       \
  |  |   41|  3.29k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 3.27k]
  |  |  ------------------
  ------------------
 3928|  3.27k|            elstBoxSeen = AVIF_TRUE;
  ------------------
  |  |   88|  3.27k|#define AVIF_TRUE 1
  ------------------
 3929|  3.27k|        }
 3930|  3.28k|        AVIF_CHECK(avifROStreamSkip(&s, header.size));
  ------------------
  |  |   36|  3.28k|    do {                        \
  |  |   37|  3.28k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 3.28k]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|  3.28k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 3.28k]
  |  |  ------------------
  ------------------
 3931|  3.28k|    }
 3932|  3.27k|    if (!elstBoxSeen) {
  ------------------
  |  Branch (3932:9): [True: 2, False: 3.27k]
  ------------------
 3933|      2|        avifDiagnosticsPrintf(diag, "Box[edts] contains no [elst] Box.");
 3934|      2|        return AVIF_FALSE;
  ------------------
  |  |   89|      2|#define AVIF_FALSE 0
  ------------------
 3935|      2|    }
 3936|  3.27k|    return AVIF_TRUE;
  ------------------
  |  |   88|  3.27k|#define AVIF_TRUE 1
  ------------------
 3937|  3.27k|}
read.c:avifParseEditListBox:
 3875|  3.29k|{
 3876|  3.29k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[elst]");
  ------------------
  |  |  738|  3.29k|    avifROStream VARNAME;                               \
  |  |  739|  3.29k|    avifROData VARNAME##_roData;                        \
  |  |  740|  3.29k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  3.29k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  3.29k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 3877|       |
 3878|  3.29k|    uint8_t version;
 3879|  3.29k|    uint32_t flags;
 3880|  3.29k|    AVIF_CHECK(avifROStreamReadVersionAndFlags(&s, &version, &flags));
  ------------------
  |  |   36|  3.29k|    do {                        \
  |  |   37|  3.29k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 3.29k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  3.29k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 3.29k]
  |  |  ------------------
  ------------------
 3881|       |
 3882|  3.29k|    if ((flags & 1) == 0) {
  ------------------
  |  Branch (3882:9): [True: 1.62k, False: 1.66k]
  ------------------
 3883|  1.62k|        track->isRepeating = AVIF_FALSE;
  ------------------
  |  |   89|  1.62k|#define AVIF_FALSE 0
  ------------------
 3884|  1.62k|        return AVIF_TRUE;
  ------------------
  |  |   88|  1.62k|#define AVIF_TRUE 1
  ------------------
 3885|  1.62k|    }
 3886|       |
 3887|  1.66k|    track->isRepeating = AVIF_TRUE;
  ------------------
  |  |   88|  1.66k|#define AVIF_TRUE 1
  ------------------
 3888|  1.66k|    uint32_t entryCount;
 3889|  1.66k|    AVIF_CHECK(avifROStreamReadU32(&s, &entryCount)); // unsigned int(32) entry_count;
  ------------------
  |  |   36|  1.66k|    do {                        \
  |  |   37|  1.66k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 1.66k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  1.66k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 1.66k]
  |  |  ------------------
  ------------------
 3890|  1.66k|    if (entryCount != 1) {
  ------------------
  |  Branch (3890:9): [True: 12, False: 1.65k]
  ------------------
 3891|     12|        avifDiagnosticsPrintf(diag, "Box[elst] contains an entry_count != 1 [%u]", entryCount);
 3892|     12|        return AVIF_FALSE;
  ------------------
  |  |   89|     12|#define AVIF_FALSE 0
  ------------------
 3893|     12|    }
 3894|       |
 3895|  1.65k|    if (version == 1) {
  ------------------
  |  Branch (3895:9): [True: 1.65k, False: 2]
  ------------------
 3896|  1.65k|        AVIF_CHECK(avifROStreamReadU64(&s, &track->segmentDuration)); // unsigned int(64) segment_duration;
  ------------------
  |  |   36|  1.65k|    do {                        \
  |  |   37|  1.65k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 1.65k]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|  1.65k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 1.65k]
  |  |  ------------------
  ------------------
 3897|  1.65k|    } else if (version == 0) {
  ------------------
  |  Branch (3897:16): [True: 1, False: 1]
  ------------------
 3898|      1|        uint32_t segmentDuration;
 3899|      1|        AVIF_CHECK(avifROStreamReadU32(&s, &segmentDuration)); // unsigned int(32) segment_duration;
  ------------------
  |  |   36|      1|    do {                        \
  |  |   37|      1|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 1]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|      1|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 1]
  |  |  ------------------
  ------------------
 3900|      1|        track->segmentDuration = segmentDuration;
 3901|      1|    } else {
 3902|       |        // Unsupported version
 3903|      1|        avifDiagnosticsPrintf(diag, "Box[elst] has an unsupported version [%u]", version);
 3904|      1|        return AVIF_FALSE;
  ------------------
  |  |   89|      1|#define AVIF_FALSE 0
  ------------------
 3905|      1|    }
 3906|  1.65k|    if (track->segmentDuration == 0) {
  ------------------
  |  Branch (3906:9): [True: 1, False: 1.65k]
  ------------------
 3907|      1|        avifDiagnosticsPrintf(diag, "Box[elst] Invalid value for segment_duration (0).");
 3908|      1|        return AVIF_FALSE;
  ------------------
  |  |   89|      1|#define AVIF_FALSE 0
  ------------------
 3909|      1|    }
 3910|  1.65k|    return AVIF_TRUE;
  ------------------
  |  |   88|  1.65k|#define AVIF_TRUE 1
  ------------------
 3911|  1.65k|}
read.c:avifDecoderItemShouldBeSkipped:
 5283|   169k|{
 5284|   169k|    return !item->size || item->hasUnsupportedEssentialProperty ||
  ------------------
  |  Branch (5284:12): [True: 12.3k, False: 156k]
  |  Branch (5284:27): [True: 7.24k, False: 149k]
  ------------------
 5285|   149k|           (avifGetCodecType(item->type) == AVIF_CODEC_TYPE_UNKNOWN && memcmp(item->type, "grid", 4)) || item->thumbnailForID != 0;
  ------------------
  |  Branch (5285:13): [True: 25.5k, False: 123k]
  |  Branch (5285:72): [True: 15.5k, False: 10.0k]
  |  Branch (5285:106): [True: 0, False: 133k]
  ------------------
 5286|   169k|}
read.c:avifPropertyArrayFind:
  200|   510k|{
  201|  2.32M|    for (uint32_t propertyIndex = 0; propertyIndex < properties->count; ++propertyIndex) {
  ------------------
  |  Branch (201:38): [True: 1.99M, False: 334k]
  ------------------
  202|  1.99M|        const avifProperty * prop = &properties->prop[propertyIndex];
  203|  1.99M|        if (!memcmp(prop->type, type, 4)) {
  ------------------
  |  Branch (203:13): [True: 175k, False: 1.81M]
  ------------------
  204|   175k|            return prop;
  205|   175k|        }
  206|  1.99M|    }
  207|   334k|    return NULL;
  208|   510k|}
read.c:isAlphaURN:
 1948|  14.3k|{
 1949|  14.3k|    return !strcmp(urn, AVIF_URN_ALPHA0) || !strcmp(urn, AVIF_URN_ALPHA1);
  ------------------
  |  |   78|  14.3k|#define AVIF_URN_ALPHA0 "urn:mpeg:mpegB:cicp:systems:auxiliary:alpha"
  ------------------
                  return !strcmp(urn, AVIF_URN_ALPHA0) || !strcmp(urn, AVIF_URN_ALPHA1);
  ------------------
  |  |   79|  3.78k|#define AVIF_URN_ALPHA1 "urn:mpeg:hevc:2015:auxid:1"
  ------------------
  |  Branch (1949:12): [True: 10.5k, False: 3.78k]
  |  Branch (1949:45): [True: 0, False: 3.78k]
  ------------------
 1950|  14.3k|}
read.c:avifDecoderDataClearTiles:
 1085|  57.1k|{
 1086|   102k|    for (unsigned int i = 0; i < data->tiles.count; ++i) {
  ------------------
  |  Branch (1086:30): [True: 45.0k, False: 57.1k]
  ------------------
 1087|  45.0k|        avifTile * tile = &data->tiles.tile[i];
 1088|  45.0k|        if (tile->input) {
  ------------------
  |  Branch (1088:13): [True: 45.0k, False: 0]
  ------------------
 1089|  45.0k|            avifCodecDecodeInputDestroy(tile->input);
 1090|  45.0k|            tile->input = NULL;
 1091|  45.0k|        }
 1092|  45.0k|        if (tile->codec) {
  ------------------
  |  Branch (1092:13): [True: 44.3k, False: 683]
  ------------------
 1093|       |            // Check if tile->codec was created separately and destroy it in that case.
 1094|  44.3k|            if (tile->codec != data->codec && tile->codec != data->codecAlpha) {
  ------------------
  |  Branch (1094:17): [True: 10.5k, False: 33.7k]
  |  Branch (1094:47): [True: 10.0k, False: 466]
  ------------------
 1095|  10.0k|                avifCodecDestroy(tile->codec);
 1096|  10.0k|            }
 1097|  44.3k|            tile->codec = NULL;
 1098|  44.3k|        }
 1099|  45.0k|        if (tile->image) {
  ------------------
  |  Branch (1099:13): [True: 45.0k, False: 0]
  ------------------
 1100|  45.0k|            avifImageDestroy(tile->image);
 1101|  45.0k|            tile->image = NULL;
 1102|  45.0k|        }
 1103|  45.0k|    }
 1104|  57.1k|    data->tiles.count = 0;
 1105|   514k|    for (int c = 0; c < AVIF_ITEM_CATEGORY_COUNT; ++c) {
  ------------------
  |  Branch (1105:21): [True: 457k, False: 57.1k]
  ------------------
 1106|   457k|        data->tileInfos[c].tileCount = 0;
 1107|   457k|        data->tileInfos[c].decodedTileCount = 0;
 1108|   457k|    }
 1109|  57.1k|    if (data->codec) {
  ------------------
  |  Branch (1109:9): [True: 28.9k, False: 28.1k]
  ------------------
 1110|  28.9k|        avifCodecDestroy(data->codec);
 1111|  28.9k|        data->codec = NULL;
 1112|  28.9k|    }
 1113|  57.1k|    if (data->codecAlpha) {
  ------------------
  |  Branch (1113:9): [True: 466, False: 56.6k]
  ------------------
 1114|    466|        avifCodecDestroy(data->codecAlpha);
 1115|       |        data->codecAlpha = NULL;
 1116|    466|    }
 1117|  57.1k|}
read.c:avifSampleTableGetCodecType:
  372|  8.79k|{
  373|  9.06k|    for (uint32_t i = 0; i < sampleTable->sampleDescriptions.count; ++i) {
  ------------------
  |  Branch (373:26): [True: 8.73k, False: 339]
  ------------------
  374|  8.73k|        const avifCodecType codecType = avifGetCodecType(sampleTable->sampleDescriptions.description[i].format);
  375|  8.73k|        if (codecType != AVIF_CODEC_TYPE_UNKNOWN) {
  ------------------
  |  Branch (375:13): [True: 8.45k, False: 273]
  ------------------
  376|  8.45k|            return codecType;
  377|  8.45k|        }
  378|  8.73k|    }
  379|    339|    return AVIF_CODEC_TYPE_UNKNOWN;
  380|  8.79k|}
read.c:avifSampleTableGetProperties:
  448|  8.45k|{
  449|  8.45k|    for (uint32_t i = 0; i < sampleTable->sampleDescriptions.count; ++i) {
  ------------------
  |  Branch (449:26): [True: 8.45k, False: 0]
  ------------------
  450|  8.45k|        const avifSampleDescription * description = &sampleTable->sampleDescriptions.description[i];
  451|  8.45k|        if (avifGetCodecType(description->format) == codecType) {
  ------------------
  |  Branch (451:13): [True: 8.45k, False: 0]
  ------------------
  452|  8.45k|            return &description->properties;
  453|  8.45k|        }
  454|  8.45k|    }
  455|      0|    return NULL;
  456|  8.45k|}
read.c:avifDecoderFindMetadata:
 1884|  35.0k|{
 1885|  35.0k|    if (decoder->ignoreExif && decoder->ignoreXMP) {
  ------------------
  |  Branch (1885:9): [True: 16.6k, False: 18.3k]
  |  Branch (1885:32): [True: 10.1k, False: 6.50k]
  ------------------
 1886|       |        // Nothing to do!
 1887|  10.1k|        return AVIF_RESULT_OK;
 1888|  10.1k|    }
 1889|       |
 1890|  84.4k|    for (uint32_t itemIndex = 0; itemIndex < meta->items.count; ++itemIndex) {
  ------------------
  |  Branch (1890:34): [True: 59.6k, False: 24.8k]
  ------------------
 1891|  59.6k|        avifDecoderItem * item = meta->items.item[itemIndex];
 1892|  59.6k|        if (!item->size) {
  ------------------
  |  Branch (1892:13): [True: 6.62k, False: 53.0k]
  ------------------
 1893|  6.62k|            continue;
 1894|  6.62k|        }
 1895|  53.0k|        if (item->hasUnsupportedEssentialProperty) {
  ------------------
  |  Branch (1895:13): [True: 4.01k, False: 49.0k]
  ------------------
 1896|       |            // An essential property isn't supported by libavif; ignore the item.
 1897|  4.01k|            continue;
 1898|  4.01k|        }
 1899|       |
 1900|  49.0k|        if ((colorId > 0) && (item->descForID != colorId)) {
  ------------------
  |  Branch (1900:13): [True: 48.2k, False: 830]
  |  Branch (1900:30): [True: 47.5k, False: 653]
  ------------------
 1901|       |            // Not a content description (metadata) for the colorOBU, skip it
 1902|  47.5k|            continue;
 1903|  47.5k|        }
 1904|       |
 1905|  1.48k|        if (!decoder->ignoreExif && !memcmp(item->type, "Exif", 4)) {
  ------------------
  |  Branch (1905:13): [True: 1.22k, False: 261]
  |  Branch (1905:37): [True: 532, False: 690]
  ------------------
 1906|    532|            avifROData exifContents;
 1907|    532|            avifResult readResult = avifDecoderItemRead(item, decoder->io, &exifContents, 0, 0, &decoder->diag);
 1908|    532|            if (readResult != AVIF_RESULT_OK) {
  ------------------
  |  Branch (1908:17): [True: 1, False: 531]
  ------------------
 1909|      1|                return readResult;
 1910|      1|            }
 1911|       |
 1912|       |            // Advance past Annex A.2.1's header
 1913|    531|            BEGIN_STREAM(exifBoxStream, exifContents.data, exifContents.size, &decoder->diag, "Exif header");
  ------------------
  |  |  738|    531|    avifROStream VARNAME;                               \
  |  |  739|    531|    avifROData VARNAME##_roData;                        \
  |  |  740|    531|    VARNAME##_roData.data = PTR;                        \
  |  |  741|    531|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|    531|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 1914|       |#if defined(AVIF_ENABLE_EXPERIMENTAL_MINI)
 1915|       |            // The MinimizedImageBox does not signal the exifTiffHeaderOffset.
 1916|       |            if (!meta->fromMiniBox)
 1917|       |#endif
 1918|    531|            {
 1919|    531|                uint32_t exifTiffHeaderOffset;
 1920|    531|                AVIF_CHECKERR(avifROStreamReadU32(&exifBoxStream, &exifTiffHeaderOffset),
  ------------------
  |  |   45|    531|    do {                        \
  |  |   46|    531|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 531]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|    531|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 531]
  |  |  ------------------
  ------------------
 1921|    531|                              AVIF_RESULT_INVALID_EXIF_PAYLOAD); // unsigned int(32) exif_tiff_header_offset;
 1922|    531|                size_t expectedExifTiffHeaderOffset;
 1923|    531|                AVIF_CHECKRES(avifGetExifTiffHeaderOffset(avifROStreamCurrent(&exifBoxStream),
  ------------------
  |  |   54|    531|    do {                                  \
  |  |   55|    531|        const avifResult result__ = (A);  \
  |  |   56|    531|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 3, False: 528]
  |  |  ------------------
  |  |   57|      3|            avifBreakOnError();           \
  |  |   58|      3|            return result__;              \
  |  |   59|      3|        }                                 \
  |  |   60|    531|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 528]
  |  |  ------------------
  ------------------
 1924|    531|                                                          avifROStreamRemainingBytes(&exifBoxStream),
 1925|    531|                                                          &expectedExifTiffHeaderOffset));
 1926|    528|                AVIF_CHECKERR(exifTiffHeaderOffset == expectedExifTiffHeaderOffset, AVIF_RESULT_INVALID_EXIF_PAYLOAD);
  ------------------
  |  |   45|    528|    do {                        \
  |  |   46|    528|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 7, False: 521]
  |  |  ------------------
  |  |   47|      7|            avifBreakOnError(); \
  |  |   48|      7|            return ERR;         \
  |  |   49|      7|        }                       \
  |  |   50|    528|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 521]
  |  |  ------------------
  ------------------
 1927|    528|            }
 1928|       |
 1929|    521|            AVIF_CHECKRES(avifRWDataSet(&image->exif, avifROStreamCurrent(&exifBoxStream), avifROStreamRemainingBytes(&exifBoxStream)));
  ------------------
  |  |   54|    521|    do {                                  \
  |  |   55|    521|        const avifResult result__ = (A);  \
  |  |   56|    521|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 521]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|    521|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 521]
  |  |  ------------------
  ------------------
 1930|    951|        } else if (!decoder->ignoreXMP && !memcmp(item->type, "mime", 4) &&
  ------------------
  |  Branch (1930:20): [True: 590, False: 361]
  |  Branch (1930:43): [True: 429, False: 161]
  ------------------
 1931|    429|                   !strcmp(item->contentType.contentType, AVIF_CONTENT_TYPE_XMP)) {
  ------------------
  |  |   81|    429|#define AVIF_CONTENT_TYPE_XMP "application/rdf+xml"
  ------------------
  |  Branch (1931:20): [True: 224, False: 205]
  ------------------
 1932|    224|            avifROData xmpContents;
 1933|    224|            avifResult readResult = avifDecoderItemRead(item, decoder->io, &xmpContents, 0, 0, &decoder->diag);
 1934|    224|            if (readResult != AVIF_RESULT_OK) {
  ------------------
  |  Branch (1934:17): [True: 2, False: 222]
  ------------------
 1935|      2|                return readResult;
 1936|      2|            }
 1937|       |
 1938|    222|            AVIF_CHECKRES(avifImageSetMetadataXMP(image, xmpContents.data, xmpContents.size));
  ------------------
  |  |   54|    222|    do {                                  \
  |  |   55|    222|        const avifResult result__ = (A);  \
  |  |   56|    222|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 222]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|    222|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 222]
  |  |  ------------------
  ------------------
 1939|    222|        }
 1940|  1.48k|    }
 1941|  24.8k|    return AVIF_RESULT_OK;
 1942|  24.8k|}
read.c:avifDecoderDataCreateTile:
 1040|  45.0k|{
 1041|  45.0k|    avifTile * tile = (avifTile *)avifArrayPush(&data->tiles);
 1042|  45.0k|    if (tile == NULL) {
  ------------------
  |  Branch (1042:9): [True: 0, False: 45.0k]
  ------------------
 1043|      0|        return NULL;
 1044|      0|    }
 1045|  45.0k|    tile->codecType = codecType;
 1046|  45.0k|    tile->image = avifImageCreateEmpty();
 1047|  45.0k|    if (!tile->image) {
  ------------------
  |  Branch (1047:9): [True: 0, False: 45.0k]
  ------------------
 1048|      0|        goto error;
 1049|      0|    }
 1050|  45.0k|    tile->input = avifCodecDecodeInputCreate();
 1051|  45.0k|    if (!tile->input) {
  ------------------
  |  Branch (1051:9): [True: 0, False: 45.0k]
  ------------------
 1052|      0|        goto error;
 1053|      0|    }
 1054|  45.0k|    tile->width = width;
 1055|  45.0k|    tile->height = height;
 1056|  45.0k|    tile->operatingPoint = operatingPoint;
 1057|  45.0k|    return tile;
 1058|       |
 1059|      0|error:
 1060|      0|    if (tile->input) {
  ------------------
  |  Branch (1060:9): [True: 0, False: 0]
  ------------------
 1061|      0|        avifCodecDecodeInputDestroy(tile->input);
 1062|      0|    }
 1063|      0|    if (tile->image) {
  ------------------
  |  Branch (1063:9): [True: 0, False: 0]
  ------------------
 1064|      0|        avifImageDestroy(tile->image);
 1065|      0|    }
 1066|      0|    avifArrayPop(&data->tiles);
 1067|       |    return NULL;
 1068|  45.0k|}
read.c:avifCodecDecodeInputFillFromSampleTable:
  526|  4.23k|{
  527|  4.23k|    if (imageCountLimit) {
  ------------------
  |  Branch (527:9): [True: 4.23k, False: 0]
  ------------------
  528|       |        // Verify that the we're not about to exceed the frame count limit.
  529|       |
  530|  4.23k|        uint32_t imageCountLeft = imageCountLimit;
  531|  8.46k|        for (uint32_t chunkIndex = 0; chunkIndex < sampleTable->chunks.count; ++chunkIndex) {
  ------------------
  |  Branch (531:39): [True: 4.23k, False: 4.22k]
  ------------------
  532|       |            // First, figure out how many samples are in this chunk
  533|  4.23k|            uint32_t sampleCount = avifGetSampleCountOfChunk(&sampleTable->sampleToChunks, chunkIndex);
  534|  4.23k|            if (sampleCount == 0) {
  ------------------
  |  Branch (534:17): [True: 4, False: 4.23k]
  ------------------
  535|       |                // chunks with 0 samples are invalid
  536|      4|                avifDiagnosticsPrintf(diag, "Sample table contains a chunk with 0 samples");
  537|      4|                return AVIF_RESULT_BMFF_PARSE_FAILED;
  538|      4|            }
  539|       |
  540|  4.23k|            if (sampleCount > imageCountLeft) {
  ------------------
  |  Branch (540:17): [True: 12, False: 4.22k]
  ------------------
  541|       |                // This file exceeds the imageCountLimit, bail out
  542|     12|                avifDiagnosticsPrintf(diag, "Exceeded avifDecoder's imageCountLimit");
  543|     12|                return AVIF_RESULT_BMFF_PARSE_FAILED;
  544|     12|            }
  545|  4.22k|            imageCountLeft -= sampleCount;
  546|  4.22k|        }
  547|  4.23k|    }
  548|       |
  549|  4.22k|    uint32_t sampleSizeIndex = 0;
  550|  8.30k|    for (uint32_t chunkIndex = 0; chunkIndex < sampleTable->chunks.count; ++chunkIndex) {
  ------------------
  |  Branch (550:35): [True: 4.22k, False: 4.07k]
  ------------------
  551|  4.22k|        avifSampleTableChunk * chunk = &sampleTable->chunks.chunk[chunkIndex];
  552|       |
  553|       |        // First, figure out how many samples are in this chunk
  554|  4.22k|        uint32_t sampleCount = avifGetSampleCountOfChunk(&sampleTable->sampleToChunks, chunkIndex);
  555|  4.22k|        if (sampleCount == 0) {
  ------------------
  |  Branch (555:13): [True: 0, False: 4.22k]
  ------------------
  556|       |            // chunks with 0 samples are invalid
  557|      0|            avifDiagnosticsPrintf(diag, "Sample table contains a chunk with 0 samples");
  558|      0|            return AVIF_RESULT_BMFF_PARSE_FAILED;
  559|      0|        }
  560|       |
  561|  4.22k|        uint64_t sampleOffset = chunk->offset;
  562|  24.8k|        for (uint32_t sampleIndex = 0; sampleIndex < sampleCount; ++sampleIndex) {
  ------------------
  |  Branch (562:40): [True: 20.7k, False: 4.07k]
  ------------------
  563|  20.7k|            uint32_t sampleSize = sampleTable->allSamplesSize;
  564|  20.7k|            if (sampleSize == 0) {
  ------------------
  |  Branch (564:17): [True: 20.3k, False: 422]
  ------------------
  565|  20.3k|                if (sampleSizeIndex >= sampleTable->sampleSizes.count) {
  ------------------
  |  Branch (565:21): [True: 5, False: 20.2k]
  ------------------
  566|       |                    // We've run out of samples to sum
  567|      5|                    avifDiagnosticsPrintf(diag, "Truncated sample table");
  568|      5|                    return AVIF_RESULT_BMFF_PARSE_FAILED;
  569|      5|                }
  570|  20.2k|                avifSampleTableSampleSize * sampleSizePtr = &sampleTable->sampleSizes.sampleSize[sampleSizeIndex];
  571|  20.2k|                sampleSize = sampleSizePtr->size;
  572|  20.2k|            }
  573|       |
  574|  20.7k|            avifDecodeSample * sample = (avifDecodeSample *)avifArrayPush(&decodeInput->samples);
  575|  20.7k|            AVIF_CHECKERR(sample != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  20.7k|    do {                        \
  |  |   46|  20.7k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 20.7k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  20.7k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 20.7k]
  |  |  ------------------
  ------------------
  576|  20.7k|            sample->offset = sampleOffset;
  577|  20.7k|            sample->size = sampleSize;
  578|  20.7k|            sample->spatialID = AVIF_SPATIAL_ID_UNSET; // Not filtering by spatial_id
  ------------------
  |  |  461|  20.7k|#define AVIF_SPATIAL_ID_UNSET 0xff
  ------------------
  579|  20.7k|            sample->sync = AVIF_FALSE;                 // to potentially be set to true following the outer loop
  ------------------
  |  |   89|  20.7k|#define AVIF_FALSE 0
  ------------------
  580|       |
  581|  20.7k|            if (sampleSize > UINT64_MAX - sampleOffset) {
  ------------------
  |  Branch (581:17): [True: 0, False: 20.7k]
  ------------------
  582|      0|                avifDiagnosticsPrintf(diag,
  583|      0|                                      "Sample table contains an offset/size pair which overflows: [%" PRIu64 " / %u]",
  584|      0|                                      sampleOffset,
  585|      0|                                      sampleSize);
  586|      0|                return AVIF_RESULT_BMFF_PARSE_FAILED;
  587|      0|            }
  588|  20.7k|            if (sizeHint && ((sampleOffset + sampleSize) > sizeHint)) {
  ------------------
  |  Branch (588:17): [True: 20.7k, False: 0]
  |  Branch (588:29): [True: 139, False: 20.5k]
  ------------------
  589|    139|                avifDiagnosticsPrintf(diag, "Exceeded avifIO's sizeHint, possibly truncated data");
  590|    139|                return AVIF_RESULT_BMFF_PARSE_FAILED;
  591|    139|            }
  592|       |
  593|  20.5k|            sampleOffset += sampleSize;
  594|  20.5k|            ++sampleSizeIndex;
  595|  20.5k|        }
  596|  4.22k|    }
  597|       |
  598|       |    // Mark appropriate samples as sync
  599|  9.85k|    for (uint32_t syncSampleIndex = 0; syncSampleIndex < sampleTable->syncSamples.count; ++syncSampleIndex) {
  ------------------
  |  Branch (599:40): [True: 5.77k, False: 4.07k]
  ------------------
  600|  5.77k|        uint32_t frameIndex = sampleTable->syncSamples.syncSample[syncSampleIndex].sampleNumber - 1; // sampleNumber is 1-based
  601|  5.77k|        if (frameIndex < decodeInput->samples.count) {
  ------------------
  |  Branch (601:13): [True: 5.40k, False: 365]
  ------------------
  602|  5.40k|            decodeInput->samples.sample[frameIndex].sync = AVIF_TRUE;
  ------------------
  |  |   88|  5.40k|#define AVIF_TRUE 1
  ------------------
  603|  5.40k|        }
  604|  5.77k|    }
  605|       |
  606|       |    // Assume frame 0 is sync, just in case the stss box is absent in the BMFF. (Unnecessary?)
  607|  4.07k|    if (decodeInput->samples.count > 0) {
  ------------------
  |  Branch (607:9): [True: 4.07k, False: 0]
  ------------------
  608|  4.07k|        decodeInput->samples.sample[0].sync = AVIF_TRUE;
  ------------------
  |  |   88|  4.07k|#define AVIF_TRUE 1
  ------------------
  609|  4.07k|    }
  610|  4.07k|    return AVIF_RESULT_OK;
  611|  4.22k|}
read.c:avifGetSampleCountOfChunk:
  509|  8.46k|{
  510|  8.46k|    uint32_t sampleCount = 0;
  511|  8.46k|    for (int sampleToChunkIndex = sampleToChunks->count - 1; sampleToChunkIndex >= 0; --sampleToChunkIndex) {
  ------------------
  |  Branch (511:62): [True: 8.45k, False: 3]
  ------------------
  512|  8.45k|        const avifSampleTableSampleToChunk * sampleToChunk = &sampleToChunks->sampleToChunk[sampleToChunkIndex];
  513|  8.45k|        if (sampleToChunk->firstChunk <= (chunkIndex + 1)) {
  ------------------
  |  Branch (513:13): [True: 8.45k, False: 0]
  ------------------
  514|  8.45k|            sampleCount = sampleToChunk->samplesPerChunk;
  515|  8.45k|            break;
  516|  8.45k|        }
  517|  8.45k|    }
  518|  8.46k|    return sampleCount;
  519|  8.46k|}
read.c:avifMetaFindColorItem:
 5472|  32.8k|{
 5473|  36.8k|    for (uint32_t itemIndex = 0; itemIndex < meta->items.count; ++itemIndex) {
  ------------------
  |  Branch (5473:34): [True: 36.7k, False: 93]
  ------------------
 5474|  36.7k|        avifDecoderItem * item = meta->items.item[itemIndex];
 5475|  36.7k|        if (avifDecoderItemShouldBeSkipped(item)) {
  ------------------
  |  Branch (5475:13): [True: 793, False: 35.9k]
  ------------------
 5476|    793|            continue;
 5477|    793|        }
 5478|  35.9k|        if (item->id == meta->primaryItemID) {
  ------------------
  |  Branch (5478:13): [True: 32.7k, False: 3.20k]
  ------------------
 5479|  32.7k|            return item;
 5480|  32.7k|        }
 5481|  35.9k|    }
 5482|     93|    return NULL;
 5483|  32.8k|}
read.c:avifDecoderItemReadAndParse:
 2441|  39.5k|{
 2442|  39.5k|    if (!memcmp(item->type, "grid", 4)) {
  ------------------
  |  Branch (2442:9): [True: 4.89k, False: 34.6k]
  ------------------
 2443|  4.89k|        if (isItemInInput) {
  ------------------
  |  Branch (2443:13): [True: 3.50k, False: 1.39k]
  ------------------
 2444|  3.50k|            avifROData readData;
 2445|  3.50k|            AVIF_CHECKRES(avifDecoderItemRead(item, decoder->io, &readData, 0, 0, decoder->data->diag));
  ------------------
  |  |   54|  3.50k|    do {                                  \
  |  |   55|  3.50k|        const avifResult result__ = (A);  \
  |  |   56|  3.50k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 5, False: 3.49k]
  |  |  ------------------
  |  |   57|      5|            avifBreakOnError();           \
  |  |   58|      5|            return result__;              \
  |  |   59|      5|        }                                 \
  |  |   60|  3.50k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 3.49k]
  |  |  ------------------
  ------------------
 2446|  3.49k|            AVIF_CHECKRES(avifParseImageGridBox(grid,
  ------------------
  |  |   54|  3.49k|    do {                                  \
  |  |   55|  3.49k|        const avifResult result__ = (A);  \
  |  |   56|  3.49k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 35, False: 3.46k]
  |  |  ------------------
  |  |   57|     35|            avifBreakOnError();           \
  |  |   58|     35|            return result__;              \
  |  |   59|     35|        }                                 \
  |  |   60|  3.49k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 3.46k]
  |  |  ------------------
  ------------------
 2447|  3.49k|                                                readData.data,
 2448|  3.49k|                                                readData.size,
 2449|  3.49k|                                                decoder->imageSizeLimit,
 2450|  3.49k|                                                decoder->imageDimensionLimit,
 2451|  3.49k|                                                decoder->data->diag));
 2452|       |            // Validate that there are exactly the same number of dimg items to form the grid.
 2453|  3.46k|            uint32_t dimgItemCount = 0;
 2454|  23.2k|            for (uint32_t i = 0; i < item->meta->items.count; ++i) {
  ------------------
  |  Branch (2454:34): [True: 19.8k, False: 3.46k]
  ------------------
 2455|  19.8k|                if (item->meta->items.item[i]->dimgForID == item->id) {
  ------------------
  |  Branch (2455:21): [True: 7.44k, False: 12.3k]
  ------------------
 2456|  7.44k|                    ++dimgItemCount;
 2457|  7.44k|                }
 2458|  19.8k|            }
 2459|  3.46k|            AVIF_CHECKERR(dimgItemCount == grid->rows * grid->columns, AVIF_RESULT_INVALID_IMAGE_GRID);
  ------------------
  |  |   45|  3.46k|    do {                        \
  |  |   46|  3.46k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 9, False: 3.45k]
  |  |  ------------------
  |  |   47|      9|            avifBreakOnError(); \
  |  |   48|      9|            return ERR;         \
  |  |   49|      9|        }                       \
  |  |   50|  3.46k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.45k]
  |  |  ------------------
  ------------------
 2460|  3.46k|        } else {
 2461|       |            // item was generated for convenience and is not part of the bitstream.
 2462|       |            // grid information should already be set.
 2463|  1.39k|            AVIF_ASSERT_OR_RETURN(grid->rows > 0 && grid->columns > 0);
  ------------------
  |  |   64|  1.39k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  1.39k|    do {                        \
  |  |  |  |   46|  2.78k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:15): [True: 1.39k, False: 0]
  |  |  |  |  |  Branch (46:15): [True: 1.39k, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  1.39k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 1.39k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 2464|  1.39k|        }
 2465|  4.84k|        *codecType = avifDecoderItemGetGridCodecType(item);
 2466|  4.84k|        AVIF_CHECKERR(*codecType != AVIF_CODEC_TYPE_UNKNOWN, AVIF_RESULT_INVALID_IMAGE_GRID);
  ------------------
  |  |   45|  4.84k|    do {                        \
  |  |   46|  4.84k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 4, False: 4.84k]
  |  |  ------------------
  |  |   47|      4|            avifBreakOnError(); \
  |  |   48|      4|            return ERR;         \
  |  |   49|      4|        }                       \
  |  |   50|  4.84k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 4.84k]
  |  |  ------------------
  ------------------
 2467|  34.6k|    } else {
 2468|  34.6k|        *codecType = avifGetCodecType(item->type);
 2469|  34.6k|        AVIF_ASSERT_OR_RETURN(*codecType != AVIF_CODEC_TYPE_UNKNOWN);
  ------------------
  |  |   64|  34.6k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  34.6k|    do {                        \
  |  |  |  |   46|  34.6k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 34.6k]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  34.6k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 34.6k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 2470|  34.6k|    }
 2471|  39.4k|    return AVIF_RESULT_OK;
 2472|  39.5k|}
read.c:avifParseImageGridBox:
 2116|  3.49k|{
 2117|  3.49k|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[grid]");
  ------------------
  |  |  738|  3.49k|    avifROStream VARNAME;                               \
  |  |  739|  3.49k|    avifROData VARNAME##_roData;                        \
  |  |  740|  3.49k|    VARNAME##_roData.data = PTR;                        \
  |  |  741|  3.49k|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|  3.49k|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 2118|       |
 2119|  3.49k|    uint8_t version, flags;
 2120|  3.49k|    AVIF_CHECKERR(avifROStreamRead(&s, &version, 1), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(8) version = 0;
  ------------------
  |  |   45|  3.49k|    do {                        \
  |  |   46|  3.49k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 3.49k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  3.49k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.49k]
  |  |  ------------------
  ------------------
 2121|  3.49k|    if (version != 0) {
  ------------------
  |  Branch (2121:9): [True: 16, False: 3.47k]
  ------------------
 2122|     16|        avifDiagnosticsPrintf(diag, "Box[grid] has unsupported version [%u]", version);
 2123|     16|        return AVIF_RESULT_NOT_IMPLEMENTED;
 2124|     16|    }
 2125|  3.47k|    uint8_t rowsMinusOne, columnsMinusOne;
 2126|  3.47k|    AVIF_CHECKERR(avifROStreamRead(&s, &flags, 1), AVIF_RESULT_BMFF_PARSE_FAILED);           // unsigned int(8) flags;
  ------------------
  |  |   45|  3.47k|    do {                        \
  |  |   46|  3.47k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 3.47k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  3.47k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.47k]
  |  |  ------------------
  ------------------
 2127|  3.47k|    AVIF_CHECKERR(avifROStreamRead(&s, &rowsMinusOne, 1), AVIF_RESULT_BMFF_PARSE_FAILED);    // unsigned int(8) rows_minus_one;
  ------------------
  |  |   45|  3.47k|    do {                        \
  |  |   46|  3.47k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 3.47k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  3.47k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.47k]
  |  |  ------------------
  ------------------
 2128|  3.47k|    AVIF_CHECKERR(avifROStreamRead(&s, &columnsMinusOne, 1), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(8) columns_minus_one;
  ------------------
  |  |   45|  3.47k|    do {                        \
  |  |   46|  3.47k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 3.47k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  3.47k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.47k]
  |  |  ------------------
  ------------------
 2129|  3.47k|    grid->rows = (uint32_t)rowsMinusOne + 1;
 2130|  3.47k|    grid->columns = (uint32_t)columnsMinusOne + 1;
 2131|       |
 2132|  3.47k|    uint32_t fieldLength = ((flags & 1) + 1) * 16;
 2133|  3.47k|    if (fieldLength == 16) {
  ------------------
  |  Branch (2133:9): [True: 3.46k, False: 8]
  ------------------
 2134|  3.46k|        uint16_t outputWidth16, outputHeight16;
 2135|  3.46k|        AVIF_CHECKERR(avifROStreamReadU16(&s, &outputWidth16), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(FieldLength) output_width;
  ------------------
  |  |   45|  3.46k|    do {                        \
  |  |   46|  3.46k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 3.46k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  3.46k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.46k]
  |  |  ------------------
  ------------------
 2136|  3.46k|        AVIF_CHECKERR(avifROStreamReadU16(&s, &outputHeight16), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(FieldLength) output_height;
  ------------------
  |  |   45|  3.46k|    do {                        \
  |  |   46|  3.46k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 3.46k]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|  3.46k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.46k]
  |  |  ------------------
  ------------------
 2137|  3.46k|        grid->outputWidth = outputWidth16;
 2138|  3.46k|        grid->outputHeight = outputHeight16;
 2139|  3.46k|    } else {
 2140|      8|        if (fieldLength != 32) {
  ------------------
  |  Branch (2140:13): [True: 0, False: 8]
  ------------------
 2141|       |            // This should be impossible
 2142|      0|            avifDiagnosticsPrintf(diag, "Grid box contains illegal field length: [%u]", fieldLength);
 2143|      0|            return AVIF_RESULT_INVALID_IMAGE_GRID;
 2144|      0|        }
 2145|      8|        AVIF_CHECKERR(avifROStreamReadU32(&s, &grid->outputWidth), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(FieldLength) output_width;
  ------------------
  |  |   45|      8|    do {                        \
  |  |   46|      8|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 7]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|      8|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 7]
  |  |  ------------------
  ------------------
 2146|      7|        AVIF_CHECKERR(avifROStreamReadU32(&s, &grid->outputHeight), AVIF_RESULT_BMFF_PARSE_FAILED); // unsigned int(FieldLength) output_height;
  ------------------
  |  |   45|      7|    do {                        \
  |  |   46|      7|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 2, False: 5]
  |  |  ------------------
  |  |   47|      2|            avifBreakOnError(); \
  |  |   48|      2|            return ERR;         \
  |  |   49|      2|        }                       \
  |  |   50|      7|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 5]
  |  |  ------------------
  ------------------
 2147|      7|    }
 2148|  3.47k|    if ((grid->outputWidth == 0) || (grid->outputHeight == 0)) {
  ------------------
  |  Branch (2148:9): [True: 1, False: 3.47k]
  |  Branch (2148:37): [True: 1, False: 3.46k]
  ------------------
 2149|      2|        avifDiagnosticsPrintf(diag, "Grid box contains illegal dimensions: [%u x %u]", grid->outputWidth, grid->outputHeight);
 2150|      2|        return AVIF_RESULT_INVALID_IMAGE_GRID;
 2151|      2|    }
 2152|  3.46k|    if (avifDimensionsTooLarge(grid->outputWidth, grid->outputHeight, imageSizeLimit, imageDimensionLimit)) {
  ------------------
  |  Branch (2152:9): [True: 8, False: 3.46k]
  ------------------
 2153|      8|        avifDiagnosticsPrintf(diag, "Grid box dimensions are too large: [%u x %u]", grid->outputWidth, grid->outputHeight);
 2154|      8|        return AVIF_RESULT_NOT_IMPLEMENTED;
 2155|      8|    }
 2156|  3.46k|    if (avifROStreamRemainingBytes(&s) != 0) {
  ------------------
  |  Branch (2156:9): [True: 1, False: 3.46k]
  ------------------
 2157|      1|        return AVIF_RESULT_BMFF_PARSE_FAILED;
 2158|      1|    }
 2159|  3.46k|    return AVIF_RESULT_OK;
 2160|  3.46k|}
read.c:avifDecoderItemGetGridCodecType:
 1567|  4.84k|{
 1568|  13.4k|    for (uint32_t i = 0; i < gridItem->meta->items.count; ++i) {
  ------------------
  |  Branch (1568:26): [True: 13.4k, False: 4]
  ------------------
 1569|  13.4k|        avifDecoderItem * item = gridItem->meta->items.item[i];
 1570|  13.4k|        const avifCodecType tileCodecType = avifGetCodecType(item->type);
 1571|  13.4k|        if ((item->dimgForID == gridItem->id) && (tileCodecType != AVIF_CODEC_TYPE_UNKNOWN)) {
  ------------------
  |  Branch (1571:13): [True: 4.87k, False: 8.57k]
  |  Branch (1571:50): [True: 4.84k, False: 31]
  ------------------
 1572|  4.84k|            return tileCodecType;
 1573|  4.84k|        }
 1574|  13.4k|    }
 1575|      4|    return AVIF_CODEC_TYPE_UNKNOWN;
 1576|  4.84k|}
read.c:avifMetaFindAlphaItem:
 5507|  32.7k|{
 5508|   111k|    for (uint32_t itemIndex = 0; itemIndex < meta->items.count; ++itemIndex) {
  ------------------
  |  Branch (5508:34): [True: 84.0k, False: 27.7k]
  ------------------
 5509|  84.0k|        avifDecoderItem * item = meta->items.item[itemIndex];
 5510|  84.0k|        if (avifDecoderItemShouldBeSkipped(item)) {
  ------------------
  |  Branch (5510:13): [True: 21.5k, False: 62.4k]
  ------------------
 5511|  21.5k|            continue;
 5512|  21.5k|        }
 5513|  62.4k|        if (avifDecoderItemIsAlphaAux(item, colorItem->id)) {
  ------------------
  |  Branch (5513:13): [True: 4.98k, False: 57.4k]
  ------------------
 5514|  4.98k|            *alphaItem = item;
 5515|  4.98k|            *isAlphaItemInInput = AVIF_TRUE;
  ------------------
  |  |   88|  4.98k|#define AVIF_TRUE 1
  ------------------
 5516|  4.98k|            return AVIF_RESULT_OK;
 5517|  4.98k|        }
 5518|  62.4k|    }
 5519|  27.7k|    if (memcmp(colorItem->type, "grid", 4)) {
  ------------------
  |  Branch (5519:9): [True: 24.4k, False: 3.30k]
  ------------------
 5520|  24.4k|        *alphaItem = NULL;
 5521|  24.4k|        *isAlphaItemInInput = AVIF_FALSE;
  ------------------
  |  |   89|  24.4k|#define AVIF_FALSE 0
  ------------------
 5522|  24.4k|        return AVIF_RESULT_OK;
 5523|  24.4k|    }
 5524|       |    // If color item is a grid, check if there is an alpha channel which is represented as an auxl item to each color tile item.
 5525|  3.30k|    const uint32_t tileCount = colorInfo->grid.rows * colorInfo->grid.columns;
 5526|  3.30k|    if (tileCount == 0) {
  ------------------
  |  Branch (5526:9): [True: 0, False: 3.30k]
  ------------------
 5527|      0|        *alphaItem = NULL;
 5528|      0|        *isAlphaItemInInput = AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
 5529|      0|        return AVIF_RESULT_OK;
 5530|      0|    }
 5531|       |    // Keep the same 'dimg' order as it defines where each tile is located in the reconstructed image.
 5532|  3.30k|    uint32_t * dimgIdxToAlphaItemIdx = (uint32_t *)avifAlloc(tileCount * sizeof(uint32_t));
 5533|  3.30k|    AVIF_CHECKERR(dimgIdxToAlphaItemIdx != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  3.30k|    do {                        \
  |  |   46|  3.30k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 3.30k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  3.30k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.30k]
  |  |  ------------------
  ------------------
 5534|  3.30k|    const uint32_t itemIndexNotSet = UINT32_MAX;
 5535|  10.1k|    for (uint32_t dimgIdx = 0; dimgIdx < tileCount; ++dimgIdx) {
  ------------------
  |  Branch (5535:32): [True: 6.85k, False: 3.30k]
  ------------------
 5536|  6.85k|        dimgIdxToAlphaItemIdx[dimgIdx] = itemIndexNotSet;
 5537|  6.85k|    }
 5538|  3.30k|    uint32_t alphaItemCount = 0;
 5539|  12.7k|    for (uint32_t i = 0; i < meta->items.count; ++i) {
  ------------------
  |  Branch (5539:26): [True: 11.3k, False: 1.39k]
  ------------------
 5540|  11.3k|        const avifDecoderItem * const item = meta->items.item[i];
 5541|  11.3k|        if (item->dimgForID == colorItem->id) {
  ------------------
  |  Branch (5541:13): [True: 4.73k, False: 6.57k]
  ------------------
 5542|  4.73k|            avifBool seenAlphaForCurrentItem = AVIF_FALSE;
  ------------------
  |  |   89|  4.73k|#define AVIF_FALSE 0
  ------------------
 5543|  26.4k|            for (uint32_t j = 0; j < meta->items.count; ++j) {
  ------------------
  |  Branch (5543:34): [True: 23.0k, False: 3.36k]
  ------------------
 5544|  23.0k|                avifDecoderItem * auxlItem = meta->items.item[j];
 5545|  23.0k|                if (avifDecoderItemIsAlphaAux(auxlItem, item->id)) {
  ------------------
  |  Branch (5545:21): [True: 4.20k, False: 18.8k]
  ------------------
 5546|  4.20k|                    if (seenAlphaForCurrentItem || auxlItem->dimgForID != 0 || item->dimgIdx >= tileCount ||
  ------------------
  |  Branch (5546:25): [True: 1, False: 4.20k]
  |  Branch (5546:52): [True: 1.37k, False: 2.83k]
  |  Branch (5546:80): [True: 0, False: 2.83k]
  ------------------
 5547|  2.83k|                        dimgIdxToAlphaItemIdx[item->dimgIdx] != itemIndexNotSet) {
  ------------------
  |  Branch (5547:25): [True: 0, False: 2.83k]
  ------------------
 5548|       |                        // One of the following invalid cases:
 5549|       |                        // * Multiple items are claiming to be the alpha auxiliary of the current item.
 5550|       |                        // * Alpha auxiliary is dimg for another item.
 5551|       |                        // * There are too many items in the dimg array (also checked later in avifFillDimgIdxToItemIdxArray()).
 5552|       |                        // * There is a repetition in the dimg array (also checked later in avifFillDimgIdxToItemIdxArray()).
 5553|  1.37k|                        avifFree(dimgIdxToAlphaItemIdx);
 5554|  1.37k|                        return AVIF_RESULT_INVALID_IMAGE_GRID;
 5555|  1.37k|                    }
 5556|  2.83k|                    dimgIdxToAlphaItemIdx[item->dimgIdx] = j;
 5557|  2.83k|                    ++alphaItemCount;
 5558|  2.83k|                    seenAlphaForCurrentItem = AVIF_TRUE;
  ------------------
  |  |   88|  2.83k|#define AVIF_TRUE 1
  ------------------
 5559|  2.83k|                }
 5560|  23.0k|            }
 5561|  3.36k|            if (!seenAlphaForCurrentItem) {
  ------------------
  |  Branch (5561:17): [True: 537, False: 2.82k]
  ------------------
 5562|       |                // No alpha auxiliary item was found for the current item. Treat this as an image without alpha.
 5563|    537|                avifFree(dimgIdxToAlphaItemIdx);
 5564|    537|                *alphaItem = NULL;
 5565|    537|                *isAlphaItemInInput = AVIF_FALSE;
  ------------------
  |  |   89|    537|#define AVIF_FALSE 0
  ------------------
 5566|    537|                return AVIF_RESULT_OK;
 5567|    537|            }
 5568|  3.36k|        }
 5569|  11.3k|    }
 5570|  1.39k|    if (alphaItemCount != tileCount) {
  ------------------
  |  Branch (5570:9): [True: 0, False: 1.39k]
  ------------------
 5571|      0|        avifFree(dimgIdxToAlphaItemIdx);
 5572|      0|        return AVIF_RESULT_INVALID_IMAGE_GRID;
 5573|      0|    }
 5574|       |    // Find an unused ID.
 5575|  1.39k|    avifResult result;
 5576|  1.39k|    if (meta->items.count >= UINT32_MAX - 1) {
  ------------------
  |  Branch (5576:9): [True: 0, False: 1.39k]
  ------------------
 5577|       |        // In the improbable case where all IDs are used.
 5578|      0|        result = AVIF_RESULT_DECODE_ALPHA_FAILED;
 5579|  1.39k|    } else {
 5580|  1.39k|        uint32_t newItemID = 0;
 5581|  1.39k|        avifBool isUsed;
 5582|  5.58k|        do {
 5583|  5.58k|            ++newItemID;
 5584|  5.58k|            isUsed = AVIF_FALSE;
  ------------------
  |  |   89|  5.58k|#define AVIF_FALSE 0
  ------------------
 5585|  16.9k|            for (uint32_t i = 0; i < meta->items.count; ++i) {
  ------------------
  |  Branch (5585:34): [True: 15.5k, False: 1.39k]
  ------------------
 5586|  15.5k|                if (meta->items.item[i]->id == newItemID) {
  ------------------
  |  Branch (5586:21): [True: 4.19k, False: 11.3k]
  ------------------
 5587|  4.19k|                    isUsed = AVIF_TRUE;
  ------------------
  |  |   88|  4.19k|#define AVIF_TRUE 1
  ------------------
 5588|  4.19k|                    break;
 5589|  4.19k|                }
 5590|  15.5k|            }
 5591|  5.58k|        } while (isUsed && newItemID != 0);
  ------------------
  |  Branch (5591:18): [True: 4.19k, False: 1.39k]
  |  Branch (5591:28): [True: 4.19k, False: 0]
  ------------------
 5592|  1.39k|        result = avifMetaFindOrCreateItem(meta, newItemID, alphaItem); // Create new empty item.
 5593|  1.39k|    }
 5594|  1.39k|    if (result != AVIF_RESULT_OK) {
  ------------------
  |  Branch (5594:9): [True: 0, False: 1.39k]
  ------------------
 5595|      0|        avifFree(dimgIdxToAlphaItemIdx);
 5596|      0|        return result;
 5597|      0|    }
 5598|  1.39k|    memcpy((*alphaItem)->type, "grid", 4); // Make it a grid and register alpha items as its tiles.
 5599|  1.39k|    (*alphaItem)->width = colorItem->width;
 5600|  1.39k|    (*alphaItem)->height = colorItem->height;
 5601|  4.17k|    for (uint32_t dimgIdx = 0; dimgIdx < tileCount; ++dimgIdx) {
  ------------------
  |  Branch (5601:32): [True: 2.78k, False: 1.39k]
  ------------------
 5602|  2.78k|        if (dimgIdxToAlphaItemIdx[dimgIdx] >= meta->items.count) {
  ------------------
  |  Branch (5602:13): [True: 0, False: 2.78k]
  ------------------
 5603|      0|            avifFree(dimgIdxToAlphaItemIdx);
 5604|      0|            AVIF_ASSERT_NOT_REACHED_OR_RETURN;
  ------------------
  |  |   66|      0|    do {                                   \
  |  |   67|      0|        avifBreakOnError();                \
  |  |   68|      0|        return AVIF_RESULT_INTERNAL_ERROR; \
  |  |   69|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (69:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
 5605|      0|        }
 5606|  2.78k|        avifDecoderItem * alphaTileItem = meta->items.item[dimgIdxToAlphaItemIdx[dimgIdx]];
 5607|  2.78k|        alphaTileItem->dimgForID = (*alphaItem)->id;
 5608|  2.78k|        alphaTileItem->dimgIdx = dimgIdx;
 5609|  2.78k|    }
 5610|  1.39k|    avifFree(dimgIdxToAlphaItemIdx);
 5611|  1.39k|    *isAlphaItemInInput = AVIF_FALSE;
  ------------------
  |  |   89|  1.39k|#define AVIF_FALSE 0
  ------------------
 5612|  1.39k|    alphaInfo->grid = colorInfo->grid;
 5613|  1.39k|    return AVIF_RESULT_OK;
 5614|  1.39k|}
read.c:avifDecoderItemIsAlphaAux:
 5488|  85.5k|{
 5489|  85.5k|    if (item->auxForID != colorItemId)
  ------------------
  |  Branch (5489:9): [True: 72.5k, False: 12.9k]
  ------------------
 5490|  72.5k|        return AVIF_FALSE;
  ------------------
  |  |   89|  72.5k|#define AVIF_FALSE 0
  ------------------
 5491|  12.9k|    const avifProperty * auxCProp = avifPropertyArrayFind(&item->properties, "auxC");
 5492|  12.9k|    return auxCProp && isAlphaURN(auxCProp->u.auxC.auxType);
  ------------------
  |  Branch (5492:12): [True: 12.4k, False: 528]
  |  Branch (5492:24): [True: 9.18k, False: 3.27k]
  ------------------
 5493|  85.5k|}
read.c:avifBrandArrayHasBrand:
 5063|  31.3k|{
 5064|   155k|    for (uint32_t brandIndex = 0; brandIndex < brands->count; ++brandIndex) {
  ------------------
  |  Branch (5064:35): [True: 127k, False: 28.4k]
  ------------------
 5065|   127k|        if (!memcmp(brands->brand[brandIndex], brand, 4)) {
  ------------------
  |  Branch (5065:13): [True: 2.90k, False: 124k]
  ------------------
 5066|  2.90k|            return AVIF_TRUE;
  ------------------
  |  |   88|  2.90k|#define AVIF_TRUE 1
  ------------------
 5067|  2.90k|        }
 5068|   127k|    }
 5069|  28.4k|    return AVIF_FALSE;
  ------------------
  |  |   89|  28.4k|#define AVIF_FALSE 0
  ------------------
 5070|  31.3k|}
read.c:avifDecoderFindGainMapItem:
 5775|  2.90k|{
 5776|  2.90k|    *gainMapItem = NULL;
 5777|  2.90k|    *gainMapCodecType = AVIF_CODEC_TYPE_UNKNOWN;
 5778|       |
 5779|  2.90k|    avifDecoderData * data = decoder->data;
 5780|       |
 5781|       |    // Find tmap and gain map item ids.
 5782|  2.90k|    uint32_t gainMapItemID;
 5783|  2.90k|    avifDecoderItem * toneMappedImageItemTmp;
 5784|  2.90k|    AVIF_CHECKRES(avifDecoderDataFindToneMappedImageItem(data, colorItem, &toneMappedImageItemTmp, &gainMapItemID));
  ------------------
  |  |   54|  2.90k|    do {                                  \
  |  |   55|  2.90k|        const avifResult result__ = (A);  \
  |  |   56|  2.90k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 3, False: 2.90k]
  |  |  ------------------
  |  |   57|      3|            avifBreakOnError();           \
  |  |   58|      3|            return result__;              \
  |  |   59|      3|        }                                 \
  |  |   60|  2.90k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 2.90k]
  |  |  ------------------
  ------------------
 5785|  2.90k|    if (!toneMappedImageItemTmp || !gainMapItemID) {
  ------------------
  |  Branch (5785:9): [True: 983, False: 1.92k]
  |  Branch (5785:36): [True: 0, False: 1.92k]
  ------------------
 5786|    983|        return AVIF_RESULT_OK;
 5787|    983|    }
 5788|       |
 5789|  1.92k|    if (!avifIsPreferredAlternativeTo(data, toneMappedImageItemTmp->id, colorItem->id)) {
  ------------------
  |  Branch (5789:9): [True: 1.21k, False: 704]
  ------------------
 5790|  1.21k|        return AVIF_RESULT_OK;
 5791|  1.21k|    }
 5792|       |
 5793|       |    // Parse tmap item data (containing the gain map metadata).
 5794|    704|    avifROData tmapData;
 5795|    704|    AVIF_CHECKRES(avifDecoderItemRead(toneMappedImageItemTmp, decoder->io, &tmapData, 0, 0, data->diag));
  ------------------
  |  |   54|    704|    do {                                  \
  |  |   55|    704|        const avifResult result__ = (A);  \
  |  |   56|    704|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 9, False: 695]
  |  |  ------------------
  |  |   57|      9|            avifBreakOnError();           \
  |  |   58|      9|            return result__;              \
  |  |   59|      9|        }                                 \
  |  |   60|    704|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 695]
  |  |  ------------------
  ------------------
 5796|       |    // Allocate avifGainMap on the stack instead of using avifGainMapCreate() to simplify error handling.
 5797|    695|    avifGainMap gainMapTmp;
 5798|    695|    avifGainMapSetDefaults(&gainMapTmp);
 5799|    695|    avifResult result = avifParseToneMappedImageBox(&gainMapTmp, tmapData.data, tmapData.size, data->diag);
 5800|    695|    if (result == AVIF_RESULT_NOT_IMPLEMENTED) {
  ------------------
  |  Branch (5800:9): [True: 180, False: 515]
  ------------------
 5801|       |        // Unsupported gain map version. Simply ignore the gain map.
 5802|    180|        return AVIF_RESULT_OK;
 5803|    180|    }
 5804|    515|    AVIF_CHECKRES(result);
  ------------------
  |  |   54|    515|    do {                                  \
  |  |   55|    515|        const avifResult result__ = (A);  \
  |  |   56|    515|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 55, False: 460]
  |  |  ------------------
  |  |   57|     55|            avifBreakOnError();           \
  |  |   58|     55|            return result__;              \
  |  |   59|     55|        }                                 \
  |  |   60|    515|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 460]
  |  |  ------------------
  ------------------
 5805|       |
 5806|    460|    avifDecoderItem * gainMapItemTmp;
 5807|    460|    AVIF_CHECKRES(avifMetaFindOrCreateItem(data->meta, gainMapItemID, &gainMapItemTmp));
  ------------------
  |  |   54|    460|    do {                                  \
  |  |   55|    460|        const avifResult result__ = (A);  \
  |  |   56|    460|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 460]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|    460|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 460]
  |  |  ------------------
  ------------------
 5808|    460|    if (avifDecoderItemShouldBeSkipped(gainMapItemTmp)) {
  ------------------
  |  Branch (5808:9): [True: 23, False: 437]
  ------------------
 5809|     23|        return AVIF_RESULT_NOT_IMPLEMENTED;
 5810|     23|    }
 5811|       |
 5812|    437|    avifCodecType gainMapCodecTypeTmp;
 5813|    437|    result = avifDecoderItemReadAndParse(decoder,
 5814|    437|                                         gainMapItemTmp,
 5815|    437|                                         /*isItemInInput=*/AVIF_TRUE,
  ------------------
  |  |   88|    437|#define AVIF_TRUE 1
  ------------------
 5816|    437|                                         &data->tileInfos[AVIF_ITEM_GAIN_MAP].grid,
 5817|    437|                                         &gainMapCodecTypeTmp);
 5818|    437|    if (result == AVIF_RESULT_NOT_IMPLEMENTED) {
  ------------------
  |  Branch (5818:9): [True: 21, False: 416]
  ------------------
 5819|     21|        return AVIF_RESULT_OK;
 5820|     21|    }
 5821|    416|    AVIF_CHECKRES(result);
  ------------------
  |  |   54|    416|    do {                                  \
  |  |   55|    416|        const avifResult result__ = (A);  \
  |  |   56|    416|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 6, False: 410]
  |  |  ------------------
  |  |   57|      6|            avifBreakOnError();           \
  |  |   58|      6|            return result__;              \
  |  |   59|      6|        }                                 \
  |  |   60|    416|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 410]
  |  |  ------------------
  ------------------
 5822|       |
 5823|       |    // This may allocate gainMapTmp.altICC which must be freed in case of error.
 5824|    410|    result = avifReadColorProperties(decoder->io,
 5825|    410|                                     &toneMappedImageItemTmp->properties,
 5826|    410|                                     &gainMapTmp.altICC,
 5827|    410|                                     &gainMapTmp.altColorPrimaries,
 5828|    410|                                     &gainMapTmp.altTransferCharacteristics,
 5829|    410|                                     &gainMapTmp.altMatrixCoefficients,
 5830|    410|                                     &gainMapTmp.altYUVRange,
 5831|    410|                                     /*cicpSet=*/NULL);
 5832|    410|    if (result != AVIF_RESULT_OK) {
  ------------------
  |  Branch (5832:9): [True: 5, False: 405]
  ------------------
 5833|      5|        avifRWDataFree(&gainMapTmp.altICC);
 5834|      5|        return result;
 5835|      5|    }
 5836|       |
 5837|    405|    const avifProperty * clliProp = avifPropertyArrayFind(&toneMappedImageItemTmp->properties, "clli");
 5838|    405|    if (clliProp) {
  ------------------
  |  Branch (5838:9): [True: 0, False: 405]
  ------------------
 5839|      0|        gainMapTmp.altCLLI = clliProp->u.clli;
 5840|      0|    }
 5841|       |
 5842|    405|    const avifProperty * pixiProp = avifPropertyArrayFind(&toneMappedImageItemTmp->properties, "pixi");
 5843|    405|    if (pixiProp) {
  ------------------
  |  Branch (5843:9): [True: 369, False: 36]
  ------------------
 5844|    369|        gainMapTmp.altPlaneCount = pixiProp->u.pixi.planeCount;
 5845|    369|        gainMapTmp.altDepth = pixiProp->u.pixi.planeDepths[0];
 5846|    369|    }
 5847|       |
 5848|    405|    const avifProperty * ispeProp = avifPropertyArrayFind(&toneMappedImageItemTmp->properties, "ispe");
 5849|    405|    if (!ispeProp) {
  ------------------
  |  Branch (5849:9): [True: 3, False: 402]
  ------------------
 5850|       |        // HEIF (ISO/IEC 23008-12:2022), Section 6.5.3.1:
 5851|       |        // Every image item shall be associated with one property of this type, prior to the association
 5852|       |        // of all transformative properties.
 5853|      3|        avifDiagnosticsPrintf(data->diag, "Box[tmap] missing mandatory ispe property");
 5854|      3|        avifRWDataFree(&gainMapTmp.altICC);
 5855|      3|        return AVIF_RESULT_BMFF_PARSE_FAILED;
 5856|      3|    }
 5857|    402|    if (ispeProp->u.ispe.width != colorItem->width || ispeProp->u.ispe.height != colorItem->height) {
  ------------------
  |  Branch (5857:9): [True: 1, False: 401]
  |  Branch (5857:55): [True: 1, False: 400]
  ------------------
 5858|      2|        avifDiagnosticsPrintf(data->diag, "Box[tmap] ispe property width/height does not match base image");
 5859|      2|        avifRWDataFree(&gainMapTmp.altICC);
 5860|      2|        return AVIF_RESULT_BMFF_PARSE_FAILED;
 5861|      2|    }
 5862|       |
 5863|    400|    if (avifPropertyArrayFind(&toneMappedImageItemTmp->properties, "pasp") ||
  ------------------
  |  Branch (5863:9): [True: 0, False: 400]
  ------------------
 5864|    400|        avifPropertyArrayFind(&toneMappedImageItemTmp->properties, "clap") ||
  ------------------
  |  Branch (5864:9): [True: 0, False: 400]
  ------------------
 5865|    400|        avifPropertyArrayFind(&toneMappedImageItemTmp->properties, "irot") ||
  ------------------
  |  Branch (5865:9): [True: 0, False: 400]
  ------------------
 5866|    400|        avifPropertyArrayFind(&toneMappedImageItemTmp->properties, "imir")) {
  ------------------
  |  Branch (5866:9): [True: 0, False: 400]
  ------------------
 5867|       |        // libavif requires the bitstream contain the same pasp, clap, irot, imir
 5868|       |        // properties for both the base and gain map image items used as input to
 5869|       |        // the tone-mapped derived image item. libavif also requires the tone-mapped
 5870|       |        // derived image item itself not be associated with these properties. This is
 5871|       |        // enforced at encoding. Other patterns are rejected at decoding.
 5872|      0|        avifDiagnosticsPrintf(data->diag,
 5873|      0|                              "Box[tmap] 'pasp', 'clap', 'irot' and 'imir' properties must be associated with base and gain map items instead of 'tmap'");
 5874|      0|        avifRWDataFree(&gainMapTmp.altICC);
 5875|      0|        return AVIF_RESULT_INVALID_TONE_MAPPED_IMAGE;
 5876|      0|    }
 5877|       |
 5878|    400|    avifColorPrimaries colorPrimaries = AVIF_COLOR_PRIMARIES_UNSPECIFIED;
 5879|    400|    avifTransferCharacteristics transferCharacteristics = AVIF_TRANSFER_CHARACTERISTICS_UNSPECIFIED;
 5880|    400|    avifMatrixCoefficients matrixCoefficients = AVIF_MATRIX_COEFFICIENTS_UNSPECIFIED;
 5881|    400|    avifRange yuvRange = AVIF_RANGE_FULL;
 5882|    400|    avifBool cicpSet = AVIF_FALSE;
  ------------------
  |  |   89|    400|#define AVIF_FALSE 0
  ------------------
 5883|       |    // Look for a colr nclx box. Other colr box types (e.g. ICC) are not supported.
 5884|    400|    result =
 5885|    400|        avifReadColorNclxProperty(&gainMapItemTmp->properties, &colorPrimaries, &transferCharacteristics, &matrixCoefficients, &yuvRange, &cicpSet);
 5886|    400|    if (result != AVIF_RESULT_OK) {
  ------------------
  |  Branch (5886:9): [True: 2, False: 398]
  ------------------
 5887|      2|        avifRWDataFree(&gainMapTmp.altICC);
 5888|      2|        return result;
 5889|      2|    }
 5890|       |
 5891|       |    // -- Everything is valid, do memory allocations and fill in output data. --
 5892|       |
 5893|    398|    decoder->image->gainMap = avifGainMapCreate();
 5894|    398|    if (!decoder->image->gainMap) {
  ------------------
  |  Branch (5894:9): [True: 0, False: 398]
  ------------------
 5895|      0|        avifRWDataFree(&gainMapTmp.altICC);
 5896|      0|        return AVIF_RESULT_OUT_OF_MEMORY;
 5897|      0|    }
 5898|       |
 5899|    398|    if (decoder->imageContentToDecode & AVIF_IMAGE_CONTENT_GAIN_MAP) {
  ------------------
  |  Branch (5899:9): [True: 295, False: 103]
  ------------------
 5900|    295|        avifImage * image = avifImageCreateEmpty();
 5901|    295|        if (!image) {
  ------------------
  |  Branch (5901:13): [True: 0, False: 295]
  ------------------
 5902|      0|            avifRWDataFree(&gainMapTmp.altICC);
 5903|      0|            return AVIF_RESULT_OUT_OF_MEMORY;
 5904|      0|        }
 5905|    295|        if (cicpSet) {
  ------------------
  |  Branch (5905:13): [True: 270, False: 25]
  ------------------
 5906|    270|            image->colorPrimaries = colorPrimaries;
 5907|    270|            image->transferCharacteristics = transferCharacteristics;
 5908|    270|            image->matrixCoefficients = matrixCoefficients;
 5909|    270|            image->yuvRange = yuvRange;
 5910|    270|        }
 5911|    295|        gainMapTmp.image = image;
 5912|    295|    }
 5913|       |
 5914|       |    // Only set the output pointers after everything has been validated.
 5915|    398|    *decoder->image->gainMap = gainMapTmp;
 5916|    398|    *gainMapItem = gainMapItemTmp;
 5917|    398|    *gainMapCodecType = gainMapCodecTypeTmp;
 5918|    398|    return AVIF_RESULT_OK;
 5919|    398|}
read.c:avifDecoderDataFindToneMappedImageItem:
 5698|  2.90k|{
 5699|  13.3k|    for (uint32_t itemIndex = 0; itemIndex < data->meta->items.count; ++itemIndex) {
  ------------------
  |  Branch (5699:34): [True: 12.3k, False: 983]
  ------------------
 5700|  12.3k|        avifDecoderItem * item = data->meta->items.item[itemIndex];
 5701|  12.3k|        if (!item->size || item->hasUnsupportedEssentialProperty || item->thumbnailForID != 0) {
  ------------------
  |  Branch (5701:13): [True: 1.61k, False: 10.7k]
  |  Branch (5701:28): [True: 1.94k, False: 8.77k]
  |  Branch (5701:69): [True: 0, False: 8.77k]
  ------------------
 5702|  3.56k|            continue;
 5703|  3.56k|        }
 5704|  8.77k|        if (!memcmp(item->type, "tmap", 4)) {
  ------------------
  |  Branch (5704:13): [True: 2.34k, False: 6.43k]
  ------------------
 5705|       |            // The tmap box should be associated (via 'iref'->'dimg') to two items:
 5706|       |            // the first one is the base image, the second one is the gain map.
 5707|  2.34k|            uint32_t dimgItemIDs[2] = { 0, 0 };
 5708|  2.34k|            uint32_t numDimgItemIDs = 0;
 5709|  17.7k|            for (uint32_t otherItemIndex = 0; otherItemIndex < data->meta->items.count; ++otherItemIndex) {
  ------------------
  |  Branch (5709:47): [True: 15.4k, False: 2.34k]
  ------------------
 5710|  15.4k|                avifDecoderItem * otherItem = data->meta->items.item[otherItemIndex];
 5711|  15.4k|                if (otherItem->dimgForID != item->id) {
  ------------------
  |  Branch (5711:21): [True: 10.7k, False: 4.68k]
  ------------------
 5712|  10.7k|                    continue;
 5713|  10.7k|                }
 5714|  4.68k|                if (otherItem->dimgIdx < 2) {
  ------------------
  |  Branch (5714:21): [True: 4.68k, False: 0]
  ------------------
 5715|  4.68k|                    AVIF_ASSERT_OR_RETURN(dimgItemIDs[otherItem->dimgIdx] == 0);
  ------------------
  |  |   64|  4.68k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  4.68k|    do {                        \
  |  |  |  |   46|  4.68k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 4.68k]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  4.68k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 4.68k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 5716|  4.68k|                    dimgItemIDs[otherItem->dimgIdx] = otherItem->id;
 5717|  4.68k|                }
 5718|  4.68k|                numDimgItemIDs++;
 5719|  4.68k|            }
 5720|       |            // Even with numDimgItemIDs == 2, one of the ids could be 0 if there are duplicate entries in the 'dimg' box.
 5721|  2.34k|            if (numDimgItemIDs != 2 || dimgItemIDs[0] == 0 || dimgItemIDs[1] == 0) {
  ------------------
  |  Branch (5721:17): [True: 3, False: 2.34k]
  |  Branch (5721:40): [True: 0, False: 2.34k]
  |  Branch (5721:63): [True: 0, False: 2.34k]
  ------------------
 5722|      3|                avifDiagnosticsPrintf(data->diag, "box[dimg] for 'tmap' item %d must have exactly 2 entries with distinct ids", item->id);
 5723|      3|                return AVIF_RESULT_INVALID_TONE_MAPPED_IMAGE;
 5724|      3|            }
 5725|  2.34k|            if (dimgItemIDs[0] != colorItem->id) {
  ------------------
  |  Branch (5725:17): [True: 423, False: 1.92k]
  ------------------
 5726|    423|                continue;
 5727|    423|            }
 5728|       |
 5729|  1.92k|            *toneMappedImageItem = item;
 5730|  1.92k|            *gainMapItemID = dimgItemIDs[1];
 5731|  1.92k|            return AVIF_RESULT_OK;
 5732|  2.34k|        }
 5733|  8.77k|    }
 5734|    983|    *toneMappedImageItem = NULL;
 5735|    983|    *gainMapItemID = 0;
 5736|    983|    return AVIF_RESULT_OK;
 5737|  2.90k|}
read.c:avifIsPreferredAlternativeTo:
 5743|  1.92k|{
 5744|  2.59k|    for (uint32_t i = 0; i < data->meta->entityToGroups.count; ++i) {
  ------------------
  |  Branch (5744:26): [True: 1.45k, False: 1.13k]
  ------------------
 5745|  1.45k|        avifEntityToGroup * group = &data->meta->entityToGroups.groups[i];
 5746|  1.45k|        if (memcmp(group->groupingType, "altr", 4) != 0) {
  ------------------
  |  Branch (5746:13): [True: 63, False: 1.39k]
  ------------------
 5747|     63|            continue;
 5748|     63|        }
 5749|  1.39k|        avifBool id1Found = AVIF_FALSE;
  ------------------
  |  |   89|  1.39k|#define AVIF_FALSE 0
  ------------------
 5750|  3.39k|        for (uint32_t j = 0; j < group->entityIDs.count; ++j) {
  ------------------
  |  Branch (5750:30): [True: 2.77k, False: 613]
  ------------------
 5751|  2.77k|            if (group->entityIDs.ids[j] == id1) {
  ------------------
  |  Branch (5751:17): [True: 1.00k, False: 1.77k]
  ------------------
 5752|  1.00k|                id1Found = AVIF_TRUE;
  ------------------
  |  |   88|  1.00k|#define AVIF_TRUE 1
  ------------------
 5753|  1.77k|            } else if (group->entityIDs.ids[j] == id2) {
  ------------------
  |  Branch (5753:24): [True: 783, False: 992]
  ------------------
 5754|       |                // Assume id2 is only present in one altr group, as per ISO/IEC 14496-12:2022
 5755|       |                // Section 8.15.3.1:
 5756|       |                // Any entity_id value shall be mapped to only one grouping of type 'altr'.
 5757|    783|                return id1Found;
 5758|    783|            }
 5759|  2.77k|        }
 5760|  1.39k|    }
 5761|  1.13k|    return AVIF_FALSE;
  ------------------
  |  |   89|  1.13k|#define AVIF_FALSE 0
  ------------------
 5762|  1.92k|}
read.c:avifParseToneMappedImageBox:
 2206|    695|{
 2207|    695|    BEGIN_STREAM(s, raw, rawLen, diag, "Box[tmap]");
  ------------------
  |  |  738|    695|    avifROStream VARNAME;                               \
  |  |  739|    695|    avifROData VARNAME##_roData;                        \
  |  |  740|    695|    VARNAME##_roData.data = PTR;                        \
  |  |  741|    695|    VARNAME##_roData.size = SIZE;                       \
  |  |  742|    695|    avifROStreamStart(&VARNAME, &VARNAME##_roData, DIAG, CONTEXT)
  ------------------
 2208|       |
 2209|    695|    uint8_t version;
 2210|    695|    AVIF_CHECKERR(avifROStreamRead(&s, &version, 1), AVIF_RESULT_INVALID_TONE_MAPPED_IMAGE); // unsigned int(8) version = 0;
  ------------------
  |  |   45|    695|    do {                        \
  |  |   46|    695|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 695]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|    695|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 695]
  |  |  ------------------
  ------------------
 2211|    695|    if (version != 0) {
  ------------------
  |  Branch (2211:9): [True: 165, False: 530]
  ------------------
 2212|    165|        avifDiagnosticsPrintf(diag, "Box[tmap] has unsupported version [%u]", version);
 2213|    165|        return AVIF_RESULT_NOT_IMPLEMENTED;
 2214|    165|    }
 2215|       |
 2216|    530|    uint16_t minimumVersion;
 2217|    530|    AVIF_CHECKERR(avifROStreamReadU16(&s, &minimumVersion), AVIF_RESULT_INVALID_TONE_MAPPED_IMAGE); // unsigned int(16) minimum_version;
  ------------------
  |  |   45|    530|    do {                        \
  |  |   46|    530|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 529]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|    530|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 529]
  |  |  ------------------
  ------------------
 2218|    529|    const uint16_t supportedMetadataVersion = 0;
 2219|    529|    if (minimumVersion > supportedMetadataVersion) {
  ------------------
  |  Branch (2219:9): [True: 15, False: 514]
  ------------------
 2220|     15|        avifDiagnosticsPrintf(diag, "Box[tmap] has unsupported minimum version [%u]", minimumVersion);
 2221|     15|        return AVIF_RESULT_NOT_IMPLEMENTED;
 2222|     15|    }
 2223|    514|    uint16_t writerVersion;
 2224|    514|    AVIF_CHECKERR(avifROStreamReadU16(&s, &writerVersion), AVIF_RESULT_INVALID_TONE_MAPPED_IMAGE); // unsigned int(16) writer_version;
  ------------------
  |  |   45|    514|    do {                        \
  |  |   46|    514|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 513]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|    514|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 513]
  |  |  ------------------
  ------------------
 2225|    513|    AVIF_CHECKERR(writerVersion >= minimumVersion, AVIF_RESULT_INVALID_TONE_MAPPED_IMAGE);
  ------------------
  |  |   45|    513|    do {                        \
  |  |   46|    513|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 513]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|    513|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 513]
  |  |  ------------------
  ------------------
 2226|       |
 2227|    513|    AVIF_CHECKERR(avifParseGainMapMetadata(gainMap, &s), AVIF_RESULT_INVALID_TONE_MAPPED_IMAGE);
  ------------------
  |  |   45|    513|    do {                        \
  |  |   46|    513|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 15, False: 498]
  |  |  ------------------
  |  |   47|     15|            avifBreakOnError(); \
  |  |   48|     15|            return ERR;         \
  |  |   49|     15|        }                       \
  |  |   50|    513|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 498]
  |  |  ------------------
  ------------------
 2228|       |
 2229|    498|    if (writerVersion <= supportedMetadataVersion) {
  ------------------
  |  Branch (2229:9): [True: 161, False: 337]
  ------------------
 2230|    161|        AVIF_CHECKERR(avifROStreamRemainingBytes(&s) == 0, AVIF_RESULT_INVALID_TONE_MAPPED_IMAGE);
  ------------------
  |  |   45|    161|    do {                        \
  |  |   46|    161|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 1, False: 160]
  |  |  ------------------
  |  |   47|      1|            avifBreakOnError(); \
  |  |   48|      1|            return ERR;         \
  |  |   49|      1|        }                       \
  |  |   50|    161|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 160]
  |  |  ------------------
  ------------------
 2231|    161|    }
 2232|       |
 2233|    497|    if (avifGainMapValidateMetadata(gainMap, diag) != AVIF_RESULT_OK) {
  ------------------
  |  Branch (2233:9): [True: 37, False: 460]
  ------------------
 2234|     37|        return AVIF_RESULT_INVALID_TONE_MAPPED_IMAGE;
 2235|     37|    }
 2236|       |
 2237|    460|    return AVIF_RESULT_OK;
 2238|    497|}
read.c:avifParseGainMapMetadata:
 2163|    513|{
 2164|    513|    uint32_t isMultichannel;
 2165|    513|    AVIF_CHECK(avifROStreamReadBitsU32(s, &isMultichannel, 1)); // unsigned int(1) is_multichannel;
  ------------------
  |  |   36|    513|    do {                        \
  |  |   37|    513|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 512]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    513|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 512]
  |  |  ------------------
  ------------------
 2166|    512|    const uint8_t channelCount = isMultichannel ? 3 : 1;
  ------------------
  |  Branch (2166:34): [True: 210, False: 302]
  ------------------
 2167|       |
 2168|    512|    uint32_t useBaseColorSpace;
 2169|    512|    AVIF_CHECK(avifROStreamReadBitsU32(s, &useBaseColorSpace, 1)); // unsigned int(1) use_base_colour_space;
  ------------------
  |  |   36|    512|    do {                        \
  |  |   37|    512|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 512]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|    512|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 512]
  |  |  ------------------
  ------------------
 2170|    512|    gainMap->useBaseColorSpace = useBaseColorSpace ? AVIF_TRUE : AVIF_FALSE;
  ------------------
  |  |   88|    189|#define AVIF_TRUE 1
  ------------------
                  gainMap->useBaseColorSpace = useBaseColorSpace ? AVIF_TRUE : AVIF_FALSE;
  ------------------
  |  |   89|    835|#define AVIF_FALSE 0
  ------------------
  |  Branch (2170:34): [True: 189, False: 323]
  ------------------
 2171|       |
 2172|    512|    uint32_t reserved;
 2173|    512|    AVIF_CHECK(avifROStreamReadBitsU32(s, &reserved, 6)); // unsigned int(6) reserved;
  ------------------
  |  |   36|    512|    do {                        \
  |  |   37|    512|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 512]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|    512|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 512]
  |  |  ------------------
  ------------------
 2174|       |
 2175|    512|    AVIF_CHECK(avifROStreamReadU32(s, &gainMap->baseHdrHeadroom.n));      // unsigned int(32) base_hdr_headroom_numerator;
  ------------------
  |  |   36|    512|    do {                        \
  |  |   37|    512|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 511]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    512|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 511]
  |  |  ------------------
  ------------------
 2176|    511|    AVIF_CHECK(avifROStreamReadU32(s, &gainMap->baseHdrHeadroom.d));      // unsigned int(32) base_hdr_headroom_denominator;
  ------------------
  |  |   36|    511|    do {                        \
  |  |   37|    511|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 510]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    511|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 510]
  |  |  ------------------
  ------------------
 2177|    510|    AVIF_CHECK(avifROStreamReadU32(s, &gainMap->alternateHdrHeadroom.n)); // unsigned int(32) alternate_hdr_headroom_numerator;
  ------------------
  |  |   36|    510|    do {                        \
  |  |   37|    510|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 509]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    510|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 509]
  |  |  ------------------
  ------------------
 2178|    509|    AVIF_CHECK(avifROStreamReadU32(s, &gainMap->alternateHdrHeadroom.d)); // unsigned int(32) alternate_hdr_headroom_denominator;
  ------------------
  |  |   36|    509|    do {                        \
  |  |   37|    509|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 508]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    509|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 508]
  |  |  ------------------
  ------------------
 2179|       |
 2180|  1.41k|    for (int c = 0; c < channelCount; ++c) {
  ------------------
  |  Branch (2180:21): [True: 921, False: 498]
  ------------------
 2181|    921|        AVIF_CHECK(avifROStreamReadU32(s, (uint32_t *)&gainMap->gainMapMin[c].n)); // int(32) gain_map_min_numerator;
  ------------------
  |  |   36|    921|    do {                        \
  |  |   37|    921|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 920]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    921|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 920]
  |  |  ------------------
  ------------------
 2182|    920|        AVIF_CHECK(avifROStreamReadU32(s, &gainMap->gainMapMin[c].d));             // unsigned int(32) gain_map_min_denominator;
  ------------------
  |  |   36|    920|    do {                        \
  |  |   37|    920|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 919]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    920|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 919]
  |  |  ------------------
  ------------------
 2183|    919|        AVIF_CHECK(avifROStreamReadU32(s, (uint32_t *)&gainMap->gainMapMax[c].n)); // int(32) gain_map_max_numerator;
  ------------------
  |  |   36|    919|    do {                        \
  |  |   37|    919|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 918]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    919|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 918]
  |  |  ------------------
  ------------------
 2184|    918|        AVIF_CHECK(avifROStreamReadU32(s, &gainMap->gainMapMax[c].d));             // unsigned int(32) gain_map_max_denominator;
  ------------------
  |  |   36|    918|    do {                        \
  |  |   37|    918|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 917]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    918|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 917]
  |  |  ------------------
  ------------------
 2185|    917|        AVIF_CHECK(avifROStreamReadU32(s, &gainMap->gainMapGamma[c].n));           // unsigned int(32) gamma_numerator;
  ------------------
  |  |   36|    917|    do {                        \
  |  |   37|    917|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 916]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    917|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 916]
  |  |  ------------------
  ------------------
 2186|    916|        AVIF_CHECK(avifROStreamReadU32(s, &gainMap->gainMapGamma[c].d));           // unsigned int(32) gamma_denominator;
  ------------------
  |  |   36|    916|    do {                        \
  |  |   37|    916|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 915]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    916|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 915]
  |  |  ------------------
  ------------------
 2187|    915|        AVIF_CHECK(avifROStreamReadU32(s, (uint32_t *)&gainMap->baseOffset[c].n)); // int(32) base_offset_numerator;
  ------------------
  |  |   36|    915|    do {                        \
  |  |   37|    915|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 914]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    915|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 914]
  |  |  ------------------
  ------------------
 2188|    914|        AVIF_CHECK(avifROStreamReadU32(s, &gainMap->baseOffset[c].d));             // unsigned int(32) base_offset_denominator;
  ------------------
  |  |   36|    914|    do {                        \
  |  |   37|    914|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 913]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    914|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 913]
  |  |  ------------------
  ------------------
 2189|    913|        AVIF_CHECK(avifROStreamReadU32(s, (uint32_t *)&gainMap->alternateOffset[c].n)); // int(32) alternate_offset_numerator;
  ------------------
  |  |   36|    913|    do {                        \
  |  |   37|    913|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 912]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    913|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 912]
  |  |  ------------------
  ------------------
 2190|    912|        AVIF_CHECK(avifROStreamReadU32(s, &gainMap->alternateOffset[c].d)); // unsigned int(32) alternate_offset_denominator;
  ------------------
  |  |   36|    912|    do {                        \
  |  |   37|    912|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 911]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    912|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 911]
  |  |  ------------------
  ------------------
 2191|    912|    }
 2192|       |
 2193|       |    // Fill the remaining values by copying those from the first channel.
 2194|  1.09k|    for (int c = channelCount; c < 3; ++c) {
  ------------------
  |  Branch (2194:32): [True: 594, False: 498]
  ------------------
 2195|    594|        gainMap->gainMapMin[c] = gainMap->gainMapMin[0];
 2196|    594|        gainMap->gainMapMax[c] = gainMap->gainMapMax[0];
 2197|    594|        gainMap->gainMapGamma[c] = gainMap->gainMapGamma[0];
 2198|    594|        gainMap->baseOffset[c] = gainMap->baseOffset[0];
 2199|    594|        gainMap->alternateOffset[c] = gainMap->alternateOffset[0];
 2200|    594|    }
 2201|    498|    return AVIF_TRUE;
  ------------------
  |  |   88|    498|#define AVIF_TRUE 1
  ------------------
 2202|    508|}
read.c:avifReadColorNclxProperty:
 5632|  35.4k|{
 5633|  35.4k|    assert(cicpSet == NULL || *cicpSet == AVIF_FALSE);
 5634|  35.4k|    avifBool colrNCLXSeen = AVIF_FALSE;
  ------------------
  |  |   89|  35.4k|#define AVIF_FALSE 0
  ------------------
 5635|   192k|    for (uint32_t propertyIndex = 0; propertyIndex < properties->count; ++propertyIndex) {
  ------------------
  |  Branch (5635:38): [True: 157k, False: 35.4k]
  ------------------
 5636|   157k|        avifProperty * prop = &properties->prop[propertyIndex];
 5637|   157k|        if (!memcmp(prop->type, "colr", 4) && prop->u.colr.hasNCLX) {
  ------------------
  |  Branch (5637:13): [True: 23.7k, False: 133k]
  |  Branch (5637:47): [True: 20.5k, False: 3.23k]
  ------------------
 5638|  20.5k|            if (colrNCLXSeen) {
  ------------------
  |  Branch (5638:17): [True: 7, False: 20.5k]
  ------------------
 5639|      7|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 5640|      7|            }
 5641|  20.5k|            colrNCLXSeen = AVIF_TRUE;
  ------------------
  |  |   88|  20.5k|#define AVIF_TRUE 1
  ------------------
 5642|  20.5k|            if (cicpSet != NULL) {
  ------------------
  |  Branch (5642:17): [True: 20.1k, False: 368]
  ------------------
 5643|  20.1k|                *cicpSet = AVIF_TRUE;
  ------------------
  |  |   88|  20.1k|#define AVIF_TRUE 1
  ------------------
 5644|  20.1k|            }
 5645|  20.5k|            *colorPrimaries = prop->u.colr.colorPrimaries;
 5646|  20.5k|            *transferCharacteristics = prop->u.colr.transferCharacteristics;
 5647|  20.5k|            *matrixCoefficients = prop->u.colr.matrixCoefficients;
 5648|  20.5k|            *yuvRange = prop->u.colr.range;
 5649|  20.5k|        }
 5650|   157k|    }
 5651|  35.4k|    return AVIF_RESULT_OK;
 5652|  35.4k|}
read.c:avifDecoderDataFindSampleTransformImageItem:
 6006|  31.2k|{
 6007|   110k|    for (uint32_t itemIndex = 0; itemIndex < data->meta->items.count; ++itemIndex) {
  ------------------
  |  Branch (6007:34): [True: 79.0k, False: 31.2k]
  ------------------
 6008|  79.0k|        avifDecoderItem * item = data->meta->items.item[itemIndex];
 6009|  79.0k|        if (!memcmp(item->type, "sato", 4) && item->id != data->meta->primaryItemID && item->size != 0 &&
  ------------------
  |  Branch (6009:13): [True: 0, False: 79.0k]
  |  Branch (6009:47): [True: 0, False: 0]
  |  Branch (6009:88): [True: 0, False: 0]
  ------------------
 6010|      0|            !item->hasUnsupportedEssentialProperty && item->thumbnailForID == 0 &&
  ------------------
  |  Branch (6010:13): [True: 0, False: 0]
  |  Branch (6010:55): [True: 0, False: 0]
  ------------------
 6011|      0|            avifIsPreferredAlternativeTo(data, item->id, data->meta->primaryItemID)) {
  ------------------
  |  Branch (6011:13): [True: 0, False: 0]
  ------------------
 6012|      0|            return item;
 6013|      0|        }
 6014|  79.0k|    }
 6015|  31.2k|    return NULL;
 6016|  31.2k|}
read.c:avifDecoderItemRead:
 1424|  66.7k|{
 1425|  66.7k|    if (item->mergedExtents.data && !item->partialMergedExtents) {
  ------------------
  |  Branch (1425:9): [True: 41.0k, False: 25.7k]
  |  Branch (1425:37): [True: 19.7k, False: 21.3k]
  ------------------
 1426|       |        // Multiple extents have already been concatenated for this item, just return it
 1427|  19.7k|        if (offset >= item->mergedExtents.size) {
  ------------------
  |  Branch (1427:13): [True: 0, False: 19.7k]
  ------------------
 1428|      0|            avifDiagnosticsPrintf(diag, "Item ID %u read has overflowing offset", item->id);
 1429|      0|            return AVIF_RESULT_TRUNCATED_DATA;
 1430|      0|        }
 1431|  19.7k|        outData->data = item->mergedExtents.data + offset;
 1432|  19.7k|        outData->size = item->mergedExtents.size - offset;
 1433|  19.7k|        return AVIF_RESULT_OK;
 1434|  19.7k|    }
 1435|       |
 1436|  47.0k|    if (item->extents.count == 0) {
  ------------------
  |  Branch (1436:9): [True: 0, False: 47.0k]
  ------------------
 1437|      0|        avifDiagnosticsPrintf(diag, "Item ID %u has zero extents", item->id);
 1438|      0|        return AVIF_RESULT_TRUNCATED_DATA;
 1439|      0|    }
 1440|       |
 1441|       |    // Find this item's source of all extents' data, based on the construction method
 1442|  47.0k|    const avifRWData * idatBuffer = NULL;
 1443|  47.0k|    if (item->idatStored) {
  ------------------
  |  Branch (1443:9): [True: 7.87k, False: 39.2k]
  ------------------
 1444|       |        // construction_method: idat(1)
 1445|       |
 1446|  7.87k|        if (item->meta->idat.size > 0) {
  ------------------
  |  Branch (1446:13): [True: 7.87k, False: 6]
  ------------------
 1447|  7.87k|            idatBuffer = &item->meta->idat;
 1448|  7.87k|        } else {
 1449|       |            // no associated idat box was found in the meta box, bail out
 1450|      6|            avifDiagnosticsPrintf(diag, "Item ID %u is stored in an idat, but no associated idat box was found", item->id);
 1451|      6|            return AVIF_RESULT_NO_CONTENT;
 1452|      6|        }
 1453|  7.87k|    }
 1454|       |
 1455|       |    // Merge extents into a single contiguous buffer
 1456|  47.0k|    if ((io->sizeHint > 0) && (item->size > io->sizeHint)) {
  ------------------
  |  Branch (1456:9): [True: 47.0k, False: 0]
  |  Branch (1456:31): [True: 12, False: 47.0k]
  ------------------
 1457|       |        // Sanity check: somehow the sum of extents exceeds the entire file or idat size!
 1458|     12|        avifDiagnosticsPrintf(diag, "Item ID %u reported size failed size hint sanity check. Truncated data?", item->id);
 1459|     12|        return AVIF_RESULT_TRUNCATED_DATA;
 1460|     12|    }
 1461|       |
 1462|  47.0k|    if (offset >= item->size) {
  ------------------
  |  Branch (1462:9): [True: 0, False: 47.0k]
  ------------------
 1463|      0|        avifDiagnosticsPrintf(diag, "Item ID %u read has overflowing offset", item->id);
 1464|      0|        return AVIF_RESULT_TRUNCATED_DATA;
 1465|      0|    }
 1466|  47.0k|    const size_t maxOutputSize = item->size - offset;
 1467|  47.0k|    const size_t readOutputSize = (partialByteCount && (partialByteCount < maxOutputSize)) ? partialByteCount : maxOutputSize;
  ------------------
  |  Branch (1467:36): [True: 44.4k, False: 2.61k]
  |  Branch (1467:56): [True: 21.6k, False: 22.7k]
  ------------------
 1468|  47.0k|    const size_t totalBytesToRead = offset + readOutputSize;
 1469|       |
 1470|       |    // If there is a single extent for this item and the source of the read buffer is going to be
 1471|       |    // persistent for the lifetime of the avifDecoder (whether it comes from its own internal
 1472|       |    // idatBuffer or from a known-persistent IO), we can avoid buffer duplication and just use the
 1473|       |    // preexisting buffer.
 1474|  47.0k|    avifBool singlePersistentBuffer = ((item->extents.count == 1) && (idatBuffer || io->persistent));
  ------------------
  |  Branch (1474:40): [True: 34.8k, False: 12.2k]
  |  Branch (1474:71): [True: 252, False: 34.6k]
  |  Branch (1474:85): [True: 16.8k, False: 17.7k]
  ------------------
 1475|  47.0k|    if (!singlePersistentBuffer) {
  ------------------
  |  Branch (1475:9): [True: 29.9k, False: 17.1k]
  ------------------
 1476|       |        // Always allocate the item's full size here, as progressive image decodes will do partial
 1477|       |        // reads into this buffer and begin feeding the buffer to the underlying AV1 decoder, but
 1478|       |        // will then write more into this buffer without flushing the AV1 decoder (which is still
 1479|       |        // holding the address of the previous allocation of this buffer). This strategy avoids
 1480|       |        // use-after-free issues in the AV1 decoder and unnecessary reallocs as a typical
 1481|       |        // progressive decode use case will eventually decode the final layer anyway.
 1482|  29.9k|        AVIF_CHECKRES(avifRWDataRealloc(&item->mergedExtents, item->size));
  ------------------
  |  |   54|  29.9k|    do {                                  \
  |  |   55|  29.9k|        const avifResult result__ = (A);  \
  |  |   56|  29.9k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 29.9k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  29.9k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 29.9k]
  |  |  ------------------
  ------------------
 1483|  29.9k|        item->ownsMergedExtents = AVIF_TRUE;
  ------------------
  |  |   88|  29.9k|#define AVIF_TRUE 1
  ------------------
 1484|  29.9k|    }
 1485|       |
 1486|       |    // Set this until we manage to fill the entire mergedExtents buffer
 1487|  47.0k|    item->partialMergedExtents = AVIF_TRUE;
  ------------------
  |  |   88|  47.0k|#define AVIF_TRUE 1
  ------------------
 1488|       |
 1489|  47.0k|    size_t writeOffset = 0; // Write offset for item->mergedExtents.data
 1490|  47.0k|    size_t remainingBytes = totalBytesToRead;
 1491|  53.3k|    for (uint32_t extentIter = 0; extentIter < item->extents.count; ++extentIter) {
  ------------------
  |  Branch (1491:35): [True: 53.3k, False: 0]
  ------------------
 1492|  53.3k|        avifExtent * extent = &item->extents.extent[extentIter];
 1493|       |
 1494|  53.3k|        size_t bytesToRead = extent->size;
 1495|  53.3k|        if (bytesToRead > remainingBytes) {
  ------------------
  |  Branch (1495:13): [True: 19.8k, False: 33.4k]
  ------------------
 1496|  19.8k|            bytesToRead = remainingBytes;
 1497|  19.8k|        }
 1498|       |
 1499|  53.3k|        avifROData offsetBuffer;
 1500|  53.3k|        if (idatBuffer) {
  ------------------
  |  Branch (1500:13): [True: 12.0k, False: 41.2k]
  ------------------
 1501|  12.0k|            if (extent->offset > idatBuffer->size) {
  ------------------
  |  Branch (1501:17): [True: 131, False: 11.9k]
  ------------------
 1502|    131|                avifDiagnosticsPrintf(diag, "Item ID %u has impossible extent offset in idat buffer", item->id);
 1503|    131|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 1504|    131|            }
 1505|       |            // Since extent->offset (a uint64_t) is not bigger than idatBuffer->size (a size_t),
 1506|       |            // it is safe to cast extent->offset to size_t.
 1507|  11.9k|            const size_t extentOffset = (size_t)extent->offset;
 1508|  11.9k|            if (extent->size > idatBuffer->size - extentOffset) {
  ------------------
  |  Branch (1508:17): [True: 17, False: 11.9k]
  ------------------
 1509|     17|                avifDiagnosticsPrintf(diag, "Item ID %u has impossible extent size in idat buffer", item->id);
 1510|     17|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 1511|     17|            }
 1512|  11.9k|            offsetBuffer.data = idatBuffer->data + extentOffset;
 1513|  11.9k|            offsetBuffer.size = idatBuffer->size - extentOffset;
 1514|  41.2k|        } else {
 1515|       |            // construction_method: file(0)
 1516|       |
 1517|  41.2k|            if ((io->sizeHint > 0) && (extent->offset > io->sizeHint)) {
  ------------------
  |  Branch (1517:17): [True: 41.2k, False: 0]
  |  Branch (1517:39): [True: 109, False: 41.1k]
  ------------------
 1518|    109|                avifDiagnosticsPrintf(diag, "Item ID %u extent offset failed size hint sanity check. Truncated data?", item->id);
 1519|    109|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 1520|    109|            }
 1521|  41.1k|            avifResult readResult = io->read(io, 0, extent->offset, bytesToRead, &offsetBuffer);
 1522|  41.1k|            if (readResult != AVIF_RESULT_OK) {
  ------------------
  |  Branch (1522:17): [True: 0, False: 41.1k]
  ------------------
 1523|      0|                return readResult;
 1524|      0|            }
 1525|  41.1k|            if (bytesToRead != offsetBuffer.size) {
  ------------------
  |  Branch (1525:17): [True: 452, False: 40.6k]
  ------------------
 1526|    452|                avifDiagnosticsPrintf(diag,
 1527|    452|                                      "Item ID %u tried to read %zu bytes, but only received %zu bytes",
 1528|    452|                                      item->id,
 1529|    452|                                      bytesToRead,
 1530|    452|                                      offsetBuffer.size);
 1531|    452|                return AVIF_RESULT_TRUNCATED_DATA;
 1532|    452|            }
 1533|  41.1k|        }
 1534|       |
 1535|  52.6k|        if (singlePersistentBuffer) {
  ------------------
  |  Branch (1535:13): [True: 16.8k, False: 35.7k]
  ------------------
 1536|  16.8k|            item->mergedExtents.data = (uint8_t *)offsetBuffer.data; // const_cast
 1537|  16.8k|            AVIF_ASSERT_OR_RETURN(bytesToRead <= offsetBuffer.size);
  ------------------
  |  |   64|  16.8k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  16.8k|    do {                        \
  |  |  |  |   46|  16.8k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 16.8k]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  16.8k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 16.8k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1538|  16.8k|            item->mergedExtents.size = bytesToRead;
 1539|  35.7k|        } else {
 1540|  35.7k|            AVIF_ASSERT_OR_RETURN(item->ownsMergedExtents);
  ------------------
  |  |   64|  35.7k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  35.7k|    do {                        \
  |  |  |  |   46|  35.7k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 35.7k]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  35.7k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 35.7k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1541|  35.7k|            AVIF_ASSERT_OR_RETURN(writeOffset < item->mergedExtents.size);
  ------------------
  |  |   64|  35.7k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  35.7k|    do {                        \
  |  |  |  |   46|  35.7k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 35.7k]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  35.7k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 35.7k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1542|  35.7k|            AVIF_ASSERT_OR_RETURN(bytesToRead <= item->mergedExtents.size - writeOffset);
  ------------------
  |  |   64|  35.7k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  35.7k|    do {                        \
  |  |  |  |   46|  35.7k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 35.7k]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  35.7k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 35.7k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1543|  35.7k|            memcpy(item->mergedExtents.data + writeOffset, offsetBuffer.data, bytesToRead);
 1544|  35.7k|            writeOffset += bytesToRead;
 1545|  35.7k|        }
 1546|       |
 1547|  52.6k|        remainingBytes -= bytesToRead;
 1548|  52.6k|        if (remainingBytes == 0) {
  ------------------
  |  Branch (1548:13): [True: 46.3k, False: 6.27k]
  ------------------
 1549|       |            // This happens when partialByteCount is set
 1550|  46.3k|            break;
 1551|  46.3k|        }
 1552|  52.6k|    }
 1553|  46.3k|    if (remainingBytes != 0) {
  ------------------
  |  Branch (1553:9): [True: 0, False: 46.3k]
  ------------------
 1554|       |        // This should be impossible?
 1555|      0|        avifDiagnosticsPrintf(diag, "Item ID %u has %zu unexpected trailing bytes", item->id, remainingBytes);
 1556|      0|        return AVIF_RESULT_TRUNCATED_DATA;
 1557|      0|    }
 1558|       |
 1559|  46.3k|    outData->data = item->mergedExtents.data + offset;
 1560|  46.3k|    outData->size = readOutputSize;
 1561|  46.3k|    item->partialMergedExtents = (item->size != totalBytesToRead);
 1562|  46.3k|    return AVIF_RESULT_OK;
 1563|  46.3k|}
read.c:avifDecoderAdoptGridTileCodecTypeIfNeeded:
 1679|  37.8k|{
 1680|  37.8k|    if ((info->grid.rows > 0) && (info->grid.columns > 0)) {
  ------------------
  |  Branch (1680:9): [True: 3.42k, False: 34.4k]
  |  Branch (1680:34): [True: 3.42k, False: 0]
  ------------------
 1681|       |        // The number of tiles was verified in avifDecoderItemReadAndParse().
 1682|  3.42k|        const uint32_t numTiles = info->grid.rows * info->grid.columns;
 1683|  3.42k|        uint32_t * dimgIdxToItemIdx = (uint32_t *)avifAlloc(numTiles * sizeof(uint32_t));
 1684|  3.42k|        AVIF_CHECKERR(dimgIdxToItemIdx != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  3.42k|    do {                        \
  |  |   46|  3.42k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 3.42k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  3.42k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.42k]
  |  |  ------------------
  ------------------
 1685|  3.42k|        avifResult result = avifFillDimgIdxToItemIdxArray(dimgIdxToItemIdx, numTiles, item);
 1686|  3.42k|        if (result == AVIF_RESULT_OK) {
  ------------------
  |  Branch (1686:13): [True: 3.42k, False: 0]
  ------------------
 1687|  3.42k|            result = avifDecoderAdoptGridTileCodecType(decoder, item, dimgIdxToItemIdx, numTiles);
 1688|  3.42k|        }
 1689|  3.42k|        avifFree(dimgIdxToItemIdx);
 1690|  3.42k|        AVIF_CHECKRES(result);
  ------------------
  |  |   54|  3.42k|    do {                                  \
  |  |   55|  3.42k|        const avifResult result__ = (A);  \
  |  |   56|  3.42k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 41, False: 3.38k]
  |  |  ------------------
  |  |   57|     41|            avifBreakOnError();           \
  |  |   58|     41|            return result__;              \
  |  |   59|     41|        }                                 \
  |  |   60|  3.42k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 3.38k]
  |  |  ------------------
  ------------------
 1691|  3.42k|    }
 1692|  37.8k|    return AVIF_RESULT_OK;
 1693|  37.8k|}
read.c:avifFillDimgIdxToItemIdxArray:
 1581|  6.67k|{
 1582|  6.67k|    const uint32_t itemIndexNotSet = UINT32_MAX;
 1583|  21.0k|    for (uint32_t dimgIdx = 0; dimgIdx < numExpectedTiles; ++dimgIdx) {
  ------------------
  |  Branch (1583:32): [True: 14.3k, False: 6.67k]
  ------------------
 1584|  14.3k|        dimgIdxToItemIdx[dimgIdx] = itemIndexNotSet;
 1585|  14.3k|    }
 1586|  6.67k|    uint32_t numTiles = 0;
 1587|  47.4k|    for (uint32_t i = 0; i < gridItem->meta->items.count; ++i) {
  ------------------
  |  Branch (1587:26): [True: 40.7k, False: 6.67k]
  ------------------
 1588|  40.7k|        if (gridItem->meta->items.item[i]->dimgForID == gridItem->id) {
  ------------------
  |  Branch (1588:13): [True: 14.3k, False: 26.4k]
  ------------------
 1589|  14.3k|            const uint32_t tileItemDimgIdx = gridItem->meta->items.item[i]->dimgIdx;
 1590|  14.3k|            AVIF_CHECKERR(tileItemDimgIdx < numExpectedTiles, AVIF_RESULT_INVALID_IMAGE_GRID);
  ------------------
  |  |   45|  14.3k|    do {                        \
  |  |   46|  14.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 14.3k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  14.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 14.3k]
  |  |  ------------------
  ------------------
 1591|  14.3k|            AVIF_CHECKERR(dimgIdxToItemIdx[tileItemDimgIdx] == itemIndexNotSet, AVIF_RESULT_INVALID_IMAGE_GRID);
  ------------------
  |  |   45|  14.3k|    do {                        \
  |  |   46|  14.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 14.3k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  14.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 14.3k]
  |  |  ------------------
  ------------------
 1592|  14.3k|            dimgIdxToItemIdx[tileItemDimgIdx] = i;
 1593|  14.3k|            ++numTiles;
 1594|  14.3k|        }
 1595|  40.7k|    }
 1596|       |    // The number of tiles has been verified in avifDecoderItemReadAndParse().
 1597|  6.67k|    AVIF_ASSERT_OR_RETURN(numTiles == numExpectedTiles);
  ------------------
  |  |   64|  6.67k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  6.67k|    do {                        \
  |  |  |  |   46|  6.67k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 6.67k]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  6.67k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 6.67k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1598|  6.67k|    return AVIF_RESULT_OK;
 1599|  6.67k|}
read.c:avifDecoderAdoptGridTileCodecType:
 1607|  3.42k|{
 1608|  3.42k|    avifDecoderItem * firstTileItem = NULL;
 1609|  10.7k|    for (uint32_t dimgIdx = 0; dimgIdx < numTiles; ++dimgIdx) {
  ------------------
  |  Branch (1609:32): [True: 7.31k, False: 3.38k]
  ------------------
 1610|  7.31k|        const uint32_t itemIdx = dimgIdxToItemIdx[dimgIdx];
 1611|  7.31k|        AVIF_ASSERT_OR_RETURN(itemIdx < gridItem->meta->items.count);
  ------------------
  |  |   64|  7.31k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  7.31k|    do {                        \
  |  |  |  |   46|  7.31k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 7.31k]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  7.31k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 7.31k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1612|  7.31k|        avifDecoderItem * item = gridItem->meta->items.item[itemIdx];
 1613|       |
 1614|       |        // According to HEIF (ISO 14496-12), Section 6.6.2.3.1, the SingleItemTypeReferenceBox of type 'dimg'
 1615|       |        // identifies the input images of the derived image item of type 'grid'. Since the reference_count
 1616|       |        // shall be equal to rows*columns, unknown tile item types cannot be skipped but must be considered
 1617|       |        // as errors.
 1618|  7.31k|        const avifCodecType tileCodecType = avifGetCodecType(item->type);
 1619|  7.31k|        if (tileCodecType == AVIF_CODEC_TYPE_UNKNOWN) {
  ------------------
  |  Branch (1619:13): [True: 37, False: 7.27k]
  ------------------
 1620|     37|            char type[4];
 1621|    185|            for (int j = 0; j < 4; j++) {
  ------------------
  |  Branch (1621:29): [True: 148, False: 37]
  ------------------
 1622|    148|                if (isprint((unsigned char)item->type[j])) {
  ------------------
  |  Branch (1622:21): [True: 44, False: 104]
  ------------------
 1623|     44|                    type[j] = item->type[j];
 1624|    104|                } else {
 1625|    104|                    type[j] = '.';
 1626|    104|                }
 1627|    148|            }
 1628|     37|            avifDiagnosticsPrintf(&decoder->diag,
 1629|     37|                                  "Tile item ID %u has an unknown item type '%.4s' (%02x%02x%02x%02x)",
 1630|     37|                                  item->id,
 1631|     37|                                  type,
 1632|     37|                                  item->type[0],
 1633|     37|                                  item->type[1],
 1634|     37|                                  item->type[2],
 1635|     37|                                  item->type[3]);
 1636|     37|            return AVIF_RESULT_INVALID_IMAGE_GRID;
 1637|     37|        }
 1638|       |
 1639|  7.27k|        if (item->hasUnsupportedEssentialProperty) {
  ------------------
  |  Branch (1639:13): [True: 3, False: 7.27k]
  ------------------
 1640|       |            // An essential property isn't supported by libavif; can't
 1641|       |            // decode a grid image if any tile in the grid isn't supported.
 1642|      3|            avifDiagnosticsPrintf(&decoder->diag, "Grid image contains tile with an unsupported property marked as essential");
 1643|      3|            return AVIF_RESULT_INVALID_IMAGE_GRID;
 1644|      3|        }
 1645|       |
 1646|  7.27k|        if (firstTileItem == NULL) {
  ------------------
  |  Branch (1646:13): [True: 3.39k, False: 3.87k]
  ------------------
 1647|  3.39k|            firstTileItem = item;
 1648|       |            // Adopt the configuration property of the first image item tile, so that it can be queried from
 1649|       |            // the top-level color/alpha item during avifDecoderReset().
 1650|  3.39k|            const avifCodecType codecType = avifGetCodecType(item->type);
 1651|  3.39k|            const char * configPropName = avifGetConfigurationPropertyName(codecType);
 1652|  3.39k|            const avifProperty * srcProp = avifPropertyArrayFind(&item->properties, configPropName);
 1653|  3.39k|            if (!srcProp) {
  ------------------
  |  Branch (1653:17): [True: 1, False: 3.39k]
  ------------------
 1654|      1|                avifDiagnosticsPrintf(&decoder->diag, "Grid image's first tile is missing an %s property", configPropName);
 1655|      1|                return AVIF_RESULT_INVALID_IMAGE_GRID;
 1656|      1|            }
 1657|  3.39k|            avifProperty * dstProp = (avifProperty *)avifArrayPush(&gridItem->properties);
 1658|  3.39k|            AVIF_CHECKERR(dstProp != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  3.39k|    do {                        \
  |  |   46|  3.39k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 3.39k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  3.39k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.39k]
  |  |  ------------------
  ------------------
 1659|  3.39k|            *dstProp = *srcProp;
 1660|       |
 1661|  3.87k|        } else if (memcmp(item->type, firstTileItem->type, 4)) {
  ------------------
  |  Branch (1661:20): [True: 0, False: 3.87k]
  ------------------
 1662|       |            // MIAF (ISO 23000-22:2019), Section 7.3.11.4.1:
 1663|       |            //   All input images of a grid image item shall use the same coding format [...]
 1664|       |            // The coding format is defined by the item type.
 1665|      0|            avifDiagnosticsPrintf(&decoder->diag,
 1666|      0|                                  "Tile item ID %u of type '%.4s' differs from other tile type '%.4s'",
 1667|      0|                                  item->id,
 1668|      0|                                  (const char *)item->type,
 1669|      0|                                  (const char *)firstTileItem->type);
 1670|      0|            return AVIF_RESULT_INVALID_IMAGE_GRID;
 1671|      0|        }
 1672|  7.27k|    }
 1673|  3.38k|    return AVIF_RESULT_OK;
 1674|  3.42k|}
read.c:avifDecoderGenerateImageTiles:
 6019|  37.0k|{
 6020|  37.0k|    const uint32_t previousTileCount = decoder->data->tiles.count;
 6021|  37.0k|    if ((info->grid.rows > 0) && (info->grid.columns > 0)) {
  ------------------
  |  Branch (6021:9): [True: 3.25k, False: 33.8k]
  |  Branch (6021:34): [True: 3.25k, False: 0]
  ------------------
 6022|       |        // The number of tiles was verified in avifDecoderItemReadAndParse().
 6023|  3.25k|        const uint32_t numTiles = info->grid.rows * info->grid.columns;
 6024|  3.25k|        uint32_t * dimgIdxToItemIdx = (uint32_t *)avifAlloc(numTiles * sizeof(uint32_t));
 6025|  3.25k|        AVIF_CHECKERR(dimgIdxToItemIdx != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  3.25k|    do {                        \
  |  |   46|  3.25k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 3.25k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  3.25k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 3.25k]
  |  |  ------------------
  ------------------
 6026|  3.25k|        avifResult result = avifFillDimgIdxToItemIdxArray(dimgIdxToItemIdx, numTiles, item);
 6027|  3.25k|        if (result == AVIF_RESULT_OK) {
  ------------------
  |  Branch (6027:13): [True: 3.25k, False: 0]
  ------------------
 6028|  3.25k|            result = avifDecoderGenerateImageGridTiles(decoder, item, itemCategory, dimgIdxToItemIdx, numTiles);
 6029|  3.25k|        }
 6030|  3.25k|        avifFree(dimgIdxToItemIdx);
 6031|  3.25k|        AVIF_CHECKRES(result);
  ------------------
  |  |   54|  3.25k|    do {                                  \
  |  |   55|  3.25k|        const avifResult result__ = (A);  \
  |  |   56|  3.25k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 19, False: 3.23k]
  |  |  ------------------
  |  |   57|     19|            avifBreakOnError();           \
  |  |   58|     19|            return result__;              \
  |  |   59|     19|        }                                 \
  |  |   60|  3.25k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 3.23k]
  |  |  ------------------
  ------------------
 6032|  33.8k|    } else {
 6033|  33.8k|        AVIF_CHECKERR(item->size != 0, AVIF_RESULT_MISSING_IMAGE_ITEM);
  ------------------
  |  |   45|  33.8k|    do {                        \
  |  |   46|  33.8k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 33.8k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  33.8k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 33.8k]
  |  |  ------------------
  ------------------
 6034|       |
 6035|  33.8k|        const avifCodecType codecType = avifGetCodecType(item->type);
 6036|  33.8k|        AVIF_ASSERT_OR_RETURN(codecType != AVIF_CODEC_TYPE_UNKNOWN);
  ------------------
  |  |   64|  33.8k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  33.8k|    do {                        \
  |  |  |  |   46|  33.8k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 33.8k]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  33.8k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 33.8k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6037|  33.8k|        avifTile * tile =
 6038|  33.8k|            avifDecoderDataCreateTile(decoder->data, codecType, item->width, item->height, avifDecoderItemOperatingPoint(item));
 6039|  33.8k|        AVIF_CHECKERR(tile, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  33.8k|    do {                        \
  |  |   46|  33.8k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 33.8k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  33.8k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 33.8k]
  |  |  ------------------
  ------------------
 6040|  33.8k|        AVIF_CHECKRES(avifCodecDecodeInputFillFromDecoderItem(tile->input,
  ------------------
  |  |   54|  33.8k|    do {                                  \
  |  |   55|  33.8k|        const avifResult result__ = (A);  \
  |  |   56|  33.8k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 82, False: 33.7k]
  |  |  ------------------
  |  |   57|     82|            avifBreakOnError();           \
  |  |   58|     82|            return result__;              \
  |  |   59|     82|        }                                 \
  |  |   60|  33.8k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 33.7k]
  |  |  ------------------
  ------------------
 6041|  33.8k|                                                              item,
 6042|  33.8k|                                                              decoder->allowProgressive,
 6043|  33.8k|                                                              decoder->imageCountLimit,
 6044|  33.8k|                                                              decoder->io->sizeHint,
 6045|  33.8k|                                                              &decoder->diag));
 6046|  33.7k|        tile->input->itemCategory = itemCategory;
 6047|  33.7k|    }
 6048|  36.9k|    info->tileCount = decoder->data->tiles.count - previousTileCount;
 6049|  36.9k|    return AVIF_RESULT_OK;
 6050|  37.0k|}
read.c:avifDecoderGenerateImageGridTiles:
 1701|  3.25k|{
 1702|  3.25k|    avifBool progressive = AVIF_TRUE;
  ------------------
  |  |   88|  3.25k|#define AVIF_TRUE 1
  ------------------
 1703|  10.2k|    for (uint32_t dimgIdx = 0; dimgIdx < numTiles; ++dimgIdx) {
  ------------------
  |  Branch (1703:32): [True: 6.97k, False: 3.23k]
  ------------------
 1704|  6.97k|        const uint32_t itemIdx = dimgIdxToItemIdx[dimgIdx];
 1705|  6.97k|        AVIF_ASSERT_OR_RETURN(itemIdx < gridItem->meta->items.count);
  ------------------
  |  |   64|  6.97k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  6.97k|    do {                        \
  |  |  |  |   46|  6.97k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 6.97k]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  6.97k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 6.97k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1706|  6.97k|        avifDecoderItem * item = gridItem->meta->items.item[itemIdx];
 1707|       |
 1708|  6.97k|        const avifCodecType tileCodecType = avifGetCodecType(item->type);
 1709|  6.97k|        AVIF_CHECKERR(tileCodecType != AVIF_CODEC_TYPE_UNKNOWN, AVIF_RESULT_INVALID_IMAGE_GRID);
  ------------------
  |  |   45|  6.97k|    do {                        \
  |  |   46|  6.97k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 6.97k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  6.97k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 6.97k]
  |  |  ------------------
  ------------------
 1710|  6.97k|        const avifTile * tile =
 1711|  6.97k|            avifDecoderDataCreateTile(decoder->data, tileCodecType, item->width, item->height, avifDecoderItemOperatingPoint(item));
 1712|  6.97k|        AVIF_CHECKERR(tile != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  6.97k|    do {                        \
  |  |   46|  6.97k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 6.97k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  6.97k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 6.97k]
  |  |  ------------------
  ------------------
 1713|  6.97k|        AVIF_CHECKRES(avifCodecDecodeInputFillFromDecoderItem(tile->input,
  ------------------
  |  |   54|  6.97k|    do {                                  \
  |  |   55|  6.97k|        const avifResult result__ = (A);  \
  |  |   56|  6.97k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 19, False: 6.95k]
  |  |  ------------------
  |  |   57|     19|            avifBreakOnError();           \
  |  |   58|     19|            return result__;              \
  |  |   59|     19|        }                                 \
  |  |   60|  6.97k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 6.95k]
  |  |  ------------------
  ------------------
 1714|  6.97k|                                                              item,
 1715|  6.97k|                                                              decoder->allowProgressive,
 1716|  6.97k|                                                              decoder->imageCountLimit,
 1717|  6.97k|                                                              decoder->io->sizeHint,
 1718|  6.97k|                                                              &decoder->diag));
 1719|  6.95k|        tile->input->itemCategory = itemCategory;
 1720|       |
 1721|  6.95k|        if (!item->progressive) {
  ------------------
  |  Branch (1721:13): [True: 6.95k, False: 0]
  ------------------
 1722|  6.95k|            progressive = AVIF_FALSE;
  ------------------
  |  |   89|  6.95k|#define AVIF_FALSE 0
  ------------------
 1723|  6.95k|        }
 1724|  6.95k|    }
 1725|  3.23k|    if (itemCategory == AVIF_ITEM_COLOR && progressive) {
  ------------------
  |  Branch (1725:9): [True: 1.74k, False: 1.48k]
  |  Branch (1725:44): [True: 0, False: 1.74k]
  ------------------
 1726|       |        // If all the items that make up the grid are progressive, then propagate that status to the top-level grid item.
 1727|      0|        gridItem->progressive = AVIF_TRUE;
  ------------------
  |  |   88|      0|#define AVIF_TRUE 1
  ------------------
 1728|      0|    }
 1729|  3.23k|    return AVIF_RESULT_OK;
 1730|  3.25k|}
read.c:avifDecoderItemOperatingPoint:
 1232|  40.8k|{
 1233|  40.8k|    const avifProperty * a1opProp = avifPropertyArrayFind(&item->properties, "a1op");
 1234|  40.8k|    if (a1opProp) {
  ------------------
  |  Branch (1234:9): [True: 0, False: 40.8k]
  ------------------
 1235|      0|        return a1opProp->u.a1op.opIndex;
 1236|      0|    }
 1237|  40.8k|    return 0; // default
 1238|  40.8k|}
read.c:avifCodecDecodeInputFillFromDecoderItem:
  619|  40.8k|{
  620|  40.8k|    if (sizeHint && (item->size > sizeHint)) {
  ------------------
  |  Branch (620:9): [True: 40.8k, False: 0]
  |  Branch (620:21): [True: 98, False: 40.7k]
  ------------------
  621|     98|        avifDiagnosticsPrintf(diag, "Exceeded avifIO's sizeHint, possibly truncated data");
  622|     98|        return AVIF_RESULT_BMFF_PARSE_FAILED;
  623|     98|    }
  624|       |
  625|  40.7k|    uint8_t layerCount = 0;
  626|  40.7k|    size_t layerSizes[4] = { 0 };
  627|  40.7k|    const avifProperty * a1lxProp = avifPropertyArrayFind(&item->properties, "a1lx");
  628|  40.7k|    if (a1lxProp) {
  ------------------
  |  Branch (628:9): [True: 6.15k, False: 34.5k]
  ------------------
  629|       |        // Calculate layer count and all layer sizes from the a1lx box, and then validate
  630|       |
  631|  6.15k|        size_t remainingSize = item->size;
  632|  12.3k|        for (int i = 0; i < 3; ++i) {
  ------------------
  |  Branch (632:25): [True: 12.3k, False: 7]
  ------------------
  633|  12.3k|            ++layerCount;
  634|       |
  635|  12.3k|            const size_t layerSize = (size_t)a1lxProp->u.a1lx.layerSize[i];
  636|  12.3k|            if (layerSize) {
  ------------------
  |  Branch (636:17): [True: 6.21k, False: 6.14k]
  ------------------
  637|  6.21k|                if (layerSize >= remainingSize) { // >= instead of > because there must be room for the last layer
  ------------------
  |  Branch (637:21): [True: 3, False: 6.21k]
  ------------------
  638|      3|                    avifDiagnosticsPrintf(diag, "a1lx layer index [%d] does not fit in item size", i);
  639|      3|                    return AVIF_RESULT_BMFF_PARSE_FAILED;
  640|      3|                }
  641|  6.21k|                layerSizes[i] = layerSize;
  642|  6.21k|                remainingSize -= layerSize;
  643|  6.21k|            } else {
  644|  6.14k|                layerSizes[i] = remainingSize;
  645|  6.14k|                remainingSize = 0;
  646|  6.14k|                break;
  647|  6.14k|            }
  648|  12.3k|        }
  649|  6.15k|        if (remainingSize > 0) {
  ------------------
  |  Branch (649:13): [True: 7, False: 6.14k]
  ------------------
  650|      7|            AVIF_ASSERT_OR_RETURN(layerCount == 3);
  ------------------
  |  |   64|      7|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|      7|    do {                        \
  |  |  |  |   46|      7|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 7]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|      7|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 7]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  651|      7|            ++layerCount;
  652|      7|            layerSizes[3] = remainingSize;
  653|      7|        }
  654|  6.15k|    }
  655|       |
  656|  40.7k|    const avifProperty * lselProp = avifPropertyArrayFind(&item->properties, "lsel");
  657|       |    // Progressive images offer layers via the a1lxProp, but don't specify a layer selection with lsel.
  658|       |    //
  659|       |    // For backward compatibility with earlier drafts of AVIF spec v1.1.0, treat an absent lsel as
  660|       |    // equivalent to layer_id == 0xFFFF during the transitional period. Remove !lselProp when the test
  661|       |    // images have been updated to the v1.1.0 spec.
  662|  40.7k|    item->progressive = (a1lxProp && (!lselProp || (lselProp->u.lsel.layerID == 0xFFFF)));
  ------------------
  |  Branch (662:26): [True: 6.15k, False: 34.5k]
  |  Branch (662:39): [True: 6.15k, False: 0]
  |  Branch (662:52): [True: 0, False: 0]
  ------------------
  663|  40.7k|    if (lselProp && (lselProp->u.lsel.layerID != 0xFFFF)) {
  ------------------
  |  Branch (663:9): [True: 0, False: 40.7k]
  |  Branch (663:21): [True: 0, False: 0]
  ------------------
  664|       |        // Layer selection. This requires that the underlying AV1 codec decodes all layers,
  665|       |        // and then only returns the requested layer as a single frame. To the user of libavif,
  666|       |        // this appears to be a single frame.
  667|       |
  668|      0|        decodeInput->allLayers = AVIF_TRUE;
  ------------------
  |  |   88|      0|#define AVIF_TRUE 1
  ------------------
  669|       |
  670|      0|        size_t sampleSize = 0;
  671|      0|        if (layerCount > 0) {
  ------------------
  |  Branch (671:13): [True: 0, False: 0]
  ------------------
  672|       |            // Optimization: If we're selecting a layer that doesn't require the entire image's payload (hinted via the a1lx box)
  673|       |
  674|      0|            if (lselProp->u.lsel.layerID >= layerCount) {
  ------------------
  |  Branch (674:17): [True: 0, False: 0]
  ------------------
  675|      0|                avifDiagnosticsPrintf(diag,
  676|      0|                                      "lsel property requests layer index [%u] which isn't present in a1lx property ([%u] layers)",
  677|      0|                                      lselProp->u.lsel.layerID,
  678|      0|                                      layerCount);
  679|      0|                return AVIF_RESULT_BMFF_PARSE_FAILED;
  680|      0|            }
  681|       |
  682|      0|            for (uint8_t i = 0; i <= lselProp->u.lsel.layerID; ++i) {
  ------------------
  |  Branch (682:33): [True: 0, False: 0]
  ------------------
  683|      0|                sampleSize += layerSizes[i];
  684|      0|            }
  685|      0|        } else {
  686|       |            // This layer's payload subsection is unknown, just use the whole payload
  687|      0|            sampleSize = item->size;
  688|      0|        }
  689|       |
  690|      0|        avifDecodeSample * sample = (avifDecodeSample *)avifArrayPush(&decodeInput->samples);
  691|      0|        AVIF_CHECKERR(sample != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|      0|    do {                        \
  |  |   46|      0|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
  692|      0|        sample->itemID = item->id;
  693|      0|        sample->offset = 0;
  694|      0|        sample->size = sampleSize;
  695|      0|        AVIF_ASSERT_OR_RETURN(lselProp->u.lsel.layerID < AVIF_MAX_AV1_LAYER_COUNT);
  ------------------
  |  |   64|      0|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|      0|    do {                        \
  |  |  |  |   46|      0|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|      0|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  696|      0|        sample->spatialID = (uint8_t)lselProp->u.lsel.layerID;
  697|      0|        sample->sync = AVIF_TRUE;
  ------------------
  |  |   88|      0|#define AVIF_TRUE 1
  ------------------
  698|  40.7k|    } else if (allowProgressive && item->progressive) {
  ------------------
  |  Branch (698:16): [True: 21.8k, False: 18.8k]
  |  Branch (698:36): [True: 4.31k, False: 17.5k]
  ------------------
  699|       |        // Progressive image. Decode all layers and expose them all to the user.
  700|       |
  701|  4.31k|        if (imageCountLimit && (layerCount > imageCountLimit)) {
  ------------------
  |  Branch (701:13): [True: 4.31k, False: 0]
  |  Branch (701:32): [True: 0, False: 4.31k]
  ------------------
  702|      0|            avifDiagnosticsPrintf(diag, "Exceeded avifDecoder's imageCountLimit (progressive)");
  703|      0|            return AVIF_RESULT_BMFF_PARSE_FAILED;
  704|      0|        }
  705|       |
  706|  4.31k|        decodeInput->allLayers = AVIF_TRUE;
  ------------------
  |  |   88|  4.31k|#define AVIF_TRUE 1
  ------------------
  707|       |
  708|  4.31k|        size_t offset = 0;
  709|  12.9k|        for (int i = 0; i < layerCount; ++i) {
  ------------------
  |  Branch (709:25): [True: 8.67k, False: 4.31k]
  ------------------
  710|  8.67k|            avifDecodeSample * sample = (avifDecodeSample *)avifArrayPush(&decodeInput->samples);
  711|  8.67k|            AVIF_CHECKERR(sample != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  8.67k|    do {                        \
  |  |   46|  8.67k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 8.67k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  8.67k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 8.67k]
  |  |  ------------------
  ------------------
  712|  8.67k|            sample->itemID = item->id;
  713|  8.67k|            sample->offset = offset;
  714|  8.67k|            sample->size = layerSizes[i];
  715|  8.67k|            sample->spatialID = AVIF_SPATIAL_ID_UNSET;
  ------------------
  |  |  461|  8.67k|#define AVIF_SPATIAL_ID_UNSET 0xff
  ------------------
  716|  8.67k|            sample->sync = (i == 0); // Assume all layers depend on the first layer
  717|       |
  718|  8.67k|            offset += layerSizes[i];
  719|  8.67k|        }
  720|  36.3k|    } else {
  721|       |        // Typical case: Use the entire item's payload for a single frame output
  722|       |
  723|  36.3k|        avifDecodeSample * sample = (avifDecodeSample *)avifArrayPush(&decodeInput->samples);
  724|  36.3k|        AVIF_CHECKERR(sample != NULL, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  36.3k|    do {                        \
  |  |   46|  36.3k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 36.3k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  36.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 36.3k]
  |  |  ------------------
  ------------------
  725|  36.3k|        sample->itemID = item->id;
  726|  36.3k|        sample->offset = 0;
  727|  36.3k|        sample->size = item->size;
  728|  36.3k|        sample->spatialID = AVIF_SPATIAL_ID_UNSET;
  ------------------
  |  |  461|  36.3k|#define AVIF_SPATIAL_ID_UNSET 0xff
  ------------------
  729|  36.3k|        sample->sync = AVIF_TRUE;
  ------------------
  |  |   88|  36.3k|#define AVIF_TRUE 1
  ------------------
  730|  36.3k|    }
  731|  40.7k|    return AVIF_RESULT_OK;
  732|  40.7k|}
read.c:avifDecoderItemValidateProperties:
 1244|  36.9k|{
 1245|  36.9k|    const avifProperty * const configProp = avifPropertyArrayFind(&item->properties, configPropName);
 1246|  36.9k|    if (!configProp) {
  ------------------
  |  Branch (1246:9): [True: 7, False: 36.9k]
  ------------------
 1247|       |        // An item configuration property box is mandatory in all valid AVIF configurations. Bail out.
 1248|      7|        avifDiagnosticsPrintf(diag, "Item ID %u of type '%.4s' is missing mandatory %s property", item->id, (const char *)item->type, configPropName);
 1249|      7|        return AVIF_RESULT_BMFF_PARSE_FAILED;
 1250|      7|    }
 1251|       |
 1252|  36.9k|    if (!memcmp(item->type, "grid", 4)) {
  ------------------
  |  Branch (1252:9): [True: 3.23k, False: 33.7k]
  ------------------
 1253|  22.9k|        for (uint32_t i = 0; i < item->meta->items.count; ++i) {
  ------------------
  |  Branch (1253:30): [True: 19.7k, False: 3.22k]
  ------------------
 1254|  19.7k|            avifDecoderItem * tile = item->meta->items.item[i];
 1255|  19.7k|            if (tile->dimgForID != item->id) {
  ------------------
  |  Branch (1255:17): [True: 12.7k, False: 6.93k]
  ------------------
 1256|  12.7k|                continue;
 1257|  12.7k|            }
 1258|       |            // Tile item types were checked in avifDecoderGenerateImageTiles(), no need to do it here.
 1259|       |
 1260|       |            // MIAF (ISO 23000-22:2019), Section 7.3.11.4.1:
 1261|       |            //   All input images of a grid image item shall use the same [...] chroma sampling format,
 1262|       |            //   and the same decoder configuration (see 7.3.6.2).
 1263|       |
 1264|       |            // The chroma sampling format is part of the decoder configuration.
 1265|  6.93k|            const avifProperty * tileConfigProp = avifPropertyArrayFind(&tile->properties, configPropName);
 1266|  6.93k|            if (!tileConfigProp) {
  ------------------
  |  Branch (1266:17): [True: 2, False: 6.93k]
  ------------------
 1267|      2|                avifDiagnosticsPrintf(diag,
 1268|      2|                                      "Tile item ID %u of type '%.4s' is missing mandatory %s property",
 1269|      2|                                      tile->id,
 1270|      2|                                      (const char *)tile->type,
 1271|      2|                                      configPropName);
 1272|      2|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 1273|      2|            }
 1274|       |            // configProp was copied from a tile item to the grid item. Comparing tileConfigProp with it
 1275|       |            // is equivalent to comparing tileConfigProp with the configPropName from the first tile.
 1276|  6.93k|            if ((tileConfigProp->u.av1C.seqProfile != configProp->u.av1C.seqProfile) ||
  ------------------
  |  Branch (1276:17): [True: 3, False: 6.92k]
  ------------------
 1277|  6.92k|                (tileConfigProp->u.av1C.seqLevelIdx0 != configProp->u.av1C.seqLevelIdx0) ||
  ------------------
  |  Branch (1277:17): [True: 1, False: 6.92k]
  ------------------
 1278|  6.92k|                (tileConfigProp->u.av1C.seqTier0 != configProp->u.av1C.seqTier0) ||
  ------------------
  |  Branch (1278:17): [True: 0, False: 6.92k]
  ------------------
 1279|  6.92k|                (tileConfigProp->u.av1C.highBitdepth != configProp->u.av1C.highBitdepth) ||
  ------------------
  |  Branch (1279:17): [True: 1, False: 6.92k]
  ------------------
 1280|  6.92k|                (tileConfigProp->u.av1C.twelveBit != configProp->u.av1C.twelveBit) ||
  ------------------
  |  Branch (1280:17): [True: 0, False: 6.92k]
  ------------------
 1281|  6.92k|                (tileConfigProp->u.av1C.monochrome != configProp->u.av1C.monochrome) ||
  ------------------
  |  Branch (1281:17): [True: 1, False: 6.92k]
  ------------------
 1282|  6.92k|                (tileConfigProp->u.av1C.chromaSubsamplingX != configProp->u.av1C.chromaSubsamplingX) ||
  ------------------
  |  Branch (1282:17): [True: 0, False: 6.92k]
  ------------------
 1283|  6.92k|                (tileConfigProp->u.av1C.chromaSubsamplingY != configProp->u.av1C.chromaSubsamplingY) ||
  ------------------
  |  Branch (1283:17): [True: 0, False: 6.92k]
  ------------------
 1284|  6.92k|                (tileConfigProp->u.av1C.chromaSamplePosition != configProp->u.av1C.chromaSamplePosition)) {
  ------------------
  |  Branch (1284:17): [True: 0, False: 6.92k]
  ------------------
 1285|      6|                avifDiagnosticsPrintf(diag,
 1286|      6|                                      "The fields of the %s property of tile item ID %u of type '%.4s' differs from other tiles",
 1287|      6|                                      configPropName,
 1288|      6|                                      tile->id,
 1289|      6|                                      (const char *)tile->type);
 1290|      6|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 1291|      6|            }
 1292|  6.93k|        }
 1293|  3.23k|    }
 1294|       |
 1295|  36.9k|    const avifProperty * pixiProp = avifPropertyArrayFind(&item->properties, "pixi");
 1296|  36.9k|    if (!pixiProp && (strictFlags & AVIF_STRICT_PIXI_REQUIRED)) {
  ------------------
  |  Branch (1296:9): [True: 2.55k, False: 34.4k]
  |  Branch (1296:22): [True: 18, False: 2.53k]
  ------------------
 1297|       |        // A pixi box is mandatory in all valid AVIF configurations. Bail out.
 1298|     18|        avifDiagnosticsPrintf(diag,
 1299|     18|                              "[Strict] Item ID %u of type '%.4s' is missing mandatory pixi property",
 1300|     18|                              item->id,
 1301|     18|                              (const char *)item->type);
 1302|     18|        return AVIF_RESULT_BMFF_PARSE_FAILED;
 1303|     18|    }
 1304|       |
 1305|  36.9k|    if (pixiProp) {
  ------------------
  |  Branch (1305:9): [True: 34.4k, False: 2.53k]
  ------------------
 1306|  34.4k|        const uint32_t configDepth = avifCodecConfigurationBoxGetDepth(&configProp->u.av1C);
 1307|   121k|        for (uint8_t i = 0; i < pixiProp->u.pixi.planeCount; ++i) {
  ------------------
  |  Branch (1307:29): [True: 87.4k, False: 34.4k]
  ------------------
 1308|  87.4k|            if (pixiProp->u.pixi.planeDepths[i] != configDepth) {
  ------------------
  |  Branch (1308:17): [True: 8, False: 87.4k]
  ------------------
 1309|       |                // pixi depth must match configuration property depth
 1310|      8|                avifDiagnosticsPrintf(diag,
 1311|      8|                                      "Item ID %u depth specified by pixi property [%u] does not match %s property depth [%u]",
 1312|      8|                                      item->id,
 1313|      8|                                      pixiProp->u.pixi.planeDepths[i],
 1314|      8|                                      configPropName,
 1315|      8|                                      configDepth);
 1316|      8|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 1317|      8|            }
 1318|       |#if defined(AVIF_ENABLE_EXPERIMENTAL_EXTENDED_PIXI)
 1319|       |            if (pixiProp->u.pixi.subsamplingFlag[i]) {
 1320|       |                if (pixiProp->u.pixi.subsamplingType[i] != avifCodecConfigurationBoxGetSubsamplingType(&configProp->u.av1C, i)) {
 1321|       |                    avifDiagnosticsPrintf(diag,
 1322|       |                                          "Item ID %u subsampling type specified by pixi property [%u] for channel %u does not match %s property [%u,%u]",
 1323|       |                                          item->id,
 1324|       |                                          pixiProp->u.pixi.subsamplingType[i],
 1325|       |                                          i,
 1326|       |                                          configPropName,
 1327|       |                                          configProp->u.av1C.chromaSubsamplingX,
 1328|       |                                          configProp->u.av1C.chromaSubsamplingY);
 1329|       |                    return AVIF_RESULT_BMFF_PARSE_FAILED;
 1330|       |                }
 1331|       |                if (configProp->u.av1C.chromaSamplePosition != AVIF_CHROMA_SAMPLE_POSITION_UNKNOWN) {
 1332|       |                    const avifChromaSamplePosition expectedChromaSamplePosition =
 1333|       |                        i == AVIF_CHAN_Y ? AVIF_CHROMA_SAMPLE_POSITION_COLOCATED : configProp->u.av1C.chromaSamplePosition;
 1334|       |                    if (avifSubsamplingLocationToChromaSamplePosition(pixiProp->u.pixi.subsamplingType[i],
 1335|       |                                                                      pixiProp->u.pixi.subsamplingLocation[i]) !=
 1336|       |                        expectedChromaSamplePosition) {
 1337|       |                        avifDiagnosticsPrintf(diag,
 1338|       |                                              "Item ID %u subsampling type and location specified by pixi property [%u,%u] for channel %u does not match %s property chroma sample position [%u]",
 1339|       |                                              item->id,
 1340|       |                                              pixiProp->u.pixi.subsamplingType[i],
 1341|       |                                              pixiProp->u.pixi.subsamplingLocation[i],
 1342|       |                                              i,
 1343|       |                                              configPropName,
 1344|       |                                              configProp->u.av1C.chromaSamplePosition);
 1345|       |                        return AVIF_RESULT_BMFF_PARSE_FAILED;
 1346|       |                    }
 1347|       |                }
 1348|       |            }
 1349|       |#endif // AVIF_ENABLE_EXPERIMENTAL_EXTENDED_PIXI
 1350|  87.4k|        }
 1351|  34.4k|    }
 1352|       |
 1353|       |#if defined(AVIF_ENABLE_EXPERIMENTAL_MINI)
 1354|       |    if (item->miniBoxPixelFormat != AVIF_PIXEL_FORMAT_NONE) {
 1355|       |        // This is a MinimizedImageBox ('mini').
 1356|       |
 1357|       |        avifPixelFormat av1CPixelFormat;
 1358|       |        if (configProp->u.av1C.monochrome) {
 1359|       |            av1CPixelFormat = AVIF_PIXEL_FORMAT_YUV400;
 1360|       |        } else if (configProp->u.av1C.chromaSubsamplingY == 1) {
 1361|       |            av1CPixelFormat = AVIF_PIXEL_FORMAT_YUV420;
 1362|       |        } else if (configProp->u.av1C.chromaSubsamplingX == 1) {
 1363|       |            av1CPixelFormat = AVIF_PIXEL_FORMAT_YUV422;
 1364|       |        } else {
 1365|       |            av1CPixelFormat = AVIF_PIXEL_FORMAT_YUV444;
 1366|       |        }
 1367|       |        if (item->miniBoxPixelFormat != av1CPixelFormat) {
 1368|       |            avifDiagnosticsPrintf(diag,
 1369|       |                                  "Item ID %u format [%s] specified by MinimizedImageBox does not match %s property format [%s]",
 1370|       |                                  item->id,
 1371|       |                                  avifPixelFormatToString(item->miniBoxPixelFormat),
 1372|       |                                  configPropName,
 1373|       |                                  avifPixelFormatToString(av1CPixelFormat));
 1374|       |            return AVIF_RESULT_BMFF_PARSE_FAILED;
 1375|       |        }
 1376|       |
 1377|       |        if (configProp->u.av1C.chromaSamplePosition == /*CSP_UNKNOWN=*/0) {
 1378|       |            // Section 6.4.2. Color config semantics of AV1 specification says:
 1379|       |            //   CSP_UNKNOWN - the source video transfer function must be signaled outside the AV1 bitstream
 1380|       |            // See https://aomediacodec.github.io/av1-spec/#color-config-semantics
 1381|       |
 1382|       |            // So item->miniBoxChromaSamplePosition can differ and will override the AV1 value.
 1383|       |        } else if ((uint8_t)item->miniBoxChromaSamplePosition != configProp->u.av1C.chromaSamplePosition) {
 1384|       |            avifDiagnosticsPrintf(diag,
 1385|       |                                  "Item ID %u chroma sample position [%u] specified by MinimizedImageBox does not match %s property chroma sample position [%u]",
 1386|       |                                  item->id,
 1387|       |                                  (uint32_t)item->miniBoxChromaSamplePosition,
 1388|       |                                  configPropName,
 1389|       |                                  configProp->u.av1C.chromaSamplePosition);
 1390|       |            return AVIF_RESULT_BMFF_PARSE_FAILED;
 1391|       |        }
 1392|       |    }
 1393|       |#endif // AVIF_ENABLE_EXPERIMENTAL_MINI
 1394|       |
 1395|  36.9k|    if (strictFlags & AVIF_STRICT_CLAP_VALID) {
  ------------------
  |  Branch (1395:9): [True: 18.3k, False: 18.6k]
  ------------------
 1396|  18.3k|        const avifProperty * clapProp = avifPropertyArrayFind(&item->properties, "clap");
 1397|  18.3k|        if (clapProp) {
  ------------------
  |  Branch (1397:13): [True: 0, False: 18.3k]
  ------------------
 1398|      0|            const avifProperty * ispeProp = avifPropertyArrayFind(&item->properties, "ispe");
 1399|      0|            if (!ispeProp) {
  ------------------
  |  Branch (1399:17): [True: 0, False: 0]
  ------------------
 1400|      0|                avifDiagnosticsPrintf(diag,
 1401|      0|                                      "[Strict] Item ID %u is missing an ispe property, so its clap property cannot be validated",
 1402|      0|                                      item->id);
 1403|      0|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 1404|      0|            }
 1405|       |
 1406|      0|            avifCropRect cropRect;
 1407|      0|            const uint32_t imageW = ispeProp->u.ispe.width;
 1408|      0|            const uint32_t imageH = ispeProp->u.ispe.height;
 1409|      0|            const avifBool validClap = avifCropRectFromCleanApertureBox(&cropRect, &clapProp->u.clap, imageW, imageH, diag);
 1410|      0|            if (!validClap) {
  ------------------
  |  Branch (1410:17): [True: 0, False: 0]
  ------------------
 1411|      0|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 1412|      0|            }
 1413|      0|        }
 1414|  18.3k|    }
 1415|  36.9k|    return AVIF_RESULT_OK;
 1416|  36.9k|}
read.c:avifCodecConfigurationBoxGetDepth:
  383|  69.0k|{
  384|  69.0k|    if (av1C->twelveBit) {
  ------------------
  |  Branch (384:9): [True: 717, False: 68.3k]
  ------------------
  385|    717|        return 12;
  386|  68.3k|    } else if (av1C->highBitdepth) {
  ------------------
  |  Branch (386:16): [True: 9.58k, False: 58.7k]
  ------------------
  387|  9.58k|        return 10;
  388|  9.58k|    }
  389|  58.7k|    return 8;
  390|  69.0k|}
read.c:avifGetConfigurationPropertyName:
   58|  75.0k|{
   59|  75.0k|    static const char kUnknown[] = "****";
   60|  75.0k|    switch (codecType) {
   61|  75.0k|        case AVIF_CODEC_TYPE_AV1:
  ------------------
  |  Branch (61:9): [True: 75.0k, False: 0]
  ------------------
   62|  75.0k|            return "av1C";
   63|       |#if defined(AVIF_CODEC_AVM)
   64|       |        case AVIF_CODEC_TYPE_AV2:
   65|       |            return "av2C";
   66|       |#endif
   67|      0|        default:
  ------------------
  |  Branch (67:9): [True: 0, False: 75.0k]
  ------------------
   68|       |            assert(AVIF_FALSE);
   69|      0|            return kUnknown; // Easier to deal with than NULL.
   70|  75.0k|    }
   71|  75.0k|}
read.c:avifReadCodecConfigProperty:
 6054|  34.6k|{
 6055|  34.6k|    const avifProperty * configProp = avifPropertyArrayFind(properties, avifGetConfigurationPropertyName(codecType));
 6056|  34.6k|    if (configProp) {
  ------------------
  |  Branch (6056:9): [True: 34.6k, False: 29]
  ------------------
 6057|  34.6k|        image->depth = avifCodecConfigurationBoxGetDepth(&configProp->u.av1C);
 6058|  34.6k|        if (configProp->u.av1C.monochrome) {
  ------------------
  |  Branch (6058:13): [True: 3.30k, False: 31.3k]
  ------------------
 6059|  3.30k|            image->yuvFormat = AVIF_PIXEL_FORMAT_YUV400;
 6060|  31.3k|        } else {
 6061|  31.3k|            if (configProp->u.av1C.chromaSubsamplingX && configProp->u.av1C.chromaSubsamplingY) {
  ------------------
  |  Branch (6061:17): [True: 8.21k, False: 23.1k]
  |  Branch (6061:58): [True: 5.41k, False: 2.79k]
  ------------------
 6062|  5.41k|                image->yuvFormat = AVIF_PIXEL_FORMAT_YUV420;
 6063|  25.9k|            } else if (configProp->u.av1C.chromaSubsamplingX) {
  ------------------
  |  Branch (6063:24): [True: 2.79k, False: 23.1k]
  ------------------
 6064|  2.79k|                image->yuvFormat = AVIF_PIXEL_FORMAT_YUV422;
 6065|  23.1k|            } else {
 6066|  23.1k|                image->yuvFormat = AVIF_PIXEL_FORMAT_YUV444;
 6067|  23.1k|            }
 6068|  31.3k|        }
 6069|  34.6k|        image->yuvChromaSamplePosition = (avifChromaSamplePosition)configProp->u.av1C.chromaSamplePosition;
 6070|  34.6k|    } else {
 6071|       |        // A configuration property box is mandatory in all valid AVIF configurations. Bail out.
 6072|     29|        return AVIF_RESULT_BMFF_PARSE_FAILED;
 6073|     29|    }
 6074|  34.6k|    return AVIF_RESULT_OK;
 6075|  34.6k|}
read.c:avifReadColorProperties:
 5669|  35.0k|{
 5670|       |    // Find and adopt all colr boxes "at most one for a given value of colour type" (HEIF 6.5.5.1, from Amendment 3)
 5671|       |    // Accept one of each type, and bail out if more than one of a given type is provided.
 5672|  35.0k|    avifBool colrICCSeen = AVIF_FALSE;
  ------------------
  |  |   89|  35.0k|#define AVIF_FALSE 0
  ------------------
 5673|   190k|    for (uint32_t propertyIndex = 0; propertyIndex < properties->count; ++propertyIndex) {
  ------------------
  |  Branch (5673:38): [True: 155k, False: 35.0k]
  ------------------
 5674|   155k|        avifProperty * prop = &properties->prop[propertyIndex];
 5675|   155k|        if (!memcmp(prop->type, "colr", 4) && prop->u.colr.hasICC) {
  ------------------
  |  Branch (5675:13): [True: 23.4k, False: 132k]
  |  Branch (5675:47): [True: 151, False: 23.2k]
  ------------------
 5676|    151|            if (colrICCSeen) {
  ------------------
  |  Branch (5676:17): [True: 2, False: 149]
  ------------------
 5677|      2|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 5678|      2|            }
 5679|    149|            avifROData iccRead;
 5680|    149|            AVIF_CHECKRES(io->read(io, 0, prop->u.colr.iccOffset, prop->u.colr.iccSize, &iccRead));
  ------------------
  |  |   54|    149|    do {                                  \
  |  |   55|    149|        const avifResult result__ = (A);  \
  |  |   56|    149|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 149]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|    149|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 149]
  |  |  ------------------
  ------------------
 5681|    149|            colrICCSeen = AVIF_TRUE;
  ------------------
  |  |   88|    149|#define AVIF_TRUE 1
  ------------------
 5682|    149|            AVIF_CHECKRES(avifRWDataSet(icc, iccRead.data, iccRead.size));
  ------------------
  |  |   54|    149|    do {                                  \
  |  |   55|    149|        const avifResult result__ = (A);  \
  |  |   56|    149|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 149]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|    149|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 149]
  |  |  ------------------
  ------------------
 5683|    149|        }
 5684|   155k|    }
 5685|  35.0k|    return avifReadColorNclxProperty(properties, colorPrimaries, transferCharacteristics, matrixCoefficients, yuvRange, cicpSet);
 5686|  35.0k|}
read.c:avifDecoderCheckAlphaProperties:
 5922|  6.78k|{
 5923|  6.78k|    const avifImage * image = decoder->image;
 5924|       |    // The 'clap', 'irot' and 'imir' transformative properties should be applied to the alpha
 5925|       |    // auxiliary image item before considering it a plane of the color image item.
 5926|       |    // Alternatively, inequality with the transformative properties attached to the color image item
 5927|       |    // should be treated as AVIF_RESULT_NOT_IMPLEMENTED.
 5928|       |    // The latter is easier and is the behavior of libavif.
 5929|       |
 5930|  6.78k|    const avifProperty * clapProp = avifPropertyArrayFind(alphaProperties, "clap");
 5931|  6.78k|    const avifProperty * irotProp = avifPropertyArrayFind(alphaProperties, "irot");
 5932|  6.78k|    const avifProperty * imirProp = avifPropertyArrayFind(alphaProperties, "imir");
 5933|  6.78k|    if (clapProp == NULL && irotProp == NULL && imirProp == NULL) {
  ------------------
  |  Branch (5933:9): [True: 6.78k, False: 0]
  |  Branch (5933:29): [True: 6.41k, False: 372]
  |  Branch (5933:49): [True: 6.41k, False: 0]
  ------------------
 5934|       |        // However, libavif up to version 1.3.0 generated images lacking transformative property
 5935|       |        // associations with alpha auxiliary image items, so be lenient on their absence for
 5936|       |        // backward compatibility with previously generated images.
 5937|  6.41k|        return AVIF_RESULT_OK;
 5938|  6.41k|    }
 5939|       |
 5940|       |    // HEIF (ISO/IEC 23008-12), Section 6.9.1:
 5941|       |    //   When the width or the height of the alpha plane differs from the width or the height of the
 5942|       |    //   master image, respectively, the alpha plane is resized to have the same width and height as
 5943|       |    //   those of the master image.
 5944|       |    // There is no need to enforce specific 'ispe' values describing the alpha item because
 5945|       |    // the alpha item must be resized to the dimensions of the associated color item.
 5946|       |
 5947|    372|    if (!clapProp != !(image->transformFlags & AVIF_TRANSFORM_CLAP) ||
  ------------------
  |  Branch (5947:9): [True: 0, False: 372]
  ------------------
 5948|    372|        (clapProp && (clapProp->u.clap.widthN != image->clap.widthN || clapProp->u.clap.widthD != image->clap.widthD ||
  ------------------
  |  Branch (5948:10): [True: 0, False: 372]
  |  Branch (5948:23): [True: 0, False: 0]
  |  Branch (5948:72): [True: 0, False: 0]
  ------------------
 5949|      0|                      clapProp->u.clap.heightN != image->clap.heightN || clapProp->u.clap.heightD != image->clap.heightD ||
  ------------------
  |  Branch (5949:23): [True: 0, False: 0]
  |  Branch (5949:74): [True: 0, False: 0]
  ------------------
 5950|      0|                      clapProp->u.clap.horizOffN != image->clap.horizOffN || clapProp->u.clap.horizOffD != image->clap.horizOffD ||
  ------------------
  |  Branch (5950:23): [True: 0, False: 0]
  |  Branch (5950:78): [True: 0, False: 0]
  ------------------
 5951|      0|                      clapProp->u.clap.vertOffN != image->clap.vertOffN || clapProp->u.clap.vertOffD != image->clap.vertOffD))) {
  ------------------
  |  Branch (5951:23): [True: 0, False: 0]
  |  Branch (5951:76): [True: 0, False: 0]
  ------------------
 5952|      0|        avifDiagnosticsPrintf(&decoder->diag, "Clean aperture property mismatch between alpha auxiliary image item and color item");
 5953|      0|        return AVIF_RESULT_NOT_IMPLEMENTED;
 5954|      0|    }
 5955|    372|    if (!irotProp != !(image->transformFlags & AVIF_TRANSFORM_IROT) || (irotProp && irotProp->u.irot.angle != image->irot.angle)) {
  ------------------
  |  Branch (5955:9): [True: 1, False: 371]
  |  Branch (5955:73): [True: 371, False: 0]
  |  Branch (5955:85): [True: 0, False: 371]
  ------------------
 5956|      1|        avifDiagnosticsPrintf(&decoder->diag, "Rotation property mismatch between alpha auxiliary image item and color item");
 5957|      1|        return AVIF_RESULT_NOT_IMPLEMENTED;
 5958|      1|    }
 5959|    371|    if (!imirProp != !(image->transformFlags & AVIF_TRANSFORM_IMIR) || (imirProp && imirProp->u.imir.axis != image->imir.axis)) {
  ------------------
  |  Branch (5959:9): [True: 0, False: 371]
  |  Branch (5959:73): [True: 0, False: 371]
  |  Branch (5959:85): [True: 0, False: 0]
  ------------------
 5960|      0|        avifDiagnosticsPrintf(&decoder->diag, "Mirroring property mismatch between alpha auxiliary image item and color item");
 5961|      0|        return AVIF_RESULT_NOT_IMPLEMENTED;
 5962|      0|    }
 5963|    371|    return AVIF_RESULT_OK;
 5964|    371|}
read.c:avifDecoderCheckGainMapProperties:
 5967|    284|{
 5968|    284|    const avifImage * image = decoder->image;
 5969|       |    // libavif requires the bitstream contain the same 'pasp', 'clap', 'irot', 'imir'
 5970|       |    // properties for both the base and gain map image items used as input to
 5971|       |    // the tone-mapped derived image item. libavif also requires the tone-mapped
 5972|       |    // derived image item itself not be associated with these properties. This is
 5973|       |    // enforced at encoding. Other patterns are rejected at decoding.
 5974|    284|    const avifProperty * paspProp = avifPropertyArrayFind(gainMapProperties, "pasp");
 5975|    284|    if (!paspProp != !(image->transformFlags & AVIF_TRANSFORM_PASP) ||
  ------------------
  |  Branch (5975:9): [True: 0, False: 284]
  ------------------
 5976|    284|        (paspProp && (paspProp->u.pasp.hSpacing != image->pasp.hSpacing || paspProp->u.pasp.vSpacing != image->pasp.vSpacing))) {
  ------------------
  |  Branch (5976:10): [True: 0, False: 284]
  |  Branch (5976:23): [True: 0, False: 0]
  |  Branch (5976:76): [True: 0, False: 0]
  ------------------
 5977|      0|        avifDiagnosticsPrintf(&decoder->diag,
 5978|      0|                              "Pixel aspect ratio property mismatch between input items of tone-mapping derived image item");
 5979|      0|        return AVIF_RESULT_DECODE_GAIN_MAP_FAILED;
 5980|      0|    }
 5981|    284|    const avifProperty * clapProp = avifPropertyArrayFind(gainMapProperties, "clap");
 5982|    284|    if (!clapProp != !(image->transformFlags & AVIF_TRANSFORM_CLAP) ||
  ------------------
  |  Branch (5982:9): [True: 0, False: 284]
  ------------------
 5983|    284|        (clapProp && (clapProp->u.clap.widthN != image->clap.widthN || clapProp->u.clap.widthD != image->clap.widthD ||
  ------------------
  |  Branch (5983:10): [True: 0, False: 284]
  |  Branch (5983:23): [True: 0, False: 0]
  |  Branch (5983:72): [True: 0, False: 0]
  ------------------
 5984|      0|                      clapProp->u.clap.heightN != image->clap.heightN || clapProp->u.clap.heightD != image->clap.heightD ||
  ------------------
  |  Branch (5984:23): [True: 0, False: 0]
  |  Branch (5984:74): [True: 0, False: 0]
  ------------------
 5985|      0|                      clapProp->u.clap.horizOffN != image->clap.horizOffN || clapProp->u.clap.horizOffD != image->clap.horizOffD ||
  ------------------
  |  Branch (5985:23): [True: 0, False: 0]
  |  Branch (5985:78): [True: 0, False: 0]
  ------------------
 5986|      0|                      clapProp->u.clap.vertOffN != image->clap.vertOffN || clapProp->u.clap.vertOffD != image->clap.vertOffD))) {
  ------------------
  |  Branch (5986:23): [True: 0, False: 0]
  |  Branch (5986:76): [True: 0, False: 0]
  ------------------
 5987|      0|        avifDiagnosticsPrintf(&decoder->diag, "Clean aperture property mismatch between input items of tone-mapping derived image item");
 5988|      0|        return AVIF_RESULT_DECODE_GAIN_MAP_FAILED;
 5989|      0|    }
 5990|    284|    const avifProperty * irotProp = avifPropertyArrayFind(gainMapProperties, "irot");
 5991|    284|    if (!irotProp != !(image->transformFlags & AVIF_TRANSFORM_IROT) || (irotProp && irotProp->u.irot.angle != image->irot.angle)) {
  ------------------
  |  Branch (5991:9): [True: 0, False: 284]
  |  Branch (5991:73): [True: 0, False: 284]
  |  Branch (5991:85): [True: 0, False: 0]
  ------------------
 5992|      0|        avifDiagnosticsPrintf(&decoder->diag, "Rotation property mismatch between input items of tone-mapping derived image item");
 5993|      0|        return AVIF_RESULT_DECODE_GAIN_MAP_FAILED;
 5994|      0|    }
 5995|    284|    const avifProperty * imirProp = avifPropertyArrayFind(gainMapProperties, "imir");
 5996|    284|    if (!imirProp != !(image->transformFlags & AVIF_TRANSFORM_IMIR) || (imirProp && imirProp->u.imir.axis != image->imir.axis)) {
  ------------------
  |  Branch (5996:9): [True: 0, False: 284]
  |  Branch (5996:73): [True: 0, False: 284]
  |  Branch (5996:85): [True: 0, False: 0]
  ------------------
 5997|      0|        avifDiagnosticsPrintf(&decoder->diag, "Mirroring property mismatch between input items of tone-mapping derived image item");
 5998|      0|        return AVIF_RESULT_DECODE_GAIN_MAP_FAILED;
 5999|      0|    }
 6000|    284|    return AVIF_RESULT_OK;
 6001|    284|}
read.c:avifDecoderPrepareSample:
 5219|  88.0k|{
 5220|  88.0k|    if (!sample->data.size || sample->partialData) {
  ------------------
  |  Branch (5220:9): [True: 53.3k, False: 34.7k]
  |  Branch (5220:31): [True: 19.1k, False: 15.6k]
  ------------------
 5221|       |        // This sample hasn't been read from IO or had its extents fully merged yet.
 5222|       |
 5223|  72.4k|        size_t bytesToRead = sample->size;
 5224|  72.4k|        if (partialByteCount && (bytesToRead > partialByteCount)) {
  ------------------
  |  Branch (5224:13): [True: 28.5k, False: 43.8k]
  |  Branch (5224:33): [True: 23.0k, False: 5.48k]
  ------------------
 5225|  23.0k|            bytesToRead = partialByteCount;
 5226|  23.0k|        }
 5227|       |
 5228|  72.4k|        if (sample->itemID) {
  ------------------
  |  Branch (5228:13): [True: 61.8k, False: 10.6k]
  ------------------
 5229|       |            // The data comes from an item. Let avifDecoderItemRead() do the heavy lifting.
 5230|       |
 5231|  61.8k|            avifDecoderItem * item;
 5232|  61.8k|            AVIF_CHECKRES(avifMetaFindOrCreateItem(decoder->data->meta, sample->itemID, &item));
  ------------------
  |  |   54|  61.8k|    do {                                  \
  |  |   55|  61.8k|        const avifResult result__ = (A);  \
  |  |   56|  61.8k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 61.8k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  61.8k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 61.8k]
  |  |  ------------------
  ------------------
 5233|  61.8k|            avifROData itemContents;
 5234|       |#if UINT64_MAX > SIZE_MAX
 5235|       |            if (sample->offset > SIZE_MAX) {
 5236|       |                return AVIF_RESULT_BMFF_PARSE_FAILED;
 5237|       |            }
 5238|       |#endif
 5239|  61.8k|            size_t offset = (size_t)sample->offset;
 5240|  61.8k|            avifResult readResult = avifDecoderItemRead(item, decoder->io, &itemContents, offset, bytesToRead, &decoder->diag);
 5241|  61.8k|            if (readResult != AVIF_RESULT_OK) {
  ------------------
  |  Branch (5241:17): [True: 710, False: 61.1k]
  ------------------
 5242|    710|                return readResult;
 5243|    710|            }
 5244|       |
 5245|       |            // avifDecoderItemRead is guaranteed to already be persisted by either the underlying IO
 5246|       |            // or by mergedExtents; just reuse the buffer here.
 5247|  61.1k|            sample->data = itemContents;
 5248|  61.1k|            sample->ownsData = AVIF_FALSE;
  ------------------
  |  |   89|  61.1k|#define AVIF_FALSE 0
  ------------------
 5249|  61.1k|            sample->partialData = item->partialMergedExtents;
 5250|  61.1k|        } else {
 5251|       |            // The data likely comes from a sample table. Pull the sample and make a copy if necessary.
 5252|       |
 5253|  10.6k|            avifROData sampleContents;
 5254|  10.6k|            if ((decoder->io->sizeHint > 0) && (sample->offset > decoder->io->sizeHint)) {
  ------------------
  |  Branch (5254:17): [True: 10.6k, False: 0]
  |  Branch (5254:48): [True: 0, False: 10.6k]
  ------------------
 5255|      0|                return AVIF_RESULT_BMFF_PARSE_FAILED;
 5256|      0|            }
 5257|  10.6k|            avifResult readResult = decoder->io->read(decoder->io, 0, sample->offset, bytesToRead, &sampleContents);
 5258|  10.6k|            if (readResult != AVIF_RESULT_OK) {
  ------------------
  |  Branch (5258:17): [True: 0, False: 10.6k]
  ------------------
 5259|      0|                return readResult;
 5260|      0|            }
 5261|  10.6k|            if (sampleContents.size != bytesToRead) {
  ------------------
  |  Branch (5261:17): [True: 0, False: 10.6k]
  ------------------
 5262|      0|                return AVIF_RESULT_TRUNCATED_DATA;
 5263|      0|            }
 5264|       |
 5265|  10.6k|            sample->ownsData = !decoder->io->persistent;
 5266|  10.6k|            sample->partialData = (bytesToRead != sample->size);
 5267|  10.6k|            if (decoder->io->persistent) {
  ------------------
  |  Branch (5267:17): [True: 5.46k, False: 5.15k]
  ------------------
 5268|  5.46k|                sample->data = sampleContents;
 5269|  5.46k|            } else {
 5270|  5.15k|                AVIF_CHECKRES(avifRWDataSet((avifRWData *)&sample->data, sampleContents.data, sampleContents.size));
  ------------------
  |  |   54|  5.15k|    do {                                  \
  |  |   55|  5.15k|        const avifResult result__ = (A);  \
  |  |   56|  5.15k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 5.15k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  5.15k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 5.15k]
  |  |  ------------------
  ------------------
 5271|  5.15k|            }
 5272|  10.6k|        }
 5273|  72.4k|    }
 5274|  87.3k|    return AVIF_RESULT_OK;
 5275|  88.0k|}
read.c:avifDecoderDataFrameFullyDecoded:
 6860|  54.1k|{
 6861|   216k|    for (int c = 0; c < AVIF_ITEM_CATEGORY_COUNT; ++c) {
  ------------------
  |  Branch (6861:21): [True: 196k, False: 20.2k]
  ------------------
 6862|   196k|        if (data->tileInfos[c].decodedTileCount != data->tileInfos[c].tileCount) {
  ------------------
  |  Branch (6862:13): [True: 33.8k, False: 162k]
  ------------------
 6863|  33.8k|            return AVIF_FALSE;
  ------------------
  |  |   89|  33.8k|#define AVIF_FALSE 0
  ------------------
 6864|  33.8k|        }
 6865|   196k|    }
 6866|  20.2k|    return AVIF_TRUE;
  ------------------
  |  |   88|  20.2k|#define AVIF_TRUE 1
  ------------------
 6867|  54.1k|}
read.c:avifDecoderCreateCodecs:
 5425|  33.8k|{
 5426|  33.8k|    avifDecoderData * data = decoder->data;
 5427|  33.8k|    avifDecoderDataResetCodec(data);
 5428|       |
 5429|  33.8k|    if (data->source == AVIF_DECODER_SOURCE_TRACKS) {
  ------------------
  |  Branch (5429:9): [True: 3.59k, False: 30.2k]
  ------------------
 5430|       |        // In this case, we will use at most two codec instances (one for the color planes and one for the alpha plane).
 5431|       |        // Gain maps are not supported.
 5432|  3.59k|        AVIF_CHECKRES(avifCodecCreateInternal(decoder->codecChoice, &decoder->data->tiles.tile[0], &decoder->diag, &data->codec));
  ------------------
  |  |   54|  3.59k|    do {                                  \
  |  |   55|  3.59k|        const avifResult result__ = (A);  \
  |  |   56|  3.59k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 3.59k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  3.59k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 3.59k]
  |  |  ------------------
  ------------------
 5433|  3.59k|        data->tiles.tile[0].codec = data->codec;
 5434|  3.59k|        if (data->tiles.count > 1) {
  ------------------
  |  Branch (5434:13): [True: 466, False: 3.12k]
  ------------------
 5435|    466|            AVIF_CHECKRES(avifCodecCreateInternal(decoder->codecChoice, &decoder->data->tiles.tile[1], &decoder->diag, &data->codecAlpha));
  ------------------
  |  |   54|    466|    do {                                  \
  |  |   55|    466|        const avifResult result__ = (A);  \
  |  |   56|    466|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 466]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|    466|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 466]
  |  |  ------------------
  ------------------
 5436|    466|            data->tiles.tile[1].codec = data->codecAlpha;
 5437|    466|        }
 5438|  30.2k|    } else {
 5439|       |        // In this case, we will use one codec instance when there is only one tile or when all of the following conditions are
 5440|       |        // met:
 5441|       |        //   - The image must have exactly one layer (i.e. decoder->imageCount == 1).
 5442|       |        //   - All the tiles must have the same operating point (because the codecs take operating point once at initialization
 5443|       |        //     and do not allow it to be changed later).
 5444|       |        //   - All the tiles must have the same value for allLayers (because the codecs take allLayers once at initialization
 5445|       |        //     and do not allow it to be changed later).
 5446|       |        //   - If the image has a single tile, it must not have a single tile alpha plane (in this case we will steal the planes
 5447|       |        //     from the decoder, so we cannot use the same decoder for both the color and the alpha planes).
 5448|       |        //   - All tiles have the same type (AV1 or AV2).
 5449|       |        //   - No tile buffer access after another tile was decoded (i.e. no Sample Transform compositing because it happens
 5450|       |        //     after decoding all tiles).
 5451|       |        // Otherwise, we will use |tiles.count| decoder instances (one instance for each tile).
 5452|  30.2k|        const avifBool canUseSingleCodecInstance =
 5453|  30.2k|            ((data->tiles.count == 1) || (decoder->imageCount == 1 && avifTilesCanBeDecodedWithSameCodecInstance(data))) &&
  ------------------
  |  Branch (5453:14): [True: 23.5k, False: 6.69k]
  |  Branch (5453:43): [True: 4.85k, False: 1.84k]
  |  Branch (5453:71): [True: 1.77k, False: 3.08k]
  ------------------
 5454|  25.3k|            data->sampleTransformNumInputImageItems == 0;
  ------------------
  |  Branch (5454:13): [True: 25.3k, False: 0]
  ------------------
 5455|  30.2k|        if (canUseSingleCodecInstance) {
  ------------------
  |  Branch (5455:13): [True: 25.3k, False: 4.92k]
  ------------------
 5456|  25.3k|            AVIF_CHECKRES(avifCodecCreateInternal(decoder->codecChoice, &decoder->data->tiles.tile[0], &decoder->diag, &data->codec));
  ------------------
  |  |   54|  25.3k|    do {                                  \
  |  |   55|  25.3k|        const avifResult result__ = (A);  \
  |  |   56|  25.3k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 25.3k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  25.3k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 25.3k]
  |  |  ------------------
  ------------------
 5457|  55.5k|            for (unsigned int i = 0; i < decoder->data->tiles.count; ++i) {
  ------------------
  |  Branch (5457:38): [True: 30.2k, False: 25.3k]
  ------------------
 5458|  30.2k|                decoder->data->tiles.tile[i].codec = data->codec;
 5459|  30.2k|            }
 5460|  25.3k|        } else {
 5461|  15.0k|            for (unsigned int i = 0; i < decoder->data->tiles.count; ++i) {
  ------------------
  |  Branch (5461:38): [True: 10.0k, False: 4.92k]
  ------------------
 5462|  10.0k|                avifTile * tile = &decoder->data->tiles.tile[i];
 5463|  10.0k|                AVIF_CHECKRES(avifCodecCreateInternal(decoder->codecChoice, tile, &decoder->diag, &tile->codec));
  ------------------
  |  |   54|  10.0k|    do {                                  \
  |  |   55|  10.0k|        const avifResult result__ = (A);  \
  |  |   56|  10.0k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 10.0k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  10.0k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 10.0k]
  |  |  ------------------
  ------------------
 5464|  10.0k|            }
 5465|  4.92k|        }
 5466|  30.2k|    }
 5467|  33.8k|    return AVIF_RESULT_OK;
 5468|  33.8k|}
read.c:avifCodecCreateInternal:
 5359|  39.5k|{
 5360|       |#if defined(AVIF_CODEC_AVM)
 5361|       |    // AVIF_CODEC_CHOICE_AUTO leads to AVIF_CODEC_TYPE_AV1 by default. Reroute correctly.
 5362|       |    if (choice == AVIF_CODEC_CHOICE_AUTO && tile->codecType == AVIF_CODEC_TYPE_AV2) {
 5363|       |        choice = AVIF_CODEC_CHOICE_AVM;
 5364|       |    }
 5365|       |#endif
 5366|       |
 5367|  39.5k|    const avifCodecType codecTypeFromChoice = avifCodecTypeFromChoice(choice, AVIF_CODEC_FLAG_CAN_DECODE);
 5368|  39.5k|    if (codecTypeFromChoice == AVIF_CODEC_TYPE_UNKNOWN) {
  ------------------
  |  Branch (5368:9): [True: 0, False: 39.5k]
  ------------------
 5369|      0|        avifDiagnosticsPrintf(diag,
 5370|      0|                              "Tile type is %s but there is no compatible codec available to decode it",
 5371|      0|                              avifGetConfigurationPropertyName(tile->codecType));
 5372|      0|        return AVIF_RESULT_NO_CODEC_AVAILABLE;
 5373|  39.5k|    } else if (choice != AVIF_CODEC_CHOICE_AUTO && codecTypeFromChoice != tile->codecType) {
  ------------------
  |  Branch (5373:16): [True: 30.3k, False: 9.20k]
  |  Branch (5373:52): [True: 0, False: 30.3k]
  ------------------
 5374|      0|        avifDiagnosticsPrintf(diag,
 5375|      0|                              "Tile type is %s but incompatible %s codec was explicitly set as decoding implementation",
 5376|      0|                              avifGetConfigurationPropertyName(tile->codecType),
 5377|      0|                              avifCodecName(choice, AVIF_CODEC_FLAG_CAN_DECODE));
 5378|      0|        return AVIF_RESULT_DECODE_COLOR_FAILED;
 5379|      0|    }
 5380|       |
 5381|  39.5k|    AVIF_CHECKRES(avifCodecCreate(choice, AVIF_CODEC_FLAG_CAN_DECODE, codec));
  ------------------
  |  |   54|  39.5k|    do {                                  \
  |  |   55|  39.5k|        const avifResult result__ = (A);  \
  |  |   56|  39.5k|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 0, False: 39.5k]
  |  |  ------------------
  |  |   57|      0|            avifBreakOnError();           \
  |  |   58|      0|            return result__;              \
  |  |   59|      0|        }                                 \
  |  |   60|  39.5k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 39.5k]
  |  |  ------------------
  ------------------
 5382|  39.5k|    AVIF_CHECKERR(*codec, AVIF_RESULT_OUT_OF_MEMORY);
  ------------------
  |  |   45|  39.5k|    do {                        \
  |  |   46|  39.5k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (46:13): [True: 0, False: 39.5k]
  |  |  ------------------
  |  |   47|      0|            avifBreakOnError(); \
  |  |   48|      0|            return ERR;         \
  |  |   49|      0|        }                       \
  |  |   50|  39.5k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (50:14): [Folded, False: 39.5k]
  |  |  ------------------
  ------------------
 5383|  39.5k|    (*codec)->diag = diag;
 5384|  39.5k|    (*codec)->operatingPoint = tile->operatingPoint;
 5385|  39.5k|    (*codec)->allLayers = tile->input->allLayers;
 5386|  39.5k|    return AVIF_RESULT_OK;
 5387|  39.5k|}
read.c:avifTilesCanBeDecodedWithSameCodecInstance:
 5390|  4.85k|{
 5391|  4.85k|    int32_t numImageBuffers = 0, numStolenImageBuffers = 0;
 5392|  43.7k|    for (int c = 0; c < AVIF_ITEM_CATEGORY_COUNT; ++c) {
  ------------------
  |  Branch (5392:21): [True: 38.8k, False: 4.85k]
  ------------------
 5393|  38.8k|        if (data->tileInfos[c].tileCount > 0) {
  ------------------
  |  Branch (5393:13): [True: 9.36k, False: 29.4k]
  ------------------
 5394|  9.36k|            ++numImageBuffers;
 5395|  9.36k|        }
 5396|       |        // The sample operations require multiple buffers for compositing so no plane is stolen
 5397|       |        // when there is a 'sato' Sample Transform derived image item.
 5398|  38.8k|        if (c >= AVIF_SAMPLE_TRANSFORM_MIN_CATEGORY && c <= AVIF_SAMPLE_TRANSFORM_MAX_CATEGORY && data->tileInfos[c].tileCount > 0) {
  ------------------
  |  |  428|  77.7k|#define AVIF_SAMPLE_TRANSFORM_MIN_CATEGORY AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_COLOR
  ------------------
                      if (c >= AVIF_SAMPLE_TRANSFORM_MIN_CATEGORY && c <= AVIF_SAMPLE_TRANSFORM_MAX_CATEGORY && data->tileInfos[c].tileCount > 0) {
  ------------------
  |  |  430|  58.2k|    (AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_ALPHA + AVIF_SAMPLE_TRANSFORM_MAX_NUM_EXTRA_INPUT_IMAGE_ITEMS - 1)
  |  |  ------------------
  |  |  |  |  424|  19.4k|    (AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_ALPHA - AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_COLOR)
  |  |  ------------------
  ------------------
  |  Branch (5398:13): [True: 19.4k, False: 19.4k]
  |  Branch (5398:56): [True: 19.4k, False: 0]
  |  Branch (5398:99): [True: 0, False: 19.4k]
  ------------------
 5399|      0|            continue;
 5400|      0|        }
 5401|  38.8k|        if (data->tileInfos[c].tileCount == 1) {
  ------------------
  |  Branch (5401:13): [True: 6.17k, False: 32.6k]
  ------------------
 5402|  6.17k|            ++numStolenImageBuffers;
 5403|  6.17k|        }
 5404|  38.8k|    }
 5405|  4.85k|    if (numStolenImageBuffers > 0 && numImageBuffers > 1) {
  ------------------
  |  Branch (5405:9): [True: 3.08k, False: 1.77k]
  |  Branch (5405:38): [True: 3.08k, False: 0]
  ------------------
 5406|       |        // Single tile image with single tile alpha plane or gain map. In this case each tile needs its own decoder since the planes will be
 5407|       |        // "stolen". Stealing either the color or the alpha plane (or gain map) will invalidate the other ones when decode is called the second
 5408|       |        // (or third) time.
 5409|  3.08k|        return AVIF_FALSE;
  ------------------
  |  |   89|  3.08k|#define AVIF_FALSE 0
  ------------------
 5410|  3.08k|    }
 5411|  1.77k|    const uint8_t firstTileOperatingPoint = data->tiles.tile[0].operatingPoint;
 5412|  1.77k|    const avifBool firstTileAllLayers = data->tiles.tile[0].input->allLayers;
 5413|  6.61k|    for (unsigned int i = 1; i < data->tiles.count; ++i) {
  ------------------
  |  Branch (5413:30): [True: 4.84k, False: 1.77k]
  ------------------
 5414|  4.84k|        const avifTile * tile = &data->tiles.tile[i];
 5415|  4.84k|        if (tile->operatingPoint != firstTileOperatingPoint || tile->input->allLayers != firstTileAllLayers) {
  ------------------
  |  Branch (5415:13): [True: 0, False: 4.84k]
  |  Branch (5415:64): [True: 0, False: 4.84k]
  ------------------
 5416|      0|            return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
 5417|      0|        }
 5418|       |        // avifDecoderItemValidateProperties() verified during avifDecoderParse() that all tiles
 5419|       |        // share the same coding format so no need to check for codecType equality here.
 5420|  4.84k|    }
 5421|  1.77k|    return AVIF_TRUE;
  ------------------
  |  |   88|  1.77k|#define AVIF_TRUE 1
  ------------------
 5422|  1.77k|}
read.c:avifDecoderPrepareTiles:
 6659|   327k|{
 6660|   380k|    for (unsigned int tileIndex = info->decodedTileCount; tileIndex < info->tileCount; ++tileIndex) {
  ------------------
  |  Branch (6660:59): [True: 56.1k, False: 324k]
  ------------------
 6661|  56.1k|        avifTile * tile = &decoder->data->tiles.tile[info->firstTileIndex + tileIndex];
 6662|       |
 6663|  56.1k|        if (nextImageIndex >= tile->input->samples.count) {
  ------------------
  |  Branch (6663:13): [True: 3.10k, False: 53.0k]
  ------------------
 6664|  3.10k|            return AVIF_RESULT_NO_IMAGES_REMAINING;
 6665|  3.10k|        }
 6666|       |
 6667|  53.0k|        avifDecodeSample * sample = &tile->input->samples.sample[nextImageIndex];
 6668|  53.0k|        avifResult prepareResult = avifDecoderPrepareSample(decoder, sample, 0);
 6669|  53.0k|        if (prepareResult != AVIF_RESULT_OK) {
  ------------------
  |  Branch (6669:13): [True: 467, False: 52.6k]
  ------------------
 6670|    467|            return prepareResult;
 6671|    467|        }
 6672|  53.0k|    }
 6673|   324k|    return AVIF_RESULT_OK;
 6674|   327k|}
read.c:avifDecoderDecodeTiles:
 6730|   112k|{
 6731|   112k|    const unsigned int oldDecodedTileCount = info->decodedTileCount;
 6732|   126k|    for (unsigned int tileIndex = oldDecodedTileCount; tileIndex < info->tileCount; ++tileIndex) {
  ------------------
  |  Branch (6732:56): [True: 44.3k, False: 82.1k]
  ------------------
 6733|  44.3k|        avifTile * tile = &decoder->data->tiles.tile[info->firstTileIndex + tileIndex];
 6734|       |
 6735|  44.3k|        const avifDecodeSample * sample = &tile->input->samples.sample[nextImageIndex];
 6736|  44.3k|        if (sample->data.size < sample->size) {
  ------------------
  |  Branch (6736:13): [True: 0, False: 44.3k]
  ------------------
 6737|      0|            AVIF_ASSERT_OR_RETURN(decoder->allowIncremental);
  ------------------
  |  |   64|      0|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|      0|    do {                        \
  |  |  |  |   46|      0|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|      0|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6738|       |            // Data is missing but there is no error yet. Output available pixel rows.
 6739|      0|            return AVIF_RESULT_OK;
 6740|      0|        }
 6741|       |
 6742|  44.3k|        avifBool isLimitedRangeAlpha = AVIF_FALSE;
  ------------------
  |  |   89|  44.3k|#define AVIF_FALSE 0
  ------------------
 6743|  44.3k|        tile->codec->maxThreads = decoder->maxThreads;
 6744|  44.3k|        tile->codec->imageSizeLimit = decoder->imageSizeLimit;
 6745|  44.3k|        tile->codec->imageDimensionLimit = decoder->imageDimensionLimit;
 6746|  44.3k|        if (!tile->codec->getNextImage(tile->codec, sample, avifIsAlpha(tile->input->itemCategory), &isLimitedRangeAlpha, tile->image)) {
  ------------------
  |  Branch (6746:13): [True: 30.0k, False: 14.3k]
  ------------------
 6747|  30.0k|            avifDiagnosticsPrintf(&decoder->diag, "tile->codec->getNextImage() failed");
 6748|  30.0k|            return avifGetErrorForItemCategory(tile->input->itemCategory);
 6749|  30.0k|        }
 6750|       |
 6751|       |        // Section 2.3.4 of AV1 Codec ISO Media File Format Binding v1.2.0 says:
 6752|       |        //   the full_range_flag in the colr box shall match the color_range
 6753|       |        //   flag in the Sequence Header OBU.
 6754|       |        // See https://aomediacodec.github.io/av1-isobmff/v1.2.0.html#av1codecconfigurationbox-semantics.
 6755|       |        // If a 'colr' box of colour_type 'nclx' was parsed, a mismatch between
 6756|       |        // the 'colr' decoder->image->yuvRange and the AV1 OBU
 6757|       |        // tile->image->yuvRange should be treated as an error.
 6758|       |        // However codec_svt.c was not encoding the color_range field for
 6759|       |        // multiple years, so there probably are files in the wild that will
 6760|       |        // fail decoding if this is enforced. Thus this pattern is allowed.
 6761|       |        // Section 12.1.5.1 of ISO 14496-12 (ISOBMFF) says:
 6762|       |        //   If colour information is supplied in both this [colr] box, and also
 6763|       |        //   in the video bitstream, this box takes precedence, and over-rides
 6764|       |        //   the information in the bitstream.
 6765|       |        // So decoder->image->yuvRange is kept because it was either the 'colr'
 6766|       |        // value set when the 'colr' box was parsed, or it was the AV1 OBU value
 6767|       |        // extracted from the sequence header OBU of the first tile of the first
 6768|       |        // frame (if no 'colr' box of colour_type 'nclx' was found).
 6769|       |
 6770|       |        // Alpha plane with limited range is not allowed by the latest revision
 6771|       |        // of the specification. However, it was allowed in version 1.0.0 of the
 6772|       |        // specification. To allow such files, simply convert the alpha plane to
 6773|       |        // full range.
 6774|  14.3k|        if (avifIsAlpha(tile->input->itemCategory) && isLimitedRangeAlpha) {
  ------------------
  |  Branch (6774:13): [True: 2.24k, False: 12.0k]
  |  Branch (6774:55): [True: 234, False: 2.01k]
  ------------------
 6775|    234|            avifResult result = avifImageLimitedToFullAlpha(tile->image);
 6776|    234|            if (result != AVIF_RESULT_OK) {
  ------------------
  |  Branch (6776:17): [True: 0, False: 234]
  ------------------
 6777|      0|                avifDiagnosticsPrintf(&decoder->diag, "avifImageLimitedToFullAlpha failed");
 6778|      0|                return result;
 6779|      0|            }
 6780|    234|        }
 6781|       |
 6782|       |        // Scale the decoded image so that it corresponds to this tile's output dimensions
 6783|  14.3k|        if ((tile->width != tile->image->width) || (tile->height != tile->image->height)) {
  ------------------
  |  Branch (6783:13): [True: 3.95k, False: 10.3k]
  |  Branch (6783:52): [True: 825, False: 9.55k]
  ------------------
 6784|  4.77k|            if (avifImageScaleWithLimit(tile->image,
  ------------------
  |  Branch (6784:17): [True: 109, False: 4.66k]
  ------------------
 6785|  4.77k|                                        tile->width,
 6786|  4.77k|                                        tile->height,
 6787|  4.77k|                                        decoder->imageSizeLimit,
 6788|  4.77k|                                        decoder->imageDimensionLimit,
 6789|  4.77k|                                        &decoder->diag) != AVIF_RESULT_OK) {
 6790|    109|                return avifGetErrorForItemCategory(tile->input->itemCategory);
 6791|    109|            }
 6792|  4.77k|        }
 6793|       |
 6794|  14.2k|        ++info->decodedTileCount;
 6795|       |
 6796|  14.2k|        const avifBool isGrid = (info->grid.rows > 0) && (info->grid.columns > 0);
  ------------------
  |  Branch (6796:33): [True: 1.03k, False: 13.1k]
  |  Branch (6796:58): [True: 1.03k, False: 0]
  ------------------
 6797|  14.2k|        avifBool stealPlanes = !isGrid;
 6798|  14.2k|        if (decoder->data->meta->sampleTransformExpression.count > 0) {
  ------------------
  |  Branch (6798:13): [True: 0, False: 14.2k]
  ------------------
 6799|       |            // Keep everything as a copy for now.
 6800|      0|            stealPlanes = AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
 6801|      0|        }
 6802|  14.2k|        if (tile->input->itemCategory >= AVIF_SAMPLE_TRANSFORM_MIN_CATEGORY &&
  ------------------
  |  |  428|  28.4k|#define AVIF_SAMPLE_TRANSFORM_MIN_CATEGORY AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_COLOR
  ------------------
  |  Branch (6802:13): [True: 0, False: 14.2k]
  ------------------
 6803|      0|            tile->input->itemCategory <= AVIF_SAMPLE_TRANSFORM_MAX_CATEGORY) {
  ------------------
  |  |  430|      0|    (AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_ALPHA + AVIF_SAMPLE_TRANSFORM_MAX_NUM_EXTRA_INPUT_IMAGE_ITEMS - 1)
  |  |  ------------------
  |  |  |  |  424|      0|    (AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_ALPHA - AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_COLOR)
  |  |  ------------------
  ------------------
  |  Branch (6803:13): [True: 0, False: 0]
  ------------------
 6804|       |            // Keep Sample Transform input image item samples in tiles.
 6805|       |            // The expression will be applied in avifDecoderNextImage() below instead, once all the tiles are available.
 6806|      0|            continue;
 6807|      0|        }
 6808|       |
 6809|  14.2k|        if (!stealPlanes) {
  ------------------
  |  Branch (6809:13): [True: 1.03k, False: 13.1k]
  ------------------
 6810|  1.03k|            avifImage * dstImage = decoder->image;
 6811|  1.03k|            if (tile->input->itemCategory == AVIF_ITEM_GAIN_MAP) {
  ------------------
  |  Branch (6811:17): [True: 292, False: 745]
  ------------------
 6812|    292|                AVIF_ASSERT_OR_RETURN(dstImage->gainMap && dstImage->gainMap->image);
  ------------------
  |  |   64|    292|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|    292|    do {                        \
  |  |  |  |   46|    584|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:15): [True: 292, False: 0]
  |  |  |  |  |  Branch (46:15): [True: 292, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|    292|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 292]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6813|    292|                dstImage = dstImage->gainMap->image;
 6814|    292|            }
 6815|  1.03k|            if (tileIndex == 0) {
  ------------------
  |  Branch (6815:17): [True: 480, False: 557]
  ------------------
 6816|    480|                AVIF_CHECKRES(avifDecoderDataAllocateImagePlanes(decoder->data, info, dstImage, &decoder->data->cicpSet));
  ------------------
  |  |   54|    480|    do {                                  \
  |  |   55|    480|        const avifResult result__ = (A);  \
  |  |   56|    480|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 84, False: 396]
  |  |  ------------------
  |  |   57|     84|            avifBreakOnError();           \
  |  |   58|     84|            return result__;              \
  |  |   59|     84|        }                                 \
  |  |   60|    480|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 396]
  |  |  ------------------
  ------------------
 6817|    480|            }
 6818|    953|            AVIF_CHECKRES(avifDecoderDataCopyTileToImage(decoder->data, info, dstImage, tile, tileIndex));
  ------------------
  |  |   54|    953|    do {                                  \
  |  |   55|    953|        const avifResult result__ = (A);  \
  |  |   56|    953|        if (result__ != AVIF_RESULT_OK) { \
  |  |  ------------------
  |  |  |  Branch (56:13): [True: 24, False: 929]
  |  |  ------------------
  |  |   57|     24|            avifBreakOnError();           \
  |  |   58|     24|            return result__;              \
  |  |   59|     24|        }                                 \
  |  |   60|    953|    } while (0)
  |  |  ------------------
  |  |  |  Branch (60:14): [Folded, False: 929]
  |  |  ------------------
  ------------------
 6819|  13.1k|        } else {
 6820|  13.1k|            AVIF_ASSERT_OR_RETURN(info->tileCount == 1);
  ------------------
  |  |   64|  13.1k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  13.1k|    do {                        \
  |  |  |  |   46|  13.1k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 13.1k]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  13.1k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 13.1k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6821|  13.1k|            AVIF_ASSERT_OR_RETURN(tileIndex == 0);
  ------------------
  |  |   64|  13.1k|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|  13.1k|    do {                        \
  |  |  |  |   46|  13.1k|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 13.1k]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|  13.1k|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 13.1k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6822|  13.1k|            avifImage * src = tile->image;
 6823|       |
 6824|  13.1k|            if (tile->input->itemCategory == AVIF_ITEM_GAIN_MAP) {
  ------------------
  |  Branch (6824:17): [True: 42, False: 13.1k]
  ------------------
 6825|     42|                AVIF_ASSERT_OR_RETURN(decoder->image->gainMap && decoder->image->gainMap->image);
  ------------------
  |  |   64|     42|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|     42|    do {                        \
  |  |  |  |   46|     84|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:15): [True: 42, False: 0]
  |  |  |  |  |  Branch (46:15): [True: 42, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|     42|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 42]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6826|     42|                decoder->image->gainMap->image->width = src->width;
 6827|     42|                decoder->image->gainMap->image->height = src->height;
 6828|     42|                decoder->image->gainMap->image->depth = src->depth;
 6829|  13.1k|            } else {
 6830|  13.1k|                if ((decoder->image->width != src->width) || (decoder->image->height != src->height) ||
  ------------------
  |  Branch (6830:21): [True: 2, False: 13.1k]
  |  Branch (6830:62): [True: 2, False: 13.1k]
  ------------------
 6831|  13.1k|                    (decoder->image->depth != src->depth)) {
  ------------------
  |  Branch (6831:21): [True: 2.43k, False: 10.7k]
  ------------------
 6832|  2.43k|                    if (avifIsAlpha(tile->input->itemCategory)) {
  ------------------
  |  Branch (6832:25): [True: 56, False: 2.38k]
  ------------------
 6833|     56|                        avifDiagnosticsPrintf(&decoder->diag,
 6834|     56|                                              "The color image item does not match the alpha image item in width, height, or bit depth");
 6835|     56|                        return AVIF_RESULT_DECODE_ALPHA_FAILED;
 6836|     56|                    }
 6837|  2.38k|                    avifImageFreePlanes(decoder->image, AVIF_PLANES_ALL);
 6838|       |
 6839|  2.38k|                    decoder->image->width = src->width;
 6840|  2.38k|                    decoder->image->height = src->height;
 6841|  2.38k|                    decoder->image->depth = src->depth;
 6842|  2.38k|                }
 6843|  13.1k|            }
 6844|       |
 6845|  13.1k|            if (avifIsAlpha(tile->input->itemCategory)) {
  ------------------
  |  Branch (6845:17): [True: 2.15k, False: 10.9k]
  ------------------
 6846|  2.15k|                avifImageStealPlanes(decoder->image, src, AVIF_PLANES_A);
 6847|  10.9k|            } else if (tile->input->itemCategory == AVIF_ITEM_GAIN_MAP) {
  ------------------
  |  Branch (6847:24): [True: 42, False: 10.9k]
  ------------------
 6848|     42|                AVIF_ASSERT_OR_RETURN(decoder->image->gainMap && decoder->image->gainMap->image);
  ------------------
  |  |   64|     42|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|     42|    do {                        \
  |  |  |  |   46|     84|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:15): [True: 42, False: 0]
  |  |  |  |  |  Branch (46:15): [True: 42, False: 0]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|     42|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 42]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 6849|     42|                avifImageStealPlanes(decoder->image->gainMap->image, src, AVIF_PLANES_YUV);
 6850|  10.9k|            } else { // AVIF_ITEM_COLOR
 6851|  10.9k|                avifImageStealPlanes(decoder->image, src, AVIF_PLANES_YUV);
 6852|  10.9k|            }
 6853|  13.1k|        }
 6854|  14.2k|    }
 6855|  82.1k|    return AVIF_RESULT_OK;
 6856|   112k|}
read.c:avifGetErrorForItemCategory:
 6719|  30.1k|{
 6720|  30.1k|    if (itemCategory == AVIF_ITEM_GAIN_MAP) {
  ------------------
  |  Branch (6720:9): [True: 90, False: 30.0k]
  ------------------
 6721|     90|        return AVIF_RESULT_DECODE_GAIN_MAP_FAILED;
 6722|     90|    }
 6723|  30.0k|    if (itemCategory >= AVIF_SAMPLE_TRANSFORM_MIN_CATEGORY && itemCategory <= AVIF_SAMPLE_TRANSFORM_MAX_CATEGORY) {
  ------------------
  |  |  428|  60.1k|#define AVIF_SAMPLE_TRANSFORM_MIN_CATEGORY AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_COLOR
  ------------------
                  if (itemCategory >= AVIF_SAMPLE_TRANSFORM_MIN_CATEGORY && itemCategory <= AVIF_SAMPLE_TRANSFORM_MAX_CATEGORY) {
  ------------------
  |  |  430|      0|    (AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_ALPHA + AVIF_SAMPLE_TRANSFORM_MAX_NUM_EXTRA_INPUT_IMAGE_ITEMS - 1)
  |  |  ------------------
  |  |  |  |  424|      0|    (AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_ALPHA - AVIF_ITEM_SAMPLE_TRANSFORM_INPUT_0_COLOR)
  |  |  ------------------
  ------------------
  |  Branch (6723:9): [True: 0, False: 30.0k]
  |  Branch (6723:63): [True: 0, False: 0]
  ------------------
 6724|      0|        return AVIF_RESULT_DECODE_SAMPLE_TRANSFORM_FAILED;
 6725|      0|    }
 6726|  30.0k|    return avifIsAlpha(itemCategory) ? AVIF_RESULT_DECODE_ALPHA_FAILED : AVIF_RESULT_DECODE_COLOR_FAILED;
  ------------------
  |  Branch (6726:12): [True: 897, False: 29.1k]
  ------------------
 6727|  30.0k|}
read.c:avifImageLimitedToFullAlpha:
 6677|    234|{
 6678|    234|    if (image->imageOwnsAlphaPlane) {
  ------------------
  |  Branch (6678:9): [True: 0, False: 234]
  ------------------
 6679|      0|        return AVIF_RESULT_NOT_IMPLEMENTED;
 6680|      0|    }
 6681|       |
 6682|    234|    const uint8_t * alphaPlane = image->alphaPlane;
 6683|    234|    const uint32_t alphaRowBytes = image->alphaRowBytes;
 6684|       |
 6685|       |    // We cannot do the range conversion in place since it will modify the
 6686|       |    // codec's internal frame buffers. Allocate memory for the conversion.
 6687|    234|    image->alphaPlane = NULL;
 6688|    234|    image->alphaRowBytes = 0;
 6689|    234|    const avifResult allocationResult = avifImageAllocatePlanes(image, AVIF_PLANES_A);
 6690|    234|    if (allocationResult != AVIF_RESULT_OK) {
  ------------------
  |  Branch (6690:9): [True: 0, False: 234]
  ------------------
 6691|      0|        return allocationResult;
 6692|      0|    }
 6693|       |
 6694|    234|    if (image->depth > 8) {
  ------------------
  |  Branch (6694:9): [True: 67, False: 167]
  ------------------
 6695|  3.38k|        for (uint32_t j = 0; j < image->height; ++j) {
  ------------------
  |  Branch (6695:30): [True: 3.32k, False: 67]
  ------------------
 6696|  3.32k|            const uint8_t * srcRow = &alphaPlane[(size_t)j * alphaRowBytes];
 6697|  3.32k|            uint8_t * dstRow = &image->alphaPlane[(size_t)j * image->alphaRowBytes];
 6698|   248k|            for (uint32_t i = 0; i < image->width; ++i) {
  ------------------
  |  Branch (6698:34): [True: 244k, False: 3.32k]
  ------------------
 6699|   244k|                int srcAlpha = *((const uint16_t *)&srcRow[i * 2]);
 6700|   244k|                int dstAlpha = avifLimitedToFullY(image->depth, srcAlpha);
 6701|   244k|                *((uint16_t *)&dstRow[i * 2]) = (uint16_t)dstAlpha;
 6702|   244k|            }
 6703|  3.32k|        }
 6704|    167|    } else {
 6705|  5.45k|        for (uint32_t j = 0; j < image->height; ++j) {
  ------------------
  |  Branch (6705:30): [True: 5.28k, False: 167]
  ------------------
 6706|  5.28k|            const uint8_t * srcRow = &alphaPlane[(size_t)j * alphaRowBytes];
 6707|  5.28k|            uint8_t * dstRow = &image->alphaPlane[(size_t)j * image->alphaRowBytes];
 6708|   408k|            for (uint32_t i = 0; i < image->width; ++i) {
  ------------------
  |  Branch (6708:34): [True: 402k, False: 5.28k]
  ------------------
 6709|   402k|                int srcAlpha = srcRow[i];
 6710|   402k|                int dstAlpha = avifLimitedToFullY(image->depth, srcAlpha);
 6711|   402k|                dstRow[i] = (uint8_t)dstAlpha;
 6712|   402k|            }
 6713|  5.28k|        }
 6714|    167|    }
 6715|    234|    return AVIF_RESULT_OK;
 6716|    234|}
read.c:avifDecoderDataAllocateImagePlanes:
 1734|    480|{
 1735|    480|    const avifTile * tile = &data->tiles.tile[info->firstTileIndex];
 1736|    480|    uint32_t dstWidth;
 1737|    480|    uint32_t dstHeight;
 1738|       |
 1739|    480|    if (info->grid.rows > 0 && info->grid.columns > 0) {
  ------------------
  |  Branch (1739:9): [True: 480, False: 0]
  |  Branch (1739:32): [True: 480, False: 0]
  ------------------
 1740|    480|        const avifImageGrid * grid = &info->grid;
 1741|       |        // Validate grid image size and tile size.
 1742|       |        //
 1743|       |        // HEIF (ISO/IEC 23008-12:2017), Section 6.6.2.3.1:
 1744|       |        //   The tiled input images shall completely "cover" the reconstructed image grid canvas, ...
 1745|    480|        if ((((uint64_t)tile->image->width * grid->columns) < grid->outputWidth) ||
  ------------------
  |  Branch (1745:13): [True: 13, False: 467]
  ------------------
 1746|    467|            (((uint64_t)tile->image->height * grid->rows) < grid->outputHeight)) {
  ------------------
  |  Branch (1746:13): [True: 7, False: 460]
  ------------------
 1747|     20|            avifDiagnosticsPrintf(data->diag,
 1748|     20|                                  "Grid image tiles do not completely cover the image (HEIF (ISO/IEC 23008-12:2017), Section 6.6.2.3.1)");
 1749|     20|            return AVIF_RESULT_INVALID_IMAGE_GRID;
 1750|     20|        }
 1751|       |        // Tiles in the rightmost column and bottommost row must overlap the reconstructed image grid canvas. See MIAF (ISO/IEC 23000-22:2019), Section 7.3.11.4.2, Figure 2.
 1752|    460|        if ((((uint64_t)tile->image->width * (grid->columns - 1)) >= grid->outputWidth) ||
  ------------------
  |  Branch (1752:13): [True: 2, False: 458]
  ------------------
 1753|    458|            (((uint64_t)tile->image->height * (grid->rows - 1)) >= grid->outputHeight)) {
  ------------------
  |  Branch (1753:13): [True: 37, False: 421]
  ------------------
 1754|     39|            avifDiagnosticsPrintf(data->diag,
 1755|     39|                                  "Grid image tiles in the rightmost column and bottommost row do not overlap the reconstructed image grid canvas. See MIAF (ISO/IEC 23000-22:2019), Section 7.3.11.4.2, Figure 2");
 1756|     39|            return AVIF_RESULT_INVALID_IMAGE_GRID;
 1757|     39|        }
 1758|    421|        if (!avifAreGridDimensionsValid(tile->image->yuvFormat,
  ------------------
  |  Branch (1758:13): [True: 15, False: 406]
  ------------------
 1759|    421|                                        grid->outputWidth,
 1760|    421|                                        grid->outputHeight,
 1761|    421|                                        tile->image->width,
 1762|    421|                                        tile->image->height,
 1763|    421|                                        data->diag)) {
 1764|     15|            return AVIF_RESULT_INVALID_IMAGE_GRID;
 1765|     15|        }
 1766|    406|        dstWidth = grid->outputWidth;
 1767|    406|        dstHeight = grid->outputHeight;
 1768|    406|    } else {
 1769|       |        // Only one tile. Width and height are inherited from the 'ispe' property of the corresponding avifDecoderItem.
 1770|      0|        dstWidth = tile->width;
 1771|      0|        dstHeight = tile->height;
 1772|      0|    }
 1773|       |
 1774|    406|    const avifBool alpha = avifIsAlpha(tile->input->itemCategory);
 1775|    406|    if (alpha) {
  ------------------
  |  Branch (1775:9): [True: 33, False: 373]
  ------------------
 1776|       |        // An alpha tile does not contain any YUV pixels.
 1777|     33|        AVIF_ASSERT_OR_RETURN(tile->image->yuvFormat == AVIF_PIXEL_FORMAT_NONE);
  ------------------
  |  |   64|     33|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|     33|    do {                        \
  |  |  |  |   46|     33|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 33]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|     33|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 33]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1778|     33|    }
 1779|       |
 1780|    406|    const uint32_t dstDepth = tile->image->depth;
 1781|       |
 1782|       |    // Lazily populate dstImage with the new frame's properties.
 1783|    406|    const avifBool dimsOrDepthIsDifferent = (dstImage->width != dstWidth) || (dstImage->height != dstHeight) ||
  ------------------
  |  Branch (1783:45): [True: 11, False: 395]
  |  Branch (1783:78): [True: 19, False: 376]
  ------------------
 1784|    376|                                            (dstImage->depth != dstDepth);
  ------------------
  |  Branch (1784:45): [True: 130, False: 246]
  ------------------
 1785|    406|    const avifBool yuvFormatIsDifferent = !alpha && (dstImage->yuvFormat != tile->image->yuvFormat);
  ------------------
  |  Branch (1785:43): [True: 373, False: 33]
  |  Branch (1785:53): [True: 75, False: 298]
  ------------------
 1786|    406|    if (dimsOrDepthIsDifferent || yuvFormatIsDifferent) {
  ------------------
  |  Branch (1786:9): [True: 160, False: 246]
  |  Branch (1786:35): [True: 30, False: 216]
  ------------------
 1787|    190|        if (alpha) {
  ------------------
  |  Branch (1787:13): [True: 10, False: 180]
  ------------------
 1788|       |            // Alpha doesn't match size, just bail out
 1789|     10|            avifDiagnosticsPrintf(data->diag, "Alpha plane dimensions do not match color plane dimensions");
 1790|     10|            return AVIF_RESULT_INVALID_IMAGE_GRID;
 1791|     10|        }
 1792|       |
 1793|    180|        if (dimsOrDepthIsDifferent) {
  ------------------
  |  Branch (1793:13): [True: 150, False: 30]
  ------------------
 1794|    150|            avifImageFreePlanes(dstImage, AVIF_PLANES_ALL);
 1795|    150|            dstImage->width = dstWidth;
 1796|    150|            dstImage->height = dstHeight;
 1797|    150|            dstImage->depth = dstDepth;
 1798|    150|        }
 1799|    180|        if (yuvFormatIsDifferent) {
  ------------------
  |  Branch (1799:13): [True: 75, False: 105]
  ------------------
 1800|     75|            avifImageFreePlanes(dstImage, AVIF_PLANES_YUV);
 1801|     75|            dstImage->yuvFormat = tile->image->yuvFormat;
 1802|     75|        }
 1803|       |        // Keep dstImage->yuvRange which is already set to its correct value
 1804|       |        // (extracted from the 'colr' box if parsed or from a Sequence Header OBU otherwise).
 1805|       |
 1806|    180|        if (!*cicpSet) {
  ------------------
  |  Branch (1806:13): [True: 4, False: 176]
  ------------------
 1807|      4|            *cicpSet = AVIF_TRUE;
  ------------------
  |  |   88|      4|#define AVIF_TRUE 1
  ------------------
 1808|      4|            dstImage->colorPrimaries = tile->image->colorPrimaries;
 1809|      4|            dstImage->transferCharacteristics = tile->image->transferCharacteristics;
 1810|      4|            dstImage->matrixCoefficients = tile->image->matrixCoefficients;
 1811|      4|        }
 1812|    180|    }
 1813|       |
 1814|    396|    if (avifImageAllocatePlanes(dstImage, alpha ? AVIF_PLANES_A : AVIF_PLANES_YUV) != AVIF_RESULT_OK) {
  ------------------
  |  Branch (1814:9): [True: 0, False: 396]
  |  Branch (1814:43): [True: 23, False: 373]
  ------------------
 1815|      0|        avifDiagnosticsPrintf(data->diag, "Image allocation failure");
 1816|      0|        return AVIF_RESULT_OUT_OF_MEMORY;
 1817|      0|    }
 1818|    396|    return AVIF_RESULT_OK;
 1819|    396|}
read.c:avifDecoderDataCopyTileToImage:
 1828|    953|{
 1829|    953|    const avifTile * firstTile = &data->tiles.tile[info->firstTileIndex];
 1830|    953|    if (tile != firstTile) {
  ------------------
  |  Branch (1830:9): [True: 557, False: 396]
  ------------------
 1831|       |        // Check for tile consistency. All tiles in a grid image should match the first tile in the properties checked below.
 1832|    557|        if ((tile->image->width != firstTile->image->width) || (tile->image->height != firstTile->image->height) ||
  ------------------
  |  Branch (1832:13): [True: 3, False: 554]
  |  Branch (1832:64): [True: 0, False: 554]
  ------------------
 1833|    554|            (tile->image->depth != firstTile->image->depth) || (tile->image->yuvFormat != firstTile->image->yuvFormat) ||
  ------------------
  |  Branch (1833:13): [True: 6, False: 548]
  |  Branch (1833:64): [True: 3, False: 545]
  ------------------
 1834|    545|            (tile->image->yuvRange != firstTile->image->yuvRange) || (tile->image->colorPrimaries != firstTile->image->colorPrimaries) ||
  ------------------
  |  Branch (1834:13): [True: 3, False: 542]
  |  Branch (1834:70): [True: 3, False: 539]
  ------------------
 1835|    539|            (tile->image->transferCharacteristics != firstTile->image->transferCharacteristics) ||
  ------------------
  |  Branch (1835:13): [True: 3, False: 536]
  ------------------
 1836|    536|            (tile->image->matrixCoefficients != firstTile->image->matrixCoefficients)) {
  ------------------
  |  Branch (1836:13): [True: 3, False: 533]
  ------------------
 1837|     24|            avifDiagnosticsPrintf(data->diag, "Grid image contains mismatched tiles");
 1838|     24|            return AVIF_RESULT_INVALID_IMAGE_GRID;
 1839|     24|        }
 1840|    557|    }
 1841|       |
 1842|       |    // Only keep the relevant planes in the destination image. Otherwise,
 1843|       |    // unjustified failures may come from trying to copy alpha tiles with odd
 1844|       |    // coordinates into the dstImage when the chroma planes are subsampled.
 1845|    929|    avifImage dstView;
 1846|    929|    avifImageSetDefaults(&dstView);
 1847|    929|    const avifCropRect srcViewRect = { 0, 0, dstImage->width, dstImage->height };
 1848|    929|    AVIF_ASSERT_OR_RETURN(avifImageSetViewRect(&dstView, dstImage, &srcViewRect) == AVIF_RESULT_OK);
  ------------------
  |  |   64|    929|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|    929|    do {                        \
  |  |  |  |   46|    929|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 929]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|    929|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 929]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1849|    929|    if (avifIsAlpha(tile->input->itemCategory)) {
  ------------------
  |  Branch (1849:9): [True: 28, False: 901]
  ------------------
 1850|     28|        avifImageFreePlanes(&dstView, AVIF_PLANES_YUV);
 1851|     28|        dstView.yuvFormat = AVIF_PIXEL_FORMAT_NONE;
 1852|    901|    } else {
 1853|    901|        avifImageFreePlanes(&dstView, AVIF_PLANES_A);
 1854|    901|    }
 1855|       |
 1856|    929|    avifImage srcTileView;
 1857|    929|    avifImageSetDefaults(&srcTileView);
 1858|    929|    avifImage dstTileView;
 1859|    929|    avifImageSetDefaults(&dstTileView);
 1860|    929|    avifCropRect dstTileViewRect = { 0, 0, firstTile->image->width, firstTile->image->height };
 1861|    929|    if (info->grid.rows > 0 && info->grid.columns > 0) {
  ------------------
  |  Branch (1861:9): [True: 929, False: 0]
  |  Branch (1861:32): [True: 929, False: 0]
  ------------------
 1862|    929|        unsigned int rowIndex = tileIndex / info->grid.columns;
 1863|    929|        unsigned int colIndex = tileIndex % info->grid.columns;
 1864|    929|        dstTileViewRect.x = firstTile->image->width * colIndex;
 1865|    929|        dstTileViewRect.y = firstTile->image->height * rowIndex;
 1866|    929|        if (dstTileViewRect.x + dstTileViewRect.width > info->grid.outputWidth) {
  ------------------
  |  Branch (1866:13): [True: 32, False: 897]
  ------------------
 1867|     32|            dstTileViewRect.width = info->grid.outputWidth - dstTileViewRect.x;
 1868|     32|        }
 1869|    929|        if (dstTileViewRect.y + dstTileViewRect.height > info->grid.outputHeight) {
  ------------------
  |  Branch (1869:13): [True: 85, False: 844]
  ------------------
 1870|     85|            dstTileViewRect.height = info->grid.outputHeight - dstTileViewRect.y;
 1871|     85|        }
 1872|    929|    }
 1873|    929|    const avifCropRect srcTileViewRect = { 0, 0, dstTileViewRect.width, dstTileViewRect.height };
 1874|    929|    AVIF_ASSERT_OR_RETURN(avifImageSetViewRect(&dstTileView, &dstView, &dstTileViewRect) == AVIF_RESULT_OK);
  ------------------
  |  |   64|    929|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|    929|    do {                        \
  |  |  |  |   46|    929|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 929]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|    929|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 929]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1875|    929|    AVIF_ASSERT_OR_RETURN(avifImageSetViewRect(&srcTileView, tile->image, &srcTileViewRect) == AVIF_RESULT_OK);
  ------------------
  |  |   64|    929|#define AVIF_ASSERT_OR_RETURN(A) AVIF_CHECKERR((A), AVIF_RESULT_INTERNAL_ERROR)
  |  |  ------------------
  |  |  |  |   45|    929|    do {                        \
  |  |  |  |   46|    929|        if (!(A)) {             \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (46:13): [True: 0, False: 929]
  |  |  |  |  ------------------
  |  |  |  |   47|      0|            avifBreakOnError(); \
  |  |  |  |   48|      0|            return ERR;         \
  |  |  |  |   49|      0|        }                       \
  |  |  |  |   50|    929|    } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (50:14): [Folded, False: 929]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1876|    929|    avifImageCopySamples(&dstTileView, &srcTileView, avifIsAlpha(tile->input->itemCategory) ? AVIF_PLANES_A : AVIF_PLANES_YUV);
  ------------------
  |  Branch (1876:54): [True: 28, False: 901]
  ------------------
 1877|    929|    return AVIF_RESULT_OK;
 1878|    929|}
read.c:avifSampleTableGetImageDelta:
  357|  9.52k|{
  358|  9.52k|    uint32_t maxSampleIndex = 0;
  359|  9.52k|    for (uint32_t i = 0; i < sampleTable->timeToSamples.count; ++i) {
  ------------------
  |  Branch (359:26): [True: 9.20k, False: 326]
  ------------------
  360|  9.20k|        const avifSampleTableTimeToSample * timeToSample = &sampleTable->timeToSamples.timeToSample[i];
  361|  9.20k|        maxSampleIndex += timeToSample->sampleCount;
  362|  9.20k|        if ((imageIndex < maxSampleIndex) || (i == (sampleTable->timeToSamples.count - 1))) {
  ------------------
  |  Branch (362:13): [True: 9.13k, False: 70]
  |  Branch (362:46): [True: 70, False: 0]
  ------------------
  363|  9.20k|            return timeToSample->sampleDelta;
  364|  9.20k|        }
  365|  9.20k|    }
  366|       |
  367|       |    // TODO: fail here?
  368|    326|    return 1;
  369|  9.52k|}
read.c:avifDecoderDataResetCodec:
 1012|  33.8k|{
 1013|  78.2k|    for (unsigned int i = 0; i < data->tiles.count; ++i) {
  ------------------
  |  Branch (1013:30): [True: 44.3k, False: 33.8k]
  ------------------
 1014|  44.3k|        avifTile * tile = &data->tiles.tile[i];
 1015|  44.3k|        if (tile->image) {
  ------------------
  |  Branch (1015:13): [True: 44.3k, False: 0]
  ------------------
 1016|  44.3k|            avifImageFreePlanes(tile->image, AVIF_PLANES_ALL); // forget any pointers into codec image buffers
 1017|  44.3k|        }
 1018|  44.3k|        if (tile->codec) {
  ------------------
  |  Branch (1018:13): [True: 0, False: 44.3k]
  ------------------
 1019|       |            // Check if tile->codec was created separately and destroy it in that case.
 1020|      0|            if (tile->codec != data->codec && tile->codec != data->codecAlpha) {
  ------------------
  |  Branch (1020:17): [True: 0, False: 0]
  |  Branch (1020:47): [True: 0, False: 0]
  ------------------
 1021|      0|                avifCodecDestroy(tile->codec);
 1022|      0|            }
 1023|      0|            tile->codec = NULL;
 1024|      0|        }
 1025|  44.3k|    }
 1026|   304k|    for (int c = 0; c < AVIF_ITEM_CATEGORY_COUNT; ++c) {
  ------------------
  |  Branch (1026:21): [True: 271k, False: 33.8k]
  ------------------
 1027|   271k|        data->tileInfos[c].decodedTileCount = 0;
 1028|   271k|    }
 1029|  33.8k|    if (data->codec) {
  ------------------
  |  Branch (1029:9): [True: 0, False: 33.8k]
  ------------------
 1030|      0|        avifCodecDestroy(data->codec);
 1031|      0|        data->codec = NULL;
 1032|      0|    }
 1033|  33.8k|    if (data->codecAlpha) {
  ------------------
  |  Branch (1033:9): [True: 0, False: 33.8k]
  ------------------
 1034|      0|        avifCodecDestroy(data->codecAlpha);
 1035|       |        data->codecAlpha = NULL;
 1036|      0|    }
 1037|  33.8k|}

avifLimitedToFullY:
 1777|   647k|{
 1778|   647k|    switch (depth) {
  ------------------
  |  Branch (1778:13): [True: 647k, False: 0]
  ------------------
 1779|   402k|        case 8:
  ------------------
  |  Branch (1779:9): [True: 402k, False: 244k]
  ------------------
 1780|   402k|            LIMITED_TO_FULL(16, 235, 255);
  ------------------
  |  | 1759|   402k|    v = (((v - MINLIMITEDY) * FULLY) + ((MAXLIMITEDY - MINLIMITEDY) / 2)) / (MAXLIMITEDY - MINLIMITEDY); \
  |  | 1760|   402k|    v = AVIF_CLAMP(v, 0, FULLY)
  |  |  ------------------
  |  |  |  |   18|   402k|#define AVIF_CLAMP(x, low, high) (((x) < (low)) ? (low) : (((high) < (x)) ? (high) : (x)))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (18:35): [True: 14.9k, False: 387k]
  |  |  |  |  |  Branch (18:60): [True: 46.0k, False: 341k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1781|   402k|            break;
 1782|   179k|        case 10:
  ------------------
  |  Branch (1782:9): [True: 179k, False: 468k]
  ------------------
 1783|   179k|            LIMITED_TO_FULL(64, 940, 1023);
  ------------------
  |  | 1759|   179k|    v = (((v - MINLIMITEDY) * FULLY) + ((MAXLIMITEDY - MINLIMITEDY) / 2)) / (MAXLIMITEDY - MINLIMITEDY); \
  |  | 1760|   179k|    v = AVIF_CLAMP(v, 0, FULLY)
  |  |  ------------------
  |  |  |  |   18|   179k|#define AVIF_CLAMP(x, low, high) (((x) < (low)) ? (low) : (((high) < (x)) ? (high) : (x)))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (18:35): [True: 1.21k, False: 178k]
  |  |  |  |  |  Branch (18:60): [True: 1.43k, False: 176k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1784|   179k|            break;
 1785|  65.7k|        case 12:
  ------------------
  |  Branch (1785:9): [True: 65.7k, False: 582k]
  ------------------
 1786|  65.7k|            LIMITED_TO_FULL(256, 3760, 4095);
  ------------------
  |  | 1759|  65.7k|    v = (((v - MINLIMITEDY) * FULLY) + ((MAXLIMITEDY - MINLIMITEDY) / 2)) / (MAXLIMITEDY - MINLIMITEDY); \
  |  | 1760|  65.7k|    v = AVIF_CLAMP(v, 0, FULLY)
  |  |  ------------------
  |  |  |  |   18|  65.7k|#define AVIF_CLAMP(x, low, high) (((x) < (low)) ? (low) : (((high) < (x)) ? (high) : (x)))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (18:35): [True: 272, False: 65.4k]
  |  |  |  |  |  Branch (18:60): [True: 16.8k, False: 48.5k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1787|  65.7k|            break;
 1788|   647k|    }
 1789|   647k|    return v;
 1790|   647k|}

avifImageScaleWithLimit:
   30|  4.77k|{
   31|  4.77k|    if ((image->width == dstWidth) && (image->height == dstHeight)) {
  ------------------
  |  Branch (31:9): [True: 825, False: 3.95k]
  |  Branch (31:39): [True: 0, False: 825]
  ------------------
   32|       |        // Nothing to do
   33|      0|        return AVIF_RESULT_OK;
   34|      0|    }
   35|       |
   36|  4.77k|    if ((dstWidth == 0) || (dstHeight == 0)) {
  ------------------
  |  Branch (36:9): [True: 99, False: 4.67k]
  |  Branch (36:28): [True: 0, False: 4.67k]
  ------------------
   37|     99|        avifDiagnosticsPrintf(diag, "avifImageScaleWithLimit requested invalid dst dimensions [%ux%u]", dstWidth, dstHeight);
   38|     99|        return AVIF_RESULT_INVALID_ARGUMENT;
   39|     99|    }
   40|  4.67k|    if (avifDimensionsTooLarge(dstWidth, dstHeight, imageSizeLimit, imageDimensionLimit)) {
  ------------------
  |  Branch (40:9): [True: 2, False: 4.67k]
  ------------------
   41|      2|        avifDiagnosticsPrintf(diag, "avifImageScaleWithLimit requested dst dimensions that are too large [%ux%u]", dstWidth, dstHeight);
   42|      2|        return AVIF_RESULT_NOT_IMPLEMENTED;
   43|      2|    }
   44|       |
   45|  4.67k|    uint8_t * srcYUVPlanes[AVIF_PLANE_COUNT_YUV];
   46|  4.67k|    uint32_t srcYUVRowBytes[AVIF_PLANE_COUNT_YUV];
   47|  18.7k|    for (int i = 0; i < AVIF_PLANE_COUNT_YUV; ++i) {
  ------------------
  |  |  112|  18.7k|#define AVIF_PLANE_COUNT_YUV 3
  ------------------
  |  Branch (47:21): [True: 14.0k, False: 4.67k]
  ------------------
   48|  14.0k|        srcYUVPlanes[i] = image->yuvPlanes[i];
   49|  14.0k|        image->yuvPlanes[i] = NULL;
   50|  14.0k|        srcYUVRowBytes[i] = image->yuvRowBytes[i];
   51|  14.0k|        image->yuvRowBytes[i] = 0;
   52|  14.0k|    }
   53|  4.67k|    const avifBool srcImageOwnsYUVPlanes = image->imageOwnsYUVPlanes;
   54|  4.67k|    image->imageOwnsYUVPlanes = AVIF_FALSE;
  ------------------
  |  |   89|  4.67k|#define AVIF_FALSE 0
  ------------------
   55|       |
   56|  4.67k|    uint8_t * srcAlphaPlane = image->alphaPlane;
   57|  4.67k|    image->alphaPlane = NULL;
   58|  4.67k|    uint32_t srcAlphaRowBytes = image->alphaRowBytes;
   59|  4.67k|    image->alphaRowBytes = 0;
   60|  4.67k|    const avifBool srcImageOwnsAlphaPlane = image->imageOwnsAlphaPlane;
   61|  4.67k|    image->imageOwnsAlphaPlane = AVIF_FALSE;
  ------------------
  |  |   89|  4.67k|#define AVIF_FALSE 0
  ------------------
   62|       |
   63|  4.67k|    const uint32_t srcWidth = image->width;
   64|  4.67k|    const uint32_t srcHeight = image->height;
   65|  4.67k|    const uint32_t srcUVWidth = avifImagePlaneWidth(image, AVIF_CHAN_U);
   66|  4.67k|    const uint32_t srcUVHeight = avifImagePlaneHeight(image, AVIF_CHAN_U);
   67|  4.67k|    image->width = dstWidth;
   68|  4.67k|    image->height = dstHeight;
   69|       |
   70|  4.67k|    avifResult result = AVIF_RESULT_OK;
   71|  4.67k|    if (srcYUVPlanes[0] || srcAlphaPlane) {
  ------------------
  |  Branch (71:9): [True: 4.27k, False: 406]
  |  Branch (71:28): [True: 406, False: 0]
  ------------------
   72|       |        // A simple conservative check to avoid integer overflows in libyuv's ScalePlane() and
   73|       |        // ScalePlane_12() functions.
   74|  4.67k|        if (srcWidth > 16384) {
  ------------------
  |  Branch (74:13): [True: 0, False: 4.67k]
  ------------------
   75|      0|            avifDiagnosticsPrintf(diag, "avifImageScaleWithLimit requested invalid width scale for libyuv [%u -> %u]", srcWidth, dstWidth);
   76|      0|            result = AVIF_RESULT_NOT_IMPLEMENTED;
   77|      0|            goto cleanup;
   78|      0|        }
   79|  4.67k|        if (srcHeight > 16384) {
  ------------------
  |  Branch (79:13): [True: 8, False: 4.66k]
  ------------------
   80|      8|            avifDiagnosticsPrintf(diag, "avifImageScaleWithLimit requested invalid height scale for libyuv [%u -> %u]", srcHeight, dstHeight);
   81|      8|            result = AVIF_RESULT_NOT_IMPLEMENTED;
   82|      8|            goto cleanup;
   83|      8|        }
   84|  4.67k|    }
   85|       |
   86|  4.66k|    if (srcYUVPlanes[0]) {
  ------------------
  |  Branch (86:9): [True: 4.26k, False: 406]
  ------------------
   87|  4.26k|        const avifResult allocationResult = avifImageAllocatePlanes(image, AVIF_PLANES_YUV);
   88|  4.26k|        if (allocationResult != AVIF_RESULT_OK) {
  ------------------
  |  Branch (88:13): [True: 0, False: 4.26k]
  ------------------
   89|      0|            avifDiagnosticsPrintf(diag, "Allocation of YUV planes failed: %s", avifResultToString(allocationResult));
   90|      0|            result = AVIF_RESULT_OUT_OF_MEMORY;
   91|      0|            goto cleanup;
   92|      0|        }
   93|       |
   94|  17.0k|        for (int i = 0; i < AVIF_PLANE_COUNT_YUV; ++i) {
  ------------------
  |  |  112|  17.0k|#define AVIF_PLANE_COUNT_YUV 3
  ------------------
  |  Branch (94:25): [True: 12.7k, False: 4.26k]
  ------------------
   95|  12.7k|            if (!srcYUVPlanes[i]) {
  ------------------
  |  Branch (95:17): [True: 1.49k, False: 11.2k]
  ------------------
   96|  1.49k|                continue;
   97|  1.49k|            }
   98|       |
   99|  11.2k|            const uint32_t srcW = (i == AVIF_CHAN_Y) ? srcWidth : srcUVWidth;
  ------------------
  |  Branch (99:35): [True: 4.26k, False: 7.03k]
  ------------------
  100|  11.2k|            const uint32_t srcH = (i == AVIF_CHAN_Y) ? srcHeight : srcUVHeight;
  ------------------
  |  Branch (100:35): [True: 4.26k, False: 7.03k]
  ------------------
  101|  11.2k|            const uint32_t dstW = avifImagePlaneWidth(image, i);
  102|  11.2k|            const uint32_t dstH = avifImagePlaneHeight(image, i);
  103|  11.2k|            if (image->depth > 8) {
  ------------------
  |  Branch (103:17): [True: 5.14k, False: 6.15k]
  ------------------
  104|  5.14k|                uint16_t * const srcPlane = (uint16_t *)srcYUVPlanes[i];
  105|  5.14k|                const uint32_t srcStride = srcYUVRowBytes[i] / 2;
  106|  5.14k|                uint16_t * const dstPlane = (uint16_t *)image->yuvPlanes[i];
  107|  5.14k|                const uint32_t dstStride = image->yuvRowBytes[i] / 2;
  108|  5.14k|#if LIBYUV_VERSION >= 1880
  109|  5.14k|                const int failure =
  110|  5.14k|                    ScalePlane_12(srcPlane, srcStride, srcW, srcH, dstPlane, dstStride, dstW, dstH, AVIF_LIBYUV_FILTER_MODE);
  ------------------
  |  |   22|  5.14k|#define AVIF_LIBYUV_FILTER_MODE kFilterBox
  ------------------
  111|  5.14k|                if (failure) {
  ------------------
  |  Branch (111:21): [True: 0, False: 5.14k]
  ------------------
  112|      0|                    avifDiagnosticsPrintf(diag, "ScalePlane_12() failed (%d)", failure);
  113|      0|                    result = (failure == 1) ? AVIF_RESULT_OUT_OF_MEMORY : AVIF_RESULT_UNKNOWN_ERROR;
  ------------------
  |  Branch (113:30): [True: 0, False: 0]
  ------------------
  114|      0|                    goto cleanup;
  115|      0|                }
  116|       |#elif LIBYUV_VERSION >= 1774
  117|       |                ScalePlane_12(srcPlane, srcStride, srcW, srcH, dstPlane, dstStride, dstW, dstH, AVIF_LIBYUV_FILTER_MODE);
  118|       |#else
  119|       |                ScalePlane_16(srcPlane, srcStride, srcW, srcH, dstPlane, dstStride, dstW, dstH, AVIF_LIBYUV_FILTER_MODE);
  120|       |#endif
  121|  6.15k|            } else {
  122|  6.15k|                uint8_t * const srcPlane = srcYUVPlanes[i];
  123|  6.15k|                const uint32_t srcStride = srcYUVRowBytes[i];
  124|  6.15k|                uint8_t * const dstPlane = image->yuvPlanes[i];
  125|  6.15k|                const uint32_t dstStride = image->yuvRowBytes[i];
  126|  6.15k|#if LIBYUV_VERSION >= 1880
  127|  6.15k|                const int failure = ScalePlane(srcPlane, srcStride, srcW, srcH, dstPlane, dstStride, dstW, dstH, AVIF_LIBYUV_FILTER_MODE);
  ------------------
  |  |   22|  6.15k|#define AVIF_LIBYUV_FILTER_MODE kFilterBox
  ------------------
  128|  6.15k|                if (failure) {
  ------------------
  |  Branch (128:21): [True: 0, False: 6.15k]
  ------------------
  129|      0|                    avifDiagnosticsPrintf(diag, "ScalePlane() failed (%d)", failure);
  130|      0|                    result = (failure == 1) ? AVIF_RESULT_OUT_OF_MEMORY : AVIF_RESULT_UNKNOWN_ERROR;
  ------------------
  |  Branch (130:30): [True: 0, False: 0]
  ------------------
  131|      0|                    goto cleanup;
  132|      0|                }
  133|       |#else
  134|       |                ScalePlane(srcPlane, srcStride, srcW, srcH, dstPlane, dstStride, dstW, dstH, AVIF_LIBYUV_FILTER_MODE);
  135|       |#endif
  136|  6.15k|            }
  137|  11.2k|        }
  138|  4.26k|    }
  139|       |
  140|  4.66k|    if (srcAlphaPlane) {
  ------------------
  |  Branch (140:9): [True: 406, False: 4.26k]
  ------------------
  141|    406|        const avifResult allocationResult = avifImageAllocatePlanes(image, AVIF_PLANES_A);
  142|    406|        if (allocationResult != AVIF_RESULT_OK) {
  ------------------
  |  Branch (142:13): [True: 0, False: 406]
  ------------------
  143|      0|            avifDiagnosticsPrintf(diag, "Allocation of alpha plane failed: %s", avifResultToString(allocationResult));
  144|      0|            result = AVIF_RESULT_OUT_OF_MEMORY;
  145|      0|            goto cleanup;
  146|      0|        }
  147|       |
  148|    406|        if (image->depth > 8) {
  ------------------
  |  Branch (148:13): [True: 104, False: 302]
  ------------------
  149|    104|            uint16_t * const srcPlane = (uint16_t *)srcAlphaPlane;
  150|    104|            const uint32_t srcStride = srcAlphaRowBytes / 2;
  151|    104|            uint16_t * const dstPlane = (uint16_t *)image->alphaPlane;
  152|    104|            const uint32_t dstStride = image->alphaRowBytes / 2;
  153|    104|#if LIBYUV_VERSION >= 1880
  154|    104|            const int failure =
  155|    104|                ScalePlane_12(srcPlane, srcStride, srcWidth, srcHeight, dstPlane, dstStride, dstWidth, dstHeight, AVIF_LIBYUV_FILTER_MODE);
  ------------------
  |  |   22|    104|#define AVIF_LIBYUV_FILTER_MODE kFilterBox
  ------------------
  156|    104|            if (failure) {
  ------------------
  |  Branch (156:17): [True: 0, False: 104]
  ------------------
  157|      0|                avifDiagnosticsPrintf(diag, "ScalePlane_12() failed (%d)", failure);
  158|      0|                result = (failure == 1) ? AVIF_RESULT_OUT_OF_MEMORY : AVIF_RESULT_UNKNOWN_ERROR;
  ------------------
  |  Branch (158:26): [True: 0, False: 0]
  ------------------
  159|      0|                goto cleanup;
  160|      0|            }
  161|       |#elif LIBYUV_VERSION >= 1774
  162|       |            ScalePlane_12(srcPlane, srcStride, srcWidth, srcHeight, dstPlane, dstStride, dstWidth, dstHeight, AVIF_LIBYUV_FILTER_MODE);
  163|       |#else
  164|       |            ScalePlane_16(srcPlane, srcStride, srcWidth, srcHeight, dstPlane, dstStride, dstWidth, dstHeight, AVIF_LIBYUV_FILTER_MODE);
  165|       |#endif
  166|    302|        } else {
  167|    302|            uint8_t * const srcPlane = srcAlphaPlane;
  168|    302|            const uint32_t srcStride = srcAlphaRowBytes;
  169|    302|            uint8_t * const dstPlane = image->alphaPlane;
  170|    302|            const uint32_t dstStride = image->alphaRowBytes;
  171|    302|#if LIBYUV_VERSION >= 1880
  172|    302|            const int failure =
  173|    302|                ScalePlane(srcPlane, srcStride, srcWidth, srcHeight, dstPlane, dstStride, dstWidth, dstHeight, AVIF_LIBYUV_FILTER_MODE);
  ------------------
  |  |   22|    302|#define AVIF_LIBYUV_FILTER_MODE kFilterBox
  ------------------
  174|    302|            if (failure) {
  ------------------
  |  Branch (174:17): [True: 0, False: 302]
  ------------------
  175|      0|                avifDiagnosticsPrintf(diag, "ScalePlane() failed (%d)", failure);
  176|      0|                result = (failure == 1) ? AVIF_RESULT_OUT_OF_MEMORY : AVIF_RESULT_UNKNOWN_ERROR;
  ------------------
  |  Branch (176:26): [True: 0, False: 0]
  ------------------
  177|      0|                goto cleanup;
  178|      0|            }
  179|       |#else
  180|       |            ScalePlane(srcPlane, srcStride, srcWidth, srcHeight, dstPlane, dstStride, dstWidth, dstHeight, AVIF_LIBYUV_FILTER_MODE);
  181|       |#endif
  182|    302|        }
  183|    406|    }
  184|       |
  185|  4.67k|cleanup:
  186|  4.67k|    if (srcYUVPlanes[0] && srcImageOwnsYUVPlanes) {
  ------------------
  |  Branch (186:9): [True: 4.27k, False: 406]
  |  Branch (186:28): [True: 0, False: 4.27k]
  ------------------
  187|      0|        for (int i = 0; i < AVIF_PLANE_COUNT_YUV; ++i) {
  ------------------
  |  |  112|      0|#define AVIF_PLANE_COUNT_YUV 3
  ------------------
  |  Branch (187:25): [True: 0, False: 0]
  ------------------
  188|      0|            avifFree(srcYUVPlanes[i]);
  189|      0|        }
  190|      0|    }
  191|  4.67k|    if (srcAlphaPlane && srcImageOwnsAlphaPlane) {
  ------------------
  |  Branch (191:9): [True: 406, False: 4.27k]
  |  Branch (191:26): [True: 90, False: 316]
  ------------------
  192|     90|        avifFree(srcAlphaPlane);
  193|     90|    }
  194|  4.67k|    return result;
  195|  4.66k|}

avifROStreamCurrent:
   15|   500k|{
   16|   500k|    return stream->raw->data + stream->offset;
   17|   500k|}
avifROStreamStart:
   20|   465k|{
   21|   465k|    stream->raw = raw;
   22|   465k|    stream->offset = 0;
   23|   465k|    stream->numUsedBitsInPartialByte = 0;
   24|   465k|    stream->diag = diag;
   25|   465k|    stream->diagContext = diagContext;
   26|       |
   27|       |    // If diag is non-NULL, diagContext must also be non-NULL
   28|       |    assert(!stream->diag || stream->diagContext);
   29|   465k|}
avifROStreamHasBytesLeft:
   32|  3.66M|{
   33|  3.66M|    return byteCount <= (stream->raw->size - stream->offset);
   34|  3.66M|}
avifROStreamRemainingBytes:
   37|   538k|{
   38|   538k|    return stream->raw->size - stream->offset;
   39|   538k|}
avifROStreamOffset:
   42|   188k|{
   43|   188k|    return stream->offset;
   44|   188k|}
avifROStreamSkip:
   56|   827k|{
   57|   827k|    assert(stream->numUsedBitsInPartialByte == 0); // Byte alignment is required.
   58|   827k|    if (!avifROStreamHasBytesLeft(stream, byteCount)) {
  ------------------
  |  Branch (58:9): [True: 49, False: 827k]
  ------------------
   59|     49|        avifDiagnosticsPrintf(stream->diag, "%s: Failed to skip %zu bytes, truncated data?", stream->diagContext, byteCount);
   60|     49|        return AVIF_FALSE;
  ------------------
  |  |   89|     49|#define AVIF_FALSE 0
  ------------------
   61|     49|    }
   62|   827k|    stream->offset += byteCount;
   63|   827k|    return AVIF_TRUE;
  ------------------
  |  |   88|   827k|#define AVIF_TRUE 1
  ------------------
   64|   827k|}
avifROStreamRead:
   67|  2.37M|{
   68|  2.37M|    assert(stream->numUsedBitsInPartialByte == 0); // Byte alignment is required.
   69|  2.37M|    if (!avifROStreamHasBytesLeft(stream, size)) {
  ------------------
  |  Branch (69:9): [True: 426, False: 2.37M]
  ------------------
   70|    426|        avifDiagnosticsPrintf(stream->diag, "%s: Failed to read %zu bytes, truncated data?", stream->diagContext, size);
   71|    426|        return AVIF_FALSE;
  ------------------
  |  |   89|    426|#define AVIF_FALSE 0
  ------------------
   72|    426|    }
   73|       |
   74|  2.37M|    memcpy(data, stream->raw->data + stream->offset, size);
   75|  2.37M|    stream->offset += size;
   76|  2.37M|    return AVIF_TRUE;
  ------------------
  |  |   88|  2.37M|#define AVIF_TRUE 1
  ------------------
   77|  2.37M|}
avifROStreamReadUX8:
   80|  19.2M|{
   81|  19.2M|    assert(stream->numUsedBitsInPartialByte == 0); // Byte alignment is required.
   82|  19.2M|    if (factor == 0) {
  ------------------
  |  Branch (82:9): [True: 19.0M, False: 106k]
  ------------------
   83|       |        // Don't read anything, just set to 0
   84|  19.0M|        *v = 0;
   85|  19.0M|    } else if (factor == 1) {
  ------------------
  |  Branch (85:16): [True: 0, False: 106k]
  ------------------
   86|      0|        uint8_t tmp;
   87|      0|        AVIF_CHECK(avifROStreamRead(stream, &tmp, 1));
  ------------------
  |  |   36|      0|    do {                        \
  |  |   37|      0|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
   88|      0|        *v = tmp;
   89|   106k|    } else if (factor == 2) {
  ------------------
  |  Branch (89:16): [True: 0, False: 106k]
  ------------------
   90|      0|        uint16_t tmp;
   91|      0|        AVIF_CHECK(avifROStreamReadU16(stream, &tmp));
  ------------------
  |  |   36|      0|    do {                        \
  |  |   37|      0|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 0]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|      0|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 0]
  |  |  ------------------
  ------------------
   92|      0|        *v = tmp;
   93|   106k|    } else if (factor == 4) {
  ------------------
  |  Branch (93:16): [True: 105k, False: 482]
  ------------------
   94|   105k|        uint32_t tmp;
   95|   105k|        AVIF_CHECK(avifROStreamReadU32(stream, &tmp));
  ------------------
  |  |   36|   105k|    do {                        \
  |  |   37|   105k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 40, False: 105k]
  |  |  ------------------
  |  |   38|     40|            avifBreakOnError(); \
  |  |   39|     40|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|     40|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|     40|        }                       \
  |  |   41|   105k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 105k]
  |  |  ------------------
  ------------------
   96|   105k|        *v = tmp;
   97|   105k|    } else if (factor == 8) {
  ------------------
  |  Branch (97:16): [True: 482, False: 0]
  ------------------
   98|    482|        uint64_t tmp;
   99|    482|        AVIF_CHECK(avifROStreamReadU64(stream, &tmp));
  ------------------
  |  |   36|    482|    do {                        \
  |  |   37|    482|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 64, False: 418]
  |  |  ------------------
  |  |   38|     64|            avifBreakOnError(); \
  |  |   39|     64|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|     64|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|     64|        }                       \
  |  |   41|    482|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 418]
  |  |  ------------------
  ------------------
  100|    418|        *v = tmp;
  101|    418|    } else {
  102|       |        // Unsupported factor
  103|      0|        avifDiagnosticsPrintf(stream->diag, "%s: Failed to read UX8 value; Unsupported UX8 factor [%" PRIu64 "]", stream->diagContext, factor);
  104|      0|        return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
  105|      0|    }
  106|  19.2M|    return AVIF_TRUE;
  ------------------
  |  |   88|  19.2M|#define AVIF_TRUE 1
  ------------------
  107|  19.2M|}
avifROStreamReadU16:
  110|   469k|{
  111|   469k|    assert(stream->numUsedBitsInPartialByte == 0); // Byte alignment is required.
  112|   469k|    AVIF_CHECK(avifROStreamRead(stream, (uint8_t *)v, sizeof(uint16_t)));
  ------------------
  |  |   36|   469k|    do {                        \
  |  |   37|   469k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 61, False: 469k]
  |  |  ------------------
  |  |   38|     61|            avifBreakOnError(); \
  |  |   39|     61|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|     61|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|     61|        }                       \
  |  |   41|   469k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 469k]
  |  |  ------------------
  ------------------
  113|   469k|    *v = avifNTOHS(*v);
  114|   469k|    return AVIF_TRUE;
  ------------------
  |  |   88|   469k|#define AVIF_TRUE 1
  ------------------
  115|   469k|}
avifROStreamReadU32:
  126|   840k|{
  127|   840k|    assert(stream->numUsedBitsInPartialByte == 0); // Byte alignment is required.
  128|   840k|    AVIF_CHECK(avifROStreamRead(stream, (uint8_t *)v, sizeof(uint32_t)));
  ------------------
  |  |   36|   840k|    do {                        \
  |  |   37|   840k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 242, False: 839k]
  |  |  ------------------
  |  |   38|    242|            avifBreakOnError(); \
  |  |   39|    242|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|    242|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|    242|        }                       \
  |  |   41|   840k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 839k]
  |  |  ------------------
  ------------------
  129|   839k|    *v = avifNTOHL(*v);
  130|   839k|    return AVIF_TRUE;
  ------------------
  |  |   88|   839k|#define AVIF_TRUE 1
  ------------------
  131|   840k|}
avifROStreamReadU64:
  142|  20.5k|{
  143|  20.5k|    assert(stream->numUsedBitsInPartialByte == 0); // Byte alignment is required.
  144|  20.5k|    AVIF_CHECK(avifROStreamRead(stream, (uint8_t *)v, sizeof(uint64_t)));
  ------------------
  |  |   36|  20.5k|    do {                        \
  |  |   37|  20.5k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 72, False: 20.4k]
  |  |  ------------------
  |  |   38|     72|            avifBreakOnError(); \
  |  |   39|     72|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|     72|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|     72|        }                       \
  |  |   41|  20.5k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 20.4k]
  |  |  ------------------
  ------------------
  145|  20.4k|    *v = avifNTOH64(*v);
  146|  20.4k|    return AVIF_TRUE;
  ------------------
  |  |   88|  20.4k|#define AVIF_TRUE 1
  ------------------
  147|  20.5k|}
avifROStreamReadBitsU8:
  167|   594k|{
  168|   594k|    AVIF_CHECK(bitCount <= sizeof(*v) * 8);
  ------------------
  |  |   36|   594k|    do {                        \
  |  |   37|   594k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 594k]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|   594k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 594k]
  |  |  ------------------
  ------------------
  169|   594k|    uint32_t vU32;
  170|   594k|    AVIF_CHECK(avifROStreamReadBitsU32(stream, &vU32, bitCount));
  ------------------
  |  |   36|   594k|    do {                        \
  |  |   37|   594k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 29, False: 594k]
  |  |  ------------------
  |  |   38|     29|            avifBreakOnError(); \
  |  |   39|     29|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|     29|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|     29|        }                       \
  |  |   41|   594k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 594k]
  |  |  ------------------
  ------------------
  171|   594k|    *v = (uint8_t)vU32;
  172|   594k|    return AVIF_TRUE;
  ------------------
  |  |   88|   594k|#define AVIF_TRUE 1
  ------------------
  173|   594k|}
avifROStreamReadBitsU16:
  176|    939|{
  177|    939|    AVIF_CHECK(bitCount <= sizeof(*v) * 8);
  ------------------
  |  |   36|    939|    do {                        \
  |  |   37|    939|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 939]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|    939|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 939]
  |  |  ------------------
  ------------------
  178|    939|    uint32_t vU32;
  179|    939|    AVIF_CHECK(avifROStreamReadBitsU32(stream, &vU32, bitCount));
  ------------------
  |  |   36|    939|    do {                        \
  |  |   37|    939|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 3, False: 936]
  |  |  ------------------
  |  |   38|      3|            avifBreakOnError(); \
  |  |   39|      3|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      3|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      3|        }                       \
  |  |   41|    939|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 936]
  |  |  ------------------
  ------------------
  180|    936|    *v = (uint16_t)vU32;
  181|    936|    return AVIF_TRUE;
  ------------------
  |  |   88|    936|#define AVIF_TRUE 1
  ------------------
  182|    939|}
avifROStreamReadBitsU32:
  185|   874k|{
  186|   874k|    AVIF_CHECK(bitCount <= sizeof(*v) * 8);
  ------------------
  |  |   36|   874k|    do {                        \
  |  |   37|   874k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 0, False: 874k]
  |  |  ------------------
  |  |   38|      0|            avifBreakOnError(); \
  |  |   39|      0|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      0|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      0|        }                       \
  |  |   41|   874k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 874k]
  |  |  ------------------
  ------------------
  187|   874k|    *v = 0;
  188|  1.75M|    while (bitCount) {
  ------------------
  |  Branch (188:12): [True: 881k, False: 874k]
  ------------------
  189|   881k|        if (stream->numUsedBitsInPartialByte == 0) {
  ------------------
  |  Branch (189:13): [True: 361k, False: 520k]
  ------------------
  190|   361k|            AVIF_CHECK(avifROStreamSkip(stream, sizeof(uint8_t))); // Book a new partial byte in the stream.
  ------------------
  |  |   36|   361k|    do {                        \
  |  |   37|   361k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 46, False: 361k]
  |  |  ------------------
  |  |   38|     46|            avifBreakOnError(); \
  |  |   39|     46|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|     46|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|     46|        }                       \
  |  |   41|   361k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 361k]
  |  |  ------------------
  ------------------
  191|   361k|        }
  192|   881k|        assert(stream->offset > 0);
  193|   881k|        const uint8_t * packedBits = stream->raw->data + stream->offset - 1;
  194|       |
  195|   881k|        const size_t numBits = AVIF_MIN(bitCount, 8 - stream->numUsedBitsInPartialByte);
  ------------------
  |  |   19|   881k|#define AVIF_MIN(a, b) (((a) < (b)) ? (a) : (b))
  |  |  ------------------
  |  |  |  Branch (19:25): [True: 520k, False: 361k]
  |  |  ------------------
  ------------------
  196|   881k|        stream->numUsedBitsInPartialByte += numBits;
  197|   881k|        bitCount -= numBits;
  198|       |        // The stream bits are packed starting with the most significant bit of the first input byte.
  199|       |        // This way, packed bits can be found in the same order in the bit stream.
  200|   881k|        const uint32_t bits = (*packedBits >> (8 - stream->numUsedBitsInPartialByte)) & ((1 << numBits) - 1);
  201|       |        // The value bits are ordered from the most significant bit to the least significant bit.
  202|       |        // In the case where avifROStreamReadBitsU32() is used to parse the unsigned integer value *v
  203|       |        // over multiple aligned bytes, this order corresponds to big endianness.
  204|   881k|        *v |= bits << bitCount;
  205|       |
  206|   881k|        if (stream->numUsedBitsInPartialByte == 8) {
  ------------------
  |  Branch (206:13): [True: 361k, False: 520k]
  ------------------
  207|       |            // Start a new partial byte the next time a bit is needed.
  208|   361k|            stream->numUsedBitsInPartialByte = 0;
  209|   361k|        }
  210|   881k|    }
  211|   874k|    return AVIF_TRUE;
  ------------------
  |  |   88|   874k|#define AVIF_TRUE 1
  ------------------
  212|   874k|}
avifROStreamReadString:
  215|  81.3k|{
  216|  81.3k|    assert(stream->numUsedBitsInPartialByte == 0); // Byte alignment is required.
  217|       |
  218|       |    // Check for the presence of a null terminator in the stream.
  219|  81.3k|    size_t remainingBytes = avifROStreamRemainingBytes(stream);
  220|  81.3k|    const uint8_t * p = avifROStreamCurrent(stream);
  221|  81.3k|    avifBool foundNullTerminator = AVIF_FALSE;
  ------------------
  |  |   89|  81.3k|#define AVIF_FALSE 0
  ------------------
  222|   895k|    for (size_t i = 0; i < remainingBytes; ++i) {
  ------------------
  |  Branch (222:24): [True: 895k, False: 13]
  ------------------
  223|   895k|        if (p[i] == 0) {
  ------------------
  |  Branch (223:13): [True: 81.2k, False: 813k]
  ------------------
  224|  81.2k|            foundNullTerminator = AVIF_TRUE;
  ------------------
  |  |   88|  81.2k|#define AVIF_TRUE 1
  ------------------
  225|  81.2k|            break;
  226|  81.2k|        }
  227|   895k|    }
  228|  81.3k|    if (!foundNullTerminator) {
  ------------------
  |  Branch (228:9): [True: 13, False: 81.2k]
  ------------------
  229|     13|        avifDiagnosticsPrintf(stream->diag, "%s: Failed to find a NULL terminator when reading a string", stream->diagContext);
  230|     13|        return AVIF_FALSE;
  ------------------
  |  |   89|     13|#define AVIF_FALSE 0
  ------------------
  231|     13|    }
  232|       |
  233|  81.2k|    const char * streamString = (const char *)p;
  234|  81.2k|    size_t stringLen = strlen(streamString);
  235|  81.2k|    stream->offset += stringLen + 1; // update the stream to have read the "whole string" in
  236|       |
  237|  81.2k|    if (output && outputSize) {
  ------------------
  |  Branch (237:9): [True: 12.0k, False: 69.2k]
  |  Branch (237:19): [True: 12.0k, False: 0]
  ------------------
  238|       |        // clamp to our output buffer
  239|  12.0k|        if (stringLen >= outputSize) {
  ------------------
  |  Branch (239:13): [True: 1, False: 12.0k]
  ------------------
  240|      1|            stringLen = outputSize - 1;
  241|      1|        }
  242|  12.0k|        memcpy(output, streamString, stringLen);
  243|  12.0k|        output[stringLen] = 0;
  244|  12.0k|    }
  245|  81.2k|    return AVIF_TRUE;
  ------------------
  |  |   88|  81.2k|#define AVIF_TRUE 1
  ------------------
  246|  81.3k|}
avifROStreamReadBoxHeaderPartial:
  249|   476k|{
  250|       |    // Section 4.2.2 of ISO/IEC 14496-12.
  251|   476k|    size_t startOffset = stream->offset;
  252|       |
  253|   476k|    uint32_t smallSize;
  254|   476k|    AVIF_CHECK(avifROStreamReadU32(stream, &smallSize));   // unsigned int(32) size;
  ------------------
  |  |   36|   476k|    do {                        \
  |  |   37|   476k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 23, False: 476k]
  |  |  ------------------
  |  |   38|     23|            avifBreakOnError(); \
  |  |   39|     23|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|     23|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|     23|        }                       \
  |  |   41|   476k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 476k]
  |  |  ------------------
  ------------------
  255|   476k|    AVIF_CHECK(avifROStreamRead(stream, header->type, 4)); // unsigned int(32) type = boxtype;
  ------------------
  |  |   36|   476k|    do {                        \
  |  |   37|   476k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 11, False: 476k]
  |  |  ------------------
  |  |   38|     11|            avifBreakOnError(); \
  |  |   39|     11|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|     11|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|     11|        }                       \
  |  |   41|   476k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 476k]
  |  |  ------------------
  ------------------
  256|       |
  257|   476k|    uint64_t size = smallSize;
  258|   476k|    if (size == 1) {
  ------------------
  |  Branch (258:9): [True: 212, False: 475k]
  ------------------
  259|    212|        AVIF_CHECK(avifROStreamReadU64(stream, &size)); // unsigned int(64) largesize;
  ------------------
  |  |   36|    212|    do {                        \
  |  |   37|    212|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 1, False: 211]
  |  |  ------------------
  |  |   38|      1|            avifBreakOnError(); \
  |  |   39|      1|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      1|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      1|        }                       \
  |  |   41|    212|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 211]
  |  |  ------------------
  ------------------
  260|    212|    }
  261|       |
  262|   476k|    if (!memcmp(header->type, "uuid", 4)) {
  ------------------
  |  Branch (262:9): [True: 1.68k, False: 474k]
  ------------------
  263|  1.68k|        AVIF_CHECK(avifROStreamRead(stream, header->usertype, 16)); // unsigned int(8) usertype[16] = extended_type;
  ------------------
  |  |   36|  1.68k|    do {                        \
  |  |   37|  1.68k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 6, False: 1.67k]
  |  |  ------------------
  |  |   38|      6|            avifBreakOnError(); \
  |  |   39|      6|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      6|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      6|        }                       \
  |  |   41|  1.68k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 1.67k]
  |  |  ------------------
  ------------------
  264|   474k|    } else {
  265|   474k|        memset(header->usertype, 0, sizeof(header->usertype));
  266|   474k|    }
  267|       |
  268|   476k|    size_t bytesRead = stream->offset - startOffset;
  269|   476k|    if (size == 0) {
  ------------------
  |  Branch (269:9): [True: 836, False: 475k]
  ------------------
  270|       |        // Section 4.2.2 of ISO/IEC 14496-12.
  271|       |        //   if size is 0, then this box shall be in a top-level box (i.e. not contained in another
  272|       |        //   box), and be the last box in its 'file', and its payload extends to the end of that
  273|       |        //   enclosing 'file'. This is normally only used for a MediaDataBox ('mdat').
  274|    836|        if (!topLevel) {
  ------------------
  |  Branch (274:13): [True: 26, False: 810]
  ------------------
  275|     26|            avifDiagnosticsPrintf(stream->diag, "%s: Non-top-level box with size 0", stream->diagContext);
  276|     26|            return AVIF_FALSE;
  ------------------
  |  |   89|     26|#define AVIF_FALSE 0
  ------------------
  277|     26|        }
  278|       |
  279|       |        // The given stream may be incomplete and there is no guarantee that sizeHint is available and accurate.
  280|       |        // Otherwise size could be set to avifROStreamRemainingBytes(stream) + (stream->offset - startOffset) right now.
  281|       |
  282|       |        // Wait for avifIOReadFunc() to return AVIF_RESULT_OK.
  283|    810|        header->isSizeZeroBox = AVIF_TRUE;
  ------------------
  |  |   88|    810|#define AVIF_TRUE 1
  ------------------
  284|    810|        header->size = 0;
  285|    810|        return AVIF_TRUE;
  ------------------
  |  |   88|    810|#define AVIF_TRUE 1
  ------------------
  286|    836|    }
  287|       |
  288|   475k|    if ((size < bytesRead) || ((size - bytesRead) > SIZE_MAX)) {
  ------------------
  |  Branch (288:9): [True: 4, False: 475k]
  |  Branch (288:31): [True: 0, False: 475k]
  ------------------
  289|      4|        avifDiagnosticsPrintf(stream->diag, "%s: Header size overflow check failure", stream->diagContext);
  290|      4|        return AVIF_FALSE;
  ------------------
  |  |   89|      4|#define AVIF_FALSE 0
  ------------------
  291|      4|    }
  292|   475k|    header->isSizeZeroBox = AVIF_FALSE;
  ------------------
  |  |   89|   475k|#define AVIF_FALSE 0
  ------------------
  293|   475k|    header->size = (size_t)(size - bytesRead);
  294|   475k|    return AVIF_TRUE;
  ------------------
  |  |   88|   475k|#define AVIF_TRUE 1
  ------------------
  295|   475k|}
avifROStreamReadBoxHeader:
  298|   432k|{
  299|   432k|    AVIF_CHECK(avifROStreamReadBoxHeaderPartial(stream, header, /*topLevel=*/AVIF_FALSE));
  ------------------
  |  |   36|   432k|    do {                        \
  |  |   37|   432k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 50, False: 432k]
  |  |  ------------------
  |  |   38|     50|            avifBreakOnError(); \
  |  |   39|     50|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|     50|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|     50|        }                       \
  |  |   41|   432k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 432k]
  |  |  ------------------
  ------------------
  300|   432k|    if (header->size > avifROStreamRemainingBytes(stream)) {
  ------------------
  |  Branch (300:9): [True: 266, False: 431k]
  ------------------
  301|    266|        avifDiagnosticsPrintf(stream->diag, "%s: Child box too large, possibly truncated data", stream->diagContext);
  302|    266|        return AVIF_FALSE;
  ------------------
  |  |   89|    266|#define AVIF_FALSE 0
  ------------------
  303|    266|    }
  304|   431k|    return AVIF_TRUE;
  ------------------
  |  |   88|   431k|#define AVIF_TRUE 1
  ------------------
  305|   432k|}
avifROStreamReadVersionAndFlags:
  308|   274k|{
  309|   274k|    uint8_t versionAndFlags[4];
  310|   274k|    AVIF_CHECK(avifROStreamRead(stream, versionAndFlags, 4));
  ------------------
  |  |   36|   274k|    do {                        \
  |  |   37|   274k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 16, False: 274k]
  |  |  ------------------
  |  |   38|     16|            avifBreakOnError(); \
  |  |   39|     16|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|     16|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|     16|        }                       \
  |  |   41|   274k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 274k]
  |  |  ------------------
  ------------------
  311|   274k|    if (version) {
  ------------------
  |  Branch (311:9): [True: 273k, False: 1.41k]
  ------------------
  312|   273k|        *version = versionAndFlags[0];
  313|   273k|    }
  314|   274k|    if (flags) {
  ------------------
  |  Branch (314:9): [True: 120k, False: 154k]
  ------------------
  315|   120k|        *flags = (versionAndFlags[1] << 16) + (versionAndFlags[2] << 8) + (versionAndFlags[3] << 0);
  316|   120k|    }
  317|   274k|    return AVIF_TRUE;
  ------------------
  |  |   88|   274k|#define AVIF_TRUE 1
  ------------------
  318|   274k|}
avifROStreamReadAndEnforceVersion:
  321|   124k|{
  322|   124k|    uint8_t version;
  323|   124k|    AVIF_CHECK(avifROStreamReadVersionAndFlags(stream, &version, flags));
  ------------------
  |  |   36|   124k|    do {                        \
  |  |   37|   124k|        if (!(A)) {             \
  |  |  ------------------
  |  |  |  Branch (37:13): [True: 3, False: 124k]
  |  |  ------------------
  |  |   38|      3|            avifBreakOnError(); \
  |  |   39|      3|            return AVIF_FALSE;  \
  |  |  ------------------
  |  |  |  |   89|      3|#define AVIF_FALSE 0
  |  |  ------------------
  |  |   40|      3|        }                       \
  |  |   41|   124k|    } while (0)
  |  |  ------------------
  |  |  |  Branch (41:14): [Folded, False: 124k]
  |  |  ------------------
  ------------------
  324|   124k|    if (version != enforcedVersion) {
  ------------------
  |  Branch (324:9): [True: 19, False: 124k]
  ------------------
  325|     19|        avifDiagnosticsPrintf(stream->diag, "%s: Expecting box version %u, got version %u", stream->diagContext, enforcedVersion, version);
  326|     19|        return AVIF_FALSE;
  ------------------
  |  |   89|     19|#define AVIF_FALSE 0
  ------------------
  327|     19|    }
  328|   124k|    return AVIF_TRUE;
  ------------------
  |  |   88|   124k|#define AVIF_TRUE 1
  ------------------
  329|   124k|}

avifNTOHS:
   28|   469k|{
   29|   469k|    const uint8_t * data = (const uint8_t *)&s;
   30|   469k|    return (uint16_t)((data[1] << 0) | (data[0] << 8));
   31|   469k|}
avifNTOHL:
   51|   839k|{
   52|   839k|    const uint8_t * data = (const uint8_t *)&l;
   53|   839k|    return ((uint32_t)data[3] << 0) | ((uint32_t)data[2] << 8) | ((uint32_t)data[1] << 16) | ((uint32_t)data[0] << 24);
   54|   839k|}
avifNTOH64:
   78|  20.4k|{
   79|  20.4k|    const uint8_t * data = (const uint8_t *)&l;
   80|  20.4k|    return ((uint64_t)data[7] << 0) | ((uint64_t)data[6] << 8) | ((uint64_t)data[5] << 16) | ((uint64_t)data[4] << 24) |
   81|  20.4k|           ((uint64_t)data[3] << 32) | ((uint64_t)data[2] << 40) | ((uint64_t)data[1] << 48) | ((uint64_t)data[0] << 56);
   82|  20.4k|}
avifArrayCreate:
   88|   306k|{
   89|   306k|    avifArrayInternal * arr = (avifArrayInternal *)arrayStruct;
   90|   306k|    arr->elementSize = elementSize ? elementSize : 1;
  ------------------
  |  Branch (90:24): [True: 306k, False: 0]
  ------------------
   91|   306k|    arr->count = 0;
   92|   306k|    arr->capacity = initialCapacity;
   93|   306k|    if (arr->capacity > SIZE_MAX / arr->elementSize) {
  ------------------
  |  Branch (93:9): [True: 0, False: 306k]
  ------------------
   94|      0|        arr->ptr = NULL;
   95|      0|        arr->capacity = 0;
   96|      0|        return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
   97|      0|    }
   98|   306k|    size_t byteCount = (size_t)arr->elementSize * arr->capacity;
   99|   306k|    arr->ptr = (uint8_t *)avifAlloc(byteCount);
  100|   306k|    if (!arr->ptr) {
  ------------------
  |  Branch (100:9): [True: 0, False: 306k]
  ------------------
  101|      0|        arr->capacity = 0;
  102|      0|        return AVIF_FALSE;
  ------------------
  |  |   89|      0|#define AVIF_FALSE 0
  ------------------
  103|      0|    }
  104|   306k|    memset(arr->ptr, 0, byteCount);
  105|   306k|    return AVIF_TRUE;
  ------------------
  |  |   88|   306k|#define AVIF_TRUE 1
  ------------------
  106|   306k|}
avifArrayPush:
  109|  10.1M|{
  110|  10.1M|    avifArrayInternal * arr = (avifArrayInternal *)arrayStruct;
  111|  10.1M|    if (arr->count == arr->capacity) {
  ------------------
  |  Branch (111:9): [True: 29.1k, False: 10.0M]
  ------------------
  112|  29.1k|        uint8_t * oldPtr = arr->ptr;
  113|  29.1k|        size_t oldByteCount = (size_t)arr->elementSize * arr->capacity;
  114|  29.1k|        if (oldByteCount > SIZE_MAX / 2 || arr->capacity > UINT32_MAX / 2) {
  ------------------
  |  Branch (114:13): [True: 0, False: 29.1k]
  |  Branch (114:44): [True: 0, False: 29.1k]
  ------------------
  115|      0|            return NULL;
  116|      0|        }
  117|  29.1k|        size_t newByteCount = oldByteCount * 2;
  118|  29.1k|        uint8_t * newPtr = (uint8_t *)avifAlloc(newByteCount);
  119|  29.1k|        if (newPtr == NULL) {
  ------------------
  |  Branch (119:13): [True: 0, False: 29.1k]
  ------------------
  120|      0|            return NULL;
  121|      0|        }
  122|  29.1k|        arr->ptr = newPtr;
  123|  29.1k|        memset(arr->ptr + oldByteCount, 0, oldByteCount);
  124|  29.1k|        memcpy(arr->ptr, oldPtr, oldByteCount);
  125|  29.1k|        arr->capacity *= 2;
  126|  29.1k|        avifFree(oldPtr);
  127|  29.1k|    }
  128|  10.1M|    ++arr->count;
  129|  10.1M|    return &arr->ptr[(arr->count - 1) * (size_t)arr->elementSize];
  130|  10.1M|}
avifArrayDestroy:
  141|   331k|{
  142|   331k|    avifArrayInternal * arr = (avifArrayInternal *)arrayStruct;
  143|   331k|    if (arr->ptr) {
  ------------------
  |  Branch (143:9): [True: 306k, False: 24.4k]
  ------------------
  144|   306k|        avifFree(arr->ptr);
  145|       |        arr->ptr = NULL;
  146|   306k|    }
  147|   331k|    memset(arr, 0, sizeof(avifArrayInternal));
  148|   331k|}

avif_fuzztest_dec.cc:_ZN4avif8testutil12_GLOBAL__N_16DecodeERKNSt3__112basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEEbNS2_10unique_ptrI11avifDecoderNS_16UniquePtrDeleterEEE:
   51|  20.5k|            DecoderPtr decoder) {
   52|  20.5k|  ASSERT_FALSE(GetSeedDataDirs().empty());  // Make sure seeds are available.
  ------------------
  |  Branch (52:3): [True: 20.5k, False: 0]
  |  Branch (52:3): [True: 0, False: 20.5k]
  |  Branch (52:3): [True: 20.5k, False: 0]
  ------------------
   53|       |
   54|  20.5k|  const uint8_t* data =
   55|  20.5k|      reinterpret_cast<const uint8_t*>(arbitrary_bytes.data());
   56|  20.5k|  avifIO* const io = avifIOCreateMemoryReader(data, arbitrary_bytes.size());
   57|  20.5k|  if (io == nullptr) return;
  ------------------
  |  Branch (57:7): [True: 0, False: 20.5k]
  ------------------
   58|       |  // The Chrome's avifIO object is not persistent.
   59|  20.5k|  io->persistent = is_persistent;
   60|  20.5k|  avifDecoderSetIO(decoder.get(), io);
   61|       |
   62|  20.5k|  avifResult result = avifDecoderParse(decoder.get());
   63|       |  // AVIF_RESULT_INTERNAL_ERROR means a broken invariant and should not happen.
   64|  20.5k|  ASSERT_NE(result, AVIF_RESULT_INTERNAL_ERROR);
  ------------------
  |  Branch (64:3): [True: 20.5k, False: 0]
  |  Branch (64:3): [True: 0, False: 20.5k]
  |  Branch (64:3): [True: 20.5k, False: 0]
  ------------------
   65|  20.5k|  if (result != AVIF_RESULT_OK) return;
  ------------------
  |  Branch (65:7): [True: 2.63k, False: 17.8k]
  ------------------
   66|       |
   67|  30.7k|  for (size_t i = 0; i < decoder->image->numProperties; ++i) {
  ------------------
  |  Branch (67:22): [True: 12.8k, False: 17.8k]
  ------------------
   68|  12.8k|    const avifRWData& box_payload = decoder->image->properties[i].boxPayload;
   69|       |    // Each custom property should be found as is in the input bitstream.
   70|  12.8k|    EXPECT_NE(std::search(data, data + arbitrary_bytes.size(), box_payload.data,
  ------------------
  |  Branch (70:5): [True: 12.8k, False: 0]
  |  Branch (70:5): [True: 0, False: 12.8k]
  |  Branch (70:5): [True: 12.8k, False: 0]
  ------------------
   71|  12.8k|                          box_payload.data + box_payload.size),
   72|  12.8k|              data + arbitrary_bytes.size());
   73|  12.8k|  }
   74|       |
   75|  23.4k|  while ((result = avifDecoderNextImage(decoder.get())) == AVIF_RESULT_OK) {
  ------------------
  |  Branch (75:10): [True: 5.59k, False: 17.8k]
  ------------------
   76|  5.59k|    EXPECT_GT(decoder->image->width, 0u);
  ------------------
  |  Branch (76:5): [True: 5.59k, False: 0]
  |  Branch (76:5): [True: 0, False: 5.59k]
  |  Branch (76:5): [True: 5.59k, False: 0]
  ------------------
   77|  5.59k|    EXPECT_GT(decoder->image->height, 0u);
  ------------------
  |  Branch (77:5): [True: 5.59k, False: 0]
  |  Branch (77:5): [True: 0, False: 5.59k]
  |  Branch (77:5): [True: 5.59k, False: 0]
  ------------------
   78|  5.59k|  }
   79|  17.8k|  ASSERT_NE(result, AVIF_RESULT_INTERNAL_ERROR);
  ------------------
  |  Branch (79:3): [True: 17.8k, False: 0]
  |  Branch (79:3): [True: 0, False: 17.8k]
  |  Branch (79:3): [True: 17.8k, False: 0]
  ------------------
   80|       |
   81|       |  // Loop once.
   82|  17.8k|  result = avifDecoderReset(decoder.get());
   83|  17.8k|  ASSERT_NE(result, AVIF_RESULT_INTERNAL_ERROR);
  ------------------
  |  Branch (83:3): [True: 17.8k, False: 0]
  |  Branch (83:3): [True: 0, False: 17.8k]
  |  Branch (83:3): [True: 17.8k, False: 0]
  ------------------
   84|  17.8k|  if (result != AVIF_RESULT_OK) return;
  ------------------
  |  Branch (84:7): [True: 1.37k, False: 16.4k]
  ------------------
   85|  21.0k|  while ((result = avifDecoderNextImage(decoder.get())) == AVIF_RESULT_OK) {
  ------------------
  |  Branch (85:10): [True: 4.53k, False: 16.4k]
  ------------------
   86|  4.53k|  }
   87|       |  ASSERT_NE(result, AVIF_RESULT_INTERNAL_ERROR);
  ------------------
  |  Branch (87:3): [True: 16.4k, False: 0]
  |  Branch (87:3): [True: 0, False: 16.4k]
  |  Branch (87:3): [True: 16.4k, False: 0]
  ------------------
   88|  16.4k|}

_ZN4avif8testutil17CreateAvifDecoderE15avifCodecChoicei17avifDecoderSourcebbbbjjjj:
  138|  20.5k|                             avifStrictFlags strict_flags) {
  139|  20.5k|  DecoderPtr decoder(avifDecoderCreate());
  140|  20.5k|  if (decoder.get() == nullptr) {
  ------------------
  |  Branch (140:7): [True: 0, False: 20.5k]
  ------------------
  141|      0|    return decoder;
  142|      0|  }
  143|  20.5k|  decoder->codecChoice = codec_choice;
  144|  20.5k|  decoder->maxThreads = max_threads;
  145|  20.5k|  decoder->requestedSource = requested_source;
  146|  20.5k|  decoder->allowProgressive = allow_progressive;
  147|  20.5k|  decoder->allowIncremental = allow_incremental;
  148|  20.5k|  decoder->ignoreExif = ignore_exif;
  149|  20.5k|  decoder->ignoreXMP = ignore_xmp;
  150|  20.5k|  decoder->imageSizeLimit = image_size_limit;
  151|  20.5k|  decoder->imageDimensionLimit = image_dimension_limit;
  152|  20.5k|  decoder->imageCountLimit = image_count_limit;
  153|  20.5k|  decoder->strictFlags = strict_flags;
  154|  20.5k|  return decoder;
  155|  20.5k|}
_ZN4avif8testutil26AddGainMapOptionsToDecoderENSt3__110unique_ptrI11avifDecoderNS_16UniquePtrDeleterEEEj:
  160|  20.5k|    DecoderPtr decoder, avifImageContentTypeFlags image_content_to_decode) {
  161|  20.5k|  decoder->imageContentToDecode = image_content_to_decode;
  162|  20.5k|  return decoder;
  163|  20.5k|}
_ZN4avif8testutil15GetSeedDataDirsEv:
  243|  20.5k|std::vector<std::string> GetSeedDataDirs() {
  244|  20.5k|  const char* var = std::getenv("TEST_DATA_DIRS");
  245|  20.5k|  std::vector<std::string> res;
  246|  20.5k|  if (var == nullptr || *var == 0) return res;
  ------------------
  |  Branch (246:7): [True: 0, False: 20.5k]
  |  Branch (246:25): [True: 0, False: 20.5k]
  ------------------
  247|  20.5k|  const char* var_start = var;
  248|   984k|  while (true) {
  ------------------
  |  Branch (248:10): [True: 984k, Folded]
  ------------------
  249|   984k|    if (*var == 0 || *var == ';') {
  ------------------
  |  Branch (249:9): [True: 20.5k, False: 963k]
  |  Branch (249:22): [True: 0, False: 963k]
  ------------------
  250|  20.5k|      res.push_back(std::string(var_start, var - var_start));
  251|  20.5k|      if (*var == 0) break;
  ------------------
  |  Branch (251:11): [True: 20.5k, False: 0]
  ------------------
  252|      0|      var_start = var + 1;
  253|      0|    }
  254|   963k|    ++var;
  255|   963k|  }
  256|  20.5k|  return res;
  257|  20.5k|}
_ZN4avif8testutil21GetTestImagesContentsEmRKNSt3__16vectorI17avifAppFileFormatNS1_9allocatorIS3_EEEE:
  260|      4|    size_t max_file_size, const std::vector<avifAppFileFormat>& image_formats) {
  261|       |  // Use an environment variable to get the test data directory because
  262|       |  // fuzztest seeds are created before the main() function is called, so the
  263|       |  // test has no chance to parse command line arguments.
  264|      4|  const std::vector<std::string> test_data_dirs = GetSeedDataDirs();
  265|      4|  if (test_data_dirs.empty()) {
  ------------------
  |  Branch (265:7): [True: 0, False: 4]
  ------------------
  266|       |    // Only a warning because this can happen when running the binary with
  267|       |    // --list_fuzz_tests (such as with gtest_discover_tests() in cmake).
  268|      0|    std::cerr << "WARNING: TEST_DATA_DIRS env variable not set, unable to read "
  269|      0|                 "seed files\n";
  270|      0|    return {};
  271|      0|  }
  272|       |
  273|      4|  std::vector<std::string> seeds;
  274|      4|  for (const std::string& test_data_dir : test_data_dirs) {
  ------------------
  |  Branch (274:41): [True: 4, False: 4]
  ------------------
  275|      4|    std::cout << "Reading seeds from " << test_data_dir
  276|      4|              << " (non recursively)\n";
  277|      4|    auto tuple_vector = fuzztest::ReadFilesFromDirectory(test_data_dir);
  278|      4|    seeds.reserve(tuple_vector.size());
  279|    404|    for (auto& [file_content] : tuple_vector) {
  ------------------
  |  Branch (279:31): [True: 404, False: 4]
  ------------------
  280|    404|      if (file_content.size() > max_file_size) continue;
  ------------------
  |  Branch (280:11): [True: 12, False: 392]
  ------------------
  281|    392|      if (!image_formats.empty()) {
  ------------------
  |  Branch (281:11): [True: 392, False: 0]
  ------------------
  282|    392|        const avifAppFileFormat format = avifGuessBufferFileFormat(
  283|    392|            reinterpret_cast<const uint8_t*>(file_content.data()),
  284|    392|            file_content.size());
  285|    392|        if (std::find(image_formats.begin(), image_formats.end(), format) ==
  ------------------
  |  Branch (285:13): [True: 136, False: 256]
  ------------------
  286|    392|            image_formats.end()) {
  287|    136|          continue;
  288|    136|        }
  289|    392|      }
  290|       |
  291|    256|      seeds.push_back(std::move(file_content));
  292|    256|    }
  293|      4|  }
  294|      4|  if (seeds.empty()) {
  ------------------
  |  Branch (294:7): [True: 0, False: 4]
  ------------------
  295|      0|    std::cerr << "ERROR: no files found that match the given file size and "
  296|      0|                 "format criteria\n";
  297|      0|    std::abort();
  298|      0|  }
  299|      4|  std::cout << "Returning " << seeds.size() << " seed images\n";
  300|      4|  return seeds;
  301|      4|}

_ZN4avif8testutil24ArbitraryBaseAvifDecoderEv:
  285|      4|inline auto ArbitraryBaseAvifDecoder() {
  286|       |  // MAX_NUM_THREADS from libaom/aom_util/aom_thread.h
  287|      4|  const auto max_threads = fuzztest::InRange(0, 64);
  288|      4|  return fuzztest::Map(
  289|      4|      CreateAvifDecoder,
  290|      4|      fuzztest::ElementOf<avifCodecChoice>({AVIF_CODEC_CHOICE_AUTO,
  291|      4|                                            AVIF_CODEC_CHOICE_AOM,
  292|      4|                                            AVIF_CODEC_CHOICE_DAV1D}),
  293|      4|      max_threads,
  294|       |      /*requested_source=*/
  295|      4|      fuzztest::ElementOf(
  296|      4|          {AVIF_DECODER_SOURCE_AUTO, AVIF_DECODER_SOURCE_PRIMARY_ITEM}),
  297|      4|      /*allow_progressive=*/fuzztest::Arbitrary<bool>(),
  298|      4|      /*allow_incremental=*/fuzztest::Arbitrary<bool>(),
  299|      4|      /*ignore_exif=*/fuzztest::Arbitrary<bool>(),
  300|      4|      /*ignore_xmp=*/fuzztest::Arbitrary<bool>(),
  301|      4|      /*image_size_limit=*/fuzztest::Just(kMaxDimension * kMaxDimension),
  302|      4|      /*image_dimension_limit=*/fuzztest::Just(kMaxDimension),
  303|      4|      /*image_count_limit=*/fuzztest::Just(10),
  304|       |      /*strict_flags=*/
  305|      4|      fuzztest::BitFlagCombinationOf<avifStrictFlags>(
  306|      4|          {AVIF_STRICT_PIXI_REQUIRED, AVIF_STRICT_CLAP_VALID,
  307|      4|           AVIF_STRICT_ALPHA_ISPE_REQUIRED}));
  308|      4|}
_ZN4avif8testutil37ArbitraryAvifDecoderPossiblyNoContentEv:
  333|      4|inline auto ArbitraryAvifDecoderPossiblyNoContent() {
  334|      4|  return fuzztest::Map(
  335|      4|      AddGainMapOptionsToDecoder, ArbitraryBaseAvifDecoder(),
  336|      4|      fuzztest::BitFlagCombinationOf<avifImageContentTypeFlags>(
  337|      4|          {AVIF_IMAGE_CONTENT_COLOR_AND_ALPHA, AVIF_IMAGE_CONTENT_GAIN_MAP}));
  338|      4|}
_ZN4avif8testutil23ArbitraryImageWithSeedsERKNSt3__16vectorI17avifAppFileFormatNS1_9allocatorIS3_EEEE:
  370|      4|    const std::vector<avifAppFileFormat>& image_formats) {
  371|      4|  constexpr uint32_t kMaxSeedFileSize = 1024 * 1024;  // 1MB.
  372|      4|  return fuzztest::Arbitrary<std::string>()
  373|      4|      .WithMaxSize(kMaxSeedFileSize)
  374|      4|      .WithSeeds(GetTestImagesContents(kMaxSeedFileSize, image_formats));
  375|      4|}

